1 /// @file htslib/cram.h
2 /// CRAM format-specific API functions.
3 /*
4     Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd.
5 
6     Author: James Bonfield <jkb@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 /** @file
27  * Consider using the higher level hts_*() API for programs that wish to
28  * be file format agnostic (see htslib/hts.h).
29  *
30  * This API should be used for CRAM specific code. The specifics of the
31  * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
32  * although these should not be included directly (use this file instead).
33  */
34 
35 module htslib.cram;
36 
37 import core.stdc.stdio;
38 import core.stdc.stdlib;
39 import core.stdc.stdarg : va_list;
40 
41 import htslib.sam;
42 import htslib.hts;
43 import htslib.hfile : hFILE;
44 
45 @system:
46 nothrow:
47 @nogc:
48 
49 extern (C):
50 
51 // see cram/cram_structs.h for an internal more complete copy of this enum
52 
53 // Htslib 1.11 had these listed without any hts prefix, and included
54 // some internal values such as RANS1 and GZIP_RLE (which shouldn't have ever
55 // been public).
56 //
57 // We can't find evidence of these being used and the data type occurs
58 // nowhere in functions or structures meaning using it would be pointless.
// However for safety, if you absolutely need the API to not change then
60 // define HTS_COMPAT to 101100 (XYYYZZ for X.Y[.Z], meaning 1.11).
61 
62 // Public methods as defined in the CRAM spec.
63 
64 // CRAM 2.x and 3.0
65 
66 // NB: the subsequent numbers may change.  They're simply here for
67 // compatibility with the old API, but may have no bearing on the
68 // internal way htslib works.  DO NOT USE
69 
// D stand-in for C's #include <sys/types.h>.
//
// In C both off_t and ssize_t are *signed* integer types: off_t has to be
// able to carry negative relative seek offsets and ssize_t has to be able to
// carry negative error returns.  Aliasing them to the unsigned size_t
// misrepresents such values, so take the real definitions from druntime
// where they exist.
version (Posix)
{
    import core.sys.posix.sys.types : posix_off_t = off_t, posix_ssize_t = ssize_t;

    alias off_t = posix_off_t;
    alias ssize_t = posix_ssize_t;
}
else
{
    // No POSIX <sys/types.h> binding on this platform: keep the pointer-sized
    // width used previously, but make the types signed.
    // NOTE(review): confirm this width matches the off_t that the htslib
    // build for this platform was compiled with.
    alias off_t = ptrdiff_t;
    alias ssize_t = ptrdiff_t;
}
73 
/// Block compression methods (legacy, un-prefixed names).
/// Numeric values are noted on each line; members without an explicit
/// initialiser take the previous value plus one.
enum cram_block_method
{
    BM_ERROR = -1,
    RAW = 0,
    GZIP,          // 1
    BZIP2,         // 2
    LZMA,          // 3
    RANS,          // 4; generic, either order
    RANS0 = RANS,  // 4
    RANS1 = 10,    // Not externalised; stored as RANS (generic)
    GZIP_RLE       // 11; NB: not externalised in CRAM
}
86 
/// Content type of a CRAM block.
/// Members after FILE_HEADER take consecutive values (noted on each line).
enum cram_content_type
{
    CT_ERROR = -1,
    FILE_HEADER = 0,
    COMPRESSION_HEADER, // 1
    MAPPED_SLICE,       // 2
    UNMAPPED_SLICE,     // 3; CRAM V1.0 only
    EXTERNAL,           // 4
    CORE                // 5
}
97 
// Forward declarations only: these types are opaque to users of this module.
// See cram_structs for the fully fledged versions.

// File-level types
struct cram_file_def;
struct cram_fd;

// Container, slice and block types
struct cram_container;
struct cram_slice;
struct cram_block;
struct cram_block_slice_hdr;
struct cram_block_compression_hdr;

// Supporting types
struct cram_metrics;
struct refs_t;
108 
109 // Accessor functions
110 
111 /*
112  *-----------------------------------------------------------------------------
113  * cram_fd
114  */
// SAM header accessors for a cram_fd.
sam_hdr_t* cram_fd_get_header(cram_fd* fd);
void cram_fd_set_header(cram_fd* fd, sam_hdr_t* hdr);

// Version accessors for a cram_fd.
// NOTE(review): the encoding of the combined version value is not visible
// here; cram_major_vers()/cram_minor_vers() presumably return its parts.
int cram_fd_get_version(cram_fd* fd);
void cram_fd_set_version(cram_fd* fd, int vers);
int cram_major_vers(cram_fd* fd);
int cram_minor_vers(cram_fd* fd);

// hFILE pointer accessors for a cram_fd.
hFILE* cram_fd_get_fp(cram_fd* fd);
void cram_fd_set_fp(cram_fd* fd, hFILE* fp);
128 
129 /*
130  *-----------------------------------------------------------------------------
131  * cram_container
132  */
// Length accessors.
int cram_container_get_length(cram_container* c);
void cram_container_set_length(cram_container* c, int length);

// Block-count accessors.
int cram_container_get_num_blocks(cram_container* c);
void cram_container_set_num_blocks(cram_container* c, int num_blocks);

// Landmark accessors.  The getter takes an int* out-parameter
// (presumably receiving the number of landmarks -- confirm).
int* cram_container_get_landmarks(cram_container* c, int* num_landmarks);
void cram_container_set_landmarks(cram_container* c, int num_landmarks, int* landmarks);

/* Returns true if the container is empty (EOF marker).
 * Note that the argument is the cram_fd, not a cram_container. */
int cram_container_is_empty(cram_fd* fd);
145 
146 /*
147  *-----------------------------------------------------------------------------
148  * cram_block
149  */
// Getters for the fields of a cram_block.
int cram_block_get_content_id(cram_block* b);
int cram_block_get_comp_size(cram_block* b);
int cram_block_get_uncomp_size(cram_block* b);
int cram_block_get_crc32(cram_block* b);
void* cram_block_get_data(cram_block* b);
cram_content_type cram_block_get_content_type(cram_block* b);

// Matching setters (there is no setter for the content type).
void cram_block_set_content_id(cram_block* b, int id);
void cram_block_set_comp_size(cram_block* b, int size);
void cram_block_set_uncomp_size(cram_block* b, int size);
void cram_block_set_crc32(cram_block* b, int crc);
void cram_block_set_data(cram_block* b, void* data);

// Appending data to a block and updating its size.
int cram_block_append(cram_block* b, const(void)* data, int size);
void cram_block_update_size(cram_block* b);

// The value exposed here as "offset" is called "size" internally; the
// public name was chosen because "size" can be confusing.
size_t cram_block_get_offset(cram_block* b);
void cram_block_set_offset(cram_block* b, size_t offset);
170 
/*
 * Computes the total size of a cram block.  The block header itself is
 * counted as part of the result.
 */
uint cram_block_size(cram_block* b);
176 
/*
 * Renumbers RG numbers in a cram compression header.
 *
 * Background: CRAM records a read group as its ordinal position (the Nth
 * entry) in the header rather than as a string holding the ID: tag.  That
 * is smaller in space, but it means "samtools cat", when joining files
 * that contain single but different RG lines, needs a way of renumbering
 * them.
 *
 * Precondition: the file descriptor is expected to be positioned
 * immediately after the cram_container structure (ie before the cram
 * compression header).
 *
 * Due to the nature of the CRAM format, this function needs to read and
 * write the blocks itself.  There may be multiple slices within the
 * container, meaning multiple compression headers to manipulate.  Changing
 * RG may change the size of the compression header and therefore the
 * length field in the container; hence all blocks are rewritten just in
 * case and the adjusted container is emitted too.
 *
 * Limitations: the current implementation can only cope with renumbering
 * a single RG, and only then if it is using HUFFMAN or BETA codecs.  In
 * theory it *may* be possible to renumber multiple RGs if they use HUFFMAN
 * to the CORE block or use an external block unshared by any other data
 * series.  The API is shaped so that it can be upgraded to support this,
 * but this is not implemented for now.  An example implementation of RG as
 * an EXTERNAL block would be to find that block and rewrite it, returning
 * the number of blocks consumed.
 *
 * NOTE(review): nrg, in_rg and out_rg are not described by this comment;
 * presumably nrg is the number of entries in the in_rg/out_rg arrays --
 * confirm against the implementation.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(cram_fd* in_, cram_fd* out_, cram_container* c,
                      int nrg, int* in_rg, int* out_rg);
214 
/*
 * Copies the blocks that make up the next num_slice slices of a container
 * from in_ to out_.
 *
 * The file pointer is expected to be just after the read of the
 * cram_container and cram compression header.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_copy_slice(
    cram_fd* in_,
    cram_fd* out_,
    int num_slice);
225 
226 /*
227  *-----------------------------------------------------------------------------
228  * cram slice interrogation
229  */
230 
/*
 * Reports how many cram blocks this slice contains.
 */
int cram_slice_hdr_get_num_blocks(cram_block_slice_hdr* hdr);
235 
/*
 * Returns the content_id of the block that holds an embedded reference
 * sequence, or -1 if no such block is present.
 */
int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr* h);
241 
/*
 * Fetches the slice's reference ID, start coordinate and span (length).
 *
 * refid, start and span are return parameters; any of them may be NULL,
 * in which case that value is simply not reported.
 */
void cram_slice_hdr_get_coords(cram_block_slice_hdr* h, int* refid,
                               hts_pos_t* start, hts_pos_t* span);
251 
/*
 * Decodes a slice header out of the cram block b.
 *
 * Returns the opaque cram_block_slice_hdr pointer on success,
 *         NULL on failure.
 */
cram_block_slice_hdr* cram_decode_slice_header(
    cram_fd* fd,
    cram_block* b);
258 
/*
 * Releases a cram_block_slice_hdr structure.
 */
void cram_free_slice_header(cram_block_slice_hdr* hdr);
263 
264 /*
265  *-----------------------------------------------------------------------------
266  * cram_io basics
267  */
268 
269 /**@{ ----------------------------------------------------------------------
270  * CRAM blocks - the dynamically growable data block. We have code to
271  * create, update, (un)compress and read/write.
272  *
273  * These are derived from the deflate_interlaced.c blocks, but with the
274  * CRAM extension of content types and IDs.
275  */
276 
/*! Allocates a new cram_block structure for the given content_type and
 * content_id.
 *
 * A block returned by a successful call should be released with
 * cram_free_block() once it is no longer needed.
 *
 * @return
 * Returns block pointer on success;
 *         NULL on failure
 */
cram_block* cram_new_block(
    cram_content_type content_type,
    int content_id);
288 
/*! Reads a block from a cram file.
 *
 * A block returned by a successful call should be released with
 * cram_free_block() once it is no longer needed.
 *
 * @return
 * Returns cram_block pointer on success;
 *         NULL on failure
 */
cram_block* cram_read_block(cram_fd* fd);
299 
/*! Writes the CRAM block b to fd.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_block(cram_fd* fd, cram_block* b);
307 
/*! Frees a CRAM block; the block's internal data is deallocated as well.
 */
void cram_free_block(cram_block* b);
311 
/*! Uncompresses a CRAM block if it is compressed.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_uncompress_block(cram_block* b);
319 
/*! Compresses a block.
 *
 * Compresses a block using one of two different zlib strategies. If only
 * one choice is wanted, set strat2 to -1.
 *
 * Rationale: on quality data Z_RLE sometimes does a better job than
 * Z_FILTERED or Z_DEFAULT_STRATEGY, and when it does it is preferable
 * because it is significantly faster.
 *
 * NOTE(review): the text above refers to a "strat2" parameter, but neither
 * signature below has one; the description appears to predate the current
 * parameter list.
 *
 * cram_compress_block2() takes the same arguments plus a cram_slice.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_compress_block(cram_fd* fd, cram_block* b, cram_metrics* metrics,
                        int method, int level);
int cram_compress_block2(cram_fd* fd, cram_slice* s, cram_block* b,
                         cram_metrics* metrics, int method, int level);
346 
347 /**@}*/
348 /**@{ ----------------------------------------------------------------------
349  * Containers
350  */
351 
/*! Creates a new container, given the maximum number of slices and
 * records it is permitted to hold.
 *
 * A container returned by a successful call should be released with
 * cram_free_container() once it is no longer needed.
 *
 * @return
 * Returns cram_container ptr on success;
 *         NULL on failure
 */
cram_container* cram_new_container(int nrec, int nslice);

/*! Frees a container obtained from cram_new_container() or
 * cram_read_container().
 */
void cram_free_container(cram_container* c);
364 
/*! Reads a container header.
 *
 * A container returned by a successful call should be released with
 * cram_free_container() once it is no longer needed.
 *
 * @return
 * Returns cram_container on success;
 *         NULL on failure or no container left (fd->err == 0).
 */
cram_container* cram_read_container(cram_fd* fd);
375 
/*! Writes the container structure h to fd.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_container(cram_fd* fd, cram_container* h);
383 
/*
 * Serialises the container structure into dat.
 *
 * On entry *size holds the capacity of dat and should be initialised to
 * cram_container_size(c); on return *size is the number of bytes written
 * to dat[].
 *
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_store_container(
    cram_fd* fd,
    cram_container* c,
    char* dat,
    int* size);

/* Size value with which to initialise *size for cram_store_container(). */
int cram_container_size(cram_container* c);
395 
396 /**@}*/
397 /**@{ ----------------------------------------------------------------------
398  * The top-level cram opening, closing and option handling
399  */
400 
/*! Opens a CRAM file for reading (mode "rb") or for writing (mode "wb").
 *
 * A filename of "-" indicates stdin or stdout.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_open(
    const(char)* filename,
    const(char)* mode);
410 
/*! Opens an existing stream (fp) for reading or writing.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_dopen(
    hFILE* fp,
    const(char)* filename,
    const(char)* mode);
418 
/*! Closes the CRAM file fd.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_close(cram_fd* fd);
426 
/*
 * Seeks within a CRAM file.
 *
 * NOTE(review): offset/whence presumably follow the usual fseek()-style
 * convention -- confirm against the implementation.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek(
    cram_fd* fd,
    off_t offset,
    int whence);
434 
/*
 * Flushes a CRAM file.
 * This is useful when writing to stdout and the stream should stay open.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush(cram_fd* fd);
443 
/*! Tests a cram_fd stream for end of file.
 *
 * @return
 * Returns 0 if not at end of file
 *         1 if we hit an expected EOF (end of range or EOF block)
 *         2 for other EOF (end of stream without EOF block)
 */
int cram_eof(cram_fd* fd);
452 
/*! Sets an option on the cram_fd (variadic form).
 *
 * The available options are the CRAM_OPT_* definitions in hts.h.
 * Call this immediately after opening the file.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_option(cram_fd* fd, hts_fmt_option opt, ...);
463 
/*! Sets an option on the cram_fd (va_list form).
 *
 * The available options are the CRAM_OPT_* definitions in hts.h.
 * Call this immediately after opening the file.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_voption(
    cram_fd* fd,
    hts_fmt_option opt,
    va_list args);
474 
/*!
 * Attaches a header to a cram_fd.
 *
 * Intended for the case where a new cram_fd is being created for writing
 * and a SAM_hdr has already been constructed (eg from a file that was
 * read in).
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_header(cram_fd* fd, sam_hdr_t* hdr);
487 
/*! Checks whether this file has a proper EOF block.
 *
 * @return
 * Returns 3 if the file is a version of CRAM that does not contain EOF blocks
 *         2 if the file is a stream and thus unseekable
 *         1 if the file contains an EOF block
 *         0 if the file does not contain an EOF block
 *        -1 if an error occurred whilst reading the file, or if we could
 *           not seek back to where we were
 */
int cram_check_EOF(cram_fd* fd);
499 
/* As the int32 decode/encode routines, but operating from/to a cram_block
 * instead of a cram_fd. */
int int32_put_blk(
    cram_block* b,
    int val);
502 
503 /**@}*/
504 /**@{ -------------------------------------------------------------------
505  * Old typedef and function names for compatibility with existing code.
506  * Header functionality is now provided by sam.h's sam_hdr_t functions.
507  */
508 
/// Old type name, kept as an alias of sam_hdr_t.
alias SAM_hdr = sam_hdr_t;
510 
/*! Tokenises a SAM header into a hash table.
 *
 * A few bits on specific data types, such as @RG lines, are extracted too.
 *
 * This compatibility wrapper takes (text, length) and forwards to
 * sam_hdr_parse(), which takes (length, text).
 *
 * @return
 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
 *         NULL on failure
 */
pragma(inline, true)
SAM_hdr* sam_hdr_parse_(const (char)* hdr, size_t len)
{
    // Argument order is swapped relative to sam_hdr_parse().
    return sam_hdr_parse(len, hdr);
}
521 
/*! Deallocates all storage used by a SAM_hdr struct.
 *
 * The header reference count is decremented as part of this. If the count
 * is still non-zero afterwards, the header is assumed to be in use by
 * another caller and the free is not done.
 *
 * Compatibility wrapper: forwards directly to sam_hdr_destroy().
 */
pragma(inline, true)
void sam_hdr_free(SAM_hdr* hdr)
{
    sam_hdr_destroy(hdr);
}
530 
531 /* sam_hdr_length() and sam_hdr_str() are now provided by sam.h. */
532 
/*! Adds an @PG line.
 *
 * For complete control over the line use sam_hdr_add_line() directly. This
 * function is built on top of it, but also takes care of a lot of tedious
 * house work:
 *
 * - It will generate a suitable ID if the supplied one clashes.
 * - It will generate multiple @PG records if we have multiple PG chains.
 *
 * Call it as per sam_hdr_add_line(), with a series of key,value pairs
 * ending in NULL.
 *
 * Old name, kept as an alias of sam_hdr_add_pg.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
alias sam_hdr_add_PG = sam_hdr_add_pg;
550 
551 /**@{ -------------------------------------------------------------------*/
552 
/*!
 * Returns the refs_t structure used by a cram file handle.
 *
 * Together with option CRAM_OPT_SHARED_REF this may be used to share
 * reference memory between multiple file handles.
 *
 * Note that the argument is an htsFile, not a cram_fd.
 *
 * @return
 * Returns NULL if none exists or the file handle is not a CRAM file.
 */
refs_t* cram_get_refs(htsFile* fd);
563 
564 /**@}*/
565