1 /// @file htslib/cram.h
2 /// CRAM format-specific API functions.
3 /*
4     Copyright (C) 2015, 2016, 2018-2020 Genome Research Ltd.
5 
6     Author: James Bonfield <jkb@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 /** @file
27  * Consider using the higher level hts_*() API for programs that wish to
28  * be file format agnostic (see htslib/hts.h).
29  *
30  * This API should be used for CRAM specific code. The specifics of the
31  * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
32  * although these should not be included directly (use this file instead).
33  */
34 
35 module htslib.cram;
36 
37 import core.stdc.stdio;
38 import core.stdc.stdlib;
39 import core.stdc.stdarg : va_list;
40 
41 import htslib.sam;
42 import htslib.hts;
43 import htslib.hfile : hFILE;
44 
45 @system:
46 nothrow:
47 @nogc:
48 
49 extern (C):
50 
51 // see cram/cram_structs.h for an internal more complete copy of this enum
52 
53 // Htslib 1.11 had these listed without any hts prefix, and included
54 // some internal values such as RANS1 and GZIP_RLE (which shouldn't have ever
55 // been public).
56 //
57 // We can't find evidence of these being used and the data type occurs
58 // nowhere in functions or structures meaning using it would be pointless.
// However for safety, if you absolutely need the API to not change then
60 // define HTS_COMPAT to 101100 (XYYYZZ for X.Y[.Z], meaning 1.11).
61 
62 // Public methods as defined in the CRAM spec.
63 
64 // CRAM 2.x and 3.0
65 
66 // NB: the subsequent numbers may change.  They're simply here for
67 // compatibility with the old API, but may have no bearing on the
68 // internal way htslib works.  DO NOT USE
69 
//#include <sys/types.h>
// The C header is not available to D, so the two POSIX integer types used by
// this API are declared locally.  POSIX defines both as *signed* types
// (off_t carries negative relative seek offsets, ssize_t uses -1 as an error
// value), so they are aliased to ptrdiff_t: the signed integer that has the
// same width as the size_t previously used here.
// NOTE(review): on 32-bit targets where the C library is built with
// large-file support off_t may be 64 bits wide - confirm for such platforms.
alias off_t = ptrdiff_t;
alias ssize_t = ptrdiff_t;
73 
/// Compression method identifiers for CRAM blocks.
///
/// NOTE(review): the comments preceding this enum state that the later
/// values exist only for compatibility with the old API and must not be
/// relied upon; exactly which members that covers is not marked here -
/// confirm against the upstream C header.
enum cram_block_method
{
    BM_ERROR = -1, // the only negative value; presumably an error sentinel
    RAW = 0,
    GZIP = 1,
    BZIP2 = 2,
    LZMA = 3,
    RANS = 4, // Generic; either order
    RANS0 = 4, // deliberately the same numeric value as RANS
    RANS1 = 10, // Not externalised; stored as RANS (generic)
    GZIP_RLE = 11 // NB: not externalised in CRAM
}
86 
/// Content type tag of a CRAM block: the type accepted by cram_new_block()
/// and returned by cram_block_get_content_type().
enum cram_content_type
{
    CT_ERROR = -1, // the only negative value; presumably an error sentinel
    FILE_HEADER = 0,
    COMPRESSION_HEADER = 1,
    MAPPED_SLICE = 2,
    UNMAPPED_SLICE = 3, // CRAM V1.0 only
    EXTERNAL = 4,
    CORE = 5
}
97 
98 // Opaque data types, see cram_structs for the fully fledged versions.
// Each struct below is declared without a body, so D code can only hold and
// pass pointers to it; the fields cannot be read or written from D.
struct cram_file_def;
struct cram_fd; // handle taken by most functions in this module
struct cram_container;
struct cram_block;
struct cram_slice;
struct cram_metrics;
struct cram_block_slice_hdr;
struct cram_block_compression_hdr;
struct refs_t; // type returned by cram_get_refs()
108 
109 // Accessor functions
110 
111 /*
112  *-----------------------------------------------------------------------------
113  * cram_fd
114  */
// Getter/setter pair for the SAM header pointer associated with a cram_fd.
// NOTE(review): who owns hdr after cram_fd_set_header() is not visible from
// these declarations - confirm before freeing it.
sam_hdr_t* cram_fd_get_header(cram_fd* fd);

void cram_fd_set_header(cram_fd* fd, sam_hdr_t* hdr);

// Getter/setter pair for the version number held by a cram_fd.
// NOTE(review): how the version is encoded in the int is not shown here.
int cram_fd_get_version(cram_fd* fd);

void cram_fd_set_version(cram_fd* fd, int vers);

// Presumably the major and minor components of the handle's CRAM version
// - TODO confirm.
int cram_major_vers(cram_fd* fd);
int cram_minor_vers(cram_fd* fd);

// Getter/setter pair for the hFILE stream pointer held by a cram_fd.
hFILE* cram_fd_get_fp(cram_fd* fd);
void cram_fd_set_fp(cram_fd* fd, hFILE* fp);
128 
129 /*
130  *-----------------------------------------------------------------------------
131  * cram_container
132  */
// Getter/setter pairs for a container's length and its block count.
// NOTE(review): the unit of "length" is not shown here - presumably bytes.
int cram_container_get_length(cram_container* c);
void cram_container_set_length(cram_container* c, int length);
int cram_container_get_num_blocks(cram_container* c);
void cram_container_set_num_blocks(cram_container* c, int num_blocks);
// Landmark array accessors.  The getter takes num_landmarks by pointer,
// presumably as an out-parameter receiving the element count - TODO confirm,
// together with who owns the returned / supplied array.
int* cram_container_get_landmarks(cram_container* c, int* num_landmarks);
void cram_container_set_landmarks(
    cram_container* c,
    int num_landmarks,
    int* landmarks);

/* Returns true if the container is empty (EOF marker) */
// Note that this takes the cram_fd handle, not a cram_container pointer, and
// that the truth value is returned as a C int.
int cram_container_is_empty(cram_fd* fd);
145 
146 /*
147  *-----------------------------------------------------------------------------
148  * cram_block
149  */
// Read accessors for individual properties of a cram_block.
int cram_block_get_content_id(cram_block* b);
int cram_block_get_comp_size(cram_block* b); // presumably the compressed size - TODO confirm
int cram_block_get_uncomp_size(cram_block* b); // presumably the uncompressed size - TODO confirm
int cram_block_get_crc32(cram_block* b);
void* cram_block_get_data(cram_block* b); // untyped pointer; buffer ownership is not shown here

cram_content_type cram_block_get_content_type(cram_block* b);

// Write accessors mirroring the getters above.  No setter for the content
// type is declared in this section.
void cram_block_set_content_id(cram_block* b, int id);
void cram_block_set_comp_size(cram_block* b, int size);
void cram_block_set_uncomp_size(cram_block* b, int size);
void cram_block_set_crc32(cram_block* b, int crc);
void cram_block_set_data(cram_block* b, void* data);

// Presumably appends `size` bytes read from `data` to block b, with the int
// result being a status code - TODO confirm against the C implementation.
int cram_block_append(cram_block* b, const(void)* data, int size);
void cram_block_update_size(cram_block* b);

// Offset is known as "size" internally, but it can be confusing.
size_t cram_block_get_offset(cram_block* b);
void cram_block_set_offset(cram_block* b, size_t offset);
170 
171 /*
172  * Computes the size of a cram block, including the block
173  * header itself.
174  */
175 uint cram_block_size(cram_block* b);
176 
177 /*
178  * Renumbers RG numbers in a cram compression header.
179  *
180  * CRAM stores RG as the Nth number in the header, rather than a
181  * string holding the ID: tag.  This is smaller in space, but means
182  * "samtools cat" to join files together that contain single but
183  * different RG lines needs a way of renumbering them.
184  *
185  * The file descriptor is expected to be immediately after the
186  * cram_container structure (ie before the cram compression header).
187  * Due to the nature of the CRAM format, this needs to read and write
188  * the blocks itself.  Note that there may be multiple slices within
189  * the container, meaning multiple compression headers to manipulate.
190  * Changing RG may change the size of the compression header and
191  * therefore the length field in the container.  Hence we rewrite all
192  * blocks just in case and also emit the adjusted container.
193  *
194  * The current implementation can only cope with renumbering a single
195  * RG (and only then if it is using HUFFMAN or BETA codecs).  In
196  * theory it *may* be possible to renumber multiple RGs if they use
197  * HUFFMAN to the CORE block or use an external block unshared by any
198  * other data series.  So we have an API that can be upgraded to
199  * support this, but do not implement it for now.  An example
200  * implementation of RG as an EXTERNAL block would be to find that
201  * block and rewrite it, returning the number of blocks consumed.
202  *
203  * Returns 0 on success;
204  *        -1 if unable to edit;
205  *        -2 on other errors (eg I/O).
206  */
// Parameter notes (in_/out_ carry a trailing underscore because `in` and
// `out` are reserved words in D):
int cram_transcode_rg(
    cram_fd* in_, // handle the blocks are read from
    cram_fd* out_, // handle the rewritten blocks and container are emitted to
    cram_container* c, // container whose compression header(s) are edited
    int nrg, // presumably the number of entries in in_rg / out_rg - TODO confirm
    int* in_rg, // presumably the RG numbers to replace - TODO confirm
    int* out_rg); // presumably the replacement RG numbers - TODO confirm
214 
/*
 * Copies the blocks representing the next num_slice slices from a
 * container from 'in_' to 'out_'.  It is expected that the file pointer
 * is just after the read of the cram_container and cram compression
 * header.
 *
 * (The parameters carry a trailing underscore because `in` and `out` are
 * reserved words in D.)
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_copy_slice(cram_fd* in_, cram_fd* out_, int num_slice);
225 
226 /*
227  *-----------------------------------------------------------------------------
228  * cram_io basics
229  */
230 
231 /**@{ ----------------------------------------------------------------------
232  * CRAM blocks - the dynamically growable data block. We have code to
233  * create, update, (un)compress and read/write.
234  *
235  * These are derived from the deflate_interlaced.c blocks, but with the
236  * CRAM extension of content types and IDs.
237  */
238 
239 /*! Allocates a new cram_block structure with a specified content_type and
240  * id.
241  *
242  * @return
243  * Returns block pointer on success;
244  *         NULL on failure
245  *
246  * The cram_block struct returned by a successful call should be freed
247  * via cram_free_block() when it is no longer needed.
248  */
249 cram_block* cram_new_block(cram_content_type content_type, int content_id);
250 
251 /*! Reads a block from a cram file.
252  *
253  * @return
254  * Returns cram_block pointer on success;
255  *         NULL on failure
256  *
257  * The cram_block struct returned by a successful call should be freed
258  * via cram_free_block() when it is no longer needed.
259  */
260 cram_block* cram_read_block(cram_fd* fd);
261 
262 /*! Writes a CRAM block.
263  *
264  * @return
265  * Returns 0 on success;
266  *        -1 on failure
267  */
268 int cram_write_block(cram_fd* fd, cram_block* b);
269 
270 /*! Frees a CRAM block, deallocating internal data too.
271  */
272 void cram_free_block(cram_block* b);
273 
274 /*! Uncompresses a CRAM block, if compressed.
275  *
276  * @return
277  * Returns 0 on success;
278  *        -1 on failure
279  */
280 int cram_uncompress_block(cram_block* b);
281 
282 /*! Compresses a block.
283  *
284  * Compresses a block using one of two different zlib strategies. If we only
285  * want one choice set strat2 to be -1.
286  *
287  * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
288  * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
289  * significantly faster.
290  *
291  * @return
292  * Returns 0 on success;
293  *        -1 on failure
294  */
// NOTE(review): the description above refers to a "strat2" argument, but
// neither declaration below has a parameter of that name; each takes a
// single method and a single level.
int cram_compress_block(
    cram_fd* fd,
    cram_block* b,
    cram_metrics* metrics,
    int method,
    int level);
// Takes the same parameters as cram_compress_block() plus the slice `s`.
// NOTE(review): how `s` influences compression is not visible from this
// declaration - confirm against the C implementation.
int cram_compress_block2(
    cram_fd* fd,
    cram_slice* s,
    cram_block* b,
    cram_metrics* metrics,
    int method,
    int level);
308 
309 /**@}*/
310 /**@{ ----------------------------------------------------------------------
311  * Containers
312  */
313 
314 /*! Creates a new container, specifying the maximum number of slices
315  * and records permitted.
316  *
317  * @return
318  * Returns cram_container ptr on success;
319  *         NULL on failure
320  *
321  * The cram_container struct returned by a successful call should be freed
322  * via cram_free_container() when it is no longer needed.
323  */
cram_container* cram_new_container(int nrec, int nslice);
// Releases a container obtained from cram_new_container() or
// cram_read_container(); call it once the container is no longer needed.
void cram_free_container(cram_container* c);
326 
327 /*! Reads a container header.
328  *
329  * @return
330  * Returns cram_container on success;
331  *         NULL on failure or no container left (fd->err == 0).
332  *
333  * The cram_container struct returned by a successful call should be freed
334  * via cram_free_container() when it is no longer needed.
335  */
336 cram_container* cram_read_container(cram_fd* fd);
337 
338 /*! Writes a container structure.
339  *
340  * @return
341  * Returns 0 on success;
342  *        -1 on failure
343  */
344 int cram_write_container(cram_fd* fd, cram_container* h);
345 
346 /*
347  * Stores the container structure in dat and returns *size as the
348  * number of bytes written to dat[].  The input size of dat is also
349  * held in *size and should be initialised to cram_container_size(c).
350  *
351  * Returns 0 on success;
352  *        -1 on failure
353  */
int cram_store_container(cram_fd* fd, cram_container* c, char* dat, int* size);

// Value that *size should be initialised to before calling
// cram_store_container() for container c; presumably the number of bytes
// its stored form requires - TODO confirm.
int cram_container_size(cram_container* c);
357 
358 /**@}*/
359 /**@{ ----------------------------------------------------------------------
360  * The top-level cram opening, closing and option handling
361  */
362 
363 /*! Opens a CRAM file for read (mode "rb") or write ("wb").
364  *
365  * The filename may be "-" to indicate stdin or stdout.
366  *
367  * @return
368  * Returns file handle on success;
369  *         NULL on failure.
370  */
371 cram_fd* cram_open(const(char)* filename, const(char)* mode);
372 
373 /*! Opens an existing stream for reading or writing.
374  *
375  * @return
376  * Returns file handle on success;
377  *         NULL on failure.
378  */
379 cram_fd* cram_dopen(hFILE* fp, const(char)* filename, const(char)* mode);
380 
381 /*! Closes a CRAM file.
382  *
383  * @return
384  * Returns 0 on success;
385  *        -1 on failure
386  */
387 int cram_close(cram_fd* fd);
388 
/*
 * Seek within a CRAM file.
 *
 * offset has type off_t; whence is presumably one of the C SEEK_SET /
 * SEEK_CUR / SEEK_END constants, as for fseek() - TODO confirm.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek(cram_fd* fd, off_t offset, int whence);
396 
397 /*
398  * Flushes a CRAM file.
399  * Useful for when writing to stdout without wishing to close the stream.
400  *
401  * Returns 0 on success
402  *        -1 on failure
403  */
404 int cram_flush(cram_fd* fd);
405 
406 /*! Checks for end of file on a cram_fd stream.
407  *
408  * @return
409  * Returns 0 if not at end of file
410  *         1 if we hit an expected EOF (end of range or EOF block)
411  *         2 for other EOF (end of stream without EOF block)
412  */
413 int cram_eof(cram_fd* fd);
414 
415 /*! Sets options on the cram_fd.
416  *
417  * See CRAM_OPT_* definitions in hts.h.
418  * Use this immediately after opening.
419  *
420  * @return
421  * Returns 0 on success;
422  *        -1 on failure
423  */
// The argument(s) following opt are passed C-variadically; presumably their
// type depends on which option is selected - TODO confirm per option.
int cram_set_option(cram_fd* fd, hts_fmt_option opt, ...);

/*! Sets options on the cram_fd (va_list form).
 *
 * Same parameters as cram_set_option(), except that the trailing arguments
 * are supplied as an already-started va_list instead of "...".
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_voption(cram_fd* fd, hts_fmt_option opt, va_list args);
436 
437 /*!
438  * Attaches a header to a cram_fd.
439  *
440  * This should be used when creating a new cram_fd for writing where
441  * we have an SAM_hdr already constructed (eg from a file we've read
442  * in).
443  *
444  * @return
445  * Returns 0 on success;
446  *        -1 on failure
447  */
448 int cram_set_header(cram_fd* fd, sam_hdr_t* hdr);
449 
450 /*! Check if this file has a proper EOF block
451  *
452  * @return
453  * Returns 3 if the file is a version of CRAM that does not contain EOF blocks
454  *         2 if the file is a stream and thus unseekable
455  *         1 if the file contains an EOF block
456  *         0 if the file does not contain an EOF block
457  *        -1 if an error occurred whilst reading the file or we could not seek back to where we were
458  *
459  */
460 int cram_check_EOF(cram_fd* fd);
461 
/* As int32_decoded/encode, but from/to blocks instead of cram_fd */
// Only the "put" direction is declared in this module.  Presumably stores
// the 32-bit value val into block b; the meaning of the int result is not
// shown here - TODO confirm.
int int32_put_blk(cram_block* b, int val);
464 
465 /**@}*/
466 /**@{ -------------------------------------------------------------------
467  * Old typedef and function names for compatibility with existing code.
468  * Header functionality is now provided by sam.h's sam_hdr_t functions.
469  */
470 
// Legacy spelling: SAM_hdr and sam_hdr_t name the same type.
alias SAM_hdr = sam_hdr_t;
472 
473 /*! Tokenises a SAM header into a hash table.
474  *
475  * Also extracts a few bits on specific data types, such as @RG lines.
476  *
477  * @return
478  * Returns a SAM_hdr struct on success (free with sam_hdr_free());
479  *         NULL on failure
480  */
pragma(inline, true)
SAM_hdr* sam_hdr_parse_(const(char)* hdr, size_t len)
{
    // Compatibility shim: sam_hdr_parse() expects the length first and the
    // header text second, the reverse of this wrapper's parameter order.
    SAM_hdr* parsed = sam_hdr_parse(len, hdr);
    return parsed;
}
483 
484 /*! Deallocates all storage used by a SAM_hdr struct.
485  *
486  * This also decrements the header reference count. If after decrementing
487  * it is still non-zero then the header is assumed to be in use by another
488  * caller and the free is not done.
489  */
pragma(inline, true)
void sam_hdr_free(SAM_hdr* hdr)
{
    // Old name kept for existing callers; all of the work is delegated to
    // sam_hdr_destroy().
    sam_hdr_destroy(hdr);
}
492 
493 /* sam_hdr_length() and sam_hdr_str() are now provided by sam.h. */
494 
495 /*! Add an @PG line.
496  *
497  * If we wish complete control over this use sam_hdr_add_line() directly. This
498  * function uses that, but attempts to do a lot of tedious house work for
499  * you too.
500  *
501  * - It will generate a suitable ID if the supplied one clashes.
502  * - It will generate multiple @PG records if we have multiple PG chains.
503  *
504  * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
505  * in NULL.
506  *
507  * @return
508  * Returns 0 on success;
509  *        -1 on failure
510  */
511 alias sam_hdr_add_PG = sam_hdr_add_pg;
512 
513 /**@{ -------------------------------------------------------------------*/
514 
515 /*!
516  * Returns the refs_t structure used by a cram file handle.
517  *
518  * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
519  * share reference memory between multiple file handles.
520  *
521  * @return
522  * Returns NULL if none exists or the file handle is not a CRAM file.
523  */
524 refs_t* cram_get_refs(htsFile* fd);
525 
526 /**@}*/
527