1 /// @file htslib/cram.h
2 /// CRAM format-specific API functions.
3 /*
4     Copyright (C) 2015, 2016, 2018-2019 Genome Research Ltd.
5 
6     Author: James Bonfield <jkb@sanger.ac.uk>
7 
8 Permission is hereby granted, free of charge, to any person obtaining a copy
9 of this software and associated documentation files (the "Software"), to deal
10 in the Software without restriction, including without limitation the rights
11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 copies of the Software, and to permit persons to whom the Software is
13 furnished to do so, subject to the following conditions:
14 
15 The above copyright notice and this permission notice shall be included in
16 all copies or substantial portions of the Software.
17 
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
24 DEALINGS IN THE SOFTWARE.  */
25 
26 /** @file
27  * Consider using the higher level hts_*() API for programs that wish to
28  * be file format agnostic (see htslib/hts.h).
29  *
30  * This API should be used for CRAM specific code. The specifics of the
31  * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
32  * although these should not be included directly (use this file instead).
33  */
34 
35 module htslib.cram;
36 
37 import core.stdc.stdarg;
38 import core.sys.posix.sys.types;
39 
40 import htslib.hts;
41 import htslib.sam;
42 
43 extern (C):
44 
/* Method codes for CRAM blocks.  RANS and RANS0 deliberately share the
 * value 4. */
enum cram_block_method
{
    BM_ERROR = -1,
    RAW      = 0,
    GZIP     = 1,
    BZIP2    = 2,
    LZMA     = 3,

    // Generic; either order
    RANS     = 4,
    RANS0    = 4,

    // Not externalised; stored as RANS (generic)
    RANS1    = 10,

    // NB: not externalised in CRAM
    GZIP_RLE = 11
}
57 
/* Content type codes for CRAM blocks (used by cram_new_block() and
 * cram_block_get_content_type() in this module). */
enum cram_content_type
{
    CT_ERROR           = -1,
    FILE_HEADER        = 0,
    COMPRESSION_HEADER = 1,
    MAPPED_SLICE       = 2,

    // CRAM V1.0 only
    UNMAPPED_SLICE     = 3,

    EXTERNAL           = 4,
    CORE               = 5
}
68 
// Opaque data types, see cram_structs for the fully fledged versions.
// Each struct is declared without a body, so code using this module can
// only hold and pass pointers to them; the layouts are not visible here.
struct cram_file_def;
struct cram_fd;
struct cram_container;
struct cram_block;
struct cram_slice;
struct cram_metrics;
struct cram_block_slice_hdr;
struct cram_block_compression_hdr;
struct refs_t;

// NOTE(review): this declares an hFILE type local to this module.  If another
// binding module also declares hFILE, D treats the two as distinct types --
// confirm that callers can pass their hFILE* to cram_dopen()/cram_fd_set_fp()
// without a cast.
struct hFILE;
81 
82 // Accessor functions
83 
/*
 *-----------------------------------------------------------------------------
 * cram_fd
 *
 * Get/set accessors for the opaque cram_fd handle.  These are extern (C)
 * declarations only; the implementations are not part of this module.
 */

// Header accessors: the header is passed and returned as a sam_hdr_t pointer.
// NOTE(review): ownership of the header after cram_fd_set_header() is not
// visible from these declarations -- confirm before freeing.
sam_hdr_t* cram_fd_get_header(cram_fd* fd);

void cram_fd_set_header(cram_fd* fd, sam_hdr_t* hdr);

// Version accessors using a single int.
// NOTE(review): how major/minor are packed into this int is not visible
// here -- confirm against the C implementation.
int cram_fd_get_version(cram_fd* fd);

void cram_fd_set_version(cram_fd* fd, int vers);

// Presumably the major and minor components of the file's CRAM version
// (inferred from the names).
int cram_major_vers(cram_fd* fd);
int cram_minor_vers(cram_fd* fd);

// Accessors for the hFILE pointer associated with the handle.
hFILE* cram_fd_get_fp(cram_fd* fd);
void cram_fd_set_fp(cram_fd* fd, hFILE* fp);
101 
/*
 *-----------------------------------------------------------------------------
 * cram_container
 *
 * Get/set accessors for the opaque cram_container type.  These are
 * extern (C) declarations only.
 */

// Length and block-count accessors; both values are plain ints.
int cram_container_get_length(cram_container* c);
void cram_container_set_length(cram_container* c, int length);
int cram_container_get_num_blocks(cram_container* c);
void cram_container_set_num_blocks(cram_container* c, int num_blocks);

// Landmark accessors.  The getter returns an int pointer and takes
// num_landmarks as an int pointer, presumably to report the element count.
// NOTE(review): whether the returned array is owned by the container, and
// whether the setter copies `landmarks`, is not visible here -- confirm.
int* cram_container_get_landmarks(cram_container* c, int* num_landmarks);
void cram_container_set_landmarks(
    cram_container* c,
    int num_landmarks,
    int* landmarks);

/* Returns true if the container is empty (EOF marker).
 * Note that this takes the cram_fd file handle, not a cram_container
 * pointer. */
int cram_container_is_empty(cram_fd* fd);
118 
/*
 *-----------------------------------------------------------------------------
 * cram_block
 *
 * Get/set accessors for the opaque cram_block type.  These are extern (C)
 * declarations only.
 */

// Getters for the block's content id, compressed size, uncompressed size,
// CRC32 and data pointer.  The sizes and CRC are returned as signed ints.
int cram_block_get_content_id(cram_block* b);
int cram_block_get_comp_size(cram_block* b);
int cram_block_get_uncomp_size(cram_block* b);
int cram_block_get_crc32(cram_block* b);
void* cram_block_get_data(cram_block* b);

// Returns the block's content type as a cram_content_type value.
cram_content_type cram_block_get_content_type(cram_block* b);

// Setters matching the getters above.
// NOTE(review): whether cram_block_set_data() takes ownership of `data`
// is not visible here -- confirm before freeing the buffer.
void cram_block_set_content_id(cram_block* b, int id);
void cram_block_set_comp_size(cram_block* b, int size);
void cram_block_set_uncomp_size(cram_block* b, int size);
void cram_block_set_crc32(cram_block* b, int crc);
void cram_block_set_data(cram_block* b, void* data);

// Appends `size` bytes from `data` to the block (per the name and signature).
// NOTE(review): the meaning of the int return value is not documented
// here -- presumably 0 on success and negative on failure; confirm.
int cram_block_append(cram_block* b, const(void)* data, int size);
void cram_block_update_size(cram_block* b);

// Offset is known as "size" internally, but it can be confusing.
size_t cram_block_get_offset(cram_block* b);
void cram_block_set_offset(cram_block* b, size_t offset);

/*
 * Computes the size of a cram block, including the block
 * header itself.
 */
uint cram_block_size(cram_block* b);
149 
/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * Parameters (the first two are named in_/out_ because `in` and `out`
 * are D keywords):
 *   in_     input CRAM file handle
 *   out_    output CRAM file handle
 *   c       the container just read from in_
 *   nrg     presumably the number of entries in in_rg and out_rg -- confirm
 *   in_rg   presumably the RG numbers as found in the input -- confirm
 *   out_rg  presumably the RG numbers to write in their place -- confirm
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(
    cram_fd* in_,
    cram_fd* out_,
    cram_container* c,
    int nrg,
    int* in_rg,
    int* out_rg);
187 
/*
 * Copies the blocks representing the next num_slice slices from a
 * container from 'in' to 'out'.  It is expected that the file pointer
 * is just after the read of the cram_container and cram compression
 * header.
 *
 * The parameters are spelled in_ and out_ in this binding because `in`
 * and `out` are D keywords.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_copy_slice(cram_fd* in_, cram_fd* out_, int num_slice);
198 
199 /*
200  *-----------------------------------------------------------------------------
201  * cram_io basics
202  */
203 
204 /**@{ ----------------------------------------------------------------------
205  * CRAM blocks - the dynamically growable data block. We have code to
206  * create, update, (un)compress and read/write.
207  *
208  * These are derived from the deflate_interlaced.c blocks, but with the
209  * CRAM extension of content types and IDs.
210  */
211 
/*! Allocates a new cram_block structure with a specified content_type and
 * id.
 *
 * @param content_type  One of the cram_content_type values
 * @param content_id    Content id to assign to the new block
 *
 * @return
 * Returns block pointer on success;
 *         NULL on failure
 *
 * The cram_block struct returned by a successful call should be freed
 * via cram_free_block() when it is no longer needed.
 */
cram_block* cram_new_block(cram_content_type content_type, int content_id);

/*! Reads a block from a cram file.
 *
 * @param fd  CRAM file handle to read from
 *
 * @return
 * Returns cram_block pointer on success;
 *         NULL on failure
 *
 * The cram_block struct returned by a successful call should be freed
 * via cram_free_block() when it is no longer needed.
 */
cram_block* cram_read_block(cram_fd* fd);

/*! Writes a CRAM block.
 *
 * @param fd  CRAM file handle to write to
 * @param b   Block to write
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_block(cram_fd* fd, cram_block* b);

/*! Frees a CRAM block, deallocating internal data too.
 *
 * This is the counterpart to cram_new_block() and cram_read_block().
 */
void cram_free_block(cram_block* b);

/*! Uncompresses a CRAM block, if compressed.
 *
 * @param b  Block to uncompress
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_uncompress_block(cram_block* b);
254 
/*! Compresses a block.
 *
 * Compresses a block using one of two different zlib strategies. If we only
 * want one choice set strat2 to be -1.
 *
 * NOTE(review): the signature below has no `strat2` parameter, so the
 * paragraph above appears to describe an earlier form of this function.
 * How `method` selects the compression to try is not visible here --
 * confirm against the C implementation.
 *
 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
 * significantly faster.
 *
 * @param fd       CRAM file handle
 * @param b        Block to compress
 * @param metrics  cram_metrics pointer (opaque to this module)
 * @param method   Compression method selector (encoding not shown here)
 * @param level    Compression level
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_compress_block(
    cram_fd* fd,
    cram_block* b,
    cram_metrics* metrics,
    int method,
    int level);
274 
275 /**@}*/
276 /**@{ ----------------------------------------------------------------------
277  * Containers
278  */
279 
/*! Creates a new container, specifying the maximum number of slices
 * and records permitted.
 *
 * @param nrec    Maximum number of records
 * @param nslice  Maximum number of slices
 *
 * @return
 * Returns cram_container ptr on success;
 *         NULL on failure
 *
 * The cram_container struct returned by a successful call should be freed
 * via cram_free_container() when it is no longer needed.
 */
cram_container* cram_new_container(int nrec, int nslice);

/* Frees a container obtained from cram_new_container() or
 * cram_read_container(). */
void cram_free_container(cram_container* c);

/*! Reads a container header.
 *
 * @param fd  CRAM file handle to read from
 *
 * @return
 * Returns cram_container on success;
 *         NULL on failure or no container left (fd->err == 0).
 *
 * The cram_container struct returned by a successful call should be freed
 * via cram_free_container() when it is no longer needed.
 */
cram_container* cram_read_container(cram_fd* fd);

/*! Writes a container structure.
 *
 * @param fd  CRAM file handle to write to
 * @param h   Container to write
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_container(cram_fd* fd, cram_container* h);

/*
 * Stores the container structure in dat and returns *size as the
 * number of bytes written to dat[].  The input size of dat is also
 * held in *size and should be initialised to cram_container_size(c).
 *
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_store_container(cram_fd* fd, cram_container* c, char* dat, int* size);

/* Returns the size to use for the dat buffer passed to
 * cram_store_container() for container c. */
int cram_container_size(cram_container* c);
323 
324 /**@}*/
325 /**@{ ----------------------------------------------------------------------
326  * The top-level cram opening, closing and option handling
327  */
328 
/*! Opens a CRAM file for read (mode "rb") or write ("wb").
 *
 * The filename may be "-" to indicate stdin or stdout.
 *
 * @param filename  Path of the file to open, or "-"
 * @param mode      "rb" for reading or "wb" for writing
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_open(const(char)* filename, const(char)* mode);

/*! Opens an existing stream for reading or writing.
 *
 * @param fp        Already-open hFILE stream
 * @param filename  Name associated with the stream
 * @param mode      Mode string; presumably as for cram_open() -- confirm
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_dopen(hFILE* fp, const(char)* filename, const(char)* mode);

/*! Closes a CRAM file.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_close(cram_fd* fd);

/*
 * Seek within a CRAM file.
 *
 * offset and whence presumably follow the usual fseek()/lseek()
 * convention (SEEK_SET, SEEK_CUR, SEEK_END) -- confirm.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek(cram_fd* fd, off_t offset, int whence);

/*
 * Flushes a CRAM file.
 * Useful for when writing to stdout without wishing to close the stream.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush(cram_fd* fd);

/*! Checks for end of file on a cram_fd stream.
 *
 * @return
 * Returns 0 if not at end of file
 *         1 if we hit an expected EOF (end of range or EOF block)
 *         2 for other EOF (end of stream without EOF block)
 */
int cram_eof(cram_fd* fd);
380 
/*! Sets options on the cram_fd.
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * This is the C-variadic form: the value(s) for `opt` follow it as
 * additional arguments.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_option(cram_fd* fd, hts_fmt_option opt, ...);

/*! Sets options on the cram_fd.
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * This is the va_list form of cram_set_option(): the value(s) for `opt`
 * are supplied through `args` instead of as variadic arguments.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_voption(cram_fd* fd, hts_fmt_option opt, va_list args);

/*!
 * Attaches a header to a cram_fd.
 *
 * This should be used when creating a new cram_fd for writing where
 * we have an SAM_hdr already constructed (eg from a file we've read
 * in).
 *
 * @param fd   CRAM file handle
 * @param hdr  Header to attach (SAM_hdr is an alias of sam_hdr_t in this
 *             module)
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_header(cram_fd* fd, sam_hdr_t* hdr);
415 
/*! Check if this file has a proper EOF block
 *
 * @param fd  CRAM file handle to check
 *
 * @return
 * Returns 3 if the file is a version of CRAM that does not contain EOF blocks
 *         2 if the file is a stream and thus unseekable
 *         1 if the file contains an EOF block
 *         0 if the file does not contain an EOF block
 *        -1 if an error occurred whilst reading the file or we could not seek back to where we were
 *
 */
int cram_check_EOF(cram_fd* fd);
427 
/* As int32_decode/encode, but from/to blocks instead of cram_fd.
 * Writes val into block b.
 * NOTE(review): the meaning of the int return value is not documented
 * here -- confirm against the C implementation. */
int int32_put_blk(cram_block* b, int val);
430 
431 /**@}*/
432 /**@{ -------------------------------------------------------------------
433  * Old typedef and function names for compatibility with existing code.
434  * Header functionality is now provided by sam.h's sam_hdr_t functions.
435  */
436 
// Old type name kept for existing code; SAM_hdr and sam_hdr_t are the same type.
alias SAM_hdr = sam_hdr_t;
438 
/*! Tokenises a SAM header into a hash table.
 *
 * Also extracts a few bits on specific data types, such as @RG lines.
 *
 * Compatibility wrapper: forwards to sam_hdr_parse() from htslib.sam,
 * swapping the arguments into its (length, text) order.  In htslib 1.10
 * and later the C header defines this name as a static inline function,
 * so libhts exports no symbol for it.  The body is therefore supplied
 * here; a bare prototype would fail at link time.
 *
 * @param hdr  Header text to parse
 * @param len  Length of hdr in bytes
 *
 * @return
 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
 *         NULL on failure
 */
pragma(inline, true)
SAM_hdr* sam_hdr_parse_(const(char)* hdr, size_t len)
{
    return sam_hdr_parse(len, hdr);
}
448 
/*! Deallocates all storage used by a SAM_hdr struct.
 *
 * This also decrements the header reference count. If after decrementing
 * it is still non-zero then the header is assumed to be in use by another
 * caller and the free is not done.
 *
 * Compatibility wrapper: forwards to sam_hdr_destroy() from htslib.sam.
 * In htslib 1.10 and later the C header defines this name as a static
 * inline function, so libhts exports no symbol for it.  The body is
 * therefore supplied here; a bare prototype would fail at link time.
 *
 * @param hdr  Header to release
 */
pragma(inline, true)
void sam_hdr_free(SAM_hdr* hdr)
{
    sam_hdr_destroy(hdr);
}
456 
457 /* sam_hdr_length() and sam_hdr_str() are now provided by sam.h. */
458 
/*! Add an @PG line.
 *
 * If we wish complete control over this use sam_hdr_add_line() directly. This
 * function uses that, but attempts to do a lot of tedious house work for
 * you too.
 *
 * - It will generate a suitable ID if the supplied one clashes.
 * - It will generate multiple @PG records if we have multiple PG chains.
 *
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
 * in NULL.
 *
 * sam_hdr_add_PG is only the old spelling of the name: it is an alias of
 * sam_hdr_add_pg, which is not declared in this module (presumably it
 * comes from htslib.sam).
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
alias sam_hdr_add_PG = sam_hdr_add_pg;
476 
477 /**@{ -------------------------------------------------------------------*/
478 
/*!
 * Returns the refs_t structure used by a cram file handle.
 *
 * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
 * share reference memory between multiple file handles.
 *
 * Unlike most functions in this module, this takes the generic htsFile
 * handle rather than a cram_fd.
 *
 * @param fd  htsFile handle to query
 *
 * @return
 * Returns NULL if none exists or the file handle is not a CRAM file.
 */
refs_t* cram_get_refs(htsFile* fd);
489 
490 /**@}*/
491