/// @file htslib/cram.h
/// CRAM format-specific API functions.
/*
    Copyright (C) 2015, 2016, 2018-2019 Genome Research Ltd.

    Author: James Bonfield <jkb@sanger.ac.uk>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.  */

/** @file
 * Consider using the higher level hts_*() API for programs that wish to
 * be file format agnostic (see htslib/hts.h).
 *
 * This API should be used for CRAM specific code. The specifics of the
 * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
 * although these should not be included directly (use this file instead).
 */

module htslib.cram;

import core.stdc.stdarg;
import core.sys.posix.sys.types;

import htslib.hts;
import htslib.sam;

extern (C):

// Method identifiers for a CRAM block (see the per-member notes below).
enum cram_block_method
{
    BM_ERROR = -1,
    RAW = 0,
    GZIP = 1,
    BZIP2 = 2,
    LZMA = 3,
    RANS = 4, // Generic; either order
    RANS0 = 4, // Same numeric value as RANS
    RANS1 = 10, // Not externalised; stored as RANS (generic)
    GZIP_RLE = 11 // NB: not externalised in CRAM
}

// Content type identifiers for a CRAM block.
enum cram_content_type
{
    CT_ERROR = -1,
    FILE_HEADER = 0,
    COMPRESSION_HEADER = 1,
    MAPPED_SLICE = 2,
    UNMAPPED_SLICE = 3, // CRAM V1.0 only
    EXTERNAL = 4,
    CORE = 5
}

// Opaque data types, see cram_structs for the fully fledged versions.
struct cram_file_def;
struct cram_fd;
struct cram_container;
struct cram_block;
struct cram_slice;
struct cram_metrics;
struct cram_block_slice_hdr;
struct cram_block_compression_hdr;
struct refs_t;

// NOTE(review): declared here as an opaque type local to this module. If
// another binding module also declares hFILE, D treats the two as distinct
// types -- confirm whether this should be an import instead.
struct hFILE;

// Accessor functions

/*
 *-----------------------------------------------------------------------------
 * cram_fd
 */
sam_hdr_t* cram_fd_get_header(cram_fd* fd);

void cram_fd_set_header(cram_fd* fd, sam_hdr_t* hdr);

int cram_fd_get_version(cram_fd* fd);

void cram_fd_set_version(cram_fd* fd, int vers);

int cram_major_vers(cram_fd* fd);
int cram_minor_vers(cram_fd* fd);

hFILE* cram_fd_get_fp(cram_fd* fd);
void cram_fd_set_fp(cram_fd* fd, hFILE* fp);

/*
 *-----------------------------------------------------------------------------
 * cram_container
 */
int cram_container_get_length(cram_container* c);
void cram_container_set_length(cram_container* c, int length);
int cram_container_get_num_blocks(cram_container* c);
void cram_container_set_num_blocks(cram_container* c, int num_blocks);
int* cram_container_get_landmarks(cram_container* c, int* num_landmarks);
void cram_container_set_landmarks(
    cram_container* c,
    int num_landmarks,
    int* landmarks);

/* Returns true if the container is empty (EOF marker) */
int cram_container_is_empty(cram_fd* fd);

/*
 *-----------------------------------------------------------------------------
 * cram_block
 */
int cram_block_get_content_id(cram_block* b);
int cram_block_get_comp_size(cram_block* b);
int cram_block_get_uncomp_size(cram_block* b);
int cram_block_get_crc32(cram_block* b);
void* cram_block_get_data(cram_block* b);

cram_content_type cram_block_get_content_type(cram_block* b);

void cram_block_set_content_id(cram_block* b, int id);
void cram_block_set_comp_size(cram_block* b, int size);
void cram_block_set_uncomp_size(cram_block* b, int size);
void cram_block_set_crc32(cram_block* b, int crc);
void cram_block_set_data(cram_block* b, void* data);

int cram_block_append(cram_block* b, const(void)* data, int size);
void cram_block_update_size(cram_block* b);

// Offset is known as "size" internally, but it can be confusing.
size_t cram_block_get_offset(cram_block* b);
void cram_block_set_offset(cram_block* b, size_t offset);

/*
 * Computes the size of a cram block, including the block
 * header itself.
 */
uint cram_block_size(cram_block* b);

/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(
    cram_fd* in_,
    cram_fd* out_,
    cram_container* c,
    int nrg,
    int* in_rg,
    int* out_rg);

/*
 * Copies the blocks representing the next num_slice slices from a
 * container from 'in' to 'out'.  It is expected that the file pointer
 * is just after the read of the cram_container and cram compression
 * header.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_copy_slice(cram_fd* in_, cram_fd* out_, int num_slice);

/*
 *-----------------------------------------------------------------------------
 * cram_io basics
 */

/**@{ ----------------------------------------------------------------------
 * CRAM blocks - the dynamically growable data block. We have code to
 * create, update, (un)compress and read/write.
 *
 * These are derived from the deflate_interlaced.c blocks, but with the
 * CRAM extension of content types and IDs.
 */

/*! Allocates a new cram_block structure with a specified content_type and
 * id.
 *
 * @return
 * Returns block pointer on success;
 *         NULL on failure
 *
 * The cram_block struct returned by a successful call should be freed
 * via cram_free_block() when it is no longer needed.
 */
cram_block* cram_new_block(cram_content_type content_type, int content_id);

/*! Reads a block from a cram file.
 *
 * @return
 * Returns cram_block pointer on success;
 *         NULL on failure
 *
 * The cram_block struct returned by a successful call should be freed
 * via cram_free_block() when it is no longer needed.
 */
cram_block* cram_read_block(cram_fd* fd);

/*! Writes a CRAM block.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_block(cram_fd* fd, cram_block* b);

/*! Frees a CRAM block, deallocating internal data too.
 */
void cram_free_block(cram_block* b);

/*! Uncompresses a CRAM block, if compressed.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_uncompress_block(cram_block* b);

/*! Compresses a block.
 *
 * Compresses a block using one of two different zlib strategies. If we only
 * want one choice set strat2 to be -1.
 *
 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
 * significantly faster.
 *
 * NOTE(review): the text above refers to "strat2", but this signature only
 * takes method and level -- the description looks stale; confirm against
 * the implementation.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_compress_block(
    cram_fd* fd,
    cram_block* b,
    cram_metrics* metrics,
    int method,
    int level);

/**@}*/
/**@{ ----------------------------------------------------------------------
 * Containers
 */

/*! Creates a new container, specifying the maximum number of slices
 * and records permitted.
 *
 * @return
 * Returns cram_container ptr on success;
 *         NULL on failure
 *
 * The cram_container struct returned by a successful call should be freed
 * via cram_free_container() when it is no longer needed.
 */
cram_container* cram_new_container(int nrec, int nslice);
void cram_free_container(cram_container* c);

/*! Reads a container header.
 *
 * @return
 * Returns cram_container on success;
 *         NULL on failure or no container left (fd->err == 0).
 *
 * The cram_container struct returned by a successful call should be freed
 * via cram_free_container() when it is no longer needed.
 */
cram_container* cram_read_container(cram_fd* fd);

/*! Writes a container structure.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_container(cram_fd* fd, cram_container* h);

/*
 * Stores the container structure in dat and returns *size as the
 * number of bytes written to dat[].  The input size of dat is also
 * held in *size and should be initialised to cram_container_size(c).
 *
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_store_container(cram_fd* fd, cram_container* c, char* dat, int* size);

int cram_container_size(cram_container* c);

/**@}*/
/**@{ ----------------------------------------------------------------------
 * The top-level cram opening, closing and option handling
 */

/*! Opens a CRAM file for read (mode "rb") or write ("wb").
 *
 * The filename may be "-" to indicate stdin or stdout.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_open(const(char)* filename, const(char)* mode);

/*! Opens an existing stream for reading or writing.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_dopen(hFILE* fp, const(char)* filename, const(char)* mode);

/*! Closes a CRAM file.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_close(cram_fd* fd);

/*
 * Seek within a CRAM file.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek(cram_fd* fd, off_t offset, int whence);

/*
 * Flushes a CRAM file.
 * Useful for when writing to stdout without wishing to close the stream.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush(cram_fd* fd);

/*! Checks for end of file on a cram_fd stream.
 *
 * @return
 * Returns 0 if not at end of file
 *         1 if we hit an expected EOF (end of range or EOF block)
 *         2 for other EOF (end of stream without EOF block)
 */
int cram_eof(cram_fd* fd);

/*! Sets options on the cram_fd.
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_option(cram_fd* fd, hts_fmt_option opt, ...);

/*! Sets options on the cram_fd (variant taking the arguments as a va_list).
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_voption(cram_fd* fd, hts_fmt_option opt, va_list args);

/*!
 * Attaches a header to a cram_fd.
 *
 * This should be used when creating a new cram_fd for writing where
 * we have a SAM_hdr already constructed (eg from a file we've read
 * in).
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_header(cram_fd* fd, sam_hdr_t* hdr);

/*! Check if this file has a proper EOF block
 *
 * @return
 * Returns 3 if the file is a version of CRAM that does not contain EOF blocks
 *         2 if the file is a stream and thus unseekable
 *         1 if the file contains an EOF block
 *         0 if the file does not contain an EOF block
 *        -1 if an error occurred whilst reading the file or we could not seek back to where we were
 *
 */
int cram_check_EOF(cram_fd* fd);

/* As int32_decode/encode, but from/to blocks instead of cram_fd */
int int32_put_blk(cram_block* b, int val);

/**@}*/
/**@{ -------------------------------------------------------------------
 * Old typedef and function names for compatibility with existing code.
 * Header functionality is now provided by sam.h's sam_hdr_t functions.
 */

alias SAM_hdr = sam_hdr_t;

/*! Tokenises a SAM header into a hash table.
 *
 * Also extracts a few bits on specific data types, such as @RG lines.
 *
 * Compatibility wrapper: forwards to sam_hdr_parse() with the argument
 * order swapped.  It carries a body here because the C header provides it
 * as a static inline function, so the library exports no symbol of this
 * name to link against.
 *
 * @return
 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
 *         NULL on failure
 */
pragma(inline, true)
SAM_hdr* sam_hdr_parse_(const(char)* hdr, size_t len)
{
    return sam_hdr_parse(len, hdr);
}

/*! Deallocates all storage used by a SAM_hdr struct.
 *
 * This also decrements the header reference count. If after decrementing
 * it is still non-zero then the header is assumed to be in use by another
 * caller and the free is not done.
 *
 * Compatibility wrapper: forwards to sam_hdr_destroy().  Defined with a
 * body for the same reason as sam_hdr_parse_() (static inline in C).
 */
pragma(inline, true)
void sam_hdr_free(SAM_hdr* hdr)
{
    sam_hdr_destroy(hdr);
}

/* sam_hdr_length() and sam_hdr_str() are now provided by sam.h. */

/*! Add an @PG line.
 *
 * If we wish complete control over this use sam_hdr_add_line() directly. This
 * function uses that, but attempts to do a lot of tedious house work for
 * you too.
 *
 * - It will generate a suitable ID if the supplied one clashes.
 * - It will generate multiple @PG records if we have multiple PG chains.
 *
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
 * in NULL.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
alias sam_hdr_add_PG = sam_hdr_add_pg;

/**@{ -------------------------------------------------------------------*/

/*!
 * Returns the refs_t structure used by a cram file handle.
 *
 * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
 * share reference memory between multiple file handles.
 *
 * @return
 * Returns NULL if none exists or the file handle is not a CRAM file.
 */
refs_t* cram_get_refs(htsFile* fd);

/**@}*/