/// @file htslib/cram.h
/// CRAM format-specific API functions.
/*
    Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd.

    Author: James Bonfield <jkb@sanger.ac.uk>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */

/** @file
 * Consider using the higher level hts_*() API for programs that wish to
 * be file format agnostic (see htslib/hts.h).
 *
 * This API should be used for CRAM specific code. The specifics of the
 * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h
 * although these should not be included directly (use this file instead).
 */

module htslib.cram;

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.stdarg : va_list;

import htslib.sam;
import htslib.hts;
import htslib.hfile : hFILE;

@system:
nothrow:
@nogc:

extern (C):

// see cram/cram_structs.h for a more complete internal copy of this enum

// Htslib 1.11 had these listed without any hts prefix, and included
// some internal values such as RANS1 and GZIP_RLE (which shouldn't have ever
// been public).
//
// We can't find evidence of these being used and the data type occurs
// nowhere in functions or structures meaning using it would be pointless.
// However for safety, if you absolutely need the API to not change then
// define HTS_COMPAT to 101100 (XYYYZZ for X.Y[.Z], meaning 1.11).

// Public methods as defined in the CRAM spec.

// CRAM 2.x and 3.0

// NB: the subsequent numbers may change.  They're simply here for
// compatibility with the old API, but may have no bearing on the
// internal way htslib works.  DO NOT USE

//#include <sys/types.h>
// off_t and ssize_t are signed types in C.  Aliasing them to the unsigned
// size_t makes a negative seek offset (or a -1 result) appear as a huge
// positive value on the D side, so bind them to signed types instead.
version (Posix)
{
    // Use druntime's own definitions so the aliases name the same symbols
    // as any other module that imports core.sys.posix.sys.types.
    static import core.sys.posix.sys.types;

    alias off_t = core.sys.posix.sys.types.off_t;
    alias ssize_t = core.sys.posix.sys.types.ssize_t;
}
else
{
    // No POSIX <sys/types.h> binding is available here: keep the
    // pointer-sized width this module used previously, but signed.
    // NOTE(review): confirm the width of off_t against the C toolchain
    // used to build htslib on this platform.
    alias off_t = ptrdiff_t;
    alias ssize_t = ptrdiff_t;
}

/// Block compression methods.  RANS and RANS0 deliberately share value 4.
enum cram_block_method
{
    BM_ERROR = -1,
    RAW = 0,
    GZIP = 1,
    BZIP2 = 2,
    LZMA = 3,
    RANS = 4, // Generic; either order
    RANS0 = 4,
    RANS1 = 10, // Not externalised; stored as RANS (generic)
    GZIP_RLE = 11 // NB: not externalised in CRAM
}

/// Content type tag carried by a CRAM block.
enum cram_content_type
{
    CT_ERROR = -1,
    FILE_HEADER = 0,
    COMPRESSION_HEADER = 1,
    MAPPED_SLICE = 2,
    UNMAPPED_SLICE = 3, // CRAM V1.0 only
    EXTERNAL = 4,
    CORE = 5
}

// Opaque data types, see cram_structs for the fully fledged versions.
// Forward declarations only: no fields are visible from D, so these types
// can only be handled through pointers returned by the functions below.
struct cram_file_def;
struct cram_fd;
struct cram_container;
struct cram_block;
struct cram_slice;
struct cram_metrics;
struct cram_block_slice_hdr;
struct cram_block_compression_hdr;
struct refs_t;

// Accessor functions

/*
 *-----------------------------------------------------------------------------
 * cram_fd
 */
sam_hdr_t* cram_fd_get_header(cram_fd* fd);

void cram_fd_set_header(cram_fd* fd, sam_hdr_t* hdr);

int cram_fd_get_version(cram_fd* fd);

void cram_fd_set_version(cram_fd* fd, int vers);

int cram_major_vers(cram_fd* fd);
int cram_minor_vers(cram_fd* fd);

hFILE* cram_fd_get_fp(cram_fd* fd);
void cram_fd_set_fp(cram_fd* fd, hFILE* fp);

/*
 *-----------------------------------------------------------------------------
 * cram_container
 */
int cram_container_get_length(cram_container* c);
void cram_container_set_length(cram_container* c, int length);
int cram_container_get_num_blocks(cram_container* c);
void cram_container_set_num_blocks(cram_container* c, int num_blocks);
int* cram_container_get_landmarks(cram_container* c, int* num_landmarks);
void cram_container_set_landmarks(
    cram_container* c,
    int num_landmarks,
    int* landmarks);

/* Returns true if the container is empty (EOF marker).
 *
 * Note that this takes the file handle (cram_fd*), not a cram_container*.
 */
int cram_container_is_empty(cram_fd* fd);

/*
 *-----------------------------------------------------------------------------
 * cram_block
 */
int cram_block_get_content_id(cram_block* b);
int cram_block_get_comp_size(cram_block* b);
int cram_block_get_uncomp_size(cram_block* b);
int cram_block_get_crc32(cram_block* b);
void* cram_block_get_data(cram_block* b);

cram_content_type cram_block_get_content_type(cram_block* b);

void cram_block_set_content_id(cram_block* b, int id);
void cram_block_set_comp_size(cram_block* b, int size);
void cram_block_set_uncomp_size(cram_block* b, int size);
void cram_block_set_crc32(cram_block* b, int crc);
void cram_block_set_data(cram_block* b, void* data);

int cram_block_append(cram_block* b, const(void)* data, int size);
void cram_block_update_size(cram_block* b);

// Offset is known as "size" internally, but it can be confusing.
size_t cram_block_get_offset(cram_block* b);
void cram_block_set_offset(cram_block* b, size_t offset);

/*
 * Computes the size of a cram block, including the block
 * header itself.
 */
uint cram_block_size(cram_block* b);

/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * The first two parameters are spelled in_ and out_ because `in` and
 * `out` are reserved words in D.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(
    cram_fd* in_,
    cram_fd* out_,
    cram_container* c,
    int nrg,
    int* in_rg,
    int* out_rg);

/*
 * Copies the blocks representing the next num_slice slices from a
 * container from 'in_' to 'out_'.  It is expected that the file pointer
 * is just after the read of the cram_container and cram compression
 * header.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_copy_slice(cram_fd* in_, cram_fd* out_, int num_slice);

/*
 *-----------------------------------------------------------------------------
 * cram slice interrogation
 */

/*
 * Returns the number of cram blocks within this slice.
 */
int cram_slice_hdr_get_num_blocks(cram_block_slice_hdr* hdr);

/*
 * Returns the block content_id for the block containing an embedded reference
 * sequence.  If none is present, -1 is returned.
 */
int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr* h);

/*
 * Returns slice reference ID, start and span (length) coordinates.
 * Return parameters may be NULL in which case they are ignored.
 */
void cram_slice_hdr_get_coords(
    cram_block_slice_hdr* h,
    int* refid,
    hts_pos_t* start,
    hts_pos_t* span);

/*
 * Decodes a slice header from a cram block.
 * Returns the opaque cram_block_slice_hdr pointer on success,
 *         NULL on failure.
 */
cram_block_slice_hdr* cram_decode_slice_header(cram_fd* fd, cram_block* b);

/*
 * Frees a cram_block_slice_hdr structure.
 */
void cram_free_slice_header(cram_block_slice_hdr* hdr);

/*
 *-----------------------------------------------------------------------------
 * cram_io basics
 */

/**@{ ----------------------------------------------------------------------
 * CRAM blocks - the dynamically growable data block. We have code to
 * create, update, (un)compress and read/write.
 *
 * These are derived from the deflate_interlaced.c blocks, but with the
 * CRAM extension of content types and IDs.
 */

/*! Allocates a new cram_block structure with a specified content_type and
 * id.
 *
 * @return
 * Returns block pointer on success;
 *         NULL on failure
 *
 * The cram_block struct returned by a successful call should be freed
 * via cram_free_block() when it is no longer needed.
 */
cram_block* cram_new_block(cram_content_type content_type, int content_id);

/*! Reads a block from a cram file.
 *
 * @return
 * Returns cram_block pointer on success;
 *         NULL on failure
 *
 * The cram_block struct returned by a successful call should be freed
 * via cram_free_block() when it is no longer needed.
 */
cram_block* cram_read_block(cram_fd* fd);

/*! Writes a CRAM block.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_block(cram_fd* fd, cram_block* b);

/*! Frees a CRAM block, deallocating internal data too.
 */
void cram_free_block(cram_block* b);

/*! Uncompresses a CRAM block, if compressed.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_uncompress_block(cram_block* b);

/*! Compresses a block.
 *
 * Compresses a block using one of two different zlib strategies. If we only
 * want one choice set strat2 to be -1.
 *
 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
 * significantly faster.
 *
 * NOTE(review): neither prototype below has a `strat2` parameter (they take
 * `method` and `level`), so the paragraph above appears to describe an
 * older signature - confirm against the C implementation.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_compress_block(
    cram_fd* fd,
    cram_block* b,
    cram_metrics* metrics,
    int method,
    int level);
int cram_compress_block2(
    cram_fd* fd,
    cram_slice* s,
    cram_block* b,
    cram_metrics* metrics,
    int method,
    int level);

/**@}*/
/**@{ ----------------------------------------------------------------------
 * Containers
 */

/*! Creates a new container, specifying the maximum number of slices
 * and records permitted.
 *
 * @return
 * Returns cram_container ptr on success;
 *         NULL on failure
 *
 * The cram_container struct returned by a successful call should be freed
 * via cram_free_container() when it is no longer needed.
 */
cram_container* cram_new_container(int nrec, int nslice);
void cram_free_container(cram_container* c);

/*! Reads a container header.
 *
 * @return
 * Returns cram_container on success;
 *         NULL on failure or no container left (fd->err == 0).
 *
 * The cram_container struct returned by a successful call should be freed
 * via cram_free_container() when it is no longer needed.
 */
cram_container* cram_read_container(cram_fd* fd);

/*! Writes a container structure.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_container(cram_fd* fd, cram_container* h);

/*
 * Stores the container structure in dat and returns *size as the
 * number of bytes written to dat[].  The input size of dat is also
 * held in *size and should be initialised to cram_container_size(c).
 *
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_store_container(cram_fd* fd, cram_container* c, char* dat, int* size);

int cram_container_size(cram_container* c);

/**@}*/
/**@{ ----------------------------------------------------------------------
 * The top-level cram opening, closing and option handling
 */

/*! Opens a CRAM file for read (mode "rb") or write ("wb").
 *
 * The filename may be "-" to indicate stdin or stdout.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_open(const(char)* filename, const(char)* mode);

/*! Opens an existing stream for reading or writing.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_dopen(hFILE* fp, const(char)* filename, const(char)* mode);

/*! Closes a CRAM file.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_close(cram_fd* fd);

/*
 * Seek within a CRAM file.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek(cram_fd* fd, off_t offset, int whence);

/*
 * Flushes a CRAM file.
 * Useful for when writing to stdout without wishing to close the stream.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush(cram_fd* fd);

/*! Checks for end of file on a cram_fd stream.
 *
 * @return
 * Returns 0 if not at end of file
 *         1 if we hit an expected EOF (end of range or EOF block)
 *         2 for other EOF (end of stream without EOF block)
 */
int cram_eof(cram_fd* fd);

/*! Sets options on the cram_fd.
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * The option's value(s) are passed as C variadic arguments after opt.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_option(cram_fd* fd, hts_fmt_option opt, ...);

/*! Sets options on the cram_fd.
 *
 * Same as cram_set_option(), but the option's value(s) are supplied
 * through an already-started va_list instead of variadic arguments.
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_voption(cram_fd* fd, hts_fmt_option opt, va_list args);

/*!
 * Attaches a header to a cram_fd.
 *
 * This should be used when creating a new cram_fd for writing where
 * we have a SAM_hdr already constructed (eg from a file we've read
 * in).
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_header(cram_fd* fd, sam_hdr_t* hdr);

/*! Check if this file has a proper EOF block
 *
 * @return
 * Returns 3 if the file is a version of CRAM that does not contain EOF blocks
 *         2 if the file is a stream and thus unseekable
 *         1 if the file contains an EOF block
 *         0 if the file does not contain an EOF block
 *        -1 if an error occurred whilst reading the file or we could not seek back to where we were
 *
 */
int cram_check_EOF(cram_fd* fd);

/* As int32_decode/encode, but from/to blocks instead of cram_fd */
int int32_put_blk(cram_block* b, int val);

/**@}*/
/**@{ -------------------------------------------------------------------
 * Old typedef and function names for compatibility with existing code.
 * Header functionality is now provided by sam.h's sam_hdr_t functions.
 */

alias SAM_hdr = sam_hdr_t;

/*! Tokenises a SAM header into a hash table.
 *
 * Also extracts a few bits on specific data types, such as @RG lines.
 *
 * Compatibility wrapper: forwards to sam_hdr_parse(), which takes the
 * length first and the text second (the reverse of this function).
 *
 * @return
 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
 *         NULL on failure
 */
pragma(inline, true)
SAM_hdr* sam_hdr_parse_(const (char)* hdr, size_t len) { return sam_hdr_parse(len, hdr); }

/*! Deallocates all storage used by a SAM_hdr struct.
 *
 * This also decrements the header reference count. If after decrementing
 * it is still non-zero then the header is assumed to be in use by another
 * caller and the free is not done.
 *
 * Compatibility wrapper: forwards to sam_hdr_destroy().
 */
pragma(inline, true)
void sam_hdr_free(SAM_hdr* hdr) { sam_hdr_destroy(hdr); }

/* sam_hdr_length() and sam_hdr_str() are now provided by sam.h. */

/*! Add an @PG line.
 *
 * If we wish complete control over this use sam_hdr_add_line() directly. This
 * function uses that, but attempts to do a lot of tedious house work for
 * you too.
 *
 * - It will generate a suitable ID if the supplied one clashes.
 * - It will generate multiple @PG records if we have multiple PG chains.
 *
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
 * in NULL.
 *
 * This is the old upper-case spelling, kept as an alias of sam_hdr_add_pg.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
alias sam_hdr_add_PG = sam_hdr_add_pg;

/**@{ -------------------------------------------------------------------*/

/*!
 * Returns the refs_t structure used by a cram file handle.
 *
 * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
 * share reference memory between multiple file handles.
 *
 * @return
 * Returns NULL if none exists or the file handle is not a CRAM file.
 */
refs_t* cram_get_refs(htsFile* fd);

/**@}*/
