1 /// @file htslib/cram.h 2 /// CRAM format-specific API functions. 3 /* 4 Copyright (C) 2015, 2016, 2018-2020 Genome Research Ltd. 5 6 Author: James Bonfield <jkb@sanger.ac.uk> 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy 9 of this software and associated documentation files (the "Software"), to deal 10 in the Software without restriction, including without limitation the rights 11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 copies of the Software, and to permit persons to whom the Software is 13 furnished to do so, subject to the following conditions: 14 15 The above copyright notice and this permission notice shall be included in 16 all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 DEALINGS IN THE SOFTWARE. */ 25 26 /** @file 27 * Consider using the higher level hts_*() API for programs that wish to 28 * be file format agnostic (see htslib/hts.h). 29 * 30 * This API should be used for CRAM specific code. The specifics of the 31 * public API are implemented in cram_io.h, cram_encode.h and cram_decode.h 32 * although these should not be included directly (use this file instead). 
*/

module htslib.cram;

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.stdarg : va_list;

import htslib.sam;
import htslib.hts;
import htslib.hfile : hFILE;

@system:
nothrow:
@nogc:

extern (C):

// see cram/cram_structs.h for an internal more complete copy of this enum

// Htslib 1.11 had these listed without any hts prefix, and included
// some internal values such as RANS1 and GZIP_RLE (which shouldn't have ever
// been public).
//
// We can't find evidence of these being used and the data type occurs
// nowhere in functions or structures meaning using it would be pointless.
// However for safety, if you absolutely need the API to not change then
// define HTS_COMPAT to 101100 (XYYYZZ for X.Y[.Z], meaning 1.11).

// Public methods as defined in the CRAM spec.

// CRAM 2.x and 3.0

// NB: the subsequent numbers may change.  They're simply here for
// compatibility with the old API, but may have no bearing on the
// internal way htslib works.  DO NOT USE

//#include <sys/types.h>
// off_t and ssize_t are signed types in C.  Aliasing them to the unsigned
// size_t forces callers to cast negative offsets and can disagree with the
// platform's off_t width, so take the real definitions from druntime where
// they are available.
version (Posix)
{
    public import core.sys.posix.sys.types : off_t, ssize_t;
}
else
{
    // Signed types of the same width as the size_t these aliases used before.
    // NOTE(review): confirm this matches the off_t the C library was built
    // with on this platform.
    alias off_t = ptrdiff_t;
    alias ssize_t = ptrdiff_t;
}

/// Block compression methods (see the notes above: kept for compatibility
/// with the old API only).
enum cram_block_method
{
    BM_ERROR = -1,
    RAW = 0,
    GZIP = 1,
    BZIP2 = 2,
    LZMA = 3,
    RANS = 4, // Generic; either order
    RANS0 = 4,
    RANS1 = 10, // Not externalised; stored as RANS (generic)
    GZIP_RLE = 11 // NB: not externalised in CRAM
}

/// Content types a CRAM block can carry.
enum cram_content_type
{
    CT_ERROR = -1,
    FILE_HEADER = 0,
    COMPRESSION_HEADER = 1,
    MAPPED_SLICE = 2,
    UNMAPPED_SLICE = 3, // CRAM V1.0 only
    EXTERNAL = 4,
    CORE = 5
}

// Opaque data types, see cram_structs for the fully fledged versions.
struct cram_file_def;
struct cram_fd;
struct cram_container;
struct cram_block;
struct cram_slice;
struct cram_metrics;
struct cram_block_slice_hdr;
struct cram_block_compression_hdr;
struct refs_t;

// Accessor functions

/*
 *-----------------------------------------------------------------------------
 * cram_fd: header, version and underlying hFILE accessors
 */
sam_hdr_t* cram_fd_get_header(cram_fd* fd);
void cram_fd_set_header(cram_fd* fd, sam_hdr_t* hdr);

int cram_fd_get_version(cram_fd* fd);
void cram_fd_set_version(cram_fd* fd, int vers);

int cram_major_vers(cram_fd* fd);
int cram_minor_vers(cram_fd* fd);

hFILE* cram_fd_get_fp(cram_fd* fd);
void cram_fd_set_fp(cram_fd* fd, hFILE* fp);

/*
 *-----------------------------------------------------------------------------
 * cram_container: length, block count and landmark accessors
 */
int cram_container_get_length(cram_container* c);
void cram_container_set_length(cram_container* c, int length);

int cram_container_get_num_blocks(cram_container* c);
void cram_container_set_num_blocks(cram_container* c, int num_blocks);

int* cram_container_get_landmarks(cram_container* c, int* num_landmarks);
void cram_container_set_landmarks(cram_container* c, int num_landmarks, int* landmarks);

/* Returns true if the container is empty (EOF marker) */
int cram_container_is_empty(cram_fd* fd);

/*
 *-----------------------------------------------------------------------------
 * cram_block: getters first, then the matching setters
 */
int cram_block_get_content_id(cram_block* b);
int cram_block_get_comp_size(cram_block* b);
int cram_block_get_uncomp_size(cram_block* b);
int cram_block_get_crc32(cram_block* b);
void* cram_block_get_data(cram_block* b);

cram_content_type cram_block_get_content_type(cram_block* b);

void cram_block_set_content_id(cram_block* b, int id);
void cram_block_set_comp_size(cram_block* b, int size);
void cram_block_set_uncomp_size(cram_block* b, int size);
void cram_block_set_crc32(cram_block* b, int crc);
void cram_block_set_data(cram_block* b, void* data);

int cram_block_append(cram_block* b, const(void)* data, int size);
void cram_block_update_size(cram_block* b);

// Offset is known as "size" internally, but it can be confusing.
size_t cram_block_get_offset(cram_block* b);
void cram_block_set_offset(cram_block* b, size_t offset);

/*
 * Computes the size of a cram block, including the block
 * header itself.
 */
uint cram_block_size(cram_block* b);

/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.
An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(
    cram_fd* in_, cram_fd* out_,
    cram_container* c,
    int nrg, int* in_rg, int* out_rg);

/*
 * Copies the blocks representing the next num_slice slices from a
 * container from 'in_' to 'out_'.  The file pointer is expected to sit
 * just after the read of the cram_container and cram compression
 * header.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_copy_slice(cram_fd* in_, cram_fd* out_, int num_slice);

/*
 *-----------------------------------------------------------------------------
 * cram_io basics
 */

/**@{ ----------------------------------------------------------------------
 * CRAM blocks - the dynamically growable data block. We have code to
 * create, update, (un)compress and read/write.
 *
 * These are derived from the deflate_interlaced.c blocks, but with the
 * CRAM extension of content types and IDs.
 */

/*! Allocates a new cram_block structure with a specified content_type and
 * id.
 *
 * @return
 * Returns block pointer on success;
 *         NULL on failure
 *
 * A block returned by a successful call should be released with
 * cram_free_block() once it is no longer needed.
 */
cram_block* cram_new_block(cram_content_type content_type, int content_id);

/*! Reads a block from a cram file.
 *
 * @return
 * Returns cram_block pointer on success;
 *         NULL on failure
 *
 * A block returned by a successful call should be released with
 * cram_free_block() once it is no longer needed.
 */
cram_block* cram_read_block(cram_fd* fd);

/*! Writes a CRAM block.
*
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_block(cram_fd* fd, cram_block* b);

/*! Frees a CRAM block, deallocating internal data too.
 */
void cram_free_block(cram_block* b);

/*! Uncompresses a CRAM block, if compressed.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_uncompress_block(cram_block* b);

/*! Compresses a block.
 *
 * Compresses a block using one of two different zlib strategies. If we only
 * want one choice set strat2 to be -1.
 *
 * The logic here is that sometimes Z_RLE does a better job than Z_FILTERED
 * or Z_DEFAULT_STRATEGY on quality data. If so, we'd rather use it as it is
 * significantly faster.
 *
 * NOTE(review): neither prototype below declares a strat2 parameter; the
 * strategy description may predate the current signatures - confirm against
 * the C header.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_compress_block(cram_fd* fd, cram_block* b, cram_metrics* metrics, int method, int level);
int cram_compress_block2(
    cram_fd* fd, cram_slice* s, cram_block* b,
    cram_metrics* metrics, int method, int level);

/**@}*/
/**@{ ----------------------------------------------------------------------
 * Containers
 */

/*! Creates a new container, specifying the maximum number of slices
 * and records permitted.
 *
 * @return
 * Returns cram_container ptr on success;
 *         NULL on failure
 *
 * A container returned by a successful call should be released with
 * cram_free_container() once it is no longer needed.
 */
cram_container* cram_new_container(int nrec, int nslice);
void cram_free_container(cram_container* c);

/*! Reads a container header.
 *
 * @return
 * Returns cram_container on success;
 *         NULL on failure or no container left (fd->err == 0).
 *
 * The cram_container struct returned by a successful call should be freed
 * via cram_free_container() when it is no longer needed.
*/
cram_container* cram_read_container(cram_fd* fd);

/*! Writes a container structure.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_write_container(cram_fd* fd, cram_container* h);

/*
 * Stores the container structure in dat and returns *size as the
 * number of bytes written to dat[].  On entry *size holds the capacity
 * of dat and should be initialised to cram_container_size(c).
 *
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_store_container(cram_fd* fd, cram_container* c, char* dat, int* size);

int cram_container_size(cram_container* c);

/**@}*/
/**@{ ----------------------------------------------------------------------
 * The top-level cram opening, closing and option handling
 */

/*! Opens a CRAM file for read (mode "rb") or write ("wb").
 *
 * The filename may be "-" to indicate stdin or stdout.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_open(const(char)* filename, const(char)* mode);

/*! Opens an existing stream for reading or writing.
 *
 * @return
 * Returns file handle on success;
 *         NULL on failure.
 */
cram_fd* cram_dopen(hFILE* fp, const(char)* filename, const(char)* mode);

/*! Closes a CRAM file.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_close(cram_fd* fd);

/*
 * Seek within a CRAM file.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek(cram_fd* fd, off_t offset, int whence);

/*
 * Flushes a CRAM file.
 * Useful for when writing to stdout without wishing to close the stream.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_flush(cram_fd* fd);

/*! Checks for end of file on a cram_fd stream.
*
 * @return
 * Returns 0 if not at end of file
 *         1 if we hit an expected EOF (end of range or EOF block)
 *         2 for other EOF (end of stream without EOF block)
 */
int cram_eof(cram_fd* fd);

/*! Sets options on the cram_fd (variadic form).
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_option(cram_fd* fd, hts_fmt_option opt, ...);

/*! Sets options on the cram_fd (va_list form).
 *
 * See CRAM_OPT_* definitions in hts.h.
 * Use this immediately after opening.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_voption(cram_fd* fd, hts_fmt_option opt, va_list args);

/*!
 * Attaches a header to a cram_fd.
 *
 * This should be used when creating a new cram_fd for writing where
 * we have an SAM_hdr already constructed (eg from a file we've read
 * in).
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
int cram_set_header(cram_fd* fd, sam_hdr_t* hdr);

/*! Check if this file has a proper EOF block
 *
 * @return
 * Returns 3 if the file is a version of CRAM that does not contain EOF blocks
 *         2 if the file is a stream and thus unseekable
 *         1 if the file contains an EOF block
 *         0 if the file does not contain an EOF block
 *        -1 if an error occurred whilst reading the file or we could not seek back to where we were
 *
 */
int cram_check_EOF(cram_fd* fd);

/* As int32_decoded/encode, but reading from / writing to blocks rather than a cram_fd */
int int32_put_blk(cram_block* b, int val);

/**@}*/
/**@{ -------------------------------------------------------------------
 * Old typedef and function names for compatibility with existing code.
 * Header functionality is now provided by sam.h's sam_hdr_t functions.
 */

alias SAM_hdr = sam_hdr_t;

/*!
Tokenises a SAM header into a hash table.
 *
 * Also extracts a few bits on specific data types, such as @RG lines.
 *
 * Thin compatibility wrapper: forwards to sam_hdr_parse(), which takes the
 * length first and the text second.
 *
 * @return
 * Returns a SAM_hdr struct on success (free with sam_hdr_free());
 *         NULL on failure
 */
pragma(inline, true)
SAM_hdr* sam_hdr_parse_(const(char)* hdr, size_t len)
{
    SAM_hdr* parsed = sam_hdr_parse(len, hdr);
    return parsed;
}

/*! Deallocates all storage used by a SAM_hdr struct.
 *
 * This also decrements the header reference count. If after decrementing
 * it is still non-zero then the header is assumed to be in use by another
 * caller and the free is not done.
 *
 * Compatibility wrapper around sam_hdr_destroy().
 */
pragma(inline, true)
void sam_hdr_free(SAM_hdr* hdr)
{
    sam_hdr_destroy(hdr);
}

/* sam_hdr_length() and sam_hdr_str() are now provided by sam.h. */

/*! Add an @PG line.
 *
 * If we wish complete control over this use sam_hdr_add_line() directly. This
 * function uses that, but attempts to do a lot of tedious house work for
 * you too.
 *
 * - It will generate a suitable ID if the supplied one clashes.
 * - It will generate multiple @PG records if we have multiple PG chains.
 *
 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending
 * in NULL.
 *
 * @return
 * Returns 0 on success;
 *        -1 on failure
 */
alias sam_hdr_add_PG = sam_hdr_add_pg;

/**@{ -------------------------------------------------------------------*/

/*!
 * Returns the refs_t structure used by a cram file handle.
 *
 * This may be used in conjunction with option CRAM_OPT_SHARED_REF to
 * share reference memory between multiple file handles.
 *
 * @return
 * Returns NULL if none exists or the file handle is not a CRAM file.
 */
refs_t* cram_get_refs(htsFile* fd);

/**@}*/