1 /// @file htslib/hts.h 2 /// Format-neutral I/O, indexing, and iterator API functions. 3 /* 4 Copyright (C) 2012-2020 Genome Research Ltd. 5 Copyright (C) 2010, 2012 Broad Institute. 6 Portions copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3@live.co.uk> 7 8 Author: Heng Li <lh3@sanger.ac.uk> 9 10 Permission is hereby granted, free of charge, to any person obtaining a copy 11 of this software and associated documentation files (the "Software"), to deal 12 in the Software without restriction, including without limitation the rights 13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 copies of the Software, and to permit persons to whom the Software is 15 furnished to do so, subject to the following conditions: 16 17 The above copyright notice and this permission notice shall be included in 18 all copies or substantial portions of the Software. 19 20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 DEALINGS IN THE SOFTWARE. */ 27 module htslib.hts; 28 29 import core.stdc.config; 30 import core.stdc.inttypes; 31 import core.stdc.limits; 32 import core.stdc.stdint; 33 34 import htslib.bgzf : BGZF; 35 import htslib.cram : cram_fd; 36 import htslib.hfile : hFILE; 37 import htslib.thread_pool : hts_tpool; 38 import htslib.sam : sam_hdr_t; 39 import htslib.kstring : kstring_t; 40 41 @system: 42 nothrow: 43 @nogc: 44 45 extern (C): 46 47 // Separator used to split HTS_PATH (for plugins); REF_PATH (cram references) 48 49 version(Windows) 50 { 51 enum HTS_PATH_SEPARATOR_CHAR = ';'; 52 enum HTS_PATH_SEPARATOR_STR = ";"; 53 } 54 else 55 { 56 enum HTS_PATH_SEPARATOR_CHAR = ':'; 57 enum HTS_PATH_SEPARATOR_STR = ":"; 58 } 59 60 61 /** 62 * @hideinitializer 63 * Deprecated macro to expand a dynamic array of a given type 64 * 65 * @param type_t The type of the array elements 66 * @param[in] n Requested number of elements of type type_t 67 * @param[in,out] m Size of memory allocated 68 * @param[in,out] ptr Pointer to the array 69 * 70 * @discussion 71 * Do not use this macro. Use hts_resize() instead as allows allocation 72 * failures to be handled more gracefully. 73 * 74 * The array *ptr will be expanded if necessary so that it can hold @p n 75 * or more elements. If the array is expanded then the new size will be 76 * written to @p m and the value in @p ptr may change. 77 * 78 * It must be possible to take the address of @p ptr and @p m must be usable 79 * as an lvalue. 80 * 81 * @bug 82 * If the memory allocation fails, this will call exit(1). This is 83 * not ideal behaviour in a library. 84 */ 85 86 /** 87 * @hideinitializer 88 * Macro to expand a dynamic array, zeroing any newly-allocated memory 89 * 90 * @param type_t The type of the array elements 91 * @param[in] n Requested number of elements of type type_t 92 * @param[in,out] m Size of memory allocated 93 * @param[in,out] ptr Pointer to the array 94 * 95 * @discussion 96 * Do not use this macro. Use hts_resize() instead as allows allocation 97 * failures to be handled more gracefully. 98 * 99 * As for hts_expand(), except the bytes that make up the array elements 100 * between the old and new values of @p m are set to zero using memset(). 101 * 102 * @bug 103 * If the memory allocation fails, this will call exit(1). This is 104 * not ideal behaviour in a library. 105 */ 106 107 // For internal use (by hts_resize()) only 108 int hts_resize_array_(size_t, size_t, size_t, void*, void**, int, const(char)*); 109 110 enum HTS_RESIZE_CLEAR = 1; 111 112 /** 113 * @hideinitializer 114 * Macro to expand a dynamic array of a given type 115 * 116 * @param type_t The type of the array elements 117 * @param[in] num Requested number of elements of type type_t 118 * @param[in,out] size_ptr Pointer to where the size (in elements) of the 119 array is stored. 120 * @param[in,out] ptr Location of the pointer to the array 121 * @param[in] flags Option flags 122 * 123 * @return 0 for success, or negative if an error occurred. 124 * 125 * @discussion 126 * The array *ptr will be expanded if necessary so that it can hold @p num 127 * or more elements. If the array is expanded then the new size will be 128 * written to @p *size_ptr and the value in @p *ptr may change. 129 * 130 * If ( @p flags & HTS_RESIZE_CLEAR ) is set, any newly allocated memory will 131 * be cleared. 132 */ 133 134 pragma(inline,true) 135 int hts_resize(T)(size_t num, ref size_t size, T* ptr, int flags) 136 { 137 return (num > size) 138 ? hts_resize_array_(T.sizeof, num, size_t.sizeof, &size, cast(void **)&ptr, flags, __FUNCTION__) 139 : 0; 140 } 141 142 /// Release resources when dlclosing a dynamically loaded HTSlib 143 /** @discussion 144 * Normally HTSlib cleans up automatically when your program exits, 145 * whether that is via exit(3) or returning from main(). However if you 146 * have dlopen(3)ed HTSlib and wish to close it before your main program 147 * exits, you must call hts_lib_shutdown() before dlclose(3). 148 */ 149 void hts_lib_shutdown(); 150 151 /** 152 * Wrapper function for free(). Enables memory deallocation across DLL 153 * boundary. Should be used by all applications, which are compiled 154 * with a different standard library than htslib and call htslib 155 * methods that return dynamically allocated data. 156 */ 157 void hts_free(void* ptr); 158 159 /************ 160 * File I/O * 161 ************/ 162 163 // Add new entries only at the end (but before the *_maximum entry) 164 // of these enums, as their numbering is part of the htslib ABI. 165 166 /// Broad format category (sequence data, variant data, index, regions, etc.) 167 enum htsFormatCategory // @suppress(dscanner.style.phobos_naming_convention) 168 { 169 unknown_category = 0, 170 sequence_data = 1, // Sequence data -- SAM, BAM, CRAM, etc 171 variant_data = 2, // Variant calling data -- VCF, BCF, etc 172 index_file = 3, // Index file associated with some data file 173 region_list = 4, // Coordinate intervals or regions -- BED, etc 174 category_maximum = 32_767 175 } 176 177 /// Specific format (SAM, BAM, CRAM, BCF, VCF, TBI, BED, etc.) 178 enum htsExactFormat // @suppress(dscanner.style.phobos_naming_convention) 179 { 180 unknown_format = 0, 181 binary_format = 1, 182 text_format = 2, 183 sam = 3, 184 bam = 4, 185 bai = 5, 186 cram = 6, 187 crai = 7, 188 vcf = 8, 189 bcf = 9, 190 csi = 10, 191 gzi = 11, 192 tbi = 12, 193 bed = 13, 194 htsget = 14, 195 json = 14, 196 empty_format = 15, // File is empty (or empty after decompression) 197 fasta_format = 16, 198 fastq_format = 17, 199 fai_format = 18, 200 fqi_format = 19, 201 hts_crypt4gh_format = 20, 202 format_maximum = 32_767 203 } 204 205 /// Compression type 206 enum htsCompression // @suppress(dscanner.style.phobos_naming_convention) 207 { 208 no_compression = 0, 209 gzip = 1, 210 bgzf = 2, 211 custom = 3, 212 bzip2_compression = 4, 213 razf_compression = 5, 214 compression_maximum = 32_767 215 } 216 217 struct htsFormat 218 { 219 htsFormatCategory category; /// Broad format category (sequence data, variant data, index, regions, etc.) 220 htsExactFormat format; /// Specific format (SAM, BAM, CRAM, BCF, VCF, TBI, BED, etc.) 221 /// format version 222 struct Vers { short major, minor; } // @suppress(dscanner.style.undocumented_declaration) 223 Vers v; /// format version 224 htsCompression compression; /// Compression type 225 short compression_level;/// currently unused 226 void *specific; /// format specific options; see struct hts_opt. 227 } 228 229 struct hts_idx_t; 230 struct hts_filter_t; 231 232 /** 233 * @brief File handle returned by hts_open() etc. 234 * This structure should be considered opaque by end users. There should be 235 * no need to access most fields directly in user code, and in cases where 236 * it is desirable accessor functions such as hts_get_format() are provided. 237 */ 238 // Maintainers note htsFile cannot be an incomplete struct because some of its 239 // fields are part of libhts.so's ABI (hence these fields must not be moved): 240 // - fp is used in the public sam_itr_next()/etc macros 241 // - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1 242 // - is_write and is_cram are used directly in samtools <= 1.1 243 // - fp is used directly in samtools (up to and including current develop) 244 // - line is used directly in bcftools (up to and including current develop) 245 // - is_bgzf and is_cram flags indicate which fp union member to use. 246 // Note is_bgzf being set does not indicate the flag is BGZF compressed, 247 // nor even whether it is compressed at all (eg on naked BAMs). 248 struct htsFile // @suppress(dscanner.style.phobos_naming_convention) 249 { 250 import std.bitmanip : bitfields; 251 252 mixin(bitfields!( 253 uint, "is_bin", 1, 254 uint, "is_write", 1, 255 uint, "is_be", 1, 256 uint, "is_cram", 1, 257 uint, "is_bgzf", 1, 258 uint, "dummy", 27)); 259 long lineno; /// uncompressed(?) file line no. 260 kstring_t line; /// buffer to hold line 261 char *fn; /// filename 262 char *fn_aux; /// auxillary (i.e, index) file name 263 /// hFile plus any needed bgzf or CRAM (if applicable) structure data 264 union FP { 265 BGZF *bgzf; /// see bgzf.d 266 cram_fd *cram; /// see cram.d 267 hFILE *hfile; /// see hfile.d 268 } 269 FP fp; /// hFile plus any needed bgzf or CRAM (if applicable) structure data 270 void* state; // format specific state information 271 htsFormat format; /// hts file complete file format information 272 hts_idx_t* idx; 273 const(char)* fnidx; 274 sam_hdr_t* bam_header; 275 hts_filter_t* filter; 276 } 277 278 /// A combined thread pool and queue allocation size. 279 /// The pool should already be defined, but qsize may be zero to 280 /// indicate an appropriate queue size is taken from the pool. 281 /// 282 /// Reasons for explicitly setting it could be where many more file 283 /// descriptors are in use than threads, so keeping memory low is 284 /// important. 285 struct htsThreadPool // @suppress(dscanner.style.phobos_naming_convention) 286 { 287 hts_tpool* pool; // The shared thread pool itself 288 int qsize; // Size of I/O queue to use for this fp 289 } 290 291 /// REQUIRED_FIELDS 292 enum sam_fields // @suppress(dscanner.style.phobos_naming_convention) 293 { 294 SAM_QNAME = 0x00000001, 295 SAM_FLAG = 0x00000002, 296 SAM_RNAME = 0x00000004, 297 SAM_POS = 0x00000008, 298 SAM_MAPQ = 0x00000010, 299 SAM_CIGAR = 0x00000020, 300 SAM_RNEXT = 0x00000040, 301 SAM_PNEXT = 0x00000080, 302 SAM_TLEN = 0x00000100, 303 SAM_SEQ = 0x00000200, 304 SAM_QUAL = 0x00000400, 305 SAM_AUX = 0x00000800, 306 SAM_RGAUX = 0x00001000 307 } 308 309 /// Mostly CRAM only, but this could also include other format options 310 enum hts_fmt_option 311 { 312 // CRAM specific 313 CRAM_OPT_DECODE_MD = 0, 314 CRAM_OPT_PREFIX = 1, 315 CRAM_OPT_VERBOSITY = 2, // obsolete, use hts_set_log_level() instead 316 CRAM_OPT_SEQS_PER_SLICE = 3, 317 CRAM_OPT_SLICES_PER_CONTAINER = 4, 318 CRAM_OPT_RANGE = 5, 319 CRAM_OPT_VERSION = 6, // rename to cram_version? 320 CRAM_OPT_EMBED_REF = 7, 321 CRAM_OPT_IGNORE_MD5 = 8, 322 CRAM_OPT_REFERENCE = 9, // make general 323 CRAM_OPT_MULTI_SEQ_PER_SLICE = 10, 324 CRAM_OPT_NO_REF = 11, 325 CRAM_OPT_USE_BZIP2 = 12, 326 CRAM_OPT_SHARED_REF = 13, 327 CRAM_OPT_NTHREADS = 14, // deprecated, use HTS_OPT_NTHREADS 328 CRAM_OPT_THREAD_POOL = 15, // make general 329 CRAM_OPT_USE_LZMA = 16, 330 CRAM_OPT_USE_RANS = 17, 331 CRAM_OPT_REQUIRED_FIELDS = 18, 332 CRAM_OPT_LOSSY_NAMES = 19, 333 CRAM_OPT_BASES_PER_SLICE = 20, 334 CRAM_OPT_STORE_MD = 21, 335 CRAM_OPT_STORE_NM = 22, 336 CRAM_OPT_RANGE_NOSEEK = 23, // CRAM_OPT_RANGE minus the seek 337 CRAM_OPT_USE_TOK = 24, 338 CRAM_OPT_USE_FQZ = 25, 339 CRAM_OPT_USE_ARITH = 26, 340 CRAM_OPT_POS_DELTA = 27, // force delta for AP, even on non-pos sorted data 341 342 // General purpose 343 HTS_OPT_COMPRESSION_LEVEL = 100, 344 HTS_OPT_NTHREADS = 101, 345 HTS_OPT_THREAD_POOL = 102, 346 HTS_OPT_CACHE_SIZE = 103, 347 HTS_OPT_BLOCK_SIZE = 104, 348 HTS_OPT_FILTER = 105, 349 HTS_OPT_PROFILE = 106, 350 351 // Fastq 352 353 // Boolean. 354 // Read / Write CASAVA 1.8 format. 355 // See https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf 356 // 357 // The CASAVA tag matches \d:[YN]:\d+:[ACGTN]+ 358 // The first \d is read 1/2 (1 or 2), [YN] is QC-PASS/FAIL flag, 359 // \d+ is a control number, and the sequence at the end is 360 // for barcode sequence. Barcodes are read into the aux tag defined 361 // by FASTQ_OPT_BARCODE ("BC" by default). 362 FASTQ_OPT_CASAVA = 1000, 363 364 // String. 365 // Whether to read / write extra SAM format aux tags from the fastq 366 // identifier line. For reading this can simply be "1" to request 367 // decoding aux tags. For writing it is a comma separated list of aux 368 // tag types to be written out. 369 FASTQ_OPT_AUX = 1001, 370 371 // Boolean. 372 // Whether to add /1 and /2 to read identifiers when writing FASTQ. 373 // These come from the BAM_FREAD1 or BAM_FREAD2 flags. 374 // (Detecting the /1 and /2 is automatic when reading fastq.) 375 FASTQ_OPT_RNUM = 1002, 376 377 // Two character string. 378 // Barcode aux tag for CASAVA; defaults to "BC". 379 FASTQ_OPT_BARCODE = 1003, 380 } 381 382 /// Profile options for encoding; primarily used at present in CRAM 383 /// but also usable in BAM as a synonym for deflate compression levels. 384 enum hts_profile_option 385 { 386 HTS_PROFILE_FAST = 0, 387 HTS_PROFILE_NORMAL = 1, 388 HTS_PROFILE_SMALL = 2, 389 HTS_PROFILE_ARCHIVE = 3 390 } 391 392 /// For backwards compatibility 393 alias cram_option = hts_fmt_option; 394 395 /// Options for cache, (de)compression, threads, CRAM, etc. 396 struct hts_opt // @suppress(dscanner.style.phobos_naming_convention) 397 { 398 char *arg; /// string form, strdup()ed 399 hts_fmt_option opt; /// tokenised key 400 /// option value 401 union VAL { /// ... and value 402 int i; /// int value 403 char *s; /// string value 404 } 405 VAL val; /// value 406 hts_opt *next; /// next option (linked list) 407 } 408 409 /* 410 * Explicit index file name delimiter, see below 411 */ 412 enum HTS_IDX_DELIM = "##idx##"; 413 414 /********************** 415 * Exported functions * 416 **********************/ 417 418 /** 419 * Parses arg and appends it to the option list. 420 * 421 * Returns 0 on success; 422 * -1 on failure. 423 */ 424 int hts_opt_add(hts_opt** opts, const(char)* c_arg); 425 426 /** 427 * Applies an hts_opt option list to a given htsFile. 428 * 429 * Returns 0 on success 430 * -1 on failure 431 */ 432 int hts_opt_apply(htsFile* fp, hts_opt* opts); 433 434 /** 435 * Frees an hts_opt list. 436 */ 437 void hts_opt_free(hts_opt* opts); 438 439 /** 440 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally 441 * followed by a comma separated list of key=value options and splits 442 * these up into the fields of htsFormat struct. 443 * 444 * Returns 0 on success 445 * -1 on failure. 446 */ 447 int hts_parse_format(htsFormat* opt, const(char)* str); 448 449 /** 450 * Tokenise options as (key(=value)?,)*(key(=value)?)? 451 * NB: No provision for ',' appearing in the value! 452 * Add backslashing rules? 453 * 454 * This could be used as part of a general command line option parser or 455 * as a string concatenated onto the file open mode. 456 * 457 * Returns 0 on success 458 * -1 on failure. 459 */ 460 int hts_parse_opt_list(htsFormat* opt, const(char)* str); 461 462 /*! @abstract Table for converting a nucleotide character to 4-bit encoding. 463 The input character may be either an IUPAC ambiguity code, '=' for 0, or 464 '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 465 for A/C/G/T or combinations of these bits for ambiguous bases. 466 */ 467 468 version(Windows){ 469 __gshared const(ubyte)[256] seq_nt16_table = [ 470 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 471 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 472 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 473 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, 474 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 475 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, 476 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 477 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, 478 479 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 480 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 481 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 482 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 483 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 484 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 485 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 486 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 487 ]; 488 }else{ 489 extern __gshared const(ubyte)[256] seq_nt16_table; 490 } 491 /**! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC 492 ambiguity code letter (or '=' when given 0). 493 */ 494 495 version(Windows) __gshared const(char)[16] seq_nt16_str = ['=','A','C','M','G','R','S','V','T','W','Y','H','K','D','B','N']; 496 else extern __gshared const(char)[16] seq_nt16_str; 497 498 /**! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. 499 Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). 500 */ 501 version(Windows) __gshared const(int)[16] seq_nt16_int = [ 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 ]; 502 else extern __gshared const(int)[16] seq_nt16_int; 503 /**! 504 @abstract Get the htslib version number 505 @return For released versions, a string like "N.N[.N]"; or git describe 506 output if using a library built within a Git repository. 507 */ 508 const(char)* hts_version(); 509 510 /**! 511 @abstract Compile-time HTSlib version number, for use in #if checks 512 @return For released versions X.Y[.Z], an integer of the form XYYYZZ; 513 useful for preprocessor conditionals such as 514 #if HTS_VERSION >= 101000 // Check for v1.10 or later 515 */ 516 // Maintainers: Bump this in the final stage of preparing a new release. 517 // Immediately after release, bump ZZ to 90 to distinguish in-development 518 // Git repository builds from the release; you may wish to increment this 519 // further when significant features are merged. 520 enum HTS_VERSION = 101300; 521 522 /**! @abstract Introspection on the features enabled in htslib 523 * 524 * @return a bitfield of HTS_FEATURE_* macros. 525 */ 526 uint hts_features(); 527 528 const(char)* hts_test_feature(uint id); 529 530 /**! @abstract Introspection on the features enabled in htslib, string form 531 * 532 * @return a string describing htslib build features 533 */ 534 const(char)* hts_feature_string(); 535 536 /// Whether ./configure was used or vanilla Makefile 537 enum HTS_FEATURE_CONFIGURE = 1; 538 539 /// Whether --enable-plugins was used 540 enum HTS_FEATURE_PLUGINS = 2; 541 542 /// Transport specific 543 enum HTS_FEATURE_LIBCURL = 1u << 10; 544 enum HTS_FEATURE_S3 = 1u << 11; 545 enum HTS_FEATURE_GCS = 1u << 12; 546 547 /// Compression options 548 enum HTS_FEATURE_LIBDEFLATE = 1u << 20; 549 enum HTS_FEATURE_LZMA = 1u << 21; 550 enum HTS_FEATURE_BZIP2 = 1u << 22; 551 enum HTS_FEATURE_HTSCODECS = 1u << 23; // htscodecs library version 552 553 /// Build params 554 enum HTS_FEATURE_CC = 1u << 27; 555 enum HTS_FEATURE_CFLAGS = 1u << 28; 556 enum HTS_FEATURE_CPPFLAGS = 1u << 29; 557 enum HTS_FEATURE_LDFLAGS = 1u << 30; 558 559 /**! 560 @abstract Determine format by peeking at the start of a file 561 @param fp File opened for reading, positioned at the beginning 562 @param fmt Format structure that will be filled out on return 563 @return 0 for success, or negative if an error occurred. 564 */ 565 int hts_detect_format(hFILE* fp, htsFormat* fmt); 566 567 /**! 568 @abstract Get a human-readable description of the file format 569 @param fmt Format structure holding type, version, compression, etc. 570 @return Description string, to be freed by the caller after use. 571 */ 572 char* hts_format_description(const(htsFormat)* format); 573 574 /**! 575 @abstract Open a sequence data (SAM/BAM/CRAM) or variant data (VCF/BCF) 576 or possibly-compressed textual line-orientated file 577 @param fn The file name or "-" for stdin/stdout. For indexed files 578 with a non-standard naming, the file name can include the 579 name of the index file delimited with HTS_IDX_DELIM 580 @param mode Mode matching / [rwa][bcefFguxz0-9]* / 581 @discussion 582 With 'r' opens for reading; any further format mode letters are ignored 583 as the format is detected by checking the first few bytes or BGZF blocks 584 of the file. With 'w' or 'a' opens for writing or appending, with format 585 specifier letters: 586 b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) 587 c CRAM format 588 g gzip compressed 589 u uncompressed 590 z bgzf compressed 591 [0-9] zlib compression level 592 and with non-format option letters (for any of 'r'/'w'/'a'): 593 e close the file on exec(2) (opens with O_CLOEXEC, where supported) 594 x create the file exclusively (opens with O_EXCL, where supported) 595 Note that there is a distinction between 'u' and '0': the first yields 596 plain uncompressed output whereas the latter outputs uncompressed data 597 wrapped in the zlib format. 598 @example 599 [rw]b .. compressed BCF, BAM, FAI 600 [rw]bu .. uncompressed BCF 601 [rw]z .. compressed VCF 602 [rw] .. uncompressed VCF 603 */ 604 htsFile* hts_open(const(char)* fn, const(char)* mode); 605 606 /**! 607 @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file 608 @param fn The file name or "-" for stdin/stdout 609 @param mode Open mode, as per hts_open() 610 @param fmt Optional format specific parameters 611 @discussion 612 See hts_open() for description of fn and mode. 613 // TODO Update documentation for s/opts/fmt/ 614 Opts contains a format string (sam, bam, cram, vcf, bcf) which will, 615 if defined, override mode. Opts also contains a linked list of hts_opt 616 structures to apply to the open file handle. These can contain things 617 like pointers to the reference or information on compression levels, 618 block sizes, etc. 619 */ 620 htsFile* hts_open_format( 621 const(char)* fn, 622 const(char)* mode, 623 const(htsFormat)* fmt); 624 625 /**! 626 @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file 627 @param fn The already-open file handle 628 @param mode Open mode, as per hts_open() 629 */ 630 htsFile* hts_hopen(hFILE* fp, const(char)* fn, const(char)* mode); 631 632 /**! 633 @abstract Close a file handle, flushing buffered data for output streams 634 @param fp The file handle to be closed 635 @return 0 for success, or negative if an error occurred. 636 */ 637 int hts_close(htsFile* fp); 638 639 /**! 640 @abstract Returns the file's format information 641 @param fp The file handle 642 @return Read-only pointer to the file's htsFormat. 643 */ 644 const(htsFormat)* hts_get_format(htsFile* fp); 645 646 /**! 647 @ abstract Returns a string containing the file format extension. 648 @ param format Format structure containing the file type. 649 @ return A string ("sam", "bam", etc) or "?" for unknown formats. 650 */ 651 const(char)* hts_format_file_extension(const(htsFormat)* format); 652 653 /**! 654 @abstract Sets a specified CRAM option on the open file handle. 655 @param fp The file handle open the open file. 656 @param opt The CRAM_OPT_* option. 657 @param ... Optional arguments, dependent on the option used. 658 @return 0 for success, or negative if an error occurred. 659 */ 660 int hts_set_opt(htsFile* fp, hts_fmt_option opt, ...); 661 662 /**! 663 @abstract Read a line (and its \n or \r\n terminator) from a file 664 @param fp The file handle 665 @param delimiter Unused, but must be '\n' (or KS_SEP_LINE) 666 @param str The line (not including the terminator) is written here 667 @return Length of the string read; 668 -1 on end-of-file; <= -2 on error 669 */ 670 int hts_getline(htsFile* fp, int delimiter, kstring_t* str); 671 672 char** hts_readlines(const(char)* fn, int* _n); 673 /**! 674 @abstract Parse comma-separated list or read list from a file 675 @param list File name or comma-separated list 676 @param is_file 677 @param _n Size of the output array (number of items read) 678 @return NULL on failure or pointer to newly allocated array of 679 strings 680 */ 681 char** hts_readlist(const(char)* fn, int is_file, int* _n); 682 683 /**! 684 @abstract Create extra threads to aid compress/decompression for this file 685 @param fp The file handle 686 @param n The number of worker threads to create 687 @return 0 for success, or negative if an error occurred. 688 @notes This function creates non-shared threads for use solely by fp. 689 The hts_set_thread_pool function is the recommended alternative. 690 */ 691 int hts_set_threads(htsFile* fp, int n); 692 693 /**! 694 @abstract Create extra threads to aid compress/decompression for this file 695 @param fp The file handle 696 @param p A pool of worker threads, previously allocated by hts_create_threads(). 697 @return 0 for success, or negative if an error occurred. 698 */ 699 int hts_set_thread_pool(htsFile* fp, htsThreadPool* p); 700 701 /**! 702 @abstract Adds a cache of decompressed blocks, potentially speeding up seeks. 703 This may not work for all file types (currently it is bgzf only). 704 @param fp The file handle 705 @param n The size of cache, in bytes 706 */ 707 void hts_set_cache_size(htsFile* fp, int n); 708 709 /*! 710 @abstract Set .fai filename for a file opened for reading 711 @return 0 for success, negative on failure 712 @discussion 713 Called before *_hdr_read(), this provides the name of a .fai file 714 used to provide a reference list if the htsFile contains no @SQ headers. 715 */ 716 int hts_set_fai_filename(htsFile* fp, const(char)* fn_aux); 717 718 /**! 719 @abstract Sets a filter expression 720 @return 0 for success, negative on failure 721 @discussion 722 To clear an existing filter, specifying expr as NULL. 723 */ 724 int hts_set_filter_expression(htsFile* fp, const(char)* expr); 725 726 /**! 727 @abstract Determine whether a given htsFile contains a valid EOF block 728 @return 3 for a non-EOF checkable filetype; 729 2 for an unseekable file type where EOF cannot be checked; 730 1 for a valid EOF block; 731 0 for if the EOF marker is absent when it should be present; 732 -1 (with errno set) on failure 733 @discussion 734 Check if the BGZF end-of-file (EOF) marker is present 735 */ 736 int hts_check_EOF(htsFile* fp); 737 738 /************ 739 * Indexing * 740 ************/ 741 742 /**! 743 These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc, 744 producing iterators operating as follows: 745 - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file 746 - HTS_IDX_START iterates over the entire file 747 - HTS_IDX_REST iterates from the current position to the end of the file 748 - HTS_IDX_NONE always returns "no more alignment records" 749 When one of these special tid values is used, beg and end are ignored. 750 When REST or NONE is used, idx is also ignored and may be NULL. 751 */ 752 enum HTS_IDX_NOCOOR = -2; 753 enum HTS_IDX_START = -3; 754 enum HTS_IDX_REST = -4; 755 enum HTS_IDX_NONE = -5; 756 757 enum HTS_FMT_CSI = 0; 758 enum HTS_FMT_BAI = 1; 759 enum HTS_FMT_TBI = 2; 760 enum HTS_FMT_CRAI = 3; 761 enum HTS_FMT_FAI = 4; 762 763 // Almost INT64_MAX, but when cast into a 32-bit int it's 764 // also INT_MAX instead of -1. This avoids bugs with old code 765 // using the new hts_pos_t data type. 766 enum HTS_POS_MAX = ((cast(long) INT_MAX) << 32) | INT_MAX; 767 enum HTS_POS_MIN = INT64_MIN; 768 enum PRIhts_pos = PRId64; 769 alias hts_pos_t = c_long; 770 771 // For comparison with previous release: 772 // 773 // #define HTS_POS_MAX INT_MAX 774 // #define HTS_POS_MIN INT_MIN 775 // #define PRIhts_pos PRId32 776 // typedef int32_t hts_pos_t; 777 778 struct hts_pair_pos_t 779 { 780 hts_pos_t beg; 781 hts_pos_t end; 782 } 783 784 alias hts_pair32_t = hts_pair_pos_t; // For backwards compatibility 785 786 struct hts_pair64_t // @suppress(dscanner.style.phobos_naming_convention) 787 { 788 ulong u; 789 ulong v; 790 } 791 792 /// 64-bit start, end coordinate pair tracking max (internally used in hts.c) 793 struct hts_pair64_max_t // @suppress(dscanner.style.phobos_naming_convention) 794 { 795 ulong u; 796 ulong v; 797 ulong max; 798 } 799 800 /// Region list used in iterators (NB: apparently confined to single contig/tid) 801 struct hts_reglist_t 802 { 803 const(char) *reg; /// Region string 804 hts_pair_pos_t *intervals; /// (start,end) intervals 805 int tid; /// Contig id 806 uint count; /// How many intervals 807 hts_pos_t min_beg; /// absolute bounds 808 hts_pos_t max_end; /// absolute bounds 809 } 810 811 alias hts_readrec_func = int function(BGZF* fp, void* data, void* r, int* tid, hts_pos_t* beg, hts_pos_t* end); 812 alias hts_seek_func = int function(void* fp, long offset, int where); 813 alias hts_tell_func = c_long function(void* fp); 814 815 /** 816 * @brief File iterator that can handle multiple target regions. 817 * This structure should be considered opaque by end users. 818 * It does both the stepping inside the file and the filtering of alignments. 819 * It can operate in single or multi-region mode, and depending on this, 820 * it uses different fields. 821 * 822 * read_rest (1) - read everything from the current offset, without filtering 823 * finished (1) - no more iterations 824 * is_cram (1) - current file has CRAM format 825 * nocoor (1) - read all unmapped reads 826 * 827 * multi (1) - multi-region moode 828 * reg_list - List of target regions 829 * n_reg - Size of the above list 830 * curr_reg - List index of the current region of search 831 * curr_intv - Interval index inside the current region; points to a (beg, end) 832 * end - Used for CRAM files, to preserve the max end coordinate 833 * 834 * multi (0) - single-region mode 835 * tid - Reference id of the target region 836 * beg - Start position of the target region 837 * end - End position of the target region 838 * 839 * Common fields: 840 * off - List of file offsets computed from the index 841 * n_off - Size of the above list 842 * i - List index of the current file offset 843 * curr_off - File offset for the next file read 844 * curr_tid - Reference id of the current alignment 845 * curr_beg - Start position of the current alignment 846 * curr_end - End position of the current alignment 847 * nocoor_off - File offset where the unmapped reads start 848 * 849 * readrec - File specific function that reads an alignment 850 * seek - File specific function for changing the file offset 851 * tell - File specific function for indicating the file offset 852 */ 853 854 struct hts_itr_t // @suppress(dscanner.style.phobos_naming_convention) 855 { 856 import std.bitmanip : bitfields; 857 858 mixin(bitfields!( 859 uint, "read_rest", 1, 860 uint, "finished", 1, 861 uint, "is_cram", 1, 862 uint, "nocoor", 1, 863 uint, "multi", 1, 864 uint, "dummy", 27)); 865 866 int tid; 867 int n_off; 868 int i; 869 int n_reg; 870 hts_pos_t beg; 871 hts_pos_t end; 872 hts_reglist_t* reg_list; 873 int curr_tid; 874 int curr_reg; 875 int curr_intv; 876 hts_pos_t curr_beg; 877 hts_pos_t curr_end; 878 ulong curr_off; 879 ulong nocoor_off; 880 hts_pair64_max_t* off; 881 int function() readrec; 882 int function() seek; 883 long function() tell; 884 885 struct Bins 886 { 887 int n; 888 int m; 889 int* a; 890 } 891 892 Bins bins; 893 } 894 /// ? index key 895 struct aux_key_t { // @suppress(dscanner.style.phobos_naming_convention) 896 int key; /// ??? 897 /// ??? 898 ulong min_off, max_off; 899 } 900 901 alias hts_itr_multi_t = hts_itr_t; 902 903 pragma(inline, true) 904 extern (D) auto hts_bin_first(T)(auto ref T l) 905 { 906 return ((1 << ((l << 1) + l)) - 1) / 7; 907 } 908 909 pragma(inline, true) 910 extern (D) auto hts_bin_parent(T)(auto ref T b) 911 { 912 return (b - 1) >> 3; 913 } 914 915 /////////////////////////////////////////////////////////// 916 // Low-level API for building indexes. 917 918 /// Create a BAI/CSI/TBI type index structure 919 /** @param n Initial number of targets 920 @param fmt Format, one of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI 921 @param offset0 Initial file offset 922 @param min_shift Number of bits for the minimal interval 923 @param n_lvls Number of levels in the binning index 924 @return An initialised hts_idx_t struct on success; NULL on failure 925 926 The struct returned by a successful call should be freed via hts_idx_destroy() 927 when it is no longer needed. 928 */ 929 hts_idx_t* hts_idx_init( 930 int n, 931 int fmt, 932 ulong offset0, 933 int min_shift, 934 int n_lvls); 935 936 /// Free a BAI/CSI/TBI type index 937 /** @param idx Index structure to free 938 */ 939 void hts_idx_destroy(hts_idx_t* idx); 940 941 /// Push an index entry 942 /** @param idx Index 943 @param tid Target id 944 @param beg Range start (zero-based) 945 @param end Range end (zero-based, half-open) 946 @param offset File offset 947 @param is_mapped Range corresponds to a mapped read 948 @return 0 on success; -1 on failure 949 950 The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts 951 stored in the meta-data bin. 952 */ 953 int hts_idx_push( 954 hts_idx_t* idx, 955 int tid, 956 hts_pos_t beg, 957 hts_pos_t end, 958 ulong offset, 959 int is_mapped); 960 961 /// Finish building an index 962 /** @param idx Index 963 @param final_offset Last file offset 964 @return 0 on success; non-zero on failure. 965 */ 966 int hts_idx_finish(hts_idx_t* idx, ulong final_offset); 967 968 /// Returns index format 969 /** @param idx Index 970 @return One of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI 971 */ 972 int hts_idx_fmt(hts_idx_t* idx); 973 974 /// Add name to TBI index meta-data 975 /** @param idx Index 976 @param tid Target identifier 977 @param name Target name 978 @return Index number of name in names list on success; -1 on failure. 979 */ 980 int hts_idx_tbi_name(hts_idx_t* idx, int tid, const(char)* name); 981 982 // Index loading and saving 983 984 /// Save an index to a file 985 /** @param idx Index to be written 986 @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added 987 @param fmt One of the HTS_FMT_* index formats 988 @return 0 if successful, or negative if an error occurred. 989 */ 990 int hts_idx_save(const(hts_idx_t)* idx, const(char)* fn, int fmt); 991 992 /// Save an index to a specific file 993 /** @param idx Index to be written 994 @param fn Input BAM/BCF/etc filename 995 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 996 @param fmt One of the HTS_FMT_* index formats 997 @return 0 if successful, or negative if an error occurred. 998 */ 999 int hts_idx_save_as( 1000 const(hts_idx_t)* idx, 1001 const(char)* fn, 1002 const(char)* fnidx, 1003 int fmt); 1004 1005 /// Load an index file 1006 /** @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or 1007 the extension substituted, to search for an existing index file. 1008 In case of a non-standard naming, the file name can include the 1009 name of the index file delimited with HTS_IDX_DELIM. 1010 @param fmt One of the HTS_FMT_* index formats 1011 @return The index, or NULL if an error occurred. 1012 1013 If @p fn contains the string "##idx##" (HTS_IDX_DELIM), the part before 1014 the delimiter will be used as the name of the data file and the part after 1015 it will be used as the name of the index. 1016 1017 Otherwise, this function tries to work out the index name as follows: 1018 1019 It will try appending ".csi" to @p fn 1020 It will try substituting an existing suffix (e.g. .bam, .vcf) with ".csi" 1021 Then, if @p fmt is HTS_FMT_BAI: 1022 It will try appending ".bai" to @p fn 1023 To will substituting the existing suffix (e.g. .bam) with ".bai" 1024 else if @p fmt is HTS_FMT_TBI: 1025 It will try appending ".tbi" to @p fn 1026 To will substituting the existing suffix (e.g. .vcf) with ".tbi" 1027 1028 If the index file is remote (served over a protocol like https), first a check 1029 is made to see is a locally cached copy is available. This is done for all 1030 of the possible names listed above. If a cached copy is not available then 1031 the index will be downloaded and stored in the current working directory, 1032 with the same name as the remote index. 1033 1034 Equivalent to hts_idx_load3(fn, NULL, fmt, HTS_IDX_SAVE_REMOTE); 1035 */ 1036 hts_idx_t* hts_idx_load(const(char)* fn, int fmt); 1037 1038 /// Load a specific index file 1039 /** @param fn Input BAM/BCF/etc filename 1040 @param fnidx The input index filename 1041 @return The index, or NULL if an error occurred. 1042 1043 Equivalent to hts_idx_load3(fn, fnidx, 0, 0); 1044 1045 This function will not attempt to save index files locally. 1046 */ 1047 hts_idx_t* hts_idx_load2(const(char)* fn, const(char)* fnidx); 1048 1049 /// Load a specific index file 1050 /** @param fn Input BAM/BCF/etc filename 1051 @param fnidx The input index filename 1052 @param fmt One of the HTS_FMT_* index formats 1053 @param flags Flags to alter behaviour (see description) 1054 @return The index, or NULL if an error occurred. 1055 1056 If @p fnidx is NULL, the index name will be derived from @p fn in the 1057 same way as hts_idx_load(). 1058 1059 If @p fnidx is not NULL, @p fmt is ignored. 1060 1061 The @p flags parameter can be set to a combination of the following 1062 values: 1063 1064 HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes 1065 HTS_IDX_SILENT_FAIL Fail silently if the index is not present 1066 1067 The index struct returned by a successful call should be freed 1068 via hts_idx_destroy() when it is no longer needed. 1069 */ 1070 hts_idx_t* hts_idx_load3( 1071 const(char)* fn, 1072 const(char)* fnidx, 1073 int fmt, 1074 int flags); 1075 1076 /// Flags for hts_idx_load3() ( and also sam_idx_load3(), tbx_idx_load3() ) 1077 enum HTS_IDX_FLAG : int 1078 { 1079 HTS_IDX_SAVE_REMOTE = 1, 1080 HTS_IDX_SILENT_FAIL = 2 1081 } 1082 /////////////////////////////////////////////////////////// 1083 // Functions for accessing meta-data stored in indexes 1084 1085 alias hts_id2name_f = const(char)* function(void*, int); 1086 1087 /// Get extra index meta-data 1088 /** @param idx The index 1089 @param l_meta Pointer to where the length of the extra data is stored 1090 @return Pointer to the extra data if present; NULL otherwise 1091 1092 Indexes (both .tbi and .csi) made by tabix include extra data about 1093 the indexed file. The returns a pointer to this data. Note that the 1094 data is stored exactly as it is in the index. Callers need to interpret 1095 the results themselves, including knowing what sort of data to expect; 1096 byte swapping etc. 1097 */ 1098 ubyte* hts_idx_get_meta(hts_idx_t* idx, uint* l_meta); 1099 1100 /// Set extra index meta-data 1101 /** @param idx The index 1102 @param l_meta Length of data 1103 @param meta Pointer to the extra data 1104 @param is_copy If not zero, a copy of the data is taken 1105 @return 0 on success; -1 on failure (out of memory). 1106 1107 Sets the data that is returned by hts_idx_get_meta(). 1108 1109 If is_copy != 0, a copy of the input data is taken. If not, ownership of 1110 the data pointed to by *meta passes to the index. 1111 */ 1112 int hts_idx_set_meta(hts_idx_t* idx, uint l_meta, ubyte* meta, int is_copy); 1113 1114 /// Get number of mapped and unmapped reads from an index 1115 /** @param idx Index 1116 @param tid Target ID 1117 @param[out] mapped Location to store number of mapped reads 1118 @param[out] unmapped Location to store number of unmapped reads 1119 @return 0 on success; -1 on failure (data not available) 1120 1121 BAI and CSI indexes store information on the number of reads for each 1122 target that were mapped or unmapped (unmapped reads will generally have 1123 a paired read that is mapped to the target). This function returns this 1124 information if it is available. 1125 1126 @note Cram CRAI indexes do not include this information. 1127 */ 1128 int hts_idx_get_stat( 1129 const(hts_idx_t)* idx, 1130 int tid, 1131 ulong* mapped, 1132 ulong* unmapped); 1133 1134 /// Return the number of unplaced reads from an index 1135 /** @param idx Index 1136 @return Unplaced reads count 1137 1138 Unplaced reads are not linked to any reference (e.g. RNAME is '*' in SAM 1139 files). 1140 */ 1141 ulong hts_idx_get_n_no_coor(const(hts_idx_t)* idx); 1142 1143 /// Return a list of target names from an index 1144 /** @param idx Index 1145 @param[out] n Location to store the number of targets 1146 @param getid Callback function to get the name for a target ID 1147 @param hdr Header from indexed file 1148 @return An array of pointers to the names on success; NULL on failure 1149 1150 @note The names are pointers into the header data structure. When cleaning 1151 up, only the array should be freed, not the names. 1152 */ 1153 const(char *)* hts_idx_seqnames(const(hts_idx_t)* idx, int* n, hts_id2name_f getid, void* hdr); // free only the array, not the values 1154 1155 /// Return the number of targets from an index 1156 /** @param idx Index 1157 @return The number of targets 1158 */ 1159 int hts_idx_nseq(const(hts_idx_t) *idx); 1160 1161 /////////////////////////////////////////////////////////// 1162 // Region parsing 1163 1164 enum HTS_PARSE_FLAGS : int 1165 { 1166 HTS_PARSE_THOUSANDS_SEP = 1, ///< Ignore ',' separators within numbers 1167 HTS_PARSE_ONE_COORD = 2, ///< chr:pos means chr:pos-pos and not chr:pos-end 1168 HTS_PARSE_LIST = 4, ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP) 1169 } 1170 1171 /// Parse a numeric string 1172 /** The number may be expressed in scientific notation, and optionally may 1173 contain commas in the integer part (before any decimal point or E notation). 1174 @param str String to be parsed 1175 @param strend If non-NULL, set on return to point to the first character 1176 in @a str after those forming the parsed number 1177 @param flags Or'ed-together combination of HTS_PARSE_* flags 1178 @return Converted value of the parsed number. 1179 1180 When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING 1181 or more) if there are any trailing characters after the number. 1182 */ 1183 long hts_parse_decimal(const(char)* str, char** strend, HTS_PARSE_FLAGS flags); 1184 1185 alias hts_name2id_f = int function(void*, const(char)*) *; 1186 1187 /// Parse a "CHR:START-END"-style region string 1188 /** @param str String to be parsed 1189 @param beg Set on return to the 0-based start of the region 1190 @param end Set on return to the 1-based end of the region 1191 @return Pointer to the colon or '\0' after the reference sequence name, 1192 or NULL if @a str could not be parsed. 1193 1194 NOTE: For compatibility with hts_parse_reg only. 1195 Please use hts_parse_region instead. 1196 */ 1197 const(char)* hts_parse_reg64(const(char)* str, hts_pos_t* beg, hts_pos_t* end); 1198 1199 /// Parse a "CHR:START-END"-style region string 1200 /** @param str String to be parsed 1201 @param beg Set on return to the 0-based start of the region 1202 @param end Set on return to the 1-based end of the region 1203 @return Pointer to the colon or '\0' after the reference sequence name, 1204 or NULL if @a str could not be parsed. 1205 */ 1206 const(char)* hts_parse_reg(const(char)* str, int* beg, int* end); 1207 1208 /// Parse a "CHR:START-END"-style region string 1209 /** @param str String to be parsed 1210 @param tid Set on return (if not NULL) to be reference index (-1 if invalid) 1211 @param beg Set on return to the 0-based start of the region 1212 @param end Set on return to the 1-based end of the region 1213 @param getid Function pointer. Called if not NULL to set tid. 1214 @param hdr Caller data passed to getid. 1215 @param flags Bitwise HTS_PARSE_* flags listed above. 1216 @return Pointer to the byte after the end of the entire region 1217 specifier (including any trailing comma) on success, 1218 or NULL if @a str could not be parsed. 1219 1220 A variant of hts_parse_reg which is reference-id aware. It uses 1221 the iterator name2id callbacks to validate the region tokenisation works. 1222 1223 This is necessary due to GRCh38 HLA additions which have reference names 1224 like "HLA-DRB1*12:17". 1225 1226 To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" 1227 are reference names, quote using curly braces. 1228 Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. 1229 1230 Flags are used to control how parsing works, and can be one of the below. 1231 1232 HTS_PARSE_THOUSANDS_SEP: 1233 Ignore commas in numbers. For example with this flag 1,234,567 1234 is interpreted as 1234567. 1235 1236 HTS_PARSE_LIST: 1237 If present, the region is assmed to be a comma separated list and 1238 position parsing will not contain commas (this implicitly 1239 clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). 1240 On success the return pointer will be the start of the next region, ie 1241 the character after the comma. (If *ret != '\0' then the caller can 1242 assume another region is present in the list.) 1243 1244 If not set then positions may contain commas. In this case the return 1245 value should point to the end of the string, or NULL on failure. 1246 1247 HTS_PARSE_ONE_COORD: 1248 If present, X:100 is treated as the single base pair region X:100-100. 1249 In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>. 1250 (This is the standard bcftools region convention.) 1251 1252 When not set X:100 is considered to be X:100-<end> where <end> is 1253 the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are 1254 invalid. 1255 (This is the standard samtools region convention.) 1256 1257 Note the supplied string expects 1 based inclusive coordinates, but the 1258 returned coordinates start from 0 and are half open, so pos0 is valid 1259 for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" 1260 1261 If NULL is returned, the value in tid mat give additional information 1262 about the error: 1263 1264 -2 Failed to parse @p hdr; or out of memory 1265 -1 The reference in @p str has mismatched braces, or does not 1266 exist in @p hdr 1267 >= 0 The specified range in @p str could not be parsed 1268 */ 1269 const(char)* hts_parse_region( 1270 const(char)* s, 1271 int* tid, 1272 hts_pos_t* beg, 1273 hts_pos_t* end, 1274 hts_name2id_f getid, 1275 void* hdr, 1276 HTS_PARSE_FLAGS flags); 1277 1278 /////////////////////////////////////////////////////////// 1279 // Generic iterators 1280 // 1281 // These functions provide the low-level infrastructure for iterators. 1282 // Wrappers around these are used to make iterators for specific file types. 1283 // See: 1284 // htslib/sam.h for SAM/BAM/CRAM iterators 1285 // htslib/vcf.h for VCF/BCF iterators 1286 // htslib/tbx.h for files indexed by tabix 1287 1288 /// Create a single-region iterator 1289 /** @param idx Index 1290 @param tid Target ID 1291 @param beg Start of region 1292 @param end End of region 1293 @param readrec Callback to read a record from the input file 1294 @return An iterator on success; NULL on failure 1295 1296 The iterator struct returned by a successful call should be freed 1297 via hts_itr_destroy() when it is no longer needed. 1298 */ 1299 hts_itr_t* hts_itr_query( 1300 const(hts_idx_t)* idx, 1301 int tid, 1302 hts_pos_t beg, 1303 hts_pos_t end, 1304 hts_readrec_func readrec); 1305 1306 /// Free an iterator 1307 /** @param iter Iterator to free 1308 */ 1309 void hts_itr_destroy(hts_itr_t* iter); 1310 1311 alias hts_itr_query_func = hts_itr_t* function(const(hts_idx_t)* idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func readrec); 1312 1313 /// Create a single-region iterator from a text region specification 1314 /** @param idx Index 1315 @param reg Region specifier 1316 @param getid Callback function to return the target ID for a name 1317 @param hdr Input file header 1318 @param itr_query Callback function returning an iterator for a numeric tid, 1319 start and end position 1320 @param readrec Callback to read a record from the input file 1321 @return An iterator on success; NULL on error 1322 1323 The iterator struct returned by a successful call should be freed 1324 via hts_itr_destroy() when it is no longer needed. 1325 */ 1326 hts_itr_t* hts_itr_querys( 1327 const(hts_idx_t)* idx, 1328 const(char)* reg, 1329 hts_name2id_f getid, 1330 void* hdr, 1331 hts_itr_query_func itr_query, 1332 hts_readrec_func readrec); 1333 1334 /// Return the next record from an iterator 1335 /** @param fp Input file handle 1336 @param iter Iterator 1337 @param r Pointer to record placeholder 1338 @param data Data passed to the readrec callback 1339 @return >= 0 on success, -1 when there is no more data, < -1 on error 1340 */ 1341 int hts_itr_next(BGZF* fp, hts_itr_t* iter, void* r, void* data); 1342 1343 1344 /********************************** 1345 * Iterator with multiple regions * 1346 **********************************/ 1347 1348 alias hts_itr_multi_query_func = int function(const(hts_idx_t)* idx, hts_itr_t* itr); 1349 int hts_itr_multi_bam(const(hts_idx_t)* idx, hts_itr_t* iter); 1350 int hts_itr_multi_cram(const(hts_idx_t)* idx, hts_itr_t* iter); 1351 1352 /// Create a multi-region iterator from a region list 1353 /** @param idx Index 1354 @param reglist Region list 1355 @param count Number of items in region list 1356 @param getid Callback to convert names to target IDs 1357 @param hdr Indexed file header (passed to getid) 1358 @param itr_specific Filetype-specific callback function 1359 @param readrec Callback to read an input file record 1360 @param seek Callback to seek in the input file 1361 @param tell Callback to return current input file location 1362 @return An iterator on success; NULL on failure 1363 1364 The iterator struct returned by a successful call should be freed 1365 via hts_itr_destroy() when it is no longer needed. 1366 */ 1367 hts_itr_t* hts_itr_regions( 1368 const(hts_idx_t)* idx, 1369 hts_reglist_t* reglist, 1370 int count, 1371 hts_name2id_f getid, 1372 void* hdr, 1373 hts_itr_multi_query_func itr_specific, 1374 hts_readrec_func readrec, 1375 hts_seek_func seek, 1376 hts_tell_func tell); 1377 1378 /// Return the next record from an iterator 1379 /** @param fp Input file handle 1380 @param iter Iterator 1381 @param r Pointer to record placeholder 1382 @return >= 0 on success, -1 when there is no more data, < -1 on error 1383 */ 1384 int hts_itr_multi_next(htsFile* fd, hts_itr_t* iter, void* r); 1385 1386 /// Create a region list from a char array 1387 /** @param argv Char array of target:interval elements, e.g. chr1:2500-3600, chr1:5100, chr2 1388 @param argc Number of items in the array 1389 @param r_count Pointer to the number of items in the resulting region list 1390 @param hdr Header for the sam/bam/cram file 1391 @param getid Callback to convert target names to target ids. 1392 @return A region list on success, NULL on failure 1393 1394 The hts_reglist_t struct returned by a successful call should be freed 1395 via hts_reglist_free() when it is no longer needed. 1396 */ 1397 hts_reglist_t* hts_reglist_create( 1398 char** argv, 1399 int argc, 1400 int* r_count, 1401 void* hdr, 1402 hts_name2id_f getid); 1403 1404 /// Free a region list 1405 /** @param reglist Region list 1406 @param count Number of items in the list 1407 */ 1408 void hts_reglist_free(hts_reglist_t* reglist, int count); 1409 1410 /// Free a multi-region iterator 1411 /** @param iter Iterator to free 1412 */ 1413 alias hts_itr_multi_destroy = hts_itr_destroy; 1414 1415 /** 1416 * hts_file_type() - Convenience function to determine file type 1417 * DEPRECATED: This function has been replaced by hts_detect_format(). 1418 * It and these FT_* macros will be removed in a future HTSlib release. 1419 */ 1420 enum FT_UNKN = 0; 1421 enum FT_GZ = 1; 1422 enum FT_VCF = 2; 1423 enum FT_VCF_GZ = FT_GZ | FT_VCF; 1424 enum FT_BCF = 1 << 2; 1425 enum FT_BCF_GZ = FT_GZ | FT_BCF; 1426 enum FT_STDIN = 1 << 3; 1427 int hts_file_type(const(char)* fname); 1428 1429 /*************************** 1430 * Revised MAQ error model * 1431 ***************************/ 1432 1433 struct errmod_t; 1434 1435 errmod_t* errmod_init(double depcorr); 1436 void errmod_destroy(errmod_t* em); 1437 1438 /* 1439 n: number of bases 1440 m: maximum base 1441 bases[i]: qual:6, strand:1, base:4 1442 q[i*m+j]: phred-scaled likelihood of (i,j) 1443 */ 1444 int errmod_cal(const(errmod_t)* em, int n, int m, ushort* bases, float* q); 1445 1446 /***************************************************** 1447 * Probabilistic banded glocal alignment * 1448 * See https://doi.org/10.1093/bioinformatics/btr076 * 1449 *****************************************************/ 1450 1451 struct probaln_par_t 1452 { 1453 float d; 1454 float e; 1455 int bw; 1456 } 1457 1458 /// Perform probabilistic banded glocal alignment 1459 /** @param ref Reference sequence 1460 @param l_ref Length of reference 1461 @param query Query sequence 1462 @param l_query Length of query sequence 1463 @param iqual Query base qualities 1464 @param c Alignment parameters 1465 @param[out] state Output alignment 1466 @param[out] q Phred scaled posterior probability of state[i] being wrong 1467 @return Phred-scaled likelihood score, or INT_MIN on failure. 1468 1469 The reference and query sequences are coded using integers 0,1,2,3,4 for 1470 bases A,C,G,T,N respectively (N here is for any ambiguity code). 1471 1472 On output, state and q are arrays of length l_query. The higher 30 1473 bits give the reference position the query base is matched to and the 1474 lower two bits can be 0 (an alignment match) or 1 (an 1475 insertion). q[i] gives the phred scaled posterior probability of 1476 state[i] being wrong. 1477 1478 On failure, errno will be set to EINVAL if the values of l_ref or l_query 1479 were invalid; or ENOMEM if a memory allocation failed. 1480 */ 1481 1482 int probaln_glocal( 1483 const(ubyte)* ref_, 1484 int l_ref, 1485 const(ubyte)* query, 1486 int l_query, 1487 const(ubyte)* iqual, 1488 const(probaln_par_t)* c, 1489 int* state, 1490 ubyte* q); 1491 1492 /********************** 1493 * MD5 implementation * 1494 **********************/ 1495 1496 struct hts_md5_context; 1497 1498 /*! @abstract Initialises an MD5 context. 1499 * @discussion 1500 * The expected use is to allocate an hts_md5_context using 1501 * hts_md5_init(). This pointer is then passed into one or more calls 1502 * of hts_md5_update() to compute successive internal portions of the 1503 * MD5 sum, which can then be externalised as a full 16-byte MD5sum 1504 * calculation by calling hts_md5_final(). This can then be turned 1505 * into ASCII via hts_md5_hex(). 1506 * 1507 * To dealloate any resources created by hts_md5_init() call the 1508 * hts_md5_destroy() function. 1509 * 1510 * @return hts_md5_context pointer on success, NULL otherwise. 1511 */ 1512 hts_md5_context* hts_md5_init(); 1513 1514 /*! @abstract Updates the context with the MD5 of the data. */ 1515 void hts_md5_update(hts_md5_context* ctx, const(void)* data, c_ulong size); 1516 1517 /*! @abstract Computes the final 128-bit MD5 hash from the given context */ 1518 void hts_md5_final(ubyte* digest, hts_md5_context* ctx); 1519 1520 /*! @abstract Resets an md5_context to the initial state, as returned 1521 * by hts_md5_init(). 1522 */ 1523 void hts_md5_reset(hts_md5_context* ctx); 1524 1525 /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated 1526 * hex string. 1527 */ 1528 void hts_md5_hex(char* hex, const(ubyte)* digest); 1529 1530 /*! @abstract Deallocates any memory allocated by hts_md5_init. */ 1531 void hts_md5_destroy(hts_md5_context* ctx); 1532 1533 pragma(inline,true) 1534 long hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls) 1535 { 1536 int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; 1537 for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) 1538 if (beg>>s == end>>s) return t + (beg>>s); 1539 return 0; 1540 } 1541 1542 /// Compute the level of a bin in a binning index 1543 pragma(inline, true) 1544 int hts_bin_level(int bin) { 1545 int l, b; 1546 for (l = 0, b = bin; b; ++l){ b = hts_bin_parent(b);} 1547 return l; 1548 } 1549 1550 //! Compute the corresponding entry into the linear index of a given bin from 1551 //! a binning index 1552 /*! 1553 * @param bin The bin number 1554 * @param n_lvls The index depth (number of levels - 0 based) 1555 * @return The integer offset into the linear index 1556 * 1557 * Explanation of the return value formula: 1558 * Each bin on level l covers exp(2, (n_lvls - l)*3 + min_shift) base pairs. 1559 * A linear index entry covers exp(2, min_shift) base pairs. 1560 */ 1561 1562 // compute the level of bin 1563 pragma(inline, true) 1564 int hts_bin_bot(int bin, int n_lvls) 1565 { 1566 int l = hts_bin_level(bin); 1567 return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; 1568 } 1569 1570 /************** 1571 * Endianness * 1572 **************/ 1573 pragma(inline, true) 1574 int ed_is_big() 1575 { 1576 long one= 1; 1577 return !(*(cast(char *)(&one))); 1578 } 1579 pragma(inline, true) 1580 ushort ed_swap_2(ushort v) 1581 { 1582 return cast(ushort)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); 1583 } 1584 pragma(inline, true) 1585 void *ed_swap_2p(void *x) 1586 { 1587 *cast(ushort*)x = ed_swap_2(*cast(ushort*)x); 1588 return x; 1589 } 1590 pragma(inline, true) 1591 uint ed_swap_4(uint v) 1592 { 1593 v = ((v & 0x0000FFFFU) << 16) | (v >> 16); 1594 return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); 1595 } 1596 pragma(inline, true) 1597 void *ed_swap_4p(void *x) 1598 { 1599 *cast(uint*)x = ed_swap_4(*cast(uint*)x); 1600 return x; 1601 } 1602 pragma(inline, true) 1603 ulong ed_swap_8(ulong v) 1604 { 1605 v = ((v & 0x00000000FFFFFFFFLU) << 32) | (v >> 32); 1606 v = ((v & 0x0000FFFF0000FFFFLU) << 16) | ((v & 0xFFFF0000FFFF0000LU) >> 16); 1607 return ((v & 0x00FF00FF00FF00FFLU) << 8) | ((v & 0xFF00FF00FF00FF00LU) >> 8); 1608 } 1609 pragma(inline, true) 1610 void *ed_swap_8p(void *x) 1611 { 1612 *cast(ulong*)x = ed_swap_8(*cast(ulong*)x); 1613 return x; 1614 } 1615