1 /// @file htslib/hts.h 2 /// Format-neutral I/O, indexing, and iterator API functions. 3 /* 4 Copyright (C) 2012-2022 Genome Research Ltd. 5 Copyright (C) 2010, 2012 Broad Institute. 6 Portions copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3@live.co.uk> 7 8 Author: Heng Li <lh3@sanger.ac.uk> 9 10 Permission is hereby granted, free of charge, to any person obtaining a copy 11 of this software and associated documentation files (the "Software"), to deal 12 in the Software without restriction, including without limitation the rights 13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 copies of the Software, and to permit persons to whom the Software is 15 furnished to do so, subject to the following conditions: 16 17 The above copyright notice and this permission notice shall be included in 18 all copies or substantial portions of the Software. 19 20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 26 DEALINGS IN THE SOFTWARE. */ 27 module htslib.hts; 28 29 import core.stdc.config; 30 import core.stdc.inttypes; 31 import core.stdc.limits; 32 import core.stdc.stdint; 33 34 import htslib.bgzf : BGZF; 35 import htslib.cram : cram_fd; 36 import htslib.hfile : hFILE; 37 import htslib.thread_pool : hts_tpool; 38 import htslib.sam : sam_hdr_t; 39 import htslib.kstring : kstring_t; 40 41 @system: 42 nothrow: 43 @nogc: 44 45 extern (C): 46 47 // Separator used to split HTS_PATH (for plugins); REF_PATH (cram references) 48 49 version(Windows) 50 { 51 enum HTS_PATH_SEPARATOR_CHAR = ';'; 52 enum HTS_PATH_SEPARATOR_STR = ";"; 53 } 54 else 55 { 56 enum HTS_PATH_SEPARATOR_CHAR = ':'; 57 enum HTS_PATH_SEPARATOR_STR = ":"; 58 } 59 60 61 /** 62 * @hideinitializer 63 * Deprecated macro to expand a dynamic array of a given type 64 * 65 * @param type_t The type of the array elements 66 * @param[in] n Requested number of elements of type type_t 67 * @param[in,out] m Size of memory allocated 68 * @param[in,out] ptr Pointer to the array 69 * 70 * @discussion 71 * Do not use this macro. Use hts_resize() instead as allows allocation 72 * failures to be handled more gracefully. 73 * 74 * The array *ptr will be expanded if necessary so that it can hold @p n 75 * or more elements. If the array is expanded then the new size will be 76 * written to @p m and the value in @p ptr may change. 77 * 78 * It must be possible to take the address of @p ptr and @p m must be usable 79 * as an lvalue. 80 * 81 * @bug 82 * If the memory allocation fails, this will call exit(1). This is 83 * not ideal behaviour in a library. 84 */ 85 86 /** 87 * @hideinitializer 88 * Macro to expand a dynamic array, zeroing any newly-allocated memory 89 * 90 * @param type_t The type of the array elements 91 * @param[in] n Requested number of elements of type type_t 92 * @param[in,out] m Size of memory allocated 93 * @param[in,out] ptr Pointer to the array 94 * 95 * @discussion 96 * Do not use this macro. Use hts_resize() instead as allows allocation 97 * failures to be handled more gracefully. 98 * 99 * As for hts_expand(), except the bytes that make up the array elements 100 * between the old and new values of @p m are set to zero using memset(). 101 * 102 * @bug 103 * If the memory allocation fails, this will call exit(1). This is 104 * not ideal behaviour in a library. 105 */ 106 107 // For internal use (by hts_resize()) only 108 int hts_resize_array_(size_t, size_t, size_t, void*, void**, int, const(char)*); 109 110 enum HTS_RESIZE_CLEAR = 1; 111 112 /** 113 * @hideinitializer 114 * Macro to expand a dynamic array of a given type 115 * 116 * @param type_t The type of the array elements 117 * @param[in] num Requested number of elements of type type_t 118 * @param[in,out] size_ptr Pointer to where the size (in elements) of the 119 array is stored. 120 * @param[in,out] ptr Location of the pointer to the array 121 * @param[in] flags Option flags 122 * 123 * @return 0 for success, or negative if an error occurred. 124 * 125 * @discussion 126 * The array *ptr will be expanded if necessary so that it can hold @p num 127 * or more elements. If the array is expanded then the new size will be 128 * written to @p *size_ptr and the value in @p *ptr may change. 129 * 130 * If ( @p flags & HTS_RESIZE_CLEAR ) is set, any newly allocated memory will 131 * be cleared. 132 */ 133 134 pragma(inline,true) 135 int hts_resize(T)(size_t num, ref size_t size, T* ptr, int flags) 136 { 137 return (num > size) 138 ? hts_resize_array_(T.sizeof, num, size_t.sizeof, &size, cast(void **)&ptr, flags, __FUNCTION__) 139 : 0; 140 } 141 142 /// Release resources when dlclosing a dynamically loaded HTSlib 143 /** @discussion 144 * Normally HTSlib cleans up automatically when your program exits, 145 * whether that is via exit(3) or returning from main(). However if you 146 * have dlopen(3)ed HTSlib and wish to close it before your main program 147 * exits, you must call hts_lib_shutdown() before dlclose(3). 148 */ 149 void hts_lib_shutdown(); 150 151 /** 152 * Wrapper function for free(). Enables memory deallocation across DLL 153 * boundary. Should be used by all applications, which are compiled 154 * with a different standard library than htslib and call htslib 155 * methods that return dynamically allocated data. 156 */ 157 void hts_free(void* ptr); 158 159 /************ 160 * File I/O * 161 ************/ 162 163 // Add new entries only at the end (but before the *_maximum entry) 164 // of these enums, as their numbering is part of the htslib ABI. 165 166 /// Broad format category (sequence data, variant data, index, regions, etc.) 167 enum htsFormatCategory // @suppress(dscanner.style.phobos_naming_convention) 168 { 169 unknown_category = 0, 170 sequence_data = 1, // Sequence data -- SAM, BAM, CRAM, etc 171 variant_data = 2, // Variant calling data -- VCF, BCF, etc 172 index_file = 3, // Index file associated with some data file 173 region_list = 4, // Coordinate intervals or regions -- BED, etc 174 category_maximum = 32_767 175 } 176 177 /// Specific format (SAM, BAM, CRAM, BCF, VCF, TBI, BED, etc.) 178 enum htsExactFormat // @suppress(dscanner.style.phobos_naming_convention) 179 { 180 unknown_format = 0, 181 binary_format = 1, 182 text_format = 2, 183 sam = 3, 184 bam = 4, 185 bai = 5, 186 cram = 6, 187 crai = 7, 188 vcf = 8, 189 bcf = 9, 190 csi = 10, 191 gzi = 11, 192 tbi = 12, 193 bed = 13, 194 htsget = 14, 195 json = 14, 196 empty_format = 15, // File is empty (or empty after decompression) 197 fasta_format = 16, 198 fastq_format = 17, 199 fai_format = 18, 200 fqi_format = 19, 201 hts_crypt4gh_format = 20, 202 d4_format = 21, 203 format_maximum = 32_767 204 } 205 206 /// Compression type 207 enum htsCompression // @suppress(dscanner.style.phobos_naming_convention) 208 { 209 no_compression = 0, 210 gzip = 1, 211 bgzf = 2, 212 custom = 3, 213 bzip2_compression = 4, 214 razf_compression = 5, 215 xz_compression = 6, 216 zstd_compression = 7, 217 compression_maximum = 32_767 218 } 219 220 struct htsFormat 221 { 222 htsFormatCategory category; /// Broad format category (sequence data, variant data, index, regions, etc.) 223 htsExactFormat format; /// Specific format (SAM, BAM, CRAM, BCF, VCF, TBI, BED, etc.) 224 /// format version 225 struct Vers { short major, minor; } // @suppress(dscanner.style.undocumented_declaration) 226 Vers v; /// format version 227 htsCompression compression; /// Compression type 228 short compression_level;/// currently unused 229 void *specific; /// format specific options; see struct hts_opt. 230 } 231 232 struct hts_idx_t; 233 struct hts_filter_t; 234 235 /** 236 * @brief File handle returned by hts_open() etc. 237 * This structure should be considered opaque by end users. There should be 238 * no need to access most fields directly in user code, and in cases where 239 * it is desirable accessor functions such as hts_get_format() are provided. 240 */ 241 // Maintainers note htsFile cannot be an incomplete struct because some of its 242 // fields are part of libhts.so's ABI (hence these fields must not be moved): 243 // - fp is used in the public sam_itr_next()/etc macros 244 // - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1 245 // - is_write and is_cram are used directly in samtools <= 1.1 246 // - fp is used directly in samtools (up to and including current develop) 247 // - line is used directly in bcftools (up to and including current develop) 248 // - is_bgzf and is_cram flags indicate which fp union member to use. 249 // Note is_bgzf being set does not indicate the flag is BGZF compressed, 250 // nor even whether it is compressed at all (eg on naked BAMs). 251 struct htsFile // @suppress(dscanner.style.phobos_naming_convention) 252 { 253 import std.bitmanip : bitfields; 254 255 mixin(bitfields!( 256 uint, "is_bin", 1, 257 uint, "is_write", 1, 258 uint, "is_be", 1, 259 uint, "is_cram", 1, 260 uint, "is_bgzf", 1, 261 uint, "dummy", 27)); 262 long lineno; /// uncompressed(?) file line no. 263 kstring_t line; /// buffer to hold line 264 char *fn; /// filename 265 char *fn_aux; /// auxillary (i.e, index) file name 266 /// hFile plus any needed bgzf or CRAM (if applicable) structure data 267 union FP { 268 BGZF *bgzf; /// see bgzf.d 269 cram_fd *cram; /// see cram.d 270 hFILE *hfile; /// see hfile.d 271 } 272 FP fp; /// hFile plus any needed bgzf or CRAM (if applicable) structure data 273 void* state; // format specific state information 274 htsFormat format; /// hts file complete file format information 275 hts_idx_t* idx; 276 const(char)* fnidx; 277 sam_hdr_t* bam_header; 278 hts_filter_t* filter; 279 } 280 281 /// A combined thread pool and queue allocation size. 282 /// The pool should already be defined, but qsize may be zero to 283 /// indicate an appropriate queue size is taken from the pool. 284 /// 285 /// Reasons for explicitly setting it could be where many more file 286 /// descriptors are in use than threads, so keeping memory low is 287 /// important. 288 struct htsThreadPool // @suppress(dscanner.style.phobos_naming_convention) 289 { 290 hts_tpool* pool; // The shared thread pool itself 291 int qsize; // Size of I/O queue to use for this fp 292 } 293 294 /// REQUIRED_FIELDS 295 enum sam_fields // @suppress(dscanner.style.phobos_naming_convention) 296 { 297 SAM_QNAME = 0x00000001, 298 SAM_FLAG = 0x00000002, 299 SAM_RNAME = 0x00000004, 300 SAM_POS = 0x00000008, 301 SAM_MAPQ = 0x00000010, 302 SAM_CIGAR = 0x00000020, 303 SAM_RNEXT = 0x00000040, 304 SAM_PNEXT = 0x00000080, 305 SAM_TLEN = 0x00000100, 306 SAM_SEQ = 0x00000200, 307 SAM_QUAL = 0x00000400, 308 SAM_AUX = 0x00000800, 309 SAM_RGAUX = 0x00001000 310 } 311 312 /// Mostly CRAM only, but this could also include other format options 313 enum hts_fmt_option 314 { 315 // CRAM specific 316 CRAM_OPT_DECODE_MD = 0, 317 CRAM_OPT_PREFIX = 1, 318 CRAM_OPT_VERBOSITY = 2, // obsolete, use hts_set_log_level() instead 319 CRAM_OPT_SEQS_PER_SLICE = 3, 320 CRAM_OPT_SLICES_PER_CONTAINER = 4, 321 CRAM_OPT_RANGE = 5, 322 CRAM_OPT_VERSION = 6, // rename to cram_version? 323 CRAM_OPT_EMBED_REF = 7, 324 CRAM_OPT_IGNORE_MD5 = 8, 325 CRAM_OPT_REFERENCE = 9, // make general 326 CRAM_OPT_MULTI_SEQ_PER_SLICE = 10, 327 CRAM_OPT_NO_REF = 11, 328 CRAM_OPT_USE_BZIP2 = 12, 329 CRAM_OPT_SHARED_REF = 13, 330 CRAM_OPT_NTHREADS = 14, // deprecated, use HTS_OPT_NTHREADS 331 CRAM_OPT_THREAD_POOL = 15, // make general 332 CRAM_OPT_USE_LZMA = 16, 333 CRAM_OPT_USE_RANS = 17, 334 CRAM_OPT_REQUIRED_FIELDS = 18, 335 CRAM_OPT_LOSSY_NAMES = 19, 336 CRAM_OPT_BASES_PER_SLICE = 20, 337 CRAM_OPT_STORE_MD = 21, 338 CRAM_OPT_STORE_NM = 22, 339 CRAM_OPT_RANGE_NOSEEK = 23, // CRAM_OPT_RANGE minus the seek 340 CRAM_OPT_USE_TOK = 24, 341 CRAM_OPT_USE_FQZ = 25, 342 CRAM_OPT_USE_ARITH = 26, 343 CRAM_OPT_POS_DELTA = 27, // force delta for AP, even on non-pos sorted data 344 345 // General purpose 346 HTS_OPT_COMPRESSION_LEVEL = 100, 347 HTS_OPT_NTHREADS = 101, 348 HTS_OPT_THREAD_POOL = 102, 349 HTS_OPT_CACHE_SIZE = 103, 350 HTS_OPT_BLOCK_SIZE = 104, 351 HTS_OPT_FILTER = 105, 352 HTS_OPT_PROFILE = 106, 353 354 // Fastq 355 356 // Boolean. 357 // Read / Write CASAVA 1.8 format. 358 // See https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf 359 // 360 // The CASAVA tag matches \d:[YN]:\d+:[ACGTN]+ 361 // The first \d is read 1/2 (1 or 2), [YN] is QC-PASS/FAIL flag, 362 // \d+ is a control number, and the sequence at the end is 363 // for barcode sequence. Barcodes are read into the aux tag defined 364 // by FASTQ_OPT_BARCODE ("BC" by default). 365 FASTQ_OPT_CASAVA = 1000, 366 367 // String. 368 // Whether to read / write extra SAM format aux tags from the fastq 369 // identifier line. For reading this can simply be "1" to request 370 // decoding aux tags. For writing it is a comma separated list of aux 371 // tag types to be written out. 372 FASTQ_OPT_AUX = 1001, 373 374 // Boolean. 375 // Whether to add /1 and /2 to read identifiers when writing FASTQ. 376 // These come from the BAM_FREAD1 or BAM_FREAD2 flags. 377 // (Detecting the /1 and /2 is automatic when reading fastq.) 378 FASTQ_OPT_RNUM = 1002, 379 380 // Two character string. 381 // Barcode aux tag for CASAVA; defaults to "BC". 382 FASTQ_OPT_BARCODE = 1003, 383 384 // Process SRA and ENA read names which pointlessly move the original 385 // name to the second field and insert a constructed <run>.<number> 386 // name in its place. 387 FASTQ_OPT_NAME2 = 1004 388 } 389 390 /// Profile options for encoding; primarily used at present in CRAM 391 /// but also usable in BAM as a synonym for deflate compression levels. 392 enum hts_profile_option 393 { 394 HTS_PROFILE_FAST = 0, 395 HTS_PROFILE_NORMAL = 1, 396 HTS_PROFILE_SMALL = 2, 397 HTS_PROFILE_ARCHIVE = 3 398 } 399 400 /// For backwards compatibility 401 alias cram_option = hts_fmt_option; 402 403 /// Options for cache, (de)compression, threads, CRAM, etc. 404 struct hts_opt // @suppress(dscanner.style.phobos_naming_convention) 405 { 406 char *arg; /// string form, strdup()ed 407 hts_fmt_option opt; /// tokenised key 408 /// option value 409 union VAL { /// ... and value 410 int i; /// int value 411 char *s; /// string value 412 } 413 VAL val; /// value 414 hts_opt *next; /// next option (linked list) 415 } 416 417 /* 418 * Explicit index file name delimiter, see below 419 */ 420 enum HTS_IDX_DELIM = "##idx##"; 421 422 /********************** 423 * Exported functions * 424 **********************/ 425 426 /** 427 * Parses arg and appends it to the option list. 428 * 429 * Returns 0 on success; 430 * -1 on failure. 431 */ 432 int hts_opt_add(hts_opt** opts, const(char)* c_arg); 433 434 /** 435 * Applies an hts_opt option list to a given htsFile. 436 * 437 * Returns 0 on success 438 * -1 on failure 439 */ 440 int hts_opt_apply(htsFile* fp, hts_opt* opts); 441 442 /** 443 * Frees an hts_opt list. 444 */ 445 void hts_opt_free(hts_opt* opts); 446 447 /** 448 * Accepts a string file format (sam, bam, cram, vcf, bam) optionally 449 * followed by a comma separated list of key=value options and splits 450 * these up into the fields of htsFormat struct. 451 * 452 * Returns 0 on success 453 * -1 on failure. 454 */ 455 int hts_parse_format(htsFormat* opt, const(char)* str); 456 457 /** 458 * Tokenise options as (key(=value)?,)*(key(=value)?)? 459 * NB: No provision for ',' appearing in the value! 460 * Add backslashing rules? 461 * 462 * This could be used as part of a general command line option parser or 463 * as a string concatenated onto the file open mode. 464 * 465 * Returns 0 on success 466 * -1 on failure. 467 */ 468 int hts_parse_opt_list(htsFormat* opt, const(char)* str); 469 470 /*! @abstract Table for converting a nucleotide character to 4-bit encoding. 471 The input character may be either an IUPAC ambiguity code, '=' for 0, or 472 '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 473 for A/C/G/T or combinations of these bits for ambiguous bases. 474 */ 475 476 version(Windows){ 477 __gshared const(ubyte)[256] seq_nt16_table = [ 478 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 479 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 480 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 481 1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15, 482 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 483 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, 484 15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15, 485 15,15, 5, 6, 8,15, 7, 9, 15,10,15,15, 15,15,15,15, 486 487 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 488 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 489 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 490 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 491 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 492 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 493 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, 494 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15 495 ]; 496 }else{ 497 extern __gshared const(ubyte)[256] seq_nt16_table; 498 } 499 /**! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC 500 ambiguity code letter (or '=' when given 0). 501 */ 502 503 version(Windows) __gshared const(char)[16] seq_nt16_str = ['=','A','C','M','G','R','S','V','T','W','Y','H','K','D','B','N']; 504 else extern __gshared const(char)[16] seq_nt16_str; 505 506 /**! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. 507 Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). 508 */ 509 version(Windows) __gshared const(int)[16] seq_nt16_int = [ 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 ]; 510 else extern __gshared const(int)[16] seq_nt16_int; 511 /**! 512 @abstract Get the htslib version number 513 @return For released versions, a string like "N.N[.N]"; or git describe 514 output if using a library built within a Git repository. 515 */ 516 const(char)* hts_version(); 517 518 /**! 519 @abstract Compile-time HTSlib version number, for use in #if checks 520 @return For released versions X.Y[.Z], an integer of the form XYYYZZ; 521 useful for preprocessor conditionals such as 522 #if HTS_VERSION >= 101000 // Check for v1.10 or later 523 */ 524 // Maintainers: Bump this in the final stage of preparing a new release. 525 // Immediately after release, bump ZZ to 90 to distinguish in-development 526 // Git repository builds from the release; you may wish to increment this 527 // further when significant features are merged. 528 enum HTS_VERSION = 101600; 529 530 /**! @abstract Introspection on the features enabled in htslib 531 * 532 * @return a bitfield of HTS_FEATURE_* macros. 533 */ 534 uint hts_features(); 535 536 const(char)* hts_test_feature(uint id); 537 538 /**! @abstract Introspection on the features enabled in htslib, string form 539 * 540 * @return a string describing htslib build features 541 */ 542 const(char)* hts_feature_string(); 543 544 /// Whether ./configure was used or vanilla Makefile 545 enum HTS_FEATURE_CONFIGURE = 1; 546 547 /// Whether --enable-plugins was used 548 enum HTS_FEATURE_PLUGINS = 2; 549 550 /// Transport specific 551 enum HTS_FEATURE_LIBCURL = 1u << 10; 552 enum HTS_FEATURE_S3 = 1u << 11; 553 enum HTS_FEATURE_GCS = 1u << 12; 554 555 /// Compression options 556 enum HTS_FEATURE_LIBDEFLATE = 1u << 20; 557 enum HTS_FEATURE_LZMA = 1u << 21; 558 enum HTS_FEATURE_BZIP2 = 1u << 22; 559 enum HTS_FEATURE_HTSCODECS = 1u << 23; // htscodecs library version 560 561 /// Build params 562 enum HTS_FEATURE_CC = 1u << 27; 563 enum HTS_FEATURE_CFLAGS = 1u << 28; 564 enum HTS_FEATURE_CPPFLAGS = 1u << 29; 565 enum HTS_FEATURE_LDFLAGS = 1u << 30; 566 567 /**! 568 @abstract Determine format by peeking at the start of a file 569 @param fp File opened for reading, positioned at the beginning 570 @param fmt Format structure that will be filled out on return 571 @return 0 for success, or negative if an error occurred. 572 573 Equivalent to hts_detect_format2(fp, NULL, fmt). 574 */ 575 int hts_detect_format(hFILE* fp, htsFormat* fmt); 576 577 /**History: ! 578 @abstract Determine format primarily by peeking at the start of a file 579 @param fp File opened for reading, positioned at the beginning 580 @param fname Name of the file, or NULL if not available 581 @param fmt Format structure that will be filled out on return 582 @return 0 for success, or negative if an error occurred. 583 @since 1.15 584 585 Some formats are only recognised if the filename is available and has the 586 expected extension, as otherwise more generic files may be misrecognised. 587 In particular: 588 - FASTA/Q indexes must have .fai/.fqi extensions; without this requirement, 589 some similar BED files would be misrecognised as indexes. 590 */ 591 int hts_detect_format2(hFILE* fp, const(char)* fname, htsFormat* fmt); 592 593 /**! 594 @abstract Get a human-readable description of the file format 595 @param fmt Format structure holding type, version, compression, etc. 596 @return Description string, to be freed by the caller after use. 597 */ 598 char* hts_format_description(const(htsFormat)* format); 599 600 /**! 601 @abstract Open a sequence data (SAM/BAM/CRAM) or variant data (VCF/BCF) 602 or possibly-compressed textual line-orientated file 603 @param fn The file name or "-" for stdin/stdout. For indexed files 604 with a non-standard naming, the file name can include the 605 name of the index file delimited with HTS_IDX_DELIM 606 @param mode Mode matching / [rwa][bcefFguxz0-9]* / 607 @discussion 608 With 'r' opens for reading; any further format mode letters are ignored 609 as the format is detected by checking the first few bytes or BGZF blocks 610 of the file. With 'w' or 'a' opens for writing or appending, with format 611 specifier letters: 612 b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) 613 c CRAM format 614 f FASTQ format 615 F FASTA format 616 g gzip compressed 617 u uncompressed 618 z bgzf compressed 619 [0-9] zlib compression level 620 and with non-format option letters (for any of 'r'/'w'/'a'): 621 e close the file on exec(2) (opens with O_CLOEXEC, where supported) 622 x create the file exclusively (opens with O_EXCL, where supported) 623 Note that there is a distinction between 'u' and '0': the first yields 624 plain uncompressed output whereas the latter outputs uncompressed data 625 wrapped in the zlib format. 626 @example 627 [rw]b .. compressed BCF, BAM, FAI 628 [rw]bu .. uncompressed BCF 629 [rw]z .. compressed VCF 630 [rw] .. uncompressed VCF 631 */ 632 htsFile* hts_open(const(char)* fn, const(char)* mode); 633 634 /**! 635 @abstract Open a SAM/BAM/CRAM/VCF/BCF/etc file 636 @param fn The file name or "-" for stdin/stdout 637 @param mode Open mode, as per hts_open() 638 @param fmt Optional format specific parameters 639 @discussion 640 See hts_open() for description of fn and mode. 641 // TODO Update documentation for s/opts/fmt/ 642 Opts contains a format string (sam, bam, cram, vcf, bcf) which will, 643 if defined, override mode. Opts also contains a linked list of hts_opt 644 structures to apply to the open file handle. These can contain things 645 like pointers to the reference or information on compression levels, 646 block sizes, etc. 647 */ 648 htsFile* hts_open_format( 649 const(char)* fn, 650 const(char)* mode, 651 const(htsFormat)* fmt); 652 653 /**! 654 @abstract For output streams, flush any buffered data 655 @param fp The file handle to be flushed 656 @return 0 for success, or negative if an error occurred. 657 @since 1.14 658 */ 659 int hts_flush(htsFile* fp); 660 661 /**! 662 @abstract Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file 663 @param fn The already-open file handle 664 @param mode Open mode, as per hts_open() 665 */ 666 htsFile* hts_hopen(hFILE* fp, const(char)* fn, const(char)* mode); 667 668 /**! 669 @abstract Close a file handle, flushing buffered data for output streams 670 @param fp The file handle to be closed 671 @return 0 for success, or negative if an error occurred. 672 */ 673 int hts_close(htsFile* fp); 674 675 /**! 676 @abstract Returns the file's format information 677 @param fp The file handle 678 @return Read-only pointer to the file's htsFormat. 679 */ 680 const(htsFormat)* hts_get_format(htsFile* fp); 681 682 /**! 683 @ abstract Returns a string containing the file format extension. 684 @ param format Format structure containing the file type. 685 @ return A string ("sam", "bam", etc) or "?" for unknown formats. 686 */ 687 const(char)* hts_format_file_extension(const(htsFormat)* format); 688 689 /**! 690 @abstract Sets a specified CRAM option on the open file handle. 691 @param fp The file handle open the open file. 692 @param opt The CRAM_OPT_* option. 693 @param ... Optional arguments, dependent on the option used. 694 @return 0 for success, or negative if an error occurred. 695 */ 696 int hts_set_opt(htsFile* fp, hts_fmt_option opt, ...); 697 698 /**! 699 @abstract Read a line (and its \n or \r\n terminator) from a file 700 @param fp The file handle 701 @param delimiter Unused, but must be '\n' (or KS_SEP_LINE) 702 @param str The line (not including the terminator) is written here 703 @return Length of the string read (capped at INT_MAX); 704 -1 on end-of-file; <= -2 on error 705 */ 706 int hts_getline(htsFile* fp, int delimiter, kstring_t* str); 707 708 char** hts_readlines(const(char)* fn, int* _n); 709 /**! 710 @abstract Parse comma-separated list or read list from a file 711 @param list File name or comma-separated list 712 @param is_file 713 @param _n Size of the output array (number of items read) 714 @return NULL on failure or pointer to newly allocated array of 715 strings 716 */ 717 char** hts_readlist(const(char)* fn, int is_file, int* _n); 718 719 /**! 720 @abstract Create extra threads to aid compress/decompression for this file 721 @param fp The file handle 722 @param n The number of worker threads to create 723 @return 0 for success, or negative if an error occurred. 724 @notes This function creates non-shared threads for use solely by fp. 725 The hts_set_thread_pool function is the recommended alternative. 726 */ 727 int hts_set_threads(htsFile* fp, int n); 728 729 /**! 730 @abstract Create extra threads to aid compress/decompression for this file 731 @param fp The file handle 732 @param p A pool of worker threads, previously allocated by hts_create_threads(). 733 @return 0 for success, or negative if an error occurred. 734 */ 735 int hts_set_thread_pool(htsFile* fp, htsThreadPool* p); 736 737 /**! 738 @abstract Adds a cache of decompressed blocks, potentially speeding up seeks. 739 This may not work for all file types (currently it is bgzf only). 740 @param fp The file handle 741 @param n The size of cache, in bytes 742 */ 743 void hts_set_cache_size(htsFile* fp, int n); 744 745 /*! 746 @abstract Set .fai filename for a file opened for reading 747 @return 0 for success, negative on failure 748 @discussion 749 Called before *_hdr_read(), this provides the name of a .fai file 750 used to provide a reference list if the htsFile contains no @SQ headers. 751 */ 752 int hts_set_fai_filename(htsFile* fp, const(char)* fn_aux); 753 754 /**! 755 @abstract Sets a filter expression 756 @return 0 for success, negative on failure 757 @discussion 758 To clear an existing filter, specifying expr as NULL. 759 */ 760 int hts_set_filter_expression(htsFile* fp, const(char)* expr); 761 762 /**! 763 @abstract Determine whether a given htsFile contains a valid EOF block 764 @return 3 for a non-EOF checkable filetype; 765 2 for an unseekable file type where EOF cannot be checked; 766 1 for a valid EOF block; 767 0 for if the EOF marker is absent when it should be present; 768 -1 (with errno set) on failure 769 @discussion 770 Check if the BGZF end-of-file (EOF) marker is present 771 */ 772 int hts_check_EOF(htsFile* fp); 773 774 /************ 775 * Indexing * 776 ************/ 777 778 /**! 779 These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc, 780 producing iterators operating as follows: 781 - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file 782 - HTS_IDX_START iterates over the entire file 783 - HTS_IDX_REST iterates from the current position to the end of the file 784 - HTS_IDX_NONE always returns "no more alignment records" 785 When one of these special tid values is used, beg and end are ignored. 786 When REST or NONE is used, idx is also ignored and may be NULL. 787 */ 788 enum HTS_IDX_NOCOOR = -2; 789 enum HTS_IDX_START = -3; 790 enum HTS_IDX_REST = -4; 791 enum HTS_IDX_NONE = -5; 792 793 enum HTS_FMT_CSI = 0; 794 enum HTS_FMT_BAI = 1; 795 enum HTS_FMT_TBI = 2; 796 enum HTS_FMT_CRAI = 3; 797 enum HTS_FMT_FAI = 4; 798 799 // Almost INT64_MAX, but when cast into a 32-bit int it's 800 // also INT_MAX instead of -1. This avoids bugs with old code 801 // using the new hts_pos_t data type. 802 enum HTS_POS_MAX = ((cast(long) INT_MAX) << 32) | INT_MAX; 803 enum HTS_POS_MIN = INT64_MIN; 804 enum PRIhts_pos = PRId64; 805 alias hts_pos_t = c_long; 806 807 // For comparison with previous release: 808 // 809 // #define HTS_POS_MAX INT_MAX 810 // #define HTS_POS_MIN INT_MIN 811 // #define PRIhts_pos PRId32 812 // typedef int32_t hts_pos_t; 813 814 struct hts_pair_pos_t 815 { 816 hts_pos_t beg; 817 hts_pos_t end; 818 } 819 820 alias hts_pair32_t = hts_pair_pos_t; // For backwards compatibility 821 822 struct hts_pair64_t // @suppress(dscanner.style.phobos_naming_convention) 823 { 824 ulong u; 825 ulong v; 826 } 827 828 /// 64-bit start, end coordinate pair tracking max (internally used in hts.c) 829 struct hts_pair64_max_t // @suppress(dscanner.style.phobos_naming_convention) 830 { 831 ulong u; 832 ulong v; 833 ulong max; 834 } 835 836 /// Region list used in iterators (NB: apparently confined to single contig/tid) 837 struct hts_reglist_t 838 { 839 const(char) *reg; /// Region string 840 hts_pair_pos_t *intervals; /// (start,end) intervals 841 int tid; /// Contig id 842 uint count; /// How many intervals 843 hts_pos_t min_beg; /// absolute bounds 844 hts_pos_t max_end; /// absolute bounds 845 } 846 847 alias hts_readrec_func = int function(BGZF* fp, void* data, void* r, int* tid, hts_pos_t* beg, hts_pos_t* end); 848 alias hts_seek_func = int function(void* fp, long offset, int where); 849 alias hts_tell_func = c_long function(void* fp); 850 851 /** 852 * @brief File iterator that can handle multiple target regions. 853 * This structure should be considered opaque by end users. 854 * It does both the stepping inside the file and the filtering of alignments. 855 * It can operate in single or multi-region mode, and depending on this, 856 * it uses different fields. 857 * 858 * read_rest (1) - read everything from the current offset, without filtering 859 * finished (1) - no more iterations 860 * is_cram (1) - current file has CRAM format 861 * nocoor (1) - read all unmapped reads 862 * 863 * multi (1) - multi-region moode 864 * reg_list - List of target regions 865 * n_reg - Size of the above list 866 * curr_reg - List index of the current region of search 867 * curr_intv - Interval index inside the current region; points to a (beg, end) 868 * end - Used for CRAM files, to preserve the max end coordinate 869 * 870 * multi (0) - single-region mode 871 * tid - Reference id of the target region 872 * beg - Start position of the target region 873 * end - End position of the target region 874 * 875 * Common fields: 876 * off - List of file offsets computed from the index 877 * n_off - Size of the above list 878 * i - List index of the current file offset 879 * curr_off - File offset for the next file read 880 * curr_tid - Reference id of the current alignment 881 * curr_beg - Start position of the current alignment 882 * curr_end - End position of the current alignment 883 * nocoor_off - File offset where the unmapped reads start 884 * 885 * readrec - File specific function that reads an alignment 886 * seek - File specific function for changing the file offset 887 * tell - File specific function for indicating the file offset 888 */ 889 890 struct hts_itr_t // @suppress(dscanner.style.phobos_naming_convention) 891 { 892 import std.bitmanip : bitfields; 893 894 mixin(bitfields!( 895 uint, "read_rest", 1, 896 uint, "finished", 1, 897 uint, "is_cram", 1, 898 uint, "nocoor", 1, 899 uint, "multi", 1, 900 uint, "dummy", 27)); 901 902 int tid; 903 int n_off; 904 int i; 905 int n_reg; 906 hts_pos_t beg; 907 hts_pos_t end; 908 hts_reglist_t* reg_list; 909 int curr_tid; 910 int curr_reg; 911 int curr_intv; 912 hts_pos_t curr_beg; 913 hts_pos_t curr_end; 914 ulong curr_off; 915 ulong nocoor_off; 916 hts_pair64_max_t* off; 917 int function() readrec; 918 int function() seek; 919 long function() tell; 920 921 struct Bins 922 { 923 int n; 924 int m; 925 int* a; 926 } 927 928 Bins bins; 929 } 930 /// ? index key 931 struct aux_key_t { // @suppress(dscanner.style.phobos_naming_convention) 932 int key; /// ??? 933 /// ??? 934 ulong min_off, max_off; 935 } 936 937 alias hts_itr_multi_t = hts_itr_t; 938 939 pragma(inline, true) 940 extern (D) auto hts_bin_first(T)(auto ref T l) 941 { 942 return ((1 << ((l << 1) + l)) - 1) / 7; 943 } 944 945 pragma(inline, true) 946 extern (D) auto hts_bin_parent(T)(auto ref T b) 947 { 948 return (b - 1) >> 3; 949 } 950 951 /////////////////////////////////////////////////////////// 952 // Low-level API for building indexes. 953 954 /// Create a BAI/CSI/TBI type index structure 955 /** @param n Initial number of targets 956 @param fmt Format, one of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI 957 @param offset0 Initial file offset 958 @param min_shift Number of bits for the minimal interval 959 @param n_lvls Number of levels in the binning index 960 @return An initialised hts_idx_t struct on success; NULL on failure 961 962 The struct returned by a successful call should be freed via hts_idx_destroy() 963 when it is no longer needed. 964 */ 965 hts_idx_t* hts_idx_init( 966 int n, 967 int fmt, 968 ulong offset0, 969 int min_shift, 970 int n_lvls); 971 972 /// Free a BAI/CSI/TBI type index 973 /** @param idx Index structure to free 974 */ 975 void hts_idx_destroy(hts_idx_t* idx); 976 977 /// Push an index entry 978 /** @param idx Index 979 @param tid Target id 980 @param beg Range start (zero-based) 981 @param end Range end (zero-based, half-open) 982 @param offset File offset 983 @param is_mapped Range corresponds to a mapped read 984 @return 0 on success; -1 on failure 985 986 The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts 987 stored in the meta-data bin. 988 */ 989 int hts_idx_push( 990 hts_idx_t* idx, 991 int tid, 992 hts_pos_t beg, 993 hts_pos_t end, 994 ulong offset, 995 int is_mapped); 996 997 /// Finish building an index 998 /** @param idx Index 999 @param final_offset Last file offset 1000 @return 0 on success; non-zero on failure. 1001 */ 1002 int hts_idx_finish(hts_idx_t* idx, ulong final_offset); 1003 1004 /// Returns index format 1005 /** @param idx Index 1006 @return One of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI 1007 */ 1008 int hts_idx_fmt(hts_idx_t* idx); 1009 1010 /// Add name to TBI index meta-data 1011 /** @param idx Index 1012 @param tid Target identifier 1013 @param name Target name 1014 @return Index number of name in names list on success; -1 on failure. 1015 */ 1016 int hts_idx_tbi_name(hts_idx_t* idx, int tid, const(char)* name); 1017 1018 // Index loading and saving 1019 1020 /// Save an index to a file 1021 /** @param idx Index to be written 1022 @param fn Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added 1023 @param fmt One of the HTS_FMT_* index formats 1024 @return 0 if successful, or negative if an error occurred. 1025 */ 1026 int hts_idx_save(const(hts_idx_t)* idx, const(char)* fn, int fmt); 1027 1028 /// Save an index to a specific file 1029 /** @param idx Index to be written 1030 @param fn Input BAM/BCF/etc filename 1031 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 1032 @param fmt One of the HTS_FMT_* index formats 1033 @return 0 if successful, or negative if an error occurred. 1034 */ 1035 int hts_idx_save_as( 1036 const(hts_idx_t)* idx, 1037 const(char)* fn, 1038 const(char)* fnidx, 1039 int fmt); 1040 1041 /// Load an index file 1042 /** @param fn BAM/BCF/etc filename, to which .bai/.csi/etc will be added or 1043 the extension substituted, to search for an existing index file. 1044 In case of a non-standard naming, the file name can include the 1045 name of the index file delimited with HTS_IDX_DELIM. 1046 @param fmt One of the HTS_FMT_* index formats 1047 @return The index, or NULL if an error occurred. 1048 1049 If @p fn contains the string "##idx##" (HTS_IDX_DELIM), the part before 1050 the delimiter will be used as the name of the data file and the part after 1051 it will be used as the name of the index. 1052 1053 Otherwise, this function tries to work out the index name as follows: 1054 1055 It will try appending ".csi" to @p fn 1056 It will try substituting an existing suffix (e.g. .bam, .vcf) with ".csi" 1057 Then, if @p fmt is HTS_FMT_BAI: 1058 It will try appending ".bai" to @p fn 1059 To will substituting the existing suffix (e.g. .bam) with ".bai" 1060 else if @p fmt is HTS_FMT_TBI: 1061 It will try appending ".tbi" to @p fn 1062 To will substituting the existing suffix (e.g. .vcf) with ".tbi" 1063 1064 If the index file is remote (served over a protocol like https), first a check 1065 is made to see is a locally cached copy is available. This is done for all 1066 of the possible names listed above. If a cached copy is not available then 1067 the index will be downloaded and stored in the current working directory, 1068 with the same name as the remote index. 1069 1070 Equivalent to hts_idx_load3(fn, NULL, fmt, HTS_IDX_SAVE_REMOTE); 1071 */ 1072 hts_idx_t* hts_idx_load(const(char)* fn, int fmt); 1073 1074 /// Load a specific index file 1075 /** @param fn Input BAM/BCF/etc filename 1076 @param fnidx The input index filename 1077 @return The index, or NULL if an error occurred. 1078 1079 Equivalent to hts_idx_load3(fn, fnidx, 0, 0); 1080 1081 This function will not attempt to save index files locally. 1082 */ 1083 hts_idx_t* hts_idx_load2(const(char)* fn, const(char)* fnidx); 1084 1085 /// Load a specific index file 1086 /** @param fn Input BAM/BCF/etc filename 1087 @param fnidx The input index filename 1088 @param fmt One of the HTS_FMT_* index formats 1089 @param flags Flags to alter behaviour (see description) 1090 @return The index, or NULL if an error occurred. 1091 1092 If @p fnidx is NULL, the index name will be derived from @p fn in the 1093 same way as hts_idx_load(). 1094 1095 If @p fnidx is not NULL, @p fmt is ignored. 1096 1097 The @p flags parameter can be set to a combination of the following 1098 values: 1099 1100 HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes 1101 HTS_IDX_SILENT_FAIL Fail silently if the index is not present 1102 1103 The index struct returned by a successful call should be freed 1104 via hts_idx_destroy() when it is no longer needed. 1105 */ 1106 hts_idx_t* hts_idx_load3( 1107 const(char)* fn, 1108 const(char)* fnidx, 1109 int fmt, 1110 int flags); 1111 1112 /// Flags for hts_idx_load3() ( and also sam_idx_load3(), tbx_idx_load3() ) 1113 enum HTS_IDX_FLAG : int 1114 { 1115 HTS_IDX_SAVE_REMOTE = 1, 1116 HTS_IDX_SILENT_FAIL = 2 1117 } 1118 /////////////////////////////////////////////////////////// 1119 // Functions for accessing meta-data stored in indexes 1120 1121 alias hts_id2name_f = const(char)* function(void*, int); 1122 1123 /// Get extra index meta-data 1124 /** @param idx The index 1125 @param l_meta Pointer to where the length of the extra data is stored 1126 @return Pointer to the extra data if present; NULL otherwise 1127 1128 Indexes (both .tbi and .csi) made by tabix include extra data about 1129 the indexed file. The returns a pointer to this data. Note that the 1130 data is stored exactly as it is in the index. Callers need to interpret 1131 the results themselves, including knowing what sort of data to expect; 1132 byte swapping etc. 1133 */ 1134 ubyte* hts_idx_get_meta(hts_idx_t* idx, uint* l_meta); 1135 1136 /// Set extra index meta-data 1137 /** @param idx The index 1138 @param l_meta Length of data 1139 @param meta Pointer to the extra data 1140 @param is_copy If not zero, a copy of the data is taken 1141 @return 0 on success; -1 on failure (out of memory). 1142 1143 Sets the data that is returned by hts_idx_get_meta(). 1144 1145 If is_copy != 0, a copy of the input data is taken. If not, ownership of 1146 the data pointed to by *meta passes to the index. 1147 */ 1148 int hts_idx_set_meta(hts_idx_t* idx, uint l_meta, ubyte* meta, int is_copy); 1149 1150 /// Get number of mapped and unmapped reads from an index 1151 /** @param idx Index 1152 @param tid Target ID 1153 @param[out] mapped Location to store number of mapped reads 1154 @param[out] unmapped Location to store number of unmapped reads 1155 @return 0 on success; -1 on failure (data not available) 1156 1157 BAI and CSI indexes store information on the number of reads for each 1158 target that were mapped or unmapped (unmapped reads will generally have 1159 a paired read that is mapped to the target). This function returns this 1160 information if it is available. 1161 1162 @note Cram CRAI indexes do not include this information. 1163 */ 1164 int hts_idx_get_stat( 1165 const(hts_idx_t)* idx, 1166 int tid, 1167 ulong* mapped, 1168 ulong* unmapped); 1169 1170 /// Return the number of unplaced reads from an index 1171 /** @param idx Index 1172 @return Unplaced reads count 1173 1174 Unplaced reads are not linked to any reference (e.g. RNAME is '*' in SAM 1175 files). 1176 */ 1177 ulong hts_idx_get_n_no_coor(const(hts_idx_t)* idx); 1178 1179 /// Return a list of target names from an index 1180 /** @param idx Index 1181 @param[out] n Location to store the number of targets 1182 @param getid Callback function to get the name for a target ID 1183 @param hdr Header from indexed file 1184 @return An array of pointers to the names on success; NULL on failure 1185 1186 @note The names are pointers into the header data structure. When cleaning 1187 up, only the array should be freed, not the names. 1188 */ 1189 const(char *)* hts_idx_seqnames(const(hts_idx_t)* idx, int* n, hts_id2name_f getid, void* hdr); // free only the array, not the values 1190 1191 /// Return the number of targets from an index 1192 /** @param idx Index 1193 @return The number of targets 1194 */ 1195 int hts_idx_nseq(const(hts_idx_t) *idx); 1196 1197 /////////////////////////////////////////////////////////// 1198 // Region parsing 1199 1200 enum HTS_PARSE_FLAGS : int 1201 { 1202 HTS_PARSE_THOUSANDS_SEP = 1, ///< Ignore ',' separators within numbers 1203 HTS_PARSE_ONE_COORD = 2, ///< chr:pos means chr:pos-pos and not chr:pos-end 1204 HTS_PARSE_LIST = 4, ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP) 1205 } 1206 1207 /// Parse a numeric string 1208 /** The number may be expressed in scientific notation, and optionally may 1209 contain commas in the integer part (before any decimal point or E notation). 1210 @param str String to be parsed 1211 @param strend If non-NULL, set on return to point to the first character 1212 in @a str after those forming the parsed number 1213 @param flags Or'ed-together combination of HTS_PARSE_* flags 1214 @return Integer value of the parsed number, or 0 if no valid number 1215 1216 The input string is parsed as: optional whitespace; an optional '+' or 1217 '-' sign; decimal digits possibly including ',' characters (if @a flags 1218 includes HTS_PARSE_THOUSANDS_SEP) and a '.' decimal point; and an optional 1219 case-insensitive suffix, which may be either 'k', 'M', 'G', or scientific 1220 notation consisting of 'e'/'E' followed by an optional '+' or '-' sign and 1221 decimal digits. To be considered a valid numeric value, the main part (not 1222 including any suffix or scientific notation) must contain at least one 1223 digit (either before or after the decimal point). 1224 1225 When @a strend is NULL, @a str is expected to contain only (optional 1226 whitespace followed by) the numeric value. A warning will be printed 1227 (if hts_verbose is HTS_LOG_WARNING or more) if no valid parsable number 1228 is found or if there are any unused characters after the number. 1229 1230 When @a strend is non-NULL, @a str starts with (optional whitespace 1231 followed by) the numeric value. On return, @a strend is set to point 1232 to the first unused character after the numeric value, or to @a str 1233 if no valid parsable number is found. 1234 */ 1235 long hts_parse_decimal(const(char)* str, char** strend, HTS_PARSE_FLAGS flags); 1236 1237 alias hts_name2id_f = int function(void*, const(char)*) *; 1238 1239 /// Parse a "CHR:START-END"-style region string 1240 /** @param str String to be parsed 1241 @param beg Set on return to the 0-based start of the region 1242 @param end Set on return to the 1-based end of the region 1243 @return Pointer to the colon or '\0' after the reference sequence name, 1244 or NULL if @a str could not be parsed. 1245 1246 NOTE: For compatibility with hts_parse_reg only. 1247 Please use hts_parse_region instead. 1248 */ 1249 const(char)* hts_parse_reg64(const(char)* str, hts_pos_t* beg, hts_pos_t* end); 1250 1251 /// Parse a "CHR:START-END"-style region string 1252 /** @param str String to be parsed 1253 @param beg Set on return to the 0-based start of the region 1254 @param end Set on return to the 1-based end of the region 1255 @return Pointer to the colon or '\0' after the reference sequence name, 1256 or NULL if @a str could not be parsed. 1257 */ 1258 const(char)* hts_parse_reg(const(char)* str, int* beg, int* end); 1259 1260 /// Parse a "CHR:START-END"-style region string 1261 /** @param str String to be parsed 1262 @param tid Set on return (if not NULL) to be reference index (-1 if invalid) 1263 @param beg Set on return to the 0-based start of the region 1264 @param end Set on return to the 1-based end of the region 1265 @param getid Function pointer. Called if not NULL to set tid. 1266 @param hdr Caller data passed to getid. 1267 @param flags Bitwise HTS_PARSE_* flags listed above. 1268 @return Pointer to the byte after the end of the entire region 1269 specifier (including any trailing comma) on success, 1270 or NULL if @a str could not be parsed. 1271 1272 A variant of hts_parse_reg which is reference-id aware. It uses 1273 the iterator name2id callbacks to validate the region tokenisation works. 1274 1275 This is necessary due to GRCh38 HLA additions which have reference names 1276 like "HLA-DRB1*12:17". 1277 1278 To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" 1279 are reference names, quote using curly braces. 1280 Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. 1281 1282 Flags are used to control how parsing works, and can be one of the below. 1283 1284 HTS_PARSE_THOUSANDS_SEP: 1285 Ignore commas in numbers. For example with this flag 1,234,567 1286 is interpreted as 1234567. 1287 1288 HTS_PARSE_LIST: 1289 If present, the region is assmed to be a comma separated list and 1290 position parsing will not contain commas (this implicitly 1291 clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). 1292 On success the return pointer will be the start of the next region, ie 1293 the character after the comma. (If *ret != '\0' then the caller can 1294 assume another region is present in the list.) 1295 1296 If not set then positions may contain commas. In this case the return 1297 value should point to the end of the string, or NULL on failure. 1298 1299 HTS_PARSE_ONE_COORD: 1300 If present, X:100 is treated as the single base pair region X:100-100. 1301 In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>. 1302 (This is the standard bcftools region convention.) 1303 1304 When not set X:100 is considered to be X:100-<end> where <end> is 1305 the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are 1306 invalid. 1307 (This is the standard samtools region convention.) 1308 1309 Note the supplied string expects 1 based inclusive coordinates, but the 1310 returned coordinates start from 0 and are half open, so pos0 is valid 1311 for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" 1312 1313 If NULL is returned, the value in tid mat give additional information 1314 about the error: 1315 1316 -2 Failed to parse @p hdr; or out of memory 1317 -1 The reference in @p str has mismatched braces, or does not 1318 exist in @p hdr 1319 >= 0 The specified range in @p str could not be parsed 1320 */ 1321 const(char)* hts_parse_region( 1322 const(char)* s, 1323 int* tid, 1324 hts_pos_t* beg, 1325 hts_pos_t* end, 1326 hts_name2id_f getid, 1327 void* hdr, 1328 HTS_PARSE_FLAGS flags); 1329 1330 /////////////////////////////////////////////////////////// 1331 // Generic iterators 1332 // 1333 // These functions provide the low-level infrastructure for iterators. 1334 // Wrappers around these are used to make iterators for specific file types. 1335 // See: 1336 // htslib/sam.h for SAM/BAM/CRAM iterators 1337 // htslib/vcf.h for VCF/BCF iterators 1338 // htslib/tbx.h for files indexed by tabix 1339 1340 /// Create a single-region iterator 1341 /** @param idx Index 1342 @param tid Target ID 1343 @param beg Start of region 1344 @param end End of region 1345 @param readrec Callback to read a record from the input file 1346 @return An iterator on success; NULL on failure 1347 1348 The iterator struct returned by a successful call should be freed 1349 via hts_itr_destroy() when it is no longer needed. 1350 */ 1351 hts_itr_t* hts_itr_query( 1352 const(hts_idx_t)* idx, 1353 int tid, 1354 hts_pos_t beg, 1355 hts_pos_t end, 1356 hts_readrec_func readrec); 1357 1358 /// Free an iterator 1359 /** @param iter Iterator to free 1360 */ 1361 void hts_itr_destroy(hts_itr_t* iter); 1362 1363 alias hts_itr_query_func = hts_itr_t* function(const(hts_idx_t)* idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func readrec); 1364 1365 /// Create a single-region iterator from a text region specification 1366 /** @param idx Index 1367 @param reg Region specifier 1368 @param getid Callback function to return the target ID for a name 1369 @param hdr Input file header 1370 @param itr_query Callback function returning an iterator for a numeric tid, 1371 start and end position 1372 @param readrec Callback to read a record from the input file 1373 @return An iterator on success; NULL on error 1374 1375 The iterator struct returned by a successful call should be freed 1376 via hts_itr_destroy() when it is no longer needed. 1377 */ 1378 hts_itr_t* hts_itr_querys( 1379 const(hts_idx_t)* idx, 1380 const(char)* reg, 1381 hts_name2id_f getid, 1382 void* hdr, 1383 hts_itr_query_func itr_query, 1384 hts_readrec_func readrec); 1385 1386 /// Return the next record from an iterator 1387 /** @param fp Input file handle 1388 @param iter Iterator 1389 @param r Pointer to record placeholder 1390 @param data Data passed to the readrec callback 1391 @return >= 0 on success, -1 when there is no more data, < -1 on error 1392 */ 1393 int hts_itr_next(BGZF* fp, hts_itr_t* iter, void* r, void* data); 1394 1395 1396 /********************************** 1397 * Iterator with multiple regions * 1398 **********************************/ 1399 1400 alias hts_itr_multi_query_func = int function(const(hts_idx_t)* idx, hts_itr_t* itr); 1401 int hts_itr_multi_bam(const(hts_idx_t)* idx, hts_itr_t* iter); 1402 int hts_itr_multi_cram(const(hts_idx_t)* idx, hts_itr_t* iter); 1403 1404 /// Create a multi-region iterator from a region list 1405 /** @param idx Index 1406 @param reglist Region list 1407 @param count Number of items in region list 1408 @param getid Callback to convert names to target IDs 1409 @param hdr Indexed file header (passed to getid) 1410 @param itr_specific Filetype-specific callback function 1411 @param readrec Callback to read an input file record 1412 @param seek Callback to seek in the input file 1413 @param tell Callback to return current input file location 1414 @return An iterator on success; NULL on failure 1415 1416 The iterator struct returned by a successful call should be freed 1417 via hts_itr_destroy() when it is no longer needed. 1418 */ 1419 hts_itr_t* hts_itr_regions( 1420 const(hts_idx_t)* idx, 1421 hts_reglist_t* reglist, 1422 int count, 1423 hts_name2id_f getid, 1424 void* hdr, 1425 hts_itr_multi_query_func itr_specific, 1426 hts_readrec_func readrec, 1427 hts_seek_func seek, 1428 hts_tell_func tell); 1429 1430 /// Return the next record from an iterator 1431 /** @param fp Input file handle 1432 @param iter Iterator 1433 @param r Pointer to record placeholder 1434 @return >= 0 on success, -1 when there is no more data, < -1 on error 1435 */ 1436 int hts_itr_multi_next(htsFile* fd, hts_itr_t* iter, void* r); 1437 1438 /// Create a region list from a char array 1439 /** @param argv Char array of target:interval elements, e.g. chr1:2500-3600, chr1:5100, chr2 1440 @param argc Number of items in the array 1441 @param r_count Pointer to the number of items in the resulting region list 1442 @param hdr Header for the sam/bam/cram file 1443 @param getid Callback to convert target names to target ids. 1444 @return A region list on success, NULL on failure 1445 1446 The hts_reglist_t struct returned by a successful call should be freed 1447 via hts_reglist_free() when it is no longer needed. 1448 */ 1449 hts_reglist_t* hts_reglist_create( 1450 char** argv, 1451 int argc, 1452 int* r_count, 1453 void* hdr, 1454 hts_name2id_f getid); 1455 1456 /// Free a region list 1457 /** @param reglist Region list 1458 @param count Number of items in the list 1459 */ 1460 void hts_reglist_free(hts_reglist_t* reglist, int count); 1461 1462 /// Free a multi-region iterator 1463 /** @param iter Iterator to free 1464 */ 1465 alias hts_itr_multi_destroy = hts_itr_destroy; 1466 1467 /** 1468 * hts_file_type() - Convenience function to determine file type 1469 * DEPRECATED: This function has been replaced by hts_detect_format(). 1470 * It and these FT_* macros will be removed in a future HTSlib release. 1471 */ 1472 enum FT_UNKN = 0; 1473 enum FT_GZ = 1; 1474 enum FT_VCF = 2; 1475 enum FT_VCF_GZ = FT_GZ | FT_VCF; 1476 enum FT_BCF = 1 << 2; 1477 enum FT_BCF_GZ = FT_GZ | FT_BCF; 1478 enum FT_STDIN = 1 << 3; 1479 int hts_file_type(const(char)* fname); 1480 1481 /*************************** 1482 * Revised MAQ error model * 1483 ***************************/ 1484 1485 struct errmod_t; 1486 1487 errmod_t* errmod_init(double depcorr); 1488 void errmod_destroy(errmod_t* em); 1489 1490 /* 1491 n: number of bases 1492 m: maximum base 1493 bases[i]: qual:6, strand:1, base:4 1494 q[i*m+j]: phred-scaled likelihood of (i,j) 1495 */ 1496 int errmod_cal(const(errmod_t)* em, int n, int m, ushort* bases, float* q); 1497 1498 /***************************************************** 1499 * Probabilistic banded glocal alignment * 1500 * See https://doi.org/10.1093/bioinformatics/btr076 * 1501 *****************************************************/ 1502 1503 struct probaln_par_t 1504 { 1505 float d; 1506 float e; 1507 int bw; 1508 } 1509 1510 /// Perform probabilistic banded glocal alignment 1511 /** @param ref Reference sequence 1512 @param l_ref Length of reference 1513 @param query Query sequence 1514 @param l_query Length of query sequence 1515 @param iqual Query base qualities 1516 @param c Alignment parameters 1517 @param[out] state Output alignment 1518 @param[out] q Phred scaled posterior probability of state[i] being wrong 1519 @return Phred-scaled likelihood score, or INT_MIN on failure. 1520 1521 The reference and query sequences are coded using integers 0,1,2,3,4 for 1522 bases A,C,G,T,N respectively (N here is for any ambiguity code). 1523 1524 On output, state and q are arrays of length l_query. The higher 30 1525 bits give the reference position the query base is matched to and the 1526 lower two bits can be 0 (an alignment match) or 1 (an 1527 insertion). q[i] gives the phred scaled posterior probability of 1528 state[i] being wrong. 1529 1530 On failure, errno will be set to EINVAL if the values of l_ref or l_query 1531 were invalid; or ENOMEM if a memory allocation failed. 1532 */ 1533 1534 int probaln_glocal( 1535 const(ubyte)* ref_, 1536 int l_ref, 1537 const(ubyte)* query, 1538 int l_query, 1539 const(ubyte)* iqual, 1540 const(probaln_par_t)* c, 1541 int* state, 1542 ubyte* q); 1543 1544 /********************** 1545 * MD5 implementation * 1546 **********************/ 1547 1548 struct hts_md5_context; 1549 1550 /*! @abstract Initialises an MD5 context. 1551 * @discussion 1552 * The expected use is to allocate an hts_md5_context using 1553 * hts_md5_init(). This pointer is then passed into one or more calls 1554 * of hts_md5_update() to compute successive internal portions of the 1555 * MD5 sum, which can then be externalised as a full 16-byte MD5sum 1556 * calculation by calling hts_md5_final(). This can then be turned 1557 * into ASCII via hts_md5_hex(). 1558 * 1559 * To dealloate any resources created by hts_md5_init() call the 1560 * hts_md5_destroy() function. 1561 * 1562 * @return hts_md5_context pointer on success, NULL otherwise. 1563 */ 1564 hts_md5_context* hts_md5_init(); 1565 1566 /*! @abstract Updates the context with the MD5 of the data. */ 1567 void hts_md5_update(hts_md5_context* ctx, const(void)* data, c_ulong size); 1568 1569 /*! @abstract Computes the final 128-bit MD5 hash from the given context */ 1570 void hts_md5_final(ubyte* digest, hts_md5_context* ctx); 1571 1572 /*! @abstract Resets an md5_context to the initial state, as returned 1573 * by hts_md5_init(). 1574 */ 1575 void hts_md5_reset(hts_md5_context* ctx); 1576 1577 /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated 1578 * hex string. 1579 */ 1580 void hts_md5_hex(char* hex, const(ubyte)* digest); 1581 1582 /*! @abstract Deallocates any memory allocated by hts_md5_init. */ 1583 void hts_md5_destroy(hts_md5_context* ctx); 1584 1585 pragma(inline,true) 1586 long hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls) 1587 { 1588 int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; 1589 for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) 1590 if (beg>>s == end>>s) return t + (beg>>s); 1591 return 0; 1592 } 1593 1594 /// Compute the level of a bin in a binning index 1595 pragma(inline, true) 1596 int hts_bin_level(int bin) { 1597 int l, b; 1598 for (l = 0, b = bin; b; ++l){ b = hts_bin_parent(b);} 1599 return l; 1600 } 1601 1602 //! Compute the corresponding entry into the linear index of a given bin from 1603 //! a binning index 1604 /*! 1605 * @param bin The bin number 1606 * @param n_lvls The index depth (number of levels - 0 based) 1607 * @return The integer offset into the linear index 1608 * 1609 * Explanation of the return value formula: 1610 * Each bin on level l covers exp(2, (n_lvls - l)*3 + min_shift) base pairs. 1611 * A linear index entry covers exp(2, min_shift) base pairs. 1612 */ 1613 1614 // compute the level of bin 1615 pragma(inline, true) 1616 int hts_bin_bot(int bin, int n_lvls) 1617 { 1618 int l = hts_bin_level(bin); 1619 return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; 1620 } 1621 1622 /************** 1623 * Endianness * 1624 **************/ 1625 pragma(inline, true) 1626 int ed_is_big() 1627 { 1628 long one= 1; 1629 return !(*(cast(char *)(&one))); 1630 } 1631 pragma(inline, true) 1632 ushort ed_swap_2(ushort v) 1633 { 1634 return cast(ushort)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); 1635 } 1636 pragma(inline, true) 1637 void *ed_swap_2p(void *x) 1638 { 1639 *cast(ushort*)x = ed_swap_2(*cast(ushort*)x); 1640 return x; 1641 } 1642 pragma(inline, true) 1643 uint ed_swap_4(uint v) 1644 { 1645 v = ((v & 0x0000FFFFU) << 16) | (v >> 16); 1646 return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); 1647 } 1648 pragma(inline, true) 1649 void *ed_swap_4p(void *x) 1650 { 1651 *cast(uint*)x = ed_swap_4(*cast(uint*)x); 1652 return x; 1653 } 1654 pragma(inline, true) 1655 ulong ed_swap_8(ulong v) 1656 { 1657 v = ((v & 0x00000000FFFFFFFFLU) << 32) | (v >> 32); 1658 v = ((v & 0x0000FFFF0000FFFFLU) << 16) | ((v & 0xFFFF0000FFFF0000LU) >> 16); 1659 return ((v & 0x00FF00FF00FF00FFLU) << 8) | ((v & 0xFF00FF00FF00FF00LU) >> 8); 1660 } 1661 pragma(inline, true) 1662 void *ed_swap_8p(void *x) 1663 { 1664 *cast(ulong*)x = ed_swap_8(*cast(ulong*)x); 1665 return x; 1666 } 1667