1 /// @file htslib/sam.h 2 /// High-level SAM/BAM/CRAM sequence file operations. 3 /* 4 Copyright (C) 2008, 2009, 2013-2022 Genome Research Ltd. 5 Copyright (C) 2010, 2012, 2013 Broad Institute. 6 7 Author: Heng Li <lh3@sanger.ac.uk> 8 9 Permission is hereby granted, free of charge, to any person obtaining a copy 10 of this software and associated documentation files (the "Software"), to deal 11 in the Software without restriction, including without limitation the rights 12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 copies of the Software, and to permit persons to whom the Software is 14 furnished to do so, subject to the following conditions: 15 16 The above copyright notice and this permission notice shall be included in 17 all copies or substantial portions of the Software. 18 19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 DEALINGS IN THE SOFTWARE. */ 26 27 module htslib.sam; 28 29 import core.stdc.stddef; 30 import core.stdc.stdlib; 31 import std.format: format; 32 33 import htslib.hts; 34 import htslib.hts_log; 35 import htslib.bgzf : BGZF; 36 import htslib.kstring; 37 import htslib.hts_endian; 38 import core.stdc.errno : EINVAL, ENOMEM, errno; 39 40 @system: 41 extern (C): 42 @nogc nothrow { 43 44 /// Highest SAM format version supported by this library 45 enum SAM_FORMAT_VERSION = "1.6"; 46 47 /*************************** 48 *** SAM/BAM/CRAM header *** 49 ***************************/ 50 51 /*! @typedef 52 * @abstract Header extension structure, grouping a collection 53 * of hash tables that contain the parsed header data. 54 */ 55 56 struct sam_hrecs_t; 57 58 /*! @typedef 59 @abstract Structure for the alignment header. 60 @field n_targets number of reference sequences 61 @field l_text length of the plain text in the header (may be zero if 62 the header has been edited) 63 @field target_len lengths of the reference sequences 64 @field target_name names of the reference sequences 65 @field text plain text (may be NULL if the header has been edited) 66 @field sdict header dictionary 67 @field hrecs pointer to the extended header struct (internal use only) 68 @field ref_count reference count 69 70 @note The text and l_text fields are included for backwards compatibility. 71 These fields may be set to NULL and zero respectively as a side-effect 72 of calling some header API functions. New code that needs to access the 73 header text should use the sam_hdr_str() and sam_hdr_length() functions 74 instead of these fields. 75 */ 76 77 struct sam_hdr_t 78 { 79 int n_targets; 80 int ignore_sam_err; 81 size_t l_text; 82 uint* target_len; 83 const(byte)* cigar_tab; 84 char** target_name; 85 char* text; 86 void* sdict; 87 sam_hrecs_t* hrecs; 88 uint ref_count; 89 } 90 91 /*! @typedef 92 * @abstract Old name for compatibility with existing code. 93 */ 94 alias bam_hdr_t = sam_hdr_t; 95 96 /**************************** 97 *** CIGAR related macros *** 98 ****************************/ 99 100 enum BAM_CMATCH = 0; 101 enum BAM_CINS = 1; 102 enum BAM_CDEL = 2; 103 enum BAM_CREF_SKIP = 3; 104 enum BAM_CSOFT_CLIP = 4; 105 enum BAM_CHARD_CLIP = 5; 106 enum BAM_CPAD = 6; 107 enum BAM_CEQUAL = 7; 108 enum BAM_CDIFF = 8; 109 enum BAM_CBACK = 9; 110 111 enum BAM_CIGAR_STR = "MIDNSHP=XB"; 112 enum BAM_CIGAR_SHIFT = 4; 113 enum BAM_CIGAR_MASK = 0xf; 114 enum BAM_CIGAR_TYPE = 0x3C1A7; 115 116 /*! @abstract Table for converting a CIGAR operator character to BAM_CMATCH etc. 117 Result is operator code or -1. Be sure to cast the index if it is a plain char: 118 int op = bam_cigar_table[(unsigned char) ch]; 119 */ 120 extern __gshared const(byte)[256] bam_cigar_table; 121 122 extern (D) auto bam_cigar_op(T)(auto ref T c) 123 { 124 return c & BAM_CIGAR_MASK; 125 } 126 127 extern (D) auto bam_cigar_oplen(T)(auto ref T c) 128 { 129 return c >> BAM_CIGAR_SHIFT; 130 } 131 132 // Note that BAM_CIGAR_STR is padded to length 16 bytes below so that 133 // the array look-up will not fall off the end. '?' is chosen as the 134 // padding character so it's easy to spot if one is emitted, and will 135 // result in a parsing failure (in sam_parse1(), at least) if read. 136 extern (D) auto bam_cigar_gen(T0, T1)(auto ref T0 l, auto ref T1 o) 137 { 138 return l << BAM_CIGAR_SHIFT | o; 139 } 140 141 /* bam_cigar_type returns a bit flag with: 142 * bit 1 set if the cigar operation consumes the query 143 * bit 2 set if the cigar operation consumes the reference 144 * 145 * For reference, the unobfuscated truth table for this function is: 146 * BAM_CIGAR_TYPE QUERY REFERENCE 147 * -------------------------------- 148 * BAM_CMATCH 1 1 149 * BAM_CINS 1 0 150 * BAM_CDEL 0 1 151 * BAM_CREF_SKIP 0 1 152 * BAM_CSOFT_CLIP 1 0 153 * BAM_CHARD_CLIP 0 0 154 * BAM_CPAD 0 0 155 * BAM_CEQUAL 1 1 156 * BAM_CDIFF 1 1 157 * BAM_CBACK 0 0 158 * -------------------------------- 159 */ 160 extern (D) auto bam_cigar_type(T)(auto ref T o) 161 { 162 return BAM_CIGAR_TYPE >> (o << 1) & 3; 163 } // bit 1: consume query; bit 2: consume reference 164 165 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ 166 enum BAM_FPAIRED = 1; 167 /*! @abstract the read is mapped in a proper pair */ 168 enum BAM_FPROPER_PAIR = 2; 169 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ 170 enum BAM_FUNMAP = 4; 171 /*! @abstract the mate is unmapped */ 172 enum BAM_FMUNMAP = 8; 173 /*! @abstract the read is mapped to the reverse strand */ 174 enum BAM_FREVERSE = 16; 175 /*! @abstract the mate is mapped to the reverse strand */ 176 enum BAM_FMREVERSE = 32; 177 /*! @abstract this is read1 */ 178 enum BAM_FREAD1 = 64; 179 /*! @abstract this is read2 */ 180 enum BAM_FREAD2 = 128; 181 /*! @abstract not primary alignment */ 182 enum BAM_FSECONDARY = 256; 183 /*! @abstract QC failure */ 184 enum BAM_FQCFAIL = 512; 185 /*! @abstract optical or PCR duplicate */ 186 enum BAM_FDUP = 1024; 187 /*! @abstract supplementary alignment */ 188 enum BAM_FSUPPLEMENTARY = 2048; 189 190 /************************* 191 *** Alignment records *** 192 *************************/ 193 194 /* 195 * Assumptions made here. While pos can be 64-bit, no sequence 196 * itself is that long, but due to ref skip CIGAR fields it 197 * may span more than that. (CIGAR itself is 28-bit len + 4 bit 198 * type, but in theory we can combine multiples together.) 199 * 200 * Mate position and insert size also need to be 64-bit, but 201 * we won't accept more than 32-bit for tid. 202 * 203 * The bam_core_t structure is the *in memory* layout and not 204 * the same as the on-disk format. 64-bit changes here permit 205 * SAM to work with very long chromosomes and permit BAM and CRAM 206 * to seamlessly update in the future without further API/ABI 207 * revisions. 208 */ 209 210 /*! @typedef 211 @abstract Structure for core alignment information. 212 @field pos 0-based leftmost coordinate 213 @field tid chromosome ID, defined by sam_hdr_t 214 @field bin bin calculated by bam_reg2bin() 215 @field qual mapping quality 216 @field l_extranul length of extra NULs between qname & cigar (for alignment) 217 @field flag bitwise flag 218 @field l_qname length of the query name 219 @field n_cigar number of CIGAR operations 220 @field l_qseq length of the query sequence (read) 221 @field mtid chromosome ID of next read in template, defined by sam_hdr_t 222 @field mpos 0-based leftmost coordinate of next read in template 223 @field isize observed template length ("insert size") 224 */ 225 struct bam1_core_t 226 { 227 hts_pos_t pos; 228 int tid; 229 ushort bin; // NB: invalid on 64-bit pos 230 ubyte qual; 231 ubyte l_extranul; 232 ushort flag; 233 ushort l_qname; 234 uint n_cigar; 235 int l_qseq; 236 int mtid; 237 hts_pos_t mpos; 238 hts_pos_t isize; 239 } 240 241 /*! @typedef 242 @abstract Structure for one alignment. 243 @field core core information about the alignment 244 @field id 245 @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux 246 @field l_data current length of bam1_t::data 247 @field m_data maximum length of bam1_t::data 248 @field mempolicy memory handling policy, see bam_set_mempolicy() 249 250 @discussion Notes: 251 252 1. The data blob should be accessed using bam_get_qname, bam_get_cigar, 253 bam_get_seq, bam_get_qual and bam_get_aux macros. These returns pointers 254 to the start of each type of data. 255 2. qname is terminated by one to four NULs, so that the following 256 cigar data is 32-bit aligned; core.l_qname includes these trailing NULs, 257 while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3). 258 3. Cigar data is encoded 4 bytes per CIGAR operation. 259 See the bam_cigar_* macros for manipulation. 260 4. seq is nibble-encoded according to bam_nt16_table. 261 See the bam_seqi macro for retrieving individual bases. 262 5. Per base qualities are stored in the Phred scale with no +33 offset. 263 Ie as per the BAM specification and not the SAM ASCII printable method. 264 */ 265 struct bam1_t 266 { 267 import std.bitmanip : bitfields; 268 269 bam1_core_t core; 270 ulong id; 271 ubyte* data; 272 int l_data; 273 uint m_data; 274 275 mixin(bitfields!( 276 uint, "mempolicy", 2, 277 uint, "", 30)); 278 279 /* Reserved */ 280 } 281 282 /*! @function 283 @abstract Get whether the query is on the reverse strand 284 @param b pointer to an alignment 285 @return boolean true if query is on the reverse strand 286 */ 287 extern (D) auto bam_is_rev(T)(auto ref T b) 288 { 289 return (b.core.flag & BAM_FREVERSE) != 0; 290 } 291 292 /*! @function 293 @abstract Get whether the query's mate is on the reverse strand 294 @param b pointer to an alignment 295 @return boolean true if query's mate on the reverse strand 296 */ 297 extern (D) auto bam_is_mrev(T)(auto ref T b) 298 { 299 return (b.core.flag & BAM_FMREVERSE) != 0; 300 } 301 302 /*! @function 303 @abstract Get the name of the query 304 @param b pointer to an alignment 305 @return pointer to the name string, null terminated 306 */ 307 extern (D) auto bam_get_qname(T)(auto ref T b) 308 { 309 return cast(char*) b.data; 310 } 311 312 /*! @function 313 @abstract Get the CIGAR array 314 @param b pointer to an alignment 315 @return pointer to the CIGAR array 316 317 @discussion In the CIGAR array, each element is a 32-bit integer. The 318 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the 319 length of a CIGAR. 320 */ 321 extern (D) auto bam_get_cigar(bam1_t * b) 322 { 323 return cast(uint*) ((*b).data + (*b).core.l_qname); 324 } 325 326 /*! @function 327 @abstract Get query sequence 328 @param b pointer to an alignment 329 @return pointer to sequence 330 331 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 332 8 for T and 15 for N. Two bases are packed in one byte with the base 333 at the higher 4 bits having smaller coordinate on the read. It is 334 recommended to use bam_seqi() macro to get the base. 335 */ 336 extern (D) auto bam_get_seq(T)(auto ref T b) 337 { 338 return b.data + (b.core.n_cigar << 2) + b.core.l_qname; 339 } 340 341 /*! @function 342 @abstract Get query quality 343 @param b pointer to an alignment 344 @return pointer to quality string 345 */ 346 extern (D) auto bam_get_qual(T)(auto ref T b) 347 { 348 return b.data + (b.core.n_cigar << 2) + b.core.l_qname + ((b.core.l_qseq + 1) >> 1); 349 } 350 351 /*! @function 352 @abstract Get auxiliary data 353 @param b pointer to an alignment 354 @return pointer to the concatenated auxiliary data 355 */ 356 extern (D) auto bam_get_aux(T)(auto ref T b) 357 { 358 return b.data + (b.core.n_cigar << 2) + b.core.l_qname + ((b.core.l_qseq + 1) >> 1) + b.core.l_qseq; 359 } 360 361 /*! @function 362 @abstract Get length of auxiliary data 363 @param b pointer to an alignment 364 @return length of the concatenated auxiliary data 365 */ 366 extern (D) auto bam_get_l_aux(T)(auto ref T b) 367 { 368 return b.l_data - (b.core.n_cigar << 2) - b.core.l_qname - b.core.l_qseq - ((b.core.l_qseq + 1) >> 1); 369 } 370 371 /*! @function 372 @abstract Get a base on read 373 @param s Query sequence returned by bam_get_seq() 374 @param i The i-th position, 0-based 375 @return 4-bit integer representing the base. 376 */ 377 extern (D) auto bam_seqi(T0, T1)(auto ref T0 s, auto ref T1 i) 378 { 379 return s[i >> 1] >> ((~i & 1) << 2) & 0xf; 380 } 381 382 /*! 383 @abstract Modifies a single base in the bam structure. 384 @param s Query sequence returned by bam_get_seq() 385 @param i The i-th position, 0-based 386 @param b Base in nt16 nomenclature (see seq_nt16_table) 387 */ 388 389 extern (D) void bam_set_seqi(T0, T1, T3)(auto ref T0 s, auto ref T1 i, auto ref T3 b) 390 { 391 s[i >> 1] = (s[i >> 1] & (0xf0 >> ((~i & 1) << 2))) | cast(ubyte)(b << ((~i & 1) << 2)); 392 } 393 394 /************************** 395 *** Exported functions *** 396 **************************/ 397 398 /*************** 399 *** BAM I/O *** 400 ***************/ 401 402 /* Header */ 403 404 /// Generates a new unpopulated header structure. 405 /*! 406 * 407 * @return A valid pointer to new header on success, NULL on failure 408 * 409 * The sam_hdr_t struct returned by a successful call should be freed 410 * via sam_hdr_destroy() when it is no longer needed. 411 */ 412 sam_hdr_t* sam_hdr_init(); 413 414 /// Read the header from a BAM compressed file. 415 /*! 416 * @param fp File pointer 417 * @return A valid pointer to new header on success, NULL on failure 418 * 419 * This function only works with BAM files. It is usually better to use 420 * sam_hdr_read(), which works on SAM, BAM and CRAM files. 421 * 422 * The sam_hdr_t struct returned by a successful call should be freed 423 * via sam_hdr_destroy() when it is no longer needed. 424 */ 425 sam_hdr_t* bam_hdr_read(BGZF* fp); 426 427 /// Writes the header to a BAM file. 428 /*! 429 * @param fp File pointer 430 * @param h Header pointer 431 * @return 0 on success, -1 on failure 432 * 433 * This function only works with BAM files. Use sam_hdr_write() to 434 * write in any of the SAM, BAM or CRAM formats. 435 */ 436 int bam_hdr_write(BGZF* fp, const(sam_hdr_t)* h); 437 438 /*! 439 * Frees the resources associated with a header. 440 */ 441 void sam_hdr_destroy(sam_hdr_t* h); 442 443 /// Duplicate a header structure. 444 /*! 445 * @return A valid pointer to new header on success, NULL on failure 446 * 447 * The sam_hdr_t struct returned by a successful call should be freed 448 * via sam_hdr_destroy() when it is no longer needed. 449 */ 450 sam_hdr_t* sam_hdr_dup(const(sam_hdr_t)* h0); 451 452 /*! 453 * @abstract Old names for compatibility with existing code. 454 */ 455 pragma(inline,true) 456 sam_hdr_t* bam_hdr_init() { return sam_hdr_init(); } 457 458 pragma(inline,true) 459 void bam_hdr_destroy(sam_hdr_t* h) { sam_hdr_destroy(h); } 460 461 pragma(inline,true) 462 sam_hdr_t* bam_hdr_dup(const(sam_hdr_t)* h0) { return sam_hdr_dup(h0); } 463 464 alias samFile = htsFile; 465 466 /// Create a header from existing text. 467 /*! 468 * @param l_text Length of text 469 * @param text Header text 470 * @return A populated sam_hdr_t structure on success; NULL on failure. 471 * @note The text field of the returned header will be NULL, and the l_text 472 * field will be zero. 473 * 474 * The sam_hdr_t struct returned by a successful call should be freed 475 * via sam_hdr_destroy() when it is no longer needed. 476 */ 477 sam_hdr_t* sam_hdr_parse(size_t l_text, const(char)* text); 478 479 /// Read a header from a SAM, BAM or CRAM file. 480 /*! 481 * @param fp Pointer to a SAM, BAM or CRAM file handle 482 * @return A populated sam_hdr_t struct on success; NULL on failure. 483 * 484 * The sam_hdr_t struct returned by a successful call should be freed 485 * via sam_hdr_destroy() when it is no longer needed. 486 */ 487 sam_hdr_t* sam_hdr_read(samFile* fp); 488 489 /// Write a header to a SAM, BAM or CRAM file. 490 /*! 491 * @param fp SAM, BAM or CRAM file header 492 * @param h Header structure to write 493 * @return 0 on success; -1 on failure 494 */ 495 int sam_hdr_write(samFile* fp, const(sam_hdr_t)* h); 496 497 /// Returns the current length of the header text. 498 /*! 499 * @return >= 0 on success, SIZE_MAX on failure 500 */ 501 size_t sam_hdr_length(sam_hdr_t* h); 502 503 /// Returns the text representation of the header. 504 /*! 505 * @return valid char pointer on success, NULL on failure 506 * 507 * The returned string is part of the header structure. It will remain 508 * valid until a call to a header API function causes the string to be 509 * invalidated, or the header is destroyed. 510 * 511 * The caller should not attempt to free or realloc this pointer. 512 */ 513 const(char)* sam_hdr_str(sam_hdr_t* h); 514 515 /// Returns the number of references in the header. 516 /*! 517 * @return >= 0 on success, -1 on failure 518 */ 519 int sam_hdr_nref(const(sam_hdr_t)* h); 520 521 /* ==== Line level methods ==== */ 522 523 /// Add formatted lines to an existing header. 524 /*! 525 * @param lines Full SAM header record, eg "@SQ\tSN:foo\tLN:100", with 526 * optional new-line. If it contains more than 1 line then 527 * multiple lines will be added in order 528 * @param len The maximum length of lines (if an early NUL is not 529 * encountered). len may be 0 if unknown, in which case 530 * lines must be NUL-terminated 531 * @return 0 on success, -1 on failure 532 * 533 * The lines will be appended to the end of the existing header 534 * (apart from HD, which always comes first). 535 */ 536 int sam_hdr_add_lines(sam_hdr_t* h, const(char)* lines, size_t len); 537 538 /// Adds a single line to an existing header. 539 /*! 540 * Specify type and one or more key,value pairs, ending with the NULL key. 541 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL). 542 * 543 * @param type Type of the added line. Eg. "SQ" 544 * @return 0 on success, -1 on failure 545 * 546 * The new line will be added immediately after any others of the same 547 * type, or at the end of the existing header if no lines of the 548 * given type currently exist. The exception is HD lines, which always 549 * come first. If an HD line already exists, it will be replaced. 550 */ 551 int sam_hdr_add_line(sam_hdr_t* h, const(char)* type, ...); 552 553 /// Returns a complete line of formatted text for a given type and ID. 554 /*! 555 * @param type Type of the searched line. Eg. "SQ" 556 * @param ID_key Tag key defining the line. Eg. "SN" 557 * @param ID_value Tag value associated with the key above. Eg. "ref1" 558 * @param ks kstring to hold the result 559 * @return 0 on success; 560 * -1 if no matching line is found 561 * -2 on other failures 562 * 563 * Puts a complete line of formatted text for a specific header type/ID 564 * combination into @p ks. If ID_key is NULL then it returns the first line of 565 * the specified type. 566 * 567 * Any existing content in @p ks will be overwritten. 568 */ 569 int sam_hdr_find_line_id( 570 sam_hdr_t* h, 571 const(char)* type, 572 const(char)* ID_key, 573 const(char)* ID_val, 574 kstring_t* ks); 575 576 /// Returns a complete line of formatted text for a given type and index. 577 /*! 578 * @param type Type of the searched line. Eg. "SQ" 579 * @param position Index in lines of this type (zero-based) 580 * @param ks kstring to hold the result 581 * @return 0 on success; 582 * -1 if no matching line is found 583 * -2 on other failures 584 * 585 * Puts a complete line of formatted text for a specific line into @p ks. 586 * The header line is selected using the @p type and @p position parameters. 587 * 588 * Any existing content in @p ks will be overwritten. 589 */ 590 int sam_hdr_find_line_pos( 591 sam_hdr_t* h, 592 const(char)* type, 593 int pos, 594 kstring_t* ks); 595 596 /// Remove a line with given type / id from a header 597 /*! 598 * @param type Type of the searched line. Eg. "SQ" 599 * @param ID_key Tag key defining the line. Eg. "SN" 600 * @param ID_value Tag value associated with the key above. Eg. "ref1" 601 * @return 0 on success, -1 on error 602 * 603 * Remove a line from the header by specifying a tag:value that uniquely 604 * identifies the line, i.e. the @SQ line containing "SN:ref1". 605 * 606 * \@SQ line is uniquely identified by the SN tag. 607 * \@RG line is uniquely identified by the ID tag. 608 * \@PG line is uniquely identified by the ID tag. 609 * Eg. sam_hdr_remove_line_id(h, "SQ", "SN", "ref1") 610 * 611 * If no key:value pair is specified, the type MUST be followed by a NULL argument and 612 * the first line of the type will be removed, if any. 613 * Eg. sam_hdr_remove_line_id(h, "SQ", NULL, NULL) 614 * 615 * @note Removing \@PG lines is currently unsupported. 616 */ 617 int sam_hdr_remove_line_id( 618 sam_hdr_t* h, 619 const(char)* type, 620 const(char)* ID_key, 621 const(char)* ID_value); 622 623 /// Remove nth line of a given type from a header 624 /*! 625 * @param type Type of the searched line. Eg. "SQ" 626 * @param position Index in lines of this type (zero-based). E.g. 3 627 * @return 0 on success, -1 on error 628 * 629 * Remove a line from the header by specifying the position in the type 630 * group, i.e. 3rd @SQ line. 631 */ 632 int sam_hdr_remove_line_pos(sam_hdr_t* h, const(char)* type, int position); 633 634 /// Add or update tag key,value pairs in a header line. 635 /*! 636 * @param type Type of the searched line. Eg. "SQ" 637 * @param ID_key Tag key defining the line. Eg. "SN" 638 * @param ID_value Tag value associated with the key above. Eg. "ref1" 639 * @return 0 on success, -1 on error 640 * 641 * Adds or updates tag key,value pairs in a header line. 642 * Eg. for adding M5 tags to @SQ lines or updating sort order for the 643 * @HD line. 644 * 645 * Specify multiple key,value pairs ending in NULL. Eg. 646 * sam_hdr_update_line(h, "RG", "ID", "rg1", "DS", "description", "PG", "samtools", NULL) 647 * 648 * Attempting to update the record name (i.e. @SQ SN or @RG ID) will 649 * work as long as the new name is not already in use, however doing this 650 * on a file opened for reading may produce unexpected results. 651 * 652 * Renaming an @RG record in this way will only change the header. Alignment 653 * records written later will not be updated automatically even if they 654 * reference the old read group name. 655 * 656 * Attempting to change an @PG ID tag is not permitted. 657 */ 658 int sam_hdr_update_line( 659 sam_hdr_t* h, 660 const(char)* type, 661 const(char)* ID_key, 662 const(char)* ID_value, 663 ...); 664 665 /// Remove all lines of a given type from a header, except the one matching an ID 666 /*! 667 * @param type Type of the searched line. Eg. "SQ" 668 * @param ID_key Tag key defining the line. Eg. "SN" 669 * @param ID_value Tag value associated with the key above. Eg. "ref1" 670 * @return 0 on success, -1 on failure 671 * 672 * Remove all lines of type <type> from the header, except the one 673 * specified by tag:value, i.e. the @SQ line containing "SN:ref1". 674 * 675 * If no line matches the key:value ID, all lines of the given type are removed. 676 * To remove all lines of a given type, use NULL for both ID_key and ID_value. 677 */ 678 int sam_hdr_remove_except( 679 sam_hdr_t* h, 680 const(char)* type, 681 const(char)* ID_key, 682 const(char)* ID_value); 683 684 /// Remove header lines of a given type, except those in a given ID set 685 /*! 686 * @param type Type of the searched line. Eg. "RG" 687 * @param id Tag key defining the line. Eg. "ID" 688 * @param rh Hash set initialised by the caller with the values to be kept. 689 * See description for how to create this. If @p rh is NULL, all 690 * lines of this type will be removed. 691 * @return 0 on success, -1 on failure 692 * 693 * Remove all lines of type @p type from the header, except the ones 694 * specified in the hash set @p rh. If @p rh is NULL, all lines of 695 * this type will be removed. 696 * Declaration of @p rh is done using KHASH_SET_INIT_STR macro. Eg. 697 * @code{.c} 698 * #include "htslib/khash.h" 699 * KHASH_SET_INIT_STR(keep) 700 * typedef khash_t(keep) *keephash_t; 701 * 702 * void your_method() { 703 * samFile *sf = sam_open("alignment.bam", "r"); 704 * sam_hdr_t *h = sam_hdr_read(sf); 705 * keephash_t rh = kh_init(keep); 706 * int ret = 0; 707 * kh_put(keep, rh, strdup("chr2"), &ret); 708 * kh_put(keep, rh, strdup("chr3"), &ret); 709 * if (sam_hdr_remove_lines(h, "SQ", "SN", rh) == -1) 710 * fprintf(stderr, "Error removing lines\n"); 711 * khint_t k; 712 * for (k = 0; k < kh_end(rh); ++k) 713 * if (kh_exist(rh, k)) free((char*)kh_key(rh, k)); 714 * kh_destroy(keep, rh); 715 * sam_hdr_destroy(h); 716 * sam_close(sf); 717 * } 718 * @endcode 719 * 720 */ 721 int sam_hdr_remove_lines( 722 sam_hdr_t* h, 723 const(char)* type, 724 const(char)* id, 725 void* rh); 726 727 /// Count the number of lines for a given header type 728 /*! 729 * @param h BAM header 730 * @param type Header type to count. Eg. "RG" 731 * @return Number of lines of this type on success; -1 on failure 732 */ 733 int sam_hdr_count_lines(sam_hdr_t* h, const(char)* type); 734 735 /// Index of the line for the types that have dedicated look-up tables (SQ, RG, PG) 736 /*! 737 * @param h BAM header 738 * @param type Type of the searched line. Eg. "RG" 739 * @param key The value of the identifying key. Eg. "rg1" 740 * @return 0-based index on success; -1 if line does not exist; -2 on failure 741 */ 742 int sam_hdr_line_index(sam_hdr_t* bh, const(char)* type, const(char)* key); 743 744 /// Id key of the line for the types that have dedicated look-up tables (SQ, RG, PG) 745 /*! 746 * @param h BAM header 747 * @param type Type of the searched line. Eg. "RG" 748 * @param pos Zero-based index inside the type group. Eg. 2 (for the third RG line) 749 * @return Valid key string on success; NULL on failure 750 */ 751 const(char)* sam_hdr_line_name(sam_hdr_t* bh, const(char)* type, int pos); 752 753 /* ==== Key:val level methods ==== */ 754 755 /// Return the value associated with a key for a header line identified by ID_key:ID_val 756 /*! 757 * @param type Type of the line to which the tag belongs. Eg. "SQ" 758 * @param ID_key Tag key defining the line. Eg. "SN". Can be NULL, if looking for the first line. 759 * @param ID_value Tag value associated with the key above. Eg. "ref1". Can be NULL, if ID_key is NULL. 760 * @param key Key of the searched tag. Eg. "LN" 761 * @param ks kstring where the value will be written 762 * @return 0 on success 763 * -1 if the requested tag does not exist 764 * -2 on other errors 765 * 766 * Looks for a specific key in a single SAM header line and writes the 767 * associated value into @p ks. The header line is selected using the ID_key 768 * and ID_value parameters. Any pre-existing content in @p ks will be 769 * overwritten. 770 */ 771 int sam_hdr_find_tag_id( 772 sam_hdr_t* h, 773 const(char)* type, 774 const(char)* ID_key, 775 const(char)* ID_value, 776 const(char)* key, 777 kstring_t* ks); 778 779 /// Return the value associated with a key for a header line identified by position 780 /*! 781 * @param type Type of the line to which the tag belongs. Eg. "SQ" 782 * @param position Index in lines of this type (zero-based). E.g. 3 783 * @param key Key of the searched tag. Eg. "LN" 784 * @param ks kstring where the value will be written 785 * @return 0 on success 786 * -1 if the requested tag does not exist 787 * -2 on other errors 788 * 789 * Looks for a specific key in a single SAM header line and writes the 790 * associated value into @p ks. The header line is selected using the @p type 791 * and @p position parameters. Any pre-existing content in @p ks will be 792 * overwritten. 793 */ 794 int sam_hdr_find_tag_pos( 795 sam_hdr_t* h, 796 const(char)* type, 797 int pos, 798 const(char)* key, 799 kstring_t* ks); 800 801 /// Remove the key from the line identified by type, ID_key and ID_value. 802 /*! 803 * @param type Type of the line to which the tag belongs. Eg. "SQ" 804 * @param ID_key Tag key defining the line. Eg. "SN" 805 * @param ID_value Tag value associated with the key above. Eg. "ref1" 806 * @param key Key of the targeted tag. Eg. "M5" 807 * @return 1 if the key was removed; 0 if it was not present; -1 on error 808 */ 809 int sam_hdr_remove_tag_id( 810 sam_hdr_t* h, 811 const(char)* type, 812 const(char)* ID_key, 813 const(char)* ID_value, 814 const(char)* key); 815 816 /// Get the target id for a given reference sequence name 817 /*! 818 * @param ref Reference name 819 * @return Positive value on success, 820 * -1 if unknown reference, 821 * -2 if the header could not be parsed 822 * 823 * Looks up a reference sequence by name in the reference hash table 824 * and returns the numerical target id. 825 */ 826 int sam_hdr_name2tid(sam_hdr_t* h, const(char)* ref_); 827 828 /// Get the reference sequence name from a target index 829 /*! 830 * @param tid Target index 831 * @return Valid reference name on success, NULL on failure 832 * 833 * Fetch the reference sequence name from the target name array, 834 * using the numerical target id. 835 */ 836 const(char)* sam_hdr_tid2name(const(sam_hdr_t)* h, int tid); 837 838 /// Get the reference sequence length from a target index 839 /*! 840 * @param tid Target index 841 * @return Strictly positive value on success, 0 on failure 842 * 843 * Fetch the reference sequence length from the target length array, 844 * using the numerical target id. 845 */ 846 hts_pos_t sam_hdr_tid2len(const(sam_hdr_t)* h, int tid); 847 848 /// Alias of sam_hdr_name2tid(), for backwards compatibility. 849 /*! 850 * @param ref Reference name 851 * @return Positive value on success, 852 * -1 if unknown reference, 853 * -2 if the header could not be parsed 854 */ 855 pragma(inline,true) 856 int bam_name2id(sam_hdr_t* h, const(char)* ref_) { return sam_hdr_name2tid(h, ref_); } 857 858 /// Generate a unique \@PG ID: value 859 /*! 860 * @param name Name of the program. Eg. samtools 861 * @return Valid ID on success, NULL on failure 862 * 863 * Returns a unique ID from a base name. The string returned will remain 864 * valid until the next call to this function, or the header is destroyed. 865 * The caller should not attempt to free() or realloc() it. 866 */ 867 const(char)* sam_hdr_pg_id(sam_hdr_t* h, const(char)* name); 868 869 /// Add an \@PG line. 870 /*! 871 * @param name Name of the program. Eg. samtools 872 * @return 0 on success, -1 on failure 873 * 874 * If we wish complete control over this use sam_hdr_add_line() directly. This 875 * function uses that, but attempts to do a lot of tedious house work for 876 * you too. 877 * 878 * - It will generate a suitable ID if the supplied one clashes. 879 * - It will generate multiple \@PG records if we have multiple PG chains. 880 * 881 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending 882 * in NULL. 883 */ 884 int sam_hdr_add_pg(sam_hdr_t* h, const(char)* name, ...); 885 886 /*! 887 * A function to help with construction of CL tags in @PG records. 888 * Takes an argc, argv pair and returns a single space-separated string. 889 * This string should be deallocated by the calling function. 890 * 891 * @return 892 * Returns malloced char * on success; 893 * NULL on failure 894 */ 895 char* stringify_argv(int argc, char** argv); 896 897 /// Increments the reference count on a header 898 /*! 899 * This permits multiple files to share the same header, all calling 900 * sam_hdr_destroy when done, without causing errors for other open files. 901 */ 902 void sam_hdr_incr_ref(sam_hdr_t* h); 903 904 /* 905 * Macros for changing the \@HD line. They eliminate the need to use NULL method arguments. 906 */ 907 908 /// Returns the SAM formatted text of the \@HD header line 909 extern (D) auto sam_hdr_find_hd(T0, T1)(auto ref T0 h, auto ref T1 ks) 910 { 911 return sam_hdr_find_line_id(h, "HD", NULL, NULL, ks); 912 } 913 914 /// Returns the value associated with a given \@HD line tag 915 extern (D) auto sam_hdr_find_tag_hd(T0, T1, T2)(auto ref T0 h, auto ref T1 key, auto ref T2 ks) 916 { 917 return sam_hdr_find_tag_id(h, "HD", NULL, NULL, key, ks); 918 } 919 920 /// Adds or updates tags on the header \@HD line 921 extern (D) auto sam_hdr_update_hd(T, A...)(auto ref T h, auto ref A a) 922 { 923 // NOTE: This macro was dropped by dstep due to variadic args 924 static assert (a.length %2 == 0); // K-V pairs => even number of variadic args 925 return sam_hdr_update_line(h, "HD", null, null, a, null); 926 } 927 928 /// Removes the \@HD line tag with the given key 929 extern (D) auto sam_hdr_remove_tag_hd(T0, T1)(auto ref T0 h, auto ref T1 key) 930 { 931 return sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key); 932 } 933 934 /* Alignment */ 935 936 /// Create a new bam1_t alignment structure 937 /** 938 @return An empty bam1_t structure on success, NULL on failure 939 940 The bam1_t struct returned by a successful call should be freed 941 via bam_destroy1() when it is no longer needed. 942 */ 943 bam1_t* bam_init1(); 944 945 /// Destroy a bam1_t structure 946 /** 947 @param b structure to destroy 948 949 Does nothing if @p b is NULL. If not, all memory associated with @p b 950 will be freed, along with the structure itself. @p b should not be 951 accessed after calling this function. 952 */ 953 void bam_destroy1(bam1_t* b); 954 955 enum BAM_USER_OWNS_STRUCT = 1; 956 enum BAM_USER_OWNS_DATA = 2; 957 958 /// Set alignment record memory policy 959 /** 960 @param b Alignment record 961 @param policy Desired policy 962 963 Allows the way HTSlib reallocates and frees bam1_t data to be 964 changed. @policy can be set to the bitwise-or of the following 965 values: 966 967 \li \c BAM_USER_OWNS_STRUCT 968 If this is set then bam_destroy1() will not try to free the bam1_t struct. 969 970 \li \c BAM_USER_OWNS_DATA 971 If this is set, bam_destroy1() will not free the bam1_t::data pointer. 972 Also, functions which need to expand bam1_t::data memory will change 973 behaviour. Instead of calling realloc() on the pointer, they will 974 allocate a new data buffer and copy any existing content in to it. 975 The existing memory will \b not be freed. bam1_t::data will be 976 set to point to the new memory and the BAM_USER_OWNS_DATA flag will be 977 cleared. 978 979 BAM_USER_OWNS_STRUCT allows bam_destroy1() to be called on bam1_t 980 structures that are members of an array. 981 982 BAM_USER_OWNS_DATA can be used by applications that want more control 983 over where the variable-length parts of the bam record will be stored. 984 By preventing calls to free() and realloc(), it allows bam1_t::data 985 to hold pointers to memory that cannot be passed to those functions. 986 987 Example: Read a block of alignment records, storing the variable-length 988 data in a single buffer and the records in an array. Stop when either 989 the array or the buffer is full. 990 991 \code{.c} 992 #define MAX_RECS 1000 993 #define REC_LENGTH 400 // Average length estimate, to get buffer size 994 size_t bufsz = MAX_RECS * REC_LENGTH, nrecs, buff_used = 0; 995 bam1_t *recs = calloc(MAX_RECS, sizeof(bam1_t)); 996 uint8_t *buffer = malloc(bufsz); 997 int res = 0, result = EXIT_FAILURE; 998 uint32_t new_m_data; 999 1000 if (!recs || !buffer) goto cleanup; 1001 for (nrecs = 0; nrecs < MAX_RECS; nrecs++) { 1002 bam_set_mempolicy(&recs[nrecs], BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); 1003 1004 // Set data pointer to unused part of buffer 1005 recs[nrecs].data = &buffer[buff_used]; 1006 1007 // Set m_data to size of unused part of buffer. On 64-bit platforms it 1008 // will be necessary to limit this to UINT32_MAX due to the size of 1009 // bam1_t::m_data (not done here as our buffer is only 400K). 1010 recs[nrecs].m_data = bufsz - buff_used; 1011 1012 // Read the record 1013 res = sam_read1(file_handle, header, &recs[nrecs]); 1014 if (res <= 0) break; // EOF or error 1015 1016 // Check if the record data didn't fit - if not, stop reading 1017 if ((bam_get_mempolicy(&recs[nrecs]) & BAM_USER_OWNS_DATA) == 0) { 1018 nrecs++; // Include last record in count 1019 break; 1020 } 1021 1022 // Adjust m_data to the space actually used. If space is available, 1023 // round up to eight bytes so the next record aligns nicely. 1024 new_m_data = ((uint32_t) recs[nrecs].l_data + 7) & (~7U); 1025 if (new_m_data < recs[nrecs].m_data) recs[nrecs].m_data = new_m_data; 1026 1027 buff_used += recs[nrecs].m_data; 1028 } 1029 if (res < 0) goto cleanup; 1030 result = EXIT_SUCCESS; 1031 1032 // ... use data ... 1033 1034 cleanup: 1035 for (size_t i = 0; i < nrecs; i++) 1036 bam_destroy1(i); 1037 free(buffer); 1038 free(recs); 1039 1040 \endcode 1041 */ 1042 pragma(inline,true) 1043 void bam_set_mempolicy(bam1_t* b, uint policy) { 1044 b.mempolicy = policy; 1045 } 1046 1047 /// Get alignment record memory policy 1048 /** @param b Alignment record 1049 1050 See bam_set_mempolicy() 1051 */ 1052 pragma(inline,true) 1053 uint bam_get_mempolicy(bam1_t* b) { 1054 return b.mempolicy; 1055 } 1056 1057 /// Read a BAM format alignment record 1058 /** 1059 @param fp BGZF file being read 1060 @param b Destination for the alignment data 1061 @return number of bytes read on success 1062 -1 at end of file 1063 < -1 on failure 1064 1065 This function can only read BAM format files. Most code should use 1066 sam_read1() instead, which can be used with BAM, SAM and CRAM formats. 1067 */ 1068 int bam_read1(BGZF* fp, bam1_t* b); 1069 1070 /// Write a BAM format alignment record 1071 /** 1072 @param fp BGZF file being written 1073 @param b Alignment record to write 1074 @return number of bytes written on success 1075 -1 on error 1076 1077 This function can only write BAM format files. Most code should use 1078 sam_write1() instead, which can be used with BAM, SAM and CRAM formats. 1079 */ 1080 int bam_write1(BGZF* fp, const(bam1_t)* b); 1081 1082 /// Copy alignment record data 1083 /** 1084 @param bdst Destination alignment record 1085 @param bsrc Source alignment record 1086 @return bdst on success; NULL on failure 1087 */ 1088 bam1_t* bam_copy1(bam1_t* bdst, const(bam1_t)* bsrc); 1089 1090 /// Create a duplicate alignment record 1091 /** 1092 @param bsrc Source alignment record 1093 @return Pointer to a new alignment record on success; NULL on failure 1094 1095 The bam1_t struct returned by a successful call should be freed 1096 via bam_destroy1() when it is no longer needed. 1097 */ 1098 bam1_t* bam_dup1(const(bam1_t)* bsrc); 1099 1100 /// Sets all components of an alignment structure 1101 /** 1102 @param bam Target alignment structure. Must be initialized by a call to bam_init1(). 1103 The data field will be reallocated automatically as needed. 1104 @param l_qname Length of the query name. If set to 0, the placeholder query name "*" will be used. 1105 @param qname Query name, may be NULL if l_qname = 0 1106 @param flag Bitwise flag, a combination of the BAM_F* constants. 1107 @param tid Chromosome ID, defined by sam_hdr_t (a.k.a. RNAME). 1108 @param pos 0-based leftmost coordinate. 1109 @param mapq Mapping quality. 1110 @param n_cigar Number of CIGAR operations. 1111 @param cigar CIGAR data, may be NULL if n_cigar = 0. 1112 @param mtid Chromosome ID of next read in template, defined by sam_hdr_t (a.k.a. RNEXT). 1113 @param mpos 0-based leftmost coordinate of next read in template (a.k.a. PNEXT). 1114 @param isize Observed template length ("insert size") (a.k.a. TLEN). 1115 @param l_seq Length of the query sequence (read) and sequence quality string. 1116 @param seq Sequence, may be NULL if l_seq = 0. 1117 @param qual Sequence quality, may be NULL. 1118 @param l_aux Length to be reserved for auxiliary field data, may be 0. 1119 1120 @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure. 1121 */ 1122 int bam_set1( 1123 bam1_t* bam, 1124 size_t l_qname, 1125 const(char)* qname, 1126 ushort flag, 1127 int tid, 1128 hts_pos_t pos, 1129 ubyte mapq, 1130 size_t n_cigar, 1131 const(uint)* cigar, 1132 int mtid, 1133 hts_pos_t mpos, 1134 hts_pos_t isize, 1135 size_t l_seq, 1136 const(char)* seq, 1137 const(char)* qual, 1138 size_t l_aux); 1139 1140 /// Calculate query length from CIGAR data 1141 /** 1142 @param n_cigar Number of items in @p cigar 1143 @param cigar CIGAR data 1144 @return Query length 1145 1146 CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op 1147 where op_len is the length in bases and op is a value between 0 and 8 1148 representing one of the operations "MIDNSHP=X" (M = 0; X = 8) 1149 1150 This function returns the sum of the lengths of the M, I, S, = and X 1151 operations in @p cigar (these are the operations that "consume" query 1152 bases). All other operations (including invalid ones) are ignored. 1153 1154 @note This return type of this function is hts_pos_t so that it can 1155 correctly return the length of CIGAR sequences including many long 1156 operations without overflow. However, other restrictions (notably the sizes 1157 of bam1_core_t::l_qseq and bam1_t::data) limit the maximum query sequence 1158 length supported by HTSlib to fewer than INT_MAX bases. 1159 */ 1160 hts_pos_t bam_cigar2qlen(int n_cigar, const(uint)* cigar); 1161 1162 /// Calculate reference length from CIGAR data 1163 /** 1164 @param n_cigar Number of items in @p cigar 1165 @param cigar CIGAR data 1166 @return Reference length 1167 1168 CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op 1169 where op_len is the length in bases and op is a value between 0 and 8 1170 representing one of the operations "MIDNSHP=X" (M = 0; X = 8) 1171 1172 This function returns the sum of the lengths of the M, D, N, = and X 1173 operations in @p cigar (these are the operations that "consume" reference 1174 bases). All other operations (including invalid ones) are ignored. 1175 */ 1176 hts_pos_t bam_cigar2rlen(int n_cigar, const(uint)* cigar); 1177 1178 /*! 1179 @abstract Calculate the rightmost base position of an alignment on the 1180 reference genome. 1181 1182 @param b pointer to an alignment 1183 @return the coordinate of the first base after the alignment, 0-based 1184 1185 @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. 1186 For an unmapped read (either according to its flags or if it has no cigar 1187 string) or a read whose cigar string consumes no reference bases at all, 1188 we return b->core.pos + 1 by convention. 1189 */ 1190 hts_pos_t bam_endpos(const(bam1_t)* b); 1191 1192 int bam_str2flag(const(char)* str); /** returns negative value on error */ 1193 1194 char* bam_flag2str(int flag); /** The string must be freed by the user */ 1195 1196 /*! @function 1197 @abstract Set the name of the query 1198 @param b pointer to an alignment 1199 @return 0 on success, -1 on failure 1200 */ 1201 int bam_set_qname(bam1_t* b, const(char)* qname); 1202 1203 /*! @function 1204 @abstract Parse a CIGAR string into a uint32_t array 1205 @param in [in] pointer to the source string 1206 @param end [out] address of the pointer to the new end of the input string 1207 can be NULL 1208 @param a_cigar [in/out] address of the destination uint32_t buffer 1209 @param a_mem [in/out] address of the allocated number of buffer elements 1210 @return number of processed CIGAR operators; -1 on error 1211 */ 1212 ssize_t sam_parse_cigar( 1213 const(char)* in_, 1214 char** end, 1215 uint** a_cigar, 1216 size_t* a_mem); 1217 1218 /*! @function 1219 @abstract Parse a CIGAR string into a bam1_t struct 1220 @param in [in] pointer to the source string 1221 @param end [out] address of the pointer to the new end of the input string 1222 can be NULL 1223 @param b [in/out] address of the destination bam1_t struct 1224 @return number of processed CIGAR operators; -1 on error 1225 */ 1226 ssize_t bam_parse_cigar(const(char)* in_, char** end, bam1_t* b); 1227 1228 /************************* 1229 *** BAM/CRAM indexing *** 1230 *************************/ 1231 1232 // These BAM iterator functions work only on BAM files. To work with either 1233 // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. 1234 alias bam_itr_destroy = hts_itr_destroy; 1235 alias bam_itr_queryi = sam_itr_queryi; 1236 alias bam_itr_querys = sam_itr_querys; 1237 1238 pragma(inline, true) 1239 extern (D) auto bam_itr_next(T0, T1, T2)(auto ref T0 htsfp, auto ref T1 itr, auto ref T2 r) 1240 { 1241 return hts_itr_next(htsfp.fp.bgzf, itr, r, 0); 1242 } 1243 1244 // Load/build .csi or .bai BAM index file. Does not work with CRAM. 1245 // It is recommended to use the sam_index_* functions below instead. 1246 extern (D) auto bam_index_load(T)(auto ref T fn) 1247 { 1248 return hts_idx_load(fn, HTS_FMT_BAI); 1249 } 1250 1251 extern (D) auto bam_index_build(T0, T1)(auto ref T0 fn, auto ref T1 min_shift) 1252 { 1253 return sam_index_build(fn, min_shift); 1254 } 1255 1256 /// Initialise fp->idx for the current format type for SAM, BAM and CRAM types . 1257 /** @param fp File handle for the data file being written. 1258 @param h Bam header structured (needed for BAI and CSI). 1259 @param min_shift 0 for BAI, or larger for CSI (CSI defaults to 14). 1260 @param fnidx Filename to write index to. This pointer must remain valid 1261 until after sam_idx_save is called. 1262 @return 0 on success, <0 on failure. 1263 1264 @note This must be called after the header has been written, but before 1265 any other data. 1266 */ 1267 int sam_idx_init(htsFile* fp, sam_hdr_t* h, int min_shift, const(char)* fnidx); 1268 1269 /// Writes the index initialised with sam_idx_init to disk. 1270 /** @param fp File handle for the data file being written. 1271 @return 0 on success, <0 on failure. 1272 */ 1273 int sam_idx_save(htsFile* fp); 1274 1275 /// Load a BAM (.csi or .bai) or CRAM (.crai) index file 1276 /** @param fp File handle of the data file whose index is being opened 1277 @param fn BAM/CRAM/etc filename to search alongside for the index file 1278 @return The index, or NULL if an error occurred. 1279 1280 Equivalent to sam_index_load3(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); 1281 */ 1282 hts_idx_t* sam_index_load(htsFile* fp, const(char)* fn); 1283 1284 /// Load a specific BAM (.csi or .bai) or CRAM (.crai) index file 1285 /** @param fp File handle of the data file whose index is being opened 1286 @param fn BAM/CRAM/etc data file filename 1287 @param fnidx Index filename, or NULL to search alongside @a fn 1288 @return The index, or NULL if an error occurred. 1289 1290 Equivalent to sam_index_load3(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); 1291 */ 1292 hts_idx_t* sam_index_load2(htsFile* fp, const(char)* fn, const(char)* fnidx); 1293 1294 /// Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file 1295 /** @param fp File handle of the data file whose index is being opened 1296 @param fn BAM/CRAM/etc data file filename 1297 @param fnidx Index filename, or NULL to search alongside @a fn 1298 @param flags Flags to alter behaviour (see description) 1299 @return The index, or NULL if an error occurred. 1300 1301 The @p flags parameter can be set to a combination of the following values: 1302 1303 HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes 1304 HTS_IDX_SILENT_FAIL Fail silently if the index is not present 1305 1306 Note that HTS_IDX_SAVE_REMOTE has no effect for remote CRAM indexes. They 1307 are always downloaded and never cached locally. 1308 1309 The index struct returned by a successful call should be freed 1310 via hts_idx_destroy() when it is no longer needed. 1311 */ 1312 hts_idx_t* sam_index_load3( 1313 htsFile* fp, 1314 const(char)* fn, 1315 const(char)* fnidx, 1316 int flags); 1317 1318 /// Generate and save an index file 1319 /** @param fn Input BAM/etc filename, to which .csi/etc will be added 1320 @param min_shift Positive to generate CSI, or 0 to generate BAI 1321 @return 0 if successful, or negative if an error occurred (usually -1; or 1322 -2: opening fn failed; -3: format not indexable; -4: 1323 failed to create and/or save the index) 1324 */ 1325 int sam_index_build(const(char)* fn, int min_shift); 1326 1327 /// Generate and save an index to a specific file 1328 /** @param fn Input BAM/CRAM/etc filename 1329 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 1330 @param min_shift Positive to generate CSI, or 0 to generate BAI 1331 @return 0 if successful, or negative if an error occurred (see 1332 sam_index_build for error codes) 1333 */ 1334 int sam_index_build2(const(char)* fn, const(char)* fnidx, int min_shift); 1335 1336 /// Generate and save an index to a specific file 1337 /** @param fn Input BAM/CRAM/etc filename 1338 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 1339 @param min_shift Positive to generate CSI, or 0 to generate BAI 1340 @param nthreads Number of threads to use when building the index 1341 @return 0 if successful, or negative if an error occurred (see 1342 sam_index_build for error codes) 1343 */ 1344 int sam_index_build3( 1345 const(char)* fn, 1346 const(char)* fnidx, 1347 int min_shift, 1348 int nthreads); 1349 1350 /// Free a SAM iterator 1351 /// @param iter Iterator to free 1352 alias sam_itr_destroy = hts_itr_destroy; 1353 1354 /// Create a BAM/CRAM iterator 1355 /** @param idx Index 1356 @param tid Target id 1357 @param beg Start position in target 1358 @param end End position in target 1359 @return An iterator on success; NULL on failure 1360 1361 The following special values (defined in htslib/hts.h)can be used for @p tid. 1362 When using one of these values, @p beg and @p end are ignored. 1363 1364 HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file 1365 HTS_IDX_START iterates over the entire file 1366 HTS_IDX_REST iterates from the current position to the end of the file 1367 HTS_IDX_NONE always returns "no more alignment records" 1368 1369 When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx. 1370 */ 1371 hts_itr_t* sam_itr_queryi( 1372 const(hts_idx_t)* idx, 1373 int tid, 1374 hts_pos_t beg, 1375 hts_pos_t end); 1376 1377 /// Create a SAM/BAM/CRAM iterator 1378 /** @param idx Index 1379 @param hdr Header 1380 @param region Region specification 1381 @return An iterator on success; NULL on failure 1382 1383 Regions are parsed by hts_parse_reg(), and take one of the following forms: 1384 1385 region | Outputs 1386 --------------- | ------------- 1387 REF | All reads with RNAME REF 1388 REF: | All reads with RNAME REF 1389 REF:START | Reads with RNAME REF overlapping START to end of REF 1390 REF:-END | Reads with RNAME REF overlapping start of REF to END 1391 REF:START-END | Reads with RNAME REF overlapping START to END 1392 . | All reads from the start of the file 1393 * | Unmapped reads at the end of the file (RNAME '*' in SAM) 1394 1395 The form `REF:` should be used when the reference name itself contains a colon. 1396 1397 Note that SAM files must be bgzf-compressed for iterators to work. 1398 */ 1399 hts_itr_t* sam_itr_querys( 1400 const(hts_idx_t)* idx, 1401 sam_hdr_t* hdr, 1402 const(char)* region); 1403 1404 /// Create a multi-region iterator 1405 /** @param idx Index 1406 @param hdr Header 1407 @param reglist Array of regions to iterate over 1408 @param regcount Number of items in reglist 1409 1410 Each @p reglist entry should have the reference name in the `reg` field, an 1411 array of regions for that reference in `intervals` and the number of items 1412 in `intervals` should be stored in `count`. No other fields need to be filled 1413 in. 1414 1415 The iterator will return all reads overlapping the given regions. If a read 1416 overlaps more than one region, it will only be returned once. 1417 */ 1418 hts_itr_t* sam_itr_regions( 1419 const(hts_idx_t)* idx, 1420 sam_hdr_t* hdr, 1421 hts_reglist_t* reglist, 1422 uint regcount); 1423 1424 /// Create a multi-region iterator 1425 /** @param idx Index 1426 @param hdr Header 1427 @param regarray Array of ref:interval region specifiers 1428 @param regcount Number of items in regarray 1429 1430 Each @p regarray entry is parsed by hts_parse_reg(), and takes one of the 1431 following forms: 1432 1433 region | Outputs 1434 --------------- | ------------- 1435 REF | All reads with RNAME REF 1436 REF: | All reads with RNAME REF 1437 REF:START | Reads with RNAME REF overlapping START to end of REF 1438 REF:-END | Reads with RNAME REF overlapping start of REF to END 1439 REF:START-END | Reads with RNAME REF overlapping START to END 1440 . | All reads from the start of the file 1441 * | Unmapped reads at the end of the file (RNAME '*' in SAM) 1442 1443 The form `REF:` should be used when the reference name itself contains a colon. 1444 1445 The iterator will return all reads overlapping the given regions. If a read 1446 overlaps more than one region, it will only be returned once. 1447 */ 1448 hts_itr_t* sam_itr_regarray( 1449 const(hts_idx_t)* idx, 1450 sam_hdr_t* hdr, 1451 char** regarray, 1452 uint regcount); 1453 1454 } /// closing @nogc 1455 1456 /// Get the next read from a SAM/BAM/CRAM iterator 1457 /** @param htsfp Htsfile pointer for the input file 1458 @param itr Iterator 1459 @param r Pointer to a bam1_t struct 1460 @return >= 0 on success; -1 when there is no more data; < -1 on error 1461 */ 1462 pragma(inline,true) 1463 int sam_itr_next(htsFile* htsfp, hts_itr_t* itr, bam1_t* r) { 1464 if (!htsfp.is_bgzf && !htsfp.is_cram) { 1465 hts_log_error(__FUNCTION__, format("%s not BGZF compressed", htsfp.fn ? htsfp.fn : "File")); 1466 return -2; 1467 } 1468 if (!itr) { 1469 hts_log_error(__FUNCTION__,"Null iterator"); 1470 return -2; 1471 } 1472 1473 if (itr.multi) 1474 return hts_itr_multi_next(htsfp, itr, r); 1475 else 1476 return hts_itr_next(htsfp.is_bgzf ? htsfp.fp.bgzf : null, itr, r, htsfp); 1477 } 1478 1479 @nogc nothrow: 1480 1481 /// Get the next read from a BAM/CRAM multi-iterator 1482 /** @param htsfp Htsfile pointer for the input file 1483 @param itr Iterator 1484 @param r Pointer to a bam1_t struct 1485 @return >= 0 on success; -1 when there is no more data; < -1 on error 1486 */ 1487 alias sam_itr_multi_next = sam_itr_next; 1488 1489 const(char)* sam_parse_region( 1490 sam_hdr_t* h, 1491 const(char)* s, 1492 int* tid, 1493 hts_pos_t* beg, 1494 hts_pos_t* end, 1495 int flags); 1496 1497 /*************** 1498 *** SAM I/O *** 1499 ***************/ 1500 1501 extern (D) auto sam_open(T0, T1)(auto ref T0 fn, auto ref T1 mode) 1502 { 1503 return hts_open(fn, mode); 1504 } 1505 1506 extern (D) auto sam_open_format(T0, T1, T2)(auto ref T0 fn, auto ref T1 mode, auto ref T2 fmt) 1507 { 1508 return hts_open_format(fn, mode, fmt); 1509 } 1510 1511 alias sam_flush = hts_flush; 1512 alias sam_close = hts_close; 1513 1514 int sam_open_mode(char* mode, const(char)* fn, const(char)* format); 1515 1516 // A version of sam_open_mode that can handle ,key=value options. 1517 // The format string is allocated and returned, to be freed by the caller. 1518 // Prefix should be "r" or "w", 1519 char* sam_open_mode_opts( 1520 const(char)* fn, 1521 const(char)* mode, 1522 const(char)* format); 1523 1524 int sam_hdr_change_HD(sam_hdr_t* h, const(char)* key, const(char)* val); 1525 1526 int sam_parse1(kstring_t* s, sam_hdr_t* h, bam1_t* b); 1527 int sam_format1(const(sam_hdr_t)* h, const(bam1_t)* b, kstring_t* str); 1528 1529 /// sam_read1 - Read a record from a file 1530 /** @param fp Pointer to the source file 1531 * @param h Pointer to the header previously read (fully or partially) 1532 * @param b Pointer to the record placeholder 1533 * @return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error 1534 */ 1535 int sam_read1(samFile* fp, sam_hdr_t* h, bam1_t* b); 1536 /// sam_write1 - Write a record to a file 1537 /** @param fp Pointer to the destination file 1538 * @param h Pointer to the header structure previously read 1539 * @param b Pointer to the record to be written 1540 * @return >= 0 on successfully writing the record, -1 on error 1541 */ 1542 int sam_write1(samFile* fp, const(sam_hdr_t)* h, const(bam1_t)* b); 1543 1544 // Forward declaration, see hts_expr.h for full. 1545 struct hts_filter_t; 1546 1547 /// sam_passes_filter - Checks whether a record passes an hts_filter. 1548 /** @param h Pointer to the header structure previously read 1549 * @param b Pointer to the BAM record to be checked 1550 * @param filt Pointer to the filter, created from hts_filter_init. 1551 * @return 1 if passes, 0 if not, and <0 on error. 1552 */ 1553 int sam_passes_filter( 1554 const(sam_hdr_t)* h, 1555 const(bam1_t)* b, 1556 hts_filter_t* filt); 1557 1558 /************************************* 1559 *** Manipulating auxiliary fields *** 1560 *************************************/ 1561 1562 /// Converts a BAM aux tag to SAM format 1563 /* 1564 * @param b Pointer to the bam record 1565 * @param key Two letter tag key 1566 * @param type Single letter type code: ACcSsIifHZB. 1567 * @param tag Tag data pointer, in BAM format 1568 * @param end Pointer to end of bam record (largest extent of tag) 1569 * @param ks kstring to write the formatted tag to 1570 * 1571 * @return pointer to end of tag on success, 1572 * NULL on failure. 1573 * 1574 * @discussion The three separate parameters key, type, tag may be 1575 * derived from a s=bam_aux_get() query as s-2, *s and s+1. However 1576 * it is recommended to use bam_aux_get_str in this situation. 1577 * The desire to split these parameters up is for potential processing 1578 * of non-BAM formats that encode using a BAM type mechanism 1579 * (such as the internal CRAM representation). 1580 */ 1581 pragma(inline, true) const(ubyte)* sam_format_aux1(const ubyte *key, 1582 const ubyte type, 1583 const ubyte *tag, 1584 const ubyte *end, 1585 kstring_t *ks) { 1586 int r = 0; 1587 const(ubyte) *s = tag; // brevity and consistency with other code. 1588 r |= kputsn_(cast(char*)key, 2, ks) < 0; 1589 r |= kputc_(':', ks) < 0; 1590 if (type == 'C') { 1591 r |= kputsn_(cast(char*)"i:", 2, ks) < 0; 1592 r |= kputw(*s, ks) < 0; 1593 ++s; 1594 } else if (type == 'c') { 1595 r |= kputsn_(cast(char*)"i:", 2, ks) < 0; 1596 r |= kputw(*cast(byte*)s, ks) < 0; 1597 ++s; 1598 } else if (type == 'S') { 1599 if (end - s >= 2) { 1600 r |= kputsn_(cast(char*)"i:", 2, ks) < 0; 1601 r |= kputuw(le_to_u16(s), ks) < 0; 1602 s += 2; 1603 } else goto bad_aux; 1604 } else if (type == 's') { 1605 if (end - s >= 2) { 1606 r |= kputsn_(cast(char*)"i:", 2, ks) < 0; 1607 r |= kputw(le_to_i16(s), ks) < 0; 1608 s += 2; 1609 } else goto bad_aux; 1610 } else if (type == 'I') { 1611 if (end - s >= 4) { 1612 r |= kputsn_(cast(char*)"i:", 2, ks) < 0; 1613 r |= kputuw(le_to_u32(s), ks) < 0; 1614 s += 4; 1615 } else goto bad_aux; 1616 } else if (type == 'i') { 1617 if (end - s >= 4) { 1618 r |= kputsn_(cast(char*)"i:", 2, ks) < 0; 1619 r |= kputw(le_to_i32(s), ks) < 0; 1620 s += 4; 1621 } else goto bad_aux; 1622 } else if (type == 'A') { 1623 r |= kputsn_(cast(char*)"A:", 2, ks) < 0; 1624 r |= kputc_(*s, ks) < 0; 1625 ++s; 1626 } else if (type == 'f') { 1627 if (end - s >= 4) { 1628 // cast to avoid triggering -Wdouble-promotion 1629 ksprintf(ks, cast(char*)"f:%g", cast(double)le_to_float(s)); 1630 s += 4; 1631 } else goto bad_aux; 1632 1633 } else if (type == 'd') { 1634 // NB: "d" is not an official type in the SAM spec. 1635 // However for unknown reasons samtools has always supported this. 1636 // We believe, HOPE, it is not in general usage and we do not 1637 // encourage it. 1638 if (end - s >= 8) { 1639 ksprintf(ks, "d:%g", le_to_double(s)); 1640 s += 8; 1641 } else goto bad_aux; 1642 } else if (type == 'Z' || type == 'H') { 1643 r |= kputc_(type, ks) < 0; 1644 r |= kputc_(':', ks) < 0; 1645 while (s < end && *s) r |= kputc_(*s++, ks) < 0; 1646 if (s >= end) 1647 goto bad_aux; 1648 ++s; 1649 } else if (type == 'B') { 1650 ubyte sub_type = *(s++); 1651 int sub_type_size; 1652 1653 // or externalise sam.c's aux_type2size function? 1654 switch (sub_type) { 1655 case 'A': case 'c': case 'C': 1656 sub_type_size = 1; 1657 break; 1658 case 's': case 'S': 1659 sub_type_size = 2; 1660 break; 1661 case 'i': case 'I': case 'f': 1662 sub_type_size = 4; 1663 break; 1664 default: 1665 sub_type_size = 0; 1666 break; 1667 } 1668 1669 uint i, n; 1670 if (sub_type_size == 0 || end - s < 4) 1671 goto bad_aux; 1672 n = le_to_u32(s); 1673 s += 4; // now points to the start of the array 1674 if ((end - s) / sub_type_size < n) 1675 goto bad_aux; 1676 r |= kputsn_(cast(char*)"B:", 2, ks) < 0; 1677 r |= kputc(sub_type, ks) < 0; // write the type 1678 switch (sub_type) { 1679 case 'c': 1680 if (ks_expand(ks, n*2) < 0) goto mem_err; 1681 for (i = 0; i < n; ++i) { 1682 ks.s[ks.l++] = ','; 1683 r |= kputw(*cast(byte*)s, ks) < 0; 1684 ++s; 1685 } 1686 break; 1687 case 'C': 1688 if (ks_expand(ks, n*2) < 0) goto mem_err; 1689 for (i = 0; i < n; ++i) { 1690 ks.s[ks.l++] = ','; 1691 r |= kputuw(*cast(ubyte*)s, ks) < 0; 1692 ++s; 1693 } 1694 break; 1695 case 's': 1696 if (ks_expand(ks, n*4) < 0) goto mem_err; 1697 for (i = 0; i < n; ++i) { 1698 ks.s[ks.l++] = ','; 1699 r |= kputw(le_to_i16(s), ks) < 0; 1700 s += 2; 1701 } 1702 break; 1703 case 'S': 1704 if (ks_expand(ks, n*4) < 0) goto mem_err; 1705 for (i = 0; i < n; ++i) { 1706 ks.s[ks.l++] = ','; 1707 r |= kputuw(le_to_u16(s), ks) < 0; 1708 s += 2; 1709 } 1710 break; 1711 case 'i': 1712 if (ks_expand(ks, n*6) < 0) goto mem_err; 1713 for (i = 0; i < n; ++i) { 1714 ks.s[ks.l++] = ','; 1715 r |= kputw(le_to_i32(s), ks) < 0; 1716 s += 4; 1717 } 1718 break; 1719 case 'I': 1720 if (ks_expand(ks, n*6) < 0) goto mem_err; 1721 for (i = 0; i < n; ++i) { 1722 ks.s[ks.l++] = ','; 1723 r |= kputuw(le_to_u32(s), ks) < 0; 1724 s += 4; 1725 } 1726 break; 1727 case 'f': 1728 if (ks_expand(ks, n*8) < 0) goto mem_err; 1729 for (i = 0; i < n; ++i) { 1730 ks.s[ks.l++] = ','; 1731 // cast to avoid triggering -Wdouble-promotion 1732 r |= kputd(cast(double)le_to_float(s), ks) < 0; 1733 s += 4; 1734 } 1735 break; 1736 default: 1737 goto bad_aux; 1738 } 1739 } else { // Unknown type 1740 goto bad_aux; 1741 } 1742 return r ? null : s; 1743 1744 bad_aux: 1745 errno = EINVAL; 1746 return null; 1747 1748 mem_err: 1749 import dhtslib.memory: hts_log_errorNoGC; 1750 hts_log_errorNoGC!__FUNCTION__("Out of memory"); 1751 errno = ENOMEM; 1752 return null; 1753 } 1754 1755 /// Return a pointer to an aux record 1756 /** @param b Pointer to the bam record 1757 @param tag Desired aux tag 1758 @return Pointer to the tag data, or NULL if tag is not present or on error 1759 If the tag is not present, this function returns NULL and sets errno to 1760 ENOENT. If the bam record's aux data is corrupt (either a tag has an 1761 invalid type, or the last record is incomplete) then errno is set to 1762 EINVAL and NULL is returned. 1763 */ 1764 ubyte* bam_aux_get(const(bam1_t)* b, ref const(char)[2] tag); 1765 1766 /// Return a SAM formatting string containing a BAM tag 1767 /** @param b Pointer to the bam record 1768 @param tag Desired aux tag 1769 @param s The kstring to write to. 1770 1771 @return 1 on success, 1772 0 on no tag found with errno = ENOENT, 1773 -1 on error (errno will be either EINVAL or ENOMEM). 1774 */ 1775 int bam_aux_get_str(const(bam1_t)* b, ref const(char)[2] tag, kstring_t* s); 1776 1777 /// Get an integer aux value 1778 /** @param s Pointer to the tag data, as returned by bam_aux_get() 1779 @return The value, or 0 if the tag was not an integer type 1780 If the tag is not an integer type, errno is set to EINVAL. This function 1781 will not return the value of floating-point tags. 1782 */ 1783 long bam_aux2i(const(ubyte)* s); 1784 1785 /// Get an integer aux value 1786 /** @param s Pointer to the tag data, as returned by bam_aux_get() 1787 @return The value, or 0 if the tag was not an integer type 1788 If the tag is not an numeric type, errno is set to EINVAL. The value of 1789 integer flags will be returned cast to a double. 1790 */ 1791 double bam_aux2f(const(ubyte)* s); 1792 1793 /// Get a character aux value 1794 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1795 @return The value, or 0 if the tag was not a character ('A') type 1796 If the tag is not a character type, errno is set to EINVAL. 1797 */ 1798 char bam_aux2A(const(ubyte)* s); 1799 1800 /// Get a string aux value 1801 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1802 @return Pointer to the string, or NULL if the tag was not a string type 1803 If the tag is not a string type ('Z' or 'H'), errno is set to EINVAL. 1804 */ 1805 char* bam_aux2Z(const(ubyte)* s); 1806 1807 /// Get the length of an array-type ('B') tag 1808 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1809 @return The length of the array, or 0 if the tag is not an array type. 1810 If the tag is not an array type, errno is set to EINVAL. 1811 */ 1812 uint bam_auxB_len(const(ubyte)* s); 1813 1814 /// Get an integer value from an array-type tag 1815 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1816 @param idx 0-based Index into the array 1817 @return The idx'th value, or 0 on error. 1818 If the array is not an integer type, errno is set to EINVAL. If idx 1819 is greater than or equal to the value returned by bam_auxB_len(s), 1820 errno is set to ERANGE. In both cases, 0 will be returned. 1821 */ 1822 long bam_auxB2i(const(ubyte)* s, uint idx); 1823 1824 /// Get a floating-point value from an array-type tag 1825 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1826 @param idx 0-based Index into the array 1827 @return The idx'th value, or 0.0 on error. 1828 If the array is not a numeric type, errno is set to EINVAL. This can 1829 only actually happen if the input record has an invalid type field. If 1830 idx is greater than or equal to the value returned by bam_auxB_len(s), 1831 errno is set to ERANGE. In both cases, 0.0 will be returned. 1832 */ 1833 double bam_auxB2f(const(ubyte)* s, uint idx); 1834 1835 /// Append tag data to a bam record 1836 /* @param b The bam record to append to. 1837 @param tag Tag identifier 1838 @param type Tag data type 1839 @param len Length of the data in bytes 1840 @param data The data to append 1841 @return 0 on success; -1 on failure. 1842 If there is not enough space to store the additional tag, errno is set to 1843 ENOMEM. If the type is invalid, errno may be set to EINVAL. errno is 1844 also set to EINVAL if the bam record's aux data is corrupt. 1845 */ 1846 int bam_aux_append( 1847 bam1_t* b, 1848 ref const(char)[2] tag, 1849 char type, 1850 int len, 1851 const(ubyte)* data); 1852 1853 /// Delete tag data from a bam record 1854 /* @param b The bam record to update 1855 @param s Pointer to the tag to delete, as returned by bam_aux_get(). 1856 @return 0 on success; -1 on failure 1857 If the bam record's aux data is corrupt, errno is set to EINVAL and this 1858 function returns -1; 1859 */ 1860 int bam_aux_del(bam1_t* b, ubyte* s); 1861 1862 /// Update or add a string-type tag 1863 /* @param b The bam record to update 1864 @param tag Tag identifier 1865 @param len The length of the new string 1866 @param data The new string 1867 @return 0 on success, -1 on failure 1868 This function will not change the ordering of tags in the bam record. 1869 New tags will be appended to any existing aux records. 1870 1871 If @p len is less than zero, the length of the input string will be 1872 calculated using strlen(). Otherwise exactly @p len bytes will be 1873 copied from @p data to make the new tag. If these bytes do not 1874 include a terminating NUL character, one will be added. (Note that 1875 versions of HTSlib up to 1.10.2 had different behaviour here and 1876 simply copied @p len bytes from data. To generate a valid tag it 1877 was necessary to ensure the last character was a NUL, and include 1878 it in @p len.) 1879 1880 On failure, errno may be set to one of the following values: 1881 1882 EINVAL: The bam record's aux data is corrupt or an existing tag with the 1883 given ID is not of type 'Z'. 1884 1885 ENOMEM: The bam data needs to be expanded and either the attempt to 1886 reallocate the data buffer failed or the resulting buffer would be 1887 longer than the maximum size allowed in a bam record (2Gbytes). 1888 */ 1889 int bam_aux_update_str( 1890 bam1_t* b, 1891 ref const(char)[2] tag, 1892 int len, 1893 const(char)* data); 1894 1895 /// Update or add an integer tag 1896 /* @param b The bam record to update 1897 @param tag Tag identifier 1898 @param val The new value 1899 @return 0 on success, -1 on failure 1900 This function will not change the ordering of tags in the bam record. 1901 New tags will be appended to any existing aux records. 1902 1903 On failure, errno may be set to one of the following values: 1904 1905 EINVAL: The bam record's aux data is corrupt or an existing tag with the 1906 given ID is not of an integer type (c, C, s, S, i or I). 1907 1908 EOVERFLOW (or ERANGE on systems that do not have EOVERFLOW): val is 1909 outside the range that can be stored in an integer bam tag (-2147483647 1910 to 4294967295). 1911 1912 ENOMEM: The bam data needs to be expanded and either the attempt to 1913 reallocate the data buffer failed or the resulting buffer would be 1914 longer than the maximum size allowed in a bam record (2Gbytes). 1915 */ 1916 int bam_aux_update_int(bam1_t* b, ref const(char)[2] tag, long val); 1917 1918 /// Update or add a floating-point tag 1919 /* @param b The bam record to update 1920 @param tag Tag identifier 1921 @param val The new value 1922 @return 0 on success, -1 on failure 1923 This function will not change the ordering of tags in the bam record. 1924 New tags will be appended to any existing aux records. 1925 1926 On failure, errno may be set to one of the following values: 1927 1928 EINVAL: The bam record's aux data is corrupt or an existing tag with the 1929 given ID is not of a float type. 1930 1931 ENOMEM: The bam data needs to be expanded and either the attempt to 1932 reallocate the data buffer failed or the resulting buffer would be 1933 longer than the maximum size allowed in a bam record (2Gbytes). 1934 */ 1935 int bam_aux_update_float(bam1_t* b, ref const(char)[2] tag, float val); 1936 1937 /// Update or add an array tag 1938 /* @param b The bam record to update 1939 @param tag Tag identifier 1940 @param type Data type (one of c, C, s, S, i, I or f) 1941 @param items Number of items 1942 @param data Pointer to data 1943 @return 0 on success, -1 on failure 1944 The type parameter indicates the how the data is interpreted: 1945 1946 Letter code | Data type | Item Size (bytes) 1947 ----------- | --------- | ----------------- 1948 c | int8_t | 1 1949 C | uint8_t | 1 1950 s | int16_t | 2 1951 S | uint16_t | 2 1952 i | int32_t | 4 1953 I | uint32_t | 4 1954 f | float | 4 1955 1956 This function will not change the ordering of tags in the bam record. 1957 New tags will be appended to any existing aux records. The bam record 1958 will grow or shrink in order to accommodate the new data. 1959 1960 The data parameter must not point to any data in the bam record itself or 1961 undefined behaviour may result. 1962 1963 On failure, errno may be set to one of the following values: 1964 1965 EINVAL: The bam record's aux data is corrupt, an existing tag with the 1966 given ID is not of an array type or the type parameter is not one of 1967 the values listed above. 1968 1969 ENOMEM: The bam data needs to be expanded and either the attempt to 1970 reallocate the data buffer failed or the resulting buffer would be 1971 longer than the maximum size allowed in a bam record (2Gbytes). 1972 */ 1973 int bam_aux_update_array( 1974 bam1_t* b, 1975 ref const(char)[2] tag, 1976 ubyte type, 1977 uint items, 1978 void* data); 1979 1980 /************************** 1981 *** Pileup and Mpileup *** 1982 **************************/ 1983 1984 /*! @typedef 1985 @abstract Generic pileup 'client data'. 1986 1987 @discussion The pileup iterator allows setting a constructor and 1988 destructor function, which will be called every time a sequence is 1989 fetched and discarded. This permits caching of per-sequence data in 1990 a tidy manner during the pileup process. This union is the cached 1991 data to be manipulated by the "client" (the caller of pileup). 1992 */ 1993 union bam_pileup_cd 1994 { 1995 void* p; 1996 long i; 1997 double f; 1998 } 1999 2000 /*! @typedef 2001 @abstract Structure for one alignment covering the pileup position. 2002 @field b pointer to the alignment 2003 @field qpos position of the read base at the pileup site, 0-based 2004 @field indel indel length; 0 for no indel, positive for ins and negative for del 2005 @field level the level of the read in the "viewer" mode 2006 @field is_del 1 iff the base on the padded read is a deletion 2007 @field is_head 1 iff this is the first base in the query sequence 2008 @field is_tail 1 iff this is the last base in the query sequence 2009 @field is_refskip 1 iff the base on the padded read is part of CIGAR N op 2010 @field aux (used by bcf_call_gap_prep()) 2011 @field cigar_ind index of the CIGAR operator that has just been processed 2012 2013 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The 2014 difference between the two functions is that the former does not 2015 set bam_pileup1_t::level, while the later does. Level helps the 2016 implementation of alignment viewers, but calculating this has some 2017 overhead. 2018 */ 2019 struct bam_pileup1_t 2020 { 2021 import std.bitmanip : bitfields; 2022 2023 bam1_t* b; 2024 int qpos; 2025 int indel; 2026 int level; 2027 2028 mixin(bitfields!( 2029 uint, "is_del", 1, 2030 uint, "is_head", 1, 2031 uint, "is_tail", 1, 2032 uint, "is_refskip", 1, 2033 uint, "", 1, 2034 uint, "aux", 27)); 2035 2036 /* reserved */ 2037 bam_pileup_cd cd; // generic per-struct data, owned by caller. 2038 int cigar_ind; 2039 } 2040 2041 alias bam_plp_auto_f = int function(void* data, bam1_t* b); 2042 2043 struct bam_plp_s; 2044 alias bam_plp_t = bam_plp_s*; 2045 2046 struct bam_mplp_s; 2047 alias bam_mplp_t = bam_mplp_s*; 2048 2049 /** 2050 * bam_plp_init() - sets an iterator over multiple 2051 * @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return 2052 * status: 0 on success, -1 on end, < -1 on non-recoverable errors 2053 * @data: user data to pass to @func 2054 * 2055 * The struct returned by a successful call should be freed 2056 * via bam_plp_destroy() when it is no longer needed. 2057 */ 2058 bam_plp_t bam_plp_init(bam_plp_auto_f func, void* data); 2059 2060 void bam_plp_destroy(bam_plp_t iter); 2061 2062 int bam_plp_push(bam_plp_t iter, const(bam1_t)* b); 2063 2064 const(bam_pileup1_t)* bam_plp_next( 2065 bam_plp_t iter, 2066 int* _tid, 2067 int* _pos, 2068 int* _n_plp); 2069 2070 const(bam_pileup1_t)* bam_plp_auto( 2071 bam_plp_t iter, 2072 int* _tid, 2073 int* _pos, 2074 int* _n_plp); 2075 2076 const(bam_pileup1_t)* bam_plp64_next( 2077 bam_plp_t iter, 2078 int* _tid, 2079 hts_pos_t* _pos, 2080 int* _n_plp); 2081 2082 const(bam_pileup1_t)* bam_plp64_auto( 2083 bam_plp_t iter, 2084 int* _tid, 2085 hts_pos_t* _pos, 2086 int* _n_plp); 2087 2088 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); 2089 2090 void bam_plp_reset(bam_plp_t iter); 2091 2092 /** 2093 * bam_plp_constructor() - sets a callback to initialise any per-pileup1_t fields. 2094 * @plp: The bam_plp_t initialised using bam_plp_init. 2095 * @func: The callback function itself. When called, it is given 2096 * the data argument (specified in bam_plp_init), the bam 2097 * structure and a pointer to a locally allocated 2098 * bam_pileup_cd union. This union will also be present in 2099 * each bam_pileup1_t created. 2100 * The callback function should have a negative return 2101 * value to indicate an error. (Similarly for destructor.) 2102 */ 2103 void bam_plp_constructor( 2104 bam_plp_t plp, 2105 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 2106 void bam_plp_destructor( 2107 bam_plp_t plp, 2108 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 2109 2110 /// Get pileup padded insertion sequence 2111 /** 2112 * @param p pileup data 2113 * @param ins the kstring where the insertion sequence will be written 2114 * @param del_len location for deletion length 2115 * @return the length of insertion string on success; -1 on failure. 2116 * 2117 * Fills out the kstring with the padded insertion sequence for the current 2118 * location in 'p'. If this is not an insertion site, the string is blank. 2119 * 2120 * If del_len is not NULL, the location pointed to is set to the length of 2121 * any deletion immediately following the insertion, or zero if none. 2122 */ 2123 int bam_plp_insertion(const(bam_pileup1_t)* p, kstring_t* ins, int* del_len); 2124 2125 /**! @typedef 2126 @abstract An opaque type used for caching base modification state between 2127 successive calls to bam_mods_* functions. 2128 */ 2129 struct hts_base_mod_state; 2130 2131 /// Get pileup padded insertion sequence, including base modifications 2132 /** 2133 * @param p pileup data 2134 * @param m state data for the base modification finder 2135 * @param ins the kstring where the insertion sequence will be written 2136 * @param del_len location for deletion length 2137 * @return the number of insertion string on success, with string length 2138 * being accessable via ins->l; -1 on failure. 2139 * 2140 * Fills out the kstring with the padded insertion sequence for the current 2141 * location in 'p'. If this is not an insertion site, the string is blank. 2142 * 2143 * The modification state needs to have been previously initialised using 2144 * bam_parse_basemod. It is permitted to be passed in as NULL, in which 2145 * case this function outputs identically to bam_plp_insertion. 2146 * 2147 * If del_len is not NULL, the location pointed to is set to the length of 2148 * any deletion immediately following the insertion, or zero if none. 2149 */ 2150 int bam_plp_insertion_mod( 2151 const(bam_pileup1_t)* p, 2152 hts_base_mod_state* m, 2153 kstring_t* ins, 2154 int* del_len); 2155 2156 /// Create a new bam_mplp_t structure 2157 /** The struct returned by a successful call should be freed 2158 * via bam_mplp_destroy() when it is no longer needed. 2159 */ 2160 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void** data); 2161 2162 /// Set up mpileup overlap detection 2163 /** 2164 * @param iter mpileup iterator 2165 * @return 0 on success; a negative value on error 2166 * 2167 * If called, mpileup will detect overlapping 2168 * read pairs and for each base pair set the base quality of the 2169 * lower-quality base to zero, thus effectively discarding it from 2170 * calling. If the two bases are identical, the quality of the other base 2171 * is increased to the sum of their qualities (capped at 200), otherwise 2172 * it is multiplied by 0.8. 2173 */ 2174 int bam_mplp_init_overlaps(bam_mplp_t iter); 2175 2176 void bam_mplp_destroy(bam_mplp_t iter); 2177 2178 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); 2179 2180 int bam_mplp_auto( 2181 bam_mplp_t iter, 2182 int* _tid, 2183 int* _pos, 2184 int* n_plp, 2185 const(bam_pileup1_t*)* plp); 2186 2187 int bam_mplp64_auto( 2188 bam_mplp_t iter, 2189 int* _tid, 2190 hts_pos_t* _pos, 2191 int* n_plp, 2192 const(bam_pileup1_t*)* plp); 2193 2194 void bam_mplp_reset(bam_mplp_t iter); 2195 2196 void bam_mplp_constructor( 2197 bam_mplp_t iter, 2198 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 2199 2200 void bam_mplp_destructor( 2201 bam_mplp_t iter, 2202 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 2203 2204 // ~!defined(BAM_NO_PILEUP) 2205 2206 /*********************************** 2207 * BAQ calculation and realignment * 2208 ***********************************/ 2209 2210 int sam_cap_mapq(bam1_t* b, const(char)* ref_, hts_pos_t ref_len, int thres); 2211 2212 /// Used as flag parameter in sam_prob_realn. 2213 enum htsRealnFlags { 2214 BAQ_APPLY = 1, 2215 BAQ_EXTEND = 2, 2216 BAQ_REDO = 4, 2217 2218 // Platform subfield, in bit position 3 onwards 2219 BAQ_AUTO = 0 << 3, 2220 BAQ_ILLUMINA = 1 << 3, 2221 BAQ_PACBIOCCS = 2 << 3, 2222 BAQ_PACBIO = 3 << 3, 2223 BAQ_ONT = 4 << 3, 2224 BAQ_GENAPSYS = 5 << 3 2225 } 2226 2227 /// Calculate BAQ scores 2228 /** @param b BAM record 2229 @param ref Reference sequence 2230 @param ref_len Reference sequence length 2231 @param flag Flags, see description 2232 @return 0 on success \n 2233 -1 if the read was unmapped, zero length, had no quality values, did not have at least one M, X or = CIGAR operator, or included a reference skip. \n 2234 -3 if BAQ alignment has already been done and does not need to be applied, or has already been applied. \n 2235 -4 if alignment failed (most likely due to running out of memory) 2236 2237 This function calculates base alignment quality (BAQ) values using the method 2238 described in "Improving SNP discovery by base alignment quality", Heng Li, 2239 Bioinformatics, Volume 27, Issue 8 (https://doi.org/10.1093/bioinformatics/btr076). 2240 2241 The @param flag value can be generated using the htsRealnFlags enum, but for 2242 backwards compatibilty reasons is retained as an "int". An example usage 2243 of the enum could be this, equivalent to flag 19: 2244 2245 sam_prob_realn(b, ref, len, BAQ_APPLY | BAQ_EXTEND | BAQ_PACBIOCCS); 2246 2247 The following @param flag bits can be used: 2248 2249 Bit 0 (BAQ_APPLY): Adjust the quality values using the BAQ values 2250 2251 If set, the data in the BQ:Z tag is used to adjust the quality values, and 2252 the BQ:Z tag is renamed to ZQ:Z. 2253 2254 If clear, and a ZQ:Z tag is present, the quality values are reverted using 2255 the data in the tag, and the tag is renamed to BQ:Z. 2256 2257 Bit 1 (BAQ_EXTEND): Use "extended" BAQ. 2258 2259 Changes the BAQ calculation to increase sensitivity at the expense of 2260 reduced specificity. 2261 2262 Bit 2 (BAQ_REDO): Recalculate BAQ, even if a BQ tag is present. 2263 2264 Force BAQ to be recalculated. Note that a ZQ:Z tag will always disable 2265 recalculation. 2266 2267 Bits 3-10: Choose parameters tailored to a specific instrument type. 2268 2269 One of BAQ_AUTO, BAQ_ILLUMINA, BAQ_PACBIOCCS, BAQ_PACBIO, BAQ_ONT and 2270 BAQ_GENAPSYS. The BAQ parameter tuning are still a work in progress and 2271 at the time of writing mainly consist of Illumina vs long-read technology 2272 adjustments. 2273 2274 @bug 2275 If the input read has both BQ:Z and ZQ:Z tags, the ZQ:Z one will be removed. 2276 Depending on what previous processing happened, this may or may not be the 2277 correct thing to do. It would be wise to avoid this situation if possible. 2278 */ 2279 int sam_prob_realn(bam1_t* b, const(char)* ref_, hts_pos_t ref_len, int flag); 2280 2281 // --------------------------- 2282 // Base modification retrieval 2283 2284 /**! @typedef 2285 @abstract Holds a single base modification. 2286 @field modified_base The short base code (m, h, etc) or -ChEBI (negative) 2287 @field canonical_base The canonical base referred to in the MM tag. 2288 One of A, C, G, T or N. Note this may not be the 2289 explicit base recorded in the SEQ column (esp. if N). 2290 @field stran 0 or 1, indicating + or - strand from MM tag. 2291 @field qual Quality code (256*probability), or -1 if unknown 2292 2293 @discussion 2294 Note this doesn't hold any location data or information on which other 2295 modifications may be possible at this site. 2296 */ 2297 struct hts_base_mod 2298 { 2299 int modified_base; 2300 int canonical_base; 2301 int strand; 2302 int qual; 2303 } 2304 2305 /// Allocates an hts_base_mode_state. 2306 /** 2307 * @return An hts_base_mode_state pointer on success, 2308 * NULL on failure. 2309 * 2310 * This just allocates the memory. The initialisation of the contents is 2311 * done using bam_parse_basemod. Successive calls may be made to that 2312 * without the need to free and allocate a new state. 2313 * 2314 * The state be destroyed using the hts_base_mode_state_free function. 2315 */ 2316 hts_base_mod_state* hts_base_mod_state_alloc(); 2317 2318 /// Destroys an hts_base_mode_state. 2319 /** 2320 * @param state The base modification state pointer. 2321 * 2322 * The should have previously been created by hts_base_mode_state_alloc. 2323 */ 2324 void hts_base_mod_state_free(hts_base_mod_state* state); 2325 2326 /// Parses the Mm and Ml tags out of a bam record. 2327 /** 2328 * @param b BAM alignment record 2329 * @param state The base modification state pointer. 2330 * @return 0 on success, 2331 * -1 on failure. 2332 * 2333 * This fills out the contents of the modification state, resetting the 2334 * iterator location to the first sequence base. 2335 */ 2336 int bam_parse_basemod(const(bam1_t)* b, hts_base_mod_state* state); 2337 2338 /// Returns modification status for the next base position in the query seq. 2339 /** 2340 * @param b BAM alignment record 2341 * @param state The base modification state pointer. 2342 * @param mods A supplied array for returning base modifications 2343 * @param n_mods The size of the mods array 2344 * @return The number of modifications found on success, 2345 * -1 on failure. 2346 * 2347 * This is intended to be used as an iterator, with one call per location 2348 * along the query sequence. 2349 * 2350 * If no modifications are found, the returned value is zero. 2351 * If more than n_mods modifications are found, the total found is returned. 2352 * Note this means the caller needs to check whether this is higher than 2353 * n_mods. 2354 */ 2355 int bam_mods_at_next_pos( 2356 const(bam1_t)* b, 2357 hts_base_mod_state* state, 2358 hts_base_mod* mods, 2359 int n_mods); 2360 2361 /// Finds the next location containing base modifications and returns them 2362 /** 2363 * @param b BAM alignment record 2364 * @param state The base modification state pointer. 2365 * @param mods A supplied array for returning base modifications 2366 * @param n_mods The size of the mods array 2367 * @return The number of modifications found on success, 2368 * 0 if no more modifications are present, 2369 * -1 on failure. 2370 * 2371 * Unlike bam_mods_at_next_pos this skips ahead to the next site 2372 * with modifications. 2373 * 2374 * If more than n_mods modifications are found, the total found is returned. 2375 * Note this means the caller needs to check whether this is higher than 2376 * n_mods. 2377 */ 2378 int bam_next_basemod( 2379 const(bam1_t)* b, 2380 hts_base_mod_state* state, 2381 hts_base_mod* mods, 2382 int n_mods, 2383 int* pos); 2384 2385 /// Returns modification status for a specific query position. 2386 /** 2387 * @param b BAM alignment record 2388 * @param state The base modification state pointer. 2389 * @param mods A supplied array for returning base modifications 2390 * @param n_mods The size of the mods array 2391 * @return The number of modifications found on success, 2392 * -1 on failure. 2393 * 2394 * Note if called multipled times, qpos must be higher than the previous call. 2395 * Hence this is suitable for use from a pileup iterator. If more random 2396 * access is required, bam_parse_basemod must be called each time to reset 2397 * the state although this has an efficiency cost. 2398 * 2399 * If no modifications are found, the returned value is zero. 2400 * If more than n_mods modifications are found, the total found is returned. 2401 * Note this means the caller needs to check whether this is higher than 2402 * n_mods. 2403 */ 2404 int bam_mods_at_qpos( 2405 const(bam1_t)* b, 2406 int qpos, 2407 hts_base_mod_state* state, 2408 hts_base_mod* mods, 2409 int n_mods); 2410 2411 2412 /* 2413 * @param b BAM alignment record 2414 * @param state The base modification state pointer. 2415 * @param code Modification code. If positive this is a character code, 2416 * if negative it is a -ChEBI code. 2417 * 2418 * @param strand Boolean for top (0) or bottom (1) strand 2419 * @param implicit Boolean for whether unlisted positions should be 2420 * implicitly assumed to be unmodified, or require an 2421 * explicit score and should be considered as unknown. 2422 * Returned. 2423 * @param canonical Canonical base type associated with this modification 2424 * Returned. 2425 * 2426 * @return 0 on success or -1 if not found. The strand, implicit and canonical 2427 * fields are filled out if passed in as non-NULL pointers. 2428 */ 2429 int bam_mods_query_type( 2430 hts_base_mod_state* state, 2431 int code, 2432 int* strand, 2433 int* implicit, 2434 char* canonical); 2435 2436 2437 2438 2439 /* 2440 * @param b BAM alignment record 2441 * @param state The base modification state pointer. 2442 * @param ntype Filled out with the number of array elements returned 2443 * 2444 * @return the type array, with *ntype filled out with the size. 2445 * The array returned should not be freed. 2446 * It is a valid pointer until the state is freed using 2447 * hts_base_mod_free(). 2448 */ 2449 int* bam_mods_recorded(hts_base_mod_state* state, int* ntype);