1 /// @file htslib/sam.h 2 /// High-level SAM/BAM/CRAM sequence file operations. 3 /* 4 Copyright (C) 2008, 2009, 2013-2021 Genome Research Ltd. 5 Copyright (C) 2010, 2012, 2013 Broad Institute. 6 7 Author: Heng Li <lh3@sanger.ac.uk> 8 9 Permission is hereby granted, free of charge, to any person obtaining a copy 10 of this software and associated documentation files (the "Software"), to deal 11 in the Software without restriction, including without limitation the rights 12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 copies of the Software, and to permit persons to whom the Software is 14 furnished to do so, subject to the following conditions: 15 16 The above copyright notice and this permission notice shall be included in 17 all copies or substantial portions of the Software. 18 19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 DEALINGS IN THE SOFTWARE. */ 26 27 module htslib.sam; 28 29 import core.stdc.stddef; 30 import core.stdc.stdlib; 31 import std.format: format; 32 33 import htslib.hts; 34 import htslib.hts_log; 35 import htslib.bgzf : BGZF; 36 import htslib.kstring : kstring_t, ssize_t; 37 38 @system: 39 extern (C): 40 @nogc nothrow { 41 42 /// Highest SAM format version supported by this library 43 enum SAM_FORMAT_VERSION = "1.6"; 44 45 /*************************** 46 *** SAM/BAM/CRAM header *** 47 ***************************/ 48 49 /*! @typedef 50 * @abstract Header extension structure, grouping a collection 51 * of hash tables that contain the parsed header data. 52 */ 53 54 struct sam_hrecs_t; 55 56 /*! @typedef 57 @abstract Structure for the alignment header. 58 @field n_targets number of reference sequences 59 @field l_text length of the plain text in the header (may be zero if 60 the header has been edited) 61 @field target_len lengths of the reference sequences 62 @field target_name names of the reference sequences 63 @field text plain text (may be NULL if the header has been edited) 64 @field sdict header dictionary 65 @field hrecs pointer to the extended header struct (internal use only) 66 @field ref_count reference count 67 68 @note The text and l_text fields are included for backwards compatibility. 69 These fields may be set to NULL and zero respectively as a side-effect 70 of calling some header API functions. New code that needs to access the 71 header text should use the sam_hdr_str() and sam_hdr_length() functions 72 instead of these fields. 73 */ 74 75 struct sam_hdr_t 76 { 77 int n_targets; 78 int ignore_sam_err; 79 size_t l_text; 80 uint* target_len; 81 const(byte)* cigar_tab; 82 char** target_name; 83 char* text; 84 void* sdict; 85 sam_hrecs_t* hrecs; 86 uint ref_count; 87 } 88 89 /*! @typedef 90 * @abstract Old name for compatibility with existing code. 91 */ 92 alias bam_hdr_t = sam_hdr_t; 93 94 /**************************** 95 *** CIGAR related macros *** 96 ****************************/ 97 98 enum BAM_CMATCH = 0; 99 enum BAM_CINS = 1; 100 enum BAM_CDEL = 2; 101 enum BAM_CREF_SKIP = 3; 102 enum BAM_CSOFT_CLIP = 4; 103 enum BAM_CHARD_CLIP = 5; 104 enum BAM_CPAD = 6; 105 enum BAM_CEQUAL = 7; 106 enum BAM_CDIFF = 8; 107 enum BAM_CBACK = 9; 108 109 enum BAM_CIGAR_STR = "MIDNSHP=XB"; 110 enum BAM_CIGAR_SHIFT = 4; 111 enum BAM_CIGAR_MASK = 0xf; 112 enum BAM_CIGAR_TYPE = 0x3C1A7; 113 114 /*! @abstract Table for converting a CIGAR operator character to BAM_CMATCH etc. 115 Result is operator code or -1. Be sure to cast the index if it is a plain char: 116 int op = bam_cigar_table[(unsigned char) ch]; 117 */ 118 extern __gshared const(byte)[256] bam_cigar_table; 119 120 extern (D) auto bam_cigar_op(T)(auto ref T c) 121 { 122 return c & BAM_CIGAR_MASK; 123 } 124 125 extern (D) auto bam_cigar_oplen(T)(auto ref T c) 126 { 127 return c >> BAM_CIGAR_SHIFT; 128 } 129 130 // Note that BAM_CIGAR_STR is padded to length 16 bytes below so that 131 // the array look-up will not fall off the end. '?' is chosen as the 132 // padding character so it's easy to spot if one is emitted, and will 133 // result in a parsing failure (in sam_parse1(), at least) if read. 134 extern (D) auto bam_cigar_gen(T0, T1)(auto ref T0 l, auto ref T1 o) 135 { 136 return l << BAM_CIGAR_SHIFT | o; 137 } 138 139 /* bam_cigar_type returns a bit flag with: 140 * bit 1 set if the cigar operation consumes the query 141 * bit 2 set if the cigar operation consumes the reference 142 * 143 * For reference, the unobfuscated truth table for this function is: 144 * BAM_CIGAR_TYPE QUERY REFERENCE 145 * -------------------------------- 146 * BAM_CMATCH 1 1 147 * BAM_CINS 1 0 148 * BAM_CDEL 0 1 149 * BAM_CREF_SKIP 0 1 150 * BAM_CSOFT_CLIP 1 0 151 * BAM_CHARD_CLIP 0 0 152 * BAM_CPAD 0 0 153 * BAM_CEQUAL 1 1 154 * BAM_CDIFF 1 1 155 * BAM_CBACK 0 0 156 * -------------------------------- 157 */ 158 extern (D) auto bam_cigar_type(T)(auto ref T o) 159 { 160 return BAM_CIGAR_TYPE >> (o << 1) & 3; 161 } // bit 1: consume query; bit 2: consume reference 162 163 /*! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ 164 enum BAM_FPAIRED = 1; 165 /*! @abstract the read is mapped in a proper pair */ 166 enum BAM_FPROPER_PAIR = 2; 167 /*! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ 168 enum BAM_FUNMAP = 4; 169 /*! @abstract the mate is unmapped */ 170 enum BAM_FMUNMAP = 8; 171 /*! @abstract the read is mapped to the reverse strand */ 172 enum BAM_FREVERSE = 16; 173 /*! @abstract the mate is mapped to the reverse strand */ 174 enum BAM_FMREVERSE = 32; 175 /*! @abstract this is read1 */ 176 enum BAM_FREAD1 = 64; 177 /*! @abstract this is read2 */ 178 enum BAM_FREAD2 = 128; 179 /*! @abstract not primary alignment */ 180 enum BAM_FSECONDARY = 256; 181 /*! @abstract QC failure */ 182 enum BAM_FQCFAIL = 512; 183 /*! @abstract optical or PCR duplicate */ 184 enum BAM_FDUP = 1024; 185 /*! @abstract supplementary alignment */ 186 enum BAM_FSUPPLEMENTARY = 2048; 187 188 /************************* 189 *** Alignment records *** 190 *************************/ 191 192 /* 193 * Assumptions made here. While pos can be 64-bit, no sequence 194 * itself is that long, but due to ref skip CIGAR fields it 195 * may span more than that. (CIGAR itself is 28-bit len + 4 bit 196 * type, but in theory we can combine multiples together.) 197 * 198 * Mate position and insert size also need to be 64-bit, but 199 * we won't accept more than 32-bit for tid. 200 * 201 * The bam_core_t structure is the *in memory* layout and not 202 * the same as the on-disk format. 64-bit changes here permit 203 * SAM to work with very long chromosomes and permit BAM and CRAM 204 * to seamlessly update in the future without further API/ABI 205 * revisions. 206 */ 207 208 /*! @typedef 209 @abstract Structure for core alignment information. 210 @field pos 0-based leftmost coordinate 211 @field tid chromosome ID, defined by sam_hdr_t 212 @field bin bin calculated by bam_reg2bin() 213 @field qual mapping quality 214 @field l_extranul length of extra NULs between qname & cigar (for alignment) 215 @field flag bitwise flag 216 @field l_qname length of the query name 217 @field n_cigar number of CIGAR operations 218 @field l_qseq length of the query sequence (read) 219 @field mtid chromosome ID of next read in template, defined by sam_hdr_t 220 @field mpos 0-based leftmost coordinate of next read in template 221 @field isize observed template length ("insert size") 222 */ 223 struct bam1_core_t 224 { 225 hts_pos_t pos; 226 int tid; 227 ushort bin; // NB: invalid on 64-bit pos 228 ubyte qual; 229 ubyte l_extranul; 230 ushort flag; 231 ushort l_qname; 232 uint n_cigar; 233 int l_qseq; 234 int mtid; 235 hts_pos_t mpos; 236 hts_pos_t isize; 237 } 238 239 /*! @typedef 240 @abstract Structure for one alignment. 241 @field core core information about the alignment 242 @field id 243 @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux 244 @field l_data current length of bam1_t::data 245 @field m_data maximum length of bam1_t::data 246 @field mempolicy memory handling policy, see bam_set_mempolicy() 247 248 @discussion Notes: 249 250 1. The data blob should be accessed using bam_get_qname, bam_get_cigar, 251 bam_get_seq, bam_get_qual and bam_get_aux macros. These returns pointers 252 to the start of each type of data. 253 2. qname is terminated by one to four NULs, so that the following 254 cigar data is 32-bit aligned; core.l_qname includes these trailing NULs, 255 while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3). 256 3. Cigar data is encoded 4 bytes per CIGAR operation. 257 See the bam_cigar_* macros for manipulation. 258 4. seq is nibble-encoded according to bam_nt16_table. 259 See the bam_seqi macro for retrieving individual bases. 260 5. Per base qualities are stored in the Phred scale with no +33 offset. 261 Ie as per the BAM specification and not the SAM ASCII printable method. 262 */ 263 struct bam1_t 264 { 265 import std.bitmanip : bitfields; 266 267 bam1_core_t core; 268 ulong id; 269 ubyte* data; 270 int l_data; 271 uint m_data; 272 273 mixin(bitfields!( 274 uint, "mempolicy", 2, 275 uint, "", 30)); 276 277 /* Reserved */ 278 } 279 280 /*! @function 281 @abstract Get whether the query is on the reverse strand 282 @param b pointer to an alignment 283 @return boolean true if query is on the reverse strand 284 */ 285 extern (D) auto bam_is_rev(T)(auto ref T b) 286 { 287 return (b.core.flag & BAM_FREVERSE) != 0; 288 } 289 290 /*! @function 291 @abstract Get whether the query's mate is on the reverse strand 292 @param b pointer to an alignment 293 @return boolean true if query's mate on the reverse strand 294 */ 295 extern (D) auto bam_is_mrev(T)(auto ref T b) 296 { 297 return (b.core.flag & BAM_FMREVERSE) != 0; 298 } 299 300 /*! @function 301 @abstract Get the name of the query 302 @param b pointer to an alignment 303 @return pointer to the name string, null terminated 304 */ 305 extern (D) auto bam_get_qname(T)(auto ref T b) 306 { 307 return cast(char*) b.data; 308 } 309 310 /*! @function 311 @abstract Get the CIGAR array 312 @param b pointer to an alignment 313 @return pointer to the CIGAR array 314 315 @discussion In the CIGAR array, each element is a 32-bit integer. The 316 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the 317 length of a CIGAR. 318 */ 319 extern (D) auto bam_get_cigar(bam1_t * b) 320 { 321 return cast(uint*) ((*b).data + (*b).core.l_qname); 322 } 323 324 /*! @function 325 @abstract Get query sequence 326 @param b pointer to an alignment 327 @return pointer to sequence 328 329 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 330 8 for T and 15 for N. Two bases are packed in one byte with the base 331 at the higher 4 bits having smaller coordinate on the read. It is 332 recommended to use bam_seqi() macro to get the base. 333 */ 334 extern (D) auto bam_get_seq(T)(auto ref T b) 335 { 336 return b.data + (b.core.n_cigar << 2) + b.core.l_qname; 337 } 338 339 /*! @function 340 @abstract Get query quality 341 @param b pointer to an alignment 342 @return pointer to quality string 343 */ 344 extern (D) auto bam_get_qual(T)(auto ref T b) 345 { 346 return b.data + (b.core.n_cigar << 2) + b.core.l_qname + ((b.core.l_qseq + 1) >> 1); 347 } 348 349 /*! @function 350 @abstract Get auxiliary data 351 @param b pointer to an alignment 352 @return pointer to the concatenated auxiliary data 353 */ 354 extern (D) auto bam_get_aux(T)(auto ref T b) 355 { 356 return b.data + (b.core.n_cigar << 2) + b.core.l_qname + ((b.core.l_qseq + 1) >> 1) + b.core.l_qseq; 357 } 358 359 /*! @function 360 @abstract Get length of auxiliary data 361 @param b pointer to an alignment 362 @return length of the concatenated auxiliary data 363 */ 364 extern (D) auto bam_get_l_aux(T)(auto ref T b) 365 { 366 return b.l_data - (b.core.n_cigar << 2) - b.core.l_qname - b.core.l_qseq - ((b.core.l_qseq + 1) >> 1); 367 } 368 369 /*! @function 370 @abstract Get a base on read 371 @param s Query sequence returned by bam_get_seq() 372 @param i The i-th position, 0-based 373 @return 4-bit integer representing the base. 374 */ 375 extern (D) auto bam_seqi(T0, T1)(auto ref T0 s, auto ref T1 i) 376 { 377 return s[i >> 1] >> ((~i & 1) << 2) & 0xf; 378 } 379 380 /*! 381 @abstract Modifies a single base in the bam structure. 382 @param s Query sequence returned by bam_get_seq() 383 @param i The i-th position, 0-based 384 @param b Base in nt16 nomenclature (see seq_nt16_table) 385 */ 386 387 extern (D) void bam_set_seqi(T0, T1, T3)(auto ref T0 s, auto ref T1 i, auto ref T3 b) 388 { 389 s[i >> 1] = (s[i >> 1] & (0xf0 >> ((~i & 1) << 2))) | cast(ubyte)(b << ((~i & 1) << 2)); 390 } 391 392 /************************** 393 *** Exported functions *** 394 **************************/ 395 396 /*************** 397 *** BAM I/O *** 398 ***************/ 399 400 /* Header */ 401 402 /// Generates a new unpopulated header structure. 403 /*! 404 * 405 * @return A valid pointer to new header on success, NULL on failure 406 * 407 * The sam_hdr_t struct returned by a successful call should be freed 408 * via sam_hdr_destroy() when it is no longer needed. 409 */ 410 sam_hdr_t* sam_hdr_init(); 411 412 /// Read the header from a BAM compressed file. 413 /*! 414 * @param fp File pointer 415 * @return A valid pointer to new header on success, NULL on failure 416 * 417 * This function only works with BAM files. It is usually better to use 418 * sam_hdr_read(), which works on SAM, BAM and CRAM files. 419 * 420 * The sam_hdr_t struct returned by a successful call should be freed 421 * via sam_hdr_destroy() when it is no longer needed. 422 */ 423 sam_hdr_t* bam_hdr_read(BGZF* fp); 424 425 /// Writes the header to a BAM file. 426 /*! 427 * @param fp File pointer 428 * @param h Header pointer 429 * @return 0 on success, -1 on failure 430 * 431 * This function only works with BAM files. Use sam_hdr_write() to 432 * write in any of the SAM, BAM or CRAM formats. 433 */ 434 int bam_hdr_write(BGZF* fp, const(sam_hdr_t)* h); 435 436 /*! 437 * Frees the resources associated with a header. 438 */ 439 void sam_hdr_destroy(sam_hdr_t* h); 440 441 /// Duplicate a header structure. 442 /*! 443 * @return A valid pointer to new header on success, NULL on failure 444 * 445 * The sam_hdr_t struct returned by a successful call should be freed 446 * via sam_hdr_destroy() when it is no longer needed. 447 */ 448 sam_hdr_t* sam_hdr_dup(const(sam_hdr_t)* h0); 449 450 /*! 451 * @abstract Old names for compatibility with existing code. 452 */ 453 pragma(inline,true) 454 sam_hdr_t* bam_hdr_init() { return sam_hdr_init(); } 455 456 pragma(inline,true) 457 void bam_hdr_destroy(sam_hdr_t* h) { sam_hdr_destroy(h); } 458 459 pragma(inline,true) 460 sam_hdr_t* bam_hdr_dup(const(sam_hdr_t)* h0) { return sam_hdr_dup(h0); } 461 462 alias samFile = htsFile; 463 464 /// Create a header from existing text. 465 /*! 466 * @param l_text Length of text 467 * @param text Header text 468 * @return A populated sam_hdr_t structure on success; NULL on failure. 469 * @note The text field of the returned header will be NULL, and the l_text 470 * field will be zero. 471 * 472 * The sam_hdr_t struct returned by a successful call should be freed 473 * via sam_hdr_destroy() when it is no longer needed. 474 */ 475 sam_hdr_t* sam_hdr_parse(size_t l_text, const(char)* text); 476 477 /// Read a header from a SAM, BAM or CRAM file. 478 /*! 479 * @param fp Pointer to a SAM, BAM or CRAM file handle 480 * @return A populated sam_hdr_t struct on success; NULL on failure. 481 * 482 * The sam_hdr_t struct returned by a successful call should be freed 483 * via sam_hdr_destroy() when it is no longer needed. 484 */ 485 sam_hdr_t* sam_hdr_read(samFile* fp); 486 487 /// Write a header to a SAM, BAM or CRAM file. 488 /*! 489 * @param fp SAM, BAM or CRAM file header 490 * @param h Header structure to write 491 * @return 0 on success; -1 on failure 492 */ 493 int sam_hdr_write(samFile* fp, const(sam_hdr_t)* h); 494 495 /// Returns the current length of the header text. 496 /*! 497 * @return >= 0 on success, SIZE_MAX on failure 498 */ 499 size_t sam_hdr_length(sam_hdr_t* h); 500 501 /// Returns the text representation of the header. 502 /*! 503 * @return valid char pointer on success, NULL on failure 504 * 505 * The returned string is part of the header structure. It will remain 506 * valid until a call to a header API function causes the string to be 507 * invalidated, or the header is destroyed. 508 * 509 * The caller should not attempt to free or realloc this pointer. 510 */ 511 const(char)* sam_hdr_str(sam_hdr_t* h); 512 513 /// Returns the number of references in the header. 514 /*! 515 * @return >= 0 on success, -1 on failure 516 */ 517 int sam_hdr_nref(const(sam_hdr_t)* h); 518 519 /* ==== Line level methods ==== */ 520 521 /// Add formatted lines to an existing header. 522 /*! 523 * @param lines Full SAM header record, eg "@SQ\tSN:foo\tLN:100", with 524 * optional new-line. If it contains more than 1 line then 525 * multiple lines will be added in order 526 * @param len The maximum length of lines (if an early NUL is not 527 * encountered). len may be 0 if unknown, in which case 528 * lines must be NUL-terminated 529 * @return 0 on success, -1 on failure 530 * 531 * The lines will be appended to the end of the existing header 532 * (apart from HD, which always comes first). 533 */ 534 int sam_hdr_add_lines(sam_hdr_t* h, const(char)* lines, size_t len); 535 536 /// Adds a single line to an existing header. 537 /*! 538 * Specify type and one or more key,value pairs, ending with the NULL key. 539 * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL). 540 * 541 * @param type Type of the added line. Eg. "SQ" 542 * @return 0 on success, -1 on failure 543 * 544 * The new line will be added immediately after any others of the same 545 * type, or at the end of the existing header if no lines of the 546 * given type currently exist. The exception is HD lines, which always 547 * come first. If an HD line already exists, it will be replaced. 548 */ 549 int sam_hdr_add_line(sam_hdr_t* h, const(char)* type, ...); 550 551 /// Returns a complete line of formatted text for a given type and ID. 552 /*! 553 * @param type Type of the searched line. Eg. "SQ" 554 * @param ID_key Tag key defining the line. Eg. "SN" 555 * @param ID_value Tag value associated with the key above. Eg. "ref1" 556 * @param ks kstring to hold the result 557 * @return 0 on success; 558 * -1 if no matching line is found 559 * -2 on other failures 560 * 561 * Puts a complete line of formatted text for a specific header type/ID 562 * combination into @p ks. If ID_key is NULL then it returns the first line of 563 * the specified type. 564 * 565 * Any existing content in @p ks will be overwritten. 566 */ 567 int sam_hdr_find_line_id( 568 sam_hdr_t* h, 569 const(char)* type, 570 const(char)* ID_key, 571 const(char)* ID_val, 572 kstring_t* ks); 573 574 /// Returns a complete line of formatted text for a given type and index. 575 /*! 576 * @param type Type of the searched line. Eg. "SQ" 577 * @param position Index in lines of this type (zero-based) 578 * @param ks kstring to hold the result 579 * @return 0 on success; 580 * -1 if no matching line is found 581 * -2 on other failures 582 * 583 * Puts a complete line of formatted text for a specific line into @p ks. 584 * The header line is selected using the @p type and @p position parameters. 585 * 586 * Any existing content in @p ks will be overwritten. 587 */ 588 int sam_hdr_find_line_pos( 589 sam_hdr_t* h, 590 const(char)* type, 591 int pos, 592 kstring_t* ks); 593 594 /// Remove a line with given type / id from a header 595 /*! 596 * @param type Type of the searched line. Eg. "SQ" 597 * @param ID_key Tag key defining the line. Eg. "SN" 598 * @param ID_value Tag value associated with the key above. Eg. "ref1" 599 * @return 0 on success, -1 on error 600 * 601 * Remove a line from the header by specifying a tag:value that uniquely 602 * identifies the line, i.e. the @SQ line containing "SN:ref1". 603 * 604 * \@SQ line is uniquely identified by the SN tag. 605 * \@RG line is uniquely identified by the ID tag. 606 * \@PG line is uniquely identified by the ID tag. 607 * Eg. sam_hdr_remove_line_id(h, "SQ", "SN", "ref1") 608 * 609 * If no key:value pair is specified, the type MUST be followed by a NULL argument and 610 * the first line of the type will be removed, if any. 611 * Eg. sam_hdr_remove_line_id(h, "SQ", NULL, NULL) 612 * 613 * @note Removing \@PG lines is currently unsupported. 614 */ 615 int sam_hdr_remove_line_id( 616 sam_hdr_t* h, 617 const(char)* type, 618 const(char)* ID_key, 619 const(char)* ID_value); 620 621 /// Remove nth line of a given type from a header 622 /*! 623 * @param type Type of the searched line. Eg. "SQ" 624 * @param position Index in lines of this type (zero-based). E.g. 3 625 * @return 0 on success, -1 on error 626 * 627 * Remove a line from the header by specifying the position in the type 628 * group, i.e. 3rd @SQ line. 629 */ 630 int sam_hdr_remove_line_pos(sam_hdr_t* h, const(char)* type, int position); 631 632 /// Add or update tag key,value pairs in a header line. 633 /*! 634 * @param type Type of the searched line. Eg. "SQ" 635 * @param ID_key Tag key defining the line. Eg. "SN" 636 * @param ID_value Tag value associated with the key above. Eg. "ref1" 637 * @return 0 on success, -1 on error 638 * 639 * Adds or updates tag key,value pairs in a header line. 640 * Eg. for adding M5 tags to @SQ lines or updating sort order for the 641 * @HD line. 642 * 643 * Specify multiple key,value pairs ending in NULL. Eg. 644 * sam_hdr_update_line(h, "RG", "ID", "rg1", "DS", "description", "PG", "samtools", NULL) 645 * 646 * Attempting to update the record name (i.e. @SQ SN or @RG ID) will 647 * work as long as the new name is not already in use, however doing this 648 * on a file opened for reading may produce unexpected results. 649 * 650 * Renaming an @RG record in this way will only change the header. Alignment 651 * records written later will not be updated automatically even if they 652 * reference the old read group name. 653 * 654 * Attempting to change an @PG ID tag is not permitted. 655 */ 656 int sam_hdr_update_line( 657 sam_hdr_t* h, 658 const(char)* type, 659 const(char)* ID_key, 660 const(char)* ID_value, 661 ...); 662 663 /// Remove all lines of a given type from a header, except the one matching an ID 664 /*! 665 * @param type Type of the searched line. Eg. "SQ" 666 * @param ID_key Tag key defining the line. Eg. "SN" 667 * @param ID_value Tag value associated with the key above. Eg. "ref1" 668 * @return 0 on success, -1 on failure 669 * 670 * Remove all lines of type <type> from the header, except the one 671 * specified by tag:value, i.e. the @SQ line containing "SN:ref1". 672 * 673 * If no line matches the key:value ID, all lines of the given type are removed. 674 * To remove all lines of a given type, use NULL for both ID_key and ID_value. 675 */ 676 int sam_hdr_remove_except( 677 sam_hdr_t* h, 678 const(char)* type, 679 const(char)* ID_key, 680 const(char)* ID_value); 681 682 /// Remove header lines of a given type, except those in a given ID set 683 /*! 684 * @param type Type of the searched line. Eg. "RG" 685 * @param id Tag key defining the line. Eg. "ID" 686 * @param rh Hash set initialised by the caller with the values to be kept. 687 * See description for how to create this. If @p rh is NULL, all 688 * lines of this type will be removed. 689 * @return 0 on success, -1 on failure 690 * 691 * Remove all lines of type @p type from the header, except the ones 692 * specified in the hash set @p rh. If @p rh is NULL, all lines of 693 * this type will be removed. 694 * Declaration of @p rh is done using KHASH_SET_INIT_STR macro. Eg. 695 * @code{.c} 696 * #include "htslib/khash.h" 697 * KHASH_SET_INIT_STR(keep) 698 * typedef khash_t(keep) *keephash_t; 699 * 700 * void your_method() { 701 * samFile *sf = sam_open("alignment.bam", "r"); 702 * sam_hdr_t *h = sam_hdr_read(sf); 703 * keephash_t rh = kh_init(keep); 704 * int ret = 0; 705 * kh_put(keep, rh, strdup("chr2"), &ret); 706 * kh_put(keep, rh, strdup("chr3"), &ret); 707 * if (sam_hdr_remove_lines(h, "SQ", "SN", rh) == -1) 708 * fprintf(stderr, "Error removing lines\n"); 709 * khint_t k; 710 * for (k = 0; k < kh_end(rh); ++k) 711 * if (kh_exist(rh, k)) free((char*)kh_key(rh, k)); 712 * kh_destroy(keep, rh); 713 * sam_hdr_destroy(h); 714 * sam_close(sf); 715 * } 716 * @endcode 717 * 718 */ 719 int sam_hdr_remove_lines( 720 sam_hdr_t* h, 721 const(char)* type, 722 const(char)* id, 723 void* rh); 724 725 /// Count the number of lines for a given header type 726 /*! 727 * @param h BAM header 728 * @param type Header type to count. Eg. "RG" 729 * @return Number of lines of this type on success; -1 on failure 730 */ 731 int sam_hdr_count_lines(sam_hdr_t* h, const(char)* type); 732 733 /// Index of the line for the types that have dedicated look-up tables (SQ, RG, PG) 734 /*! 735 * @param h BAM header 736 * @param type Type of the searched line. Eg. "RG" 737 * @param key The value of the identifying key. Eg. "rg1" 738 * @return 0-based index on success; -1 if line does not exist; -2 on failure 739 */ 740 int sam_hdr_line_index(sam_hdr_t* bh, const(char)* type, const(char)* key); 741 742 /// Id key of the line for the types that have dedicated look-up tables (SQ, RG, PG) 743 /*! 744 * @param h BAM header 745 * @param type Type of the searched line. Eg. "RG" 746 * @param pos Zero-based index inside the type group. Eg. 2 (for the third RG line) 747 * @return Valid key string on success; NULL on failure 748 */ 749 const(char)* sam_hdr_line_name(sam_hdr_t* bh, const(char)* type, int pos); 750 751 /* ==== Key:val level methods ==== */ 752 753 /// Return the value associated with a key for a header line identified by ID_key:ID_val 754 /*! 755 * @param type Type of the line to which the tag belongs. Eg. "SQ" 756 * @param ID_key Tag key defining the line. Eg. "SN". Can be NULL, if looking for the first line. 757 * @param ID_value Tag value associated with the key above. Eg. "ref1". Can be NULL, if ID_key is NULL. 758 * @param key Key of the searched tag. Eg. "LN" 759 * @param ks kstring where the value will be written 760 * @return 0 on success 761 * -1 if the requested tag does not exist 762 * -2 on other errors 763 * 764 * Looks for a specific key in a single SAM header line and writes the 765 * associated value into @p ks. The header line is selected using the ID_key 766 * and ID_value parameters. Any pre-existing content in @p ks will be 767 * overwritten. 768 */ 769 int sam_hdr_find_tag_id( 770 sam_hdr_t* h, 771 const(char)* type, 772 const(char)* ID_key, 773 const(char)* ID_value, 774 const(char)* key, 775 kstring_t* ks); 776 777 /// Return the value associated with a key for a header line identified by position 778 /*! 779 * @param type Type of the line to which the tag belongs. Eg. "SQ" 780 * @param position Index in lines of this type (zero-based). E.g. 3 781 * @param key Key of the searched tag. Eg. "LN" 782 * @param ks kstring where the value will be written 783 * @return 0 on success 784 * -1 if the requested tag does not exist 785 * -2 on other errors 786 * 787 * Looks for a specific key in a single SAM header line and writes the 788 * associated value into @p ks. The header line is selected using the @p type 789 * and @p position parameters. Any pre-existing content in @p ks will be 790 * overwritten. 791 */ 792 int sam_hdr_find_tag_pos( 793 sam_hdr_t* h, 794 const(char)* type, 795 int pos, 796 const(char)* key, 797 kstring_t* ks); 798 799 /// Remove the key from the line identified by type, ID_key and ID_value. 800 /*! 801 * @param type Type of the line to which the tag belongs. Eg. "SQ" 802 * @param ID_key Tag key defining the line. Eg. "SN" 803 * @param ID_value Tag value associated with the key above. Eg. "ref1" 804 * @param key Key of the targeted tag. Eg. "M5" 805 * @return 1 if the key was removed; 0 if it was not present; -1 on error 806 */ 807 int sam_hdr_remove_tag_id( 808 sam_hdr_t* h, 809 const(char)* type, 810 const(char)* ID_key, 811 const(char)* ID_value, 812 const(char)* key); 813 814 /// Get the target id for a given reference sequence name 815 /*! 816 * @param ref Reference name 817 * @return Positive value on success, 818 * -1 if unknown reference, 819 * -2 if the header could not be parsed 820 * 821 * Looks up a reference sequence by name in the reference hash table 822 * and returns the numerical target id. 823 */ 824 int sam_hdr_name2tid(sam_hdr_t* h, const(char)* ref_); 825 826 /// Get the reference sequence name from a target index 827 /*! 828 * @param tid Target index 829 * @return Valid reference name on success, NULL on failure 830 * 831 * Fetch the reference sequence name from the target name array, 832 * using the numerical target id. 833 */ 834 const(char)* sam_hdr_tid2name(const(sam_hdr_t)* h, int tid); 835 836 /// Get the reference sequence length from a target index 837 /*! 838 * @param tid Target index 839 * @return Strictly positive value on success, 0 on failure 840 * 841 * Fetch the reference sequence length from the target length array, 842 * using the numerical target id. 843 */ 844 hts_pos_t sam_hdr_tid2len(const(sam_hdr_t)* h, int tid); 845 846 /// Alias of sam_hdr_name2tid(), for backwards compatibility. 847 /*! 848 * @param ref Reference name 849 * @return Positive value on success, 850 * -1 if unknown reference, 851 * -2 if the header could not be parsed 852 */ 853 pragma(inline,true) 854 int bam_name2id(sam_hdr_t* h, const(char)* ref_) { return sam_hdr_name2tid(h, ref_); } 855 856 /// Generate a unique \@PG ID: value 857 /*! 858 * @param name Name of the program. Eg. samtools 859 * @return Valid ID on success, NULL on failure 860 * 861 * Returns a unique ID from a base name. The string returned will remain 862 * valid until the next call to this function, or the header is destroyed. 863 * The caller should not attempt to free() or realloc() it. 864 */ 865 const(char)* sam_hdr_pg_id(sam_hdr_t* h, const(char)* name); 866 867 /// Add an \@PG line. 868 /*! 869 * @param name Name of the program. Eg. samtools 870 * @return 0 on success, -1 on failure 871 * 872 * If we wish complete control over this use sam_hdr_add_line() directly. This 873 * function uses that, but attempts to do a lot of tedious house work for 874 * you too. 875 * 876 * - It will generate a suitable ID if the supplied one clashes. 877 * - It will generate multiple \@PG records if we have multiple PG chains. 878 * 879 * Call it as per sam_hdr_add_line() with a series of key,value pairs ending 880 * in NULL. 881 */ 882 int sam_hdr_add_pg(sam_hdr_t* h, const(char)* name, ...); 883 884 /*! 885 * A function to help with construction of CL tags in @PG records. 886 * Takes an argc, argv pair and returns a single space-separated string. 887 * This string should be deallocated by the calling function. 888 * 889 * @return 890 * Returns malloced char * on success; 891 * NULL on failure 892 */ 893 char* stringify_argv(int argc, char** argv); 894 895 /// Increments the reference count on a header 896 /*! 897 * This permits multiple files to share the same header, all calling 898 * sam_hdr_destroy when done, without causing errors for other open files. 899 */ 900 void sam_hdr_incr_ref(sam_hdr_t* h); 901 902 /* 903 * Macros for changing the \@HD line. They eliminate the need to use NULL method arguments. 904 */ 905 906 /// Returns the SAM formatted text of the \@HD header line 907 extern (D) auto sam_hdr_find_hd(T0, T1)(auto ref T0 h, auto ref T1 ks) 908 { 909 return sam_hdr_find_line_id(h, "HD", NULL, NULL, ks); 910 } 911 912 /// Returns the value associated with a given \@HD line tag 913 extern (D) auto sam_hdr_find_tag_hd(T0, T1, T2)(auto ref T0 h, auto ref T1 key, auto ref T2 ks) 914 { 915 return sam_hdr_find_tag_id(h, "HD", NULL, NULL, key, ks); 916 } 917 918 /// Adds or updates tags on the header \@HD line 919 extern (D) auto sam_hdr_update_hd(T, A...)(auto ref T h, auto ref A a) 920 { 921 // NOTE: This macro was dropped by dstep due to variadic args 922 static assert (a.length %2 == 0); // K-V pairs => even number of variadic args 923 return sam_hdr_update_line(h, "HD", null, null, a, null); 924 } 925 926 /// Removes the \@HD line tag with the given key 927 extern (D) auto sam_hdr_remove_tag_hd(T0, T1)(auto ref T0 h, auto ref T1 key) 928 { 929 return sam_hdr_remove_tag_id(h, "HD", NULL, NULL, key); 930 } 931 932 /* Alignment */ 933 934 /// Create a new bam1_t alignment structure 935 /** 936 @return An empty bam1_t structure on success, NULL on failure 937 938 The bam1_t struct returned by a successful call should be freed 939 via bam_destroy1() when it is no longer needed. 940 */ 941 bam1_t* bam_init1(); 942 943 /// Destroy a bam1_t structure 944 /** 945 @param b structure to destroy 946 947 Does nothing if @p b is NULL. If not, all memory associated with @p b 948 will be freed, along with the structure itself. @p b should not be 949 accessed after calling this function. 950 */ 951 void bam_destroy1(bam1_t* b); 952 953 enum BAM_USER_OWNS_STRUCT = 1; 954 enum BAM_USER_OWNS_DATA = 2; 955 956 /// Set alignment record memory policy 957 /** 958 @param b Alignment record 959 @param policy Desired policy 960 961 Allows the way HTSlib reallocates and frees bam1_t data to be 962 changed. @policy can be set to the bitwise-or of the following 963 values: 964 965 \li \c BAM_USER_OWNS_STRUCT 966 If this is set then bam_destroy1() will not try to free the bam1_t struct. 967 968 \li \c BAM_USER_OWNS_DATA 969 If this is set, bam_destroy1() will not free the bam1_t::data pointer. 970 Also, functions which need to expand bam1_t::data memory will change 971 behaviour. Instead of calling realloc() on the pointer, they will 972 allocate a new data buffer and copy any existing content in to it. 973 The existing memory will \b not be freed. bam1_t::data will be 974 set to point to the new memory and the BAM_USER_OWNS_DATA flag will be 975 cleared. 976 977 BAM_USER_OWNS_STRUCT allows bam_destroy1() to be called on bam1_t 978 structures that are members of an array. 979 980 BAM_USER_OWNS_DATA can be used by applications that want more control 981 over where the variable-length parts of the bam record will be stored. 982 By preventing calls to free() and realloc(), it allows bam1_t::data 983 to hold pointers to memory that cannot be passed to those functions. 984 985 Example: Read a block of alignment records, storing the variable-length 986 data in a single buffer and the records in an array. Stop when either 987 the array or the buffer is full. 988 989 \code{.c} 990 #define MAX_RECS 1000 991 #define REC_LENGTH 400 // Average length estimate, to get buffer size 992 size_t bufsz = MAX_RECS * REC_LENGTH, nrecs, buff_used = 0; 993 bam1_t *recs = calloc(MAX_RECS, sizeof(bam1_t)); 994 uint8_t *buffer = malloc(bufsz); 995 int res = 0, result = EXIT_FAILURE; 996 uint32_t new_m_data; 997 998 if (!recs || !buffer) goto cleanup; 999 for (nrecs = 0; nrecs < MAX_RECS; nrecs++) { 1000 bam_set_mempolicy(&recs[nrecs], BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); 1001 1002 // Set data pointer to unused part of buffer 1003 recs[nrecs].data = &buffer[buff_used]; 1004 1005 // Set m_data to size of unused part of buffer. On 64-bit platforms it 1006 // will be necessary to limit this to UINT32_MAX due to the size of 1007 // bam1_t::m_data (not done here as our buffer is only 400K). 1008 recs[nrecs].m_data = bufsz - buff_used; 1009 1010 // Read the record 1011 res = sam_read1(file_handle, header, &recs[nrecs]); 1012 if (res <= 0) break; // EOF or error 1013 1014 // Check if the record data didn't fit - if not, stop reading 1015 if ((bam_get_mempolicy(&recs[nrecs]) & BAM_USER_OWNS_DATA) == 0) { 1016 nrecs++; // Include last record in count 1017 break; 1018 } 1019 1020 // Adjust m_data to the space actually used. If space is available, 1021 // round up to eight bytes so the next record aligns nicely. 1022 new_m_data = ((uint32_t) recs[nrecs].l_data + 7) & (~7U); 1023 if (new_m_data < recs[nrecs].m_data) recs[nrecs].m_data = new_m_data; 1024 1025 buff_used += recs[nrecs].m_data; 1026 } 1027 if (res < 0) goto cleanup; 1028 result = EXIT_SUCCESS; 1029 1030 // ... use data ... 1031 1032 cleanup: 1033 for (size_t i = 0; i < nrecs; i++) 1034 bam_destroy1(i); 1035 free(buffer); 1036 free(recs); 1037 1038 \endcode 1039 */ 1040 pragma(inline,true) 1041 void bam_set_mempolicy(bam1_t* b, uint policy) { 1042 b.mempolicy = policy; 1043 } 1044 1045 /// Get alignment record memory policy 1046 /** @param b Alignment record 1047 1048 See bam_set_mempolicy() 1049 */ 1050 pragma(inline,true) 1051 uint bam_get_mempolicy(bam1_t* b) { 1052 return b.mempolicy; 1053 } 1054 1055 /// Read a BAM format alignment record 1056 /** 1057 @param fp BGZF file being read 1058 @param b Destination for the alignment data 1059 @return number of bytes read on success 1060 -1 at end of file 1061 < -1 on failure 1062 1063 This function can only read BAM format files. Most code should use 1064 sam_read1() instead, which can be used with BAM, SAM and CRAM formats. 1065 */ 1066 int bam_read1(BGZF* fp, bam1_t* b); 1067 1068 /// Write a BAM format alignment record 1069 /** 1070 @param fp BGZF file being written 1071 @param b Alignment record to write 1072 @return number of bytes written on success 1073 -1 on error 1074 1075 This function can only write BAM format files. Most code should use 1076 sam_write1() instead, which can be used with BAM, SAM and CRAM formats. 1077 */ 1078 int bam_write1(BGZF* fp, const(bam1_t)* b); 1079 1080 /// Copy alignment record data 1081 /** 1082 @param bdst Destination alignment record 1083 @param bsrc Source alignment record 1084 @return bdst on success; NULL on failure 1085 */ 1086 bam1_t* bam_copy1(bam1_t* bdst, const(bam1_t)* bsrc); 1087 1088 /// Create a duplicate alignment record 1089 /** 1090 @param bsrc Source alignment record 1091 @return Pointer to a new alignment record on success; NULL on failure 1092 1093 The bam1_t struct returned by a successful call should be freed 1094 via bam_destroy1() when it is no longer needed. 1095 */ 1096 bam1_t* bam_dup1(const(bam1_t)* bsrc); 1097 1098 /// Sets all components of an alignment structure 1099 /** 1100 @param bam Target alignment structure. Must be initialized by a call to bam_init1(). 1101 The data field will be reallocated automatically as needed. 1102 @param l_qname Length of the query name. If set to 0, the placeholder query name "*" will be used. 1103 @param qname Query name, may be NULL if l_qname = 0 1104 @param flag Bitwise flag, a combination of the BAM_F* constants. 1105 @param tid Chromosome ID, defined by sam_hdr_t (a.k.a. RNAME). 1106 @param pos 0-based leftmost coordinate. 1107 @param mapq Mapping quality. 1108 @param n_cigar Number of CIGAR operations. 1109 @param cigar CIGAR data, may be NULL if n_cigar = 0. 1110 @param mtid Chromosome ID of next read in template, defined by sam_hdr_t (a.k.a. RNEXT). 1111 @param mpos 0-based leftmost coordinate of next read in template (a.k.a. PNEXT). 1112 @param isize Observed template length ("insert size") (a.k.a. TLEN). 1113 @param l_seq Length of the query sequence (read) and sequence quality string. 1114 @param seq Sequence, may be NULL if l_seq = 0. 1115 @param qual Sequence quality, may be NULL. 1116 @param l_aux Length to be reserved for auxiliary field data, may be 0. 1117 1118 @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure. 1119 */ 1120 int bam_set1( 1121 bam1_t* bam, 1122 size_t l_qname, 1123 const(char)* qname, 1124 ushort flag, 1125 int tid, 1126 hts_pos_t pos, 1127 ubyte mapq, 1128 size_t n_cigar, 1129 const(uint)* cigar, 1130 int mtid, 1131 hts_pos_t mpos, 1132 hts_pos_t isize, 1133 size_t l_seq, 1134 const(char)* seq, 1135 const(char)* qual, 1136 size_t l_aux); 1137 1138 /// Calculate query length from CIGAR data 1139 /** 1140 @param n_cigar Number of items in @p cigar 1141 @param cigar CIGAR data 1142 @return Query length 1143 1144 CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op 1145 where op_len is the length in bases and op is a value between 0 and 8 1146 representing one of the operations "MIDNSHP=X" (M = 0; X = 8) 1147 1148 This function returns the sum of the lengths of the M, I, S, = and X 1149 operations in @p cigar (these are the operations that "consume" query 1150 bases). All other operations (including invalid ones) are ignored. 1151 1152 @note This return type of this function is hts_pos_t so that it can 1153 correctly return the length of CIGAR sequences including many long 1154 operations without overflow. However, other restrictions (notably the sizes 1155 of bam1_core_t::l_qseq and bam1_t::data) limit the maximum query sequence 1156 length supported by HTSlib to fewer than INT_MAX bases. 1157 */ 1158 hts_pos_t bam_cigar2qlen(int n_cigar, const(uint)* cigar); 1159 1160 /// Calculate reference length from CIGAR data 1161 /** 1162 @param n_cigar Number of items in @p cigar 1163 @param cigar CIGAR data 1164 @return Reference length 1165 1166 CIGAR data is stored as in the BAM format, i.e. (op_len << 4) | op 1167 where op_len is the length in bases and op is a value between 0 and 8 1168 representing one of the operations "MIDNSHP=X" (M = 0; X = 8) 1169 1170 This function returns the sum of the lengths of the M, D, N, = and X 1171 operations in @p cigar (these are the operations that "consume" reference 1172 bases). All other operations (including invalid ones) are ignored. 1173 */ 1174 hts_pos_t bam_cigar2rlen(int n_cigar, const(uint)* cigar); 1175 1176 /*! 1177 @abstract Calculate the rightmost base position of an alignment on the 1178 reference genome. 1179 1180 @param b pointer to an alignment 1181 @return the coordinate of the first base after the alignment, 0-based 1182 1183 @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. 1184 For an unmapped read (either according to its flags or if it has no cigar 1185 string) or a read whose cigar string consumes no reference bases at all, 1186 we return b->core.pos + 1 by convention. 1187 */ 1188 hts_pos_t bam_endpos(const(bam1_t)* b); 1189 1190 int bam_str2flag(const(char)* str); /** returns negative value on error */ 1191 1192 char* bam_flag2str(int flag); /** The string must be freed by the user */ 1193 1194 /*! @function 1195 @abstract Set the name of the query 1196 @param b pointer to an alignment 1197 @return 0 on success, -1 on failure 1198 */ 1199 int bam_set_qname(bam1_t* b, const(char)* qname); 1200 1201 /*! @function 1202 @abstract Parse a CIGAR string into a uint32_t array 1203 @param in [in] pointer to the source string 1204 @param end [out] address of the pointer to the new end of the input string 1205 can be NULL 1206 @param a_cigar [in/out] address of the destination uint32_t buffer 1207 @param a_mem [in/out] address of the allocated number of buffer elements 1208 @return number of processed CIGAR operators; -1 on error 1209 */ 1210 ssize_t sam_parse_cigar( 1211 const(char)* in_, 1212 char** end, 1213 uint** a_cigar, 1214 size_t* a_mem); 1215 1216 /*! @function 1217 @abstract Parse a CIGAR string into a bam1_t struct 1218 @param in [in] pointer to the source string 1219 @param end [out] address of the pointer to the new end of the input string 1220 can be NULL 1221 @param b [in/out] address of the destination bam1_t struct 1222 @return number of processed CIGAR operators; -1 on error 1223 */ 1224 ssize_t bam_parse_cigar(const(char)* in_, char** end, bam1_t* b); 1225 1226 /************************* 1227 *** BAM/CRAM indexing *** 1228 *************************/ 1229 1230 // These BAM iterator functions work only on BAM files. To work with either 1231 // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. 1232 alias bam_itr_destroy = hts_itr_destroy; 1233 alias bam_itr_queryi = sam_itr_queryi; 1234 alias bam_itr_querys = sam_itr_querys; 1235 1236 pragma(inline, true) 1237 extern (D) auto bam_itr_next(T0, T1, T2)(auto ref T0 htsfp, auto ref T1 itr, auto ref T2 r) 1238 { 1239 return hts_itr_next(htsfp.fp.bgzf, itr, r, 0); 1240 } 1241 1242 // Load/build .csi or .bai BAM index file. Does not work with CRAM. 1243 // It is recommended to use the sam_index_* functions below instead. 1244 extern (D) auto bam_index_load(T)(auto ref T fn) 1245 { 1246 return hts_idx_load(fn, HTS_FMT_BAI); 1247 } 1248 1249 extern (D) auto bam_index_build(T0, T1)(auto ref T0 fn, auto ref T1 min_shift) 1250 { 1251 return sam_index_build(fn, min_shift); 1252 } 1253 1254 /// Initialise fp->idx for the current format type for SAM, BAM and CRAM types . 1255 /** @param fp File handle for the data file being written. 1256 @param h Bam header structured (needed for BAI and CSI). 1257 @param min_shift 0 for BAI, or larger for CSI (CSI defaults to 14). 1258 @param fnidx Filename to write index to. This pointer must remain valid 1259 until after sam_idx_save is called. 1260 @return 0 on success, <0 on failure. 1261 1262 @note This must be called after the header has been written, but before 1263 any other data. 1264 */ 1265 int sam_idx_init(htsFile* fp, sam_hdr_t* h, int min_shift, const(char)* fnidx); 1266 1267 /// Writes the index initialised with sam_idx_init to disk. 1268 /** @param fp File handle for the data file being written. 1269 @return 0 on success, <0 on failure. 1270 */ 1271 int sam_idx_save(htsFile* fp); 1272 1273 /// Load a BAM (.csi or .bai) or CRAM (.crai) index file 1274 /** @param fp File handle of the data file whose index is being opened 1275 @param fn BAM/CRAM/etc filename to search alongside for the index file 1276 @return The index, or NULL if an error occurred. 1277 1278 Equivalent to sam_index_load3(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); 1279 */ 1280 hts_idx_t* sam_index_load(htsFile* fp, const(char)* fn); 1281 1282 /// Load a specific BAM (.csi or .bai) or CRAM (.crai) index file 1283 /** @param fp File handle of the data file whose index is being opened 1284 @param fn BAM/CRAM/etc data file filename 1285 @param fnidx Index filename, or NULL to search alongside @a fn 1286 @return The index, or NULL if an error occurred. 1287 1288 Equivalent to sam_index_load3(fp, fn, fnidx, HTS_IDX_SAVE_REMOTE); 1289 */ 1290 hts_idx_t* sam_index_load2(htsFile* fp, const(char)* fn, const(char)* fnidx); 1291 1292 /// Load or stream a BAM (.csi or .bai) or CRAM (.crai) index file 1293 /** @param fp File handle of the data file whose index is being opened 1294 @param fn BAM/CRAM/etc data file filename 1295 @param fnidx Index filename, or NULL to search alongside @a fn 1296 @param flags Flags to alter behaviour (see description) 1297 @return The index, or NULL if an error occurred. 1298 1299 The @p flags parameter can be set to a combination of the following values: 1300 1301 HTS_IDX_SAVE_REMOTE Save a local copy of any remote indexes 1302 HTS_IDX_SILENT_FAIL Fail silently if the index is not present 1303 1304 Note that HTS_IDX_SAVE_REMOTE has no effect for remote CRAM indexes. They 1305 are always downloaded and never cached locally. 1306 1307 The index struct returned by a successful call should be freed 1308 via hts_idx_destroy() when it is no longer needed. 1309 */ 1310 hts_idx_t* sam_index_load3( 1311 htsFile* fp, 1312 const(char)* fn, 1313 const(char)* fnidx, 1314 int flags); 1315 1316 /// Generate and save an index file 1317 /** @param fn Input BAM/etc filename, to which .csi/etc will be added 1318 @param min_shift Positive to generate CSI, or 0 to generate BAI 1319 @return 0 if successful, or negative if an error occurred (usually -1; or 1320 -2: opening fn failed; -3: format not indexable; -4: 1321 failed to create and/or save the index) 1322 */ 1323 int sam_index_build(const(char)* fn, int min_shift); 1324 1325 /// Generate and save an index to a specific file 1326 /** @param fn Input BAM/CRAM/etc filename 1327 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 1328 @param min_shift Positive to generate CSI, or 0 to generate BAI 1329 @return 0 if successful, or negative if an error occurred (see 1330 sam_index_build for error codes) 1331 */ 1332 int sam_index_build2(const(char)* fn, const(char)* fnidx, int min_shift); 1333 1334 /// Generate and save an index to a specific file 1335 /** @param fn Input BAM/CRAM/etc filename 1336 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 1337 @param min_shift Positive to generate CSI, or 0 to generate BAI 1338 @param nthreads Number of threads to use when building the index 1339 @return 0 if successful, or negative if an error occurred (see 1340 sam_index_build for error codes) 1341 */ 1342 int sam_index_build3( 1343 const(char)* fn, 1344 const(char)* fnidx, 1345 int min_shift, 1346 int nthreads); 1347 1348 /// Free a SAM iterator 1349 /// @param iter Iterator to free 1350 alias sam_itr_destroy = hts_itr_destroy; 1351 1352 /// Create a BAM/CRAM iterator 1353 /** @param idx Index 1354 @param tid Target id 1355 @param beg Start position in target 1356 @param end End position in target 1357 @return An iterator on success; NULL on failure 1358 1359 The following special values (defined in htslib/hts.h)can be used for @p tid. 1360 When using one of these values, @p beg and @p end are ignored. 1361 1362 HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file 1363 HTS_IDX_START iterates over the entire file 1364 HTS_IDX_REST iterates from the current position to the end of the file 1365 HTS_IDX_NONE always returns "no more alignment records" 1366 1367 When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx. 1368 */ 1369 hts_itr_t* sam_itr_queryi( 1370 const(hts_idx_t)* idx, 1371 int tid, 1372 hts_pos_t beg, 1373 hts_pos_t end); 1374 1375 /// Create a SAM/BAM/CRAM iterator 1376 /** @param idx Index 1377 @param hdr Header 1378 @param region Region specification 1379 @return An iterator on success; NULL on failure 1380 1381 Regions are parsed by hts_parse_reg(), and take one of the following forms: 1382 1383 region | Outputs 1384 --------------- | ------------- 1385 REF | All reads with RNAME REF 1386 REF: | All reads with RNAME REF 1387 REF:START | Reads with RNAME REF overlapping START to end of REF 1388 REF:-END | Reads with RNAME REF overlapping start of REF to END 1389 REF:START-END | Reads with RNAME REF overlapping START to END 1390 . | All reads from the start of the file 1391 * | Unmapped reads at the end of the file (RNAME '*' in SAM) 1392 1393 The form `REF:` should be used when the reference name itself contains a colon. 1394 1395 Note that SAM files must be bgzf-compressed for iterators to work. 1396 */ 1397 hts_itr_t* sam_itr_querys( 1398 const(hts_idx_t)* idx, 1399 sam_hdr_t* hdr, 1400 const(char)* region); 1401 1402 /// Create a multi-region iterator 1403 /** @param idx Index 1404 @param hdr Header 1405 @param reglist Array of regions to iterate over 1406 @param regcount Number of items in reglist 1407 1408 Each @p reglist entry should have the reference name in the `reg` field, an 1409 array of regions for that reference in `intervals` and the number of items 1410 in `intervals` should be stored in `count`. No other fields need to be filled 1411 in. 1412 1413 The iterator will return all reads overlapping the given regions. If a read 1414 overlaps more than one region, it will only be returned once. 1415 */ 1416 hts_itr_t* sam_itr_regions( 1417 const(hts_idx_t)* idx, 1418 sam_hdr_t* hdr, 1419 hts_reglist_t* reglist, 1420 uint regcount); 1421 1422 /// Create a multi-region iterator 1423 /** @param idx Index 1424 @param hdr Header 1425 @param regarray Array of ref:interval region specifiers 1426 @param regcount Number of items in regarray 1427 1428 Each @p regarray entry is parsed by hts_parse_reg(), and takes one of the 1429 following forms: 1430 1431 region | Outputs 1432 --------------- | ------------- 1433 REF | All reads with RNAME REF 1434 REF: | All reads with RNAME REF 1435 REF:START | Reads with RNAME REF overlapping START to end of REF 1436 REF:-END | Reads with RNAME REF overlapping start of REF to END 1437 REF:START-END | Reads with RNAME REF overlapping START to END 1438 . | All reads from the start of the file 1439 * | Unmapped reads at the end of the file (RNAME '*' in SAM) 1440 1441 The form `REF:` should be used when the reference name itself contains a colon. 1442 1443 The iterator will return all reads overlapping the given regions. If a read 1444 overlaps more than one region, it will only be returned once. 1445 */ 1446 hts_itr_t* sam_itr_regarray( 1447 const(hts_idx_t)* idx, 1448 sam_hdr_t* hdr, 1449 char** regarray, 1450 uint regcount); 1451 1452 } /// closing @nogc 1453 1454 /// Get the next read from a SAM/BAM/CRAM iterator 1455 /** @param htsfp Htsfile pointer for the input file 1456 @param itr Iterator 1457 @param r Pointer to a bam1_t struct 1458 @return >= 0 on success; -1 when there is no more data; < -1 on error 1459 */ 1460 pragma(inline,true) 1461 int sam_itr_next(htsFile* htsfp, hts_itr_t* itr, bam1_t* r) { 1462 if (!htsfp.is_bgzf && !htsfp.is_cram) { 1463 hts_log_error(__FUNCTION__, format("%s not BGZF compressed", htsfp.fn ? htsfp.fn : "File")); 1464 return -2; 1465 } 1466 if (!itr) { 1467 hts_log_error(__FUNCTION__,"Null iterator"); 1468 return -2; 1469 } 1470 1471 if (itr.multi) 1472 return hts_itr_multi_next(htsfp, itr, r); 1473 else 1474 return hts_itr_next(htsfp.is_bgzf ? htsfp.fp.bgzf : null, itr, r, htsfp); 1475 } 1476 1477 @nogc nothrow: 1478 1479 /// Get the next read from a BAM/CRAM multi-iterator 1480 /** @param htsfp Htsfile pointer for the input file 1481 @param itr Iterator 1482 @param r Pointer to a bam1_t struct 1483 @return >= 0 on success; -1 when there is no more data; < -1 on error 1484 */ 1485 alias sam_itr_multi_next = sam_itr_next; 1486 1487 const(char)* sam_parse_region( 1488 sam_hdr_t* h, 1489 const(char)* s, 1490 int* tid, 1491 hts_pos_t* beg, 1492 hts_pos_t* end, 1493 int flags); 1494 1495 /*************** 1496 *** SAM I/O *** 1497 ***************/ 1498 1499 alias sam_open = hts_open; 1500 alias sam_open_format = hts_open_format; 1501 1502 alias sam_close = hts_close; 1503 1504 int sam_open_mode(char* mode, const(char)* fn, const(char)* format); 1505 1506 // A version of sam_open_mode that can handle ,key=value options. 1507 // The format string is allocated and returned, to be freed by the caller. 1508 // Prefix should be "r" or "w", 1509 char* sam_open_mode_opts( 1510 const(char)* fn, 1511 const(char)* mode, 1512 const(char)* format); 1513 1514 int sam_hdr_change_HD(sam_hdr_t* h, const(char)* key, const(char)* val); 1515 1516 int sam_parse1(kstring_t* s, sam_hdr_t* h, bam1_t* b); 1517 int sam_format1(const(sam_hdr_t)* h, const(bam1_t)* b, kstring_t* str); 1518 1519 /// sam_read1 - Read a record from a file 1520 /** @param fp Pointer to the source file 1521 * @param h Pointer to the header previously read (fully or partially) 1522 * @param b Pointer to the record placeholder 1523 * @return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error 1524 */ 1525 int sam_read1(samFile* fp, sam_hdr_t* h, bam1_t* b); 1526 /// sam_write1 - Write a record to a file 1527 /** @param fp Pointer to the destination file 1528 * @param h Pointer to the header structure previously read 1529 * @param b Pointer to the record to be written 1530 * @return >= 0 on successfully writing the record, -1 on error 1531 */ 1532 int sam_write1(samFile* fp, const(sam_hdr_t)* h, const(bam1_t)* b); 1533 1534 // Forward declaration, see hts_expr.h for full. 1535 struct hts_filter_t; 1536 1537 /// sam_passes_filter - Checks whether a record passes an hts_filter. 1538 /** @param h Pointer to the header structure previously read 1539 * @param b Pointer to the BAM record to be checked 1540 * @param filt Pointer to the filter, created from hts_filter_init. 1541 * @return 1 if passes, 0 if not, and <0 on error. 1542 */ 1543 int sam_passes_filter( 1544 const(sam_hdr_t)* h, 1545 const(bam1_t)* b, 1546 hts_filter_t* filt); 1547 1548 /************************************* 1549 *** Manipulating auxiliary fields *** 1550 *************************************/ 1551 1552 /// Converts a BAM aux tag to SAM format 1553 /* 1554 * @param b Pointer to the bam record 1555 * @param key Two letter tag key 1556 * @param type Single letter type code: ACcSsIifHZB. 1557 * @param tag Tag data pointer, in BAM format 1558 * @param end Pointer to end of bam record (largest extent of tag) 1559 * @param ks kstring to write the formatted tag to 1560 * 1561 * @return pointer to end of tag on success, 1562 * NULL on failure. 1563 * 1564 * @discussion The three separate parameters key, type, tag may be 1565 * derived from a s=bam_aux_get() query as s-2, *s and s+1. However 1566 * it is recommended to use bam_aux_get_str in this situation. 1567 * The desire to split these parameters up is for potential processing 1568 * of non-BAM formats that encode using a BAM type mechanism 1569 * (such as the internal CRAM representation). 1570 */ 1571 1572 // brevity and consistency with other code. 1573 1574 // NB: "d" is not an official type in the SAM spec. 1575 // However for unknown reasons samtools has always supported this. 1576 // We believe, HOPE, it is not in general usage and we do not 1577 // encourage it. 1578 1579 // or externalise sam.c's aux_type2size function? 1580 1581 // now points to the start of the array 1582 1583 // write the type 1584 1585 // Unknown type 1586 const(ubyte)* sam_format_aux1( 1587 const(ubyte)* key, 1588 const ubyte type, 1589 const(ubyte)* tag, 1590 const(ubyte)* end, 1591 kstring_t* ks); 1592 1593 /// Return a pointer to an aux record 1594 /** @param b Pointer to the bam record 1595 @param tag Desired aux tag 1596 @return Pointer to the tag data, or NULL if tag is not present or on error 1597 If the tag is not present, this function returns NULL and sets errno to 1598 ENOENT. If the bam record's aux data is corrupt (either a tag has an 1599 invalid type, or the last record is incomplete) then errno is set to 1600 EINVAL and NULL is returned. 1601 */ 1602 ubyte* bam_aux_get(const(bam1_t)* b, ref const(char)[2] tag); 1603 1604 /// Return a SAM formatting string containing a BAM tag 1605 /** @param b Pointer to the bam record 1606 @param tag Desired aux tag 1607 @param s The kstring to write to. 1608 1609 @return 1 on success, 1610 0 on no tag found with errno = ENOENT, 1611 -1 on error (errno will be either EINVAL or ENOMEM). 1612 */ 1613 int bam_aux_get_str(const(bam1_t)* b, ref const(char)[2] tag, kstring_t* s); 1614 1615 /// Get an integer aux value 1616 /** @param s Pointer to the tag data, as returned by bam_aux_get() 1617 @return The value, or 0 if the tag was not an integer type 1618 If the tag is not an integer type, errno is set to EINVAL. This function 1619 will not return the value of floating-point tags. 1620 */ 1621 long bam_aux2i(const(ubyte)* s); 1622 1623 /// Get an integer aux value 1624 /** @param s Pointer to the tag data, as returned by bam_aux_get() 1625 @return The value, or 0 if the tag was not an integer type 1626 If the tag is not an numeric type, errno is set to EINVAL. The value of 1627 integer flags will be returned cast to a double. 1628 */ 1629 double bam_aux2f(const(ubyte)* s); 1630 1631 /// Get a character aux value 1632 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1633 @return The value, or 0 if the tag was not a character ('A') type 1634 If the tag is not a character type, errno is set to EINVAL. 1635 */ 1636 char bam_aux2A(const(ubyte)* s); 1637 1638 /// Get a string aux value 1639 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1640 @return Pointer to the string, or NULL if the tag was not a string type 1641 If the tag is not a string type ('Z' or 'H'), errno is set to EINVAL. 1642 */ 1643 char* bam_aux2Z(const(ubyte)* s); 1644 1645 /// Get the length of an array-type ('B') tag 1646 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1647 @return The length of the array, or 0 if the tag is not an array type. 1648 If the tag is not an array type, errno is set to EINVAL. 1649 */ 1650 uint bam_auxB_len(const(ubyte)* s); 1651 1652 /// Get an integer value from an array-type tag 1653 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1654 @param idx 0-based Index into the array 1655 @return The idx'th value, or 0 on error. 1656 If the array is not an integer type, errno is set to EINVAL. If idx 1657 is greater than or equal to the value returned by bam_auxB_len(s), 1658 errno is set to ERANGE. In both cases, 0 will be returned. 1659 */ 1660 long bam_auxB2i(const(ubyte)* s, uint idx); 1661 1662 /// Get a floating-point value from an array-type tag 1663 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 1664 @param idx 0-based Index into the array 1665 @return The idx'th value, or 0.0 on error. 1666 If the array is not a numeric type, errno is set to EINVAL. This can 1667 only actually happen if the input record has an invalid type field. If 1668 idx is greater than or equal to the value returned by bam_auxB_len(s), 1669 errno is set to ERANGE. In both cases, 0.0 will be returned. 1670 */ 1671 double bam_auxB2f(const(ubyte)* s, uint idx); 1672 1673 /// Append tag data to a bam record 1674 /* @param b The bam record to append to. 1675 @param tag Tag identifier 1676 @param type Tag data type 1677 @param len Length of the data in bytes 1678 @param data The data to append 1679 @return 0 on success; -1 on failure. 1680 If there is not enough space to store the additional tag, errno is set to 1681 ENOMEM. If the type is invalid, errno may be set to EINVAL. errno is 1682 also set to EINVAL if the bam record's aux data is corrupt. 1683 */ 1684 int bam_aux_append( 1685 bam1_t* b, 1686 ref const(char)[2] tag, 1687 char type, 1688 int len, 1689 const(ubyte)* data); 1690 1691 /// Delete tag data from a bam record 1692 /* @param b The bam record to update 1693 @param s Pointer to the tag to delete, as returned by bam_aux_get(). 1694 @return 0 on success; -1 on failure 1695 If the bam record's aux data is corrupt, errno is set to EINVAL and this 1696 function returns -1; 1697 */ 1698 int bam_aux_del(bam1_t* b, ubyte* s); 1699 1700 /// Update or add a string-type tag 1701 /* @param b The bam record to update 1702 @param tag Tag identifier 1703 @param len The length of the new string 1704 @param data The new string 1705 @return 0 on success, -1 on failure 1706 This function will not change the ordering of tags in the bam record. 1707 New tags will be appended to any existing aux records. 1708 1709 If @p len is less than zero, the length of the input string will be 1710 calculated using strlen(). Otherwise exactly @p len bytes will be 1711 copied from @p data to make the new tag. If these bytes do not 1712 include a terminating NUL character, one will be added. (Note that 1713 versions of HTSlib up to 1.10.2 had different behaviour here and 1714 simply copied @p len bytes from data. To generate a valid tag it 1715 was necessary to ensure the last character was a NUL, and include 1716 it in @p len.) 1717 1718 On failure, errno may be set to one of the following values: 1719 1720 EINVAL: The bam record's aux data is corrupt or an existing tag with the 1721 given ID is not of type 'Z'. 1722 1723 ENOMEM: The bam data needs to be expanded and either the attempt to 1724 reallocate the data buffer failed or the resulting buffer would be 1725 longer than the maximum size allowed in a bam record (2Gbytes). 1726 */ 1727 int bam_aux_update_str( 1728 bam1_t* b, 1729 ref const(char)[2] tag, 1730 int len, 1731 const(char)* data); 1732 1733 /// Update or add an integer tag 1734 /* @param b The bam record to update 1735 @param tag Tag identifier 1736 @param val The new value 1737 @return 0 on success, -1 on failure 1738 This function will not change the ordering of tags in the bam record. 1739 New tags will be appended to any existing aux records. 1740 1741 On failure, errno may be set to one of the following values: 1742 1743 EINVAL: The bam record's aux data is corrupt or an existing tag with the 1744 given ID is not of an integer type (c, C, s, S, i or I). 1745 1746 EOVERFLOW (or ERANGE on systems that do not have EOVERFLOW): val is 1747 outside the range that can be stored in an integer bam tag (-2147483647 1748 to 4294967295). 1749 1750 ENOMEM: The bam data needs to be expanded and either the attempt to 1751 reallocate the data buffer failed or the resulting buffer would be 1752 longer than the maximum size allowed in a bam record (2Gbytes). 1753 */ 1754 int bam_aux_update_int(bam1_t* b, ref const(char)[2] tag, long val); 1755 1756 /// Update or add a floating-point tag 1757 /* @param b The bam record to update 1758 @param tag Tag identifier 1759 @param val The new value 1760 @return 0 on success, -1 on failure 1761 This function will not change the ordering of tags in the bam record. 1762 New tags will be appended to any existing aux records. 1763 1764 On failure, errno may be set to one of the following values: 1765 1766 EINVAL: The bam record's aux data is corrupt or an existing tag with the 1767 given ID is not of a float type. 1768 1769 ENOMEM: The bam data needs to be expanded and either the attempt to 1770 reallocate the data buffer failed or the resulting buffer would be 1771 longer than the maximum size allowed in a bam record (2Gbytes). 1772 */ 1773 int bam_aux_update_float(bam1_t* b, ref const(char)[2] tag, float val); 1774 1775 /// Update or add an array tag 1776 /* @param b The bam record to update 1777 @param tag Tag identifier 1778 @param type Data type (one of c, C, s, S, i, I or f) 1779 @param items Number of items 1780 @param data Pointer to data 1781 @return 0 on success, -1 on failure 1782 The type parameter indicates the how the data is interpreted: 1783 1784 Letter code | Data type | Item Size (bytes) 1785 ----------- | --------- | ----------------- 1786 c | int8_t | 1 1787 C | uint8_t | 1 1788 s | int16_t | 2 1789 S | uint16_t | 2 1790 i | int32_t | 4 1791 I | uint32_t | 4 1792 f | float | 4 1793 1794 This function will not change the ordering of tags in the bam record. 1795 New tags will be appended to any existing aux records. The bam record 1796 will grow or shrink in order to accommodate the new data. 1797 1798 The data parameter must not point to any data in the bam record itself or 1799 undefined behaviour may result. 1800 1801 On failure, errno may be set to one of the following values: 1802 1803 EINVAL: The bam record's aux data is corrupt, an existing tag with the 1804 given ID is not of an array type or the type parameter is not one of 1805 the values listed above. 1806 1807 ENOMEM: The bam data needs to be expanded and either the attempt to 1808 reallocate the data buffer failed or the resulting buffer would be 1809 longer than the maximum size allowed in a bam record (2Gbytes). 1810 */ 1811 int bam_aux_update_array( 1812 bam1_t* b, 1813 ref const(char)[2] tag, 1814 ubyte type, 1815 uint items, 1816 void* data); 1817 1818 /************************** 1819 *** Pileup and Mpileup *** 1820 **************************/ 1821 1822 /*! @typedef 1823 @abstract Generic pileup 'client data'. 1824 1825 @discussion The pileup iterator allows setting a constructor and 1826 destructor function, which will be called every time a sequence is 1827 fetched and discarded. This permits caching of per-sequence data in 1828 a tidy manner during the pileup process. This union is the cached 1829 data to be manipulated by the "client" (the caller of pileup). 1830 */ 1831 union bam_pileup_cd 1832 { 1833 void* p; 1834 long i; 1835 double f; 1836 } 1837 1838 /*! @typedef 1839 @abstract Structure for one alignment covering the pileup position. 1840 @field b pointer to the alignment 1841 @field qpos position of the read base at the pileup site, 0-based 1842 @field indel indel length; 0 for no indel, positive for ins and negative for del 1843 @field level the level of the read in the "viewer" mode 1844 @field is_del 1 iff the base on the padded read is a deletion 1845 @field is_head 1 iff this is the first base in the query sequence 1846 @field is_tail 1 iff this is the last base in the query sequence 1847 @field is_refskip 1 iff the base on the padded read is part of CIGAR N op 1848 @field aux (used by bcf_call_gap_prep()) 1849 @field cigar_ind index of the CIGAR operator that has just been processed 1850 1851 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The 1852 difference between the two functions is that the former does not 1853 set bam_pileup1_t::level, while the later does. Level helps the 1854 implementation of alignment viewers, but calculating this has some 1855 overhead. 1856 */ 1857 struct bam_pileup1_t 1858 { 1859 import std.bitmanip : bitfields; 1860 1861 bam1_t* b; 1862 int qpos; 1863 int indel; 1864 int level; 1865 1866 mixin(bitfields!( 1867 uint, "is_del", 1, 1868 uint, "is_head", 1, 1869 uint, "is_tail", 1, 1870 uint, "is_refskip", 1, 1871 uint, "", 1, 1872 uint, "aux", 27)); 1873 1874 /* reserved */ 1875 bam_pileup_cd cd; // generic per-struct data, owned by caller. 1876 int cigar_ind; 1877 } 1878 1879 alias bam_plp_auto_f = int function(void* data, bam1_t* b); 1880 1881 struct bam_plp_s; 1882 alias bam_plp_t = bam_plp_s*; 1883 1884 struct bam_mplp_s; 1885 alias bam_mplp_t = bam_mplp_s*; 1886 1887 /** 1888 * bam_plp_init() - sets an iterator over multiple 1889 * @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return 1890 * status: 0 on success, -1 on end, < -1 on non-recoverable errors 1891 * @data: user data to pass to @func 1892 * 1893 * The struct returned by a successful call should be freed 1894 * via bam_plp_destroy() when it is no longer needed. 1895 */ 1896 bam_plp_t bam_plp_init(bam_plp_auto_f func, void* data); 1897 1898 void bam_plp_destroy(bam_plp_t iter); 1899 1900 int bam_plp_push(bam_plp_t iter, const(bam1_t)* b); 1901 1902 const(bam_pileup1_t)* bam_plp_next( 1903 bam_plp_t iter, 1904 int* _tid, 1905 int* _pos, 1906 int* _n_plp); 1907 1908 const(bam_pileup1_t)* bam_plp_auto( 1909 bam_plp_t iter, 1910 int* _tid, 1911 int* _pos, 1912 int* _n_plp); 1913 1914 const(bam_pileup1_t)* bam_plp64_next( 1915 bam_plp_t iter, 1916 int* _tid, 1917 hts_pos_t* _pos, 1918 int* _n_plp); 1919 1920 const(bam_pileup1_t)* bam_plp64_auto( 1921 bam_plp_t iter, 1922 int* _tid, 1923 hts_pos_t* _pos, 1924 int* _n_plp); 1925 1926 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); 1927 1928 void bam_plp_reset(bam_plp_t iter); 1929 1930 /** 1931 * bam_plp_constructor() - sets a callback to initialise any per-pileup1_t fields. 1932 * @plp: The bam_plp_t initialised using bam_plp_init. 1933 * @func: The callback function itself. When called, it is given the 1934 * data argument (specified in bam_plp_init), the bam structure and 1935 * a pointer to a locally allocated bam_pileup_cd union. This union 1936 * will also be present in each bam_pileup1_t created. 1937 */ 1938 void bam_plp_constructor( 1939 bam_plp_t plp, 1940 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 1941 void bam_plp_destructor( 1942 bam_plp_t plp, 1943 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 1944 1945 /// Get pileup padded insertion sequence 1946 /** 1947 * @param p pileup data 1948 * @param ins the kstring where the insertion sequence will be written 1949 * @param del_len location for deletion length 1950 * @return the length of insertion string on success; -1 on failure. 1951 * 1952 * Fills out the kstring with the padded insertion sequence for the current 1953 * location in 'p'. If this is not an insertion site, the string is blank. 1954 * 1955 * If del_len is not NULL, the location pointed to is set to the length of 1956 * any deletion immediately following the insertion, or zero if none. 1957 */ 1958 int bam_plp_insertion(const(bam_pileup1_t)* p, kstring_t* ins, int* del_len); 1959 1960 /// Create a new bam_mplp_t structure 1961 /** The struct returned by a successful call should be freed 1962 * via bam_mplp_destroy() when it is no longer needed. 1963 */ 1964 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void** data); 1965 1966 /// Set up mpileup overlap detection 1967 /** 1968 * @param iter mpileup iterator 1969 * @return 0 on success; a negative value on error 1970 * 1971 * If called, mpileup will detect overlapping 1972 * read pairs and for each base pair set the base quality of the 1973 * lower-quality base to zero, thus effectively discarding it from 1974 * calling. If the two bases are identical, the quality of the other base 1975 * is increased to the sum of their qualities (capped at 200), otherwise 1976 * it is multiplied by 0.8. 1977 */ 1978 int bam_mplp_init_overlaps(bam_mplp_t iter); 1979 1980 void bam_mplp_destroy(bam_mplp_t iter); 1981 1982 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); 1983 1984 int bam_mplp_auto( 1985 bam_mplp_t iter, 1986 int* _tid, 1987 int* _pos, 1988 int* n_plp, 1989 const(bam_pileup1_t*)* plp); 1990 1991 int bam_mplp64_auto( 1992 bam_mplp_t iter, 1993 int* _tid, 1994 hts_pos_t* _pos, 1995 int* n_plp, 1996 const(bam_pileup1_t*)* plp); 1997 1998 void bam_mplp_reset(bam_mplp_t iter); 1999 2000 void bam_mplp_constructor( 2001 bam_mplp_t iter, 2002 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 2003 2004 void bam_mplp_destructor( 2005 bam_mplp_t iter, 2006 int function(void* data, const(bam1_t)* b, bam_pileup_cd* cd) func); 2007 2008 // ~!defined(BAM_NO_PILEUP) 2009 2010 /*********************************** 2011 * BAQ calculation and realignment * 2012 ***********************************/ 2013 2014 int sam_cap_mapq(bam1_t* b, const(char)* ref_, hts_pos_t ref_len, int thres); 2015 2016 /// Used as flag parameter in sam_prob_realn. 2017 enum htsRealnFlags { 2018 BAQ_APPLY = 1, 2019 BAQ_EXTEND = 2, 2020 BAQ_REDO = 4, 2021 2022 // Platform subfield, in bit position 3 onwards 2023 BAQ_AUTO = 0<<3, 2024 BAQ_ILLUMINA = 1<<3, 2025 BAQ_PACBIOCCS = 2<<3, 2026 BAQ_PACBIO = 3<<3, 2027 BAQ_ONT = 4<<3, 2028 BAQ_GENAPSYS = 5<<3 2029 } 2030 2031 /// Calculate BAQ scores 2032 /** @param b BAM record 2033 @param ref Reference sequence 2034 @param ref_len Reference sequence length 2035 @param flag Flags, see description 2036 @return 0 on success \n 2037 -1 if the read was unmapped, zero length, had no quality values, did not have at least one M, X or = CIGAR operator, or included a reference skip. \n 2038 -3 if BAQ alignment has already been done and does not need to be applied, or has already been applied. \n 2039 -4 if alignment failed (most likely due to running out of memory) 2040 2041 This function calculates base alignment quality (BAQ) values using the method 2042 described in "Improving SNP discovery by base alignment quality", Heng Li, 2043 Bioinformatics, Volume 27, Issue 8 (https://doi.org/10.1093/bioinformatics/btr076). 2044 2045 The @param flag value can be generated using the htsRealnFlags enum, but for 2046 backwards compatibilty reasons is retained as an "int". An example usage 2047 of the enum could be this, equivalent to flag 19: 2048 2049 sam_prob_realn(b, ref, len, BAQ_APPLY | BAQ_EXTEND | BAQ_PACBIOCCS); 2050 2051 2052 The following @param flag bits can be used: 2053 2054 Bit 0 (BAQ_APPLY): Adjust the quality values using the BAQ values 2055 2056 If set, the data in the BQ:Z tag is used to adjust the quality values, and 2057 the BQ:Z tag is renamed to ZQ:Z. 2058 2059 If clear, and a ZQ:Z tag is present, the quality values are reverted using 2060 the data in the tag, and the tag is renamed to BQ:Z. 2061 2062 Bit 1 (BAQ_EXTEND): Use "extended" BAQ. 2063 2064 Changes the BAQ calculation to increase sensitivity at the expense of 2065 reduced specificity. 2066 2067 Bit 2 (BAQ_REDO): Recalculate BAQ, even if a BQ tag is present. 2068 2069 Force BAQ to be recalculated. Note that a ZQ:Z tag will always disable 2070 recalculation. 2071 2072 Bits 3-10: Choose parameters tailored to a specific instrument type. 2073 2074 One of BAQ_AUTO, BAQ_ILLUMINA, BAQ_PACBIOCCS, BAQ_PACBIO, BAQ_ONT and 2075 BAQ_GENAPSYS. The BAQ parameter tuning are still a work in progress and 2076 at the time of writing mainly consist of Illumina vs long-read technology 2077 adjustments. 2078 2079 2080 @bug 2081 If the input read has both BQ:Z and ZQ:Z tags, the ZQ:Z one will be removed. 2082 Depending on what previous processing happened, this may or may not be the 2083 correct thing to do. It would be wise to avoid this situation if possible. 2084 */ 2085 2086 int sam_prob_realn(bam1_t* b, const(char)* ref_, hts_pos_t ref_len, int flag); 2087