1 // htslib-1.9 sam.h as D module 2 // Changes include: 3 // Removed if(n)defs 4 // including BAM_NO_ID which is probably archaic 5 // Changed #defines to const/immutable 6 // Removed all HTS_RESULT_USED (__attribute__ ((__warn_unused_result__))) 7 // Do not #include "hts_defs.h" 8 // Change numeric #defines to enum int 9 // Changed string #define to 8-bit (char) string literal "xxx"c 10 // typedef struct to alias 11 // removed typedefs 12 // modified bitfields in struct and aligned(1) 13 // removed redundant struct declarations when declaring struct pointers 14 // ref is a reserved keyword in D; changed 'ref' to '_ref' 15 // Function prototypes taking fixed size array (e.g. ..., const char tag[2], ) should include ref in the D prototype 16 // const char *p -> const(char) *p 17 // Replace localal definition with import kstring 18 /* 19 Aliased function pointer typedefs: 20 21 typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); 22 alias bam_plp_auto_f = int *function(void *data, bam1_t *b); 23 24 Updated parameter function pointers: 25 int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); 26 int function(void *data, const bam1_t *b, bam_pileup_cd *cd) func); 27 28 Functions returning const must be rewritten as const(type)*func_name 29 */ 30 module dhtslib.htslib.sam; 31 32 import std.bitmanip; 33 34 extern (C): 35 /// @file htslib/sam.h 36 /// High-level SAM/BAM/CRAM sequence file operations. 37 /* 38 Copyright (C) 2008, 2009, 2013-2017 Genome Research Ltd. 39 Copyright (C) 2010, 2012, 2013 Broad Institute. 40 41 Author: Heng Li <lh3@sanger.ac.uk> 42 43 Permission is hereby granted, free of charge, to any person obtaining a copy 44 of this software and associated documentation files (the "Software"), to deal 45 in the Software without restriction, including without limitation the rights 46 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 47 copies of the Software, and to permit persons to whom the Software is 48 furnished to do so, subject to the following conditions: 49 50 The above copyright notice and this permission notice shall be included in 51 all copies or substantial portions of the Software. 52 53 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 54 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 55 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 56 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 57 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 58 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 59 DEALINGS IN THE SOFTWARE. */ 60 61 import core.stdc.stdint; 62 import dhtslib.htslib.hts; 63 import dhtslib.htslib.bgzf:BGZF; 64 import dhtslib.htslib.kstring: __kstring_t, kstring_t; 65 66 /// Highest SAM format version supported by this library 67 auto SAM_FORMAT_VERSION = "1.6"c; 68 69 /********************** 70 *** SAM/BAM header *** 71 **********************/ 72 73 /**! @typedef 74 @abstract Structure for the alignment header. 75 @field n_targets number of reference sequences 76 @field l_text length of the plain text in the header 77 @field target_len lengths of the reference sequences 78 @field target_name names of the reference sequences 79 @field text plain text 80 @field sdict header dictionary 81 */ 82 83 struct bam_hdr_t { // @suppress(dscanner.style.phobos_naming_convention) 84 int n_targets; /// number of targets (contigs) 85 int ignore_sam_err; /// ? Ignore SAM format errors? 86 uint l_text; /// ? text data length 87 uint *target_len; /// ? number of targets (contigs) 88 byte *cigar_tab; /// ? CIGAR table 89 char **target_name; /// ? C-style array of target (contig) names 90 char *text; /// ? 91 void *sdict; /// ? 92 } 93 94 /**************************** 95 *** CIGAR related macros *** 96 ****************************/ 97 98 enum { 99 BAM_CMATCH = 0, /// CIGAR M, match 100 BAM_CINS = 1, /// CIGAR I, insertion 101 BAM_CDEL = 2, /// CIGAR D, deletion 102 BAM_CREF_SKIP = 3, /// CIGAR N, refskip 103 BAM_CSOFT_CLIP = 4, /// CIGAR S, soft clip 104 BAM_CHARD_CLIP = 5, /// CIGAR H, hard clip 105 BAM_CPAD = 6, /// CIGAR P, padding 106 BAM_CEQUAL = 7, /// CIGAR =, equal 107 BAM_CDIFF = 8, /// CIGAR X, differs 108 BAM_CBACK = 9 /// CIGAR B, back(?) 109 } 110 111 auto BAM_CIGAR_STR = "MIDNSHP=XB"c; /// internal 112 enum int BAM_CIGAR_SHIFT = 4; /// internal 113 enum int BAM_CIGAR_MASK = 0xf; /// internal 114 enum int BAM_CIGAR_TYPE = 0x3C1A7; /// internal magic const for bam_cigar_type() 115 116 pragma(inline, true) 117 { 118 /// CIGAR opcode for CIGAR uint (4 LSB) 119 auto bam_cigar_op(uint c) { return ( c & BAM_CIGAR_MASK); } 120 /// CIGAR operation length for CIGAR uint (28 MSB >> 4) 121 auto bam_cigar_oplen(uint c) { return ( c >> BAM_CIGAR_SHIFT ); } 122 // Note that BAM_CIGAR_STR is padded to length 16 bytes below so that 123 // the array look-up will not fall off the end. '?' is chosen as the 124 // padding character so it's easy to spot if one is emitted, and will 125 // result in a parsing failure (in sam_parse1(), at least) if read. 126 /// CIGAR opcode character for CIGAR uint 127 auto bam_cigar_opchr(uint c) { return (BAM_CIGAR_STR ~ "??????"[bam_cigar_op(c)]); } 128 /// Generate CIGAR uint from length and operator 129 auto bam_cigar_gen(uint l, uint o) { return (l << BAM_CIGAR_SHIFT | o); } 130 131 /** bam_cigar_type returns a bit flag with: 132 * bit 1 set if the cigar operation consumes the query 133 * bit 2 set if the cigar operation consumes the reference 134 * 135 * For reference, the unobfuscated truth table for this function is: 136 * BAM_CIGAR_TYPE QUERY REFERENCE 137 * -------------------------------- 138 * BAM_CMATCH 1 1 139 * BAM_CINS 1 0 140 * BAM_CDEL 0 1 141 * BAM_CREF_SKIP 0 1 142 * BAM_CSOFT_CLIP 1 0 143 * BAM_CHARD_CLIP 0 0 144 * BAM_CPAD 0 0 145 * BAM_CEQUAL 1 1 146 * BAM_CDIFF 1 1 147 * BAM_CBACK 0 0 148 * -------------------------------- 149 */ 150 auto bam_cigar_type(uint o) { return (BAM_CIGAR_TYPE >> (o << 1) & 3); } // bit 1: consume query; bit 2: consume reference 151 } // end pragma(inline) 152 153 /**! @abstract the read is paired in sequencing, no matter whether it is mapped in a pair */ 154 enum int BAM_FPAIRED = 1; 155 /**! @abstract the read is mapped in a proper pair */ 156 enum int BAM_FPROPER_PAIR = 2; 157 /**! @abstract the read itself is unmapped; conflictive with BAM_FPROPER_PAIR */ 158 enum int BAM_FUNMAP = 4; 159 /**! @abstract the mate is unmapped */ 160 enum int BAM_FMUNMAP = 8; 161 /**! @abstract the read is mapped to the reverse strand */ 162 enum int BAM_FREVERSE =16; 163 /**! @abstract the mate is mapped to the reverse strand */ 164 enum int BAM_FMREVERSE =32; 165 /**! @abstract this is read1 */ 166 enum int BAM_FREAD1 =64; 167 /**! @abstract this is read2 */ 168 enum int BAM_FREAD2 =128; 169 /**! @abstract not primary alignment */ 170 enum int BAM_FSECONDARY =256; 171 /**! @abstract QC failure */ 172 enum int BAM_FQCFAIL =512; 173 /**! @abstract optical or PCR duplicate */ 174 enum int BAM_FDUP =1024; 175 /**! @abstract supplementary alignment */ 176 enum int BAM_FSUPPLEMENTARY=2048; 177 178 /************************* 179 *** Alignment records *** 180 *************************/ 181 182 /**! @typedef 183 @abstract Structure for core alignment information. 184 @field tid chromosome ID, defined by bam_hdr_t 185 @field pos 0-based leftmost coordinate 186 @field bin bin calculated by bam_reg2bin() 187 @field qual mapping quality 188 @field l_qname length of the query name 189 @field flag bitwise flag 190 @field l_extranul length of extra NULs between qname & cigar (for alignment) 191 @field n_cigar number of CIGAR operations 192 @field l_qseq length of the query sequence (read) 193 @field mtid chromosome ID of next read in template, defined by bam_hdr_t 194 @field mpos 0-based leftmost coordinate of next read in template 195 */ 196 struct bam1_core_t { // @suppress(dscanner.style.phobos_naming_convention) 197 int32_t tid; /// chromosome ID, defined by bam_hdr_t 198 int32_t pos; /// 0-based leftmost coordinate 199 uint16_t bin; /// bin calculated by bam_reg2bin() 200 uint8_t qual; /// mapping quality 201 uint8_t l_qname; /// length of the query name 202 uint16_t flag; /// bitwise flag 203 uint8_t unused1; /// padding 204 uint8_t l_extranul;/// length of extra NULs between qname & cigar (for alignment) 205 uint32_t n_cigar; /// number of CIGAR operations 206 int32_t l_qseq; /// length of the query sequence (read) 207 int32_t mtid; /// chromosome ID of next read in template, defined by bam_hdr_t 208 int32_t mpos; /// 0-based leftmost coordinate of next read in template (mate) 209 int32_t isize; /// ? template length 210 } 211 212 /**! @typedef 213 @abstract Structure for one alignment. 214 @field core core information about the alignment 215 @field l_data current length of bam1_t::data 216 @field m_data maximum length of bam1_t::data 217 @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux 218 219 @discussion Notes: 220 221 1. qname is terminated by one to four NULs, so that the following 222 cigar data is 32-bit aligned; core.l_qname includes these trailing NULs, 223 while core.l_extranul counts the excess NULs (so 0 <= l_extranul <= 3). 224 2. l_qseq is calculated from the total length of an alignment block 225 on reading or from CIGAR. 226 3. cigar data is encoded 4 bytes per CIGAR operation. 227 4. seq is nybble-encoded according to bam_nt16_table. 228 */ 229 struct bam1_t { // @suppress(dscanner.style.phobos_naming_convention) 230 bam1_core_t core; /// core information about the alignment 231 int l_data; /// current length of bam1_t::data 232 uint32_t m_data; /// maximum length of bam1_t::data 233 uint8_t *data; /// all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux 234 uint64_t id; /// ??? 235 } 236 237 pragma(inline, true) { 238 /**! @function 239 @abstract Get whether the query is on the reverse strand 240 @param b pointer to an alignment 241 @return boolean true if query is on the reverse strand 242 */ 243 bool bam_is_rev(bam1_t *b) { return ( ((*b).core.flag & BAM_FREVERSE) != 0 ); } 244 /**! @function 245 @abstract Get whether the query's mate is on the reverse strand 246 @param b pointer to an alignment 247 @return boolean true if query's mate on the reverse strand 248 */ 249 bool bam_is_mrev(bam1_t *b) { return( ((*b).core.flag & BAM_FMREVERSE) != 0); } 250 /**! @function 251 @abstract Get the name of the query 252 @param b pointer to an alignment 253 @return pointer to the name string, null terminated 254 */ 255 auto bam_get_qname(bam1_t *b) { return (cast(char*)(*b).data); } 256 /**! @function 257 @abstract Get the CIGAR array 258 @param b pointer to an alignment 259 @return pointer to the CIGAR array 260 261 @discussion In the CIGAR array, each element is a 32-bit integer. The 262 lower 4 bits gives a CIGAR operation and the higher 28 bits keep the 263 length of a CIGAR. 264 */ 265 auto bam_get_cigar(bam1_t *b) { return cast(uint *)((*b).data + (*b).core.l_qname); } 266 /**! @function 267 @abstract Get query sequence 268 @param b pointer to an alignment 269 @return pointer to sequence 270 271 @discussion Each base is encoded in 4 bits: 1 for A, 2 for C, 4 for G, 272 8 for T and 15 for N. Two bases are packed in one byte with the base 273 at the higher 4 bits having smaller coordinate on the read. It is 274 recommended to use bam_seqi() macro to get the base. 275 */ 276 auto bam_get_seq(bam1_t *b) { return ((*b).data + ((*b).core.n_cigar<<2) + (*b).core.l_qname); } 277 /**! @function 278 @abstract Get query quality 279 @param b pointer to an alignment 280 @return pointer to quality string 281 */ 282 ////#define bam_get_qual(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) 283 auto bam_get_qual(bam1_t *b) { return (*b).data + ((*b).core.n_cigar<<2) + 284 (*b).core.l_qname + 285 (((*b).core.l_qseq + 1)>>1); } 286 287 /**! @function 288 @abstract Get auxiliary data 289 @param b pointer to an alignment 290 @return pointer to the concatenated auxiliary data 291 */ 292 ////#define bam_get_aux(b) ((b)->data + ((b)->core.n_cigar<<2) + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1) + (b)->core.l_qseq) 293 auto bam_get_aux(bam1_t *b) { return ((*b).data + ((*b).core.n_cigar<<2) + 294 (*b).core.l_qname + 295 (((*b).core.l_qseq + 1)>>1) + 296 (*b).core.l_qseq); } 297 /**! @function 298 @abstract Get length of auxiliary data 299 @param b pointer to an alignment 300 @return length of the concatenated auxiliary data 301 */ 302 ////#define bam_get_l_aux(b) ((b)->l_data - ((b)->core.n_cigar<<2) - (b)->core.l_qname - (b)->core.l_qseq - (((b)->core.l_qseq + 1)>>1)) 303 auto bam_get_l_aux(bam1_t *b) { return ((*b).l_data - 304 ((*b).core.n_cigar<<2) - 305 (*b).core.l_qname - 306 (*b).core.l_qseq - 307 (((*b).core.l_qseq + 1)>>1)); } 308 /**! @function 309 @abstract Get a base on read 310 @param s Query sequence returned by bam_get_seq() 311 @param i The i-th position, 0-based 312 @return 4-bit integer representing the base. 313 */ 314 ////#define bam_seqi(s, i) ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf) 315 auto bam_seqi(ubyte *s, uint i) { return ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf); } 316 //auto bam_seqi(char *s, uint i) { return ((s)[(i)>>1] >> ((~(i)&1)<<2) & 0xf); } 317 } // end pragma(inline) 318 319 /************************** 320 *** Exported functions *** 321 **************************/ 322 323 /*************** 324 *** BAM I/O *** 325 ***************/ 326 327 /// init a BAM header 328 bam_hdr_t *bam_hdr_init(); 329 /// read BAM header from fp 330 bam_hdr_t *bam_hdr_read(BGZF *fp); 331 /// write BAM header to fp 332 int bam_hdr_write(BGZF *fp, const(bam_hdr_t) *h); 333 /// destroy a BAM header 334 void bam_hdr_destroy(bam_hdr_t *h); 335 /// Get contig tid 336 int bam_name2id(bam_hdr_t *h, const(char) *_ref); 337 /// duplicate a BAM header 338 bam_hdr_t* bam_hdr_dup(const(bam_hdr_t) *h0); 339 340 /// init a BAM record 341 bam1_t *bam_init1(); 342 /// destroy a BAM record 343 void bam_destroy1(bam1_t *b); 344 /// read BAM record from fp 345 int bam_read1(BGZF *fp, bam1_t *b); 346 /// write BAM record to fp 347 int bam_write1(BGZF *fp, const(bam1_t) *b); 348 /// copy a BAM record from dst to src 349 bam1_t *bam_copy1(bam1_t *bdst, const(bam1_t) *bsrc); 350 /// duplicate a BAM record 351 bam1_t *bam_dup1(const(bam1_t) *bsrc); 352 353 /// How much query is consumed by CIGAR op(s)? 354 int bam_cigar2qlen(int n_cigar, const(uint32_t) *cigar); 355 /// How much reference is consumed by CIGAR op(s)? 356 int bam_cigar2rlen(int n_cigar, const(uint32_t) *cigar); 357 358 /**! 359 @abstract Calculate the rightmost base position of an alignment on the 360 reference genome. 361 362 @param b pointer to an alignment 363 @return the coordinate of the first base after the alignment, 0-based 364 365 @discussion For a mapped read, this is just b->core.pos + bam_cigar2rlen. 366 For an unmapped read (either according to its flags or if it has no cigar 367 string), we return b->core.pos + 1 by convention. 368 */ 369 int32_t bam_endpos(const(bam1_t) *b); 370 371 /// returns negative value on error 372 int bam_str2flag(const(char) *str); /** returns negative value on error */ 373 /// The string must be freed by the user 374 char *bam_flag2str(int flag); /** The string must be freed by the user */ 375 376 /************************* 377 *** BAM/CRAM indexing *** 378 *************************/ 379 380 // These BAM iterator functions work only on BAM files. To work with either 381 // BAM or CRAM files use the sam_index_load() & sam_itr_*() functions. 382 deprecated("These BAM iterator functions work only on BAM files. " 383 ~ "To work with either BAM or CRAM files use the sam_index_load() & sam_itr_*() functions.") 384 { 385 ////#define bam_itr_destroy(iter) hts_itr_destroy(iter) 386 alias bam_itr_destroy = hts_itr_destroy; 387 ////#define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end) 388 alias bam_itr_queryi = sam_itr_queryi; 389 ////#define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region) 390 alias bam_itr_querys = sam_itr_querys; 391 ////#define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) 392 pragma(inline, true) 393 auto bam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) { return hts_itr_next(htsfp.fp.bgzf, itr, r, null); } 394 } 395 396 // Load/build .csi or .bai BAM index file. Does not work with CRAM. 397 // It is recommended to use the sam_index_* functions below instead. 398 deprecated("Load/build .csi or .bai BAM index file. Does not work with CRAM. " 399 ~ "It is recommended to use the sam_index_* functions below instead.") 400 { 401 ////#define bam_index_load(fn) hts_idx_load((fn), HTS_FMT_BAI) 402 pragma(inline, true) auto bam_index_load(const(char) *fn) { return hts_idx_load(fn, HTS_FMT_BAI); } 403 ////#define bam_index_build(fn, min_shift) (sam_index_build((fn), (min_shift))) 404 alias bam_index_build = sam_index_build; 405 } 406 407 /// Load a BAM (.csi or .bai) or CRAM (.crai) index file 408 /** @param fp File handle of the data file whose index is being opened 409 @param fn BAM/CRAM/etc filename to search alongside for the index file 410 @return The index, or NULL if an error occurred. 411 */ 412 hts_idx_t *sam_index_load(htsFile *fp, const(char) *fn); 413 414 /// Load a specific BAM (.csi or .bai) or CRAM (.crai) index file 415 /** @param fp File handle of the data file whose index is being opened 416 @param fn BAM/CRAM/etc data file filename 417 @param fnidx Index filename, or NULL to search alongside @a fn 418 @return The index, or NULL if an error occurred. 419 */ 420 hts_idx_t *sam_index_load2(htsFile *fp, const(char) *fn, const(char) *fnidx); 421 422 /// Generate and save an index file 423 /** @param fn Input BAM/etc filename, to which .csi/etc will be added 424 @param min_shift Positive to generate CSI, or 0 to generate BAI 425 @return 0 if successful, or negative if an error occurred (usually -1; or 426 -2: opening fn failed; -3: format not indexable; -4: 427 failed to create and/or save the index) 428 */ 429 int sam_index_build(const(char) *fn, int min_shift); 430 431 /// Generate and save an index to a specific file 432 /** @param fn Input BAM/CRAM/etc filename 433 @param fnidx Output filename, or NULL to add .bai/.csi/etc to @a fn 434 @param min_shift Positive to generate CSI, or 0 to generate BAI 435 @return 0 if successful, or negative if an error occurred (see 436 sam_index_build for error codes) 437 */ 438 int sam_index_build2(const(char) *fn, const(char) *fnidx, int min_shift); 439 /// ditto 440 int sam_index_build3(const(char) *fn, const(char) *fnidx, int min_shift, int nthreads); 441 442 ////#define sam_itr_destroy(iter) hts_itr_destroy(iter) 443 alias sam_itr_destroy = hts_itr_destroy; 444 /// SAM/BAM/CRAM iterator query by integer tid/start/end 445 hts_itr_t *sam_itr_queryi(const(hts_idx_t) *idx, int tid, int beg, int end); 446 /// SAM/BAM/CRAM iterator query by string ("chr:start-end") 447 hts_itr_t *sam_itr_querys(const(hts_idx_t) *idx, bam_hdr_t *hdr, const(char) *region); 448 /// SAM/BAM/CRAM iterator query by region list 449 hts_itr_multi_t *sam_itr_regions(const(hts_idx_t) *idx, bam_hdr_t *hdr, hts_reglist_t *reglist, uint regcount); 450 451 ////#define sam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), (htsfp)) 452 pragma(inline, true) 453 auto sam_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) { return hts_itr_next(htsfp.fp.bgzf, itr, r, htsfp); } 454 ////#define sam_itr_multi_next(htsfp, itr, r) hts_itr_multi_next((htsfp), (itr), (r)) 455 //auto sam_itr_multi_next(htsFile *htsfp, hts_itr_t *itr, void *r) { return hts_itr_multi_next(htsfp, itr, r); } 456 alias sam_itr_multi_next = hts_itr_multi_next; 457 458 /*************** 459 *** SAM I/O *** 460 ***************/ 461 462 //#define sam_open(fn, mode) (hts_open((fn), (mode))) 463 alias sam_open = hts_open; 464 //#define sam_open_format(fn, mode, fmt) (hts_open_format((fn), (mode), (fmt))) 465 alias sam_open_format = hts_open_format; 466 //#define sam_close(fp) hts_close(fp) 467 alias sam_close = hts_close; 468 469 /// open SAM/BAM/CRAM 470 int sam_open_mode(char *mode, const(char) *fn, const(char) *format); 471 472 /// A version of sam_open_mode that can handle ,key=value options. 473 /// The format string is allocated and returned, to be freed by the caller. 474 /// Prefix should be "r" or "w", 475 char *sam_open_mode_opts(const(char) *fn, 476 const(char) *mode, 477 const(char) *format); 478 479 alias samFile = htsFile; 480 /// parse SAM/BAM/CRAM header from text 481 bam_hdr_t *sam_hdr_parse(int l_text, const(char) *text); 482 /// read SAM/BAM/CRAM header frpm fp 483 bam_hdr_t *sam_hdr_read(samFile *fp); 484 /// write SAM/BAM/CRAM header to fp 485 int sam_hdr_write(samFile *fp, const(bam_hdr_t) *h); 486 /// edit SAM/BAM/CRAM header @HD line by key/value (see SAM specs section 1.3) 487 int sam_hdr_change_HD(bam_hdr_t *h, const(char) *key, const(char) *val); 488 489 /// parse text SAM line 490 int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b); 491 /// emit text SAM line 492 int sam_format1(const(bam_hdr_t) *h, const(bam1_t) *b, kstring_t *str); 493 494 /**! 495 * @return >= 0 on successfully reading a new record, -1 on end of stream, < -1 on error 496 **/ 497 int sam_read1(samFile *fp, bam_hdr_t *h, bam1_t *b); 498 /// Return >= 0 on successfully writing a new record; <= -1 on error 499 int sam_write1(samFile *fp, const(bam_hdr_t) *h, const(bam1_t) *b); 500 501 /************************************* 502 *** Manipulating auxiliary fields *** 503 *************************************/ 504 505 /// Return a pointer to an aux record 506 /** @param b Pointer to the bam record 507 @param tag Desired aux tag 508 @return Pointer to the tag data, or NULL if tag is not present or on error 509 If the tag is not present, this function returns NULL and sets errno to 510 ENOENT. If the bam record's aux data is corrupt (either a tag has an 511 invalid type, or the last record is incomplete) then errno is set to 512 EINVAL and NULL is returned. 513 */ 514 uint8_t *bam_aux_get(const(bam1_t) *b, const ref char[2] tag); 515 516 /// Get an integer aux value 517 /** @param s Pointer to the tag data, as returned by bam_aux_get() 518 @return The value, or 0 if the tag was not an integer type 519 If the tag is not an integer type, errno is set to EINVAL. This function 520 will not return the value of floating-point tags. 521 */ 522 int64_t bam_aux2i(const(uint8_t) *s); 523 524 /// Get an integer aux value 525 /** @param s Pointer to the tag data, as returned by bam_aux_get() 526 @return The value, or 0 if the tag was not an integer type 527 If the tag is not an numeric type, errno is set to EINVAL. The value of 528 integer flags will be returned cast to a double. 529 */ 530 double bam_aux2f(const(uint8_t) *s); 531 532 /// Get a character aux value 533 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 534 @return The value, or 0 if the tag was not a character ('A') type 535 If the tag is not a character type, errno is set to EINVAL. 536 */ 537 char bam_aux2A(const(uint8_t) *s); 538 539 /// Get a string aux value 540 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 541 @return Pointer to the string, or NULL if the tag was not a string type 542 If the tag is not a string type ('Z' or 'H'), errno is set to EINVAL. 543 */ 544 char *bam_aux2Z(const(uint8_t) *s); 545 546 /// Get the length of an array-type ('B') tag 547 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 548 @return The length of the array, or 0 if the tag is not an array type. 549 If the tag is not an array type, errno is set to EINVAL. 550 */ 551 uint32_t bam_auxB_len(const(uint8_t) *s); 552 553 /// Get an integer value from an array-type tag 554 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 555 @param idx 0-based Index into the array 556 @return The idx'th value, or 0 on error. 557 If the array is not an integer type, errno is set to EINVAL. If idx 558 is greater than or equal to the value returned by bam_auxB_len(s), 559 errno is set to ERANGE. In both cases, 0 will be returned. 560 */ 561 int64_t bam_auxB2i(const(uint8_t) *s, uint32_t idx); 562 563 /// Get a floating-point value from an array-type tag 564 /** @param s Pointer to the tag data, as returned by bam_aux_get(). 565 @param idx 0-based Index into the array 566 @return The idx'th value, or 0.0 on error. 567 If the array is not a numeric type, errno is set to EINVAL. This can 568 only actually happen if the input record has an invalid type field. If 569 idx is greater than or equal to the value returned by bam_auxB_len(s), 570 errno is set to ERANGE. In both cases, 0.0 will be returned. 571 */ 572 double bam_auxB2f(const(uint8_t) *s, uint32_t idx); 573 574 /// Append tag data to a bam record 575 /* @param b The bam record to append to. 576 @param tag Tag identifier 577 @param type Tag data type 578 @param len Length of the data in bytes 579 @param data The data to append 580 @return 0 on success; -1 on failure. 581 If there is not enough space to store the additional tag, errno is set to 582 ENOMEM. If the type is invalid, errno may be set to EINVAL. errno is 583 also set to EINVAL if the bam record's aux data is corrupt. 584 */ 585 int bam_aux_append(bam1_t *b, const ref char[2] tag, char type, int len, const(uint8_t) *data); 586 587 /// Delete tag data from a bam record 588 /* @param b The bam record to update 589 @param s Pointer to the tag to delete, as returned by bam_aux_get(). 590 @return 0 on success; -1 on failure 591 If the bam record's aux data is corrupt, errno is set to EINVAL and this 592 function returns -1; 593 */ 594 int bam_aux_del(bam1_t *b, uint8_t *s); 595 596 /// Update or add a string-type tag 597 /* @param b The bam record to update 598 @param tag Tag identifier 599 @param len The length of the new string 600 @param data The new string 601 @return 0 on success, -1 on failure 602 This function will not change the ordering of tags in the bam record. 603 New tags will be appended to any existing aux records. 604 605 On failure, errno may be set to one of the following values: 606 607 EINVAL: The bam record's aux data is corrupt or an existing tag with the 608 given ID is not of type 'Z'. 609 610 ENOMEM: The bam data needs to be expanded and either the attempt to 611 reallocate the data buffer failed or the resulting buffer would be 612 longer than the maximum size allowed in a bam record (2Gbytes). 613 */ 614 int bam_aux_update_str(bam1_t *b, const ref char[2] tag, int len, const(char) *data); 615 616 /// Update or add an integer tag 617 /* @param b The bam record to update 618 @param tag Tag identifier 619 @param val The new value 620 @return 0 on success, -1 on failure 621 This function will not change the ordering of tags in the bam record. 622 New tags will be appended to any existing aux records. 623 624 On failure, errno may be set to one of the following values: 625 626 EINVAL: The bam record's aux data is corrupt or an existing tag with the 627 given ID is not of an integer type (c, C, s, S, i or I). 628 629 EOVERFLOW (or ERANGE on systems that do not have EOVERFLOW): val is 630 outside the range that can be stored in an integer bam tag (-2147483647 631 to 4294967295). 632 633 ENOMEM: The bam data needs to be expanded and either the attempt to 634 reallocate the data buffer failed or the resulting buffer would be 635 longer than the maximum size allowed in a bam record (2Gbytes). 636 */ 637 int bam_aux_update_int(bam1_t *b, const ref char[2] tag, int64_t val); 638 639 /// Update or add a floating-point tag 640 /* @param b The bam record to update 641 @param tag Tag identifier 642 @param val The new value 643 @return 0 on success, -1 on failure 644 This function will not change the ordering of tags in the bam record. 645 New tags will be appended to any existing aux records. 646 647 On failure, errno may be set to one of the following values: 648 649 EINVAL: The bam record's aux data is corrupt or an existing tag with the 650 given ID is not of a float type. 651 652 ENOMEM: The bam data needs to be expanded and either the attempt to 653 reallocate the data buffer failed or the resulting buffer would be 654 longer than the maximum size allowed in a bam record (2Gbytes). 655 */ 656 int bam_aux_update_float(bam1_t *b, const ref char[2] tag, float val); 657 658 /// Update or add an array tag 659 /* @param b The bam record to update 660 @param tag Tag identifier 661 @param type Data type (one of c, C, s, S, i, I or f) 662 @param items Number of items 663 @param data Pointer to data 664 @return 0 on success, -1 on failure 665 The type parameter indicates the how the data is interpreted: 666 667 Letter code | Data type | Item Size (bytes) 668 ----------- | --------- | ----------------- 669 c | int8_t | 1 670 C | uint8_t | 1 671 s | int16_t | 2 672 S | uint16_t | 2 673 i | int32_t | 4 674 I | uint32_t | 4 675 f | float | 4 676 677 This function will not change the ordering of tags in the bam record. 678 New tags will be appended to any existing aux records. The bam record 679 will grow or shrink in order to accomodate the new data. 680 681 The data parameter must not point to any data in the bam record itself or 682 undefined behaviour may result. 683 684 On failure, errno may be set to one of the following values: 685 686 EINVAL: The bam record's aux data is corrupt, an existing tag with the 687 given ID is not of an array type or the type parameter is not one of 688 the values listed above. 689 690 ENOMEM: The bam data needs to be expanded and either the attempt to 691 reallocate the data buffer failed or the resulting buffer would be 692 longer than the maximum size allowed in a bam record (2Gbytes). 693 */ 694 int bam_aux_update_array(bam1_t *b, const ref char[2] tag, 695 uint8_t type, uint32_t items, void *data); 696 697 /************************** 698 *** Pileup and Mpileup *** 699 **************************/ 700 701 /*! @typedef 702 @abstract Generic pileup 'client data'. 703 704 @discussion The pileup iterator allows setting a constructor and 705 destructor function, which will be called every time a sequence is 706 fetched and discarded. This permits caching of per-sequence data in 707 a tidy manner during the pileup process. This union is the cached 708 data to be manipulated by the "client" (the caller of pileup). 709 */ 710 union bam_pileup_cd { 711 void *p; /// data ptr 712 int64_t i; /// ? 713 double f; /// ? 714 } 715 716 /**! @typedef 717 @abstract Structure for one alignment covering the pileup position. 718 @field b pointer to the alignment 719 @field qpos position of the read base at the pileup site, 0-based 720 @field indel indel length; 0 for no indel, positive for ins and negative for del 721 @field level the level of the read in the "viewer" mode 722 @field is_del 1 iff the base on the padded read is a deletion 723 @field is_head ??? 724 @field is_tail ??? 725 @field is_refskip ??? 726 @field aux ??? 727 728 @discussion See also bam_plbuf_push() and bam_lplbuf_push(). The 729 difference between the two functions is that the former does not 730 set bam_pileup1_t::level, while the later does. Level helps the 731 implementation of alignment viewers, but calculating this has some 732 overhead. 733 */ 734 struct bam_pileup1_t { 735 bam1_t *b; /// bam record 736 int32_t qpos; /// query position 737 /// ??? 738 int indel, level; 739 pragma(msg, "bam_pileup1_t: bitfield order assumed starting with LSB"); 740 //uint32_t is_del:1, is_head:1, is_tail:1, is_refskip:1, aux:28; 741 mixin(bitfields!( 742 bool, "is_del", 1, 743 bool, "is_head", 1, 744 bool, "is_tail", 1, 745 bool, "is_refskip", 1, 746 uint, "aux", 28 )); 747 bam_pileup_cd cd; /// generic per-struct data, owned by caller. 748 } 749 750 ///typedef int (*bam_plp_auto_f)(void *data, bam1_t *b); 751 alias bam_plp_auto_f = int *function(void *data, bam1_t *b); 752 753 /// opaque pileup data defined in sam.c 754 struct __bam_plp_t; 755 ///typedef struct __bam_plp_t *bam_plp_t; 756 alias bam_plp_t = __bam_plp_t*; 757 758 /// opaque mpileup data defined in sam.c 759 struct __bam_mplp_t; 760 ///typedef struct __bam_mplp_t *bam_mplp_t; 761 alias bam_mplp_t = __bam_mplp_t*; 762 763 /** 764 * bam_plp_init() - sets an iterator over multiple 765 * @func: see mplp_func in bam_plcmd.c in samtools for an example. Expected return 766 * status: 0 on success, -1 on end, < -1 on non-recoverable errors 767 * @data: user data to pass to @func 768 */ 769 /// NB: maxcnt records default is 8000 770 bam_plp_t bam_plp_init(bam_plp_auto_f func, void *data); 771 /// destroy pileup iterator 772 void bam_plp_destroy(bam_plp_t iter); 773 /// add bam record to pileup iterator 774 int bam_plp_push(bam_plp_t iter, const(bam1_t) *b); 775 /// Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns 776 /// pointer to the piled records if next position is ready or NULL if there is not enough records in the 777 /// buffer yet (the current position is still the maximum position across all buffered reads). 778 const(bam_pileup1_t)*bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); 779 /// ??? 780 const(bam_pileup1_t)*bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); 781 /// set maximum pileup records returned (init default is 8000) 782 void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); 783 /// reset pileup 784 void bam_plp_reset(bam_plp_t iter); 785 786 /** 787 * bam_plp_constructor() - sets a callback to initialise any per-pileup1_t fields. 788 * @plp: The bam_plp_t initialised using bam_plp_init. 789 * @func: The callback function itself. When called, it is given the 790 * data argument (specified in bam_plp_init), the bam structure and 791 * a pointer to a locally allocated bam_pileup_cd union. This union 792 * will also be present in each bam_pileup1_t created. 793 */ 794 void bam_plp_constructor(bam_plp_t plp, 795 int function(void *data, const(bam1_t) *b, bam_pileup_cd *cd) func); 796 /// per-pileup1_t field destructor (may be needed if used bam_plp_constructor()) 797 void bam_plp_destructor(bam_plp_t plp, 798 int function(void *data, const(bam1_t) *b, bam_pileup_cd *cd) func); 799 800 /// initialize new mpileup iterator 801 bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data); 802 /** 803 * bam_mplp_init_overlaps() - if called, mpileup will detect overlapping 804 * read pairs and for each base pair set the base quality of the 805 * lower-quality base to zero, thus effectively discarding it from 806 * calling. If the two bases are identical, the quality of the other base 807 * is increased to the sum of their qualities (capped at 200), otherwise 808 * it is multiplied by 0.8. 809 */ 810 void bam_mplp_init_overlaps(bam_mplp_t iter); 811 /// destroy mpileup iterator 812 void bam_mplp_destroy(bam_mplp_t iter); 813 /// set maximum mpileup records returned per subpileup (init default of each pileup in the mpileup is 8000) 814 void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); 815 /// ??? 816 int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); 817 /// reset mpileup 818 void bam_mplp_reset(bam_mplp_t iter); 819 /// see bam_plp_constructor 820 void bam_mplp_constructor(bam_mplp_t iter, 821 int function(void *data, const(bam1_t) *b, bam_pileup_cd *cd) func); 822 /// see bam_plp_destructor 823 void bam_mplp_destructor(bam_mplp_t iter, 824 int function(void *data, const(bam1_t) *b, bam_pileup_cd *cd) func); 825 826 827 /*********************************** 828 * BAQ calculation and realignment * 829 ***********************************/ 830 831 int sam_cap_mapq(bam1_t *b, const(char) *_ref, int ref_len, int thres); 832 833 /// Calculate BAQ scores 834 /** @param b BAM record 835 @param ref Reference sequence 836 @param ref_len Reference sequence length 837 @param flag Flags, see description 838 @return 0 on success \n 839 -1 if the read was unmapped, zero length, had no quality values, did not have at least one M, X or = CIGAR operator, or included a reference skip. \n 840 -3 if BAQ alignment has already been done and does not need to be applied, or has already been applied. \n 841 -4 if alignment failed (most likely due to running out of memory) 842 843 This function calculates base alignment quality (BAQ) values using the method 844 described in "Improving SNP discovery by base alignment quality", Heng Li, 845 Bioinformatics, Volume 27, Issue 8 (https://doi.org/10.1093/bioinformatics/btr076). 846 847 The following @param flag bits can be used: 848 849 Bit 0: Adjust the quality values using the BAQ values 850 851 If set, the data in the BQ:Z tag is used to adjust the quality values, and 852 the BQ:Z tag is renamed to ZQ:Z. 853 854 If clear, and a ZQ:Z tag is present, the quality values are reverted using 855 the data in the tag, and the tag is renamed to BQ:Z. 856 857 Bit 1: Use "extended" BAQ. 858 859 Changes the BAQ calculation to increase sensitivity at the expense of 860 reduced specificity. 861 862 Bit 2: Recalculate BAQ, even if a BQ tag is present. 863 864 Force BAQ to be recalculated. Note that a ZQ:Z tag will always disable 865 recalculation. 866 867 @bug 868 If the input read has both BQ:Z and ZQ:Z tags, the ZQ:Z one will be removed. 869 Depending on what previous processing happened, this may or may not be the 870 correct thing to do. It would be wise to avoid this situation if possible. 871 */ 872 873 int sam_prob_realn(bam1_t *b, const(char) *_ref, int ref_len, int flag);