1 /** 2 3 VCF version 4.2 (including BCF) reader and writer, including 4 a model with convenience functions for the header (metadata) 5 and individual VCF/BCF records (rows). 6 7 Specifications: https://samtools.github.io/hts-specs/VCFv4.2.pdf 8 9 */ 10 module dhtslib.vcf; 11 12 import core.stdc..string; 13 import core.vararg; 14 15 import std.conv: to, ConvException; 16 import std.datetime; 17 import std.format: format; 18 import std.range: ElementType; 19 import std..string: fromStringz, toStringz; 20 import std.traits: isArray, isDynamicArray, isBoolean, isIntegral, isFloatingPoint, isNumeric, isSomeString; 21 import std.traits: Unqual; 22 23 import htslib.hts_log; 24 import htslib.kstring; 25 import htslib.vcf; 26 27 alias BCFRecord = VCFRecord; 28 alias BCFWriter = VCFWriter; 29 30 /** Wrapper around bcf_hdr_t 31 32 Q: Why do we have VCFHeader, but not SAMHeader? 33 A: Most (all)? of the SAM (bam1_t) manipulation functions do not require a ptr to the header, 34 whereas almost all of the VCF (bcf1_t) manipulation functions do. Therefore, we track bcf_hdr_t* 35 inside each VCFRecord; this is wrapped by VCFHeader for future convenience (for example, 36 now we have @property nsamples; may move the tag reader and writer functions here?) 37 38 In order to avoid double free()'ing an instance bcf_hdr_t, 39 this wrapper will be the authoritative holder of of bcf_hdr_t ptrs, 40 it shall be passed around by reference, and copies are disabled. 41 */ 42 struct VCFHeader 43 { 44 /// Pointer to htslib BCF/VCF header struct; will be freed from VCFHeader dtor 45 bcf_hdr_t *hdr; 46 47 // Copies have to be disabled to avoid double free() 48 @disable this(this); 49 50 ~this() 51 { 52 // Deallocate header 53 if (this.hdr != null) bcf_hdr_destroy(this.hdr); 54 } 55 56 invariant 57 { 58 assert(this.hdr != null); 59 } 60 61 /// List of contigs in the header 62 @property string[] sequences() 63 { 64 import core.stdc.stdlib : free; 65 int nseqs; 66 67 /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */ 68 //const(char) **bcf_hdr_seqnames(const(bcf_hdr_t) *h, int *nseqs); 69 const(char) **ary = bcf_hdr_seqnames(this.hdr, &nseqs); 70 if (!nseqs) return []; 71 72 string[] ret; 73 ret.reserve(nseqs); 74 75 for(int i; i < nseqs; i++) { 76 ret ~= fromStringz(ary[i]).idup; 77 } 78 79 free(ary); 80 return ret; 81 } 82 83 /// Number of samples in the header 84 pragma(inline, true) 85 @property int nsamples() { return bcf_hdr_nsamples(this.hdr); } 86 87 } 88 89 /** Wrapper around bcf1_t 90 91 Because it uses bcf1_t internally, it must conform to the BCF2 part 92 of the VCFv4.2 specs, rather than the loosey-goosey VCF specs. i.e., 93 INFO, CONTIG, FILTER records must exist in the header. 94 95 TODO: Does this need to be kept in a consistent state? 96 Ideally, VCFWriter would reject invalid ones, but we are informed 97 that it is invalid (e.g. if contig not found) while building this 98 struct; bcf_write1 will actually segfault, unfortunately. I'd like 99 to avoid expensive validate() calls for every record before writing 100 if possible, which means keeping this consistent. However, not 101 sure what to do if error occurs when using the setters herein? 102 103 2019-01-23 struct->class to mirror SAMRecord -- faster if reference type? 104 105 2019-01-23 WIP: getters for chrom, pos, id, ref, alt are complete (untested) 106 107 After parsing a BCF or VCF line, bcf1_t must be unpacked. (not applicable when building bcf1_t from scratch) 108 Depending on information needed, this can be done to various levels with performance tradeoff. 109 Unpacking symbols: 110 BCF_UN_ALL: all 111 BCF_UN_SHR: all shared information (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO) 112 113 BCF_UN_STR 114 / BCF_UN_FLT 115 | / BCF_UN_INFO 116 | | / ____________________________ BCF_UN_FMT 117 V V V / | | | 118 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 ... 119 120 */ 121 class VCFRecord 122 { 123 bcf1_t* line; /// htslib structured record TODO: change to 'b' for better internal consistency? (vcf.h/c actually use line quite a bit in fn params) 124 125 VCFHeader *vcfheader; /// corresponding header (required); 126 /// is ptr to avoid copying struct containing ptr to bcf_hdr_t (leads to double free()) 127 128 /** VCFRecord 129 130 Construct a bcf/vcf record, backed by bcf1_t, from: an existing bcf1_t, parameters, or a VCF line. 131 132 Internal backing by bcf1_t means it must conform to the BCF2 rules -- i.e., header must contain 133 appropriate INFO, CONTIG, and FILTER lines. 134 135 Protip: specifying alternate MAX_UNPACK can speed it tremendously 136 as it will not unpack all fields, only up to those requested (see htslib.vcf) 137 For example, BCF_UN_STR is up to ALT inclusive, and BCF_UN_STR is up to FILTER 138 */ 139 this(T)(T *h, bcf1_t *b, int MAX_UNPACK = BCF_UN_ALL) 140 if(is(T == VCFHeader) || is(T == bcf_hdr_t)) 141 { 142 static if (is(T == VCFHeader)) this.vcfheader = h; 143 //else static if (is(T == bcf_hdr_t)) this.vcfheader = new VCFHeader(h); // double free() bug if we don't own bcf_hdr_t h 144 else static if (is(T == bcf_hdr_t)) assert(0); // ferret out situations that will lead to free() crashes 145 else assert(0); 146 147 this.line = b; 148 149 // Now it must be unpacked 150 // Protip: specifying alternate MAX_UNPACK can speed it tremendously 151 immutable int ret = bcf_unpack(this.line, MAX_UNPACK); // unsure what to do c̄ return value // @suppress(dscanner.suspicious.unused_variable) 152 } 153 /// ditto 154 this(SS)(VCFHeader *vcfhdr, string chrom, int pos, string id, string _ref, string alt, float qual, SS filter, ) 155 if (isSomeString!SS || is(SS == string[])) 156 { 157 this.line = bcf_init1(); 158 this.vcfheader = vcfhdr; 159 160 this.chrom = chrom; 161 this.pos = pos; 162 this.id = id; 163 164 this.setAlleles(_ref, alt); 165 166 this.qual = qual; 167 this.filter = filter; 168 } 169 /// ditto 170 this(VCFHeader *vcfhdr, string line, int MAX_UNPACK = BCF_UN_ALL) 171 { 172 this.vcfheader = vcfhdr; 173 174 kstring_t kline; 175 auto dupline = line.dup; // slower, but safer than a cast // @suppress(dscanner.suspicious.unmodified) 176 177 kline.l = dupline.length; 178 kline.m = dupline.length; 179 kline.s = dupline.ptr; 180 181 this.line = bcf_init1(); 182 this.line.max_unpack = MAX_UNPACK; 183 184 auto ret = vcf_parse(&kline, this.vcfheader.hdr, this.line); 185 if (ret < 0) { 186 hts_log_error(__FUNCTION__, "vcf_parse returned < 0 -- code error or malformed VCF line"); 187 } else { 188 ret = bcf_unpack(this.line, MAX_UNPACK); // unsure what to do c̄ return value 189 } 190 } 191 192 /// disable copying to prevent double-free (which should not come up except when writeln'ing) 193 //@disable this(this); // commented out as class has no postblit ctor (relevant when VCFRecord was struct) 194 /// dtor 195 ~this() 196 { 197 if (this.line) bcf_destroy1(this.line); 198 } 199 200 //----- FIXED FIELDS -----// 201 202 /* CHROM */ 203 /// Get chromosome (CHROM) 204 @property 205 string chrom() 206 { 207 if (!this.line.unpacked) 208 bcf_unpack(this.line, BCF_UN_STR); 209 210 return fromStringz(bcf_hdr_id2name(this.vcfheader.hdr, this.line.rid)).idup; 211 } 212 /// Set chromosome (CHROM) 213 @property 214 void chrom(const(char)[] c) 215 { 216 auto rid = bcf_hdr_name2id(this.vcfheader.hdr, toStringz(c)); 217 if (rid == -1) { 218 hts_log_error(__FUNCTION__, format("contig not found: %s", c)); 219 throw new Exception("contig not found"); 220 } 221 else line.rid = rid; 222 } 223 224 225 /* POS */ 226 /** Get position (POS, column 2) 227 * 228 * NB: internally BCF is uzing 0 based coordinates; we only show +1 when printing a VCF line with toString (which calls vcf_format) 229 */ 230 @property 231 long pos() 232 out(coord) { assert(coord >= 0); } 233 do 234 { 235 if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); 236 return this.line.pos; 237 } 238 /// Set position (POS, column 2) 239 @property 240 void pos(long p) 241 in { assert(p >= 0); } 242 do 243 { 244 // TODO: should we check for pos >= 0 && pos < contig length? Could really hamper performance. 245 // TODO: if writing out the file with invalid POS values crashes htslib, will consider it 246 this.line.pos = p; 247 } 248 249 250 /* ID */ 251 /// Get ID string 252 @property string id() 253 { 254 if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); 255 return fromStringz(this.line.d.id).idup; 256 } 257 /// Sets new ID string; comma-separated list allowed but no dup checking performed 258 @property int id(const(char)[] id) 259 { 260 // bcf_update_id expects null pointer instead of empty string to mean "no value" 261 if (id == "") return bcf_update_id(this.vcfheader.hdr, this.line, null); 262 else return bcf_update_id(this.vcfheader.hdr, this.line, toStringz(id)); 263 } 264 /// Append an ID (column 3) to the record. 265 /// 266 /// NOTE: htslib performs duplicate checking 267 int addID(const(char)[] id) 268 { 269 if(id == "") return 0; 270 return bcf_add_id(this.vcfheader.hdr, this.line, toStringz(id)); 271 } 272 273 274 /* Alleles (REF, ALT) 275 276 Internally, alleles are stored as a \0 separated array: 277 [C \0 T \0 T T \0.. \0] 278 ^ ^ ^ ^ 279 | | L L ALT_1 = "TT" 280 | L ALT_0 281 L REF 282 283 TODO: Getters and setters poorly or inconsistently (?) named at this time or there are too many overloads? 284 TODO: need non-overwriting setter for ref and alt alleles 285 TODO: some of these may be inefficent; since they may be used in hot inner loops, pls optimize 286 */ 287 /// REF allele length 288 @property long refLen() 289 { 290 version(DigitalMars) pragma(inline); 291 version(LDC) pragma(inline, true); 292 version(GNU) pragma(inline, true); 293 return this.line.rlen; 294 } 295 /// All alleles getter (array) 296 @property string[] allelesAsArray() 297 { 298 string[] ret; 299 ret.length = this.line.n_allele; // n=0, no reference; n=1, ref but no alt 300 foreach(int i; 0 .. this.line.n_allele) // ref allele is index 0 301 { 302 ret[i] = fromStringz(this.line.d.allele[i]).idup; 303 } 304 return ret; 305 } 306 /// Reference allele getter 307 @property string refAllele() 308 { 309 if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); 310 if (this.line.n_allele < 1) return ""; // a valid record could have no ref (or alt) alleles 311 else return fromStringz(this.line.d.als).idup; 312 } 313 // NB WIP: there could be zero, or multiple alt alleles 314 /+@property string altAlleles() 315 { 316 if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); 317 return fromStringz(this.line.d.als) 318 }+/ 319 /// Alternate alleles getter version 1: ["A", "ACTG", ...] 320 @property string[] altAllelesAsArray() 321 { 322 if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); 323 324 string[] ret; 325 if (this.line.n_allele < 2) return ret; // n=0, no reference; n=1, ref but no alt 326 return this.allelesAsArray[1 .. $]; // trim off REF 327 } 328 /// Alternate alleles getter version 2: "A,ACTG,..." 329 @property string altAllelesAsString() 330 { 331 if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); 332 333 string ret; 334 if (this.line.n_allele < 2) return ret; // n=0, no reference; n=1, ref but no alt 335 336 char[] tmp; 337 tmp.length = this.line.d.m_allele; // max allocated size in htslib 338 339 char *a = this.line.d.allele[1]; // ref allele is index 0 340 const char *last = this.line.d.allele[this.line.n_allele - 1]; // pointer to start of last allele 341 size_t i = 0; 342 343 assert( this.line.d.allele[1] == (this.line.d.als + this.line.rlen + 1) ); // ensue data structure is as we think (+1: past ref's term \0) 344 345 while (i < this.line.d.m_allele) // safety stop at max allocated size 346 { 347 if (*a) tmp[i] = *a; 348 else { // '\0' 349 if (a < last) tmp[i] = ','; 350 else break; 351 } 352 i++; 353 a++; 354 } 355 tmp.length = i; 356 357 return tmp.idup; 358 } 359 /// Set alleles; comma-separated list 360 @property void alleles(string a) 361 { 362 if (a == "") { 363 this.line.n_allele = 0; 364 if (this.line.d.m_allele) this.line.d.als[0] = '\0'; // if storage allocated, zero out the REF 365 } 366 else 367 bcf_update_alleles_str(this.vcfheader.hdr, this.line, toStringz(a)); 368 } 369 /// Set alleles; array 370 @property void alleles(string[] a) 371 { 372 if (a.length == 0) { 373 this.line.n_allele = 0; 374 if (this.line.d.m_allele) this.line.d.als[0] = '\0'; // if storage allocated, zero out the REF 375 } 376 else { 377 const(char)*[64] allelesp; // will fail if a locus has more than 63 ALT alleles :-O 378 foreach(i; 0 .. a.length ) { 379 // In order to zero out all alleles, pass a zero-length allele to the string version, 380 // or a zero-length array to this version. Zero length string member of nonempty allele array is an error. 381 if (a[i].length == 0) { 382 hts_log_error(__FUNCTION__, "Zero-length allele in nonempty allele array. Setting to '.'"); 383 allelesp[i] = ".".ptr; 384 } 385 else allelesp[i] = toStringz(a[i]); 386 } 387 bcf_update_alleles(this.vcfheader.hdr, this.line, &allelesp[0], cast(int)a.length); 388 } 389 } 390 /// Set REF allele only 391 /// param r is \0-term Cstring 392 /// TODO: UNTESTED 393 void setRefAllele(const(char)* r) 394 { 395 // first, get REF 396 if (!this.line.unpacked) bcf_unpack(this.line, BCF_UN_STR); 397 // a valid record could have no ref (or alt) alleles 398 if (this.line.n_allele < 2) // if none or 1 (=REF only), just add the REF we receieved 399 bcf_update_alleles(this.vcfheader.hdr, this.line, &r, 1); 400 else { 401 // length of REF allele is allele[1] - allele[0], (minus one more for \0) 402 // TODO ** TODO ** : there is a property line.refLen already 403 const auto reflen = (this.line.d.allele[1] - this.line.d.allele[0]) - 1; 404 if (strlen(r) <= reflen) { 405 memcpy(this.line.d.allele[0], r, reflen + 1); // +1 -> copy a trailing \0 in case original REF was longer than new 406 // TODO: do we need to call _sync ? 407 } else { 408 // slower way: replace allele[0] with r, but keep rest of pointers poitning at existing allele block, 409 // then call bcf_update_alleles; this will make complete new copy of this.line.d.allele, so forgive the casts 410 this.line.d.allele[0] = cast(char*) r; 411 bcf_update_alleles(this.vcfheader.hdr, this.line, 412 cast( const(char)** ) this.line.d.allele, this.line.n_allele); 413 } 414 } 415 } 416 /// Set alleles; alt can be comma separated 417 void setAlleles(string _ref, string alt) 418 { 419 immutable string alleles = _ref ~ "," ~ alt ~ "\0"; 420 bcf_update_alleles_str(this.vcfheader.hdr, this.line, alleles.ptr); 421 } 422 /// Set alleles; min. 2 alleles (ref, alt1); unlimited alts may be specified 423 void setAlleles(string _ref, string alt, ...) 424 { 425 string alleles = _ref ~ "," ~ alt; 426 427 for (int i = 0; i < _arguments.length; i++) 428 { 429 alleles ~= "," ~ va_arg!string(_argptr); 430 } 431 432 alleles ~= "\0"; 433 434 bcf_update_alleles_str(this.vcfheader.hdr, this.line, alleles.ptr); 435 } 436 437 438 /* Quality (QUAL) */ 439 /// Get variant quality (QUAL, column 6) 440 @property float qual() 441 out(result) { assert(result >= 0); } 442 do 443 { 444 if (this.line.max_unpack < BCF_UN_FLT) bcf_unpack(this.line, BCF_UN_FLT); 445 return this.line.qual; 446 } 447 /// Set variant quality (QUAL, column 6) 448 @property void qual(float q) 449 in { assert(q >= 0); } 450 do { this.line.qual = q; } 451 452 453 /* FILTER */ 454 /// Get FILTER column (nothing in htslib sadly) 455 @property string filter() 456 { 457 const(char)[] ret; 458 459 if (this.line.max_unpack < BCF_UN_FLT) bcf_unpack(this.line, BCF_UN_FLT); 460 461 if (this.line.d.n_flt) { 462 for(int i; i< this.line.d.n_flt; i++) { 463 if (i) ret ~= ";"; 464 ret ~= fromStringz(this.vcfheader.hdr.id[BCF_DT_ID][ this.line.d.flt[0] ].key); 465 } 466 } else { 467 ret = "."; 468 } 469 470 return ret.idup; 471 } 472 /// Remove all entries in FILTER 473 void removeAllFilters() 474 { 475 // always returns zero 476 bcf_update_filter(this.vcfheader.hdr, this.line, null, 0); 477 } 478 /// Set the FILTER column to f 479 @property void filter(string f) 480 { 481 this.filter([f]); 482 } 483 /// Set the FILTER column to f0,f1,f2... 484 /// TODO: determine definitiely whether "." is replaced with "PASS" 485 @property void filter(string[] fs) 486 { 487 int[] fids; 488 foreach(f; fs) { 489 if(f == "") continue; 490 const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); 491 if (fid == -1) hts_log_warning(__FUNCTION__, format("filter not found in header (ignoring): %s", f) ); 492 else fids ~= fid; 493 } 494 if (fids.length > 0) 495 bcf_update_filter(this.vcfheader.hdr, this.line, fids.ptr, cast(int)fids.length); 496 else 497 hts_log_warning(__FUNCTION__, "No FILTER update was performed due to empty list"); 498 } 499 /// Add a filter; from htslib: 500 /// "If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed." 501 int addFilter(string f) 502 { 503 return bcf_add_filter(this.vcfheader.hdr, this.line, 504 bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f))); 505 } 506 /// Remove a filter by name 507 int removeFilter(string f) 508 { 509 const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); 510 return removeFilter(fid); 511 } 512 /// Remove a filter by numeric id 513 int removeFilter(int fid) 514 { 515 return bcf_remove_filter(this.vcfheader.hdr, this.line, fid, 0); 516 } 517 /// Determine whether FILTER is present. log warning if filter does not exist. "PASS" and "." can be used interchangeably. 518 bool hasFilter(string filter) 519 { 520 char[] f = filter.dup ~ '\0'; 521 //const auto id = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, f.ptr); 522 const auto ret = bcf_has_filter(this.vcfheader.hdr, this.line, f.ptr); 523 524 if (ret > 0) return true; 525 else if (ret == 0) return false; 526 else { 527 hts_log_warning(__FUNCTION__, format("FILTER %s does not exist in the header", filter)); 528 return false; 529 } 530 } 531 532 533 /** Update INFO (pan-sample info; column 8) 534 * 535 * Add a tag:value to the INFO column 536 * NOTE: tag must already exist in the header 537 * 538 * Templated on data type, calls one of bcf_update_info_{int32,float,string,flag} 539 * Both singletons and arrays are supported. 540 */ 541 void addInfo(T)(string tag, T data) 542 { 543 int ret = -1; 544 545 static if(isIntegral!T) { 546 auto integer = cast(int) data; 547 ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), &integer, 1); 548 } 549 550 else static if(isFloatingPoint!T) { 551 auto flt = cast(float) data; // simply passing "2.0" (or whatever) => data is a double 552 ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), &flt, 1); 553 } 554 555 else static if(isSomeString!T) 556 ret = bcf_update_info_string(this.vcfheader.hdr, this.line, toStringz(tag), toStringz(data)); 557 558 else static if(isBoolean!T) { 559 immutable int set = data ? 1 : 0; // if data == true, pass 1 to bcf_update_info_flag(n=); n=0 => clear flag 560 ret = bcf_update_info_flag(this.vcfheader.hdr, this.line, toStringz(tag), null, set); 561 } 562 563 if (ret == -1) 564 hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); 565 } 566 /// ditto 567 void addInfo(T)(string tag, T[] data) 568 if(!is(T==immutable(char))) // otherwise string::immutable(char)[] will match the other template 569 { 570 int ret = -1; 571 572 static if(isIntegral!T) { 573 auto integer = cast(int) data; 574 ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), &integer, data.length); 575 } 576 577 else static if(isFloatingPoint!T) { 578 auto flt = cast(float) data; // simply passing "2.0" (or whatever) => data is a double 579 ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), &flt, data.length); 580 } 581 582 if (ret == -1) 583 hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); 584 } 585 586 /** Update FORMAT (sample info; column 9+) 587 * 588 * Templated on data type, calls one of bc_update_format_{int32,float,string,flag} 589 */ 590 void addFormat(T)(string tag, T data) 591 if(!isArray!T) 592 { 593 int ret = -1; 594 595 static if(isIntegral!T) { 596 auto integer = cast(int) data; 597 ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), &integer, 1); 598 } 599 600 else static if(isFloatingPoint!T) { 601 auto flt = cast(float) data; // simply passing "2.0" (or whatever) => data is a double 602 ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), &flt, 1); 603 } 604 605 else static if(isSomeString!T) 606 ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), toStringz(data)); 607 608 else static if(isBoolean!T) { 609 immutable int set = data ? 1 : 0; // if data == true, pass 1 to bcf_update_info_flag(n=); n=0 => clear flag 610 ret = bcf_update_format_flag(this.vcfheader.hdr, this.line, toStringz(tag), null, set); 611 } 612 613 if (ret == -1) hts_log_warning(__FUNCTION__, format("Couldn't add format (ignoring): %s", data)); 614 } 615 /// ditto 616 void addFormat(T)(string tag, T[] data) 617 { 618 int ret = -1; 619 620 static if(isIntegral!T) { 621 auto integer = cast(int[]) data; 622 ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), 623 integer.ptr, cast(int)data.length); 624 } 625 626 else static if(isFloatingPoint!T) { 627 auto flt = cast(float[]) data; // simply passing "2.0" (or whatever) => data is a double 628 ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), 629 flt.ptr, cast(int)data.length); 630 } 631 632 if (ret == -1) hts_log_warning(__FUNCTION__, format("Couldn't add format (ignoring): %s", data)); 633 634 } 635 636 /// add INFO or FORMAT key:value pairs to a record 637 /// add a single datapoint OR vector of values, OR, values to each sample (if tagType == FORMAT) 638 void add(string tagType, T)(const(char)[] tag, T data) 639 if((tagType == "INFO" || tagType == "FORMAT") && 640 (isIntegral!T || isIntegral!(ElementType!T) || 641 isFloatingPoint!T || isFloatingPoint!(ElementType!T) || 642 isSomeString!T || isSomeString!(ElementType!T) || 643 isBoolean!T || isBoolean!(ElementType!T))) 644 { 645 int ret = -1; 646 int len; 647 648 static if (!isDynamicArray!T) { 649 len = 1; 650 static if (isIntegral!T) auto d = data.to!int; 651 else static if (isFloatingPoint!T) auto d = data.to!float; 652 else static if (isSomeString!T) auto d = toStringz(data); 653 else static if (isBoolean!T) immutable int d = data ? 1 : 0; // if data == true, pass 1 to bcf_update_info_flag(n=); n=0 => clear flag 654 else static assert(0); 655 656 const auto ptr = &d; 657 } 658 else static if (isDynamicArray!T) { 659 assert(data.length < int.max); 660 len = cast(int) data.length; 661 static if (isIntegral!(ElementType!T)) auto d = data.to!(int[]); 662 else static if (isFloatingPoint!(ElementType!T)) auto d = data.to!(float[]); 663 else static if (isSomeString!(ElementType!T)) { 664 char[] d; 665 foreach(s; data) { 666 d ~= s ~ "\0"; 667 } 668 if(d.length == 0) d ~= "\0"; // TODO replace with asserts on data length earlier 669 } 670 else static if (isBoolean!(ElementType!T)) { 671 int[] d; 672 foreach(b; data) { 673 d ~= (b ? 1 : 0); 674 } 675 } 676 677 const auto ptr = d.ptr; 678 } 679 else static assert(0); 680 681 static if (tagType == "INFO") { 682 static if (is(Unqual!T == int) || is(Unqual!(ElementType!T) == int)) 683 ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 684 else static if (is(Unqual!T == float) || is(Unqual!(ElementType!T) == float)) 685 ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 686 else static if (isSomeString!T || isSomeString!(ElementType!T)) 687 ret = bcf_update_info_string(this.vcfheader.hdr, this.line, toStringz(tag), ptr); 688 else static if (is(T == bool) || is(ElementType!T == bool)) 689 ret = bcf_update_info_flag(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 690 else static assert(0, "Type not recognized for INFO tag"); 691 } 692 else static if (tagType == "FORMAT") { 693 static if (is(Unqual!T == int) || is(Unqual!(ElementType!T) == int)) 694 ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 695 else static if (is(Unqual!T == float) || is(Unqual!(ElementType!T) == float)) 696 ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 697 else static if (isSomeString!T || isSomeString!(ElementType!T)) 698 ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 699 else static assert(0, "Type not recognized for FORMAT tag"); 700 } 701 else static assert(0); 702 703 if (ret == -1) 704 hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); 705 } 706 707 /// Return a string representation of the VCFRecord (i.e. as would appear in .vcf) 708 /// 709 /// As a bonus, there is a kstring_t memory leak 710 override string toString() const 711 { 712 kstring_t s; 713 714 const int ret = vcf_format(this.vcfheader.hdr, this.line, &s); 715 if (ret) 716 { 717 hts_log_error(__FUNCTION__, 718 format("vcf_format returned nonzero (%d) (likely EINVAL, invalid bcf1_t struct?)", ret)); 719 return "[VCFRecord vcf_format parse_error]"; 720 } 721 722 return cast(string) s.s[0 .. s.l]; 723 } 724 } 725 726 /** Basic support for writing VCF, BCF files */ 727 struct VCFWriter 728 { 729 // htslib data structures 730 vcfFile *fp; /// htsFile 731 //bcf_hdr_t *hdr; /// header 732 VCFHeader *vcfhdr; /// header wrapper -- no copies 733 bcf1_t*[] rows; /// individual records 734 735 @disable this(); 736 /// open file or network resources for writing 737 /// setup incl header allocation 738 /// if mode==w: 739 /// bcf_hdr_init automatically sets version string (##fileformat=VCFv4.2) 740 /// bcf_hdr_init automatically adds the PASS filter 741 this(string fn) 742 { 743 if (fn == "") throw new Exception("Empty filename passed to VCFWriter constructor"); 744 this.fp = vcf_open(toStringz(fn), toStringz("w"c)); 745 if (!this.fp) throw new Exception("Could not hts_open file"); 746 747 this.vcfhdr = new VCFHeader( bcf_hdr_init("w\0"c.ptr)); 748 addFiledate(); 749 bcf_hdr_sync(this.vcfhdr.hdr); 750 751 /+ 752 hts_log_trace(__FUNCTION__, "Displaying header at construction"); 753 // int bcf_hdr_format(const(bcf_hdr_t) *hdr, int is_bcf, kstring_t *str); 754 kstring_t ks; 755 bcf_hdr_format(this.vcfhdr.hdr, 0, &ks); 756 char[] hdr; 757 hdr.length = ks.l; 758 import core.stdc.string: memcpy; 759 memcpy(hdr.ptr, ks.s, ks.l); 760 import std.stdio: writeln; 761 writeln(hdr); 762 +/ 763 } 764 /// setup and copy a header from another BCF/VCF as template 765 this(T)(string fn, T other) 766 if(is(T == VCFHeader*) || is(T == bcf_hdr_t*)) 767 { 768 if (fn == "") throw new Exception("Empty filename passed to VCFWriter constructor"); 769 this.fp = vcf_open(toStringz(fn), toStringz("w"c)); 770 if (!this.fp) throw new Exception("Could not hts_open file"); 771 772 static if(is(T == VCFHeader*)) { this.vcfhdr = new VCFHeader( bcf_hdr_dup(other.hdr) ); } 773 else static if(is(T == bcf_hdr_t*)) { this.vcfhdr = new VCFHeader( bcf_hdr_dup(other) ); } 774 } 775 /// dtor 776 ~this() 777 { 778 const ret = vcf_close(this.fp); 779 if (ret != 0) hts_log_error(__FUNCTION__, "couldn't close VCF after writing"); 780 781 // Deallocate header 782 //bcf_hdr_destroy(this.hdr); 783 // 2018-09-15: Do not deallocate header; will be free'd by VCFHeader dtor 784 } 785 invariant 786 { 787 assert(this.vcfhdr != null); 788 } 789 790 VCFHeader* getHeader() 791 { 792 return this.vcfhdr; 793 } 794 795 // TODO 796 /// copy header lines from a template without overwiting existing lines 797 void copyHeaderLines(bcf_hdr_t *other) 798 { 799 assert(this.vcfhdr != null); 800 assert(0); 801 // bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const(bcf_hdr_t) *src); 802 } 803 804 /// Add sample to this VCF 805 /// * int bcf_hdr_add_sample(bcf_hdr_t *hdr, const(char) *sample); 806 int addSample(string name) 807 in { assert(name != ""); } 808 do 809 { 810 assert(this.vcfhdr != null); 811 812 bcf_hdr_add_sample(this.vcfhdr.hdr, toStringz(name)); 813 814 // AARRRRGGGHHHH 815 // https://github.com/samtools/htslib/issues/767 816 bcf_hdr_sync(this.vcfhdr.hdr); 817 818 return 0; 819 } 820 821 /// Add a new header line 822 int addHeaderLineKV(string key, string value) 823 { 824 // TODO check that key is not INFO, FILTER, FORMAT (or contig?) 825 string line = format("##%s=%s", key, value); 826 827 return bcf_hdr_append(this.vcfhdr.hdr, toStringz(line)); 828 } 829 /// Add a new header line -- must be formatted ##key=value 830 int addHeaderLineRaw(string line) 831 { 832 assert(this.vcfhdr != null); 833 // int bcf_hdr_append(bcf_hdr_t *h, const(char) *line); 834 const auto ret = bcf_hdr_append(this.vcfhdr.hdr, toStringz(line)); 835 bcf_hdr_sync(this.vcfhdr.hdr); 836 return ret; 837 } 838 /// Add a filedate= headerline, which is not called out specifically in the spec, 839 /// but appears in the spec's example files. We could consider allowing a param here. 840 int addFiledate() 841 { 842 return addHeaderLineKV("filedate", (cast(Date) Clock.currTime()).toISOString ); 843 } 844 845 /** Add INFO (§1.2.2) or FORMAT (§1.2.4) tag 846 847 The INFO tag describes row-specific keys used in the INFO column; 848 The FORMAT tag describes sample-specific keys used in the last, and optional, genotype column. 849 850 Template parameter: string; must be INFO or FORMAT 851 852 The first four parameters are required; NUMBER and TYPE have specific allowable values. 853 source and version are optional, but recommended (for INFO only). 854 855 * id: ID tag 856 * number: NUMBER tag; here a string because it can also take special values {A,R,G,.} (see §1.2.2) 857 * type: Integer, Float, Flag, Character, and String 858 * description: Text description; will be double quoted 859 * source: Annotation source (eg dbSNP) 860 * version: Annotation version (eg 142) 861 */ 862 void addTag(string tagType, T)( string id, 863 T number, 864 string type, 865 string description, 866 string source="", 867 string _version="") 868 if((tagType == "INFO" || tagType == "FORMAT") && (isIntegral!T || isSomeString!T)) 869 { 870 string line; // we'll suffix with \0, don't worry 871 872 // check ID 873 if (id == "") { 874 hts_log_error(__FUNCTION__, "no ID"); 875 return; 876 } 877 878 // check Number is either a special code {A,R,G,.} or an integer 879 static if(isSomeString!T) { 880 if (number != "A" && 881 number != "R" && 882 number != "G" && 883 number != ".") { 884 // not a special ; check if integer 885 try { 886 number.to!int; // don't need to store result, will use format/%s 887 } 888 catch (ConvException e) { 889 hts_log_error(__FUNCTION__, "Number not A/R/G/. nor an integer"); 890 return; 891 } 892 } 893 } 894 895 // check Type 896 if (type != "Integer" && 897 type != "Float" && 898 type != "Flag" && 899 type != "Character" && 900 type != "String") { 901 hts_log_error(__FUNCTION__, "unrecognized type"); 902 return; 903 } 904 905 // check Description 906 if (description == "") hts_log_error(__FUNCTION__, "no description"); 907 908 // check Source and Version 909 if (source == "" && _version != "") hts_log_error(__FUNCTION__, "version wo source"); 910 911 // Format params 912 if (source != "" && _version != "") 913 line = format("##%s=<ID=%s,Number=%s,Type=%s,Description=\"%s\",Source=\"%s\",Version=\"%s\">\0", 914 tagType, id, number, type, description, source, _version); 915 else if (source != "" && _version == "") 916 line = format("##%s=<ID=%s,Number=%s,Type=%s,Description=\"%s\",Source=\"%s\">\0", 917 tagType, id, number, type, description, source); 918 else 919 line = format("##%s=<ID=%s,Number=%s,Type=%s,Description=\"%s\">\0", 920 tagType, id, number, type, description); 921 922 bcf_hdr_append(this.vcfhdr.hdr, line.ptr); 923 } 924 925 /** Add FILTER tag (§1.2.3) */ 926 void addTag(string tagType)(string id, string description) 927 if(tagType == "FILTER") 928 { 929 // check ID 930 if (id == "") { 931 hts_log_error(__FUNCTION__, "no ID"); 932 return; 933 } 934 // check Description 935 if (description == "") hts_log_error(__FUNCTION__, "no description"); 936 937 string line = format("##FILTER=<ID=%s,Description=\"%s\">\0", id, description); 938 bcf_hdr_append(this.vcfhdr.hdr, line.ptr); 939 } 940 941 /** Add FILTER tag (§1.2.3) */ 942 deprecated void addFilterTag(string id, string description) 943 { 944 string filter = format("##FILTER=<ID=%s,Description=\"%s\">\0", id, description); 945 bcf_hdr_append(this.vcfhdr.hdr, filter.ptr); 946 } 947 948 /** Add contig definition (§1.2.7) to header meta-info 949 950 other: "url=...,md5=...,etc." 951 */ 952 auto addTag(string tagType)(const(char)[] id, const int length = 0, string other = "") 953 if(tagType == "contig" || tagType == "CONTIG") 954 { 955 const(char)[] contig = "##contig=<ID=" ~ id ~ 956 (length > 0 ? ",length=" ~ length.to!string : "") ~ 957 (other != "" ? "," ~ other : "") ~ 958 ">\0"; 959 960 return bcf_hdr_append(this.vcfhdr.hdr, contig.ptr); 961 } 962 963 /** 964 Add a record 965 966 alleles: comma-separated string, including ref allele 967 qual: a float, but accepts any numeric 968 */ 969 int addRecord(S, N)(S contig, int pos, S id, S alleles, N qual, S[] filters) 970 if ( (isSomeString!S || is(S == char*) ) && isNumeric!N) 971 { 972 bcf1_t *line = new bcf1_t; 973 974 line.rid = bcf_hdr_name2id(this.vcfhdr.hdr, toStringz(contig)); 975 if (line.rid == -1) hts_log_error(__FUNCTION__, "contig not found"); 976 977 line.pos = pos; 978 979 bcf_update_id(this.vcfhdr.hdr, line, (id == "" ? null : toStringz(id)) ); // TODO: could support >1 id with array as with the filters 980 981 bcf_update_alleles_str(this.vcfhdr.hdr, line, toStringz(alleles)); 982 983 line.qual = qual; 984 985 // Update filter(s); if/else blocks for speed 986 if(filters.length == 0) 987 { 988 int pass = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz("PASS"c)); 989 bcf_update_filter(this.vcfhdr.hdr, line, &pass, 1); 990 } 991 else if(filters.length == 1) 992 { 993 int fid = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz(filters[0])); 994 if(fid == -1) hts_log_error(__FUNCTION__, format("filter not found (ignoring): ", filters[0]) ); 995 bcf_update_filter(this.vcfhdr.hdr, line, &fid, 1); 996 } 997 else // TODO: factor out the check for -1 into a safe_update_filter or something 998 { 999 int[] filter_ids; 1000 foreach(f; filters) { 1001 const int fid = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz(f)); 1002 if(fid == -1) hts_log_error(__FUNCTION__, format("filter not found (ignoring): ", f) ); 1003 else filter_ids ~= fid; 1004 } 1005 bcf_update_filter(this.vcfhdr.hdr, line, filter_ids.ptr, cast(int)filter_ids.length ); 1006 } 1007 1008 // Add a record 1009 /+ 1010 // .. FORMAT 1011 bcf_hdr_append(hdr, "##FORMAT=<ID=TF,Number=1,Type=Float,Description=\"Test Float\">"); 1012 float[4] test; 1013 bcf_float_set_missing(test[0]); 1014 test[1] = 47.11f; 1015 bcf_float_set_vector_end(test[2]); 1016 writeln("pre update format float"); 1017 bcf_update_format_float(this.vcfhdr.hdr, line, toStringz("TF"), &test[0], 4); 1018 +/ 1019 int tmpi = 1; 1020 bcf_update_info_int32(this.vcfhdr.hdr, line, toStringz("NS"c), &tmpi, 1); 1021 1022 // Add the actual sample 1023 int[4] dp = [ 9000, 1, 2, 3]; 1024 bcf_update_format(this.vcfhdr.hdr, line, toStringz("DP"c), &dp, 1, BCF_HT_INT); 1025 //int dp = 9000; 1026 //bcf_update_format_int32(this.vcfhdr.hdr, line, toStringz("DP"c), &dp, 1); 1027 //auto f = new float; 1028 //*f = 1.0; 1029 //bcf_update_format_float(this.vcfhdr.hdr, line, toStringz("XF"c), f, 1); 1030 1031 this.rows ~= line; 1032 1033 return 0; 1034 } 1035 1036 /// as expected 1037 int writeHeader() 1038 { 1039 return bcf_hdr_write(this.fp, this.vcfhdr.hdr); 1040 } 1041 /// as expected 1042 int writeRecord(ref VCFRecord r) 1043 { 1044 const ret = bcf_write(this.fp, this.vcfhdr.hdr, r.line); 1045 if (ret != 0) hts_log_error(__FUNCTION__, "bcf_write error"); 1046 return ret; 1047 } 1048 /// as expected 1049 int writeRecord(bcf_hdr_t *hdr, bcf1_t *rec) 1050 { 1051 hts_log_warning(__FUNCTION__, "pre call"); 1052 const ret = bcf_write(this.fp, hdr, rec); 1053 hts_log_warning(__FUNCTION__, "post call"); 1054 if (ret != 0) hts_log_error(__FUNCTION__, "bcf_write error"); 1055 return ret; 1056 } 1057 } 1058 1059 /** Basic support for reading VCF, BCF files */ 1060 struct VCFReader 1061 { 1062 // htslib data structures 1063 vcfFile *fp; /// htsFile 1064 //bcf_hdr_t *hdr; /// header 1065 VCFHeader *vcfhdr; /// header wrapper -- no copies 1066 bcf1_t* b; /// record for use in iterator, will be recycled 1067 1068 int MAX_UNPACK; /// see htslib.vcf 1069 1070 private static int refct; 1071 1072 this(this) 1073 { 1074 this.refct++; 1075 } 1076 @disable this(); 1077 /// read existing VCF file 1078 /// MAX_UNPACK: setting alternate value could speed reading 1079 this(string fn, int MAX_UNPACK = BCF_UN_ALL) 1080 { 1081 import htslib.hts : hts_set_threads; 1082 1083 if (fn == "") throw new Exception("Empty filename passed to VCFReader constructor"); 1084 this.fp = vcf_open(toStringz(fn), "r"c.ptr); 1085 if (!this.fp) throw new Exception("Could not hts_open file"); 1086 1087 hts_set_threads(this.fp, 1); // extra decoding thread 1088 1089 this.vcfhdr = new VCFHeader( bcf_hdr_read(this.fp)); 1090 1091 this.b = bcf_init1(); 1092 this.b.max_unpack = MAX_UNPACK; 1093 this.MAX_UNPACK = MAX_UNPACK; 1094 } 1095 /// dtor 1096 ~this() 1097 { 1098 this.refct--; 1099 1100 // block file close and bcf1_t free() with reference counting 1101 // to allow VCFReader to implement Range interface 1102 if(!this.refct) { 1103 const ret = vcf_close(this.fp); 1104 if (ret != 0) hts_log_error(__FUNCTION__, "couldn't close VCF after reading"); 1105 1106 // Deallocate header 1107 //bcf_hdr_destroy(this.hdr); 1108 // 2018-09-15: Do not deallocate header; will be free'd by VCFHeader dtor 1109 1110 bcf_destroy1(this.b); 1111 } 1112 } 1113 invariant 1114 { 1115 assert(this.vcfhdr != null); 1116 } 1117 1118 VCFHeader* getHeader() 1119 { 1120 return this.vcfhdr; 1121 } 1122 1123 /** VCF version, e.g. VCFv4.2 */ 1124 @property string vcfVersion() { return cast(string) fromStringz( bcf_hdr_get_version(this.vcfhdr.hdr) ).idup; } 1125 1126 /++ 1127 bcf_hrec_t *bcf_hdr_get_hrec(const(bcf_hdr_t) *hdr, int type, const(char) *key, const(char) *value, 1128 const(char) *str_class); 1129 +/ 1130 // TODO: check key for "", fail if empty 1131 // TODO: check value, str_class for "" and replace with NULL 1132 // TODO: Memory leak. We are never freeing the bcf_hrec_t, but, it escapes as pointer inside key/value string 1133 // TODO: handle structured and general lines 1134 string[string] getTagByKV(string tagType, T)(string key, string value, string str_class) 1135 if((tagType == "FILTER" || tagType == "INFO" || tagType == "FORMAT" || tagType == "contig") && 1136 (isIntegral!T || isSomeString!T)) 1137 { 1138 // hlt : header line type 1139 static if (tagType == "FILTER") const int hlt = BCF_HL_FLT; 1140 else static if (tagType == "INFO") const int hlt = BCF_HL_INFO; // @suppress(dscanner.suspicious.label_var_same_name) 1141 else static if (tagType == "FORMAT") const int hlt= BCF_HL_FMT; // @suppress(dscanner.suspicious.label_var_same_name) 1142 else static if (tagType == "contig") const int hlt= BCF_HL_CTG; // @suppress(dscanner.suspicious.label_var_same_name) 1143 else assert(0); 1144 1145 bcf_hrec_t *hrec = bcf_hdr_get_hrec(this.vcfhdr.hdr, hlt, toStringz(key), 1146 toStringz(value), 1147 toStringz(str_class)); 1148 1149 const int nkeys = hrec.nkeys; 1150 string[string] kv; 1151 1152 foreach(int i; 0 .. nkeys) { 1153 string k = cast(string) fromStringz(hrec.keys[i]); 1154 string v = cast(string) fromStringz(hrec.vals[i]); 1155 kv[k] = v; 1156 } 1157 1158 return kv; 1159 } 1160 1161 /// InputRange interface: iterate over all records 1162 @property bool empty() 1163 { 1164 // int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); 1165 // documentation claims returns -1 on critical errors, 0 otherwise 1166 // however it looks like -1 is EOF and -2 is critical errors? 1167 immutable success = bcf_read(this.fp, this.vcfhdr.hdr, this.b); 1168 if (success >= 0) return false; 1169 else if (success == -1) { 1170 // EOF 1171 // see htslib my comments https://github.com/samtools/htslib/issues/246 1172 // and commit 9845bc9a947350d0f34e6ce69e79ab81b6339bd2 1173 return true; 1174 } 1175 else { 1176 hts_log_error(__FUNCTION__, "*** CRITICAL ERROR bcf_read < -1"); 1177 // TODO: check b->errcode 1178 return true; 1179 } 1180 } 1181 /// ditto 1182 void popFront() 1183 { 1184 // noop? 1185 // free this.b ? 1186 //bam_destroy1(this.b); 1187 1188 // TODO: clear (not destroy) the bcf1_t for reuse? 1189 } 1190 /// ditto 1191 VCFRecord front() 1192 { 1193 // note that VCFRecord's constructor will be responsible for 1194 // * unpacking and 1195 // * destroying 1196 // its copy 1197 return new VCFRecord(this.vcfhdr, bcf_dup(this.b), this.MAX_UNPACK); 1198 } 1199 } 1200 1201 /// 1202 debug(dhtslib_unittest) 1203 unittest 1204 { 1205 import std.exception: assertThrown; 1206 import std.stdio: writeln, writefln; 1207 1208 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 1209 1210 1211 auto vw = VCFWriter("/dev/null"); 1212 1213 vw.addHeaderLineRaw("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"); 1214 vw.addHeaderLineKV("INFO", "<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">"); 1215 // ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> 1216 vw.addTag!"INFO"("AF", "A", "Integer", "Number of Samples With Data"); 1217 vw.addHeaderLineRaw("##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>"); // @suppress(dscanner.style.long_line) 1218 vw.addHeaderLineRaw("##FILTER=<ID=q10,Description=\"Quality below 10\">"); 1219 1220 // Exercise header 1221 assert(vw.vcfhdr.nsamples == 0); 1222 vw.addSample("NA12878"); 1223 assert(vw.vcfhdr.nsamples == 1); 1224 1225 auto r = new VCFRecord(vw.vcfhdr, bcf_init1()); 1226 1227 r.chrom = "20"; 1228 assert(r.chrom == "20"); 1229 assertThrown(r.chrom = "chr20"); // headerline chromosome is "20" not "chr20" 1230 1231 r.pos = 999_999; 1232 assert(r.pos == 999_999); 1233 r.pos = 62_435_964 + 1; // Exceeds contig length 1234 1235 // Test ID field 1236 // note that in an empty freshly initialized bcf1_t, the ID field is "" rather than "." 1237 // Will consider adding initialization code 1238 //writefln("ID: %s", r.id); 1239 r.id = ""; 1240 assert(r.id == "."); 1241 r.id = "."; 1242 assert(r.id == "."); 1243 r.id = "rs001"; 1244 assert(r.id == "rs001"); 1245 r.addID("rs999"); 1246 assert(r.id == "rs001;rs999"); 1247 r.addID("rs001"); // test duplicate checking 1248 assert(r.id == "rs001;rs999"); 1249 1250 // Test REF/ALT allele setters and getters 1251 // many overloads to test for good coverage 1252 // Test matrix: Set {zero, one, two, three} alleles * {set by string, set by array} = 8 test cases 1253 // * also, there is setAlleles(ref, alt) and setAlleles(ref, alt1, ...) 1254 // * TODO: will also need to retreive alt alleles as array and string 1255 1256 string[] alleles_array; 1257 1258 // Zero alleles 1259 r.alleles(""); 1260 const auto zs = r.allelesAsArray; 1261 const auto zsr = r.refAllele; 1262 assert(zs.length == 0); 1263 assert(zsr == ""); 1264 1265 r.alleles(alleles_array); 1266 const auto za = r.allelesAsArray; 1267 const auto zar = r.refAllele; 1268 assert(za.length == 0); 1269 assert(zar == ""); 1270 1271 // One allele 1272 r.alleles("C"); 1273 const auto os = r.allelesAsArray; 1274 const auto osr = r.refAllele; 1275 assert(os.length == 1 && os[0] == "C"); 1276 assert(osr == "C"); 1277 1278 r.alleles(["C"]); 1279 const auto oa = r.allelesAsArray; 1280 const auto oar = r.refAllele; 1281 assert(oa.length == 1 && oa[0] == "C"); 1282 assert(oar == "C"); 1283 1284 // Two alleles 1285 r.alleles("C,T"); 1286 const auto ts = r.allelesAsArray; 1287 const auto tsr= r.refAllele; 1288 assert(ts == ["C", "T"]); 1289 assert(tsr== "C"); 1290 1291 r.alleles(["C", "T"]); 1292 const auto ta = r.allelesAsArray; 1293 const auto tar= r.refAllele; 1294 assert(ta == ["C", "T"]); 1295 assert(tar== "C"); 1296 1297 const taaa = r.altAllelesAsArray; 1298 const taas = r.altAllelesAsString; 1299 assert(taaa == ["T"]); 1300 assert(taas == "T"); 1301 1302 // alternate setter for >= 2 alleles 1303 r.setAlleles("A", "G"); 1304 assert(r.allelesAsArray == ["A", "G"]); 1305 1306 // Three alleles 1307 r.alleles("A,C,T"); 1308 assert(r.allelesAsArray == ["A", "C", "T"]); 1309 assert(r.refAllele == "A"); 1310 assert(r.altAllelesAsString == "C,T"); 1311 1312 // alternate the alleles for testing purposes 1313 r.alleles(["G", "A", "C"]); 1314 assert(r.allelesAsArray == ["G", "A", "C"]); 1315 assert(r.refAllele == "G"); 1316 assert(r.altAllelesAsString == "A,C"); 1317 1318 r.setAlleles("A", "C", "T"); 1319 assert(r.allelesAsArray == ["A", "C", "T"]); 1320 assert(r.refAllele == "A"); 1321 assert(r.altAllelesAsString == "C,T"); 1322 1323 1324 // Test QUAL 1325 r.qual = 1.0; 1326 assert(r.qual == 1.0); 1327 // now test setting qual without unpacking 1328 // TODO: see https://forum.dlang.org/post/hebouvswxlslqhovzaia@forum.dlang.org, once resolved (or once factory function written), 1329 // add template value param BCF_UN_STR 1330 auto rr = new VCFRecord(vw.vcfhdr, "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0.017\n"); // @suppress(dscanner.style.long_line) 1331 rr.qual = 3.0; 1332 assert(rr.qual == 3.0); 1333 1334 1335 // Test FILTER 1336 assert(r.hasFilter("PASS")); 1337 assert(r.hasFilter(".")); 1338 // add q10 filter (is in header) 1339 r.filter = "q10"; 1340 assert(r.filter == "q10"); 1341 assert(!r.hasFilter("PASS")); // if has another filter, no longer PASS (unless added explicitly) 1342 assert(!r.hasFilter(".")); // if has another filter, no longer has . (which is PASS, or no filter [spec conflict?]) 1343 1344 // add q30 filteR (not in header) 1345 r.filter = "q30"; 1346 assert(r.filter == "q10"); // i.e., unchanged 1347 1348 assert(r.hasFilter("q10")); 1349 assert(!r.hasFilter("q30")); 1350 1351 r.removeAllFilters(); 1352 assert(r.hasFilter(".")); 1353 r.addFilter("q10"); 1354 assert(r.hasFilter("q10")); 1355 r.removeFilter("q10"); 1356 assert(r.hasFilter("PASS")); 1357 1358 1359 // Finally, print the records: 1360 writefln("VCF records via toString:\n%s%s", r, rr); 1361 1362 writeln("\ndhtslib.vcf: all tests passed\n"); 1363 }