1 /** Module provides VCF Reader/Writer 2 3 VCF version 4.2 (including BCF) reader and writer, including 4 a model with convenience functions for the header (metadata) 5 and individual VCF/BCF records (rows). 6 7 Specifications: https://samtools.github.io/hts-specs/VCFv4.2.pdf 8 9 */ 10 module dhtslib.vcf.record; 11 12 import core.stdc.string; 13 import core.vararg; 14 15 import std.conv: to, ConvException; 16 import std.format: format; 17 import std.range; 18 import std.string: fromStringz, toStringz; 19 import std.utf : toUTFz; 20 import std.algorithm : map, filter, count; 21 import std.array : array; 22 import std.traits; 23 import std.meta : staticIndexOf; 24 25 import dhtslib.vcf; 26 import dhtslib.coordinates; 27 import dhtslib.memory; 28 import htslib.hts_log; 29 import htslib.kstring; 30 import htslib.vcf; 31 32 alias BCFRecord = VCFRecord; 33 34 /// Struct to aid in conversion of VCF info data 35 /// into D types 36 struct InfoField 37 { 38 /// String identifier of info field 39 string key; 40 /// BCF TYPE indicator 41 BcfRecordType type; 42 /// number of data elements 43 int len; 44 /// reference of info field data 45 ubyte[] data; 46 47 /// VCFRecord refct 48 Bcf1 line; 49 50 /// ctor from VCFRecord 51 this(string key, bcf_info_t * info, Bcf1 line) 52 { 53 this.line = line; 54 this(key, info); 55 } 56 57 /// Explicit postblit to avoid 58 /// https://github.com/blachlylab/dhtslib/issues/122 59 this(this) 60 { 61 this.line = line; 62 this.key = key; 63 this.type = type; 64 this.data = data; 65 } 66 67 /// string, info field ctor 68 this(string key, bcf_info_t * info) 69 { 70 /// get type 71 this.type = cast(BcfRecordType)(info.type & 0b1111); 72 /// get len 73 this.len = info.len; 74 /// copy data 75 this.data = info.vptr[0.. info.vptr_len]; 76 /// double check our lengths 77 debug{ 78 final switch(this.type){ 79 case BcfRecordType.Int8: 80 assert(data.length == this.len * byte.sizeof); 81 break; 82 case BcfRecordType.Int16: 83 assert(data.length == this.len * short.sizeof); 84 break; 85 case BcfRecordType.Int32: 86 assert(data.length == this.len * int.sizeof); 87 break; 88 case BcfRecordType.Int64: 89 assert(data.length == this.len * long.sizeof); 90 break; 91 case BcfRecordType.Float: 92 assert(data.length == this.len * float.sizeof); 93 break; 94 case BcfRecordType.Char: 95 assert(data.length == this.len * char.sizeof); 96 break; 97 case BcfRecordType.Null: 98 assert(data.length == 0); 99 } 100 } 101 } 102 103 /// convert to a D type array if int, float, or bool type array 104 T[] to(T: T[])() 105 if(isIntegral!T || is(T == float) || isBoolean!T) 106 { 107 static if(isIntegral!T){ 108 /// if we select the correct type just slice and return 109 if(this.type == cast(BcfRecordType) staticIndexOf!(T, RecordTypeToDType)) 110 return (cast(T *)this.data.ptr)[0 .. this.len]; 111 /// if we select type that is too small log error and return 112 if(RecordTypeSizes[this.type] > T.sizeof) 113 { 114 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 115 return []; 116 } 117 T[] ret; 118 /// if we select type that is bigger slice, convert and return 119 switch(this.type){ 120 case BcfRecordType.Int8: 121 ret = ((cast(byte *)this.data.ptr)[0 .. this.len]).to!(T[]); 122 break; 123 case BcfRecordType.Int16: 124 ret = ((cast(short *)this.data.ptr)[0 .. this.len]).to!(T[]); 125 break; 126 case BcfRecordType.Int32: 127 ret = ((cast(int *)this.data.ptr)[0 .. this.len]).to!(T[]); 128 break; 129 case BcfRecordType.Int64: 130 ret = ((cast(long *)this.data.ptr)[0 .. this.len]).to!(T[]); 131 break; 132 default: 133 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(this.type, T.stringof)); 134 return []; 135 } 136 return ret; 137 }else{ 138 /// if we select type the wrong type error and return 139 if(!(this.type == cast(BcfRecordType) staticIndexOf!(T, RecordTypeToDType))) 140 { 141 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 142 return []; 143 } 144 return (cast(T *)this.data.ptr)[0 .. this.len]; 145 } 146 } 147 148 /// convert to a D type if int, float, or bool type 149 T to(T)() 150 if(isIntegral!T || is(T == float) || isBoolean!T) 151 { 152 /// if bool return true 153 static if(isBoolean!T) 154 return true; 155 else{ 156 /// if info field has > 1 element error and return 157 if(this.len != 1){ 158 hts_log_error(__FUNCTION__, "This info field has a length of %d not 1".format(this.len)); 159 return T.init; 160 } 161 static if(isIntegral!T){ 162 /// if we select type that is too small log error and return 163 if(RecordTypeSizes[this.type] > T.sizeof) 164 { 165 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 166 return T.init; 167 } 168 /// just gonna use the array impl and grab index 0 169 return this.to!(T[])[0]; 170 }else{ 171 /// if we select type the wrong type error and return 172 if(!(this.type == cast(BcfRecordType) staticIndexOf!(T, RecordTypeToDType))) 173 { 174 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 175 return T.init; 176 } 177 /// just gonna use the array impl and grab index 0 178 return this.to!(T[])[0]; 179 } 180 } 181 } 182 183 /// convert to a string 184 T to(T)() 185 if(isSomeString!T) 186 { 187 /// if we select type the wrong type error and return 188 if(!(this.type == BcfRecordType.Char)) 189 { 190 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 191 return T.init; 192 } 193 /// otherwise slice and dup 194 return cast(T)(cast(char *)this.data.ptr)[0 .. this.len].dup; 195 } 196 } 197 198 /// Struct to aid in conversion of VCF format data 199 /// into D types 200 struct FormatField 201 { 202 /// String identifier of info field 203 string key; 204 /// BCF TYPE indicator 205 BcfRecordType type; 206 /// number of samples 207 int nSamples; 208 /// number of data elements per sample 209 int n; 210 /// number of bytes per sample 211 int size; 212 /// reference of info field data 213 ubyte[] data; 214 215 /// VCFRecord refct 216 Bcf1 line; 217 218 /// ctor from VCFRecord 219 this(string key, bcf_fmt_t * fmt, Bcf1 line) 220 { 221 this.line = line; 222 this(key, fmt); 223 } 224 225 /// Explicit postblit to avoid 226 /// https://github.com/blachlylab/dhtslib/issues/122 227 this(this) 228 { 229 this.line = line; 230 this.key = key; 231 this.type = type; 232 this.nSamples = nSamples; 233 this.n = n; 234 this.size = size; 235 this.data = data; 236 } 237 238 /// string and format ctor 239 this(string key, bcf_fmt_t * fmt) 240 { 241 /// set all our data 242 this.key = key; 243 this.type = cast(BcfRecordType)(fmt.type & 0b1111); 244 this.n = fmt.n; 245 this.nSamples = fmt.p_len / fmt.size; 246 this.data = fmt.p[0 .. fmt.p_len].dup; 247 this.size = fmt.size; 248 /// double check our work 249 debug{ 250 final switch(this.type){ 251 case BcfRecordType.Int8: 252 assert(data.length == this.nSamples * this.n * byte.sizeof); 253 break; 254 case BcfRecordType.Int16: 255 assert(data.length == this.nSamples * this.n * short.sizeof); 256 break; 257 case BcfRecordType.Int32: 258 assert(data.length == this.nSamples * this.n * int.sizeof); 259 break; 260 case BcfRecordType.Int64: 261 assert(data.length == this.nSamples * this.n * long.sizeof); 262 break; 263 case BcfRecordType.Float: 264 assert(data.length == this.nSamples * this.n * float.sizeof); 265 break; 266 case BcfRecordType.Char: 267 assert(data.length == this.nSamples * this.n * char.sizeof); 268 break; 269 case BcfRecordType.Null: 270 assert(data.length == 0); 271 } 272 } 273 } 274 275 /// convert to a D type array if int, float, bool, or string type array 276 /// very similar to InfoField.to 277 /// This returns chunks as we separated FORMAT values by sample 278 auto to(T)() 279 if(isIntegral!T || is(T == float) || isBoolean!T || isSomeString!T) 280 { 281 T[] ret; 282 static if(isIntegral!T){ 283 if(this.type == cast(BcfRecordType) staticIndexOf!(T, RecordTypeToDType)){ 284 ret = (cast(T *)this.data.ptr)[0 .. this.n * this.nSamples * T.sizeof]; 285 return ret.chunks(this.n); 286 } 287 if(RecordTypeSizes[this.type] > T.sizeof) 288 { 289 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 290 return ret.chunks(this.n); 291 } 292 switch(this.type){ 293 case BcfRecordType.Int8: 294 ret = ((cast(byte *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); 295 break; 296 case BcfRecordType.Int16: 297 ret = ((cast(short *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); 298 break; 299 case BcfRecordType.Int32: 300 ret = ((cast(int *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); 301 break; 302 case BcfRecordType.Int64: 303 ret = ((cast(long *)this.data.ptr)[0 .. this.n * this.nSamples]).to!(T[]); 304 break; 305 default: 306 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(this.type, T.stringof)); 307 return ret.chunks(this.n); 308 } 309 }else static if(isSomeString!T){ 310 if(!(this.type == cast(BcfRecordType) staticIndexOf!(T, RecordTypeToDType))) 311 { 312 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 313 return ret.chunks(this.n); 314 } 315 auto ptr = this.data.ptr; 316 for(auto i=0; i < nSamples; i++) 317 { 318 auto chrArr = (cast(char *)ptr)[i*size .. (i+1) * size]; 319 if(chrArr[$-1]) 320 ret ~= chrArr.idup; 321 else 322 ret ~= fromStringz(chrArr.ptr).idup; 323 } 324 }else{ 325 if(!(this.type == cast(BcfRecordType) staticIndexOf!(T, RecordTypeToDType))) 326 { 327 hts_log_error(__FUNCTION__, "Cannot convert %s to %s".format(type, T.stringof)); 328 return ret.chunks(this.n); 329 } 330 ret = (cast(T *)this.data.ptr)[0 .. this.n * this.nSamples]; 331 } 332 static if(isSomeString!T) 333 return ret.chunks(1); 334 else 335 return ret.chunks(this.n); 336 } 337 } 338 339 /// Represents individual GT values as encoded in BCF. 340 /// This is described in the VCF 4.2 spec section 6.3.3. 341 /// In summary it can be an int, short, or byte and 342 /// encodes an allele value and a phased flag 343 /// e.g (allele + 1) << 1 | phased 344 union GT(T) 345 { 346 import std.bitmanip : bitfields; 347 T raw; 348 mixin(bitfields!( 349 bool, "phased", 1, 350 T, "allele", (T.sizeof * 8) - 1, 351 )); 352 353 /// get allele value 354 auto getAllele() const { 355 return this.allele - 1; 356 } 357 358 /// set allele value 359 void setAllele(int val) { 360 this.allele = cast(T)(val + 1); 361 } 362 363 /// check if value is missing 364 bool isMissing() const { 365 return this.allele == 0; 366 } 367 368 /// set value as missing 369 void setMissing() { 370 this.allele = T(0); 371 } 372 373 /// check if value is padding 374 bool isPadding() { 375 static if (is(T == byte)) { 376 return cast(bool)((this.raw & 0xFE) == 0x80); 377 } else static if (is(T == short)) { 378 return cast(bool)((this.raw & 0xFFFE) == 0x8000); 379 } else static if (is(T == int)) { 380 return cast(bool)((this.raw & 0xFFFFFFFE) == 0x80000000); 381 } 382 } 383 384 } 385 386 /// Struct to represent VCF/BCF genotype data 387 struct Genotype 388 { 389 /// Type of underlying format data 390 BcfRecordType type; 391 /// Array of encoded genotypes 392 void[] data; 393 394 /// VCFRecord refct 395 Bcf1 line; 396 397 /// ctor from FormatField 398 this(FormatField gt, ulong sampleIdx) 399 { 400 this.line = gt.line; 401 this.type = gt.type; 402 final switch(this.type){ 403 case BcfRecordType.Int8: 404 this.data = gt.to!byte()[sampleIdx]; 405 break; 406 case BcfRecordType.Int16: 407 this.data = gt.to!short()[sampleIdx]; 408 break; 409 case BcfRecordType.Int32: 410 this.data = gt.to!int()[sampleIdx]; 411 break; 412 case BcfRecordType.Int64: 413 case BcfRecordType.Float: 414 case BcfRecordType.Char: 415 case BcfRecordType.Null: 416 assert(0); 417 } 418 } 419 420 /// Explicit postblit to avoid 421 /// https://github.com/blachlylab/dhtslib/issues/122 422 this(this) 423 { 424 this.type = type; 425 this.data = data; 426 this.line = line; 427 } 428 429 bool isNull() 430 { 431 final switch(this.type){ 432 case BcfRecordType.Int8: 433 return (cast(GT!byte[])(this.data)).filter!(x=>!x.isMissing).empty; 434 case BcfRecordType.Int16: 435 return (cast(GT!short[])(this.data)).filter!(x=>!x.isMissing).empty; 436 case BcfRecordType.Int32: 437 return (cast(GT!int[])(this.data)).filter!(x=>!x.isMissing).empty; 438 case BcfRecordType.Int64: 439 case BcfRecordType.Float: 440 case BcfRecordType.Char: 441 case BcfRecordType.Null: 442 assert(0); 443 } 444 } 445 446 auto alleles() 447 { 448 final switch(this.type){ 449 case BcfRecordType.Int8: 450 return (cast(GT!byte[])(this.data)).filter!(x=>!x.isPadding).map!(x=>x.getAllele).array; 451 case BcfRecordType.Int16: 452 return (cast(GT!short[])(this.data)).filter!(x=>!x.isPadding).map!(x=>x.getAllele).array; 453 case BcfRecordType.Int32: 454 return (cast(GT!int[])(this.data)).filter!(x=>!x.isPadding).map!(x=>x.getAllele).array; 455 case BcfRecordType.Int64: 456 case BcfRecordType.Float: 457 case BcfRecordType.Char: 458 case BcfRecordType.Null: 459 assert(0); 460 } 461 } 462 463 /// get genotype ploidy 464 auto getPloidy() { 465 final switch(this.type){ 466 case BcfRecordType.Int8: 467 return (cast(GT!byte[])(this.data)).filter!(x=>!x.isPadding).count; 468 case BcfRecordType.Int16: 469 return (cast(GT!short[])(this.data)).filter!(x=>!x.isPadding).count; 470 case BcfRecordType.Int32: 471 return (cast(GT!int[])(this.data)).filter!(x=>!x.isPadding).count; 472 case BcfRecordType.Int64: 473 case BcfRecordType.Float: 474 case BcfRecordType.Char: 475 case BcfRecordType.Null: 476 assert(0); 477 } 478 } 479 /// mixin for genotype printing 480 auto toString(T)() const { 481 string ret; 482 foreach(i, gt; cast(GT!T[]) this.data) { 483 if(i) 484 ret ~= ['/','|'][gt.phased]; 485 if(gt.isPadding) { 486 ret = ret[0..$-1]; 487 break; 488 } 489 if (gt.getAllele() == -1) { 490 ret ~= '.'; 491 } else { 492 ret ~= gt.getAllele().to!string; 493 } 494 } 495 return ret; 496 } 497 /// get string representation 498 auto toString() const { 499 final switch(this.type){ 500 case BcfRecordType.Int8: 501 return this.toString!byte; 502 case BcfRecordType.Int16: 503 return this.toString!short; 504 case BcfRecordType.Int32: 505 return this.toString!int; 506 case BcfRecordType.Int64: 507 case BcfRecordType.Float: 508 case BcfRecordType.Char: 509 case BcfRecordType.Null: 510 assert(0); 511 } 512 } 513 } 514 515 516 517 /** Wrapper around bcf1_t 518 519 Because it uses bcf1_t internally, it must conform to the BCF2 part 520 of the VCFv4.2 specs, rather than the loosey-goosey VCF specs. i.e., 521 INFO, CONTIG, FILTER records must exist in the header. 522 523 TODO: Does this need to be kept in a consistent state? 524 Ideally, VCFWriter would reject invalid ones, but we are informed 525 that it is invalid (e.g. if contig not found) while building this 526 struct; bcf_write1 will actually segfault, unfortunately. I'd like 527 to avoid expensive validate() calls for every record before writing 528 if possible, which means keeping this consistent. However, not 529 sure what to do if error occurs when using the setters herein? 530 531 2019-01-23 struct->class to mirror SAMRecord -- faster if reference type? 532 533 2019-01-23 WIP: getters for chrom, pos, id, ref, alt are complete (untested) 534 535 After parsing a BCF or VCF line, bcf1_t must be UnpackLeveled. (not applicable when building bcf1_t from scratch) 536 Depending on information needed, this can be done to various levels with performance tradeoff. 537 Unpacking symbols: 538 UNPACK.ALL: all 539 UNPACK.SHR: all shared information (UNPACK.ALT|UNPACK.FILT|UNPACK.INFO) 540 541 UnpackLevel.ALT 542 / UnpackLevel.FILT 543 | / UnpackLevel.INFO 544 | | / ____________________________ UnpackLevel.FMT 545 V V V / | | | 546 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 ... 547 548 */ 549 struct VCFRecord 550 { 551 Bcf1 line; /// htslib structured record TODO: change to 'b' for better internal consistency? (vcf.h/c actually use line quite a bit in fn params) 552 553 VCFHeader vcfheader; /// corresponding header (required); 554 555 /** VCFRecord 556 557 Construct a bcf/vcf record, backed by bcf1_t, from: an existing bcf1_t, parameters, or a VCF line. 558 559 Internal backing by bcf1_t means it must conform to the BCF2 rules -- i.e., header must contain 560 appropriate INFO, CONTIG, and FILTER lines. 561 562 Protip: specifying alternate MAX_UNPACK can speed it tremendously 563 as it will not UnpackLevel all fields, only up to those requested (see htslib.vcf) 564 For example, UnpackLevel.ALT is up to ALT inclusive, and UnpackLevel.ALT is up to FILTER 565 */ 566 this(T)(T h, bcf1_t *b, UnpackLevel MAX_UNPACK = UnpackLevel.None) 567 if(is(T == VCFHeader) || is(T == bcf_hdr_t *)) 568 { 569 static if (is(T == VCFHeader)) this.vcfheader = h; 570 else static if (is(T == bcf_hdr_t *)) this.vcfheader = VCFHeader(Bcf_hdr_t(h)); // double free() bug if we don't own bcf_hdr_t h 571 else static if (is(T == bcf_hdr_t)) assert(0); // ferret out situations that will lead to free() crashes 572 else assert(0); 573 574 this.line = Bcf1(b); 575 576 // Now it must be UNPACKed 577 // Protip: specifying alternate MAX_UNPACK can speed it tremendously 578 immutable int ret = bcf_unpack(this.line, MAX_UNPACK); // unsure what to do c̄ return value // @suppress(dscanner.suspicious.unused_variable) 579 } 580 /// ditto 581 this(SS)(VCFHeader vcfhdr, string chrom, int pos, string id, string _ref, string alt, float qual, SS filter, ) 582 if (isSomeString!SS || is(SS == string[])) 583 { 584 this.line = Bcf1(bcf_init1()); 585 this.vcfheader = vcfhdr; 586 587 this.chrom = chrom; 588 this.pos = pos; 589 this.id = id; 590 591 this.setAlleles(_ref, alt); 592 593 this.qual = qual; 594 this.filter = filter; 595 } 596 /// ditto 597 this(VCFHeader vcfhdr, string line, UnpackLevel MAX_UNPACK = UnpackLevel.None) 598 { 599 this.vcfheader = vcfhdr; 600 601 kstring_t kline; 602 auto dupline = line.dup; // slower, but safer than a cast // @suppress(dscanner.suspicious.unmodified) 603 604 kline.l = dupline.length; 605 kline.m = dupline.length; 606 kline.s = dupline.ptr; 607 608 this.line = Bcf1(bcf_init1()); 609 this.line.max_unpack = MAX_UNPACK; 610 611 auto ret = vcf_parse(&kline, this.vcfheader.hdr, this.line); 612 if (ret < 0) { 613 hts_log_error(__FUNCTION__, "vcf_parse returned < 0 -- code error or malformed VCF line"); 614 } else { 615 ret = bcf_unpack(this.line, MAX_UNPACK); // unsure what to do c̄ return value 616 } 617 } 618 619 /// Explicit postblit to avoid 620 /// https://github.com/blachlylab/dhtslib/issues/122 621 this(this) 622 { 623 this.line = line; 624 this.vcfheader = vcfheader; 625 } 626 627 /// ensure that vcf variable length data is unpacked to at least desired level 628 pragma(inline, true) 629 void unpack(UnpackLevel unpackLevel) 630 { 631 if (this.line.max_unpack < unpackLevel) bcf_unpack(this.line, unpackLevel); 632 } 633 634 //----- FIXED FIELDS -----// 635 636 /* CHROM */ 637 /// Get chromosome (CHROM) 638 @property 639 string chrom() 640 { 641 return fromStringz(bcf_hdr_id2name(this.vcfheader.hdr, this.line.rid)).idup; 642 } 643 /// Set chromosome (CHROM) 644 @property 645 void chrom(const(char)[] c) 646 { 647 auto rid = bcf_hdr_name2id(this.vcfheader.hdr, toStringz(c)); 648 if (rid == -1) { 649 hts_log_error(__FUNCTION__, format("contig not found: %s", c)); 650 throw new Exception("contig not found"); 651 } 652 else line.rid = rid; 653 } 654 655 656 /* POS */ 657 /** Get position (POS, column 2) 658 * 659 * NB: internally BCF is uzing 0 based coordinates; we only show +1 when printing a VCF line with toString (which calls vcf_format) 660 */ 661 @property 662 ZB pos() 663 out(coord) { assert(coord >= 0); } 664 do 665 { 666 return ZB(this.line.pos); 667 } 668 /// Set position (POS, column 2) 669 @property 670 void pos(ZB p) 671 in { assert(p >= 0); } 672 do 673 { 674 // TODO: should we check for pos >= 0 && pos < contig length? Could really hamper performance. 675 // TODO: if writing out the file with invalid POS values crashes htslib, will consider it 676 this.line.pos = p.pos; 677 } 678 679 680 /* ID */ 681 /// Get ID string 682 @property string id() 683 { 684 this.unpack(UnpackLevel.AltAllele); 685 return fromStringz(this.line.d.id).idup; 686 } 687 /// Sets new ID string; comma-separated list allowed but no dup checking performed 688 @property int id(const(char)[] id) 689 { 690 // bcf_update_id expects null pointer instead of empty string to mean "no value" 691 if (id == "") return bcf_update_id(this.vcfheader.hdr, this.line, null); 692 else return bcf_update_id(this.vcfheader.hdr, this.line, toStringz(id)); 693 } 694 /// Append an ID (column 3) to the record. 695 /// 696 /// NOTE: htslib performs duplicate checking 697 int addID(const(char)[] id) 698 { 699 if(id == "") return 0; 700 return bcf_add_id(this.vcfheader.hdr, this.line, toStringz(id)); 701 } 702 703 704 /* Alleles (REF, ALT) 705 706 Internally, alleles are stored as a \0 separated array: 707 [C \0 T \0 T T \0.. \0] 708 ^ ^ ^ ^ 709 | | L L ALT_1 = "TT" 710 | L ALT_0 711 L REF 712 713 TODO: Getters and setters poorly or inconsistently (?) named at this time or there are too many overloads? 714 TODO: need non-overwriting setter for ref and alt alleles 715 TODO: some of these may be inefficent; since they may be used in hot inner loops, pls optimize 716 */ 717 /// REF allele length 718 @property long refLen() 719 { 720 version(DigitalMars) pragma(inline); 721 version(LDC) pragma(inline, true); 722 version(GNU) pragma(inline, true); 723 return this.line.rlen; 724 } 725 726 /// Coordinate range of the reference allele 727 @property ZBHO coordinates() 728 { 729 return ZBHO(this.pos, this.pos + this.refLen); 730 } 731 732 /// All alleles getter (array) 733 @property string[] allelesAsArray() 734 { 735 this.unpack(UnpackLevel.AltAllele); 736 string[] ret; 737 ret.length = this.line.n_allele; // n=0, no reference; n=1, ref but no alt 738 foreach(int i; 0 .. this.line.n_allele) // ref allele is index 0 739 { 740 ret[i] = fromStringz(this.line.d.allele[i]).idup; 741 } 742 return ret; 743 } 744 /// Reference allele getter 745 @property string refAllele() 746 { 747 this.unpack(UnpackLevel.AltAllele); 748 if (this.line.n_allele < 1) return ""; // a valid record could have no ref (or alt) alleles 749 else return fromStringz(this.line.d.als).idup; 750 } 751 // NB WIP: there could be zero, or multiple alt alleles 752 /+@property string altAlleles() 753 { 754 if (!this.line.unpacked) bcf_unpack(this.line, UnpackLevel.ALT); 755 return fromStringz(this.line.d.als) 756 }+/ 757 /// Alternate alleles getter version 1: ["A", "ACTG", ...] 758 @property string[] altAllelesAsArray() 759 { 760 this.unpack(UnpackLevel.AltAllele); 761 762 string[] ret; 763 if (this.line.n_allele < 2) return ret; // n=0, no reference; n=1, ref but no alt 764 return this.allelesAsArray[1 .. $]; // trim off REF 765 } 766 /// Alternate alleles getter version 2: "A,ACTG,..." 767 @property string altAllelesAsString() 768 { 769 this.unpack(UnpackLevel.AltAllele); 770 771 string ret; 772 if (this.line.n_allele < 2) return ret; // n=0, no reference; n=1, ref but no alt 773 774 char[] tmp; 775 tmp.length = this.line.d.m_allele; // max allocated size in htslib 776 777 char *a = this.line.d.allele[1]; // ref allele is index 0 778 const char *last = this.line.d.allele[this.line.n_allele - 1]; // pointer to start of last allele 779 size_t i = 0; 780 781 assert( this.line.d.allele[1] == (this.line.d.als + this.line.rlen + 1) ); // ensue data structure is as we think (+1: past ref's term \0) 782 783 while (i < this.line.d.m_allele) // safety stop at max allocated size 784 { 785 if (*a) tmp[i] = *a; 786 else { // '\0' 787 if (a < last) tmp[i] = ','; 788 else break; 789 } 790 i++; 791 a++; 792 } 793 tmp.length = i; 794 795 return tmp.idup; 796 } 797 /// Set alleles; comma-separated list 798 @property void alleles(string a) 799 { 800 this.unpack(UnpackLevel.AltAllele); 801 if (a == "") { 802 this.line.n_allele = 0; 803 if (this.line.d.m_allele) this.line.d.als[0] = '\0'; // if storage allocated, zero out the REF 804 } 805 else 806 bcf_update_alleles_str(this.vcfheader.hdr, this.line, toStringz(a)); 807 } 808 /// Set alleles; array 809 @property void alleles(string[] a) 810 { 811 this.unpack(UnpackLevel.AltAllele); 812 if (a.length == 0) { 813 this.line.n_allele = 0; 814 if (this.line.d.m_allele) this.line.d.als[0] = '\0'; // if storage allocated, zero out the REF 815 } 816 else { 817 const(char)*[64] allelesp; // will fail if a locus has more than 63 ALT alleles :-O 818 foreach(i; 0 .. a.length ) { 819 // In order to zero out all alleles, pass a zero-length allele to the string version, 820 // or a zero-length array to this version. Zero length string member of nonempty allele array is an error. 821 if (a[i].length == 0) { 822 hts_log_error(__FUNCTION__, "Zero-length allele in nonempty allele array. Setting to '.'"); 823 allelesp[i] = ".".ptr; 824 } 825 else allelesp[i] = toStringz(a[i]); 826 } 827 bcf_update_alleles(this.vcfheader.hdr, this.line, &allelesp[0], cast(int)a.length); 828 } 829 } 830 831 /// Set REF allele only 832 void setRefAllele(string r) 833 { 834 // first, get REF 835 this.unpack(UnpackLevel.AltAllele); 836 // a valid record could have no ref (or alt) alleles 837 auto alleles = [toUTFz!(char *)(r)]; 838 if (this.line.n_allele < 2) // if none or 1 (=REF only), just add the REF we receieved 839 bcf_update_alleles(this.vcfheader.hdr, this.line, alleles.ptr, 1); 840 else { 841 // length of REF allele is allele[1] - allele[0], (minus one more for \0) 842 // TODO ** TODO ** : there is a property line.refLen already 843 const auto reflen = (this.line.d.allele[1] - this.line.d.allele[0]) - 1; 844 alleles ~= this.altAllelesAsArray.map!(toUTFz!(char *)).array; 845 bcf_update_alleles(this.vcfheader.hdr, this.line, alleles.ptr, cast(int)alleles.length); 846 } 847 } 848 /// Set alleles; alt can be comma separated 849 void setAlleles(string _ref, string alt) 850 { 851 immutable string alleles = _ref ~ "," ~ alt ~ "\0"; 852 bcf_update_alleles_str(this.vcfheader.hdr, this.line, alleles.ptr); 853 } 854 /// Set alleles; min. 2 alleles (ref, alt1); unlimited alts may be specified 855 void setAlleles(string _ref, string alt, ...) 856 { 857 string alleles = _ref ~ "," ~ alt; 858 859 for (int i = 0; i < _arguments.length; i++) 860 { 861 alleles ~= "," ~ va_arg!string(_argptr); 862 } 863 864 alleles ~= "\0"; 865 866 bcf_update_alleles_str(this.vcfheader.hdr, this.line, alleles.ptr); 867 } 868 869 870 /* Quality (QUAL) */ 871 /// Get variant quality (QUAL, column 6) 872 @property float qual() 873 out(result) { assert(result >= 0); } 874 do 875 { 876 return this.line.qual; 877 } 878 /// Set variant quality (QUAL, column 6) 879 @property void qual(float q) 880 in { assert(q >= 0); } 881 do { this.line.qual = q; } 882 883 884 /* FILTER */ 885 /// Get FILTER column (nothing in htslib sadly) 886 @property string filter() 887 { 888 const(char)[] ret; 889 890 this.unpack(UnpackLevel.Filter); 891 892 if (this.line.d.n_flt) { 893 for(int i; i< this.line.d.n_flt; i++) { 894 if (i) ret ~= ";"; 895 ret ~= fromStringz(this.vcfheader.hdr.id[BCF_DT_ID][ this.line.d.flt[0] ].key); 896 } 897 } else { 898 ret = "."; 899 } 900 901 return ret.idup; 902 } 903 /// Remove all entries in FILTER 904 void removeAllFilters() 905 { 906 // always returns zero 907 bcf_update_filter(this.vcfheader.hdr, this.line, null, 0); 908 } 909 /// Set the FILTER column to f 910 @property void filter(string f) 911 { 912 this.filter([f]); 913 } 914 /// Set the FILTER column to f0,f1,f2... 915 /// TODO: determine definitiely whether "." is replaced with "PASS" 916 @property void filter(string[] fs) 917 { 918 int[] fids; 919 foreach(f; fs) { 920 const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); 921 if (fid == -1){ 922 hts_log_warning(__FUNCTION__, format("ignoring filter not found in header: %s", f.empty() ? "(empty)" : f)); 923 continue; 924 } 925 else fids ~= fid; 926 } 927 if (fids.length > 0) 928 bcf_update_filter(this.vcfheader.hdr, this.line, fids.ptr, cast(int)fids.length); 929 else 930 hts_log_warning(__FUNCTION__, "No FILTER update was performed due to empty list"); 931 } 932 /// Add a filter; from htslib: 933 /// "If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed." 934 int addFilter(string f) 935 { 936 937 if(f == "") hts_log_warning(__FUNCTION__, "filter was empty"); 938 const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); 939 if (fid == -1){ 940 hts_log_warning(__FUNCTION__, format("ignoring filter not found in header: %s", f.empty() ? "(empty)" : f)); 941 return -1; 942 } 943 944 return bcf_add_filter(this.vcfheader.hdr, this.line, fid); 945 } 946 /// Remove a filter by name 947 int removeFilter(string f) 948 { 949 const int fid = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, toStringz(f)); 950 if (fid == -1){ 951 hts_log_warning(__FUNCTION__, format("filter not found in header (ignoring): %s", f) ); 952 return -1; 953 } 954 return removeFilter(fid); 955 } 956 /// Remove a filter by numeric id 957 int removeFilter(int fid) 958 { 959 return bcf_remove_filter(this.vcfheader.hdr, this.line, fid, 0); 960 } 961 /// Determine whether FILTER is present. log warning if filter does not exist. "PASS" and "." can be used interchangeably. 962 bool hasFilter(string filter) 963 { 964 char[] f = filter.dup ~ '\0'; 965 //const auto id = bcf_hdr_id2int(this.vcfheader.hdr, BCF_DT_ID, f.ptr); 966 const auto ret = bcf_has_filter(this.vcfheader.hdr, this.line, f.ptr); 967 968 if (ret > 0) return true; 969 else if (ret == 0) return false; 970 else { 971 hts_log_warning(__FUNCTION__, format("FILTER %s does not exist in the header", filter)); 972 return false; 973 } 974 } 975 976 977 /** Update INFO (pan-sample info; column 8) 978 * 979 * Add a tag:value to the INFO column 980 * NOTE: tag must already exist in the header 981 * 982 * Templated on data type, calls one of bcf_update_info_{int32,float,string,flag} 983 * Both singletons and arrays are supported. 984 */ 985 void addInfo(T)(string tag, T data) 986 if(isSomeString!T || ((isIntegral!T || isFloatingPoint!T || isBoolean!T) && !isArray!T)) 987 { 988 int ret = -1; 989 990 static if(isIntegral!T) { 991 auto integer = cast(int) data; 992 ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), &integer, 1); 993 } 994 995 else static if(isFloatingPoint!T) { 996 auto flt = cast(float) data; // simply passing "2.0" (or whatever) => data is a double 997 ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), &flt, 1); 998 } 999 1000 else static if(isSomeString!T) 1001 ret = bcf_update_info_string(this.vcfheader.hdr, this.line, toStringz(tag), toStringz(data)); 1002 1003 else static if(isBoolean!T) { 1004 immutable int set = data ? 1 : 0; // if data == true, pass 1 to bcf_update_info_flag(n=); n=0 => clear flag 1005 ret = bcf_update_info_flag(this.vcfheader.hdr, this.line, toStringz(tag), null, set); 1006 } 1007 1008 if (ret == -1) 1009 hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); 1010 } 1011 /// ditto 1012 void addInfo(T)(string tag, T data) 1013 if(isIntegral!(ElementType!T) || isFloatingPoint!(ElementType!T)) // otherwise string::immutable(char)[] will match the other template 1014 { 1015 int ret = -1; 1016 1017 static if(isIntegral!(ElementType!T)) { 1018 auto integer = data.to!(int[]); 1019 ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), integer.ptr, cast(int)data.length); 1020 } 1021 1022 else static if(isFloatingPoint!(ElementType!T)) { 1023 auto flt = data.to!(float[]); // simply passing "2.0" (or whatever) => data is a double 1024 ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), flt.ptr, cast(int)data.length); 1025 } 1026 1027 if (ret == -1) 1028 hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); 1029 } 1030 1031 /** Update FORMAT (sample info; column 9+) 1032 * 1033 * Templated on data type, calls one of bc_update_format_{int32,float,string,flag} 1034 */ 1035 void addFormat(T)(string tag, T data) 1036 if(!isArray!T || isSomeString!T) 1037 { 1038 int ret = -1; 1039 1040 static if(isIntegral!T) { 1041 auto integer = data.to!int; 1042 ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), &integer, 1); 1043 } 1044 1045 else static if(isFloatingPoint!T) { 1046 auto flt = data.to!float; // simply passing "2.0" (or whatever) => data is a double 1047 ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), &flt, 1); 1048 } 1049 1050 else static if(isSomeString!T){ 1051 auto strs = [data].map!(toUTFz!(char*)).array; 1052 ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), strs.ptr, 1); 1053 } 1054 1055 else static if(isBoolean!T) { 1056 immutable int set = data ? 1 : 0; // if data == true, pass 1 to bcf_update_info_flag(n=); n=0 => clear flag 1057 ret = bcf_update_format_flag(this.vcfheader.hdr, this.line, toStringz(tag), null, set); 1058 } 1059 1060 if (ret == -1) hts_log_warning(__FUNCTION__, format("Couldn't add format (ignoring): %s", data)); 1061 } 1062 /// ditto 1063 void addFormat(T)(string tag, T[] data) 1064 if((!isArray!T || isSomeString!T) && !is(T == immutable(char))) 1065 { 1066 int ret = -1; 1067 1068 static if(isIntegral!T) { 1069 auto integer = data.to!(int[]); 1070 ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), 1071 integer.ptr, cast(int)data.length); 1072 } 1073 1074 else static if(isFloatingPoint!T) { 1075 auto flt = data.to!(float[]); // simply passing "2.0" (or whatever) => data is a double 1076 ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), 1077 flt.ptr, cast(int)data.length); 1078 } 1079 1080 else static if(isSomeString!T){ 1081 auto strs = data.map!(toUTFz!(char*)).array; 1082 ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), strs.ptr, cast(int)strs.length); 1083 } 1084 1085 if (ret == -1) hts_log_warning(__FUNCTION__, format("Couldn't add format (ignoring): %s", data)); 1086 1087 } 1088 1089 /// remove an info section 1090 void removeInfo(string tag) 1091 { 1092 bcf_update_info(this.vcfheader.hdr,this.line, toStringz(tag),null,0,HeaderTypes.None); 1093 } 1094 1095 /// remove a format section 1096 void removeFormat(string tag) 1097 { 1098 bcf_update_format(this.vcfheader.hdr,this.line, toStringz(tag),null,0,HeaderTypes.None); 1099 } 1100 1101 1102 /// add INFO or FORMAT key:value pairs to a record 1103 /// add a single datapoint OR vector of values, OR, values to each sample (if lineType == FORMAT) 1104 void add(HeaderRecordType lineType, T)(const(char)[] tag, T data) 1105 if((lineType == HeaderRecordType.Info || lineType == HeaderRecordType.Format) && 1106 (isIntegral!T || isIntegral!(ElementType!T) || 1107 isFloatingPoint!T || isFloatingPoint!(ElementType!T) || 1108 isSomeString!T || isSomeString!(ElementType!T) || 1109 isBoolean!T || isBoolean!(ElementType!T))) 1110 { 1111 int ret = -1; 1112 int len; 1113 1114 static if (!isDynamicArray!T) { 1115 len = 1; 1116 static if (isIntegral!T) auto d = data.to!int; 1117 else static if (isFloatingPoint!T) auto d = data.to!float; 1118 else static if (isSomeString!T) auto d = toStringz(data); 1119 else static if (isBoolean!T) immutable int d = data ? 1 : 0; // if data == true, pass 1 to bcf_update_info_flag(n=); n=0 => clear flag 1120 else static assert(0); 1121 1122 const auto ptr = &d; 1123 } 1124 else static if (isDynamicArray!T) { 1125 assert(data.length < int.max); 1126 len = cast(int) data.length; 1127 static if (isIntegral!(ElementType!T)) auto d = data.to!(int[]); 1128 else static if (isFloatingPoint!(ElementType!T)) auto d = data.to!(float[]); 1129 else static if (isSomeString!(ElementType!T)) { 1130 char[] d; 1131 foreach(s; data) { 1132 d ~= s ~ "\0"; 1133 } 1134 if(d.length == 0) d ~= "\0"; // TODO replace with asserts on data length earlier 1135 } 1136 else static if (isBoolean!(ElementType!T)) { 1137 int[] d; 1138 foreach(b; data) { 1139 d ~= (b ? 1 : 0); 1140 } 1141 } 1142 1143 const auto ptr = d.ptr; 1144 } 1145 else static assert(0); 1146 1147 static if (lineType == HeaderRecordType.Info) { 1148 static if (is(Unqual!T == int) || is(Unqual!(ElementType!T) == int)) 1149 ret = bcf_update_info_int32(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 1150 else static if (is(Unqual!T == float) || is(Unqual!(ElementType!T) == float)) 1151 ret = bcf_update_info_float(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 1152 else static if (isSomeString!T || isSomeString!(ElementType!T)) 1153 ret = bcf_update_info_string(this.vcfheader.hdr, this.line, toStringz(tag), ptr); 1154 else static if (is(T == bool) || is(ElementType!T == bool)) 1155 ret = bcf_update_info_flag(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 1156 else static assert(0, "Type not recognized for INFO tag"); 1157 } 1158 else static if (lineType == HeaderRecordType.Format) { 1159 static if (is(Unqual!T == int) || is(Unqual!(ElementType!T) == int)) 1160 ret = bcf_update_format_int32(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 1161 else static if (is(Unqual!T == float) || is(Unqual!(ElementType!T) == float)) 1162 ret = bcf_update_format_float(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 1163 else static if (isSomeString!T || isSomeString!(ElementType!T)) 1164 ret = bcf_update_format_string(this.vcfheader.hdr, this.line, toStringz(tag), ptr, len); 1165 else static assert(0, "Type not recognized for FORMAT tag"); 1166 } 1167 else static assert(0); 1168 1169 if (ret == -1) 1170 hts_log_warning(__FUNCTION__, format("Couldn't add tag (ignoring): %s with value %s", tag, data)); 1171 } 1172 1173 /// returns a hashmap of info data by field 1174 InfoField[string] getInfos() 1175 { 1176 /// UnpackLevel 1177 this.unpack(UnpackLevel.Info); 1178 1179 /// copy some data 1180 InfoField[string] infoMap; 1181 bcf_info_t[] infos = this.line.d.info[0..this.line.n_info].dup; 1182 1183 foreach (bcf_info_t info; infos) 1184 { 1185 /// if variable data is null then value is set for deletion 1186 /// skip 1187 if(!info.vptr) continue; 1188 auto key = fromStringz(this.vcfheader.hdr.id[HeaderDictTypes.Id][info.key].key).idup; 1189 infoMap[key] = InfoField(key, &info, this.line); 1190 } 1191 return infoMap; 1192 } 1193 1194 /// returns a hashmap of format data by field 1195 FormatField[string] getFormats() 1196 { 1197 this.unpack(UnpackLevel.Format); 1198 FormatField[string] fmtMap; 1199 bcf_fmt_t[] fmts = this.line.d.fmt[0..this.line.n_fmt].dup; 1200 foreach (bcf_fmt_t fmt; fmts) 1201 { 1202 /// if variable data is null then value is set for deletion 1203 /// skip 1204 if(!fmt.p) continue; 1205 auto key = fromStringz(this.vcfheader.hdr.id[HeaderDictTypes.Id][fmt.id].key).idup; 1206 fmtMap[key] = FormatField(key, &fmt, this.line); 1207 } 1208 return fmtMap; 1209 } 1210 1211 Genotype getGenotype(int sampleIdx) { 1212 Genotype ret; 1213 if("GT" in this.getFormats() && sampleIdx < this.vcfheader.nsamples) { 1214 ret = Genotype(this.getFormats["GT"], sampleIdx); 1215 }else if (sampleIdx >= this.vcfheader.nsamples) { 1216 hts_log_error(__FUNCTION__, "sample idx is larger than nsamples"); 1217 }else{ 1218 hts_log_error(__FUNCTION__, "GT doesn't exist in FORMAT fields"); 1219 } 1220 return ret; 1221 } 1222 1223 Genotype[] getGenotypes() { 1224 Genotype[] ret; 1225 FormatField gt; 1226 if("GT" in this.getFormats()) { 1227 gt = this.getFormats["GT"]; 1228 }else{ 1229 hts_log_error(__FUNCTION__, "GT doesn't exist in FORMAT fields"); 1230 } 1231 for(auto i=0; i < this.vcfheader.nsamples; i++) 1232 { 1233 ret ~= Genotype(gt, i); 1234 } 1235 return ret; 1236 } 1237 1238 /// Return a string representation of the VCFRecord (i.e. as would appear in .vcf) 1239 /// 1240 /// As a bonus, there is a kstring_t memory leak 1241 string toString() const 1242 { 1243 kstring_t s; 1244 1245 const int ret = vcf_format(this.vcfheader.hdr, this.line, &s); 1246 if (ret) 1247 { 1248 hts_log_error(__FUNCTION__, 1249 format("vcf_format returned nonzero (%d) (likely EINVAL, invalid bcf1_t struct?)", ret)); 1250 return "[VCFRecord vcf_format parse_error]"; 1251 } 1252 1253 return cast(string) s.s[0 .. s.l]; 1254 } 1255 } 1256 1257 1258 1259 1260 1261 /// 1262 debug(dhtslib_unittest) 1263 unittest 1264 { 1265 import std.exception: assertThrown; 1266 import std.stdio: writeln, writefln; 1267 import dhtslib.vcf.writer; 1268 1269 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 1270 1271 1272 auto vw = VCFWriter("/dev/null", VCFWriterTypes.VCF); 1273 1274 vw.addHeaderLineRaw("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"); 1275 vw.addHeaderLineKV("INFO", "<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">"); 1276 vw.addHeaderLineKV("INFO", "<ID=DP2,Number=2,Type=Float,Description=\"Total Depth\">"); 1277 // ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> 1278 vw.vcfhdr.addHeaderLine!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data"); 1279 vw.addHeaderLineRaw("##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>"); // @suppress(dscanner.style.long_line) 1280 vw.addHeaderLineRaw("##FILTER=<ID=q10,Description=\"Quality below 10\">"); 1281 HeaderRecord hrec; 1282 hrec.setHeaderRecordType(HeaderRecordType.Format); 1283 hrec.setID("AF"); 1284 hrec.setLength(HeaderLengths.OnePerAltAllele); 1285 hrec.setValueType(HeaderTypes.Float); 1286 hrec.setDescription("Allele Freq"); 1287 vw.vcfhdr.addHeaderRecord(hrec); 1288 1289 assert(vw.vcfhdr.getHeaderRecord(HeaderRecordType.Format, "ID","AF")["ID"] == "AF"); 1290 vw.addHeaderLineRaw("##FORMAT=<ID=CH,Number=3,Type=String,Description=\"test\">"); 1291 1292 // Exercise header 1293 assert(vw.vcfhdr.nsamples == 0); 1294 vw.addSample("NA12878"); 1295 assert(vw.vcfhdr.nsamples == 1); 1296 1297 vw.writeHeader(); 1298 1299 auto r = new VCFRecord(vw.vcfhdr, bcf_init1()); 1300 1301 r.chrom = "20"; 1302 assert(r.chrom == "20"); 1303 assertThrown(r.chrom = "chr20"); // headerline chromosome is "20" not "chr20" 1304 1305 r.pos = Coordinate!(Basis.zero)(999_999); 1306 assert(r.pos == Coordinate!(Basis.zero)(999_999)); 1307 r.pos = Coordinate!(Basis.zero)(62_435_964 + 1); // Exceeds contig length 1308 1309 // Test ID field 1310 // note that in an empty freshly initialized bcf1_t, the ID field is "" rather than "." 1311 // Will consider adding initialization code 1312 //writefln("ID: %s", r.id); 1313 r.id = ""; 1314 assert(r.id == "."); 1315 r.id = "."; 1316 assert(r.id == "."); 1317 r.id = "rs001"; 1318 assert(r.id == "rs001"); 1319 r.addID("rs999"); 1320 assert(r.id == "rs001;rs999"); 1321 r.addID("rs001"); // test duplicate checking 1322 assert(r.id == "rs001;rs999"); 1323 1324 // Test REF/ALT allele setters and getters 1325 // many overloads to test for good coverage 1326 // Test matrix: Set {zero, one, two, three} alleles * {set by string, set by array} = 8 test cases 1327 // * also, there is setAlleles(ref, alt) and setAlleles(ref, alt1, ...) 1328 // * TODO: will also need to retreive alt alleles as array and string 1329 1330 string[] alleles_array; 1331 1332 // Zero alleles 1333 r.alleles(""); 1334 const auto zs = r.allelesAsArray; 1335 const auto zsr = r.refAllele; 1336 assert(zs.length == 0); 1337 assert(zsr == ""); 1338 1339 r.alleles(alleles_array); 1340 const auto za = r.allelesAsArray; 1341 const auto zar = r.refAllele; 1342 assert(za.length == 0); 1343 assert(zar == ""); 1344 1345 // One allele 1346 r.alleles("C"); 1347 const auto os = r.allelesAsArray; 1348 const auto osr = r.refAllele; 1349 assert(os.length == 1 && os[0] == "C"); 1350 assert(osr == "C"); 1351 1352 r.alleles(["C"]); 1353 const auto oa = r.allelesAsArray; 1354 const auto oar = r.refAllele; 1355 assert(oa.length == 1 && oa[0] == "C"); 1356 assert(oar == "C"); 1357 1358 // Two alleles 1359 r.alleles("C,T"); 1360 const auto ts = r.allelesAsArray; 1361 const auto tsr= r.refAllele; 1362 assert(ts == ["C", "T"]); 1363 assert(tsr== "C"); 1364 1365 r.alleles(["C", "T"]); 1366 const auto ta = r.allelesAsArray; 1367 const auto tar= r.refAllele; 1368 assert(ta == ["C", "T"]); 1369 assert(tar== "C"); 1370 1371 const taaa = r.altAllelesAsArray; 1372 const taas = r.altAllelesAsString; 1373 assert(taaa == ["T"]); 1374 assert(taas == "T"); 1375 1376 // alternate setter for >= 2 alleles 1377 r.setAlleles("A", "G"); 1378 assert(r.allelesAsArray == ["A", "G"]); 1379 1380 // Three alleles 1381 r.alleles("A,C,T"); 1382 assert(r.allelesAsArray == ["A", "C", "T"]); 1383 assert(r.refAllele == "A"); 1384 assert(r.altAllelesAsString == "C,T"); 1385 1386 // alternate the alleles for testing purposes 1387 r.alleles(["G", "A", "C"]); 1388 assert(r.allelesAsArray == ["G", "A", "C"]); 1389 assert(r.refAllele == "G"); 1390 assert(r.altAllelesAsString == "A,C"); 1391 1392 r.setAlleles("A", "C", "T"); 1393 assert(r.allelesAsArray == ["A", "C", "T"]); 1394 assert(r.refAllele == "A"); 1395 assert(r.altAllelesAsString == "C,T"); 1396 1397 r.setRefAllele("G"); 1398 assert(r.allelesAsArray == ["G", "C", "T"]); 1399 assert(r.refAllele == "G"); 1400 assert(r.altAllelesAsString == "C,T"); 1401 1402 r.setRefAllele("A"); 1403 assert(r.allelesAsArray == ["A", "C", "T"]); 1404 assert(r.refAllele == "A"); 1405 assert(r.altAllelesAsString == "C,T"); 1406 1407 1408 1409 // Test QUAL 1410 r.qual = 1.0; 1411 assert(r.qual == 1.0); 1412 // now test setting qual without UnpackLeveling 1413 // TODO: see https://forum.dlang.org/post/hebouvswxlslqhovzaia@forum.dlang.org, once resolved (or once factory function written), 1414 // add template value param UnpackLevel.ALT 1415 auto rr = new VCFRecord(vw.vcfhdr, "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0.017\n"); // @suppress(dscanner.style.long_line) 1416 rr.qual = 3.0; 1417 assert(rr.qual == 3.0); 1418 1419 1420 // Test FILTER 1421 assert(r.hasFilter("PASS")); 1422 assert(r.hasFilter(".")); 1423 // add q10 filter (is in header) 1424 r.filter = "q10"; 1425 assert(r.filter == "q10"); 1426 assert(!r.hasFilter("PASS")); // if has another filter, no longer PASS (unless added explicitly) 1427 assert(!r.hasFilter(".")); // if has another filter, no longer has . (which is PASS, or no filter [spec conflict?]) 1428 1429 // add q30 filteR (not in header) 1430 r.filter = "q30"; 1431 assert(r.filter == "q10"); // i.e., unchanged 1432 1433 assert(r.hasFilter("q10")); 1434 assert(!r.hasFilter("q30")); 1435 1436 r.removeAllFilters(); 1437 assert(r.hasFilter(".")); 1438 r.addFilter("q10"); 1439 assert(r.hasFilter("q10")); 1440 r.removeFilter("q10"); 1441 assert(r.hasFilter("PASS")); 1442 assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\n"); 1443 1444 r.addFormat("AF",[0.1, 0.5]); 1445 assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\tAF\t0.1,0.5\n"); 1446 1447 rr.addFormat("AF",[0.1]); 1448 writeln(rr.toString); 1449 assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0\tAF\t0.1\n"); 1450 1451 1452 assert(rr.getFormats["AF"].to!float[0][0] == 0.1f); 1453 1454 rr.addInfo("AF",0.5); 1455 assert(rr.getInfos["AF"].to!float == 0.5f); 1456 1457 rr.addInfo("DP2",[0.5f, 0.4f]); 1458 assert(rr.getInfos["DP2"].to!(float[]) == [0.5f, 0.4f]); 1459 1460 rr.removeInfo("AF"); 1461 1462 rr.removeFormat("AF"); 1463 1464 rr.addFormat("CH", "test"); 1465 1466 1467 writeln(rr.toString); 1468 writeln(rr.getFormats["CH"].to!string); 1469 assert(rr.getFormats["CH"].to!string[0][0] == "test"); 1470 1471 assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;DP2=0.5,0.4\tCH\ttest\n"); 1472 1473 assert(rr.coordinates == ZBHO(17329,17330)); 1474 vw.writeRecord(*r); 1475 vw.writeRecord(vw.vcfhdr.hdr, r.line); 1476 1477 1478 // Finally, print the records: 1479 writefln("VCF records via toString:\n%s%s", (*r), (*rr)); 1480 1481 writeln("\ndhtslib.vcf: all tests passed\n"); 1482 } 1483 1484 debug(dhtslib_unittest) unittest 1485 { 1486 import dhtslib.util; 1487 import std.stdio; 1488 import htslib.hts_log; 1489 import std.algorithm : map, count; 1490 import std.array : array; 1491 import std.path : buildPath, dirName; 1492 import std.math : approxEqual, isNaN; 1493 import dhtslib.tabix; 1494 import dhtslib.vcf; 1495 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 1496 hts_log_info(__FUNCTION__, "Testing VCFReader"); 1497 hts_log_info(__FUNCTION__, "Loading test file"); 1498 auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz"); 1499 auto tbx = TabixIndexedFile(fn); 1500 auto reg = getIntervalFromString("1:3000151-3062916"); 1501 auto vcf = VCFReader(tbx, reg.contig, reg.interval); 1502 assert(!vcf.empty); 1503 1504 VCFRecord rec = vcf.front; 1505 assert(rec.getInfos["AN"].to!int == 4); 1506 rec.addInfo("AN", rec.getInfos["AN"].to!int + 1); 1507 assert(rec.getInfos["AN"].to!int == 5); 1508 assert(rec.getInfos["AN"].to!byte == 5); 1509 assert(rec.getInfos["AN"].to!short == 5); 1510 assert(rec.getInfos["AN"].to!long == 5); 1511 1512 assert(rec.getInfos["AN"].to!uint == 5); 1513 assert(rec.getInfos["AN"].to!ubyte == 5); 1514 assert(rec.getInfos["AN"].to!ushort == 5); 1515 assert(rec.getInfos["AN"].to!ulong == 5); 1516 1517 assert(rec.getInfos["AN"].to!float.isNaN); 1518 1519 rec.addInfo("AN", 128); 1520 assert(rec.getInfos["AN"].to!int == 128); 1521 assert(rec.getInfos["AN"].to!byte == 0); 1522 assert(rec.getInfos["AN"].to!short == 128); 1523 assert(rec.getInfos["AN"].to!long == 128); 1524 1525 rec.addInfo("AN", 32768); 1526 assert(rec.getInfos["AN"].to!int == 32768); 1527 assert(rec.getInfos["AN"].to!byte == 0); 1528 assert(rec.getInfos["AN"].to!short == 0); 1529 assert(rec.getInfos["AN"].to!long == 32768); 1530 1531 assert(rec.getInfos["AN"].to!long == 32768); 1532 1533 // rec.addInfo("AN", 2147483648); 1534 // assert(rec.getInfos["AN"].to!int == 0); 1535 // assert(rec.getInfos["AN"].to!byte == 0); 1536 // assert(rec.getInfos["AN"].to!short == 0); 1537 // assert(rec.getInfos["AN"].to!long == 2147483648); 1538 1539 vcf.popFront; 1540 rec = vcf.front; 1541 assert(rec.refAllele == "GTTT"); 1542 assert(rec.getInfos["INDEL"].to!bool == true); 1543 1544 assert(rec.getInfos["DP4"].to!(int[]) == [1, 2, 3, 4]); 1545 assert(rec.getInfos["DP4"].to!(byte[]) == [1, 2, 3, 4]); 1546 assert(rec.getInfos["DP4"].to!(short[]) == [1, 2, 3, 4]); 1547 assert(rec.getInfos["DP4"].to!(long[]) == [1, 2, 3, 4]); 1548 1549 assert(rec.getInfos["DP4"].to!(uint[]) == [1, 2, 3, 4]); 1550 assert(rec.getInfos["DP4"].to!(ubyte[]) == [1, 2, 3, 4]); 1551 assert(rec.getInfos["DP4"].to!(ushort[]) == [1, 2, 3, 4]); 1552 assert(rec.getInfos["DP4"].to!(ulong[]) == [1, 2, 3, 4]); 1553 1554 assert(rec.getInfos["DP4"].to!(float[]) == []); 1555 rec.addInfo("DP4", [2, 2, 3, 4]); 1556 assert(rec.getInfos["DP4"].to!(int[]) == [2, 2, 3, 4]); 1557 1558 rec.addInfo("DP4", [128, 128, 128, 128]); 1559 assert(rec.getInfos["DP4"].to!(int[]) == [128, 128, 128, 128]); 1560 assert(rec.getInfos["DP4"].to!(byte[]) == []); 1561 assert(rec.getInfos["DP4"].to!(short[]) == [128, 128, 128, 128]); 1562 assert(rec.getInfos["DP4"].to!(long[]) == [128, 128, 128, 128]); 1563 1564 rec.addInfo("DP4", [32768, 32768, 32768, 32768]); 1565 assert(rec.getInfos["DP4"].to!(int[]) == [32768, 32768, 32768, 32768]); 1566 assert(rec.getInfos["DP4"].to!(byte[]) == []); 1567 assert(rec.getInfos["DP4"].to!(short[]) == []); 1568 assert(rec.getInfos["DP4"].to!(long[]) == [32768, 32768, 32768, 32768]); 1569 1570 1571 assert(rec.getFormats["GQ"].to!int.array == [[409], [409]]); 1572 1573 rec.addFormat("GQ", [32768,32768]); 1574 1575 assert(rec.getFormats["GQ"].to!byte.array == []); 1576 assert(rec.getFormats["GQ"].to!short.array == []); 1577 assert(rec.getFormats["GQ"].to!int[0][0] == 32768); 1578 1579 assert(rec.getFormats["GL"].to!float[1][1] == -5.0); 1580 1581 1582 assert(rec.getInfos["STR"].to!string == "test"); 1583 1584 assert(rec.getInfos["DP4"].to!string == ""); 1585 1586 assert(rec.getFormats["GT"].to!int.array == [[2, 4], [2, 4]]); 1587 assert(rec.getGenotypes.map!(x => x.toString).array == ["0/1", "0/1"]); 1588 1589 1590 vcf.popFront; 1591 rec = vcf.front; 1592 1593 auto fmts = rec.getFormats; 1594 auto sam = vcf.vcfhdr.getSampleId("A"); 1595 assert(fmts["GT"].to!int[sam] == [2, 4]); 1596 assert(rec.getGenotype(sam).toString == "0/1"); 1597 assert(rec.getGenotype(sam).getPloidy == 2); 1598 sam = vcf.vcfhdr.getSampleId("B"); 1599 assert(fmts["GT"].to!int[sam] == [6, -127]); 1600 assert(rec.getGenotype(sam).toString == "2"); 1601 assert(rec.getGenotype(sam).getPloidy == 1); 1602 }