1 module dhtslib.vcf.writer; 2 3 import std.datetime; 4 import std.string: fromStringz, toStringz; 5 import std.traits: isArray, isDynamicArray, isBoolean, isIntegral, isFloatingPoint, isNumeric, isSomeString; 6 import std.conv: to, ConvException; 7 import std.format: format; 8 import std.parallelism : totalCPUs; 9 10 import dhtslib.memory; 11 import dhtslib.vcf; 12 import htslib.vcf; 13 import htslib.hts_log; 14 import htslib.hfile; 15 import htslib.hts; 16 17 alias BCFWriter = VCFWriter; 18 19 /// VCF/BCF/Compressed VCF/ Uncompressed BCF on-disk format. 20 /// `DEDUCE` will attempt to auto-detect from filename or other means 21 enum VCFWriterTypes 22 { 23 VCF, /// Regular VCF 24 BCF, /// Compressed BCF 25 UBCF, /// Uncompressed BCF 26 CVCF, /// Compressed VCF (vcf.gz) 27 DEDUCE /// Determine based on file extension 28 } 29 30 /** Basic support for writing VCF, BCF files */ 31 struct VCFWriter 32 { 33 /// filename; as usable from D 34 string filename; 35 36 /// filename \0-terminated C string; reference needed to avoid GC reaping result of toStringz when ctor goes out of scope 37 private const(char)* fn; 38 39 // htslib data structures 40 VcfFile fp; /// rc htsFile wrapper 41 VCFHeader vcfhdr; /// header wrapper 42 Bcf1[] rows; /// individual records 43 44 /// hFILE if required 45 private hFILE* f; 46 47 @disable this(); 48 /// open file or network resources for writing 49 /// setup incl header allocation 50 /// if mode==w: 51 /// bcf_hdr_init automatically sets version string (##fileformat=VCFv4.2) 52 /// bcf_hdr_init automatically adds the PASS filter 53 this(T)(T f, VCFWriterTypes t=VCFWriterTypes.DEDUCE) 54 if (is(T == string) || is(T == File)) 55 { 56 auto hdr = VCFHeader( bcf_hdr_init("w\0"c.ptr)); 57 hdr.addFiledate(); 58 bcf_hdr_sync(hdr.hdr); 59 this(f, hdr, t); 60 } 61 /// setup and copy a header from another BCF/VCF as template 62 this(T, H)(T f, H header, VCFWriterTypes t=VCFWriterTypes.DEDUCE, int extra_threads = -1) 63 if((is(H == VCFHeader) || is(H == bcf_hdr_t*)) && (is(T == string) || is(T == File))) 64 { 65 66 char[] mode; 67 if(t == VCFWriterTypes.BCF) mode=['w','b','\0']; 68 else if(t == VCFWriterTypes.UBCF) mode=['w','b','0','\0']; 69 else if(t == VCFWriterTypes.VCF) mode=['w','\0']; 70 else if(t == VCFWriterTypes.CVCF) mode=['w','z','\0']; 71 // open file 72 static if (is(T == string)) 73 { 74 if(t == VCFWriterTypes.DEDUCE){ 75 import std.path:extension, stripExtension; 76 auto ext=extension(f); 77 if(ext==".bcf") mode=['w','b','\0']; 78 else if(ext==".vcf") mode=['w','\0']; 79 else if(ext==".cram") mode=['w','c','\0']; 80 else if(ext==".gz") { 81 auto extRemoved = stripExtension(f); 82 if (extension(extRemoved) == ".vcf") mode=['w','z','\0']; 83 else { 84 hts_log_error(__FUNCTION__,"extension "~extension(extRemoved)~ext~" not valid"); 85 throw new Exception("DEDUCE VCFWriterType used with non-valid extension"); 86 } 87 } 88 else { 89 hts_log_error(__FUNCTION__,"extension "~ext~" not valid"); 90 throw new Exception("DEDUCE VCFWriterType used with non-valid extension"); 91 } 92 } 93 this.filename = f; 94 this.fn = toStringz(f); 95 96 if (f == "") throw new Exception("Empty filename passed to VCFWriter constructor"); 97 this.fp = vcf_open(toStringz(f), mode.ptr); 98 if (!this.fp) throw new Exception("Could not hts_open file"); 99 } 100 else static if (is(T == File)) 101 { 102 assert(t!=VCFWriterTypes.DEDUCE); 103 this.filename = f.name(); 104 this.fn = toStringz(f.name); 105 this.f = hdopen(f.fileno, cast(immutable(char)*) "w"); 106 this.fp = hts_hopen(this.f, this.fn, mode.ptr); 107 } 108 else assert(0); 109 110 if (extra_threads == -1) 111 { 112 if ( totalCPUs > 1) 113 { 114 hts_log_info(__FUNCTION__, 115 format("%d CPU cores detected; enabling multithreading", totalCPUs)); 116 // hts_set_threads adds N _EXTRA_ threads, so totalCPUs - 1 seemed reasonable, 117 // but overcomitting by 1 thread (i.e., passing totalCPUs) buys an extra 3% on my 2-core 2013 Mac 118 hts_set_threads(this.fp, totalCPUs); 119 } 120 } else if (extra_threads > 0) 121 { 122 if ((extra_threads + 1) > totalCPUs) 123 hts_log_warning(__FUNCTION__, "More threads requested than CPU cores detected"); 124 hts_set_threads(this.fp, extra_threads); 125 } 126 else if (extra_threads == 0) 127 { 128 hts_log_debug(__FUNCTION__, "Zero extra threads requested"); 129 } 130 131 static if(is(H == VCFHeader*)) { this.vcfhdr = VCFHeader( bcf_hdr_dup(header.hdr) ); } 132 else static if(is(H == VCFHeader)) { this.vcfhdr = VCFHeader( bcf_hdr_dup(header.hdr)); } 133 else static if(is(H == bcf_hdr_t*)) { this.vcfhdr = VCFHeader( bcf_hdr_dup(header) ); } 134 else assert(0); 135 } 136 137 /// Explicit postblit to avoid 138 /// https://github.com/blachlylab/dhtslib/issues/122 139 this(this) 140 { 141 this.fp = fp; 142 this.vcfhdr = vcfhdr; 143 this.rows = rows; 144 } 145 146 VCFHeader getHeader() 147 { 148 return this.vcfhdr; 149 } 150 151 /// Add sample to this VCF 152 /// * int bcf_hdr_add_sample(bcf_hdr_t *hdr, const(char) *sample); 153 deprecated("use VCFHeader methods instead") 154 int addSample(string name) 155 in { assert(name != ""); } 156 do 157 { 158 return this.vcfhdr.addSample(name); 159 } 160 161 deprecated("use VCFHeader methods instead") 162 /// Add a new header line 163 int addHeaderLineKV(string key, string value) 164 { 165 return this.vcfhdr.addHeaderLineKV(key, value); 166 } 167 168 deprecated("use VCFHeader methods instead") 169 /// Add a new header line -- must be formatted ##key=value 170 int addHeaderLineRaw(string line) 171 { 172 return this.vcfhdr.addHeaderLineRaw(line); 173 } 174 175 deprecated("use VCFHeader methods instead") 176 /// Add a filedate= headerline, which is not called out specifically in the spec, 177 /// but appears in the spec's example files. We could consider allowing a param here. 178 int addFiledate() 179 { 180 return this.vcfhdr.addFiledate; 181 } 182 183 /** Add INFO (§1.2.2) or FORMAT (§1.2.4) tag 184 185 The INFO tag describes row-specific keys used in the INFO column; 186 The FORMAT tag describes sample-specific keys used in the last, and optional, genotype column. 187 188 Template parameter: string; must be INFO or FORMAT 189 190 The first four parameters are required; NUMBER and TYPE have specific allowable values. 191 source and version are optional, but recommended (for INFO only). 192 193 * id: ID tag 194 * number: NUMBER tag; here a string because it can also take special values {A,R,G,.} (see §1.2.2) 195 * type: Integer, Float, Flag, Character, and String 196 * description: Text description; will be double quoted 197 * source: Annotation source (eg dbSNP) 198 * version: Annotation version (eg 142) 199 */ 200 deprecated("use VCFHeader methods instead") 201 void addTag(HeaderRecordType tagType)( string id, 202 HeaderLengths number, 203 HeaderTypes type, 204 string description, 205 string source="", 206 string _version="") 207 if(tagType == HeaderRecordType.Info || tagType == HeaderRecordType.Format) 208 { 209 this.vcfhdr.addHeaderLine!(tagType)(id, number, type, description, source, _version); 210 } 211 212 /** Add FILTER tag (§1.2.3) */ 213 deprecated("use VCFHeader methods instead") 214 void addTag(HeaderRecordType tagType)(string id, string description) 215 if(tagType == HeaderRecordType.Filter) 216 { 217 this.vcfhdr.addHeaderLine!(tagType)(id, description); 218 } 219 220 /** Add FILTER tag (§1.2.3) */ 221 deprecated("use VCFHeader methods instead") 222 void addFilterTag(string id, string description) 223 { 224 this.vcfhdr.addFilter(id, description); 225 } 226 227 /** Add contig definition (§1.2.7) to header meta-info 228 229 other: "url=...,md5=...,etc." 230 */ 231 deprecated("use VCFHeader methods instead") 232 auto addTag(HeaderRecordType tagType)(const(char)[] id, const int length = 0, string other = "") 233 if(tagType == HeaderRecordType.Contig) 234 { 235 return this.vcfhdr.addTag!(tagType)(id, length, other); 236 } 237 238 /** 239 Add a record 240 241 alleles: comma-separated string, including ref allele 242 qual: a float, but accepts any numeric 243 */ 244 int addRecord(S, N)(S contig, int pos, S id, S alleles, N qual, S[] filters) 245 if ( (isSomeString!S || is(S == char*) ) && isNumeric!N) 246 { 247 Bcf1 line = Bcf1(bcf_init1()); 248 249 line.rid = bcf_hdr_name2id(this.vcfhdr.hdr, toStringz(contig)); 250 if (line.rid == -1) hts_log_error(__FUNCTION__, "contig not found"); 251 252 line.pos = pos; 253 254 bcf_update_id(this.vcfhdr.hdr, line, (id == "" ? null : toStringz(id)) ); // TODO: could support >1 id with array as with the filters 255 256 bcf_update_alleles_str(this.vcfhdr.hdr, line, toStringz(alleles)); 257 258 line.qual = qual; 259 260 // Update filter(s); if/else blocks for speed 261 if(filters.length == 0) 262 { 263 int pass = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz("PASS"c)); 264 bcf_update_filter(this.vcfhdr.hdr, line, &pass, 1); 265 } 266 else if(filters.length == 1) 267 { 268 int fid = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz(filters[0])); 269 if(fid == -1) hts_log_error(__FUNCTION__, format("filter not found (ignoring): ", filters[0]) ); 270 bcf_update_filter(this.vcfhdr.hdr, line, &fid, 1); 271 } 272 else // TODO: factor out the check for -1 into a safe_update_filter or something 273 { 274 int[] filter_ids; 275 foreach(f; filters) { 276 const int fid = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz(f)); 277 if(fid == -1) hts_log_error(__FUNCTION__, format("filter not found (ignoring): ", f) ); 278 else filter_ids ~= fid; 279 } 280 bcf_update_filter(this.vcfhdr.hdr, line, filter_ids.ptr, cast(int)filter_ids.length ); 281 } 282 283 // Add a record 284 /+ 285 // .. FORMAT 286 bcf_hdr_append(hdr, "##FORMAT=<ID=TF,Number=1,Type=Float,Description=\"Test Float\">"); 287 float[4] test; 288 bcf_float_set_missing(test[0]); 289 test[1] = 47.11f; 290 bcf_float_set_vector_end(test[2]); 291 writeln("pre update format float"); 292 bcf_update_format_float(this.vcfhdr.hdr, line, toStringz("TF"), &test[0], 4); 293 +/ 294 int tmpi = 1; 295 bcf_update_info_int32(this.vcfhdr.hdr, line, toStringz("NS"c), &tmpi, 1); 296 297 // Add the actual sample 298 int[4] dp = [ 9000, 1, 2, 3]; 299 bcf_update_format(this.vcfhdr.hdr, line, toStringz("DP"c), &dp, 1, BCF_HT_INT); 300 //int dp = 9000; 301 //bcf_update_format_int32(this.vcfhdr.hdr, line, toStringz("DP"c), &dp, 1); 302 //auto f = new float; 303 //*f = 1.0; 304 //bcf_update_format_float(this.vcfhdr.hdr, line, toStringz("XF"c), f, 1); 305 306 this.rows ~= line; 307 308 return 0; 309 } 310 311 /// as expected 312 int writeHeader() 313 { 314 return bcf_hdr_write(this.fp, this.vcfhdr.hdr); 315 } 316 /// as expected 317 int writeRecord(ref VCFRecord r) 318 { 319 const ret = bcf_write(this.fp, this.vcfhdr.hdr, r.line); 320 if (ret != 0) hts_log_error(__FUNCTION__, "bcf_write error"); 321 return ret; 322 } 323 /// as expected 324 int writeRecord(bcf_hdr_t *hdr, bcf1_t *rec) 325 { 326 hts_log_warning(__FUNCTION__, "pre call"); 327 const ret = bcf_write(this.fp, hdr, rec); 328 hts_log_warning(__FUNCTION__, "post call"); 329 if (ret != 0) hts_log_error(__FUNCTION__, "bcf_write error"); 330 return ret; 331 } 332 } 333 334 /// 335 debug(dhtslib_unittest) 336 unittest 337 { 338 import std.exception: assertThrown; 339 import std.stdio: writeln, writefln; 340 import dhtslib.vcf.writer; 341 342 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 343 344 345 auto vw = VCFWriter("/dev/null", VCFWriterTypes.VCF); 346 347 vw.addHeaderLineRaw("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"); 348 vw.addHeaderLineKV("INFO", "<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">"); 349 // ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> 350 vw.addTag!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data"); 351 vw.addTag!(HeaderRecordType.Filter)("filt","test"); 352 vw.addFilterTag("filt2","test2"); 353 354 assert(vw.getHeader.getHeaderRecord(HeaderRecordType.Filter, "filt2").getDescription == "\"test2\""); 355 vw.writeHeader(); 356 } 357 358 debug(dhtslib_unittest) unittest 359 { 360 import std.stdio; 361 import htslib.hts_log; 362 import std.algorithm : map, count; 363 import std.array : array; 364 import std.path : buildPath, dirName; 365 import std.math : approxEqual; 366 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 367 hts_log_info(__FUNCTION__, "Testing VCFWriterTypes DEDUCE"); 368 hts_log_info(__FUNCTION__, "Loading test file"); 369 { 370 auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 371 auto vcfw = VCFWriter("/tmp/test_vcf.vcf", vcf.vcfhdr); 372 373 vcfw.writeHeader; 374 foreach(rec;vcf) { 375 vcfw.writeRecord(rec); 376 } 377 destroy(vcfw); 378 } 379 { 380 auto vcf = VCFReader("/tmp/test_vcf.vcf"); 381 assert(vcf.count == 14); 382 vcf = VCFReader("/tmp/test_vcf.vcf"); 383 384 VCFRecord rec = vcf.front; 385 assert(rec.chrom == "1"); 386 assert(rec.pos == 3000149); 387 assert(rec.refAllele == "C"); 388 assert(rec.altAllelesAsArray == ["T"]); 389 assert(rec.allelesAsArray == ["C","T"]); 390 assert(approxEqual(rec.qual,59.2)); 391 assert(rec.filter == "PASS"); 392 } 393 { 394 auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 395 auto vcfw = VCFWriter("/tmp/test_vcf.bcf", vcf.vcfhdr); 396 397 vcfw.writeHeader; 398 foreach(rec;vcf) { 399 vcfw.writeRecord(rec); 400 } 401 destroy(vcfw); 402 } 403 { 404 auto vcf = VCFReader("/tmp/test_vcf.bcf"); 405 assert(vcf.count == 14); 406 vcf = VCFReader("/tmp/test_vcf.bcf"); 407 408 VCFRecord rec = vcf.front; 409 assert(rec.chrom == "1"); 410 assert(rec.pos == 3000149); 411 assert(rec.refAllele == "C"); 412 assert(rec.altAllelesAsArray == ["T"]); 413 assert(rec.allelesAsArray == ["C","T"]); 414 assert(approxEqual(rec.qual,59.2)); 415 assert(rec.filter == "PASS"); 416 } 417 { 418 auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 419 auto vcfw = VCFWriter("/tmp/test_vcf.vcf.gz", vcf.vcfhdr); 420 421 vcfw.writeHeader; 422 foreach(rec;vcf) { 423 vcfw.writeRecord(rec); 424 } 425 destroy(vcfw); 426 } 427 { 428 auto vcf = VCFReader("/tmp/test_vcf.vcf.gz"); 429 assert(vcf.count == 14); 430 vcf = VCFReader("/tmp/test_vcf.vcf.gz"); 431 432 VCFRecord rec = vcf.front; 433 assert(rec.chrom == "1"); 434 assert(rec.pos == 3000149); 435 assert(rec.refAllele == "C"); 436 assert(rec.altAllelesAsArray == ["T"]); 437 assert(rec.allelesAsArray == ["C","T"]); 438 assert(approxEqual(rec.qual,59.2)); 439 assert(rec.filter == "PASS"); 440 } 441 } 442 443 debug(dhtslib_unittest) unittest 444 { 445 import std.stdio; 446 import htslib.hts_log; 447 import std.algorithm : map, count; 448 import std.array : array; 449 import std.path : buildPath, dirName; 450 import std.math : approxEqual; 451 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 452 hts_log_info(__FUNCTION__, "Testing VCFWriterTypes"); 453 hts_log_info(__FUNCTION__, "Loading test file"); 454 { 455 auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 456 auto vcfw = VCFWriter("/tmp/test_vcf.cvcf", vcf.vcfhdr, VCFWriterTypes.CVCF); 457 458 vcfw.writeHeader; 459 foreach(rec;vcf) { 460 vcfw.writeRecord(rec); 461 } 462 destroy(vcfw); 463 } 464 { 465 auto vcf = VCFReader("/tmp/test_vcf.cvcf"); 466 assert(vcf.count == 14); 467 vcf = VCFReader("/tmp/test_vcf.cvcf"); 468 469 VCFRecord rec = vcf.front; 470 assert(rec.chrom == "1"); 471 assert(rec.pos == 3000149); 472 assert(rec.refAllele == "C"); 473 assert(rec.altAllelesAsArray == ["T"]); 474 assert(rec.allelesAsArray == ["C","T"]); 475 assert(approxEqual(rec.qual,59.2)); 476 assert(rec.filter == "PASS"); 477 } 478 { 479 auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 480 auto vcfw = VCFWriter("/tmp/test_vcf.ubcf", vcf.vcfhdr, VCFWriterTypes.UBCF); 481 482 vcfw.writeHeader; 483 foreach(rec;vcf) { 484 vcfw.writeRecord(rec); 485 } 486 destroy(vcfw); 487 } 488 { 489 auto vcf = VCFReader("/tmp/test_vcf.ubcf"); 490 assert(vcf.count == 14); 491 vcf = VCFReader("/tmp/test_vcf.ubcf"); 492 493 VCFRecord rec = vcf.front; 494 assert(rec.chrom == "1"); 495 assert(rec.pos == 3000149); 496 assert(rec.refAllele == "C"); 497 assert(rec.altAllelesAsArray == ["T"]); 498 assert(rec.allelesAsArray == ["C","T"]); 499 assert(approxEqual(rec.qual,59.2)); 500 assert(rec.filter == "PASS"); 501 } 502 { 503 auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 504 auto vcfw = VCFWriter("/tmp/test_vcf.txt", vcf.vcfhdr, VCFWriterTypes.VCF); 505 506 vcfw.writeHeader; 507 foreach(rec;vcf) { 508 vcfw.writeRecord(rec); 509 } 510 destroy(vcfw); 511 } 512 { 513 auto vcf = VCFReader("/tmp/test_vcf.txt"); 514 assert(vcf.count == 14); 515 vcf = VCFReader("/tmp/test_vcf.txt"); 516 517 VCFRecord rec = vcf.front; 518 assert(rec.chrom == "1"); 519 assert(rec.pos == 3000149); 520 assert(rec.refAllele == "C"); 521 assert(rec.altAllelesAsArray == ["T"]); 522 assert(rec.allelesAsArray == ["C","T"]); 523 assert(approxEqual(rec.qual,59.2)); 524 assert(rec.filter == "PASS"); 525 } 526 }