1 module dhtslib.vcf.writer;
2 
3 import std.datetime;
4 import std.string: fromStringz, toStringz;
5 import std.traits: isArray, isDynamicArray, isBoolean, isIntegral, isFloatingPoint, isNumeric, isSomeString;
6 import std.conv: to, ConvException;
7 import std.format: format;
8 import std.parallelism : totalCPUs;
9 
10 import dhtslib.memory;
11 import dhtslib.vcf;
12 import htslib.vcf;
13 import htslib.hts_log;
14 import htslib.hfile;
15 import htslib.hts;
16 
17 alias BCFWriter = VCFWriter;
18 
19 /// VCF/BCF/Compressed VCF/ Uncompressed BCF on-disk format.
20 /// `DEDUCE` will attempt to auto-detect from filename or other means
21 enum VCFWriterTypes
22 {
23     VCF, /// Regular VCF
24     BCF, /// Compressed BCF
25     UBCF, /// Uncompressed BCF
26     CVCF, /// Compressed VCF (vcf.gz)
27     DEDUCE /// Determine based on file extension
28 }
29 
30 /** Basic support for writing VCF, BCF files */
31 struct VCFWriter
32 {
33     /// filename; as usable from D
34     string filename;
35 
36     /// filename \0-terminated C string; reference needed to avoid GC reaping result of toStringz when ctor goes out of scope
37     private const(char)* fn;
38 
39     // htslib data structures
40     VcfFile     fp;    /// rc htsFile wrapper
41     VCFHeader   vcfhdr;    /// header wrapper
42     Bcf1[]    rows;   /// individual records
43 
44     /// hFILE if required
45     private hFILE* f;
46 
47     @disable this();
48     /// open file or network resources for writing
49     /// setup incl header allocation
50     /// if mode==w:
51     ///     bcf_hdr_init automatically sets version string (##fileformat=VCFv4.2)
52     ///     bcf_hdr_init automatically adds the PASS filter
53     this(T)(T f, VCFWriterTypes t=VCFWriterTypes.DEDUCE)
54     if (is(T == string) || is(T == File))
55     {
56         auto hdr = VCFHeader( bcf_hdr_init("w\0"c.ptr));
57         hdr.addFiledate();
58         bcf_hdr_sync(hdr.hdr);
59         this(f, hdr, t);
60     }
61     /// setup and copy a header from another BCF/VCF as template
62     this(T, H)(T f, H header, VCFWriterTypes t=VCFWriterTypes.DEDUCE, int extra_threads = -1)
63     if((is(H == VCFHeader) || is(H == bcf_hdr_t*)) && (is(T == string) || is(T == File)))
64     {
65 
66         char[] mode;
67         if(t == VCFWriterTypes.BCF) mode=['w','b','\0'];
68         else if(t == VCFWriterTypes.UBCF) mode=['w','b','0','\0'];
69         else if(t == VCFWriterTypes.VCF) mode=['w','\0'];
70         else if(t == VCFWriterTypes.CVCF) mode=['w','z','\0'];
71         // open file
72         static if (is(T == string))
73         {
74             if(t == VCFWriterTypes.DEDUCE){
75                 import std.path:extension, stripExtension;
76                 auto ext=extension(f);
77                 if(ext==".bcf") mode=['w','b','\0'];
78                 else if(ext==".vcf") mode=['w','\0'];
79                 else if(ext==".cram") mode=['w','c','\0'];
80                 else if(ext==".gz") {
81                     auto extRemoved = stripExtension(f);
82                     if (extension(extRemoved) == ".vcf") mode=['w','z','\0'];
83                     else {
84                         hts_log_error(__FUNCTION__,"extension "~extension(extRemoved)~ext~" not valid");
85                         throw new Exception("DEDUCE VCFWriterType used with non-valid extension");
86                     }
87                 }
88                 else {
89                     hts_log_error(__FUNCTION__,"extension "~ext~" not valid");
90                     throw new Exception("DEDUCE VCFWriterType used with non-valid extension");
91                 }
92             }
93             this.filename = f;
94             this.fn = toStringz(f);
95 
96             if (f == "") throw new Exception("Empty filename passed to VCFWriter constructor");
97             this.fp = vcf_open(toStringz(f), mode.ptr);
98             if (!this.fp) throw new Exception("Could not hts_open file");
99         }
100         else static if (is(T == File))
101         {
102             assert(t!=VCFWriterTypes.DEDUCE);
103             this.filename = f.name();
104             this.fn = toStringz(f.name);
105             this.f = hdopen(f.fileno, cast(immutable(char)*) "w");
106             this.fp = hts_hopen(this.f, this.fn, mode.ptr);
107         }
108         else assert(0);
109 
110         if (extra_threads == -1)
111         {
112             if ( totalCPUs > 1)
113             {
114                 hts_log_info(__FUNCTION__,
115                         format("%d CPU cores detected; enabling multithreading", totalCPUs));
116                 // hts_set_threads adds N _EXTRA_ threads, so totalCPUs - 1 seemed reasonable,
117                 // but overcomitting by 1 thread (i.e., passing totalCPUs) buys an extra 3% on my 2-core 2013 Mac
118                 hts_set_threads(this.fp, totalCPUs);
119             }
120         } else if (extra_threads > 0)
121         {
122             if ((extra_threads + 1) > totalCPUs)
123                 hts_log_warning(__FUNCTION__, "More threads requested than CPU cores detected");
124             hts_set_threads(this.fp, extra_threads);
125         }
126         else if (extra_threads == 0)
127         {
128             hts_log_debug(__FUNCTION__, "Zero extra threads requested");
129         }
130 
131         static if(is(H == VCFHeader*)) { this.vcfhdr      = VCFHeader( bcf_hdr_dup(header.hdr) ); }
132         else static if(is(H == VCFHeader)) { this.vcfhdr      = VCFHeader( bcf_hdr_dup(header.hdr)); }
133         else static if(is(H == bcf_hdr_t*)) { this.vcfhdr = VCFHeader( bcf_hdr_dup(header) ); }
134         else assert(0);
135     }
136 
137     /// Explicit postblit to avoid 
138     /// https://github.com/blachlylab/dhtslib/issues/122
139     this(this)
140     {
141         this.fp = fp;
142         this.vcfhdr = vcfhdr;
143         this.rows = rows;
144     }
145 
146     VCFHeader getHeader()
147     {
148         return this.vcfhdr;
149     }
150 
151     /// Add sample to this VCF
152     /// * int bcf_hdr_add_sample(bcf_hdr_t *hdr, const(char) *sample);
153     deprecated("use VCFHeader methods instead")
154     int addSample(string name)
155     in { assert(name != ""); }
156     do
157     {
158         return this.vcfhdr.addSample(name);
159     }
160 
161     deprecated("use VCFHeader methods instead")
162     /// Add a new header line
163     int addHeaderLineKV(string key, string value)
164     {
165         return this.vcfhdr.addHeaderLineKV(key, value);
166     }
167 
168     deprecated("use VCFHeader methods instead")
169     /// Add a new header line -- must be formatted ##key=value
170     int addHeaderLineRaw(string line)
171     {
172         return this.vcfhdr.addHeaderLineRaw(line);
173     }
174 
175     deprecated("use VCFHeader methods instead")
176     /// Add a filedate= headerline, which is not called out specifically in  the spec,
177     /// but appears in the spec's example files. We could consider allowing a param here.
178     int addFiledate()
179     {
180         return this.vcfhdr.addFiledate;
181     }
182     
183     /** Add INFO (§1.2.2) or FORMAT (§1.2.4) tag
184 
185     The INFO tag describes row-specific keys used in the INFO column;
186     The FORMAT tag describes sample-specific keys used in the last, and optional, genotype column.
187 
188     Template parameter: string; must be INFO or FORMAT
189 
190     The first four parameters are required; NUMBER and TYPE have specific allowable values.
191     source and version are optional, but recommended (for INFO only).
192 
193     *   id:     ID tag
194     *   number: NUMBER tag; here a string because it can also take special values {A,R,G,.} (see §1.2.2)
195     *   type:   Integer, Float, Flag, Character, and String
196     *   description: Text description; will be double quoted
197     *   source:      Annotation source  (eg dbSNP)
198     *   version:     Annotation version (eg 142)
199     */
200     deprecated("use VCFHeader methods instead")
201     void addTag(HeaderRecordType tagType)( string id,
202                                     HeaderLengths number,
203                                     HeaderTypes type,
204                                     string description,
205                                     string source="",
206                                     string _version="")
207     if(tagType == HeaderRecordType.Info || tagType == HeaderRecordType.Format)
208     {
209         this.vcfhdr.addHeaderLine!(tagType)(id, number, type, description, source, _version);
210     }
211 
212     /** Add FILTER tag (§1.2.3) */
213     deprecated("use VCFHeader methods instead")
214     void addTag(HeaderRecordType tagType)(string id, string description)
215     if(tagType == HeaderRecordType.Filter)
216     {
217         this.vcfhdr.addHeaderLine!(tagType)(id, description);
218     }
219 
220     /** Add FILTER tag (§1.2.3) */
221     deprecated("use VCFHeader methods instead")
222     void addFilterTag(string id, string description)
223     {
224         this.vcfhdr.addFilter(id, description);
225     }
226 
227     /** Add contig definition (§1.2.7) to header meta-info 
228     
229         other: "url=...,md5=...,etc."
230     */
231     deprecated("use VCFHeader methods instead")
232     auto addTag(HeaderRecordType tagType)(const(char)[] id, const int length = 0, string other = "")
233     if(tagType == HeaderRecordType.Contig)
234     {
235         return this.vcfhdr.addTag!(tagType)(id, length, other);
236     }
237     
238     /**
239         Add a record
240 
241         alleles:    comma-separated string, including ref allele
242         qual:       a float, but accepts any numeric
243     */
244     int addRecord(S, N)(S contig, int pos, S id, S alleles, N qual, S[] filters)
245     if ( (isSomeString!S || is(S == char*) ) && isNumeric!N)
246     {        
247         Bcf1 line = Bcf1(bcf_init1());
248 
249         line.rid = bcf_hdr_name2id(this.vcfhdr.hdr, toStringz(contig));
250         if (line.rid == -1) hts_log_error(__FUNCTION__, "contig not found");
251 
252         line.pos = pos;
253 
254         bcf_update_id(this.vcfhdr.hdr, line, (id == "" ? null : toStringz(id)) );  // TODO: could support >1 id with array as with the filters
255 
256         bcf_update_alleles_str(this.vcfhdr.hdr, line, toStringz(alleles));
257 
258         line.qual = qual;
259 
260         // Update filter(s); if/else blocks for speed
261         if(filters.length == 0)
262         {
263             int pass = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz("PASS"c));
264             bcf_update_filter(this.vcfhdr.hdr, line, &pass, 1);
265         }
266         else if(filters.length == 1)
267         {
268             int fid = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz(filters[0]));
269             if(fid == -1) hts_log_error(__FUNCTION__, format("filter not found (ignoring): ", filters[0]) );
270             bcf_update_filter(this.vcfhdr.hdr, line, &fid, 1);
271         }
272         else    // TODO: factor out the check for -1 into a safe_update_filter or something
273         {
274             int[] filter_ids;
275             foreach(f; filters) {
276                 const int fid = bcf_hdr_id2int(this.vcfhdr.hdr, BCF_DT_ID, toStringz(f));
277                 if(fid == -1) hts_log_error(__FUNCTION__, format("filter not found (ignoring): ", f) );
278                 else filter_ids ~= fid;
279             }
280             bcf_update_filter(this.vcfhdr.hdr, line, filter_ids.ptr, cast(int)filter_ids.length );
281         }
282 
283         // Add a record
284         /+
285         // .. FORMAT
286         bcf_hdr_append(hdr, "##FORMAT=<ID=TF,Number=1,Type=Float,Description=\"Test Float\">");
287         float[4] test;
288         bcf_float_set_missing(test[0]);
289         test[1] = 47.11f;
290         bcf_float_set_vector_end(test[2]);
291         writeln("pre update format float");
292         bcf_update_format_float(this.vcfhdr.hdr, line, toStringz("TF"), &test[0], 4);
293         +/
294         int tmpi = 1;
295         bcf_update_info_int32(this.vcfhdr.hdr, line, toStringz("NS"c), &tmpi, 1);
296 
297         // Add the actual sample
298         int[4] dp = [ 9000, 1, 2, 3];
299         bcf_update_format(this.vcfhdr.hdr, line, toStringz("DP"c), &dp, 1, BCF_HT_INT);
300         //int dp = 9000;
301         //bcf_update_format_int32(this.vcfhdr.hdr, line, toStringz("DP"c), &dp, 1);
302         //auto f = new float;
303         //*f = 1.0;
304         //bcf_update_format_float(this.vcfhdr.hdr, line, toStringz("XF"c), f, 1);
305 
306         this.rows ~= line;
307 
308         return 0;
309     }
310 
311     /// as expected
312     int writeHeader()
313     {
314         return bcf_hdr_write(this.fp, this.vcfhdr.hdr);
315     }
316     /// as expected
317     int writeRecord(ref VCFRecord r)
318     {
319         const ret = bcf_write(this.fp, this.vcfhdr.hdr, r.line);
320         if (ret != 0) hts_log_error(__FUNCTION__, "bcf_write error");
321         return ret;
322     }
323     /// as expected
324     int writeRecord(bcf_hdr_t *hdr, bcf1_t *rec)
325     {
326         hts_log_warning(__FUNCTION__, "pre call");
327         const ret = bcf_write(this.fp, hdr, rec);
328         hts_log_warning(__FUNCTION__, "post call");
329         if (ret != 0) hts_log_error(__FUNCTION__, "bcf_write error");
330         return ret;
331     }
332 }
333 
334 ///
335 debug(dhtslib_unittest)
336 unittest
337 {
338     import std.exception: assertThrown;
339     import std.stdio: writeln, writefln;
340     import dhtslib.vcf.writer;
341 
342     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
343 
344 
345     auto vw = VCFWriter("/dev/null", VCFWriterTypes.VCF);
346 
347     vw.addHeaderLineRaw("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
348     vw.addHeaderLineKV("INFO", "<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
349     // ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
350     vw.addTag!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data");
351     vw.addTag!(HeaderRecordType.Filter)("filt","test");
352     vw.addFilterTag("filt2","test2");
353 
354     assert(vw.getHeader.getHeaderRecord(HeaderRecordType.Filter, "filt2").getDescription == "\"test2\"");
355     vw.writeHeader();
356 }
357 
358 debug(dhtslib_unittest) unittest
359 {
360     import std.stdio;
361     import htslib.hts_log;
362     import std.algorithm : map, count;
363     import std.array : array;
364     import std.path : buildPath, dirName;
365     import std.math : approxEqual;
366     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
367     hts_log_info(__FUNCTION__, "Testing VCFWriterTypes DEDUCE");
368     hts_log_info(__FUNCTION__, "Loading test file");
369     {
370         auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
371         auto vcfw = VCFWriter("/tmp/test_vcf.vcf", vcf.vcfhdr);
372         
373         vcfw.writeHeader;
374         foreach(rec;vcf) {
375             vcfw.writeRecord(rec);
376         }
377         destroy(vcfw);
378     }
379     {
380         auto vcf = VCFReader("/tmp/test_vcf.vcf");
381         assert(vcf.count == 14);
382         vcf = VCFReader("/tmp/test_vcf.vcf");
383 
384         VCFRecord rec = vcf.front;
385         assert(rec.chrom == "1");
386         assert(rec.pos == 3000149);
387         assert(rec.refAllele == "C");
388         assert(rec.altAllelesAsArray == ["T"]);
389         assert(rec.allelesAsArray == ["C","T"]);
390         assert(approxEqual(rec.qual,59.2));
391         assert(rec.filter == "PASS");
392     }
393     {
394         auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
395         auto vcfw = VCFWriter("/tmp/test_vcf.bcf", vcf.vcfhdr);
396         
397         vcfw.writeHeader;
398         foreach(rec;vcf) {
399             vcfw.writeRecord(rec);
400         }
401         destroy(vcfw);
402     }
403     {
404         auto vcf = VCFReader("/tmp/test_vcf.bcf");
405         assert(vcf.count == 14);
406         vcf = VCFReader("/tmp/test_vcf.bcf");
407         
408         VCFRecord rec = vcf.front;
409         assert(rec.chrom == "1");
410         assert(rec.pos == 3000149);
411         assert(rec.refAllele == "C");
412         assert(rec.altAllelesAsArray == ["T"]);
413         assert(rec.allelesAsArray == ["C","T"]);
414         assert(approxEqual(rec.qual,59.2));
415         assert(rec.filter == "PASS");
416     }
417     {
418         auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
419         auto vcfw = VCFWriter("/tmp/test_vcf.vcf.gz", vcf.vcfhdr);
420         
421         vcfw.writeHeader;
422         foreach(rec;vcf) {
423             vcfw.writeRecord(rec);
424         }
425         destroy(vcfw);
426     }
427     {
428         auto vcf = VCFReader("/tmp/test_vcf.vcf.gz");
429         assert(vcf.count == 14);
430         vcf = VCFReader("/tmp/test_vcf.vcf.gz");
431         
432         VCFRecord rec = vcf.front;
433         assert(rec.chrom == "1");
434         assert(rec.pos == 3000149);
435         assert(rec.refAllele == "C");
436         assert(rec.altAllelesAsArray == ["T"]);
437         assert(rec.allelesAsArray == ["C","T"]);
438         assert(approxEqual(rec.qual,59.2));
439         assert(rec.filter == "PASS");
440     }
441 }
442 
443 debug(dhtslib_unittest) unittest
444 {
445     import std.stdio;
446     import htslib.hts_log;
447     import std.algorithm : map, count;
448     import std.array : array;
449     import std.path : buildPath, dirName;
450     import std.math : approxEqual;
451     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
452     hts_log_info(__FUNCTION__, "Testing VCFWriterTypes");
453     hts_log_info(__FUNCTION__, "Loading test file");
454     {
455         auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
456         auto vcfw = VCFWriter("/tmp/test_vcf.cvcf", vcf.vcfhdr, VCFWriterTypes.CVCF);
457         
458         vcfw.writeHeader;
459         foreach(rec;vcf) {
460             vcfw.writeRecord(rec);
461         }
462         destroy(vcfw);
463     }
464     {
465         auto vcf = VCFReader("/tmp/test_vcf.cvcf");
466         assert(vcf.count == 14);
467         vcf = VCFReader("/tmp/test_vcf.cvcf");
468 
469         VCFRecord rec = vcf.front;
470         assert(rec.chrom == "1");
471         assert(rec.pos == 3000149);
472         assert(rec.refAllele == "C");
473         assert(rec.altAllelesAsArray == ["T"]);
474         assert(rec.allelesAsArray == ["C","T"]);
475         assert(approxEqual(rec.qual,59.2));
476         assert(rec.filter == "PASS");
477     }
478     {
479         auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
480         auto vcfw = VCFWriter("/tmp/test_vcf.ubcf", vcf.vcfhdr, VCFWriterTypes.UBCF);
481         
482         vcfw.writeHeader;
483         foreach(rec;vcf) {
484             vcfw.writeRecord(rec);
485         }
486         destroy(vcfw);
487     }
488     {
489         auto vcf = VCFReader("/tmp/test_vcf.ubcf");
490         assert(vcf.count == 14);
491         vcf = VCFReader("/tmp/test_vcf.ubcf");
492         
493         VCFRecord rec = vcf.front;
494         assert(rec.chrom == "1");
495         assert(rec.pos == 3000149);
496         assert(rec.refAllele == "C");
497         assert(rec.altAllelesAsArray == ["T"]);
498         assert(rec.allelesAsArray == ["C","T"]);
499         assert(approxEqual(rec.qual,59.2));
500         assert(rec.filter == "PASS");
501     }
502     {
503         auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
504         auto vcfw = VCFWriter("/tmp/test_vcf.txt", vcf.vcfhdr, VCFWriterTypes.VCF);
505         
506         vcfw.writeHeader;
507         foreach(rec;vcf) {
508             vcfw.writeRecord(rec);
509         }
510         destroy(vcfw);
511     }
512     {
513         auto vcf = VCFReader("/tmp/test_vcf.txt");
514         assert(vcf.count == 14);
515         vcf = VCFReader("/tmp/test_vcf.txt");
516         
517         VCFRecord rec = vcf.front;
518         assert(rec.chrom == "1");
519         assert(rec.pos == 3000149);
520         assert(rec.refAllele == "C");
521         assert(rec.altAllelesAsArray == ["T"]);
522         assert(rec.allelesAsArray == ["C","T"]);
523         assert(approxEqual(rec.qual,59.2));
524         assert(rec.filter == "PASS");
525     }
526 }