1 module dhtslib.vcf.reader;
2 
3 import std.string: fromStringz, toStringz;
4 import std.utf: toUTFz;
5 import std.traits : ReturnType;
6 
7 import dhtslib.vcf;
8 import dhtslib.tabix;
9 import dhtslib.coordinates;
10 import dhtslib.memory;
11 import dhtslib.util;
12 import htslib.vcf;
13 import htslib.hts_log;
14 import htslib.kstring;
15 
16 alias BCFReader = VCFReader;
17 
18 auto VCFReader(string fn, UnpackLevel MAX_UNPACK = UnpackLevel.None)
19 {
20     return VCFReaderImpl!(CoordSystem.zbc, false)(fn, MAX_UNPACK);
21 }
22 
23 auto VCFReader(CoordSystem cs)(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, UnpackLevel MAX_UNPACK = UnpackLevel.None)
24 {
25     return VCFReaderImpl!(cs,true)(tbxFile, chrom, coords, MAX_UNPACK);
26 }
27 
28 /** Basic support for reading VCF, BCF files */
29 struct VCFReaderImpl(CoordSystem cs, bool isTabixFile)
30 {
31     // htslib data structures
32     VcfFile     fp;    /// rc htsFile wrapper
33     VCFHeader   vcfhdr;    /// rc header wrapper
34     Bcf1 b;          /// rc bcf1_t wrapper, record for use in iterator, will be recycled
35     UnpackLevel MAX_UNPACK;     /// see htslib.vcf
36 
37 
38     private bool initialized;
39     private int success;
40 
41     @disable this();
42     static if(isTabixFile){
43 
44         TabixIndexedFile tbx; /// For tabix use
45         ReturnType!(this.initializeTabixRange) tbxRange; /// For tabix use
46 
47         /// TabixIndexedFile and coordinates ctor
48         this(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, UnpackLevel MAX_UNPACK = UnpackLevel.None)
49         {
50             this.tbx = tbxFile;
51             this.tbxRange = initializeTabixRange(chrom, coords);
52             this.tbxRange.empty;
53             /// read the header from TabixIndexedFile
54             bcf_hdr_t * hdrPtr = bcf_hdr_init(toUTFz!(char *)("r"));
55             auto err = bcf_hdr_parse(hdrPtr, toUTFz!(char *)(this.tbx.header));
56             this.vcfhdr = VCFHeader(hdrPtr);
57             
58             this.b = Bcf1(bcf_init1());
59             this.b.max_unpack = MAX_UNPACK;
60             this.MAX_UNPACK = MAX_UNPACK;
61             this.empty;
62         }
63 
64         /// copy the TabixIndexedFile.region range
65         auto initializeTabixRange(string chrom, Interval!cs coords)
66         {
67             return this.tbx.region(chrom, coords);
68         }
69     }else{
70         /// read existing VCF file
71         /// MAX_UNPACK: setting alternate value could speed reading
72         this(string fn, UnpackLevel MAX_UNPACK = UnpackLevel.None)
73         {
74             import htslib.hts : hts_set_threads;
75 
76             assert(!isTabixFile);
77             if (fn == "") throw new Exception("Empty filename passed to VCFReader constructor");
78             this.fp = VcfFile(vcf_open(toStringz(fn), "r"c.ptr));
79             if (!this.fp) throw new Exception("Could not hts_open file");
80             
81             hts_set_threads(this.fp, 1);    // extra decoding thread
82 
83             this.vcfhdr = VCFHeader(bcf_hdr_read(this.fp));
84 
85             this.b = Bcf1(bcf_init1());
86             this.b.max_unpack = MAX_UNPACK;
87             this.MAX_UNPACK = MAX_UNPACK;
88             this.empty;
89         }
90     }
91 
92     invariant
93     {
94         assert(this.vcfhdr.hdr != null);
95     }
96 
97     VCFHeader getHeader()
98     {
99         return this.vcfhdr;
100     }
101 
102     /// InputRange interface: iterate over all records
103     @property bool empty()
104     {
105         static if(isTabixFile){
106             if(this.tbxRange.empty) return true;
107         }
108         if (!this.initialized) {
109             this.popFront();
110             this.initialized = true;
111         }
112         if (success >= 0)
113             return false;
114         else if (success == -1)
115             return true;
116         else
117         {
118             static if(isTabixFile)
119                 hts_log_error(__FUNCTION__, "*** ERROR vcf_parse < -1");
120             else
121                 hts_log_error(__FUNCTION__, "*** ERROR bcf_read < -1");
122             return true;
123         }
124         
125     }
126     /// ditto
127     void popFront()
128     {
129         //     int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v);
130         // documentation claims returns -1 on critical errors, 0 otherwise
131         // however it looks like -1 is EOF and -2 is critical errors?
132         static if(isTabixFile){
133             if (this.initialized) this.tbxRange.popFront;
134             auto str = this.tbxRange.front;
135             kstring_t kstr;
136             kputs(toUTFz!(char *)(str), &kstr);
137             this.success = vcf_parse(&kstr, this.vcfhdr.hdr, this.b);
138             
139         }else
140             this.success = bcf_read(this.fp, this.vcfhdr.hdr, this.b);
141 
142     }
143     /// ditto
144     VCFRecord front()
145     {
146         // note that VCFRecord's constructor will be responsible for
147         // * UnpackLeveling and
148         // * destroying
149         // its copy
150         auto copiedBcf_1t = bcf_dup(this.b);
151         return VCFRecord(this.vcfhdr, copiedBcf_1t, this.MAX_UNPACK);
152     }
153 
154     typeof(this) save()
155     {
156         typeof(this) newRange = this;
157         static if(isTabixFile){
158             newRange.tbx = tbx; /// For tabix use
159             newRange.tbxRange = tbxRange.save; /// For tabix use
160         }else{
161             newRange.fp = HtsFile(copyHtsFile(this.fp));
162         }
163         newRange.vcfhdr = vcfhdr;
164         newRange.b = Bcf1(bcf_dup(this.b));
165         newRange.initialized = initialized;
166         newRange.success = success;
167         return newRange;
168     }
169 }
170 
171 debug(dhtslib_unittest) unittest
172 {
173     import std.stdio;
174     import htslib.hts_log;
175     import std.algorithm : map, count;
176     import std.array : array;
177     import std.path : buildPath, dirName;
178     import std.math : approxEqual;
179     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
180     hts_log_info(__FUNCTION__, "Testing VCFReader (Tabix)");
181     hts_log_info(__FUNCTION__, "Loading test file");
182 
183     auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
184     assert(vcf.count == 14);
185     vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
186     VCFRecord rec = vcf.front;
187     assert(rec.chrom == "1");
188     assert(rec.pos == 3000149);
189     assert(rec.refAllele == "C");
190     assert(rec.altAllelesAsArray == ["T"]);
191     assert(rec.allelesAsArray == ["C","T"]);
192     assert(approxEqual(rec.qual,59.2));
193     assert(rec.filter == "PASS");
194     
195     vcf.popFront;
196     rec = vcf.front;
197 
198     assert(rec.chrom == "1");
199     assert(rec.pos == 3000150);
200     assert(rec.refAllele == "C");
201     assert(rec.altAllelesAsArray == ["T"]);
202     assert(rec.allelesAsArray == ["C","T"]);
203     assert(approxEqual(rec.qual,59.2));
204     assert(rec.filter == "PASS");
205     
206     vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"));
207     auto range1 = vcf.save;
208     vcf.popFront;
209     auto range2 = vcf.save;
210     vcf.popFront;
211     vcf.popFront;
212     auto range3 = vcf.save;
213     vcf.popFront;
214     vcf.popFront;
215     vcf.popFront;
216     auto range4 = vcf.save;
217     assert(range1.array.length == 14);
218     assert(range2.array.length == 13);
219     assert(range3.array.length == 11);
220     assert(range4.array.length == 8);
221 }
222 
223 debug(dhtslib_unittest) unittest
224 {
225     import dhtslib.util;
226     import std.stdio;
227     import htslib.hts_log;
228     import std.algorithm : map, count;
229     import std.array : array;
230     import std.path : buildPath, dirName;
231     import std.math : approxEqual;
232     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
233     hts_log_info(__FUNCTION__, "Testing VCFReader");
234     hts_log_info(__FUNCTION__, "Loading test file");
235     auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz");
236     auto tbx = TabixIndexedFile(fn);
237     auto reg = getIntervalFromString("1:3000151-3062916");
238     auto vcf = VCFReader(tbx, reg.contig, reg.interval);
239     assert(vcf.count == 3);
240     vcf = VCFReader(tbx, reg.contig, reg.interval);
241     VCFRecord rec = vcf.front;
242     assert(rec.chrom == "1");
243     assert(rec.pos == 3000150);
244     assert(rec.refAllele == "C");
245     assert(rec.altAllelesAsArray == ["T"]);
246     assert(rec.allelesAsArray == ["C","T"]);
247     assert(approxEqual(rec.qual, 59.2));
248     assert(rec.filter == "PASS");
249 
250     vcf.popFront;
251     rec = vcf.front;
252 
253     assert(rec.chrom == "1");
254     assert(rec.pos == 3062914);
255     assert(rec.refAllele == "GTTT");
256     assert(rec.altAllelesAsArray == ["G"]);
257     assert(rec.allelesAsArray == ["GTTT","G"]);
258     assert(approxEqual(rec.qual,12.9));
259     assert(rec.filter == "q10");
260     // assert(rec. == ["C","T"]);
261 
262     vcf = VCFReader(tbx, reg.contig, reg.interval);
263     auto range1 = vcf.save;
264     vcf.popFront;
265     auto range2 = vcf.save;
266     vcf.popFront;
267     auto range3 = vcf.save;
268     vcf.popFront;
269     auto range4 = vcf.save;
270     assert(range1.array.length == 3);
271     assert(range2.array.length == 2);
272     assert(range3.array.length == 1);
273     assert(range4.array.length == 0);
274 }
275 
276 debug(dhtslib_unittest) unittest
277 {
278     import dhtslib.util;
279     import std.stdio;
280     import htslib.hts_log;
281     import std.algorithm : map, count;
282     import std.array : array;
283     import std.path : buildPath, dirName;
284     import std.math : approxEqual;
285     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
286     hts_log_info(__FUNCTION__, "Testing bcf1_t Unpacking");
287     hts_log_info(__FUNCTION__, "Loading test file");
288     auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz");
289     auto tbx = TabixIndexedFile(fn);
290     auto reg = getIntervalFromString("1:3000151-3062916");
291 
292     auto vcf = VCFReader(tbx, reg.contig, reg.interval);
293     assert(vcf.count == 3);
294     vcf = VCFReader(tbx, reg.contig, reg.interval, UnpackLevel.None);
295     VCFRecord rec = vcf.front;
296 
297     assert(rec.chrom == "1");
298     assert(rec.pos == 3000150);
299 
300     assert(approxEqual(rec.qual, 59.2));
301     assert(rec.line.unpacked == UnpackLevel.None);
302 
303     assert(rec.id == ".");
304     assert(rec.line.unpacked == UnpackLevel.AltAllele);
305 
306     assert(rec.refAllele == "C");
307     assert(rec.line.unpacked == UnpackLevel.AltAllele);
308 
309     assert(rec.altAllelesAsArray == ["T"]);
310     assert(rec.line.unpacked == UnpackLevel.AltAllele);
311 
312     assert(rec.filter == "PASS");
313     assert(rec.line.unpacked == (UnpackLevel.Filter | UnpackLevel.AltAllele));
314 
315     assert(rec.getInfos["AN"].to!int == 4);
316     assert(rec.line.unpacked == UnpackLevel.SharedFields);
317 
318     assert(rec.getFormats["DP"].to!int.array == [[32], [32]]);
319     assert(rec.line.unpacked == UnpackLevel.All);
320 
321     /// only one extra test for pulling only format fields
322     ///
323     /// bcf_unpack automatically promotes UnpackLevel.Filter to
324     /// (UnpackLevel.Filter | UnpackLevel.AltAllele)
325     ///
326     /// bcf_unpack automatically promotes UnpackLevel.Info to
327     /// (UnpackLevel.Info | UnpackLevel.SharedFields)
328 
329     /// since front calls bcf_dup, we get a fresh unpacked record
330     rec = vcf.front;
331     assert(rec.getFormats["DP"].to!int.array == [[32], [32]]);
332     assert(rec.line.unpacked == UnpackLevel.Format);
333     
334 }