1 module dhtslib.file.iterator;
2 
3 import core.stdc.stdlib;
4 import core.stdc.string;
5 
6 import dhtslib.memory;
7 import dhtslib.file.file;
8 import dhtslib.util;
9 import htslib;
10 
11 /** 
12  * HtslibIterator is an abstraction for htslib's hts_itr_t using dhtslib.memory for reference counting.
13  * HtslibIterator can be used to iterate VCF/BCF/SAM/BAM/Text files using a BAI/CSI/TBX index
14  * or by simply iterating the file.
15  * 
16  * This struct adapts htslib's iterators into ForwardRange. It is paired with 
17  * and relies on HtslibFile. 
18  */
19 struct HtslibIterator(T)
20 if(is(T == Bam1) || is(T == Bcf1) || is(T == Kstring))
21 {
22     HtslibFile f;               /// HtslibFile
23     HtsItr itr;                 /// refcounted hts_itr_t
24     T rec;                      /// refcounted bam1_t, bcf1_t, or kstring_t
25     private Kstring line;
26     private bool is_itr;        /// Using an Itr or just calling *_read functions
27     private bool initialized;   /// Is the range initialized
28     /// htslib read function return value
29     /// -1 on eof, < -1 on err
30     /// must be pointer to keep range updated when blitted
31     private int * r; 
32 
33     /// ctor using only HtslibFile
34     /// without an iterator
35     this(HtslibFile f)
36     {
37         this.r = new int(0);
38         this.f = f;
39         static if(is(T == Bam1)){
40             rec = Bam1(bam_init1);
41         }else static if(is(T == Bcf1)){
42             rec = Bcf1(bcf_init);
43         }else static if(is(T == Kstring)){
44             rec = Kstring(initKstring);
45             ks_initialize(rec);
46         }
47         this.line = Kstring(initKstring);
48         ks_initialize(this.line);
49         this.empty;
50     }
51 
52     /// ctor using an HtslibFile and an iterator
53     this(HtslibFile f, HtsItr itr)
54     {
55         this.is_itr = true;
56         this.itr = itr;
57         this(f);
58     }
59 
60     /// Duplicate the iterator
61     /// must fully duplicate hts_itr_t
62     HtslibIterator dup()
63     {
64 
65         HtslibIterator newItr;
66         // dup file
67         newItr.f = this.f.dup;
68 
69         // set private values
70         newItr.r = new int(*this.r);
71         newItr.initialized = this.initialized;
72         newItr.is_itr = this.is_itr;
73 
74         // duplicate current record
75         static if(is(T == Bam1)){
76             newItr.rec = Bam1(bam_dup1(this.rec));
77         }else static if(is(T == Bcf1)){
78             newItr.rec = Bcf1(bcf_dup(this.rec));
79         }else static if(is(T == Kstring)){
80             auto ks = Kstring(initKstring);
81             ks_initialize(ks);
82             kputs(this.rec.s, ks);
83             newItr.rec = ks;
84         }
85         auto ks2 = Kstring(initKstring);
86         ks_initialize(ks2);
87         kputs(ks_c_str(this.line), ks2);
88         newItr.line = ks2;
89 
90         // if itr we need to deep copy it
91         if(this.is_itr){
92             // init
93             auto newHtsItr = cast(hts_itr_t *) malloc(hts_itr_t.sizeof);
94 
95             // bulk copy data
96             *newHtsItr = *itr;
97 
98             // initialize and copy region list
99             // if it is not null
100             if(newHtsItr.reg_list != null){
101 
102                 // initialize the region lists
103                 newHtsItr.reg_list = cast(hts_reglist_t *) malloc(itr.n_reg * hts_reglist_t.sizeof);
104                 newHtsItr.reg_list[0 .. newHtsItr.n_reg] = itr.reg_list[0 .. itr.n_reg];
105 
106                 // for each list
107                 for(auto i=0; i < newHtsItr.n_reg; i++)
108                 {
109                     // copy all intervals
110                     newHtsItr.reg_list[i].intervals = cast(hts_pair_pos_t *) malloc(itr.reg_list[i].count * hts_pair_pos_t.sizeof);
111                     newHtsItr.reg_list[i].intervals[0 .. newHtsItr.reg_list[i].count] = itr.reg_list[i].intervals[0 .. itr.reg_list[i].count];
112 
113                     // copy region c string
114                     // if not null
115                     if(itr.reg_list[i].reg){
116                         newHtsItr.reg_list[i].reg = cast(char *) malloc(strlen(itr.reg_list[i].reg) + 1);
117                         memcpy(cast(void*)(newHtsItr.reg_list[i].reg),itr.reg_list[i].reg, strlen(itr.reg_list[i].reg) + 1);
118                     }
119                 }
120             }
121 
122             // initialize and copy bins list
123             newHtsItr.bins.a = cast(int *) malloc(itr.bins.m * int.sizeof);
124             assert(newHtsItr.bins.m >= newHtsItr.bins.n);
125             newHtsItr.bins.a[0 .. newHtsItr.bins.m] = itr.bins.a[0 .. itr.bins.m];
126 
127             // initialize and copy off list
128             newHtsItr.off = cast(hts_pair64_max_t *) malloc(itr.n_off * hts_pair64_max_t.sizeof);
129             newHtsItr.off[0 .. newHtsItr.n_off] = itr.off[0 .. itr.n_off];
130             
131             // set new itr
132             newItr.itr = HtsItr(newHtsItr);
133         }else{
134             newItr.itr = this.itr;
135         }
136         
137         return newItr;
138     }
139 
140     /// Get front of the iterator, returns  Bam1, Bcf1, or Kstring
141     /// Backing bam1_t, bcf1_t, kstring_t is re-used 
142     /// If you keep the result around it should be duplicated
143     T front()
144     {
145         return rec;
146     }
147 
148     /// popFront to move range forward
149     /// is destructive
150     void popFront()
151     {
152         if(!is_itr){
153             static if(is(T == Bam1)){
154                 assert(this.f.bamHdr.initialized && this.f.bamHdr.getRef);
155                 *this.r = sam_read1(this.f.fp, this.f.bamHdr, rec);
156             }
157             else static if(is(T == Bcf1)){
158                 assert(this.f.bcfHdr.initialized && this.f.bcfHdr.getRef);
159                 *this.r = bcf_read(this.f.fp, this.f.bcfHdr, rec);
160             }else static if(is(T == Kstring)){
161                 *this.r = hts_getline(this.f.fp, cast(int)'\n', rec);
162             }
163         }else{
164             if (itr.multi)
165                 *this.r = hts_itr_multi_next(f.fp, itr, rec.getRef);
166             else{
167                 if(f.idx != null)
168                     *this.r = hts_itr_next(f.fp.is_bgzf ? f.fp.fp.bgzf : null, itr, rec.getRef, f.fp);
169                 else if(f.tbx != null){
170                     static if(is(T == Kstring))
171                         *this.r = hts_itr_next(f.fp.is_bgzf ? f.fp.fp.bgzf : null, itr, rec.getRef, f.tbx);
172                     else {
173                         *this.r = hts_itr_next(f.fp.is_bgzf ? f.fp.fp.bgzf : null, itr, this.line.getRef, f.tbx);
174                         static if(is(T == Bam1))
175                             *this.r = sam_parse1(this.line, this.f.bamHdr, this.rec);
176                         else static if(is(T == Bcf1))
177                             *this.r = vcf_parse(this.line, this.f.bcfHdr, this.rec);
178                         ks_clear(this.line);
179                     }
180                     
181                 }
182                 else{
183                     *r = -2;
184                     hts_log_error(__FUNCTION__, "Neither tabix nor bai/csi index are loaded");
185                 }
186                     
187             }
188         }
189     }
190 
191     /// InputRange interface
192     @property bool empty()
193     {
194         // TODO, itr.finished shouldn't be used
195         if (!this.initialized) {
196             this.popFront();
197             this.initialized = true;
198         }
199         if(!is_itr){
200             return *r < 0 ? true : false;
201         }else{
202             assert(this.itr.initialized && this.itr.getRef);
203             return (*r < 0 || itr.finished) ? true : false;
204         }
205     }
206     
207     /// Needed to be ForwardRange
208     typeof(this) save()
209     {
210         return this.dup;
211     }
212 }
213 debug(dhtslib_unittest) unittest
214 {
215     import std.path:buildPath,dirName;
216     import std.algorithm: count;
217     import std.string: fromStringz;
218     hts_log_info(__FUNCTION__, "Testing HtslibIterator SAM/BAM reading");
219     
220     auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","range.bam");
221     auto f = HtslibFile(fn);
222     f.loadHeader;
223     auto read = f.readRecord!Bam1();
224     assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712");
225     
226     f.seek(0);
227     f.loadHeader;
228     assert(f.byRecord!Bam1.count == 112);
229 
230     f.seek(0);
231     f.loadHeader;
232     f.loadHtsIndex;
233     assert(f.query!Bam1(0,913,914).count == 1);
234 
235     f.seek(0);
236     f.loadHeader;
237     f.loadHtsIndex;
238     assert(f.query!Bam1(0,913,934).count == 2);
239 
240     f.seek(0);
241     f.loadHeader;
242     f.loadHtsIndex;
243     assert(f.query!Bam1(0,913,934).count == 2);
244     assert(f.query!Bam1("CHROMOSOME_I:914-935").count == 2);
245 
246     f.seek(0);
247     f.loadHeader;
248     string[] regions = ["CHROMOSOME_I:900-2000","CHROMOSOME_II:900-2000"];
249     assert(f.query(regions).count == 33);
250 }
251 
252 debug(dhtslib_unittest) unittest
253 {
254     import std.algorithm: count;
255     import std.path:buildPath,dirName;
256     hts_log_info(__FUNCTION__, "Testing HtslibIterator VCF/BCF reading");
257     auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz");
258     auto f = HtslibFile(fn);
259     f.loadHeader;
260     auto read = f.readRecord!Bcf1();
261     assert(read.pos == 3000149);
262 
263     import std.stdio;
264     f.seek(0);
265     f.loadHeader;
266     assert(f.byRecord!Bcf1.count == 14);
267 
268     f.loadTabixIndex;
269     
270     f.seek(0);
271     f.loadHeader;
272     assert(f.query!Bcf1(0, 3000149, 3000151).count == 2);
273 
274     f.seek(0);
275     f.loadHeader;
276     assert(f.query!Bcf1("1:3000150-3000151").count == 2);
277 }
278 
279 debug(dhtslib_unittest) unittest
280 {
281     import std.algorithm: count;
282     import std.path:buildPath,dirName;
283     import std.string : fromStringz;
284     hts_log_info(__FUNCTION__, "Testing HtslibIterator dup");
285 
286     auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","range.bam");
287     auto f = HtslibFile(fn);
288     f.loadHeader;
289     f.loadHtsIndex;
290     auto read = f.readRecord!Bam1();
291     assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712");
292 
293     auto range1 = f.query!Bam1(0,900,2000);
294     auto range2 = range1.dup;
295     import std.stdio;
296 
297     assert(range1.count == 14);
298     assert(range1.empty == true);
299     assert(range2.count == 14);
300 
301     f.seek(0);
302     f.loadHeader;
303 
304     range1 = f.query!Bam1(0,900,2000);
305     range1.popFront;
306     range2 = range1.dup;
307     range2.popFront;
308 
309     assert(range1.count == 13);
310     assert(range1.empty == true);
311     assert(range2.count == 12);
312 
313     f.seek(0);
314     f.loadHeader;
315 
316     range1 = f.byRecord!Bam1();
317     range1.popFront;
318     range2 = range1.dup;
319     range2.popFront;
320 
321     assert(range1.count == 111);
322     assert(range1.count == 0);
323     assert(range1.empty == true);
324     assert(range2.count == 110);
325 
326     f.seek(0);
327     f.loadHeader;
328     string[] regions = ["CHROMOSOME_I:900-2000","CHROMOSOME_II:900-2000"];
329     range1 = f.query(regions);
330     range2 = range1.dup;
331     range2.popFront;
332     assert(range1.count == 33);
333     assert(range2.count == 32);
334 
335     f.seek(0);
336     f.loadHeader;
337     auto f2 =  f.dup;
338     read = f.readRecord!Bam1();
339     assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712");
340     read = f2.readRecord!Bam1();
341     assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712");
342 
343 
344     fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf");
345     f = HtslibFile(fn);
346     f.loadHeader;
347     auto vrange1 = f.byRecord!Bcf1();
348     vrange1.popFront;
349     auto vrange2 = vrange1.dup;
350     vrange2.popFront;
351 
352     assert(vrange1.count == 13);
353     assert(vrange2.count == 12);
354 
355     fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","bed_file.bed");
356     f = HtslibFile(fn);
357     f.loadHeader;
358     auto trange1 = f.byRecord!Kstring();
359     trange1.popFront;
360     auto trange2 = trange1.dup;
361     trange2.popFront;
362 
363     assert(trange1.count == 14);
364     assert(trange2.count == 13);
365 
366     fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","bed_file.bed.gz");
367     f = HtslibFile(fn);
368     f.loadHeader;
369     trange1 = f.byRecord!Kstring();
370     trange1.popFront;
371     trange2 = trange1.dup;
372     trange2.popFront;
373 
374     assert(trange1.count == 14);
375     assert(trange2.count == 13);
376 
377 }