1 /**
2 Module providing a wrapper, `TabixIndexedFile` over a line-oriented NGS flat file,
3 such as BED, GFF3, VCF that has been indexed with tabix.
4 
5 The wrapper provides a list of reference sequence names, as well as iterator over
6 all rows falling within a sequence range, e.g. "chr1:1000-2000"
7 */
8 
9 module dhtslib.tabix;
10 
11 import std.stdio;
12 import std.string;
13 import std.traits : ReturnType;
14 import std.range : inputRangeObject, InputRangeObject;
15 import core.stdc.stdlib : malloc, free;
16 
17 import htslib.hts;
18 import htslib.kstring;
19 import htslib.tbx;
20 import dhtslib.coordinates;
21 import dhtslib.memory;
22 import dhtslib.util;
23 //import htslib.regidx;
24 
25 /** Encapsulates a position-sorted record-oriented NGS flat file,
26  *  indexed with Tabix, including BED, GFF3, VCF.
27  *
28  *  region(string r) returns an InputRange that iterates through rows of the file intersecting or contained within the requested range 
29  */
30 struct TabixIndexedFile {
31 
32     HtsFile fp;    /// rc wrapper around htsFile ptr
33     Tbx   tbx;   /// rc wrapper around tabix ptr
34 
35     string header;  /// NGS flat file's header (if any; e.g. BED may not have one)
36 
37     /// Initialize with a complete file path name to the tabix-indexed file
38     /// The tabix index (.tbi) must already exist alongside
39     this(const(char)[] fn, const(char)[] fntbi = "")
40     {
41         debug(dhtslib_debug) { writeln("TabixIndexedFile ctor"); }
42         this.fp = HtsFile(hts_open( toStringz(fn), "r"));
43         if ( !this.fp ) {
44             writefln("Could not read %s\n", fn);
45             throw new Exception("Couldn't read file");
46         }
47         //enum htsExactFormat format = hts_get_format(fp)->format;
48         if(fntbi!="") this.tbx = Tbx(tbx_index_load2( toStringz(fn), toStringz(fntbi) ));
49         else this.tbx = Tbx(tbx_index_load( toStringz(fn) ));
50         if (!this.tbx) { 
51             writefln("Could not load .tbi index of %s\n", fn );
52             throw new Exception("Couldn't load tabix index file");
53         }
54 
55         loadHeader();
56     }
57 
58     private void loadHeader()
59     {
60         kstring_t str;
61         scope(exit) { free(str.s); }
62 
63         while ( hts_getline(this.fp, '\n', &str) >= 0 )
64         {
65             if ( !str.l || str.s[0] != this.tbx.conf.meta_char ) break;
66             this.header ~= fromStringz(str.s) ~ '\n';
67         }
68 
69         debug(dhtslib_debug) { writeln("end loadHeader"); }
70     }
71 
72     /// tbx.d: const(char **) tbx_seqnames(tbx_t *tbx, int *n);  // free the array but not the values
73     @property string[] sequenceNames()
74     {
75         // TODO: Check for memory leaks (free the array after copy into sequence_names)
76         int nseqs;
77 
78         string[] sequence_names;
79 
80         const(char **) seqnames = tbx_seqnames(this.tbx, &nseqs);
81 
82         for(int i; i<nseqs; i++) {
83             sequence_names ~= cast(string) fromStringz(seqnames[i]);
84         }
85 
86         return sequence_names;
87     }
88 
89     /** region(r)
90      *  returns an InputRange that iterates through rows of the file intersecting or contained within the requested range 
91      */
92     auto region(CoordSystem cs)(string chrom, Interval!cs coords)
93     {
94         auto newCoords = coords.to!(CoordSystem.zbho);
95         struct Region
96         {
97 
98             /** TODO: determine how thread-(un)safe this is (i.e., using a potentially shared *fp and *tbx) */
99             private HtsFile fp;
100             private Tbx tbx;
101 
102             private HtsItr itr;
103             private string next;
104 
105             // necessary because the alternative strategy of preloading the first row
106             // leads to problems when Struct inst is blitted ->
107             // re-iterating always returns first row only (since *itr is expended 
108             // but first row was preloaded in this.next)
109             private bool active;
110             private string chrom;
111 
112             this(HtsFile fp, Tbx tbx,  string chrom, Interval!(CoordSystem.zbho) coords)
113             {
114                 this.fp = fp;
115                 this.tbx = tbx;
116                 this.chrom = chrom;
117                 this.itr = HtsItr(tbx_itr_queryi(tbx, tbx_name2id(tbx, toStringz(this.chrom)), coords.start, coords.end));
118                 this.empty;
119                 debug(dhtslib_debug) { writeln("Region ctor // this.itr: ", this.itr); }
120             }
121 
122             // had to remove "const" property from empty() due to manipulation of this.active
123             @property bool empty() {
124 
125                 if (!this.active) {
126                     // this is the first call to empty() (and use of the range)
127                     // Let's make it active and attempt to load the first record, if one exists
128                     this.active = true;
129                     this.popFront();
130                 }
131 
132                 if (!this.next) return true;
133                 else return false;
134             }
135 
136             @property string front() const {
137                 return this.next;
138             }
139 
140             void popFront() {
141                 // closure over fp and tbx? (i.e. potentially unsafe?)
142 
143                 // Get next entry
144                 kstring_t kstr;
145                 immutable res = tbx_itr_next(this.fp, this.tbx, this.itr, &kstr);
146                 if (res < 0) {
147                     // we are done
148                     this.next = null;
149                 } else {
150                     // Otherwise load into next
151                     this.next = fromStringz(kstr.s).idup;
152                     free(kstr.s);
153                 }
154             }
155             Region save()
156             {
157                 Region newRange;
158                 newRange.fp = HtsFile(copyHtsFile(fp));
159                 newRange.itr = HtsItr(copyHtsItr(itr));
160                 newRange.tbx = tbx;
161                 newRange.next = next;
162                 newRange.active = active;
163                 newRange.chrom = chrom;
164                 return newRange;
165             }
166         }
167 
168         return Region(this.fp, this.tbx, chrom, newCoords);
169     }
170 
171 }
172 
173 // TODO: figure out how to make this unittest with just htslib files
174 
175 // debug(dhtslib_unittest) unittest
176 // {
177 //     import htslib.hts_log;
178 //     import dhtslib.vcf;
179 //     import std.path : buildPath, dirName;
180 
181 //     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
182 //     hts_log_info(__FUNCTION__, "Testing TabixIndexedFile");
183 //     hts_log_info(__FUNCTION__, "Loading test file");
184 //     auto vcf = VCFReader(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","index.vcf"));
185 //     auto vcfw = VCFWriter()
186 
187 // }
188 
189 /**
190     Range that allows reading a record based format via tabix.
191     Needs a record type that encompasses only one line of text
192     and a ChromInterval region to use for tabix filtering.
193     Rectype could be GFF3Record, BedRecord ...
194     This is a sister struct to dhtslib.bgzf.RecordReader.
195 */
196 struct RecordReaderRegion(RecType, CoordSystem cs)
197 {
198     /// file reader
199     TabixIndexedFile file;
200     /// file reader range
201     ReturnType!(this.initializeRange) range;
202     /// chrom of region
203     string chrom;
204     /// coordinates of region
205     Interval!cs coords;
206     /// keep the header
207     string header;
208     
209     bool emptyLine = false;
210     
211     /// string chrom and Interval ctor
212     this(string fn, string chrom, Interval!cs coords, string fnIdx = "")
213     {
214         this.file = TabixIndexedFile(fn, fnIdx);
215         this.chrom = chrom;
216         this.coords = coords;
217         this.header = this.file.header;
218         this.range = this.initializeRange;
219         this.range.empty;
220     }
221 
222     /// copy the TabixIndexedFile.region range
223     auto initializeRange()
224     {
225         return this.file.region(this.chrom, this.coords);
226     }
227 
228     /// returns RecType
229     RecType front()
230     {
231         return RecType(this.range.front);
232     }
233 
234     /// move the range
235     void popFront()
236     {
237         this.range.popFront;
238         if(this.range.front == "") this.emptyLine = true;
239     }
240 
241     /// is the range done
242     auto empty()
243     {
244         return this.emptyLine || this.range.empty;
245     }
246 
247     typeof(this) save()
248     {
249         typeof(this) newRange = this;
250         newRange.range = this.range.save;
251         return newRange;
252     }
253 }