1 /**
2 Module providing a wrapper, `TabixIndexedFile` over a line-oriented NGS flat file,
3 such as BED, GFF3, VCF that has been indexed with tabix.
4 
5 The wrapper provides a list of reference sequence names, as well as iterator over
6 all rows falling within a sequence range, e.g. "chr1:1000-2000"
7 */
8 
9 module dhtslib.tabix;
10 
11 import std.stdio;
12 import std..string;
13 import core.stdc.stdlib : malloc, free;
14 
15 import htslib.hts;
16 import htslib.kstring;
17 import htslib.tbx;
18 //import htslib.regidx;
19 
20 /** Encapsulates a position-sorted record-oriented NGS flat file,
21  *  indexed with Tabix, including BED, GFF3, VCF.
22  *
23  *  region(string r) returns an InputRange that iterates through rows of the file intersecting or contained within the requested range 
24  */
25 struct TabixIndexedFile {
26 
27     htsFile *fp;    /// pointer to htsFile struct
28     tbx_t   *tbx;   /// pointer to tabix handle
29 
30     string header;  /// NGS flat file's header (if any; e.g. BED may not have one)
31 
32     /// Initialize with a complete file path name to the tabix-indexed file
33     /// The tabix index (.tbi) must already exist alongside
34     this(const(char)[] fn, const(char)[] fntbi = "")
35     {
36         debug(dhtslib_debug) { writeln("TabixIndexedFile ctor"); }
37         this.fp = hts_open( toStringz(fn), "r");
38         if ( !this.fp ) {
39             writefln("Could not read %s\n", fn);
40             throw new Exception("Couldn't read file");
41         }
42         //enum htsExactFormat format = hts_get_format(fp)->format;
43         if(fntbi!="") this.tbx = tbx_index_load2( toStringz(fn), toStringz(fntbi) );
44         else this.tbx = tbx_index_load( toStringz(fn) );
45         if (!this.tbx) { 
46             writefln("Could not load .tbi index of %s\n", fn );
47             throw new Exception("Couldn't load tabix index file");
48         }
49 
50         loadHeader();
51     }
52     ~this()
53     {
54         debug(dhtslib_debug) { writeln("TabixIndexedFile dtor"); }
55         tbx_destroy(this.tbx);
56 
57         if ( hts_close(this.fp) ) writefln("hts_close returned non-zero status: %s\n", fromStringz(this.fp.fn));
58     }
59 
60     private void loadHeader()
61     {
62         kstring_t str;
63         scope(exit) { free(str.s); }
64 
65         while ( hts_getline(this.fp, '\n', &str) >= 0 )
66         {
67             if ( !str.l || str.s[0] != this.tbx.conf.meta_char ) break;
68             this.header ~= fromStringz(str.s) ~ '\n';
69         }
70 
71         debug(dhtslib_debug) { writeln("end loadHeader"); }
72     }
73 
74     /// tbx.d: const(char **) tbx_seqnames(tbx_t *tbx, int *n);  // free the array but not the values
75     @property string[] sequenceNames()
76     {
77         // TODO: Check for memory leaks (free the array after copy into sequence_names)
78         int nseqs;
79 
80         string[] sequence_names;
81 
82         const(char **) seqnames = tbx_seqnames(this.tbx, &nseqs);
83 
84         for(int i; i<nseqs; i++) {
85             sequence_names ~= cast(string) fromStringz(seqnames[i]);
86         }
87 
88         return sequence_names;
89     }
90 
91     /** region(r)
92      *  returns an InputRange that iterates through rows of the file intersecting or contained within the requested range 
93      */
94     auto region(const(char)[] r)
95     {
96         struct Region {
97 
98             /** TODO: determine how thread-(un)safe this is (i.e., using a potentially shared *fp and *tbx) */
99             private htsFile *fp;
100             private tbx_t   *tbx;
101 
102             private hts_itr_t *itr;
103             private string next;
104 
105             // necessary because the alternative strategy of preloading the first row
106             // leads to problems when Struct inst is blitted ->
107             // re-iterating always returns first row only (since *itr is expended 
108             // but first row was preloaded in this.next)
109             private bool active;
110 
111             this(htsFile *fp, tbx_t *tbx, const(char)[] r)
112             {
113                 this.fp = fp;
114                 this.tbx= tbx;
115 
116                 this.itr = tbx_itr_querys(tbx, toStringz(r) );
117                 debug(dhtslib_debug) { writeln("Region ctor // this.itr: ", this.itr); }
118                 if (this.itr) {
119                     // Load the first record
120                     //this.popFront(); // correction, do not load the first record
121                 }
122                 else {
123                     // TODO handle error
124                     throw new Exception("could not allocate this.itr");
125                 }
126             }
127             ~this()
128             {
129                 debug(dhtslib_debug) { writeln("Region dtor // this.itr: ", this.itr); }
130                 //tbx_itr_destroy(itr);
131                 //free(this.kstr.s);
132             }
133 
134             // had to remove "const" property from empty() due to manipulation of this.active
135             @property bool empty() {
136 
137                 if (!this.active) {
138                     // this is the first call to empty() (and use of the range)
139                     // Let's make it active and attempt to load the first record, if one exists
140                     this.active = true;
141                     this.popFront();
142                 }
143 
144                 if (!this.next) return true;
145                 else return false;
146             }
147 
148             @property string front() const {
149                 return this.next;
150             }
151 
152             void popFront() {
153                 // closure over fp and tbx? (i.e. potentially unsafe?)
154 
155                 // Get next entry
156                 kstring_t kstr;
157                 immutable res = tbx_itr_next(this.fp, this.tbx, this.itr, &kstr);
158                 if (res < 0) {
159                     // we are done
160                     this.next = null;
161                 } else {
162                     // Otherwise load into next
163                     this.next = fromStringz(kstr.s).idup;
164                     free(kstr.s);
165                 }
166             }
167         }
168 
169         return Region(this.fp, this.tbx, r);
170     }
171 
172 }