1 /** 2 Module providing a wrapper, `TabixIndexedFile` over a line-oriented NGS flat file, 3 such as BED, GFF3, VCF that has been indexed with tabix. 4 5 The wrapper provides a list of reference sequence names, as well as iterator over 6 all rows falling within a sequence range, e.g. "chr1:1000-2000" 7 */ 8 9 module dhtslib.tabix; 10 11 import std.stdio; 12 import std.string; 13 import std.traits : ReturnType; 14 import std.range : inputRangeObject, InputRangeObject; 15 import core.stdc.stdlib : malloc, free; 16 17 import htslib.hts; 18 import htslib.kstring; 19 import htslib.tbx; 20 import dhtslib.coordinates; 21 import dhtslib.memory; 22 import dhtslib.util; 23 //import htslib.regidx; 24 25 /** Encapsulates a position-sorted record-oriented NGS flat file, 26 * indexed with Tabix, including BED, GFF3, VCF. 27 * 28 * region(string r) returns an InputRange that iterates through rows of the file intersecting or contained within the requested range 29 */ 30 struct TabixIndexedFile { 31 32 HtsFile fp; /// rc wrapper around htsFile ptr 33 Tbx tbx; /// rc wrapper around tabix ptr 34 35 string header; /// NGS flat file's header (if any; e.g. BED may not have one) 36 37 /// Initialize with a complete file path name to the tabix-indexed file 38 /// The tabix index (.tbi) must already exist alongside 39 this(const(char)[] fn, const(char)[] fntbi = "") 40 { 41 debug(dhtslib_debug) { writeln("TabixIndexedFile ctor"); } 42 this.fp = HtsFile(hts_open( toStringz(fn), "r")); 43 if ( !this.fp ) { 44 writefln("Could not read %s\n", fn); 45 throw new Exception("Couldn't read file"); 46 } 47 //enum htsExactFormat format = hts_get_format(fp)->format; 48 if(fntbi!="") this.tbx = Tbx(tbx_index_load2( toStringz(fn), toStringz(fntbi) )); 49 else this.tbx = Tbx(tbx_index_load( toStringz(fn) )); 50 if (!this.tbx) { 51 writefln("Could not load .tbi index of %s\n", fn ); 52 throw new Exception("Couldn't load tabix index file"); 53 } 54 55 loadHeader(); 56 } 57 58 private void loadHeader() 59 { 60 kstring_t str; 61 scope(exit) { free(str.s); } 62 63 while ( hts_getline(this.fp, '\n', &str) >= 0 ) 64 { 65 if ( !str.l || str.s[0] != this.tbx.conf.meta_char ) break; 66 this.header ~= fromStringz(str.s) ~ '\n'; 67 } 68 69 debug(dhtslib_debug) { writeln("end loadHeader"); } 70 } 71 72 /// tbx.d: const(char **) tbx_seqnames(tbx_t *tbx, int *n); // free the array but not the values 73 @property string[] sequenceNames() 74 { 75 // TODO: Check for memory leaks (free the array after copy into sequence_names) 76 int nseqs; 77 78 string[] sequence_names; 79 80 const(char **) seqnames = tbx_seqnames(this.tbx, &nseqs); 81 82 for(int i; i<nseqs; i++) { 83 sequence_names ~= cast(string) fromStringz(seqnames[i]); 84 } 85 86 return sequence_names; 87 } 88 89 /** region(r) 90 * returns an InputRange that iterates through rows of the file intersecting or contained within the requested range 91 */ 92 auto region(CoordSystem cs)(string chrom, Interval!cs coords) 93 { 94 auto newCoords = coords.to!(CoordSystem.zbho); 95 struct Region 96 { 97 98 /** TODO: determine how thread-(un)safe this is (i.e., using a potentially shared *fp and *tbx) */ 99 private HtsFile fp; 100 private Tbx tbx; 101 102 private HtsItr itr; 103 private string next; 104 105 // necessary because the alternative strategy of preloading the first row 106 // leads to problems when Struct inst is blitted -> 107 // re-iterating always returns first row only (since *itr is expended 108 // but first row was preloaded in this.next) 109 private bool active; 110 private string chrom; 111 112 this(HtsFile fp, Tbx tbx, string chrom, Interval!(CoordSystem.zbho) coords) 113 { 114 this.fp = fp; 115 this.tbx = tbx; 116 this.chrom = chrom; 117 this.itr = HtsItr(tbx_itr_queryi(tbx, tbx_name2id(tbx, toStringz(this.chrom)), coords.start, coords.end)); 118 this.empty; 119 debug(dhtslib_debug) { writeln("Region ctor // this.itr: ", this.itr); } 120 } 121 122 // had to remove "const" property from empty() due to manipulation of this.active 123 @property bool empty() { 124 125 if (!this.active) { 126 // this is the first call to empty() (and use of the range) 127 // Let's make it active and attempt to load the first record, if one exists 128 this.active = true; 129 this.popFront(); 130 } 131 132 if (!this.next) return true; 133 else return false; 134 } 135 136 @property string front() const { 137 return this.next; 138 } 139 140 void popFront() { 141 // closure over fp and tbx? (i.e. potentially unsafe?) 142 143 // Get next entry 144 kstring_t kstr; 145 immutable res = tbx_itr_next(this.fp, this.tbx, this.itr, &kstr); 146 if (res < 0) { 147 // we are done 148 this.next = null; 149 } else { 150 // Otherwise load into next 151 this.next = fromStringz(kstr.s).idup; 152 free(kstr.s); 153 } 154 } 155 Region save() 156 { 157 Region newRange; 158 newRange.fp = HtsFile(copyHtsFile(fp)); 159 newRange.itr = HtsItr(copyHtsItr(itr)); 160 newRange.tbx = tbx; 161 newRange.next = next; 162 newRange.active = active; 163 newRange.chrom = chrom; 164 return newRange; 165 } 166 } 167 168 return Region(this.fp, this.tbx, chrom, newCoords); 169 } 170 171 } 172 173 // TODO: figure out how to make this unittest with just htslib files 174 175 // debug(dhtslib_unittest) unittest 176 // { 177 // import htslib.hts_log; 178 // import dhtslib.vcf; 179 // import std.path : buildPath, dirName; 180 181 // hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 182 // hts_log_info(__FUNCTION__, "Testing TabixIndexedFile"); 183 // hts_log_info(__FUNCTION__, "Loading test file"); 184 // auto vcf = VCFReader(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","index.vcf")); 185 // auto vcfw = VCFWriter() 186 187 // } 188 189 /** 190 Range that allows reading a record based format via tabix. 191 Needs a record type that encompasses only one line of text 192 and a ChromInterval region to use for tabix filtering. 193 Rectype could be GFF3Record, BedRecord ... 194 This is a sister struct to dhtslib.bgzf.RecordReader. 195 */ 196 struct RecordReaderRegion(RecType, CoordSystem cs) 197 { 198 /// file reader 199 TabixIndexedFile file; 200 /// file reader range 201 ReturnType!(this.initializeRange) range; 202 /// chrom of region 203 string chrom; 204 /// coordinates of region 205 Interval!cs coords; 206 /// keep the header 207 string header; 208 209 bool emptyLine = false; 210 211 /// string chrom and Interval ctor 212 this(string fn, string chrom, Interval!cs coords, string fnIdx = "") 213 { 214 this.file = TabixIndexedFile(fn, fnIdx); 215 this.chrom = chrom; 216 this.coords = coords; 217 this.header = this.file.header; 218 this.range = this.initializeRange; 219 this.range.empty; 220 } 221 222 /// copy the TabixIndexedFile.region range 223 auto initializeRange() 224 { 225 return this.file.region(this.chrom, this.coords); 226 } 227 228 /// returns RecType 229 RecType front() 230 { 231 return RecType(this.range.front); 232 } 233 234 /// move the range 235 void popFront() 236 { 237 this.range.popFront; 238 if(this.range.front == "") this.emptyLine = true; 239 } 240 241 /// is the range done 242 auto empty() 243 { 244 return this.emptyLine || this.range.empty; 245 } 246 247 typeof(this) save() 248 { 249 typeof(this) newRange = this; 250 newRange.range = this.range.save; 251 return newRange; 252 } 253 }