1 module dhtslib.file.iterator; 2 3 import core.stdc.stdlib; 4 import core.stdc.string; 5 6 import dhtslib.memory; 7 import dhtslib.file.file; 8 import dhtslib.util; 9 import htslib; 10 11 /** 12 * HtslibIterator is an abstraction for htslib's hts_itr_t using dhtslib.memory for reference counting. 13 * HtslibIterator can be used to iterate VCF/BCF/SAM/BAM/Text files using a BAI/CSI/TBX index 14 * or by simply iterating the file. 15 * 16 * This struct adapts htslib's iterators into ForwardRange. It is paired with 17 * and relies on HtslibFile. 18 */ 19 struct HtslibIterator(T) 20 if(is(T == Bam1) || is(T == Bcf1) || is(T == Kstring)) 21 { 22 HtslibFile f; /// HtslibFile 23 HtsItr itr; /// refcounted hts_itr_t 24 T rec; /// refcounted bam1_t, bcf1_t, or kstring_t 25 private Kstring line; 26 private bool is_itr; /// Using an Itr or just calling *_read functions 27 private bool initialized; /// Is the range initialized 28 /// htslib read function return value 29 /// -1 on eof, < -1 on err 30 /// must be pointer to keep range updated when blitted 31 private int * r; 32 33 /// ctor using only HtslibFile 34 /// without an iterator 35 this(HtslibFile f) 36 { 37 this.r = new int(0); 38 this.f = f; 39 static if(is(T == Bam1)){ 40 rec = Bam1(bam_init1); 41 }else static if(is(T == Bcf1)){ 42 rec = Bcf1(bcf_init); 43 }else static if(is(T == Kstring)){ 44 rec = Kstring(initKstring); 45 ks_initialize(rec); 46 } 47 this.line = Kstring(initKstring); 48 ks_initialize(this.line); 49 this.empty; 50 } 51 52 /// ctor using an HtslibFile and an iterator 53 this(HtslibFile f, HtsItr itr) 54 { 55 this.is_itr = true; 56 this.itr = itr; 57 this(f); 58 } 59 60 /// Duplicate the iterator 61 /// must fully duplicate hts_itr_t 62 HtslibIterator dup() 63 { 64 65 HtslibIterator newItr; 66 // dup file 67 newItr.f = this.f.dup; 68 69 // set private values 70 newItr.r = new int(*this.r); 71 newItr.initialized = this.initialized; 72 newItr.is_itr = this.is_itr; 73 74 // duplicate current record 75 static if(is(T == Bam1)){ 76 newItr.rec = Bam1(bam_dup1(this.rec)); 77 }else static if(is(T == Bcf1)){ 78 newItr.rec = Bcf1(bcf_dup(this.rec)); 79 }else static if(is(T == Kstring)){ 80 auto ks = Kstring(initKstring); 81 ks_initialize(ks); 82 kputs(this.rec.s, ks); 83 newItr.rec = ks; 84 } 85 auto ks2 = Kstring(initKstring); 86 ks_initialize(ks2); 87 kputs(ks_c_str(this.line), ks2); 88 newItr.line = ks2; 89 90 // if itr we need to deep copy it 91 if(this.is_itr){ 92 // init 93 auto newHtsItr = cast(hts_itr_t *) malloc(hts_itr_t.sizeof); 94 95 // bulk copy data 96 *newHtsItr = *itr; 97 98 // initialize and copy region list 99 // if it is not null 100 if(newHtsItr.reg_list != null){ 101 102 // initialize the region lists 103 newHtsItr.reg_list = cast(hts_reglist_t *) malloc(itr.n_reg * hts_reglist_t.sizeof); 104 newHtsItr.reg_list[0 .. newHtsItr.n_reg] = itr.reg_list[0 .. itr.n_reg]; 105 106 // for each list 107 for(auto i=0; i < newHtsItr.n_reg; i++) 108 { 109 // copy all intervals 110 newHtsItr.reg_list[i].intervals = cast(hts_pair_pos_t *) malloc(itr.reg_list[i].count * hts_pair_pos_t.sizeof); 111 newHtsItr.reg_list[i].intervals[0 .. newHtsItr.reg_list[i].count] = itr.reg_list[i].intervals[0 .. itr.reg_list[i].count]; 112 113 // copy region c string 114 // if not null 115 if(itr.reg_list[i].reg){ 116 newHtsItr.reg_list[i].reg = cast(char *) malloc(strlen(itr.reg_list[i].reg) + 1); 117 memcpy(cast(void*)(newHtsItr.reg_list[i].reg),itr.reg_list[i].reg, strlen(itr.reg_list[i].reg) + 1); 118 } 119 } 120 } 121 122 // initialize and copy bins list 123 newHtsItr.bins.a = cast(int *) malloc(itr.bins.m * int.sizeof); 124 assert(newHtsItr.bins.m >= newHtsItr.bins.n); 125 newHtsItr.bins.a[0 .. newHtsItr.bins.m] = itr.bins.a[0 .. itr.bins.m]; 126 127 // initialize and copy off list 128 newHtsItr.off = cast(hts_pair64_max_t *) malloc(itr.n_off * hts_pair64_max_t.sizeof); 129 newHtsItr.off[0 .. newHtsItr.n_off] = itr.off[0 .. itr.n_off]; 130 131 // set new itr 132 newItr.itr = HtsItr(newHtsItr); 133 }else{ 134 newItr.itr = this.itr; 135 } 136 137 return newItr; 138 } 139 140 /// Get front of the iterator, returns Bam1, Bcf1, or Kstring 141 /// Backing bam1_t, bcf1_t, kstring_t is re-used 142 /// If you keep the result around it should be duplicated 143 T front() 144 { 145 return rec; 146 } 147 148 /// popFront to move range forward 149 /// is destructive 150 void popFront() 151 { 152 if(!is_itr){ 153 static if(is(T == Bam1)){ 154 assert(this.f.bamHdr.initialized && this.f.bamHdr.getRef); 155 *this.r = sam_read1(this.f.fp, this.f.bamHdr, rec); 156 } 157 else static if(is(T == Bcf1)){ 158 assert(this.f.bcfHdr.initialized && this.f.bcfHdr.getRef); 159 *this.r = bcf_read(this.f.fp, this.f.bcfHdr, rec); 160 }else static if(is(T == Kstring)){ 161 *this.r = hts_getline(this.f.fp, cast(int)'\n', rec); 162 } 163 }else{ 164 if (itr.multi) 165 *this.r = hts_itr_multi_next(f.fp, itr, rec.getRef); 166 else{ 167 if(f.idx != null) 168 *this.r = hts_itr_next(f.fp.is_bgzf ? f.fp.fp.bgzf : null, itr, rec.getRef, f.fp); 169 else if(f.tbx != null){ 170 static if(is(T == Kstring)) 171 *this.r = hts_itr_next(f.fp.is_bgzf ? f.fp.fp.bgzf : null, itr, rec.getRef, f.tbx); 172 else { 173 *this.r = hts_itr_next(f.fp.is_bgzf ? f.fp.fp.bgzf : null, itr, this.line.getRef, f.tbx); 174 static if(is(T == Bam1)) 175 *this.r = sam_parse1(this.line, this.f.bamHdr, this.rec); 176 else static if(is(T == Bcf1)) 177 *this.r = vcf_parse(this.line, this.f.bcfHdr, this.rec); 178 ks_clear(this.line); 179 } 180 181 } 182 else{ 183 *r = -2; 184 hts_log_error(__FUNCTION__, "Neither tabix nor bai/csi index are loaded"); 185 } 186 187 } 188 } 189 } 190 191 /// InputRange interface 192 @property bool empty() 193 { 194 // TODO, itr.finished shouldn't be used 195 if (!this.initialized) { 196 this.popFront(); 197 this.initialized = true; 198 } 199 if(!is_itr){ 200 return *r < 0 ? true : false; 201 }else{ 202 assert(this.itr.initialized && this.itr.getRef); 203 return (*r < 0 || itr.finished) ? true : false; 204 } 205 } 206 207 /// Needed to be ForwardRange 208 typeof(this) save() 209 { 210 return this.dup; 211 } 212 } 213 debug(dhtslib_unittest) unittest 214 { 215 import std.path:buildPath,dirName; 216 import std.algorithm: count; 217 import std.string: fromStringz; 218 hts_log_info(__FUNCTION__, "Testing HtslibIterator SAM/BAM reading"); 219 220 auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","range.bam"); 221 auto f = HtslibFile(fn); 222 f.loadHeader; 223 auto read = f.readRecord!Bam1(); 224 assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712"); 225 226 f.seek(0); 227 f.loadHeader; 228 assert(f.byRecord!Bam1.count == 112); 229 230 f.seek(0); 231 f.loadHeader; 232 f.loadHtsIndex; 233 assert(f.query!Bam1(0,913,914).count == 1); 234 235 f.seek(0); 236 f.loadHeader; 237 f.loadHtsIndex; 238 assert(f.query!Bam1(0,913,934).count == 2); 239 240 f.seek(0); 241 f.loadHeader; 242 f.loadHtsIndex; 243 assert(f.query!Bam1(0,913,934).count == 2); 244 assert(f.query!Bam1("CHROMOSOME_I:914-935").count == 2); 245 246 f.seek(0); 247 f.loadHeader; 248 string[] regions = ["CHROMOSOME_I:900-2000","CHROMOSOME_II:900-2000"]; 249 assert(f.query(regions).count == 33); 250 } 251 252 debug(dhtslib_unittest) unittest 253 { 254 import std.algorithm: count; 255 import std.path:buildPath,dirName; 256 hts_log_info(__FUNCTION__, "Testing HtslibIterator VCF/BCF reading"); 257 auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz"); 258 auto f = HtslibFile(fn); 259 f.loadHeader; 260 auto read = f.readRecord!Bcf1(); 261 assert(read.pos == 3000149); 262 263 import std.stdio; 264 f.seek(0); 265 f.loadHeader; 266 assert(f.byRecord!Bcf1.count == 14); 267 268 f.loadTabixIndex; 269 270 f.seek(0); 271 f.loadHeader; 272 assert(f.query!Bcf1(0, 3000149, 3000151).count == 2); 273 274 f.seek(0); 275 f.loadHeader; 276 assert(f.query!Bcf1("1:3000150-3000151").count == 2); 277 } 278 279 debug(dhtslib_unittest) unittest 280 { 281 import std.algorithm: count; 282 import std.path:buildPath,dirName; 283 import std.string : fromStringz; 284 hts_log_info(__FUNCTION__, "Testing HtslibIterator dup"); 285 286 auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","range.bam"); 287 auto f = HtslibFile(fn); 288 f.loadHeader; 289 f.loadHtsIndex; 290 auto read = f.readRecord!Bam1(); 291 assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712"); 292 293 auto range1 = f.query!Bam1(0,900,2000); 294 auto range2 = range1.dup; 295 import std.stdio; 296 297 assert(range1.count == 14); 298 assert(range1.empty == true); 299 assert(range2.count == 14); 300 301 f.seek(0); 302 f.loadHeader; 303 304 range1 = f.query!Bam1(0,900,2000); 305 range1.popFront; 306 range2 = range1.dup; 307 range2.popFront; 308 309 assert(range1.count == 13); 310 assert(range1.empty == true); 311 assert(range2.count == 12); 312 313 f.seek(0); 314 f.loadHeader; 315 316 range1 = f.byRecord!Bam1(); 317 range1.popFront; 318 range2 = range1.dup; 319 range2.popFront; 320 321 assert(range1.count == 111); 322 assert(range1.count == 0); 323 assert(range1.empty == true); 324 assert(range2.count == 110); 325 326 f.seek(0); 327 f.loadHeader; 328 string[] regions = ["CHROMOSOME_I:900-2000","CHROMOSOME_II:900-2000"]; 329 range1 = f.query(regions); 330 range2 = range1.dup; 331 range2.popFront; 332 assert(range1.count == 33); 333 assert(range2.count == 32); 334 335 f.seek(0); 336 f.loadHeader; 337 auto f2 = f.dup; 338 read = f.readRecord!Bam1(); 339 assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712"); 340 read = f2.readRecord!Bam1(); 341 assert(fromStringz(bam_get_qname(read)) == "HS18_09653:4:1315:19857:61712"); 342 343 344 fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf"); 345 f = HtslibFile(fn); 346 f.loadHeader; 347 auto vrange1 = f.byRecord!Bcf1(); 348 vrange1.popFront; 349 auto vrange2 = vrange1.dup; 350 vrange2.popFront; 351 352 assert(vrange1.count == 13); 353 assert(vrange2.count == 12); 354 355 fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","bed_file.bed"); 356 f = HtslibFile(fn); 357 f.loadHeader; 358 auto trange1 = f.byRecord!Kstring(); 359 trange1.popFront; 360 auto trange2 = trange1.dup; 361 trange2.popFront; 362 363 assert(trange1.count == 14); 364 assert(trange2.count == 13); 365 366 fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","bed_file.bed.gz"); 367 f = HtslibFile(fn); 368 f.loadHeader; 369 trange1 = f.byRecord!Kstring(); 370 trange1.popFront; 371 trange2 = trange1.dup; 372 trange2.popFront; 373 374 assert(trange1.count == 14); 375 assert(trange2.count == 13); 376 377 }