1 module dhtslib.vcf.reader; 2 3 import std.string: fromStringz, toStringz; 4 import std.utf: toUTFz; 5 import std.traits : ReturnType; 6 7 import dhtslib.vcf; 8 import dhtslib.tabix; 9 import dhtslib.coordinates; 10 import dhtslib.memory; 11 import dhtslib.util; 12 import htslib.vcf; 13 import htslib.hts_log; 14 import htslib.kstring; 15 16 alias BCFReader = VCFReader; 17 18 auto VCFReader(string fn, UnpackLevel MAX_UNPACK = UnpackLevel.None) 19 { 20 return VCFReaderImpl!(CoordSystem.zbc, false)(fn, MAX_UNPACK); 21 } 22 23 auto VCFReader(CoordSystem cs)(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, UnpackLevel MAX_UNPACK = UnpackLevel.None) 24 { 25 return VCFReaderImpl!(cs,true)(tbxFile, chrom, coords, MAX_UNPACK); 26 } 27 28 /** Basic support for reading VCF, BCF files */ 29 struct VCFReaderImpl(CoordSystem cs, bool isTabixFile) 30 { 31 // htslib data structures 32 VcfFile fp; /// rc htsFile wrapper 33 VCFHeader vcfhdr; /// rc header wrapper 34 Bcf1 b; /// rc bcf1_t wrapper, record for use in iterator, will be recycled 35 UnpackLevel MAX_UNPACK; /// see htslib.vcf 36 37 38 private bool initialized; 39 private int success; 40 41 @disable this(); 42 static if(isTabixFile){ 43 44 TabixIndexedFile tbx; /// For tabix use 45 ReturnType!(this.initializeTabixRange) tbxRange; /// For tabix use 46 47 /// TabixIndexedFile and coordinates ctor 48 this(TabixIndexedFile tbxFile, string chrom, Interval!cs coords, UnpackLevel MAX_UNPACK = UnpackLevel.None) 49 { 50 this.tbx = tbxFile; 51 this.tbxRange = initializeTabixRange(chrom, coords); 52 this.tbxRange.empty; 53 /// read the header from TabixIndexedFile 54 bcf_hdr_t * hdrPtr = bcf_hdr_init(toUTFz!(char *)("r")); 55 auto err = bcf_hdr_parse(hdrPtr, toUTFz!(char *)(this.tbx.header)); 56 this.vcfhdr = VCFHeader(hdrPtr); 57 58 this.b = Bcf1(bcf_init1()); 59 this.b.max_unpack = MAX_UNPACK; 60 this.MAX_UNPACK = MAX_UNPACK; 61 this.empty; 62 } 63 64 /// copy the TabixIndexedFile.region range 65 auto initializeTabixRange(string chrom, Interval!cs coords) 66 { 67 return this.tbx.region(chrom, coords); 68 } 69 }else{ 70 /// read existing VCF file 71 /// MAX_UNPACK: setting alternate value could speed reading 72 this(string fn, UnpackLevel MAX_UNPACK = UnpackLevel.None) 73 { 74 import htslib.hts : hts_set_threads; 75 76 assert(!isTabixFile); 77 if (fn == "") throw new Exception("Empty filename passed to VCFReader constructor"); 78 this.fp = VcfFile(vcf_open(toStringz(fn), "r"c.ptr)); 79 if (!this.fp) throw new Exception("Could not hts_open file"); 80 81 hts_set_threads(this.fp, 1); // extra decoding thread 82 83 this.vcfhdr = VCFHeader(bcf_hdr_read(this.fp)); 84 85 this.b = Bcf1(bcf_init1()); 86 this.b.max_unpack = MAX_UNPACK; 87 this.MAX_UNPACK = MAX_UNPACK; 88 this.empty; 89 } 90 } 91 92 invariant 93 { 94 assert(this.vcfhdr.hdr != null); 95 } 96 97 VCFHeader getHeader() 98 { 99 return this.vcfhdr; 100 } 101 102 /// InputRange interface: iterate over all records 103 @property bool empty() 104 { 105 static if(isTabixFile){ 106 if(this.tbxRange.empty) return true; 107 } 108 if (!this.initialized) { 109 this.popFront(); 110 this.initialized = true; 111 } 112 if (success >= 0) 113 return false; 114 else if (success == -1) 115 return true; 116 else 117 { 118 static if(isTabixFile) 119 hts_log_error(__FUNCTION__, "*** ERROR vcf_parse < -1"); 120 else 121 hts_log_error(__FUNCTION__, "*** ERROR bcf_read < -1"); 122 return true; 123 } 124 125 } 126 /// ditto 127 void popFront() 128 { 129 // int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); 130 // documentation claims returns -1 on critical errors, 0 otherwise 131 // however it looks like -1 is EOF and -2 is critical errors? 132 static if(isTabixFile){ 133 if (this.initialized) this.tbxRange.popFront; 134 auto str = this.tbxRange.front; 135 kstring_t kstr; 136 kputs(toUTFz!(char *)(str), &kstr); 137 this.success = vcf_parse(&kstr, this.vcfhdr.hdr, this.b); 138 139 }else 140 this.success = bcf_read(this.fp, this.vcfhdr.hdr, this.b); 141 142 } 143 /// ditto 144 VCFRecord front() 145 { 146 // note that VCFRecord's constructor will be responsible for 147 // * UnpackLeveling and 148 // * destroying 149 // its copy 150 auto copiedBcf_1t = bcf_dup(this.b); 151 return VCFRecord(this.vcfhdr, copiedBcf_1t, this.MAX_UNPACK); 152 } 153 154 typeof(this) save() 155 { 156 typeof(this) newRange = this; 157 static if(isTabixFile){ 158 newRange.tbx = tbx; /// For tabix use 159 newRange.tbxRange = tbxRange.save; /// For tabix use 160 }else{ 161 newRange.fp = HtsFile(copyHtsFile(this.fp)); 162 } 163 newRange.vcfhdr = vcfhdr; 164 newRange.b = Bcf1(bcf_dup(this.b)); 165 newRange.initialized = initialized; 166 newRange.success = success; 167 return newRange; 168 } 169 } 170 171 debug(dhtslib_unittest) unittest 172 { 173 import std.stdio; 174 import htslib.hts_log; 175 import std.algorithm : map, count; 176 import std.array : array; 177 import std.path : buildPath, dirName; 178 import std.math : approxEqual; 179 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 180 hts_log_info(__FUNCTION__, "Testing VCFReader (Tabix)"); 181 hts_log_info(__FUNCTION__, "Loading test file"); 182 183 auto vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 184 assert(vcf.count == 14); 185 vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 186 VCFRecord rec = vcf.front; 187 assert(rec.chrom == "1"); 188 assert(rec.pos == 3000149); 189 assert(rec.refAllele == "C"); 190 assert(rec.altAllelesAsArray == ["T"]); 191 assert(rec.allelesAsArray == ["C","T"]); 192 assert(approxEqual(rec.qual,59.2)); 193 assert(rec.filter == "PASS"); 194 195 vcf.popFront; 196 rec = vcf.front; 197 198 assert(rec.chrom == "1"); 199 assert(rec.pos == 3000150); 200 assert(rec.refAllele == "C"); 201 assert(rec.altAllelesAsArray == ["T"]); 202 assert(rec.allelesAsArray == ["C","T"]); 203 assert(approxEqual(rec.qual,59.2)); 204 assert(rec.filter == "PASS"); 205 206 vcf = VCFReader(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf")); 207 auto range1 = vcf.save; 208 vcf.popFront; 209 auto range2 = vcf.save; 210 vcf.popFront; 211 vcf.popFront; 212 auto range3 = vcf.save; 213 vcf.popFront; 214 vcf.popFront; 215 vcf.popFront; 216 auto range4 = vcf.save; 217 assert(range1.array.length == 14); 218 assert(range2.array.length == 13); 219 assert(range3.array.length == 11); 220 assert(range4.array.length == 8); 221 } 222 223 debug(dhtslib_unittest) unittest 224 { 225 import dhtslib.util; 226 import std.stdio; 227 import htslib.hts_log; 228 import std.algorithm : map, count; 229 import std.array : array; 230 import std.path : buildPath, dirName; 231 import std.math : approxEqual; 232 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 233 hts_log_info(__FUNCTION__, "Testing VCFReader"); 234 hts_log_info(__FUNCTION__, "Loading test file"); 235 auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz"); 236 auto tbx = TabixIndexedFile(fn); 237 auto reg = getIntervalFromString("1:3000151-3062916"); 238 auto vcf = VCFReader(tbx, reg.contig, reg.interval); 239 assert(vcf.count == 3); 240 vcf = VCFReader(tbx, reg.contig, reg.interval); 241 VCFRecord rec = vcf.front; 242 assert(rec.chrom == "1"); 243 assert(rec.pos == 3000150); 244 assert(rec.refAllele == "C"); 245 assert(rec.altAllelesAsArray == ["T"]); 246 assert(rec.allelesAsArray == ["C","T"]); 247 assert(approxEqual(rec.qual, 59.2)); 248 assert(rec.filter == "PASS"); 249 250 vcf.popFront; 251 rec = vcf.front; 252 253 assert(rec.chrom == "1"); 254 assert(rec.pos == 3062914); 255 assert(rec.refAllele == "GTTT"); 256 assert(rec.altAllelesAsArray == ["G"]); 257 assert(rec.allelesAsArray == ["GTTT","G"]); 258 assert(approxEqual(rec.qual,12.9)); 259 assert(rec.filter == "q10"); 260 // assert(rec. == ["C","T"]); 261 262 vcf = VCFReader(tbx, reg.contig, reg.interval); 263 auto range1 = vcf.save; 264 vcf.popFront; 265 auto range2 = vcf.save; 266 vcf.popFront; 267 auto range3 = vcf.save; 268 vcf.popFront; 269 auto range4 = vcf.save; 270 assert(range1.array.length == 3); 271 assert(range2.array.length == 2); 272 assert(range3.array.length == 1); 273 assert(range4.array.length == 0); 274 } 275 276 debug(dhtslib_unittest) unittest 277 { 278 import dhtslib.util; 279 import std.stdio; 280 import htslib.hts_log; 281 import std.algorithm : map, count; 282 import std.array : array; 283 import std.path : buildPath, dirName; 284 import std.math : approxEqual; 285 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 286 hts_log_info(__FUNCTION__, "Testing bcf1_t Unpacking"); 287 hts_log_info(__FUNCTION__, "Loading test file"); 288 auto fn = buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","tabix","vcf_file.vcf.gz"); 289 auto tbx = TabixIndexedFile(fn); 290 auto reg = getIntervalFromString("1:3000151-3062916"); 291 292 auto vcf = VCFReader(tbx, reg.contig, reg.interval); 293 assert(vcf.count == 3); 294 vcf = VCFReader(tbx, reg.contig, reg.interval, UnpackLevel.None); 295 VCFRecord rec = vcf.front; 296 297 assert(rec.chrom == "1"); 298 assert(rec.pos == 3000150); 299 300 assert(approxEqual(rec.qual, 59.2)); 301 assert(rec.line.unpacked == UnpackLevel.None); 302 303 assert(rec.id == "."); 304 assert(rec.line.unpacked == UnpackLevel.AltAllele); 305 306 assert(rec.refAllele == "C"); 307 assert(rec.line.unpacked == UnpackLevel.AltAllele); 308 309 assert(rec.altAllelesAsArray == ["T"]); 310 assert(rec.line.unpacked == UnpackLevel.AltAllele); 311 312 assert(rec.filter == "PASS"); 313 assert(rec.line.unpacked == (UnpackLevel.Filter | UnpackLevel.AltAllele)); 314 315 assert(rec.getInfos["AN"].to!int == 4); 316 assert(rec.line.unpacked == UnpackLevel.SharedFields); 317 318 assert(rec.getFormats["DP"].to!int.array == [[32], [32]]); 319 assert(rec.line.unpacked == UnpackLevel.All); 320 321 /// only one extra test for pulling only format fields 322 /// 323 /// bcf_unpack automatically promotes UnpackLevel.Filter to 324 /// (UnpackLevel.Filter | UnpackLevel.AltAllele) 325 /// 326 /// bcf_unpack automatically promotes UnpackLevel.Info to 327 /// (UnpackLevel.Info | UnpackLevel.SharedFields) 328 329 /// since front calls bcf_dup, we get a fresh unpacked record 330 rec = vcf.front; 331 assert(rec.getFormats["DP"].to!int.array == [[32], [32]]); 332 assert(rec.line.unpacked == UnpackLevel.Format); 333 334 }