1 /** 2 This module provides a wrapper, BGZFile, over an htslib BGZF compressed file/stream. 3 The wrapper acts as a linewise ForwardRange over the file or stream. 4 */ 5 6 7 module dhtslib.bgzf; 8 9 import core.stdc.stdlib : malloc, free; 10 import std.zlib; 11 import core.stdc.stdio : SEEK_SET; 12 import std.parallelism : totalCPUs; 13 import std.stdio : writeln, writefln; 14 import std.string : fromStringz, toStringz; 15 import std.range : inputRangeObject, InputRangeObject; 16 import std.traits : ReturnType; 17 18 import dhtslib.memory; 19 import dhtslib.util; 20 21 import htslib.bgzf; 22 import htslib.hfile: hseek, off_t; 23 import htslib.kstring; 24 25 /** 26 Encapsulates a bgzipped (block gzipped) file. 27 Implements InputRange interface using htslib calls to bgzf_getline(). 28 */ 29 struct BGZFile { 30 31 /// filename; reference needed to avoid GC reaping result of toStringz when ctor goes out of scope 32 private const(char)* fn; 33 34 /// htslib data structure representing the BGZF compressed file/stream fp 35 private Bgzf bgzf; 36 37 38 /// Open filename `fn` for reading 39 this(string fn) 40 { 41 debug(dhtslib_debug) { writeln("BGZFile ctor"); } 42 43 // open file 44 this.fn = toStringz(fn); 45 this.bgzf = Bgzf(bgzf_open(this.fn, "r")); 46 47 // enable multi-threading 48 // (only effective if library was compiled with -DBGZF_MT) 49 // int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); 50 // n_sub_blks : blocks per thread; 64-256 recommended 51 if(totalCPUs > 1) { 52 immutable int ret = bgzf_mt(this.bgzf, totalCPUs, 64); 53 debug(dhtslib_debug) { 54 writefln("Total CPUs: %d", totalCPUs); 55 writefln("bgzf_mt() -> %d", ret); 56 } 57 } 58 } 59 60 /// Returns a foward range of each line 61 /// the internal buffer is reused so you must copy 62 /// if you want it to stick around 63 auto byLine(){ 64 struct BGZFRange 65 { 66 private Bgzf bgzf; 67 private Kstring line; 68 private const(char)* fn; 69 this(Bgzf bgzf, const(char)* fn){ 70 this.bgzf = copyBgzf(bgzf, fn); 71 this.fn = fn; 72 line = initKstring; 73 ks_initialize(line); 74 popFront; 75 } 76 /// InputRange interface 77 bool empty=false; 78 void popFront() 79 { 80 81 ks_clear(this.line); 82 83 // int bgzf_getline(BGZF *fp, int delim, kstring_t *str); 84 immutable int res = bgzf_getline(this.bgzf, cast(int)'\n', this.line); 85 this.empty=(res < 0 ? true : false); 86 } 87 /// ditto 88 char[] front() 89 { 90 auto ret = fromStringz(this.line.s); 91 return ret; 92 } 93 BGZFRange save() 94 { 95 BGZFRange newRange; 96 newRange.fn = fn; 97 newRange.bgzf = Bgzf(copyBgzf(bgzf, fn)); 98 newRange.line = Kstring(copyKstring(this.line)); 99 return newRange; 100 } 101 } 102 hseek(bgzf.fp, cast(off_t) 0, SEEK_SET); 103 return BGZFRange(this.bgzf, this.fn); 104 } 105 106 /// Returns a foward range of each line 107 /// same as above but we copy for you 108 auto byLineCopy(){ 109 struct BGZFRange 110 { 111 private Bgzf bgzf; 112 private Kstring line; 113 private const(char)* fn; 114 this(Bgzf bgzf, const(char)* fn){ 115 this.bgzf = copyBgzf(bgzf, fn); 116 this.fn = fn; 117 line = initKstring; 118 ks_initialize(line); 119 popFront; 120 } 121 /// InputRange interface 122 bool empty=false; 123 void popFront() 124 { 125 126 ks_clear(this.line); 127 // int bgzf_getline(BGZF *fp, int delim, kstring_t *str); 128 immutable int res = bgzf_getline(this.bgzf, cast(int)'\n', this.line); 129 this.empty=(res < 0 ? true : false); 130 } 131 /// ditto 132 string front() 133 { 134 auto ret = fromStringz(this.line.s).idup; 135 return ret; 136 } 137 138 BGZFRange save() 139 { 140 BGZFRange newRange; 141 newRange.fn = fn; 142 newRange.bgzf = Bgzf(copyBgzf(bgzf, fn)); 143 newRange.line = Kstring(copyKstring(this.line)); 144 return newRange; 145 } 146 } 147 hseek(bgzf.fp, cast(off_t) 0, SEEK_SET); 148 return BGZFRange(this.bgzf, this.fn); 149 } 150 } 151 152 /// 153 debug(dhtslib_unittest) unittest 154 { 155 import std.stdio; 156 import htslib.hts_log; 157 import std.algorithm : map; 158 import std.array : array; 159 import std.path : buildPath,dirName; 160 import std.range : drop; 161 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 162 hts_log_info(__FUNCTION__, "Testing BGZFile"); 163 hts_log_info(__FUNCTION__, "Loading test file"); 164 165 auto bg = BGZFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","fastqs.fq")); 166 auto f = File(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","fastqs.fq")); 167 assert(f.byLineCopy.array.length == 500); 168 assert(bg.byLineCopy.array.length == 500); 169 assert(bg.byLineCopy.array.length == 500); 170 assert(bg.byLine.array.length == 500); 171 172 173 auto range = bg.byLineCopy; 174 auto range1 = range.save; 175 range = range.drop(200); 176 auto range2 = range.save; 177 range = range.drop(200); 178 auto range3 = range.save; 179 180 assert(range1.array.length == 500); 181 assert(range2.array.length == 300); 182 assert(range3.array.length == 100); 183 184 auto range4 = bg.byLine; 185 auto range5 = range4.save; 186 range4 = range4.drop(200); 187 auto range6 = range4.save; 188 range4 = range4.drop(200); 189 auto range7 = range4.save; 190 191 assert(range5.array.length == 500); 192 assert(range6.array.length == 300); 193 assert(range7.array.length == 100); 194 // assert(bg.array == ["122333444455555"]); 195 } 196 197 /** 198 Range that allows reading a record based format via BGZFile. 199 Needs a record type that encompasses only one line of text. 200 Rectype could be GFF3Record, BedRecord ... 201 This is a sister struct to dhtslib.tabix.RecordReaderRegion. 202 */ 203 struct RecordReader(RecType) 204 { 205 /// file reader 206 BGZFile file; 207 /// file reader range 208 ReturnType!(this.initializeRange) range; 209 /// keep the header 210 string header; 211 212 bool emptyLine = false; 213 214 /// string filename ctor 215 this(string fn) 216 { 217 this.file = BGZFile(fn); 218 this.range = this.initializeRange; 219 while(!this.range.empty && this.range.front.length > 0 && this.range.front[0] == '#') 220 { 221 header ~= this.range.front ~ "\n"; 222 this.range.popFront; 223 } 224 if(this.header.length > 0) this.header = this.header[0 .. $-1]; 225 } 226 227 /// copy the BGZFile.byLineCopy range 228 auto initializeRange() 229 { 230 return this.file.byLineCopy.inputRangeObject; 231 } 232 233 /// returns RecType 234 RecType front() 235 { 236 return RecType(this.range.front); 237 } 238 239 /// move the range 240 void popFront() 241 { 242 this.range.popFront; 243 if(this.range.front == "") this.emptyLine = true; 244 } 245 246 /// is range done 247 auto empty() 248 { 249 return this.emptyLine || this.range.empty; 250 } 251 }