1 /**
2 This module provides a wrapper, BGZFile, over an htslib BGZF compressed file/stream.
3 The wrapper acts as a linewise ForwardRange over the file or stream.
4 */
5 
6 
7 module dhtslib.bgzf;
8 
9 import core.stdc.stdlib : malloc, free;
10 import std.zlib;
11 import core.stdc.stdio : SEEK_SET;
12 import std.parallelism : totalCPUs;
13 import std.stdio : writeln, writefln;
14 import std.string : fromStringz, toStringz;
15 import std.range : inputRangeObject, InputRangeObject;
16 import std.traits : ReturnType;
17 
18 import dhtslib.memory;
19 import dhtslib.util;
20 
21 import htslib.bgzf;
22 import htslib.hfile: hseek, off_t;
23 import htslib.kstring;
24 
25 /**
26 Encapsulates a bgzipped (block gzipped) file.
27 Implements InputRange interface using htslib calls to bgzf_getline().
28 */
29 struct BGZFile {
30 
31     /// filename; reference needed to avoid GC reaping result of toStringz when ctor goes out of scope
32     private const(char)* fn;
33 
34     /// htslib data structure representing the BGZF compressed file/stream fp
35     private Bgzf bgzf;
36 
37 
38     /// Open filename `fn` for reading
39     this(string fn)
40     {
41         debug(dhtslib_debug) { writeln("BGZFile ctor"); }
42 
43         // open file
44         this.fn = toStringz(fn);
45         this.bgzf = Bgzf(bgzf_open(this.fn, "r"));
46 
47         // enable multi-threading
48         // (only effective if library was compiled with -DBGZF_MT)
49         // int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks);
50         // n_sub_blks : blocks per thread; 64-256 recommended
51         if(totalCPUs > 1) {
52             immutable int ret = bgzf_mt(this.bgzf, totalCPUs, 64);
53             debug(dhtslib_debug) {
54                 writefln("Total CPUs: %d", totalCPUs);
55                 writefln("bgzf_mt() -> %d", ret);
56             }
57         }
58     }
59 
60     /// Returns a foward range of each line
61     /// the internal buffer is reused so you must copy 
62     /// if you want it to stick around
63     auto byLine(){
64         struct BGZFRange
65         {
66             private Bgzf bgzf;
67             private Kstring line;
68             private const(char)* fn;
69             this(Bgzf bgzf, const(char)* fn){
70                 this.bgzf = copyBgzf(bgzf, fn);
71                 this.fn = fn;
72                 line = initKstring;
73                 ks_initialize(line);
74                 popFront;
75             }
76             /// InputRange interface
77             bool empty=false;
78             void popFront()
79             {
80 
81                 ks_clear(this.line);
82                 
83                 // int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
84                 immutable int res = bgzf_getline(this.bgzf, cast(int)'\n', this.line);
85                 this.empty=(res < 0 ? true : false);
86             }
87             /// ditto
88             char[] front()
89             {
90                 auto ret = fromStringz(this.line.s);
91                 return ret;
92             }
93             BGZFRange save()
94             {
95                 BGZFRange newRange;
96                 newRange.fn = fn;
97                 newRange.bgzf = Bgzf(copyBgzf(bgzf, fn));
98                 newRange.line = Kstring(copyKstring(this.line));
99                 return newRange;
100             }
101         }
102         hseek(bgzf.fp, cast(off_t) 0, SEEK_SET);
103         return BGZFRange(this.bgzf, this.fn);
104     }
105 
106     /// Returns a foward range of each line
107     /// same as above but we copy for you
108     auto byLineCopy(){
109         struct BGZFRange
110         {
111             private Bgzf bgzf;
112             private Kstring line;
113             private const(char)* fn;
114             this(Bgzf bgzf, const(char)* fn){
115                 this.bgzf = copyBgzf(bgzf, fn);
116                 this.fn = fn;
117                 line = initKstring;
118                 ks_initialize(line);
119                 popFront;
120             }
121             /// InputRange interface
122             bool empty=false;
123             void popFront()
124             {
125 
126                 ks_clear(this.line);
127                 // int bgzf_getline(BGZF *fp, int delim, kstring_t *str);
128                 immutable int res = bgzf_getline(this.bgzf, cast(int)'\n', this.line);
129                 this.empty=(res < 0 ? true : false);
130             }
131             /// ditto
132             string front()
133             {
134                 auto ret = fromStringz(this.line.s).idup;
135                 return ret;
136             }
137 
138             BGZFRange save()
139             {
140                 BGZFRange newRange;
141                 newRange.fn = fn;
142                 newRange.bgzf = Bgzf(copyBgzf(bgzf, fn));
143                 newRange.line = Kstring(copyKstring(this.line));
144                 return newRange;
145             }
146         }
147         hseek(bgzf.fp, cast(off_t) 0, SEEK_SET);
148         return BGZFRange(this.bgzf, this.fn);
149     }
150 }
151 
152 ///
153 debug(dhtslib_unittest) unittest
154 {
155     import std.stdio;
156     import htslib.hts_log;
157     import std.algorithm : map;
158     import std.array : array;
159     import std.path : buildPath,dirName;
160     import std.range : drop;
161     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
162     hts_log_info(__FUNCTION__, "Testing BGZFile");
163     hts_log_info(__FUNCTION__, "Loading test file");
164 
165     auto bg = BGZFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","fastqs.fq"));
166     auto f = File(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","fastqs.fq"));
167     assert(f.byLineCopy.array.length == 500);
168     assert(bg.byLineCopy.array.length == 500);
169     assert(bg.byLineCopy.array.length == 500);
170     assert(bg.byLine.array.length == 500);
171 
172     
173     auto range = bg.byLineCopy;
174     auto range1 = range.save;
175     range = range.drop(200);
176     auto range2 = range.save;
177     range = range.drop(200);
178     auto range3 = range.save;
179 
180     assert(range1.array.length == 500);
181     assert(range2.array.length == 300);
182     assert(range3.array.length == 100);
183 
184     auto range4 = bg.byLine;
185     auto range5 = range4.save;
186     range4 = range4.drop(200);
187     auto range6 = range4.save;
188     range4 = range4.drop(200);
189     auto range7 = range4.save;
190 
191     assert(range5.array.length == 500);
192     assert(range6.array.length == 300);
193     assert(range7.array.length == 100);
194     // assert(bg.array == ["122333444455555"]);
195 }
196 
197 /**
198     Range that allows reading a record based format via BGZFile.
199     Needs a record type that encompasses only one line of text.
200     Rectype could be GFF3Record, BedRecord ...
201     This is a sister struct to dhtslib.tabix.RecordReaderRegion.
202 */
203 struct RecordReader(RecType)
204 {
205     /// file reader
206     BGZFile file;
207     /// file reader range
208     ReturnType!(this.initializeRange) range;
209     /// keep the header
210     string header;
211 
212     bool emptyLine = false;
213 
214     /// string filename ctor
215     this(string fn)
216     {
217         this.file = BGZFile(fn);
218         this.range = this.initializeRange;
219         while(!this.range.empty && this.range.front.length > 0 && this.range.front[0] == '#')
220         {
221             header ~= this.range.front ~ "\n";
222             this.range.popFront;
223         }
224         if(this.header.length > 0) this.header = this.header[0 .. $-1];
225     }
226 
227     /// copy the BGZFile.byLineCopy range
228     auto initializeRange()
229     {
230         return this.file.byLineCopy.inputRangeObject;
231     }
232 
233     /// returns RecType
234     RecType front()
235     {
236         return RecType(this.range.front);
237     }
238 
239     /// move the range
240     void popFront()
241     {
242         this.range.popFront;
243         if(this.range.front == "") this.emptyLine = true;
244     }
245 
246     /// is range done
247     auto empty()
248     {
249         return this.emptyLine || this.range.empty;
250     }
251 }