1 /**
2 This module provides a wrapper, `IndexedFastaFile` over a FASTA.
3 If an index exists, it will be used for rapid random access.
4 If an index does not exist, one will be built.
5 
6 The wrapper provides the ability to list sequence names (i.e., chromosomes/contigs)
7 in the FASTA, efficiently retrieve sequences (by contig, start, end)
8 
9 Sequence caching and multithreaded BGZF decompression are supported.
10 */
11 
12 module dhtslib.faidx;
13 
14 import std.string;
15 import std.typecons : Tuple;
16 import core.stdc.stdlib : malloc, free;
17 
18 import dhtslib.coordinates;
19 import dhtslib.memory;
20 import dhtslib.util;
21 import htslib.faidx;
22 
23 /** Build index for a FASTA or bgzip-compressed FASTA file.
24 
25     @param  fn  FASTA file name
26     @param  fnfai Name of .fai file to build.
27     @param  fngzi Name of .gzi file to build (if fn is bgzip-compressed).
28     @return     0 on success; or -1 on failure
29 
30 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
31 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file.  The GZI
32 file will only be built if fn is bgzip-compressed.
33 */
34 bool buildFastaIndex(string fn, string fnfai = "", string fngzi = "")
35 {
36     // int fai_build3(const char *fn, const char *fnfai, const char *fngzi);
37     const int ret = fai_build3( toStringz(fn),
38                             fnfai ? toStringz(fnfai) : null,
39                             fngzi ? toStringz(fngzi) : null);
40     
41     if (ret == 0) return true;
42     if (ret == -1) return false;
43     assert(0);
44 }
45 
46 /** FASTA file with .fai or .gzi index
47 
48 Reads existing FASTA file, optionally creating FASTA index if one does not exist.
49 
50 Convenient member fns to get no. of sequences, get sequence names and lengths,
51 test for membership, and rapidly fetch sequence at offset.
52 */
53 struct IndexedFastaFile {
54 
55     private Faidx faidx;
56     
57     /// construct from filename, optionally creating index if it does not exist
58     /// throws Exception (TODO: remove) if file DNE, or if index DNE unless create->true
59     this(string fn, bool create=false)
60     {
61         if (create) {
62             this.faidx = Faidx(fai_load3( toStringz(fn), null, null, fai_load_options.FAI_CREATE));
63             if (this.faidx is null) throw new Exception("Unable to load or create the FASTA index.");
64         }
65         else {
66             this.faidx = Faidx(fai_load3( toStringz(fn) , null, null, 0));
67             if (this.faidx is null) throw new Exception("Unable to load the FASTA index.");
68         }
69     }
70 
71     /// Explicit postblit to avoid 
72     /// https://github.com/blachlylab/dhtslib/issues/122
73     this(this)
74     {
75         this.faidx = faidx;
76     }
77 
78     /// Enable BGZF cacheing (size: bytes)
79     void setCacheSize(int size)
80     {
81         fai_set_cache_size(this.faidx, size);
82     }
83 
84     /// Enable multithreaded decompression of this FA/FQ
85     /// Reading fn body of bgzf_mt, this actually ADDS threads (rather than setting)
86     /// but we'll retain name for consistency with setCacheSize
87     /// NB: IN A REAL-WORLD TEST (swiftover) CALLING setThreads(1) doubled runtime(???)
88     deprecated("disabled until faidx_t again made non-opaque")
89     void setThreads(int nthreads)
90     {
91         import htslib.bgzf : BGZF, bgzf_mt;
92         // third parameter, n_sub_blks is not used in htslib 1.9; .h suggests value 64-256
93         //bgzf_mt(this.faidx.bgzf, nthreads, 64);
94     }
95 
96     private struct OffsetType
97     {
98         ptrdiff_t offset;
99         alias offset this;
100 
101         // supports e.g. $ - x
102         OffsetType opBinary(string s, T)(T val)
103         {
104             mixin("return OffsetType(offset " ~ s ~ " val);");
105         }
106 
107         invariant
108         {
109             assert(this.offset <= 0, "Offset from end should be zero or negative");
110         }
111     }
112     /** Array-end `$` indexing hack courtesy of Steve Schveighoffer
113         https://forum.dlang.org/post/rl7a56$nad$1@digitalmars.com
114 
115         Requires in addition to opDollar returning a bespoke non-integral type
116         a series of overloads for opIndex and opSlice taking this type
117     */
118     OffsetType opDollar(size_t dim)() if(dim == 1)
119     {
120         return OffsetType.init;
121     }
122     /// opSlice as Coordinate and an offset
123     /// i.e [ZB(2) .. $]
124     auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, OffsetType off) if(dim == 1)
125     {
126         return Tuple!(Coordinate!bs, OffsetType)(start, off);
127     }
128     /// opSlice as two offset
129     /// i.e [$-2 .. $]
130     auto opSlice(size_t dim)(OffsetType start, OffsetType end) if(dim == 1)
131     {
132         return Tuple!(OffsetType, OffsetType)(start, end);
133     }
134     /// opIndex coordinate and Offset
135     /// i.e fai["chrom1", ZB(1) .. $]
136     auto opIndex(Basis bs)(string ctg, Tuple!(Coordinate!bs, OffsetType) coords)
137     {
138         auto end = this.seqLen(ctg) + coords[1];
139         auto endCoord = ZB(end);
140         auto newEndCoord = endCoord.to!bs;
141         auto c = Interval!(getCoordinateSystem!(bs,End.open))(coords[0], newEndCoord);
142         return fetchSequence(ctg, c);
143     }
144 
145     /// opIndex two Offsets
146     /// i.e fai["chrom1", $-2 .. $]
147     auto opIndex(string ctg, Tuple!(OffsetType, OffsetType) coords)
148     {
149         auto start = this.seqLen(ctg) + coords[0];
150         auto end = this.seqLen(ctg) + coords[1];
151         auto c = ZBHO(start, end);
152         return fetchSequence(ctg, c);
153     }
154     /// opIndex one offset
155     /// i.e fai["chrom1", $-1]
156     auto opIndex(string ctg, OffsetType endoff)
157     {
158         auto end = this.seqLen(ctg) + endoff.offset;
159         auto coords = Interval!(CoordSystem.zbho)(end, end + 1);
160         return fetchSequence(ctg, coords);
161     }
162 
163     /// Fetch sequence in region by assoc array-style lookup:
164     /// Uses htslib string region parsing
165     /// `string sequence = fafile["chr2:20123-30456"]`
166     auto opIndex(string region)
167     {
168         auto coords = getIntervalFromString(region);
169         return fetchSequence(coords.contig, coords.interval);
170     }
171 
172     /// Fetch sequence in region by multidimensional slicing:
173     /// `string sequence = fafile["chr2", 20123 .. 30456]`
174     ///
175     /// Sadly, $ to represent max length is not supported
176     auto opIndex(CoordSystem cs)(string contig, Interval!cs coords)
177     {
178         return fetchSequence(contig, coords);
179     }
180     
181     /// ditto
182     auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, Coordinate!bs end) if (dim == 1)
183     in { assert(start >= 0); assert(start <= end); }
184     do
185     {
186         auto coords = Interval!(getCoordinateSystem!(bs, End.open))(start, end);
187         return coords;
188     }
189 
190     /// Fetch sequencing in a region by function call with contig, start, end
191     /// `string sequence = fafile.fetchSequence("chr2", 20123, 30456)`
192     string fetchSequence(CoordSystem cs)(string contig, Interval!cs coords)
193     {
194         char *fetchedSeq;
195         long fetchedLen;
196 
197         // Convert given coordinates in system cs to zero-based, closed (faidx_fetch_seq)
198         auto newcoords = coords.to!(CoordSystem.zbc);
199         /* htslib API for my reference:
200          *
201          * char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len);
202          * @param  fai  Pointer to the faidx_t struct
203          * @param  c_name Region name
204          * @param  p_beg_i  Beginning position number (zero-based)
205          * @param  p_end_i  End position number (zero-based)
206          * @param  len  Length of the region; -2 if c_name not present, -1 general error
207          * @return      Pointer to the sequence; null on failure
208          *  The returned sequence is allocated by `malloc()` family and should be destroyed
209          *  by end users by calling `free()` on it.
210          *
211         */
212         fetchedSeq = faidx_fetch_seq64(this.faidx, toStringz(contig), newcoords.start, newcoords.end, &fetchedLen);
213         
214         if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error");
215         else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found");
216 
217         string seq = fromStringz(fetchedSeq).idup;
218         free(fetchedSeq);
219 
220         assert(seq.length == fetchedLen);
221         return seq;
222     }
223 
224     /// Test whether the FASTA file/index contains string seqname
225     bool hasSeq(const(char)[] seqname)
226     {
227         // int faidx_has_seq(const faidx_t *fai, const char *seq);
228         return cast(bool) faidx_has_seq(this.faidx, toStringz(seqname) );
229     }
230 
231     /// Return the number of sequences in the FASTA file/index
232     @property auto nSeq()
233     {
234         return faidx_nseq(this.faidx);
235     }
236 
237     /// Return the name of the i'th sequence
238     string seqName(int i)
239     {
240         // const(char) *faidx_iseq(const faidx_t *fai, int i);
241         // TODO determine if this property is zero indexed or one indexed
242         if (i > this.nSeq) throw new Exception("seqName: sequece number not present");
243 
244         return fromStringz( faidx_iseq(this.faidx, i) ).idup;
245     }
246 
247     /// Return sequence length, -1 if not present
248     /// NOTE: there is no 64 bit equivalent of this function (yet) in htslib-1.10
249     int seqLen(const(char)[] seqname)
250     {
251         // TODO should I check for -1 and throw exception or pass to caller?
252         // int faidx_seq_len(const faidx_t *fai, const char *seq);
253         const int l = faidx_seq_len(this.faidx, toStringz(seqname) );
254         if ( l == -1 ) throw new Exception("seqLen: sequence name not found");
255         return l;
256     }
257 }
258 
259 debug(dhtslib_unittest) unittest
260 {
261     import dhtslib.sam;
262     import htslib.hts_log;
263     import std.path : buildPath, dirName;
264 
265     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
266     hts_log_info(__FUNCTION__, "Testing IndexedFastaFile");
267     hts_log_info(__FUNCTION__, "Loading test file");
268     auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa"));
269 
270     fai.setCacheSize(4000000);
271 
272     assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(0, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
273     assert(fai.fetchSequence("CHROMOSOME_I",OBHO(1, 30)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
274     assert(fai.fetchSequence("CHROMOSOME_I",ZBC(0, 28)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
275     assert(fai.fetchSequence("CHROMOSOME_I",OBC(1, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
276 
277     assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
278     assert(fai.fetchSequence("CHROMOSOME_I",OBHO(2, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
279     assert(fai.fetchSequence("CHROMOSOME_I",ZBC(1, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
280     assert(fai.fetchSequence("CHROMOSOME_I",OBC(2, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
281 
282     assert(fai["CHROMOSOME_I",ZB(0) .. ZB(29)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
283     import std.stdio;
284     writeln(fai["CHROMOSOME_I",OB(1) .. OB(30)]);
285     assert(fai["CHROMOSOME_I",OB(1) .. OB(30)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
286 
287     assert(fai.seqLen("CHROMOSOME_I") == 1009800);
288     assert(fai.nSeq == 7);
289 
290     assert(fai.hasSeq("CHROMOSOME_I"));
291     assert(!fai.hasSeq("CHROMOSOME_"));
292 
293     assert(fai.seqName(0) == "CHROMOSOME_I");
294 }
295 
296 debug(dhtslib_unittest) unittest
297 {
298     import dhtslib.sam;
299     import htslib.hts_log;
300     import std.path : buildPath, dirName;
301 
302     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
303     hts_log_info(__FUNCTION__, "Testing IndexedFastaFile");
304     hts_log_info(__FUNCTION__, "Loading test file");
305     auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa"));
306 
307     fai.setCacheSize(4000000);
308 
309     assert(fai.fetchSequence("CHROMOSOME_II",ZBHO(0, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
310     assert(fai.fetchSequence("CHROMOSOME_II",OBHO(1, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
311     assert(fai.fetchSequence("CHROMOSOME_II",ZBC(0, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
312     assert(fai.fetchSequence("CHROMOSOME_II",OBC(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
313     assert(fai["CHROMOSOME_II:1-29"] == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
314     // "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"
315     import std.stdio;
316     assert(fai["CHROMOSOME_II",$-50 .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG");
317     
318     assert(fai["CHROMOSOME_II",OB(4951) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG");
319     assert(fai["CHROMOSOME_II",ZB(4950) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG");
320     assert(fai["CHROMOSOME_II",$-1] == "G");
321 }