1 /**
2 This module provides a wrapper, `IndexedFastaFile` over a FASTA.
3 If an index exists, it will be used for rapid random access.
4 If an index does not exist, one will be built.
5 
6 The wrapper provides the ability to list sequence names (i.e., chromosomes/contigs)
7 in the FASTA, efficiently retrieve sequences (by contig, start, end)
8 
9 Sequence caching and multithreaded BGZF decompression are supported.
10 */
11 
12 module dhtslib.faidx;
13 
14 import std.string;
15 import std.typecons : Tuple;
16 import core.stdc.stdlib : malloc, free;
17 
18 import dhtslib.coordinates;
19 import dhtslib.memory;
20 import dhtslib.util;
21 import htslib.faidx;
22 
23 /** Build index for a FASTA or bgzip-compressed FASTA file.
24 
25     @param  fn  FASTA file name
26     @param  fnfai Name of .fai file to build.
27     @param  fngzi Name of .gzi file to build (if fn is bgzip-compressed).
28     @return     0 on success; or -1 on failure
29 
30 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
31 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file.  The GZI
32 file will only be built if fn is bgzip-compressed.
33 */
34 bool buildFastaIndex(string fn, string fnfai = "", string fngzi = "")
35 {
36     // int fai_build3(const char *fn, const char *fnfai, const char *fngzi);
37     const int ret = fai_build3( toStringz(fn),
38                             fnfai ? toStringz(fnfai) : null,
39                             fngzi ? toStringz(fngzi) : null);
40     
41     if (ret == 0) return true;
42     if (ret == -1) return false;
43     assert(0);
44 }
45 
46 /** FASTA file with .fai or .gzi index
47 
48 Reads existing FASTA file, optionally creating FASTA index if one does not exist.
49 
50 Convenient member fns to get no. of sequences, get sequence names and lengths,
51 test for membership, and rapidly fetch sequence at offset.
52 */
53 struct IndexedFastaFile {
54 
55     private Faidx faidx;
56     
57     /// construct from filename, optionally creating index if it does not exist
58     /// throws Exception (TODO: remove) if file DNE, or if index DNE unless create->true
59     this(string fn, bool create=false)
60     {
61         if (create) {
62             this.faidx = Faidx(fai_load3( toStringz(fn), null, null, fai_load_options.FAI_CREATE));
63             if (this.faidx is null) throw new Exception("Unable to load or create the FASTA index.");
64         }
65         else {
66             this.faidx = Faidx(fai_load3( toStringz(fn) , null, null, 0));
67             if (this.faidx is null) throw new Exception("Unable to load the FASTA index.");
68         }
69     }
70 
71     /// Enable BGZF cacheing (size: bytes)
72     void setCacheSize(int size)
73     {
74         fai_set_cache_size(this.faidx, size);
75     }
76 
77     /// Enable multithreaded decompression of this FA/FQ
78     /// Reading fn body of bgzf_mt, this actually ADDS threads (rather than setting)
79     /// but we'll retain name for consistency with setCacheSize
80     /// NB: IN A REAL-WORLD TEST (swiftover) CALLING setThreads(1) doubled runtime(???)
81     deprecated("disabled until faidx_t again made non-opaque")
82     void setThreads(int nthreads)
83     {
84         import htslib.bgzf : BGZF, bgzf_mt;
85         // third parameter, n_sub_blks is not used in htslib 1.9; .h suggests value 64-256
86         //bgzf_mt(this.faidx.bgzf, nthreads, 64);
87     }
88 
89     private struct OffsetType
90     {
91         ptrdiff_t offset;
92         alias offset this;
93 
94         // supports e.g. $ - x
95         OffsetType opBinary(string s, T)(T val)
96         {
97             mixin("return OffsetType(offset " ~ s ~ " val);");
98         }
99 
100         invariant
101         {
102             assert(this.offset <= 0, "Offset from end should be zero or negative");
103         }
104     }
105     /** Array-end `$` indexing hack courtesy of Steve Schveighoffer
106         https://forum.dlang.org/post/rl7a56$nad$1@digitalmars.com
107 
108         Requires in addition to opDollar returning a bespoke non-integral type
109         a series of overloads for opIndex and opSlice taking this type
110     */
111     OffsetType opDollar(size_t dim)() if(dim == 1)
112     {
113         return OffsetType.init;
114     }
115     /// opSlice as Coordinate and an offset
116     /// i.e [ZB(2) .. $]
117     auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, OffsetType off) if(dim == 1)
118     {
119         return Tuple!(Coordinate!bs, OffsetType)(start, off);
120     }
121     /// opSlice as two offset
122     /// i.e [$-2 .. $]
123     auto opSlice(size_t dim)(OffsetType start, OffsetType end) if(dim == 1)
124     {
125         return Tuple!(OffsetType, OffsetType)(start, end);
126     }
127     /// opIndex coordinate and Offset
128     /// i.e fai["chrom1", ZB(1) .. $]
129     auto opIndex(Basis bs)(string ctg, Tuple!(Coordinate!bs, OffsetType) coords)
130     {
131         auto end = this.seqLen(ctg) + coords[1];
132         auto endCoord = ZB(end);
133         auto newEndCoord = endCoord.to!bs;
134         auto c = Interval!(getCoordinateSystem!(bs,End.open))(coords[0], newEndCoord);
135         return fetchSequence(ctg, c);
136     }
137 
138     /// opIndex two Offsets
139     /// i.e fai["chrom1", $-2 .. $]
140     auto opIndex(string ctg, Tuple!(OffsetType, OffsetType) coords)
141     {
142         auto start = this.seqLen(ctg) + coords[0];
143         auto end = this.seqLen(ctg) + coords[1];
144         auto c = ZBHO(start, end);
145         return fetchSequence(ctg, c);
146     }
147     /// opIndex one offset
148     /// i.e fai["chrom1", $-1]
149     auto opIndex(string ctg, OffsetType endoff)
150     {
151         auto end = this.seqLen(ctg) + endoff.offset;
152         auto coords = Interval!(CoordSystem.zbho)(end, end + 1);
153         return fetchSequence(ctg, coords);
154     }
155 
156     /// Fetch sequence in region by assoc array-style lookup:
157     /// Uses htslib string region parsing
158     /// `string sequence = fafile["chr2:20123-30456"]`
159     auto opIndex(string region)
160     {
161         auto coords = getIntervalFromString(region);
162         return fetchSequence(coords.contig, coords.interval);
163     }
164 
165     /// Fetch sequence in region by multidimensional slicing:
166     /// `string sequence = fafile["chr2", 20123 .. 30456]`
167     ///
168     /// Sadly, $ to represent max length is not supported
169     auto opIndex(CoordSystem cs)(string contig, Interval!cs coords)
170     {
171         return fetchSequence(contig, coords);
172     }
173     
174     /// ditto
175     auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, Coordinate!bs end) if (dim == 1)
176     in { assert(start >= 0); assert(start <= end); }
177     do
178     {
179         auto coords = Interval!(getCoordinateSystem!(bs, End.open))(start, end);
180         return coords;
181     }
182 
183     /// Fetch sequencing in a region by function call with contig, start, end
184     /// `string sequence = fafile.fetchSequence("chr2", 20123, 30456)`
185     string fetchSequence(CoordSystem cs)(string contig, Interval!cs coords)
186     {
187         char *fetchedSeq;
188         long fetchedLen;
189 
190         // Convert given coordinates in system cs to zero-based, closed (faidx_fetch_seq)
191         auto newcoords = coords.to!(CoordSystem.zbc);
192         /* htslib API for my reference:
193          *
194          * char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len);
195          * @param  fai  Pointer to the faidx_t struct
196          * @param  c_name Region name
197          * @param  p_beg_i  Beginning position number (zero-based)
198          * @param  p_end_i  End position number (zero-based)
199          * @param  len  Length of the region; -2 if c_name not present, -1 general error
200          * @return      Pointer to the sequence; null on failure
201          *  The returned sequence is allocated by `malloc()` family and should be destroyed
202          *  by end users by calling `free()` on it.
203          *
204         */
205         fetchedSeq = faidx_fetch_seq64(this.faidx, toStringz(contig), newcoords.start, newcoords.end, &fetchedLen);
206         
207         if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error");
208         else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found");
209 
210         string seq = fromStringz(fetchedSeq).idup;
211         free(fetchedSeq);
212 
213         assert(seq.length == fetchedLen);
214         return seq;
215     }
216 
217     /// Test whether the FASTA file/index contains string seqname
218     bool hasSeq(const(char)[] seqname)
219     {
220         // int faidx_has_seq(const faidx_t *fai, const char *seq);
221         return cast(bool) faidx_has_seq(this.faidx, toStringz(seqname) );
222     }
223 
224     /// Return the number of sequences in the FASTA file/index
225     @property auto nSeq()
226     {
227         return faidx_nseq(this.faidx);
228     }
229 
230     /// Return the name of the i'th sequence
231     string seqName(int i)
232     {
233         // const(char) *faidx_iseq(const faidx_t *fai, int i);
234         // TODO determine if this property is zero indexed or one indexed
235         if (i > this.nSeq) throw new Exception("seqName: sequece number not present");
236 
237         return fromStringz( faidx_iseq(this.faidx, i) ).idup;
238     }
239 
240     /// Return sequence length, -1 if not present
241     /// NOTE: there is no 64 bit equivalent of this function (yet) in htslib-1.10
242     int seqLen(const(char)[] seqname)
243     {
244         // TODO should I check for -1 and throw exception or pass to caller?
245         // int faidx_seq_len(const faidx_t *fai, const char *seq);
246         const int l = faidx_seq_len(this.faidx, toStringz(seqname) );
247         if ( l == -1 ) throw new Exception("seqLen: sequence name not found");
248         return l;
249     }
250 }
251 
252 debug(dhtslib_unittest) unittest
253 {
254     import dhtslib.sam;
255     import htslib.hts_log;
256     import std.path : buildPath, dirName;
257 
258     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
259     hts_log_info(__FUNCTION__, "Testing IndexedFastaFile");
260     hts_log_info(__FUNCTION__, "Loading test file");
261     auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa"));
262 
263     fai.setCacheSize(4000000);
264 
265     assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(0, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
266     assert(fai.fetchSequence("CHROMOSOME_I",OBHO(1, 30)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
267     assert(fai.fetchSequence("CHROMOSOME_I",ZBC(0, 28)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
268     assert(fai.fetchSequence("CHROMOSOME_I",OBC(1, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
269 
270     assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
271     assert(fai.fetchSequence("CHROMOSOME_I",OBHO(2, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
272     assert(fai.fetchSequence("CHROMOSOME_I",ZBC(1, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
273     assert(fai.fetchSequence("CHROMOSOME_I",OBC(2, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA");
274 
275     assert(fai["CHROMOSOME_I",ZB(0) .. ZB(29)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
276     import std.stdio;
277     writeln(fai["CHROMOSOME_I",OB(1) .. OB(30)]);
278     assert(fai["CHROMOSOME_I",OB(1) .. OB(30)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA");
279 
280     assert(fai.seqLen("CHROMOSOME_I") == 1009800);
281     assert(fai.nSeq == 7);
282 
283     assert(fai.hasSeq("CHROMOSOME_I"));
284     assert(!fai.hasSeq("CHROMOSOME_"));
285 
286     assert(fai.seqName(0) == "CHROMOSOME_I");
287 }
288 
289 debug(dhtslib_unittest) unittest
290 {
291     import dhtslib.sam;
292     import htslib.hts_log;
293     import std.path : buildPath, dirName;
294 
295     hts_set_log_level(htsLogLevel.HTS_LOG_INFO);
296     hts_log_info(__FUNCTION__, "Testing IndexedFastaFile");
297     hts_log_info(__FUNCTION__, "Loading test file");
298     auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa"));
299 
300     fai.setCacheSize(4000000);
301 
302     assert(fai.fetchSequence("CHROMOSOME_II",ZBHO(0, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
303     assert(fai.fetchSequence("CHROMOSOME_II",OBHO(1, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
304     assert(fai.fetchSequence("CHROMOSOME_II",ZBC(0, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
305     assert(fai.fetchSequence("CHROMOSOME_II",OBC(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
306     assert(fai["CHROMOSOME_II:1-29"] == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA");
307     // "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"
308     import std.stdio;
309     assert(fai["CHROMOSOME_II",$-50 .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG");
310     
311     assert(fai["CHROMOSOME_II",OB(4951) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG");
312     assert(fai["CHROMOSOME_II",ZB(4950) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG");
313     assert(fai["CHROMOSOME_II",$-1] == "G");
314 }