1 module dhtslib.faidx;
2 
3 import std.string;
4 import core.stdc.stdlib : malloc, free;
5 
6 import dhtslib.htslib.faidx;
7 
8 /** Coodinate Systems, since htslib sequence fetching methods surprisingly use zero-based, closed */
9 enum CoordSystem
10 {
11     zbho = 0,   /// zero-based, half-open
12     zbc,        /// zero-based, closed (behavior of faidx_fetch_seq)
13     obho,       /// one-based, half-open
14     obc         /// one-based, closed
15 }
16 
17 /** Build index for a FASTA or bgzip-compressed FASTA file.
18 
19     @param  fn  FASTA file name
20     @param  fnfai Name of .fai file to build.
21     @param  fngzi Name of .gzi file to build (if fn is bgzip-compressed).
22     @return     0 on success; or -1 on failure
23 
24 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
25 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file.  The GZI
26 file will only be built if fn is bgzip-compressed.
27 */
28 bool buildFastaIndex(string fn, string fnfai = "", string fngzi = "")
29 {
30     // int fai_build3(const char *fn, const char *fnfai, const char *fngzi);
31     const int ret = fai_build3( toStringz(fn),
32                             fnfai ? toStringz(fnfai) : null,
33                             fngzi ? toStringz(fngzi) : null);
34     
35     if (ret == 0) return true;
36     if (ret == -1) return false;
37     assert(0);
38 }
39 
40 /** FASTA file with .fai or .gzi index
41 
42 Reads existing FASTA file, optionally creating FASTA index if one does not exist.
43 
44 Convenient member fns to get no. of sequences, get sequence names and lengths,
45 test for membership, and rapidly fetch sequence at offset.
46 */
47 struct IndexedFastaFile {
48 
49     private faidx_t *faidx;
50     private int refct;      // Postblit refcounting in case the object is passed around
51 
52     this(this)
53     {
54         refct++;
55     }
56     /// construct from filename, optionally creating index if it does not exist
57     /// throws Exception (TODO: remove) if file DNE, or if index DNE unless create->true
58     this(string fn, bool create=false)
59     {
60         if (create) {
61             this.faidx = fai_load3( toStringz(fn), null, null, fai_load_options.FAI_CREATE);
62             if (this.faidx is null) throw new Exception("Unable to load or create the FASTA index.");
63         }
64         else {
65             this.faidx = fai_load3( toStringz(fn) , null, null, 0);
66             if (this.faidx is null) throw new Exception("Unable to load the FASTA index.");
67         }
68         refct = 1;
69     }
70     ~this()
71     {
72         if (--refct == 0)
73             fai_destroy(this.faidx);
74     }
75 
76     /// Enable BGZF cacheing (size: bytes)
77     void setCacheSize(int size)
78     {
79         import dhtslib.htslib.bgzf : BGZF, bgzf_set_cache_size;
80         bgzf_set_cache_size(this.faidx.bgzf, size);
81     }
82 
83     /// Enable multithreaded decompression of this FA/FQ
84     /// Reading fn body of bgzf_mt, this actually ADDS threads (rather than setting)
85     /// but we'll retain name for consistency with setCacheSize
86     /// NB: IN A REAL-WORLD TEST (swiftover) CALLING setThreads(1) doubled runtime(???)
87     void setThreads(int nthreads)
88     {
89         import dhtslib.htslib.bgzf : BGZF, bgzf_mt;
90         // third parameter, n_sub_blks is not used in htslib 1.9; .h suggests value 64-256
91         bgzf_mt(this.faidx.bgzf, nthreads, 64);
92     }
93 
94     /// Fetch sequence in region by assoc array-style lookup:
95     /// `string sequence = fafile["chr2:20123-30456"]`
96     auto opIndex(string region)
97     {
98         return fetchSequence(region);
99     }
100 
101     /// Fetch sequence in region by assoc array-style lookup:
102     /// `string sequence = fafile.fetchSequence("chr2:20123-30456")`
103     string fetchSequence(string region)
104     {
105         char *fetchedSeq;
106         int fetchedLen;
107         // char *fai_fetch(const faidx_t *fai, const char *reg, int *len);
108         // @param  len  Length of the region; -2 if seq not present, -1 general error
109         fetchedSeq = fai_fetch(this.faidx, toStringz(region), &fetchedLen);
110 
111         string seq = fromStringz(fetchedSeq).idup;
112         free(fetchedSeq);
113 
114         if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error");
115         else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found");
116 
117         return seq;
118     }
119 
120     /// Fetch sequence in region by multidimensional slicing:
121     /// `string sequence = fafile["chr2", 20123 .. 30456]`
122     ///
123     /// Sadly, $ to represent max length is not supported
124     auto opIndex(string contig, int[2] pos)
125     {
126         return fetchSequence(contig, pos[0], pos[1]);
127     }
128     /// ditto
129     int[2] opSlice(size_t dim)(int start, int end) if (dim == 1)
130     in { assert(start >= 0); }
131     do
132     {
133         return [start, end];
134     }
135 
136     /// Fetch sequence in region by multidimensional slicing:
137     /// `string sequence = fafile.fetchSequence("chr2", 20123, 30456)`
138     string fetchSequence(CoordSystem cs = CoordSystem.zbho)(string contig, int start, int end)
139     {
140         char *fetchedSeq;
141         int fetchedLen;
142 
143         static if (cs == CoordSystem.zbho) {
144             end--;
145         }
146         else static if (cs == CoordSystem.zbc) {
147             // ok
148         }
149         else static if (cs == CoordSystem.obho) {
150             start--;
151             end--;
152         }
153         else static if (cs == CoordSystem.obc) {
154             start--;
155         }
156         /* htslib API for my reference:
157          *
158          * char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len);
159          * @param  fai  Pointer to the faidx_t struct
160          * @param  c_name Region name
161          * @param  p_beg_i  Beginning position number (zero-based)
162          * @param  p_end_i  End position number (zero-based)
163          * @param  len  Length of the region; -2 if c_name not present, -1 general error
164          * @return      Pointer to the sequence; null on failure
165          *  The returned sequence is allocated by `malloc()` family and should be destroyed
166          *  by end users by calling `free()` on it.
167          *
168         */
169         fetchedSeq = faidx_fetch_seq(this.faidx, toStringz(contig), start, end, &fetchedLen);
170         
171         string seq = fromStringz(fetchedSeq).idup;
172         free(fetchedSeq);
173 
174         if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error");
175         else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found");
176 
177         assert(seq.length == fetchedLen);
178         return seq;
179     }
180 
181     /// Test whether the FASTA file/index contains string seqname
182     bool hasSeq(const(char)[] seqname)
183     {
184         // int faidx_has_seq(const faidx_t *fai, const char *seq);
185         return cast(bool) faidx_has_seq(this.faidx, toStringz(seqname) );
186     }
187 
188     /// Return the number of sequences in the FASTA file/index
189     @property auto nSeq()
190     {
191         return faidx_nseq(this.faidx);
192     }
193 
194     /// Return the name of the i'th sequence
195     string seqName(int i)
196     {
197         // const(char) *faidx_iseq(const faidx_t *fai, int i);
198         // TODO determine if this property is zero indexed or one indexed
199         if (i > this.nSeq) throw new Exception("seqName: sequece number not present");
200 
201         return fromStringz( faidx_iseq(this.faidx, i) ).idup;
202     }
203 
204     /// Return sequence length, -1 if not present
205     int seqLen(const(char)[] seqname)
206     {
207         // TODO should I check for -1 and throw exception or pass to caller?
208         // int faidx_seq_len(const faidx_t *fai, const char *seq);
209         const int l = faidx_seq_len(this.faidx, toStringz(seqname) );
210         if ( l == -1 ) throw new Exception("seqLen: sequence name not found");
211         return l;
212     }
213 }