1 /** 2 This module provides a wrapper, `IndexedFastaFile` over a FASTA. 3 If an index exists, it will be used for rapid random access. 4 If an index does not exist, one will be built. 5 6 The wrapper provides the ability to list sequence names (i.e., chromosomes/contigs) 7 in the FASTA, efficiently retrieve sequences (by contig, start, end) 8 9 Sequence caching and multithreaded BGZF decompression are supported. 10 */ 11 12 module dhtslib.faidx; 13 14 import std.string; 15 import std.typecons : Tuple; 16 import core.stdc.stdlib : malloc, free; 17 18 import dhtslib.coordinates; 19 import dhtslib.memory; 20 import dhtslib.util; 21 import htslib.faidx; 22 23 /** Build index for a FASTA or bgzip-compressed FASTA file. 24 25 @param fn FASTA file name 26 @param fnfai Name of .fai file to build. 27 @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). 28 @return 0 on success; or -1 on failure 29 30 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 31 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI 32 file will only be built if fn is bgzip-compressed. 33 */ 34 bool buildFastaIndex(string fn, string fnfai = "", string fngzi = "") 35 { 36 // int fai_build3(const char *fn, const char *fnfai, const char *fngzi); 37 const int ret = fai_build3( toStringz(fn), 38 fnfai ? toStringz(fnfai) : null, 39 fngzi ? toStringz(fngzi) : null); 40 41 if (ret == 0) return true; 42 if (ret == -1) return false; 43 assert(0); 44 } 45 46 /** FASTA file with .fai or .gzi index 47 48 Reads existing FASTA file, optionally creating FASTA index if one does not exist. 49 50 Convenient member fns to get no. of sequences, get sequence names and lengths, 51 test for membership, and rapidly fetch sequence at offset. 52 */ 53 struct IndexedFastaFile { 54 55 private Faidx faidx; 56 57 /// construct from filename, optionally creating index if it does not exist 58 /// throws Exception (TODO: remove) if file DNE, or if index DNE unless create->true 59 this(string fn, bool create=false) 60 { 61 if (create) { 62 this.faidx = Faidx(fai_load3( toStringz(fn), null, null, fai_load_options.FAI_CREATE)); 63 if (this.faidx is null) throw new Exception("Unable to load or create the FASTA index."); 64 } 65 else { 66 this.faidx = Faidx(fai_load3( toStringz(fn) , null, null, 0)); 67 if (this.faidx is null) throw new Exception("Unable to load the FASTA index."); 68 } 69 } 70 71 /// Enable BGZF cacheing (size: bytes) 72 void setCacheSize(int size) 73 { 74 fai_set_cache_size(this.faidx, size); 75 } 76 77 /// Enable multithreaded decompression of this FA/FQ 78 /// Reading fn body of bgzf_mt, this actually ADDS threads (rather than setting) 79 /// but we'll retain name for consistency with setCacheSize 80 /// NB: IN A REAL-WORLD TEST (swiftover) CALLING setThreads(1) doubled runtime(???) 81 deprecated("disabled until faidx_t again made non-opaque") 82 void setThreads(int nthreads) 83 { 84 import htslib.bgzf : BGZF, bgzf_mt; 85 // third parameter, n_sub_blks is not used in htslib 1.9; .h suggests value 64-256 86 //bgzf_mt(this.faidx.bgzf, nthreads, 64); 87 } 88 89 private struct OffsetType 90 { 91 ptrdiff_t offset; 92 alias offset this; 93 94 // supports e.g. $ - x 95 OffsetType opBinary(string s, T)(T val) 96 { 97 mixin("return OffsetType(offset " ~ s ~ " val);"); 98 } 99 100 invariant 101 { 102 assert(this.offset <= 0, "Offset from end should be zero or negative"); 103 } 104 } 105 /** Array-end `$` indexing hack courtesy of Steve Schveighoffer 106 https://forum.dlang.org/post/rl7a56$nad$1@digitalmars.com 107 108 Requires in addition to opDollar returning a bespoke non-integral type 109 a series of overloads for opIndex and opSlice taking this type 110 */ 111 OffsetType opDollar(size_t dim)() if(dim == 1) 112 { 113 return OffsetType.init; 114 } 115 /// opSlice as Coordinate and an offset 116 /// i.e [ZB(2) .. $] 117 auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, OffsetType off) if(dim == 1) 118 { 119 return Tuple!(Coordinate!bs, OffsetType)(start, off); 120 } 121 /// opSlice as two offset 122 /// i.e [$-2 .. $] 123 auto opSlice(size_t dim)(OffsetType start, OffsetType end) if(dim == 1) 124 { 125 return Tuple!(OffsetType, OffsetType)(start, end); 126 } 127 /// opIndex coordinate and Offset 128 /// i.e fai["chrom1", ZB(1) .. $] 129 auto opIndex(Basis bs)(string ctg, Tuple!(Coordinate!bs, OffsetType) coords) 130 { 131 auto end = this.seqLen(ctg) + coords[1]; 132 auto endCoord = ZB(end); 133 auto newEndCoord = endCoord.to!bs; 134 auto c = Interval!(getCoordinateSystem!(bs,End.open))(coords[0], newEndCoord); 135 return fetchSequence(ctg, c); 136 } 137 138 /// opIndex two Offsets 139 /// i.e fai["chrom1", $-2 .. $] 140 auto opIndex(string ctg, Tuple!(OffsetType, OffsetType) coords) 141 { 142 auto start = this.seqLen(ctg) + coords[0]; 143 auto end = this.seqLen(ctg) + coords[1]; 144 auto c = ZBHO(start, end); 145 return fetchSequence(ctg, c); 146 } 147 /// opIndex one offset 148 /// i.e fai["chrom1", $-1] 149 auto opIndex(string ctg, OffsetType endoff) 150 { 151 auto end = this.seqLen(ctg) + endoff.offset; 152 auto coords = Interval!(CoordSystem.zbho)(end, end + 1); 153 return fetchSequence(ctg, coords); 154 } 155 156 /// Fetch sequence in region by assoc array-style lookup: 157 /// Uses htslib string region parsing 158 /// `string sequence = fafile["chr2:20123-30456"]` 159 auto opIndex(string region) 160 { 161 auto coords = getIntervalFromString(region); 162 return fetchSequence(coords.contig, coords.interval); 163 } 164 165 /// Fetch sequence in region by multidimensional slicing: 166 /// `string sequence = fafile["chr2", 20123 .. 30456]` 167 /// 168 /// Sadly, $ to represent max length is not supported 169 auto opIndex(CoordSystem cs)(string contig, Interval!cs coords) 170 { 171 return fetchSequence(contig, coords); 172 } 173 174 /// ditto 175 auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, Coordinate!bs end) if (dim == 1) 176 in { assert(start >= 0); assert(start <= end); } 177 do 178 { 179 auto coords = Interval!(getCoordinateSystem!(bs, End.open))(start, end); 180 return coords; 181 } 182 183 /// Fetch sequencing in a region by function call with contig, start, end 184 /// `string sequence = fafile.fetchSequence("chr2", 20123, 30456)` 185 string fetchSequence(CoordSystem cs)(string contig, Interval!cs coords) 186 { 187 char *fetchedSeq; 188 long fetchedLen; 189 190 // Convert given coordinates in system cs to zero-based, closed (faidx_fetch_seq) 191 auto newcoords = coords.to!(CoordSystem.zbc); 192 /* htslib API for my reference: 193 * 194 * char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); 195 * @param fai Pointer to the faidx_t struct 196 * @param c_name Region name 197 * @param p_beg_i Beginning position number (zero-based) 198 * @param p_end_i End position number (zero-based) 199 * @param len Length of the region; -2 if c_name not present, -1 general error 200 * @return Pointer to the sequence; null on failure 201 * The returned sequence is allocated by `malloc()` family and should be destroyed 202 * by end users by calling `free()` on it. 203 * 204 */ 205 fetchedSeq = faidx_fetch_seq64(this.faidx, toStringz(contig), newcoords.start, newcoords.end, &fetchedLen); 206 207 if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error"); 208 else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found"); 209 210 string seq = fromStringz(fetchedSeq).idup; 211 free(fetchedSeq); 212 213 assert(seq.length == fetchedLen); 214 return seq; 215 } 216 217 /// Test whether the FASTA file/index contains string seqname 218 bool hasSeq(const(char)[] seqname) 219 { 220 // int faidx_has_seq(const faidx_t *fai, const char *seq); 221 return cast(bool) faidx_has_seq(this.faidx, toStringz(seqname) ); 222 } 223 224 /// Return the number of sequences in the FASTA file/index 225 @property auto nSeq() 226 { 227 return faidx_nseq(this.faidx); 228 } 229 230 /// Return the name of the i'th sequence 231 string seqName(int i) 232 { 233 // const(char) *faidx_iseq(const faidx_t *fai, int i); 234 // TODO determine if this property is zero indexed or one indexed 235 if (i > this.nSeq) throw new Exception("seqName: sequece number not present"); 236 237 return fromStringz( faidx_iseq(this.faidx, i) ).idup; 238 } 239 240 /// Return sequence length, -1 if not present 241 /// NOTE: there is no 64 bit equivalent of this function (yet) in htslib-1.10 242 int seqLen(const(char)[] seqname) 243 { 244 // TODO should I check for -1 and throw exception or pass to caller? 245 // int faidx_seq_len(const faidx_t *fai, const char *seq); 246 const int l = faidx_seq_len(this.faidx, toStringz(seqname) ); 247 if ( l == -1 ) throw new Exception("seqLen: sequence name not found"); 248 return l; 249 } 250 } 251 252 debug(dhtslib_unittest) unittest 253 { 254 import dhtslib.sam; 255 import htslib.hts_log; 256 import std.path : buildPath, dirName; 257 258 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 259 hts_log_info(__FUNCTION__, "Testing IndexedFastaFile"); 260 hts_log_info(__FUNCTION__, "Loading test file"); 261 auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa")); 262 263 fai.setCacheSize(4000000); 264 265 assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(0, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 266 assert(fai.fetchSequence("CHROMOSOME_I",OBHO(1, 30)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 267 assert(fai.fetchSequence("CHROMOSOME_I",ZBC(0, 28)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 268 assert(fai.fetchSequence("CHROMOSOME_I",OBC(1, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 269 270 assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 271 assert(fai.fetchSequence("CHROMOSOME_I",OBHO(2, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 272 assert(fai.fetchSequence("CHROMOSOME_I",ZBC(1, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 273 assert(fai.fetchSequence("CHROMOSOME_I",OBC(2, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 274 275 assert(fai["CHROMOSOME_I",ZB(0) .. ZB(29)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 276 import std.stdio; 277 writeln(fai["CHROMOSOME_I",OB(1) .. OB(30)]); 278 assert(fai["CHROMOSOME_I",OB(1) .. OB(30)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 279 280 assert(fai.seqLen("CHROMOSOME_I") == 1009800); 281 assert(fai.nSeq == 7); 282 283 assert(fai.hasSeq("CHROMOSOME_I")); 284 assert(!fai.hasSeq("CHROMOSOME_")); 285 286 assert(fai.seqName(0) == "CHROMOSOME_I"); 287 } 288 289 debug(dhtslib_unittest) unittest 290 { 291 import dhtslib.sam; 292 import htslib.hts_log; 293 import std.path : buildPath, dirName; 294 295 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 296 hts_log_info(__FUNCTION__, "Testing IndexedFastaFile"); 297 hts_log_info(__FUNCTION__, "Loading test file"); 298 auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa")); 299 300 fai.setCacheSize(4000000); 301 302 assert(fai.fetchSequence("CHROMOSOME_II",ZBHO(0, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 303 assert(fai.fetchSequence("CHROMOSOME_II",OBHO(1, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 304 assert(fai.fetchSequence("CHROMOSOME_II",ZBC(0, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 305 assert(fai.fetchSequence("CHROMOSOME_II",OBC(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 306 assert(fai["CHROMOSOME_II:1-29"] == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 307 // "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG" 308 import std.stdio; 309 assert(fai["CHROMOSOME_II",$-50 .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"); 310 311 assert(fai["CHROMOSOME_II",OB(4951) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"); 312 assert(fai["CHROMOSOME_II",ZB(4950) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"); 313 assert(fai["CHROMOSOME_II",$-1] == "G"); 314 }