1 /** 2 This module provides a wrapper, `IndexedFastaFile` over a FASTA. 3 If an index exists, it will be used for rapid random access. 4 If an index does not exist, one will be built. 5 6 The wrapper provides the ability to list sequence names (i.e., chromosomes/contigs) 7 in the FASTA, efficiently retrieve sequences (by contig, start, end) 8 9 Sequence caching and multithreaded BGZF decompression are supported. 10 */ 11 12 module dhtslib.faidx; 13 14 import std.string; 15 import std.typecons : Tuple; 16 import core.stdc.stdlib : malloc, free; 17 18 import dhtslib.coordinates; 19 import dhtslib.memory; 20 import dhtslib.util; 21 import htslib.faidx; 22 23 /** Build index for a FASTA or bgzip-compressed FASTA file. 24 25 @param fn FASTA file name 26 @param fnfai Name of .fai file to build. 27 @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). 28 @return 0 on success; or -1 on failure 29 30 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 31 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI 32 file will only be built if fn is bgzip-compressed. 33 */ 34 bool buildFastaIndex(string fn, string fnfai = "", string fngzi = "") 35 { 36 // int fai_build3(const char *fn, const char *fnfai, const char *fngzi); 37 const int ret = fai_build3( toStringz(fn), 38 fnfai ? toStringz(fnfai) : null, 39 fngzi ? toStringz(fngzi) : null); 40 41 if (ret == 0) return true; 42 if (ret == -1) return false; 43 assert(0); 44 } 45 46 /** FASTA file with .fai or .gzi index 47 48 Reads existing FASTA file, optionally creating FASTA index if one does not exist. 49 50 Convenient member fns to get no. of sequences, get sequence names and lengths, 51 test for membership, and rapidly fetch sequence at offset. 52 */ 53 struct IndexedFastaFile { 54 55 private Faidx faidx; 56 57 /// construct from filename, optionally creating index if it does not exist 58 /// throws Exception (TODO: remove) if file DNE, or if index DNE unless create->true 59 this(string fn, bool create=false) 60 { 61 if (create) { 62 this.faidx = Faidx(fai_load3( toStringz(fn), null, null, fai_load_options.FAI_CREATE)); 63 if (this.faidx is null) throw new Exception("Unable to load or create the FASTA index."); 64 } 65 else { 66 this.faidx = Faidx(fai_load3( toStringz(fn) , null, null, 0)); 67 if (this.faidx is null) throw new Exception("Unable to load the FASTA index."); 68 } 69 } 70 71 /// Explicit postblit to avoid 72 /// https://github.com/blachlylab/dhtslib/issues/122 73 this(this) 74 { 75 this.faidx = faidx; 76 } 77 78 /// Enable BGZF cacheing (size: bytes) 79 void setCacheSize(int size) 80 { 81 fai_set_cache_size(this.faidx, size); 82 } 83 84 /// Enable multithreaded decompression of this FA/FQ 85 /// Reading fn body of bgzf_mt, this actually ADDS threads (rather than setting) 86 /// but we'll retain name for consistency with setCacheSize 87 /// NB: IN A REAL-WORLD TEST (swiftover) CALLING setThreads(1) doubled runtime(???) 88 deprecated("disabled until faidx_t again made non-opaque") 89 void setThreads(int nthreads) 90 { 91 import htslib.bgzf : BGZF, bgzf_mt; 92 // third parameter, n_sub_blks is not used in htslib 1.9; .h suggests value 64-256 93 //bgzf_mt(this.faidx.bgzf, nthreads, 64); 94 } 95 96 private struct OffsetType 97 { 98 ptrdiff_t offset; 99 alias offset this; 100 101 // supports e.g. $ - x 102 OffsetType opBinary(string s, T)(T val) 103 { 104 mixin("return OffsetType(offset " ~ s ~ " val);"); 105 } 106 107 invariant 108 { 109 assert(this.offset <= 0, "Offset from end should be zero or negative"); 110 } 111 } 112 /** Array-end `$` indexing hack courtesy of Steve Schveighoffer 113 https://forum.dlang.org/post/rl7a56$nad$1@digitalmars.com 114 115 Requires in addition to opDollar returning a bespoke non-integral type 116 a series of overloads for opIndex and opSlice taking this type 117 */ 118 OffsetType opDollar(size_t dim)() if(dim == 1) 119 { 120 return OffsetType.init; 121 } 122 /// opSlice as Coordinate and an offset 123 /// i.e [ZB(2) .. $] 124 auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, OffsetType off) if(dim == 1) 125 { 126 return Tuple!(Coordinate!bs, OffsetType)(start, off); 127 } 128 /// opSlice as two offset 129 /// i.e [$-2 .. $] 130 auto opSlice(size_t dim)(OffsetType start, OffsetType end) if(dim == 1) 131 { 132 return Tuple!(OffsetType, OffsetType)(start, end); 133 } 134 /// opIndex coordinate and Offset 135 /// i.e fai["chrom1", ZB(1) .. $] 136 auto opIndex(Basis bs)(string ctg, Tuple!(Coordinate!bs, OffsetType) coords) 137 { 138 auto end = this.seqLen(ctg) + coords[1]; 139 auto endCoord = ZB(end); 140 auto newEndCoord = endCoord.to!bs; 141 auto c = Interval!(getCoordinateSystem!(bs,End.open))(coords[0], newEndCoord); 142 return fetchSequence(ctg, c); 143 } 144 145 /// opIndex two Offsets 146 /// i.e fai["chrom1", $-2 .. $] 147 auto opIndex(string ctg, Tuple!(OffsetType, OffsetType) coords) 148 { 149 auto start = this.seqLen(ctg) + coords[0]; 150 auto end = this.seqLen(ctg) + coords[1]; 151 auto c = ZBHO(start, end); 152 return fetchSequence(ctg, c); 153 } 154 /// opIndex one offset 155 /// i.e fai["chrom1", $-1] 156 auto opIndex(string ctg, OffsetType endoff) 157 { 158 auto end = this.seqLen(ctg) + endoff.offset; 159 auto coords = Interval!(CoordSystem.zbho)(end, end + 1); 160 return fetchSequence(ctg, coords); 161 } 162 163 /// Fetch sequence in region by assoc array-style lookup: 164 /// Uses htslib string region parsing 165 /// `string sequence = fafile["chr2:20123-30456"]` 166 auto opIndex(string region) 167 { 168 auto coords = getIntervalFromString(region); 169 return fetchSequence(coords.contig, coords.interval); 170 } 171 172 /// Fetch sequence in region by multidimensional slicing: 173 /// `string sequence = fafile["chr2", 20123 .. 30456]` 174 /// 175 /// Sadly, $ to represent max length is not supported 176 auto opIndex(CoordSystem cs)(string contig, Interval!cs coords) 177 { 178 return fetchSequence(contig, coords); 179 } 180 181 /// ditto 182 auto opSlice(size_t dim, Basis bs)(Coordinate!bs start, Coordinate!bs end) if (dim == 1) 183 in { assert(start >= 0); assert(start <= end); } 184 do 185 { 186 auto coords = Interval!(getCoordinateSystem!(bs, End.open))(start, end); 187 return coords; 188 } 189 190 /// Fetch sequencing in a region by function call with contig, start, end 191 /// `string sequence = fafile.fetchSequence("chr2", 20123, 30456)` 192 string fetchSequence(CoordSystem cs)(string contig, Interval!cs coords) 193 { 194 char *fetchedSeq; 195 long fetchedLen; 196 197 // Convert given coordinates in system cs to zero-based, closed (faidx_fetch_seq) 198 auto newcoords = coords.to!(CoordSystem.zbc); 199 /* htslib API for my reference: 200 * 201 * char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); 202 * @param fai Pointer to the faidx_t struct 203 * @param c_name Region name 204 * @param p_beg_i Beginning position number (zero-based) 205 * @param p_end_i End position number (zero-based) 206 * @param len Length of the region; -2 if c_name not present, -1 general error 207 * @return Pointer to the sequence; null on failure 208 * The returned sequence is allocated by `malloc()` family and should be destroyed 209 * by end users by calling `free()` on it. 210 * 211 */ 212 fetchedSeq = faidx_fetch_seq64(this.faidx, toStringz(contig), newcoords.start, newcoords.end, &fetchedLen); 213 214 if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error"); 215 else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found"); 216 217 string seq = fromStringz(fetchedSeq).idup; 218 free(fetchedSeq); 219 220 assert(seq.length == fetchedLen); 221 return seq; 222 } 223 224 /// Test whether the FASTA file/index contains string seqname 225 bool hasSeq(const(char)[] seqname) 226 { 227 // int faidx_has_seq(const faidx_t *fai, const char *seq); 228 return cast(bool) faidx_has_seq(this.faidx, toStringz(seqname) ); 229 } 230 231 /// Return the number of sequences in the FASTA file/index 232 @property auto nSeq() 233 { 234 return faidx_nseq(this.faidx); 235 } 236 237 /// Return the name of the i'th sequence 238 string seqName(int i) 239 { 240 // const(char) *faidx_iseq(const faidx_t *fai, int i); 241 // TODO determine if this property is zero indexed or one indexed 242 if (i > this.nSeq) throw new Exception("seqName: sequece number not present"); 243 244 return fromStringz( faidx_iseq(this.faidx, i) ).idup; 245 } 246 247 /// Return sequence length, -1 if not present 248 /// NOTE: there is no 64 bit equivalent of this function (yet) in htslib-1.10 249 int seqLen(const(char)[] seqname) 250 { 251 // TODO should I check for -1 and throw exception or pass to caller? 252 // int faidx_seq_len(const faidx_t *fai, const char *seq); 253 const int l = faidx_seq_len(this.faidx, toStringz(seqname) ); 254 if ( l == -1 ) throw new Exception("seqLen: sequence name not found"); 255 return l; 256 } 257 } 258 259 debug(dhtslib_unittest) unittest 260 { 261 import dhtslib.sam; 262 import htslib.hts_log; 263 import std.path : buildPath, dirName; 264 265 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 266 hts_log_info(__FUNCTION__, "Testing IndexedFastaFile"); 267 hts_log_info(__FUNCTION__, "Loading test file"); 268 auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa")); 269 270 fai.setCacheSize(4000000); 271 272 assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(0, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 273 assert(fai.fetchSequence("CHROMOSOME_I",OBHO(1, 30)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 274 assert(fai.fetchSequence("CHROMOSOME_I",ZBC(0, 28)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 275 assert(fai.fetchSequence("CHROMOSOME_I",OBC(1, 29)) == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 276 277 assert(fai.fetchSequence("CHROMOSOME_I",ZBHO(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 278 assert(fai.fetchSequence("CHROMOSOME_I",OBHO(2, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 279 assert(fai.fetchSequence("CHROMOSOME_I",ZBC(1, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 280 assert(fai.fetchSequence("CHROMOSOME_I",OBC(2, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 281 282 assert(fai["CHROMOSOME_I",ZB(0) .. ZB(29)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 283 import std.stdio; 284 writeln(fai["CHROMOSOME_I",OB(1) .. OB(30)]); 285 assert(fai["CHROMOSOME_I",OB(1) .. OB(30)] == "GCCTAAGCCTAAGCCTAAGCCTAAGCCTA"); 286 287 assert(fai.seqLen("CHROMOSOME_I") == 1009800); 288 assert(fai.nSeq == 7); 289 290 assert(fai.hasSeq("CHROMOSOME_I")); 291 assert(!fai.hasSeq("CHROMOSOME_")); 292 293 assert(fai.seqName(0) == "CHROMOSOME_I"); 294 } 295 296 debug(dhtslib_unittest) unittest 297 { 298 import dhtslib.sam; 299 import htslib.hts_log; 300 import std.path : buildPath, dirName; 301 302 hts_set_log_level(htsLogLevel.HTS_LOG_INFO); 303 hts_log_info(__FUNCTION__, "Testing IndexedFastaFile"); 304 hts_log_info(__FUNCTION__, "Loading test file"); 305 auto fai = IndexedFastaFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","ce.fa")); 306 307 fai.setCacheSize(4000000); 308 309 assert(fai.fetchSequence("CHROMOSOME_II",ZBHO(0, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 310 assert(fai.fetchSequence("CHROMOSOME_II",OBHO(1, 30)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 311 assert(fai.fetchSequence("CHROMOSOME_II",ZBC(0, 28)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 312 assert(fai.fetchSequence("CHROMOSOME_II",OBC(1, 29)) == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 313 assert(fai["CHROMOSOME_II:1-29"] == "CCTAAGCCTAAGCCTAAGCCTAAGCCTAA"); 314 // "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG" 315 import std.stdio; 316 assert(fai["CHROMOSOME_II",$-50 .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"); 317 318 assert(fai["CHROMOSOME_II",OB(4951) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"); 319 assert(fai["CHROMOSOME_II",ZB(4950) .. $] == "GTCAACACAGACCGTTAATTTTGGGAAGTTGAGAAATTCGCTAGTTTCTG"); 320 assert(fai["CHROMOSOME_II",$-1] == "G"); 321 }