1 module dhtslib.faidx; 2 3 import std.string; 4 import core.stdc.stdlib : malloc, free; 5 6 import dhtslib.htslib.faidx; 7 8 /** Coodinate Systems, since htslib sequence fetching methods surprisingly use zero-based, closed */ 9 enum CoordSystem 10 { 11 zbho = 0, /// zero-based, half-open 12 zbc, /// zero-based, closed (behavior of faidx_fetch_seq) 13 obho, /// one-based, half-open 14 obc /// one-based, closed 15 } 16 17 /** Build index for a FASTA or bgzip-compressed FASTA file. 18 19 @param fn FASTA file name 20 @param fnfai Name of .fai file to build. 21 @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). 22 @return 0 on success; or -1 on failure 23 24 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 25 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI 26 file will only be built if fn is bgzip-compressed. 27 */ 28 bool buildFastaIndex(string fn, string fnfai = "", string fngzi = "") 29 { 30 // int fai_build3(const char *fn, const char *fnfai, const char *fngzi); 31 const int ret = fai_build3( toStringz(fn), 32 fnfai ? toStringz(fnfai) : null, 33 fngzi ? toStringz(fngzi) : null); 34 35 if (ret == 0) return true; 36 if (ret == -1) return false; 37 assert(0); 38 } 39 40 /** FASTA file with .fai or .gzi index 41 42 Reads existing FASTA file, optionally creating FASTA index if one does not exist. 43 44 Convenient member fns to get no. of sequences, get sequence names and lengths, 45 test for membership, and rapidly fetch sequence at offset. 46 */ 47 struct IndexedFastaFile { 48 49 private faidx_t *faidx; 50 private int refct; // Postblit refcounting in case the object is passed around 51 52 this(this) 53 { 54 refct++; 55 } 56 /// construct from filename, optionally creating index if it does not exist 57 /// throws Exception (TODO: remove) if file DNE, or if index DNE unless create->true 58 this(string fn, bool create=false) 59 { 60 if (create) { 61 this.faidx = fai_load3( toStringz(fn), null, null, fai_load_options.FAI_CREATE); 62 if (this.faidx is null) throw new Exception("Unable to load or create the FASTA index."); 63 } 64 else { 65 this.faidx = fai_load3( toStringz(fn) , null, null, 0); 66 if (this.faidx is null) throw new Exception("Unable to load the FASTA index."); 67 } 68 refct = 1; 69 } 70 ~this() 71 { 72 if (--refct == 0) 73 fai_destroy(this.faidx); 74 } 75 76 /// Enable BGZF cacheing (size: bytes) 77 void setCacheSize(int size) 78 { 79 import dhtslib.htslib.bgzf : BGZF, bgzf_set_cache_size; 80 bgzf_set_cache_size(this.faidx.bgzf, size); 81 } 82 83 /// Enable multithreaded decompression of this FA/FQ 84 /// Reading fn body of bgzf_mt, this actually ADDS threads (rather than setting) 85 /// but we'll retain name for consistency with setCacheSize 86 /// NB: IN A REAL-WORLD TEST (swiftover) CALLING setThreads(1) doubled runtime(???) 87 void setThreads(int nthreads) 88 { 89 import dhtslib.htslib.bgzf : BGZF, bgzf_mt; 90 // third parameter, n_sub_blks is not used in htslib 1.9; .h suggests value 64-256 91 bgzf_mt(this.faidx.bgzf, nthreads, 64); 92 } 93 94 /// Fetch sequence in region by assoc array-style lookup: 95 /// `string sequence = fafile["chr2:20123-30456"]` 96 auto opIndex(string region) 97 { 98 return fetchSequence(region); 99 } 100 101 /// Fetch sequence in region by assoc array-style lookup: 102 /// `string sequence = fafile.fetchSequence("chr2:20123-30456")` 103 string fetchSequence(string region) 104 { 105 char *fetchedSeq; 106 int fetchedLen; 107 // char *fai_fetch(const faidx_t *fai, const char *reg, int *len); 108 // @param len Length of the region; -2 if seq not present, -1 general error 109 fetchedSeq = fai_fetch(this.faidx, toStringz(region), &fetchedLen); 110 111 string seq = fromStringz(fetchedSeq).idup; 112 free(fetchedSeq); 113 114 if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error"); 115 else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found"); 116 117 return seq; 118 } 119 120 /// Fetch sequence in region by multidimensional slicing: 121 /// `string sequence = fafile["chr2", 20123 .. 30456]` 122 /// 123 /// Sadly, $ to represent max length is not supported 124 auto opIndex(string contig, int[2] pos) 125 { 126 return fetchSequence(contig, pos[0], pos[1]); 127 } 128 /// ditto 129 int[2] opSlice(size_t dim)(int start, int end) if (dim == 1) 130 in { assert(start >= 0); } 131 do 132 { 133 return [start, end]; 134 } 135 136 /// Fetch sequence in region by multidimensional slicing: 137 /// `string sequence = fafile.fetchSequence("chr2", 20123, 30456)` 138 string fetchSequence(CoordSystem cs = CoordSystem.zbho)(string contig, int start, int end) 139 { 140 char *fetchedSeq; 141 int fetchedLen; 142 143 static if (cs == CoordSystem.zbho) { 144 end--; 145 } 146 else static if (cs == CoordSystem.zbc) { 147 // ok 148 } 149 else static if (cs == CoordSystem.obho) { 150 start--; 151 end--; 152 } 153 else static if (cs == CoordSystem.obc) { 154 start--; 155 } 156 /* htslib API for my reference: 157 * 158 * char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); 159 * @param fai Pointer to the faidx_t struct 160 * @param c_name Region name 161 * @param p_beg_i Beginning position number (zero-based) 162 * @param p_end_i End position number (zero-based) 163 * @param len Length of the region; -2 if c_name not present, -1 general error 164 * @return Pointer to the sequence; null on failure 165 * The returned sequence is allocated by `malloc()` family and should be destroyed 166 * by end users by calling `free()` on it. 167 * 168 */ 169 fetchedSeq = faidx_fetch_seq(this.faidx, toStringz(contig), start, end, &fetchedLen); 170 171 string seq = fromStringz(fetchedSeq).idup; 172 free(fetchedSeq); 173 174 if (fetchedLen == -1) throw new Exception("fai_fetch: unknown error"); 175 else if (fetchedLen == -2) throw new Exception("fai_fetch: sequence not found"); 176 177 assert(seq.length == fetchedLen); 178 return seq; 179 } 180 181 /// Test whether the FASTA file/index contains string seqname 182 bool hasSeq(const(char)[] seqname) 183 { 184 // int faidx_has_seq(const faidx_t *fai, const char *seq); 185 return cast(bool) faidx_has_seq(this.faidx, toStringz(seqname) ); 186 } 187 188 /// Return the number of sequences in the FASTA file/index 189 @property auto nSeq() 190 { 191 return faidx_nseq(this.faidx); 192 } 193 194 /// Return the name of the i'th sequence 195 string seqName(int i) 196 { 197 // const(char) *faidx_iseq(const faidx_t *fai, int i); 198 // TODO determine if this property is zero indexed or one indexed 199 if (i > this.nSeq) throw new Exception("seqName: sequece number not present"); 200 201 return fromStringz( faidx_iseq(this.faidx, i) ).idup; 202 } 203 204 /// Return sequence length, -1 if not present 205 int seqLen(const(char)[] seqname) 206 { 207 // TODO should I check for -1 and throw exception or pass to caller? 208 // int faidx_seq_len(const faidx_t *fai, const char *seq); 209 const int l = faidx_seq_len(this.faidx, toStringz(seqname) ); 210 if ( l == -1 ) throw new Exception("seqLen: sequence name not found"); 211 return l; 212 } 213 }