1 /// @file htslib/faidx.h 2 /// FASTA random access. 3 /* 4 Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020 Genome Research Ltd. 5 6 Author: Heng Li <lh3@sanger.ac.uk> 7 8 Permission is hereby granted, free of charge, to any person obtaining 9 a copy of this software and associated documentation files (the 10 "Software"), to deal in the Software without restriction, including 11 without limitation the rights to use, copy, modify, merge, publish, 12 distribute, sublicense, and/or sell copies of the Software, and to 13 permit persons to whom the Software is furnished to do so, subject to 14 the following conditions: 15 16 The above copyright notice and this permission notice shall be 17 included in all copies or substantial portions of the Software. 18 19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 20 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 21 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 22 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 23 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 24 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 25 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 SOFTWARE. 27 */ 28 29 module htslib.faidx; 30 31 import htslib.hts : hts_pos_t; 32 33 @system: 34 nothrow: 35 @nogc: 36 37 extern (C): 38 39 /** @file 40 41 Index FASTA or FASTQ files and extract subsequence. 42 43 The fai file index columns for FASTA are: 44 - chromosome name 45 - chromosome length: number of bases 46 - offset: number of bytes to skip to get to the first base 47 from the beginning of the file, including the length 48 of the sequence description string (`>chr ..\n`) 49 - line length: number of bases per line (excluding `\n`) 50 - binary line length: number of bytes, including `\n` 51 52 The index for FASTQ is similar to above: 53 - chromosome name 54 - chromosome length: number of bases 55 - sequence offset: number of bytes to skip to get to the first base 56 from the beginning of the file, including the length 57 of the sequence description string (`@chr ..\n`) 58 - line length: number of bases per line (excluding `\n`) 59 - binary line length: number of bytes, including `\n` 60 - quality offset: number of bytes to skip from the beginning of the file 61 to get to the first quality value in the indexed entry. 62 63 The FASTQ version of the index uses line length and binary line length 64 for both the sequence and the quality values, so they must be line 65 wrapped in the same way. 66 */ 67 68 struct __faidx_t; 69 /// Opaque structure representing FASTA index 70 alias faidx_t = __faidx_t; 71 72 /// File format to be dealing with. 73 enum fai_format_options 74 { 75 FAI_NONE = 0, 76 FAI_FASTA = 1, 77 FAI_FASTQ = 2 78 } 79 80 /// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file. 81 /** @param fn FASTA/FASTQ file name 82 @param fnfai Name of .fai file to build. 83 @param fngzi Name of .gzi file to build (if fn is bgzip-compressed). 84 @return 0 on success; or -1 on failure 85 86 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 87 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file. The GZI 88 file will only be built if fn is bgzip-compressed. 89 */ 90 int fai_build3(const(char)* fn, const(char)* fnfai, const(char)* fngzi); 91 92 /// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file. 93 /** @param fn FASTA/FASTQ file name 94 @return 0 on success; or -1 on failure 95 96 File "fn.fai" will be generated. This function is equivalent to 97 fai_build3(fn, NULL, NULL); 98 */ 99 int fai_build(const(char)* fn); 100 101 /// Destroy a faidx_t struct 102 void fai_destroy(faidx_t* fai); 103 104 enum fai_load_options 105 { 106 FAI_CREATE = 0x01 107 } 108 109 /// Load FASTA indexes. 110 /** @param fn File name of the FASTA file (can be compressed with bgzip). 111 @param fnfai File name of the FASTA index. 112 @param fngzi File name of the bgzip index. 113 @param flags Option flags to control index file caching and creation. 114 @return Pointer to a faidx_t struct on success, NULL on failure. 115 116 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 117 If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name. 118 The bgzip index is only needed if fn is compressed. 119 120 If (flags & FAI_CREATE) is true, the index files will be built using 121 fai_build3() if they are not already present. 122 123 The struct returned by a successful call should be freed via fai_destroy() 124 when it is no longer needed. 125 */ 126 faidx_t* fai_load3( 127 const(char)* fn, 128 const(char)* fnfai, 129 const(char)* fngzi, 130 int flags); 131 132 /// Load index from "fn.fai". 133 /** @param fn File name of the FASTA file 134 @return Pointer to a faidx_t struct on success, NULL on failure. 135 136 This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE); 137 */ 138 faidx_t* fai_load(const(char)* fn); 139 140 /// Load FASTA or FASTQ indexes. 141 /** @param fn File name of the FASTA/FASTQ file (can be compressed with bgzip). 142 @param fnfai File name of the FASTA/FASTQ index. 143 @param fngzi File name of the bgzip index. 144 @param flags Option flags to control index file caching and creation. 145 @param format FASTA or FASTQ file format 146 @return Pointer to a faidx_t struct on success, NULL on failure. 147 148 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name. 149 If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name. 150 The bgzip index is only needed if fn is compressed. 151 152 If (flags & FAI_CREATE) is true, the index files will be built using 153 fai_build3() if they are not already present. 154 155 The struct returned by a successful call should be freed via fai_destroy() 156 when it is no longer needed. 157 */ 158 faidx_t* fai_load3_format( 159 const(char)* fn, 160 const(char)* fnfai, 161 const(char)* fngzi, 162 int flags, 163 fai_format_options format); 164 165 /// Load index from "fn.fai". 166 /** @param fn File name of the FASTA/FASTQ file 167 @param format FASTA or FASTQ file format 168 @return Pointer to a faidx_t struct on success, NULL on failure. 169 170 This function is equivalent to fai_load3_format(fn, NULL, NULL, FAI_CREATE|FAI_CACHE, format); 171 */ 172 faidx_t* fai_load_format(const(char)* fn, fai_format_options format); 173 174 /// Fetch the sequence in a region 175 /** @param fai Pointer to the faidx_t struct 176 @param reg Region in the format "chr2:20,000-30,000" 177 @param len Length of the region; -2 if seq not present, -1 general error 178 @return Pointer to the sequence; `NULL` on failure 179 180 The returned sequence is allocated by `malloc()` family and should be destroyed 181 by end users by calling `free()` on it. 182 183 To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" 184 are reference names, quote using curly braces. 185 Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. 186 */ 187 char* fai_fetch(const(faidx_t)* fai, const(char)* reg, int* len); 188 char* fai_fetch64(const(faidx_t)* fai, const(char)* reg, hts_pos_t* len); 189 190 /// Fetch the quality string for a region for FASTQ files 191 /** @param fai Pointer to the faidx_t struct 192 @param reg Region in the format "chr2:20,000-30,000" 193 @param len Length of the region; -2 if seq not present, -1 general error 194 @return Pointer to the quality string; null on failure 195 196 The returned quality string is allocated by `malloc()` family and should be 197 destroyed by end users by calling `free()` on it. 198 199 Region names can be quoted with curly braces, as for fai_fetch(). 200 */ 201 char* fai_fetchqual(const(faidx_t)* fai, const(char)* reg, int* len); 202 char* fai_fetchqual64(const(faidx_t)* fai, const(char)* reg, hts_pos_t* len); 203 204 /// Fetch the number of sequences 205 /** @param fai Pointer to the faidx_t struct 206 @return The number of sequences 207 */ 208 int faidx_fetch_nseq(const(faidx_t)* fai); 209 210 /// Fetch the sequence in a region 211 /** @param fai Pointer to the faidx_t struct 212 @param c_name Region name 213 @param p_beg_i Beginning position number (zero-based) 214 @param p_end_i End position number (zero-based) 215 @param len Length of the region; -2 if c_name not present, -1 general error 216 @return Pointer to the sequence; null on failure 217 218 The returned sequence is allocated by `malloc()` family and should be destroyed 219 by end users by calling `free()` on it. 220 */ 221 char* faidx_fetch_seq( 222 const(faidx_t)* fai, 223 const(char)* c_name, 224 int p_beg_i, 225 int p_end_i, 226 int* len); 227 228 /// Fetch the sequence in a region 229 /** @param fai Pointer to the faidx_t struct 230 @param c_name Region name 231 @param p_beg_i Beginning position number (zero-based) 232 @param p_end_i End position number (zero-based) 233 @param len Length of the region; -2 if c_name not present, -1 general error 234 @return Pointer to the sequence; null on failure 235 236 The returned sequence is allocated by `malloc()` family and should be destroyed 237 by end users by calling `free()` on it. 238 */ 239 char* faidx_fetch_seq64( 240 const(faidx_t)* fai, 241 const(char)* c_name, 242 hts_pos_t p_beg_i, 243 hts_pos_t p_end_i, 244 hts_pos_t* len); 245 246 /// Fetch the quality string in a region for FASTQ files 247 /** @param fai Pointer to the faidx_t struct 248 @param c_name Region name 249 @param p_beg_i Beginning position number (zero-based) 250 @param p_end_i End position number (zero-based) 251 @param len Length of the region; -2 if c_name not present, -1 general error 252 @return Pointer to the sequence; null on failure 253 254 The returned sequence is allocated by `malloc()` family and should be destroyed 255 by end users by calling `free()` on it. 256 */ 257 char* faidx_fetch_qual( 258 const(faidx_t)* fai, 259 const(char)* c_name, 260 int p_beg_i, 261 int p_end_i, 262 int* len); 263 264 /// Fetch the quality string in a region for FASTQ files 265 /** @param fai Pointer to the faidx_t struct 266 @param c_name Region name 267 @param p_beg_i Beginning position number (zero-based) 268 @param p_end_i End position number (zero-based) 269 @param len Length of the region; -2 if c_name not present, -1 general error 270 @return Pointer to the sequence; null on failure 271 272 The returned sequence is allocated by `malloc()` family and should be destroyed 273 by end users by calling `free()` on it. 274 */ 275 char* faidx_fetch_qual64( 276 const(faidx_t)* fai, 277 const(char)* c_name, 278 hts_pos_t p_beg_i, 279 hts_pos_t p_end_i, 280 hts_pos_t* len); 281 282 /// Query if sequence is present 283 /** @param fai Pointer to the faidx_t struct 284 @param seq Sequence name 285 @return 1 if present or 0 if absent 286 */ 287 int faidx_has_seq(const(faidx_t)* fai, const(char)* seq); 288 289 /// Return number of sequences in fai index 290 int faidx_nseq(const(faidx_t)* fai); 291 292 /// Return name of i-th sequence 293 const(char)* faidx_iseq(const(faidx_t)* fai, int i); 294 295 /// Return sequence length, -1 if not present 296 int faidx_seq_len(const(faidx_t)* fai, const(char)* seq); 297 298 /// Parses a region string. 299 /** @param fai Pointer to the faidx_t struct 300 @param s Region string 301 @param tid Returns which i-th sequence is described in the region. 302 @param beg Returns the start of the region (0 based) 303 @param end Returns the one past last of the region (0 based) 304 @param flags Parsing method, see HTS_PARSE_* in hts.h. 305 @return Pointer to end of parsed s if successful, NULL if not. 306 307 To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" 308 are reference names, quote using curly braces. 309 Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. 310 */ 311 const(char)* fai_parse_region( 312 const(faidx_t)* fai, 313 const(char)* s, 314 int* tid, 315 hts_pos_t* beg, 316 hts_pos_t* end, 317 int flags); 318 319 /// Sets the cache size of the underlying BGZF compressed file 320 /** @param fai Pointer to the faidx_t struct 321 * @param cache_size Selected cache size in bytes 322 */ 323 void fai_set_cache_size(faidx_t* fai, int cache_size); 324 325 /// Determines the path to the reference index file 326 /** @param fa String with the path to the reference file 327 * @return String with the path to the reference index file, or NULL on failure 328 329 If the reference path has the format reference.fa##idx##index.fa.fai, 330 the index path is taken directly from it as index.fa.fai. 331 If the reference file is local and the index file cannot be found, it 332 will be created alongside the reference file. 333 If the reference file is remote and the index file cannot be found, 334 the method returns NULL. 335 336 The returned string has to be freed by the user at the end of its scope. 337 */ 338 char* fai_path(const(char)* fa); 339