// htslib-1.9 faidx.h as D module
// NOTE(review): the copyright range below runs to 2019 and several prototypes
// use hts_pos_t; confirm which htslib release this binding actually tracks.
/**
Index FASTA or FASTQ files and extract subsequence.

The fai file index columns for FASTA are:
  - chromosome name
  - chromosome length: number of bases
  - offset: number of bytes to skip to get to the first base
      from the beginning of the file, including the length
      of the sequence description string (`>chr ..\n`)
  - line length: number of bases per line (excluding `\n`)
  - binary line length: number of bytes, including `\n`

The index for FASTQ is similar to above:
  - chromosome name
  - chromosome length: number of bases
  - sequence offset: number of bytes to skip to get to the first base
      from the beginning of the file, including the length
      of the sequence description string (`@chr ..\n`)
  - line length: number of bases per line (excluding `\n`)
  - binary line length: number of bytes, including `\n`
  - quality offset: number of bytes to skip from the beginning of the file
      to get to the first quality value in the indexed entry.

The FASTQ version of the index uses line length and binary line length
for both the sequence and the quality values, so they must be line
wrapped in the same way.
 */
module htslib.faidx;
import htslib.bgzf : BGZF;
extern (C):

// @file htslib/faidx.h
// FASTA random access.
/*
   Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2019 Genome Research Ltd.

   Author: Heng Li <lh3@sanger.ac.uk>

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   "Software"), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/

import core.stdc.stdint;
import htslib.hts;

/// Opaque structure representing FASTA index
///
/// Every function in this module takes or returns it by pointer.
/// NOTE(review): only the `bgzf` member is declared here; presumably the C
/// definition has further private members, so do not create or copy this
/// struct by value from D -- confirm against the C sources.
struct __faidx_t
{
    BGZF *bgzf;
} // @suppress(dscanner.style.phobos_naming_convention)
/// ditto
alias faidx_t = __faidx_t;

/// File format to be dealing with.
enum fai_format_options { // @suppress(dscanner.style.phobos_naming_convention)
    FAI_NONE,
    FAI_FASTA,
    FAI_FASTQ
}

/// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file.
/** @param  fn     FASTA/FASTQ file name
    @param  fnfai  Name of .fai file to build.
    @param  fngzi  Name of .gzi file to build (if fn is bgzip-compressed).
    @return 0 on success; or -1 on failure

If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
If fngzi is NULL, ".gzi" will be appended to fn for the GZI file.  The GZI
file will only be built if fn is bgzip-compressed.
*/
int fai_build3(const(char) *fn, const(char) *fnfai, const(char) *fngzi);

/// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file.
/** @param  fn  FASTA/FASTQ file name
    @return 0 on success; or -1 on failure

File "fn.fai" will be generated.  This function is equivalent to
fai_build3(fn, NULL, NULL);
*/
int fai_build(const(char) *fn);

/// Destroy a faidx_t struct
void fai_destroy(faidx_t *fai);

/// Options for fai_load functions
enum fai_load_options { // @suppress(dscanner.style.phobos_naming_convention)
    FAI_CREATE = 0x01,
}

/// Load FASTA indexes.
/** @param  fn     File name of the FASTA file (can be compressed with bgzip).
    @param  fnfai  File name of the FASTA index.
    @param  fngzi  File name of the bgzip index.
    @param  flags  Option flags to control index file caching and creation.
    @return Pointer to a faidx_t struct on success, NULL on failure.

If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
The bgzip index is only needed if fn is compressed.

If (flags & FAI_CREATE) is true, the index files will be built using
fai_build3() if they are not already present.

The struct returned by a successful call should be freed via fai_destroy()
when it is no longer needed.
*/
faidx_t *fai_load3(const(char) *fn, const(char) *fnfai, const(char) *fngzi,
                   int flags);

/// Load index from "fn.fai".
/** @param  fn  File name of the FASTA file
    @return Pointer to a faidx_t struct on success, NULL on failure.

This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE);

NOTE(review): FAI_CACHE is not declared in fai_load_options in this module
(only FAI_CREATE is) -- confirm the flag set against the upstream header.
*/
faidx_t *fai_load(const(char) *fn);

/// Load FASTA or FASTQ indexes.
/** @param  fn     File name of the FASTA/FASTQ file (can be compressed with bgzip).
    @param  fnfai  File name of the FASTA/FASTQ index.
    @param  fngzi  File name of the bgzip index.
    @param  flags  Option flags to control index file caching and creation.
    @param  format FASTA or FASTQ file format
    @return Pointer to a faidx_t struct on success, NULL on failure.

If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
The bgzip index is only needed if fn is compressed.

If (flags & FAI_CREATE) is true, the index files will be built using
fai_build3() if they are not already present.

The struct returned by a successful call should be freed via fai_destroy()
when it is no longer needed.
*/
faidx_t *fai_load3_format(const(char) *fn, const(char) *fnfai, const(char) *fngzi,
                   int flags, fai_format_options format);

/// Load index from "fn.fai".
/** @param  fn  File name of the FASTA/FASTQ file
    @param  format FASTA or FASTQ file format
    @return Pointer to a faidx_t struct on success, NULL on failure.

This function is equivalent to fai_load3_format(fn, NULL, NULL, FAI_CREATE|FAI_CACHE, format);

NOTE(review): FAI_CACHE is not declared in fai_load_options in this module
(only FAI_CREATE is) -- confirm the flag set against the upstream header.
*/
faidx_t *fai_load_format(const(char) *fn, fai_format_options format);

/// Fetch the sequence in a region
/** @param  fai  Pointer to the faidx_t struct
    @param  reg  Region in the format "chr2:20,000-30,000"
    @param  len  Length of the region; -2 if seq not present, -1 general error
    @return      Pointer to the sequence; `NULL` on failure

The returned sequence is allocated by `malloc()` family and should be destroyed
by end users by calling `free()` on it.

To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
are reference names, quote using curly braces.
Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
*/
char *fai_fetch(const(faidx_t) *fai, const(char) *reg, int *len);
/// ditto
char *fai_fetch64(const(faidx_t) *fai, const(char) *reg, hts_pos_t *len);

/// Fetch the quality string for a region for FASTQ files
/** @param  fai  Pointer to the faidx_t struct
    @param  reg  Region in the format "chr2:20,000-30,000"
    @param  len  Length of the region; -2 if seq not present, -1 general error
    @return      Pointer to the quality string; null on failure

The returned quality string is allocated by `malloc()` family and should be
destroyed by end users by calling `free()` on it.

Region names can be quoted with curly braces, as for fai_fetch().
*/
char *fai_fetchqual(const(faidx_t) *fai, const(char) *reg, int *len);
/// ditto
char *fai_fetchqual64(const(faidx_t) *fai, const(char) *reg, hts_pos_t *len);

// The prototype below is left commented out in this binding; its upstream
// documentation is kept as a plain (non-Ddoc) comment so that it is not merged
// into the documentation of the next declaration.
//
// Fetch the number of sequences
//   @param  fai  Pointer to the faidx_t struct
//   @return      The number of sequences
//
// int faidx_fetch_nseq(const faidx_t *fai) HTS_DEPRECATED("Please use faidx_nseq instead");

/// Fetch the sequence in a region
/** @param  fai  Pointer to the faidx_t struct
    @param  c_name Region name
    @param  p_beg_i  Beginning position number (zero-based)
    @param  p_end_i  End position number (zero-based)
    @param  len  Length of the region; -2 if c_name not present, -1 general error
    @return      Pointer to the sequence; null on failure

The returned sequence is allocated by `malloc()` family and should be destroyed
by end users by calling `free()` on it.
*/
char *faidx_fetch_seq(const(faidx_t) *fai, const(char) *c_name, int p_beg_i, int p_end_i, int *len);

/// Fetch the sequence in a region (positions and length passed as hts_pos_t)
/** @param  fai  Pointer to the faidx_t struct
    @param  c_name Region name
    @param  p_beg_i  Beginning position number (zero-based)
    @param  p_end_i  End position number (zero-based)
    @param  len  Length of the region; -2 if c_name not present, -1 general error
    @return      Pointer to the sequence; null on failure

The returned sequence is allocated by `malloc()` family and should be destroyed
by end users by calling `free()` on it.
*/
char *faidx_fetch_seq64(const(faidx_t) *fai, const(char) *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len);

/// Fetch the quality string in a region for FASTQ files
/** @param  fai  Pointer to the faidx_t struct
    @param  c_name Region name
    @param  p_beg_i  Beginning position number (zero-based)
    @param  p_end_i  End position number (zero-based)
    @param  len  Length of the region; -2 if c_name not present, -1 general error
    @return      Pointer to the quality string; null on failure

The returned quality string is allocated by `malloc()` family and should be
destroyed by end users by calling `free()` on it.
*/
char *faidx_fetch_qual(const(faidx_t) *fai, const(char) *c_name, int p_beg_i, int p_end_i, int *len);

/// Fetch the quality string in a region for FASTQ files (positions and length passed as hts_pos_t)
/** @param  fai  Pointer to the faidx_t struct
    @param  c_name Region name
    @param  p_beg_i  Beginning position number (zero-based)
    @param  p_end_i  End position number (zero-based)
    @param  len  Length of the region; -2 if c_name not present, -1 general error
    @return      Pointer to the quality string; null on failure

The returned quality string is allocated by `malloc()` family and should be
destroyed by end users by calling `free()` on it.
*/
char *faidx_fetch_qual64(const(faidx_t) *fai, const(char) *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); // @suppress(dscanner.style.long_line)

/// Query if sequence is present
/**   @param  fai  Pointer to the faidx_t struct
      @param  seq  Sequence name
      @return      1 if present or 0 if absent
*/
int faidx_has_seq(const(faidx_t) *fai, const(char) *seq);

/// Return number of sequences in fai index
int faidx_nseq(const(faidx_t) *fai);

/// Return name of i-th sequence
const(char) *faidx_iseq(const(faidx_t) *fai, int i);

/// Return sequence length, -1 if not present
int faidx_seq_len(const(faidx_t) *fai, const(char) *seq);

/// Parses a region string.
/** @param  fai   Pointer to the faidx_t struct
    @param  s     Region string
    @param  tid   Returns which i-th sequence is described in the region.
    @param  beg   Returns the start of the region (0 based)
    @param  end   Returns the one past last of the region (0 based)
    @param  flags Parsing method, see HTS_PARSE_* in hts.h.
    @return       Pointer to end of parsed s on success, NULL if not.

To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
are reference names, quote using curly braces.
Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
*/
const(char) *fai_parse_region(const(faidx_t) *fai, const(char) *s,
                              int *tid, hts_pos_t *beg, hts_pos_t *end,
                              int flags);

/// Sets the cache size of the underlying BGZF compressed file
/** @param  fai         Pointer to the faidx_t struct
 *  @param  cache_size  Selected cache size in bytes
 */
void fai_set_cache_size(faidx_t *fai, int cache_size);