1 /// @file htslib/faidx.h
2 /// FASTA random access.
3 /*
4    Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020 Genome Research Ltd.
5 
6    Author: Heng Li <lh3@sanger.ac.uk>
7 
8    Permission is hereby granted, free of charge, to any person obtaining
9    a copy of this software and associated documentation files (the
10    "Software"), to deal in the Software without restriction, including
11    without limitation the rights to use, copy, modify, merge, publish,
12    distribute, sublicense, and/or sell copies of the Software, and to
13    permit persons to whom the Software is furnished to do so, subject to
14    the following conditions:
15 
16    The above copyright notice and this permission notice shall be
17    included in all copies or substantial portions of the Software.
18 
19    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
23    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
24    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
25    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26    SOFTWARE.
27 */
28 
29 module htslib.faidx;
30 
31 import htslib.hts : hts_pos_t;
32 
33 @system:
34 nothrow:
35 @nogc:
36 
37 extern (C):
38 
39 /** @file
40 
41   Index FASTA or FASTQ files and extract subsequence.
42 
43   The fai file index columns for FASTA are:
44     - chromosome name
45     - chromosome length: number of bases
46     - offset: number of bytes to skip to get to the first base
47         from the beginning of the file, including the length
48         of the sequence description string (`>chr ..\n`)
49     - line length: number of bases per line (excluding `\n`)
50     - binary line length: number of bytes, including `\n`
51 
52    The index for FASTQ is similar to above:
53     - chromosome name
54     - chromosome length: number of bases
55     - sequence offset: number of bytes to skip to get to the first base
56         from the beginning of the file, including the length
57         of the sequence description string (`@chr ..\n`)
58     - line length: number of bases per line (excluding `\n`)
59     - binary line length: number of bytes, including `\n`
60     - quality offset: number of bytes to skip from the beginning of the file
61         to get to the first quality value in the indexed entry.
62 
63     The FASTQ version of the index uses line length and binary line length
64     for both the sequence and the quality values, so they must be line
65     wrapped in the same way.
66  */
67 
68 struct __faidx_t;
69 /// Opaque structure representing FASTA index
70 alias faidx_t = __faidx_t;
71 
72 /// File format to be dealing with.
73 enum fai_format_options
74 {
75     FAI_NONE = 0,
76     FAI_FASTA = 1,
77     FAI_FASTQ = 2
78 }
79 
80 /// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file.
81 /**  @param  fn  FASTA/FASTQ file name
82      @param  fnfai Name of .fai file to build.
83      @param  fngzi Name of .gzi file to build (if fn is bgzip-compressed).
84      @return     0 on success; or -1 on failure
85 
86 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
87 If fngzi is NULL, ".gzi" will be appended to fn for the GZI file.  The GZI
88 file will only be built if fn is bgzip-compressed.
89 */
90 int fai_build3(const(char)* fn, const(char)* fnfai, const(char)* fngzi);
91 
92 /// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file.
93 /** @param  fn  FASTA/FASTQ file name
94     @return     0 on success; or -1 on failure
95 
96 File "fn.fai" will be generated.  This function is equivalent to
97 fai_build3(fn, NULL, NULL);
98 */
99 int fai_build(const(char)* fn);
100 
101 /// Destroy a faidx_t struct
102 void fai_destroy(faidx_t* fai);
103 
104 enum fai_load_options
105 {
106     FAI_CREATE = 0x01
107 }
108 
109 /// Load FASTA indexes.
110 /** @param  fn  File name of the FASTA file (can be compressed with bgzip).
111     @param  fnfai File name of the FASTA index.
112     @param  fngzi File name of the bgzip index.
113     @param  flags Option flags to control index file caching and creation.
114     @return Pointer to a faidx_t struct on success, NULL on failure.
115 
116 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
117 If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
118 The bgzip index is only needed if fn is compressed.
119 
120 If (flags & FAI_CREATE) is true, the index files will be built using
121 fai_build3() if they are not already present.
122 
123 The struct returned by a successful call should be freed via fai_destroy()
124 when it is no longer needed.
125 */
126 faidx_t* fai_load3(
127     const(char)* fn,
128     const(char)* fnfai,
129     const(char)* fngzi,
130     int flags);
131 
132 /// Load index from "fn.fai".
133 /** @param  fn  File name of the FASTA file
134     @return Pointer to a faidx_t struct on success, NULL on failure.
135 
136 This function is equivalent to fai_load3(fn, NULL, NULL, FAI_CREATE|FAI_CACHE);
137 */
138 faidx_t* fai_load(const(char)* fn);
139 
140 /// Load FASTA or FASTQ indexes.
141 /** @param  fn  File name of the FASTA/FASTQ file (can be compressed with bgzip).
142     @param  fnfai File name of the FASTA/FASTQ index.
143     @param  fngzi File name of the bgzip index.
144     @param  flags Option flags to control index file caching and creation.
145     @param  format FASTA or FASTQ file format
146     @return Pointer to a faidx_t struct on success, NULL on failure.
147 
148 If fnfai is NULL, ".fai" will be appended to fn to make the FAI file name.
149 If fngzi is NULL, ".gzi" will be appended to fn for the bgzip index name.
150 The bgzip index is only needed if fn is compressed.
151 
152 If (flags & FAI_CREATE) is true, the index files will be built using
153 fai_build3() if they are not already present.
154 
155 The struct returned by a successful call should be freed via fai_destroy()
156 when it is no longer needed.
157 */
158 faidx_t* fai_load3_format(
159     const(char)* fn,
160     const(char)* fnfai,
161     const(char)* fngzi,
162     int flags,
163     fai_format_options format);
164 
165 /// Load index from "fn.fai".
166 /** @param  fn  File name of the FASTA/FASTQ file
167     @param  format FASTA or FASTQ file format
168     @return Pointer to a faidx_t struct on success, NULL on failure.
169 
170 This function is equivalent to fai_load3_format(fn, NULL, NULL, FAI_CREATE|FAI_CACHE, format);
171 */
172 faidx_t* fai_load_format(const(char)* fn, fai_format_options format);
173 
174 /// Fetch the sequence in a region
175 /** @param  fai  Pointer to the faidx_t struct
176     @param  reg  Region in the format "chr2:20,000-30,000"
177     @param  len  Length of the region; -2 if seq not present, -1 general error
178     @return      Pointer to the sequence; `NULL` on failure
179 
180 The returned sequence is allocated by `malloc()` family and should be destroyed
181 by end users by calling `free()` on it.
182 
183 To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
184 are reference names, quote using curly braces.
185 Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
186 */
187 char* fai_fetch(const(faidx_t)* fai, const(char)* reg, int* len);
188 char* fai_fetch64(const(faidx_t)* fai, const(char)* reg, hts_pos_t* len);
189 
190 /// Fetch the quality string for a region for FASTQ files
191 /** @param  fai  Pointer to the faidx_t struct
192     @param  reg  Region in the format "chr2:20,000-30,000"
193     @param  len  Length of the region; -2 if seq not present, -1 general error
194     @return      Pointer to the quality string; null on failure
195 
196 The returned quality string is allocated by `malloc()` family and should be
197 destroyed by end users by calling `free()` on it.
198 
199 Region names can be quoted with curly braces, as for fai_fetch().
200 */
201 char* fai_fetchqual(const(faidx_t)* fai, const(char)* reg, int* len);
202 char* fai_fetchqual64(const(faidx_t)* fai, const(char)* reg, hts_pos_t* len);
203 
204 /// Fetch the number of sequences
205 /** @param  fai  Pointer to the faidx_t struct
206     @return      The number of sequences
207 */
208 int faidx_fetch_nseq(const(faidx_t)* fai);
209 
210 /// Fetch the sequence in a region
211 /** @param  fai  Pointer to the faidx_t struct
212     @param  c_name Region name
213     @param  p_beg_i  Beginning position number (zero-based)
214     @param  p_end_i  End position number (zero-based)
215     @param  len  Length of the region; -2 if c_name not present, -1 general error
216     @return      Pointer to the sequence; null on failure
217 
218 The returned sequence is allocated by `malloc()` family and should be destroyed
219 by end users by calling `free()` on it.
220 */
221 char* faidx_fetch_seq(
222     const(faidx_t)* fai,
223     const(char)* c_name,
224     int p_beg_i,
225     int p_end_i,
226     int* len);
227 
228 /// Fetch the sequence in a region
229 /** @param  fai  Pointer to the faidx_t struct
230     @param  c_name Region name
231     @param  p_beg_i  Beginning position number (zero-based)
232     @param  p_end_i  End position number (zero-based)
233     @param  len  Length of the region; -2 if c_name not present, -1 general error
234     @return      Pointer to the sequence; null on failure
235 
236 The returned sequence is allocated by `malloc()` family and should be destroyed
237 by end users by calling `free()` on it.
238 */
239 char* faidx_fetch_seq64(
240     const(faidx_t)* fai,
241     const(char)* c_name,
242     hts_pos_t p_beg_i,
243     hts_pos_t p_end_i,
244     hts_pos_t* len);
245 
246 /// Fetch the quality string in a region for FASTQ files
247 /** @param  fai  Pointer to the faidx_t struct
248     @param  c_name Region name
249     @param  p_beg_i  Beginning position number (zero-based)
250     @param  p_end_i  End position number (zero-based)
251     @param  len  Length of the region; -2 if c_name not present, -1 general error
252     @return      Pointer to the sequence; null on failure
253 
254 The returned sequence is allocated by `malloc()` family and should be destroyed
255 by end users by calling `free()` on it.
256 */
257 char* faidx_fetch_qual(
258     const(faidx_t)* fai,
259     const(char)* c_name,
260     int p_beg_i,
261     int p_end_i,
262     int* len);
263 
264 /// Fetch the quality string in a region for FASTQ files
265 /** @param  fai  Pointer to the faidx_t struct
266     @param  c_name Region name
267     @param  p_beg_i  Beginning position number (zero-based)
268     @param  p_end_i  End position number (zero-based)
269     @param  len  Length of the region; -2 if c_name not present, -1 general error
270     @return      Pointer to the sequence; null on failure
271 
272 The returned sequence is allocated by `malloc()` family and should be destroyed
273 by end users by calling `free()` on it.
274 */
275 char* faidx_fetch_qual64(
276     const(faidx_t)* fai,
277     const(char)* c_name,
278     hts_pos_t p_beg_i,
279     hts_pos_t p_end_i,
280     hts_pos_t* len);
281 
282 /// Query if sequence is present
283 /**   @param  fai  Pointer to the faidx_t struct
284       @param  seq  Sequence name
285       @return      1 if present or 0 if absent
286 */
287 int faidx_has_seq(const(faidx_t)* fai, const(char)* seq);
288 
289 /// Return number of sequences in fai index
290 int faidx_nseq(const(faidx_t)* fai);
291 
292 /// Return name of i-th sequence
293 const(char)* faidx_iseq(const(faidx_t)* fai, int i);
294 
295 /// Return sequence length, -1 if not present
296 int faidx_seq_len(const(faidx_t)* fai, const(char)* seq);
297 
298 /// Parses a region string.
299 /** @param  fai   Pointer to the faidx_t struct
300     @param  s     Region string
301     @param  tid   Returns which i-th sequence is described in the region.
302     @param  beg   Returns the start of the region (0 based)
303     @param  end   Returns the one past last of the region (0 based)
304     @param  flags Parsing method, see HTS_PARSE_* in hts.h.
305     @return       Pointer to end of parsed s if successful, NULL if not.
306 
307     To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
308     are reference names, quote using curly braces.
309     Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
310 */
311 const(char)* fai_parse_region(
312     const(faidx_t)* fai,
313     const(char)* s,
314     int* tid,
315     hts_pos_t* beg,
316     hts_pos_t* end,
317     int flags);
318 
319 /// Sets the cache size of the underlying BGZF compressed file
320 /** @param  fai         Pointer to the faidx_t struct
321  *  @param  cache_size  Selected cache size in bytes
322  */
323 void fai_set_cache_size(faidx_t* fai, int cache_size);
324 
325 /// Determines the path to the reference index file
326 /** @param  fa    String with the path to the reference file
327  *  @return       String with the path to the reference index file, or NULL on failure
328 
329     If the reference path has the format reference.fa##idx##index.fa.fai,
330     the index path is taken directly from it as index.fa.fai.
331     If the reference file is local and the index file cannot be found, it
332     will be created alongside the reference file.
333     If the reference file is remote and the index file cannot be found,
334     the method returns NULL.
335 
336     The returned string has to be freed by the user at the end of its scope.
337  */
338 char* fai_path(const(char)* fa);
339