1 // htslib-1.9 faidx.h as D module
2 module htslib.faidx;
3 import htslib.bgzf : BGZF;
4 extern (C):
5 
6 // @file htslib/faidx.h
7 // FASTA random access.
8 /*
9    Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2019 Genome Research Ltd.
10 
11    Author: Heng Li <lh3@sanger.ac.uk>
12 
13    Permission is hereby granted, free of charge, to any person obtaining
14    a copy of this software and associated documentation files (the
15    "Software"), to deal in the Software without restriction, including
16    without limitation the rights to use, copy, modify, merge, publish,
17    distribute, sublicense, and/or sell copies of the Software, and to
18    permit persons to whom the Software is furnished to do so, subject to
19    the following conditions:
20 
21    The above copyright notice and this permission notice shall be
22    included in all copies or substantial portions of the Software.
23 
24    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31    SOFTWARE.
32 */
33 
34 import core.stdc.stdint;
35 import htslib.hts;
36 
37 /** @file
38 
39   Index FASTA or FASTQ files and extract subsequence.
40 
41   The fai file index columns for FASTA are:
42     - chromosome name
43     - chromosome length: number of bases
44     - offset: number of bytes to skip to get to the first base
45         from the beginning of the file, including the length
46         of the sequence description string (`>chr ..\n`)
47     - line length: number of bases per line (excluding `\n`)
48     - binary line length: number of bytes, including `\n`
49 
50    The index for FASTQ is similar to above:
51     - chromosome name
52     - chromosome length: number of bases
53     - sequence offset: number of bytes to skip to get to the first base
54         from the beginning of the file, including the length
55         of the sequence description string (`@chr ..\n`)
56     - line length: number of bases per line (excluding `\n`)
57     - binary line length: number of bytes, including `\n`
58     - quality offset: number of bytes to skip from the beginning of the file
59         to get to the first quality value in the indexed entry.
60 
61     The FASTQ version of the index uses line length and binary line length
62     for both the sequence and the quality values, so they must be line
63     wrapped in the same way.
64  */
65 
66 /// FASTA index handle; callers are expected to treat it as opaque
67 struct __faidx_t
68 {
69     BGZF *bgzf; // only member declared in this binding. NOTE(review): presumably the C struct has further fields, so use faidx_t only through pointers -- TODO confirm
70 }   // @suppress(dscanner.style.phobos_naming_convention)
71 /// ditto
72 alias faidx_t = __faidx_t;
73 
74 /// Sequence file format selector (see fai_load3_format() and fai_load_format()).
75 enum fai_format_options { // @suppress(dscanner.style.phobos_naming_convention)
76     FAI_NONE  = 0,
77     FAI_FASTA = 1,
78     FAI_FASTQ = 2,
79 }
80 
81 /// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file.
82 /**  @param  fn  FASTA/FASTQ file name
83      @param  fnfai Name of .fai file to build; may be null (see below).
84      @param  fngzi Name of .gzi file to build (if fn is bgzip-compressed); may be null (see below).
85      @return     0 on success; or -1 on failure
86 
87 If fnfai is null, ".fai" will be appended to fn to make the FAI file name.
88 If fngzi is null, ".gzi" will be appended to fn for the GZI file.  The GZI
89 file will only be built if fn is bgzip-compressed.
90 */
91 int fai_build3(const(char) *fn, const(char) *fnfai, const(char) *fngzi);
92 
93 /// Build index for a FASTA or FASTQ or bgzip-compressed FASTA or FASTQ file.
94 /** @param  fn  FASTA/FASTQ file name
95     @return     0 on success; or -1 on failure
96 
97 File "fn.fai" will be generated; equivalent to fai_build3(fn, null, null).
98 */
99 int fai_build(const(char) *fn);
100 
101 /// Destroy a faidx_t struct returned by one of the fai_load*() functions once it is no longer needed
102 void fai_destroy(faidx_t *fai);
103 
104 /// Flag bits accepted in the `flags` argument of fai_load3() and fai_load3_format()
105 enum fai_load_options { // @suppress(dscanner.style.phobos_naming_convention)
106     FAI_CREATE = 1 << 0, /// build the index files if they are not already present
107 }
108 
109 /// Load FASTA indexes.
110 /** @param  fn  File name of the FASTA file (can be compressed with bgzip).
111     @param  fnfai File name of the FASTA index; may be null (see below).
112     @param  fngzi File name of the bgzip index; may be null (see below).
113     @param  flags Option flags, tested as bits (e.g. flags & FAI_CREATE), to control index file caching and creation.
114     @return Pointer to a faidx_t struct on success, null on failure.
115 
116 If fnfai is null, ".fai" will be appended to fn to make the FAI file name.
117 If fngzi is null, ".gzi" will be appended to fn for the bgzip index name.
118 The bgzip index is only needed if fn is compressed.
119 
120 If (flags & FAI_CREATE) is true, the index files will be built using
121 fai_build3() if they are not already present.
122 
123 The struct returned by a successful call should be freed via fai_destroy()
124 when it is no longer needed.
125 */
126 faidx_t *fai_load3(const(char) *fn, const(char) *fnfai, const(char) *fngzi,
127                    int flags);
128 
129 /// Load index from "fn.fai".
130 /** @param  fn  File name of the FASTA file
131     @return Pointer to a faidx_t struct on success, null on failure.
132 
133 This function is equivalent to fai_load3(fn, null, null, FAI_CREATE|FAI_CACHE);
134 */
135 faidx_t *fai_load(const(char) *fn); // NOTE(review): FAI_CACHE (named in the doc above) is not a member of fai_load_options in this module -- confirm against the C header
136 
137 /// Load FASTA or FASTQ indexes.
138 /** @param  fn  File name of the FASTA/FASTQ file (can be compressed with bgzip).
139     @param  fnfai File name of the FASTA/FASTQ index; may be null (see below).
140     @param  fngzi File name of the bgzip index; may be null (see below).
141     @param  flags Option flags, tested as bits (e.g. flags & FAI_CREATE), to control index file caching and creation.
142     @param  format FASTA or FASTQ file format
143     @return Pointer to a faidx_t struct on success, null on failure.
144 
145 If fnfai is null, ".fai" will be appended to fn to make the FAI file name.
146 If fngzi is null, ".gzi" will be appended to fn for the bgzip index name.
147 The bgzip index is only needed if fn is compressed.
148 
149 If (flags & FAI_CREATE) is true, the index files will be built using
150 fai_build3() if they are not already present.
151 
152 The struct returned by a successful call should be freed via fai_destroy()
153 when it is no longer needed.
154 */
155 faidx_t *fai_load3_format(const(char) *fn, const(char) *fnfai, const(char) *fngzi,
156                    int flags, fai_format_options format);
157 
158 /// Load index from "fn.fai".
159 /** @param  fn  File name of the FASTA/FASTQ file
160     @param  format FASTA or FASTQ file format
161     @return Pointer to a faidx_t struct on success, null on failure.
162 
163 This function is equivalent to fai_load3_format(fn, null, null, FAI_CREATE|FAI_CACHE, format);
164 */
165 faidx_t *fai_load_format(const(char) *fn, fai_format_options format); // NOTE(review): FAI_CACHE (named in the doc above) is not a member of fai_load_options in this module -- confirm against the C header
166 
167 /// Fetch the sequence in a region
168 /** @param  fai  Pointer to the faidx_t struct
169     @param  reg  Region in the format "chr2:20,000-30,000"
170     @param  len  Output: length of the region; -2 if seq not present, -1 general error
171     @return      Pointer to the sequence; null on failure
172 
173 The returned sequence is allocated by `malloc()` family and should be destroyed
174 by end users by calling `free()` on it.
175 
176 To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
177 are reference names, quote using curly braces.
178 Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
179 */
180 char *fai_fetch(const(faidx_t) *fai, const(char) *reg, int *len);
181 char *fai_fetch64(const(faidx_t) *fai, const(char) *reg, hts_pos_t *len); /// ditto
182 
183 /// Fetch the quality string for a region for FASTQ files
184 /** @param  fai  Pointer to the faidx_t struct
185     @param  reg  Region in the format "chr2:20,000-30,000"
186     @param  len  Output: length of the region; -2 if seq not present, -1 general error
187     @return      Pointer to the quality string; null on failure
188 
189 The returned quality string is allocated by `malloc()` family and should be
190 destroyed by end users by calling `free()` on it.
191 
192 Region names can be quoted with curly braces, as for fai_fetch().
193 */
194 char *fai_fetchqual(const(faidx_t) *fai, const(char) *reg, int *len);
195 char *fai_fetchqual64(const(faidx_t) *fai, const(char) *reg, hts_pos_t *len); /// ditto
196 
197 // Fetch the number of sequences (deprecated C function, left commented out here; use faidx_nseq instead)
198 /*  @param  fai  Pointer to the faidx_t struct
199     @return      The number of sequences
200 */
201 // int faidx_fetch_nseq(const faidx_t *fai) HTS_DEPRECATED("Please use faidx_nseq instead");
202 
203 /// Fetch the sequence in a region given as a name plus `int` coordinates
204 /** @param  fai  Pointer to the faidx_t struct
205     @param  c_name Region name
206     @param  p_beg_i  Beginning position number (zero-based)
207     @param  p_end_i  End position number (zero-based)
208     @param  len  Output: length of the region; -2 if c_name not present, -1 general error
209     @return      Pointer to the sequence; null on failure
210 
211 The returned sequence is allocated by `malloc()` family and should be destroyed
212 by end users by calling `free()` on it.
213 */
214 char *faidx_fetch_seq(const(faidx_t) *fai, const(char) *c_name, int p_beg_i, int p_end_i, int *len);
215 
216 /// Fetch the sequence in a region given as a name plus `hts_pos_t` coordinates
217 /** @param  fai  Pointer to the faidx_t struct
218     @param  c_name Region name
219     @param  p_beg_i  Beginning position number (zero-based)
220     @param  p_end_i  End position number (zero-based)
221     @param  len  Output: length of the region; -2 if c_name not present, -1 general error
222     @return      Pointer to the sequence; null on failure
223 
224 The returned sequence is allocated by `malloc()` family and should be destroyed
225 by end users by calling `free()` on it.
226 */
227 char *faidx_fetch_seq64(const(faidx_t) *fai, const(char) *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len);
228 
229 /// Fetch the quality string in a region for FASTQ files, given a name plus `int` coordinates
230 /** @param  fai  Pointer to the faidx_t struct
231     @param  c_name Region name
232     @param  p_beg_i  Beginning position number (zero-based)
233     @param  p_end_i  End position number (zero-based)
234     @param  len  Output: length of the region; -2 if c_name not present, -1 general error
235     @return      Pointer to the quality string; null on failure
236 
237 The returned quality string is allocated by `malloc()` family and should be
238 destroyed by end users by calling `free()` on it.
239 */
240 char *faidx_fetch_qual(const(faidx_t) *fai, const(char) *c_name, int p_beg_i, int p_end_i, int *len);
241 
242 /// Fetch the quality string in a region for FASTQ files, given a name plus `hts_pos_t` coordinates
243 /** @param  fai  Pointer to the faidx_t struct
244     @param  c_name Region name
245     @param  p_beg_i  Beginning position number (zero-based)
246     @param  p_end_i  End position number (zero-based)
247     @param  len  Output: length of the region; -2 if c_name not present, -1 general error
248     @return      Pointer to the quality string; null on failure
249 
250 The returned quality string is allocated by `malloc()` family and should be
251 destroyed by end users by calling `free()` on it.
252 */
253 char *faidx_fetch_qual64(const(faidx_t) *fai, const(char) *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); // @suppress(dscanner.style.long_line)
254 
255 /// Query if a sequence with the given name is present in the index
256 /**   @param  fai  Pointer to the faidx_t struct
257       @param  seq  Sequence name
258       @return      1 if present or 0 if absent
259 */
260 int faidx_has_seq(const(faidx_t) *fai, const(char) *seq);
261 
262 /// Return the number of sequences in the fai index pointed to by `fai`
263 int faidx_nseq(const(faidx_t) *fai);
264 
265 /// Return name of i-th sequence (presumably i must be in [0, faidx_nseq(fai)) -- TODO confirm)
266 const(char) *faidx_iseq(const(faidx_t) *fai, int i);
267 
268 /// Return the length of the sequence named `seq`, or -1 if it is not present
269 int faidx_seq_len(const(faidx_t) *fai, const(char) *seq);
270 
271 /// Parses a region string.
272 /** @param  fai   Pointer to the faidx_t struct
273     @param  s     Region string
274     @param  tid   Returns which i-th sequence is described in the region.
275     @param  beg   Returns the start of the region (0 based)
276     @param  end   Returns one past the last position of the region (0 based)
277     @param  flags Parsing method, see HTS_PARSE_* in hts.h.
278     @return      Pointer to the end of the parsed part of s on success, null on failure.
279 
280     To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
281     are reference names, quote using curly braces.
282     Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
283 */
284 const(char) *fai_parse_region(const(faidx_t) *fai, const(char) *s,
285                              int *tid, hts_pos_t *beg, hts_pos_t *end,
286                              int flags);
287 
288 /// Sets the cache size of the underlying BGZF compressed file
289 /** @param  fai         Pointer to the faidx_t struct (non-const, unlike the query functions)
290  *  @param  cache_size  Selected cache size in bytes
291  */
292 void fai_set_cache_size(faidx_t *fai, int cache_size);
293