1 /// @file htslib/hts.h
2 /// Format-neutral I/O, indexing, and iterator API functions.
3 /*
4     Copyright (C) 2012-2022 Genome Research Ltd.
5     Copyright (C) 2010, 2012 Broad Institute.
6     Portions copyright (C) 2003-2006, 2008-2010 by Heng Li <lh3@live.co.uk>
7 
8     Author: Heng Li <lh3@sanger.ac.uk>
9 
10 Permission is hereby granted, free of charge, to any person obtaining a copy
11 of this software and associated documentation files (the "Software"), to deal
12 in the Software without restriction, including without limitation the rights
13 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 copies of the Software, and to permit persons to whom the Software is
15 furnished to do so, subject to the following conditions:
16 
17 The above copyright notice and this permission notice shall be included in
18 all copies or substantial portions of the Software.
19 
20 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
25 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.  */
27 module htslib.hts;
28 
29 import core.stdc.config;
30 import core.stdc.inttypes;
31 import core.stdc.limits;
32 import core.stdc.stdint;
33 
34 import htslib.bgzf : BGZF;
35 import htslib.cram : cram_fd;
36 import htslib.hfile : hFILE;
37 import htslib.thread_pool : hts_tpool;
38 import htslib.sam : sam_hdr_t;
39 import htslib.kstring : kstring_t;
40 
41 @system:
42 nothrow:
43 @nogc:
44 
45 extern (C):
46 
47 // Separator used to split HTS_PATH (for plugins); REF_PATH (cram references)
48 
49 version(Windows)
50 {
51     enum HTS_PATH_SEPARATOR_CHAR = ';';
52     enum HTS_PATH_SEPARATOR_STR = ";";
53 }
54 else
55 {
56     enum HTS_PATH_SEPARATOR_CHAR = ':';
57     enum HTS_PATH_SEPARATOR_STR = ":";
58 }
59 
60 
61 /**
62  * @hideinitializer
63  * Deprecated macro to expand a dynamic array of a given type
64  *
65  * @param         type_t The type of the array elements
66  * @param[in]     n      Requested number of elements of type type_t
67  * @param[in,out] m      Size of memory allocated
68  * @param[in,out] ptr    Pointer to the array
69  *
70  * @discussion
71  * Do not use this macro.  Use hts_resize() instead as allows allocation
72  * failures to be handled more gracefully.
73  *
74  * The array *ptr will be expanded if necessary so that it can hold @p n
75  * or more elements.  If the array is expanded then the new size will be
76  * written to @p m and the value in @p ptr may change.
77  *
78  * It must be possible to take the address of @p ptr and @p m must be usable
79  * as an lvalue.
80  *
81  * @bug
82  * If the memory allocation fails, this will call exit(1).  This is
83  * not ideal behaviour in a library.
84  */
85 
86 /**
87  * @hideinitializer
88  * Macro to expand a dynamic array, zeroing any newly-allocated memory
89  *
90  * @param         type_t The type of the array elements
91  * @param[in]     n      Requested number of elements of type type_t
92  * @param[in,out] m      Size of memory allocated
93  * @param[in,out] ptr    Pointer to the array
94  *
95  * @discussion
96  * Do not use this macro.  Use hts_resize() instead as allows allocation
97  * failures to be handled more gracefully.
98  *
99  * As for hts_expand(), except the bytes that make up the array elements
100  * between the old and new values of @p m are set to zero using memset().
101  *
102  * @bug
103  * If the memory allocation fails, this will call exit(1).  This is
104  * not ideal behaviour in a library.
105  */
106 
107 // For internal use (by hts_resize()) only
108 int hts_resize_array_(size_t, size_t, size_t, void*, void**, int, const(char)*);
109 
110 enum HTS_RESIZE_CLEAR = 1;
111 
112 /**
113  * @hideinitializer
114  * Macro to expand a dynamic array of a given type
115  *
116  * @param         type_t    The type of the array elements
117  * @param[in]     num       Requested number of elements of type type_t
118  * @param[in,out] size_ptr  Pointer to where the size (in elements) of the
119                             array is stored.
120  * @param[in,out] ptr       Location of the pointer to the array
121  * @param[in]     flags     Option flags
122  *
123  * @return        0 for success, or negative if an error occurred.
124  *
125  * @discussion
126  * The array *ptr will be expanded if necessary so that it can hold @p num
127  * or more elements.  If the array is expanded then the new size will be
128  * written to @p *size_ptr and the value in @p *ptr may change.
129  *
130  * If ( @p flags & HTS_RESIZE_CLEAR ) is set, any newly allocated memory will
131  * be cleared.
132  */
133 
134 pragma(inline,true)
135 int hts_resize(T)(size_t num, ref size_t size, T* ptr, int flags)
136 {
137     return (num > size)
138         ? hts_resize_array_(T.sizeof, num, size_t.sizeof, &size, cast(void **)&ptr, flags, __FUNCTION__)
139         : 0;
140 }
141 
142 /// Release resources when dlclosing a dynamically loaded HTSlib
143 /** @discussion
144  *  Normally HTSlib cleans up automatically when your program exits,
145  *  whether that is via exit(3) or returning from main(). However if you
146  *  have dlopen(3)ed HTSlib and wish to close it before your main program
147  *  exits, you must call hts_lib_shutdown() before dlclose(3).
148 */
149 void hts_lib_shutdown();
150 
151 /**
152  * Wrapper function for free(). Enables memory deallocation across DLL
153  * boundary. Should be used by all applications, which are compiled
154  * with a different standard library than htslib and call htslib
155  * methods that return dynamically allocated data.
156  */
157 void hts_free(void* ptr);
158 
159 /************
160  * File I/O *
161  ************/
162 
163 // Add new entries only at the end (but before the *_maximum entry)
164 // of these enums, as their numbering is part of the htslib ABI.
165 
166 /// Broad format category (sequence data, variant data, index, regions, etc.)
167 enum htsFormatCategory // @suppress(dscanner.style.phobos_naming_convention)
168 {
169     unknown_category = 0,
170     sequence_data = 1, // Sequence data -- SAM, BAM, CRAM, etc
171     variant_data = 2, // Variant calling data -- VCF, BCF, etc
172     index_file = 3, // Index file associated with some data file
173     region_list = 4, // Coordinate intervals or regions -- BED, etc
174     category_maximum = 32_767
175 }
176 
177 /// Specific format (SAM, BAM, CRAM, BCF, VCF, TBI, BED, etc.)
178 enum htsExactFormat // @suppress(dscanner.style.phobos_naming_convention)
179 {
180     unknown_format = 0,
181     binary_format = 1,
182     text_format = 2,
183     sam = 3,
184     bam = 4,
185     bai = 5,
186     cram = 6,
187     crai = 7,
188     vcf = 8,
189     bcf = 9,
190     csi = 10,
191     gzi = 11,
192     tbi = 12,
193     bed = 13,
194     htsget = 14,
195     json = 14,
196     empty_format = 15, // File is empty (or empty after decompression)
197     fasta_format = 16,
198     fastq_format = 17,
199     fai_format = 18,
200     fqi_format = 19,
201     hts_crypt4gh_format = 20,
202     d4_format = 21,
203     format_maximum = 32_767
204 }
205 
206 /// Compression type
207 enum htsCompression // @suppress(dscanner.style.phobos_naming_convention)
208 {
209     no_compression = 0,
210     gzip = 1,
211     bgzf = 2,
212     custom = 3,
213     bzip2_compression = 4,
214     razf_compression = 5,
215     xz_compression = 6,
216     zstd_compression = 7,
217     compression_maximum = 32_767
218 }
219 
220 struct htsFormat
221 {
222     htsFormatCategory category; /// Broad format category (sequence data, variant data, index, regions, etc.)
223     htsExactFormat format;      /// Specific format (SAM, BAM, CRAM, BCF, VCF, TBI, BED, etc.)
224     /// format version
225     struct Vers { short major, minor; } // @suppress(dscanner.style.undocumented_declaration)
226     Vers v; /// format version
227     htsCompression compression; /// Compression type
228     short compression_level;/// currently unused
229     void *specific;         /// format specific options; see struct hts_opt.
230 }
231 
232 struct hts_idx_t;
233 struct hts_filter_t;
234 
235 /**
236  * @brief File handle returned by hts_open() etc.
237  * This structure should be considered opaque by end users. There should be
238  * no need to access most fields directly in user code, and in cases where
239  * it is desirable accessor functions such as hts_get_format() are provided.
240  */
241 // Maintainers note htsFile cannot be an incomplete struct because some of its
242 // fields are part of libhts.so's ABI (hence these fields must not be moved):
243 //  - fp is used in the public sam_itr_next()/etc macros
244 //  - is_bin is used directly in samtools <= 1.1 and bcftools <= 1.1
245 //  - is_write and is_cram are used directly in samtools <= 1.1
246 //  - fp is used directly in samtools (up to and including current develop)
247 //  - line is used directly in bcftools (up to and including current develop)
248 //  - is_bgzf and is_cram flags indicate which fp union member to use.
249 //    Note is_bgzf being set does not indicate the flag is BGZF compressed,
250 //    nor even whether it is compressed at all (eg on naked BAMs).
251 struct htsFile // @suppress(dscanner.style.phobos_naming_convention)
252 {
253     import std.bitmanip : bitfields;
254 
255     mixin(bitfields!(
256         uint, "is_bin", 1,
257         uint, "is_write", 1,
258         uint, "is_be", 1,
259         uint, "is_cram", 1,
260         uint, "is_bgzf", 1,
261         uint, "dummy", 27));
262     long lineno; /// uncompressed(?) file line no.
263     kstring_t line; /// buffer to hold line
264     char *fn;       /// filename
265     char *fn_aux;   /// auxillary (i.e, index) file name
266     /// hFile plus any needed bgzf or CRAM (if applicable) structure data
267     union FP {
268         BGZF *bgzf;     /// see bgzf.d
269         cram_fd *cram;  /// see cram.d
270         hFILE *hfile;   /// see hfile.d
271     }
272     FP fp;              /// hFile plus any needed bgzf or CRAM (if applicable) structure data
273     void* state; // format specific state information
274     htsFormat format;   /// hts file complete file format information
275     hts_idx_t* idx;
276     const(char)* fnidx;
277     sam_hdr_t* bam_header;
278     hts_filter_t* filter;
279 }
280 
281 /// A combined thread pool and queue allocation size.
282 /// The pool should already be defined, but qsize may be zero to
283 /// indicate an appropriate queue size is taken from the pool.
284 ///
285 /// Reasons for explicitly setting it could be where many more file
286 /// descriptors are in use than threads, so keeping memory low is
287 /// important.
288 struct htsThreadPool // @suppress(dscanner.style.phobos_naming_convention)
289 {
290     hts_tpool* pool; // The shared thread pool itself
291     int qsize; // Size of I/O queue to use for this fp
292 }
293 
294 /// REQUIRED_FIELDS
295 enum sam_fields // @suppress(dscanner.style.phobos_naming_convention)
296 {
297     SAM_QNAME = 0x00000001,
298     SAM_FLAG = 0x00000002,
299     SAM_RNAME = 0x00000004,
300     SAM_POS = 0x00000008,
301     SAM_MAPQ = 0x00000010,
302     SAM_CIGAR = 0x00000020,
303     SAM_RNEXT = 0x00000040,
304     SAM_PNEXT = 0x00000080,
305     SAM_TLEN = 0x00000100,
306     SAM_SEQ = 0x00000200,
307     SAM_QUAL = 0x00000400,
308     SAM_AUX = 0x00000800,
309     SAM_RGAUX = 0x00001000
310 }
311 
312 /// Mostly CRAM only, but this could also include other format options
313 enum hts_fmt_option
314 {
315     // CRAM specific
316     CRAM_OPT_DECODE_MD = 0,
317     CRAM_OPT_PREFIX = 1,
318     CRAM_OPT_VERBOSITY = 2, // obsolete, use hts_set_log_level() instead
319     CRAM_OPT_SEQS_PER_SLICE = 3,
320     CRAM_OPT_SLICES_PER_CONTAINER = 4,
321     CRAM_OPT_RANGE = 5,
322     CRAM_OPT_VERSION = 6, // rename to cram_version?
323     CRAM_OPT_EMBED_REF = 7,
324     CRAM_OPT_IGNORE_MD5 = 8,
325     CRAM_OPT_REFERENCE = 9, // make general
326     CRAM_OPT_MULTI_SEQ_PER_SLICE = 10,
327     CRAM_OPT_NO_REF = 11,
328     CRAM_OPT_USE_BZIP2 = 12,
329     CRAM_OPT_SHARED_REF = 13,
330     CRAM_OPT_NTHREADS = 14, // deprecated, use HTS_OPT_NTHREADS
331     CRAM_OPT_THREAD_POOL = 15, // make general
332     CRAM_OPT_USE_LZMA = 16,
333     CRAM_OPT_USE_RANS = 17,
334     CRAM_OPT_REQUIRED_FIELDS = 18,
335     CRAM_OPT_LOSSY_NAMES = 19,
336     CRAM_OPT_BASES_PER_SLICE = 20,
337     CRAM_OPT_STORE_MD = 21,
338     CRAM_OPT_STORE_NM = 22,
339     CRAM_OPT_RANGE_NOSEEK = 23, // CRAM_OPT_RANGE minus the seek
340     CRAM_OPT_USE_TOK = 24,
341     CRAM_OPT_USE_FQZ = 25,
342     CRAM_OPT_USE_ARITH = 26,
343     CRAM_OPT_POS_DELTA = 27, // force delta for AP, even on non-pos sorted data
344 
345     // General purpose
346     HTS_OPT_COMPRESSION_LEVEL = 100,
347     HTS_OPT_NTHREADS = 101,
348     HTS_OPT_THREAD_POOL = 102,
349     HTS_OPT_CACHE_SIZE = 103,
350     HTS_OPT_BLOCK_SIZE = 104,
351     HTS_OPT_FILTER = 105,
352     HTS_OPT_PROFILE = 106,
353 
354     // Fastq
355 
356     // Boolean.
357     // Read / Write CASAVA 1.8 format.
358     // See https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf
359     //
360     // The CASAVA tag matches \d:[YN]:\d+:[ACGTN]+
361     // The first \d is read 1/2 (1 or 2), [YN] is QC-PASS/FAIL flag,
362     // \d+ is a control number, and the sequence at the end is
363     // for barcode sequence.  Barcodes are read into the aux tag defined
364     // by FASTQ_OPT_BARCODE ("BC" by default).
365     FASTQ_OPT_CASAVA = 1000,
366 
367     // String.
368     // Whether to read / write extra SAM format aux tags from the fastq
369     // identifier line.  For reading this can simply be "1" to request
370     // decoding aux tags.  For writing it is a comma separated list of aux
371     // tag types to be written out.
372     FASTQ_OPT_AUX = 1001,
373 
374     // Boolean.
375     // Whether to add /1 and /2 to read identifiers when writing FASTQ.
376     // These come from the BAM_FREAD1 or BAM_FREAD2 flags.
377     // (Detecting the /1 and /2 is automatic when reading fastq.)
378     FASTQ_OPT_RNUM = 1002,
379 
380     // Two character string.
381     // Barcode aux tag for CASAVA; defaults to "BC".
382     FASTQ_OPT_BARCODE = 1003,
383 
384     // Process SRA and ENA read names which pointlessly move the original
385     // name to the second field and insert a constructed <run>.<number>
386     // name in its place.
387     FASTQ_OPT_NAME2 = 1004
388 }
389 
390 /// Profile options for encoding; primarily used at present in CRAM
391 /// but also usable in BAM as a synonym for deflate compression levels.
392 enum hts_profile_option
393 {
394     HTS_PROFILE_FAST = 0,
395     HTS_PROFILE_NORMAL = 1,
396     HTS_PROFILE_SMALL = 2,
397     HTS_PROFILE_ARCHIVE = 3
398 }
399 
400 /// For backwards compatibility
401 alias cram_option = hts_fmt_option;
402 
403 /// Options for cache, (de)compression, threads, CRAM, etc.
404 struct hts_opt // @suppress(dscanner.style.phobos_naming_convention)
405 {
406     char *arg;          /// string form, strdup()ed
407     hts_fmt_option opt; /// tokenised key
408     /// option value
409     union VAL {         /// ... and value
410         int i;          /// int value
411         char *s;        /// string value
412     }
413     VAL val;            /// value
414     hts_opt *next;      /// next option (linked list)
415 }
416 
417 /*
418  * Explicit index file name delimiter, see below
419  */
420 enum HTS_IDX_DELIM = "##idx##";
421 
422 /**********************
423  * Exported functions *
424  **********************/
425 
426 /**
427  * Parses arg and appends it to the option list.
428  *
429  * Returns 0 on success;
430  *        -1 on failure.
431  */
432 int hts_opt_add(hts_opt** opts, const(char)* c_arg);
433 
434 /**
435  * Applies an hts_opt option list to a given htsFile.
436  *
437  * Returns 0 on success
438  *        -1 on failure
439  */
440 int hts_opt_apply(htsFile* fp, hts_opt* opts);
441 
442 /**
443  * Frees an hts_opt list.
444  */
445 void hts_opt_free(hts_opt* opts);
446 
447 /**
448  * Accepts a string file format (sam, bam, cram, vcf, bam) optionally
449  * followed by a comma separated list of key=value options and splits
450  * these up into the fields of htsFormat struct.
451  *
452  * Returns 0 on success
453  *        -1 on failure.
454  */
455 int hts_parse_format(htsFormat* opt, const(char)* str);
456 
457 /**
458  * Tokenise options as (key(=value)?,)*(key(=value)?)?
459  * NB: No provision for ',' appearing in the value!
460  * Add backslashing rules?
461  *
462  * This could be used as part of a general command line option parser or
463  * as a string concatenated onto the file open mode.
464  *
465  * Returns 0 on success
466  *        -1 on failure.
467  */
468 int hts_parse_opt_list(htsFormat* opt, const(char)* str);
469 
470 /*! @abstract Table for converting a nucleotide character to 4-bit encoding.
471 The input character may be either an IUPAC ambiguity code, '=' for 0, or
472 '0'/'1'/'2'/'3' for a result of 1/2/4/8.  The result is encoded as 1/2/4/8
473 for A/C/G/T or combinations of these bits for ambiguous bases.
474 */
475 
476 version(Windows){
477     __gshared const(ubyte)[256] seq_nt16_table = [
478         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
479         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
480         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
481         1, 2, 4, 8, 15,15,15,15, 15,15,15,15, 15, 0 /*=*/,15,15,
482         15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
483         15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
484         15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
485         15,15, 5, 6,  8,15, 7, 9, 15,10,15,15, 15,15,15,15,
486 
487         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
488         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
489         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
490         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
491         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
492         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
493         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
494         15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
495     ];
496 }else{
497     extern __gshared const(ubyte)[256] seq_nt16_table;
498 }
499 /**! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC
500 ambiguity code letter (or '=' when given 0).
501 */
502 
503 version(Windows) __gshared const(char)[16] seq_nt16_str = ['=','A','C','M','G','R','S','V','T','W','Y','H','K','D','B','N'];
504 else extern __gshared const(char)[16] seq_nt16_str;
505 
506 /**! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits.
507 Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous).
508 */
509 version(Windows) __gshared const(int)[16] seq_nt16_int = [ 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 ];
510 else extern __gshared const(int)[16] seq_nt16_int;
511 /**!
512   @abstract  Get the htslib version number
513   @return    For released versions, a string like "N.N[.N]"; or git describe
514   output if using a library built within a Git repository.
515 */
516 const(char)* hts_version();
517 
518 /**!
519   @abstract  Compile-time HTSlib version number, for use in #if checks
520   @return    For released versions X.Y[.Z], an integer of the form XYYYZZ;
521   useful for preprocessor conditionals such as
522       #if HTS_VERSION >= 101000  // Check for v1.10 or later
523 */
524 // Maintainers: Bump this in the final stage of preparing a new release.
525 // Immediately after release, bump ZZ to 90 to distinguish in-development
526 // Git repository builds from the release; you may wish to increment this
527 // further when significant features are merged.
528 enum HTS_VERSION = 101600;
529 
530 /**! @abstract Introspection on the features enabled in htslib
531  *
532  * @return a bitfield of HTS_FEATURE_* macros.
533  */
534 uint hts_features();
535 
536 const(char)* hts_test_feature(uint id);
537 
538 /**! @abstract Introspection on the features enabled in htslib, string form
539  *
540  * @return a string describing htslib build features
541  */
542 const(char)* hts_feature_string();
543 
544 /// Whether ./configure was used or vanilla Makefile
545 enum HTS_FEATURE_CONFIGURE = 1;
546 
547 /// Whether --enable-plugins was used
548 enum HTS_FEATURE_PLUGINS = 2;
549 
550 /// Transport specific
551 enum HTS_FEATURE_LIBCURL = 1u << 10;
552 enum HTS_FEATURE_S3 = 1u << 11;
553 enum HTS_FEATURE_GCS = 1u << 12;
554 
555 /// Compression options
556 enum HTS_FEATURE_LIBDEFLATE = 1u << 20;
557 enum HTS_FEATURE_LZMA = 1u << 21;
558 enum HTS_FEATURE_BZIP2 = 1u << 22;
559 enum HTS_FEATURE_HTSCODECS = 1u << 23; // htscodecs library version
560 
561 /// Build params
562 enum HTS_FEATURE_CC = 1u << 27;
563 enum HTS_FEATURE_CFLAGS = 1u << 28;
564 enum HTS_FEATURE_CPPFLAGS = 1u << 29;
565 enum HTS_FEATURE_LDFLAGS = 1u << 30;
566 
567 /**!
568   @abstract    Determine format by peeking at the start of a file
569   @param fp    File opened for reading, positioned at the beginning
570   @param fmt   Format structure that will be filled out on return
571   @return      0 for success, or negative if an error occurred.
572 
573   Equivalent to hts_detect_format2(fp, NULL, fmt).
574 */
575 int hts_detect_format(hFILE* fp, htsFormat* fmt);
576 
577 /**History: !
578   @abstract    Determine format primarily by peeking at the start of a file
579   @param fp    File opened for reading, positioned at the beginning
580   @param fname Name of the file, or NULL if not available
581   @param fmt   Format structure that will be filled out on return
582   @return      0 for success, or negative if an error occurred.
583   @since       1.15
584  
585 Some formats are only recognised if the filename is available and has the
586 expected extension, as otherwise more generic files may be misrecognised.
587 In particular:
588  - FASTA/Q indexes must have .fai/.fqi extensions; without this requirement,
589    some similar BED files would be misrecognised as indexes.
590 */
591 int hts_detect_format2(hFILE* fp, const(char)* fname, htsFormat* fmt);
592 
593 /**!
594   @abstract    Get a human-readable description of the file format
595   @param fmt   Format structure holding type, version, compression, etc.
596   @return      Description string, to be freed by the caller after use.
597 */
598 char* hts_format_description(const(htsFormat)* format);
599 
600 /**!
601   @abstract       Open a sequence data (SAM/BAM/CRAM) or variant data (VCF/BCF)
602                   or possibly-compressed textual line-orientated file
603   @param fn       The file name or "-" for stdin/stdout. For indexed files
604                   with a non-standard naming, the file name can include the
605                   name of the index file delimited with HTS_IDX_DELIM
606   @param mode     Mode matching / [rwa][bcefFguxz0-9]* /
607   @discussion
608       With 'r' opens for reading; any further format mode letters are ignored
609       as the format is detected by checking the first few bytes or BGZF blocks
610       of the file.  With 'w' or 'a' opens for writing or appending, with format
611       specifier letters:
612         b  binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc)
613         c  CRAM format
614         f  FASTQ format
615         F  FASTA format
616         g  gzip compressed
617         u  uncompressed
618         z  bgzf compressed
619         [0-9]  zlib compression level
620       and with non-format option letters (for any of 'r'/'w'/'a'):
621         e  close the file on exec(2) (opens with O_CLOEXEC, where supported)
622         x  create the file exclusively (opens with O_EXCL, where supported)
623       Note that there is a distinction between 'u' and '0': the first yields
624       plain uncompressed output whereas the latter outputs uncompressed data
625       wrapped in the zlib format.
626   @example
627       [rw]b  .. compressed BCF, BAM, FAI
628       [rw]bu .. uncompressed BCF
629       [rw]z  .. compressed VCF
630       [rw]   .. uncompressed VCF
631 */
632 htsFile* hts_open(const(char)* fn, const(char)* mode);
633 
634 /**!
635   @abstract       Open a SAM/BAM/CRAM/VCF/BCF/etc file
636   @param fn       The file name or "-" for stdin/stdout
637   @param mode     Open mode, as per hts_open()
638   @param fmt      Optional format specific parameters
639   @discussion
640       See hts_open() for description of fn and mode.
641       // TODO Update documentation for s/opts/fmt/
642       Opts contains a format string (sam, bam, cram, vcf, bcf) which will,
643       if defined, override mode.  Opts also contains a linked list of hts_opt
644       structures to apply to the open file handle.  These can contain things
645       like pointers to the reference or information on compression levels,
646       block sizes, etc.
647 */
648 htsFile* hts_open_format(
649     const(char)* fn,
650     const(char)* mode,
651     const(htsFormat)* fmt);
652 
653 /**!
654   @abstract  For output streams, flush any buffered data
655   @param fp  The file handle to be flushed
656   @return    0 for success, or negative if an error occurred.
657   @since     1.14
658 */
659 int hts_flush(htsFile* fp);
660 
661 /**!
662   @abstract       Open an existing stream as a SAM/BAM/CRAM/VCF/BCF/etc file
663   @param fn       The already-open file handle
664   @param mode     Open mode, as per hts_open()
665 */
666 htsFile* hts_hopen(hFILE* fp, const(char)* fn, const(char)* mode);
667 
668 /**!
669   @abstract  Close a file handle, flushing buffered data for output streams
670   @param fp  The file handle to be closed
671   @return    0 for success, or negative if an error occurred.
672 */
673 int hts_close(htsFile* fp);
674 
675 /**!
676   @abstract  Returns the file's format information
677   @param fp  The file handle
678   @return    Read-only pointer to the file's htsFormat.
679 */
680 const(htsFormat)* hts_get_format(htsFile* fp);
681 
682 /**!
683   @ abstract      Returns a string containing the file format extension.
684   @ param format  Format structure containing the file type.
685   @ return        A string ("sam", "bam", etc) or "?" for unknown formats.
686  */
687 const(char)* hts_format_file_extension(const(htsFormat)* format);
688 
689 /**!
690   @abstract  Sets a specified CRAM option on the open file handle.
691   @param fp  The file handle open the open file.
692   @param opt The CRAM_OPT_* option.
693   @param ... Optional arguments, dependent on the option used.
694   @return    0 for success, or negative if an error occurred.
695 */
696 int hts_set_opt(htsFile* fp, hts_fmt_option opt, ...);
697 
698 /**!
699   @abstract         Read a line (and its \n or \r\n terminator) from a file
700   @param fp         The file handle
701   @param delimiter  Unused, but must be '\n' (or KS_SEP_LINE)
702   @param str        The line (not including the terminator) is written here
703   @return           Length of the string read (capped at INT_MAX);
704                     -1 on end-of-file; <= -2 on error
705 */
706 int hts_getline(htsFile* fp, int delimiter, kstring_t* str);
707 
708 char** hts_readlines(const(char)* fn, int* _n);
709 /**!
710     @abstract       Parse comma-separated list or read list from a file
711     @param list     File name or comma-separated list
712     @param is_file
713     @param _n       Size of the output array (number of items read)
714     @return         NULL on failure or pointer to newly allocated array of
715                     strings
716 */
717 char** hts_readlist(const(char)* fn, int is_file, int* _n);
718 
719 /**!
720   @abstract  Create extra threads to aid compress/decompression for this file
721   @param fp  The file handle
722   @param n   The number of worker threads to create
723   @return    0 for success, or negative if an error occurred.
724   @notes     This function creates non-shared threads for use solely by fp.
725              The hts_set_thread_pool function is the recommended alternative.
726 */
727 int hts_set_threads(htsFile* fp, int n);
728 
729 /**!
730   @abstract  Create extra threads to aid compress/decompression for this file
731   @param fp  The file handle
732   @param p   A pool of worker threads, previously allocated by hts_create_threads().
733   @return    0 for success, or negative if an error occurred.
734 */
735 int hts_set_thread_pool(htsFile* fp, htsThreadPool* p);
736 
737 /**!
738   @abstract  Adds a cache of decompressed blocks, potentially speeding up seeks.
739              This may not work for all file types (currently it is bgzf only).
740   @param fp  The file handle
741   @param n   The size of cache, in bytes
742 */
743 void hts_set_cache_size(htsFile* fp, int n);
744 
745 /*!
746   @abstract  Set .fai filename for a file opened for reading
747   @return    0 for success, negative on failure
748   @discussion
749       Called before *_hdr_read(), this provides the name of a .fai file
750       used to provide a reference list if the htsFile contains no @SQ headers.
751 */
752 int hts_set_fai_filename(htsFile* fp, const(char)* fn_aux);
753 
754 /**!
755   @abstract  Sets a filter expression
756   @return    0 for success, negative on failure
757   @discussion
758       To clear an existing filter, specifying expr as NULL.
759 */
760 int hts_set_filter_expression(htsFile* fp, const(char)* expr);
761 
762 /**!
763   @abstract  Determine whether a given htsFile contains a valid EOF block
764   @return    3 for a non-EOF checkable filetype;
765              2 for an unseekable file type where EOF cannot be checked;
766              1 for a valid EOF block;
767              0 for if the EOF marker is absent when it should be present;
768             -1 (with errno set) on failure
769   @discussion
770       Check if the BGZF end-of-file (EOF) marker is present
771 */
772 int hts_check_EOF(htsFile* fp);
773 
774 /************
775  * Indexing *
776  ************/
777 
778 /**!
779 These HTS_IDX_* macros are used as special tid values for hts_itr_query()/etc,
780 producing iterators operating as follows:
781  - HTS_IDX_NOCOOR iterates over unmapped reads sorted at the end of the file
782  - HTS_IDX_START  iterates over the entire file
783  - HTS_IDX_REST   iterates from the current position to the end of the file
784  - HTS_IDX_NONE   always returns "no more alignment records"
785 When one of these special tid values is used, beg and end are ignored.
786 When REST or NONE is used, idx is also ignored and may be NULL.
787 */
788 enum HTS_IDX_NOCOOR = -2;
789 enum HTS_IDX_START = -3;
790 enum HTS_IDX_REST = -4;
791 enum HTS_IDX_NONE = -5;
792 
793 enum HTS_FMT_CSI = 0;
794 enum HTS_FMT_BAI = 1;
795 enum HTS_FMT_TBI = 2;
796 enum HTS_FMT_CRAI = 3;
797 enum HTS_FMT_FAI = 4;
798 
799 // Almost INT64_MAX, but when cast into a 32-bit int it's
800 // also INT_MAX instead of -1.  This avoids bugs with old code
801 // using the new hts_pos_t data type.
802 enum HTS_POS_MAX = ((cast(long) INT_MAX) << 32) | INT_MAX;
803 enum HTS_POS_MIN = INT64_MIN;
804 enum PRIhts_pos = PRId64;
805 alias hts_pos_t = c_long;
806 
807 // For comparison with previous release:
808 //
809 // #define HTS_POS_MAX INT_MAX
810 // #define HTS_POS_MIN INT_MIN
811 // #define PRIhts_pos PRId32
812 // typedef int32_t hts_pos_t;
813 
814 struct hts_pair_pos_t
815 {
816     hts_pos_t beg;
817     hts_pos_t end;
818 }
819 
820 alias hts_pair32_t = hts_pair_pos_t; // For backwards compatibility
821 
822 struct hts_pair64_t // @suppress(dscanner.style.phobos_naming_convention)
823 {
824     ulong u;
825     ulong v;
826 }
827 
828 /// 64-bit start, end coordinate pair tracking max (internally used in hts.c)
829 struct hts_pair64_max_t // @suppress(dscanner.style.phobos_naming_convention)
830 {
831     ulong u;
832     ulong v;
833     ulong max;
834 }
835 
836 /// Region list used in iterators (NB: apparently confined to single contig/tid)
837 struct hts_reglist_t
838 {
839     const(char) *reg;   /// Region string
840     hts_pair_pos_t *intervals;  /// (start,end) intervals
841     int tid;            /// Contig id
842     uint count;         /// How many intervals
843     hts_pos_t min_beg;  /// absolute bounds
844     hts_pos_t max_end;  /// absolute bounds
845 }
846 
847 alias hts_readrec_func = int function(BGZF* fp, void* data, void* r, int* tid, hts_pos_t* beg, hts_pos_t* end);
848 alias hts_seek_func = int function(void* fp, long offset, int where);
849 alias hts_tell_func = c_long function(void* fp);
850 
851 /**
852  * @brief File iterator that can handle multiple target regions.
853  * This structure should be considered opaque by end users.
854  * It does both the stepping inside the file and the filtering of alignments.
855  * It can operate in single or multi-region mode, and depending on this,
856  * it uses different fields.
857  *
858  * read_rest (1) - read everything from the current offset, without filtering
859  * finished  (1) - no more iterations
860  * is_cram   (1) - current file has CRAM format
861  * nocoor    (1) - read all unmapped reads
862  *
863  * multi     (1) - multi-region moode
864  * reg_list  - List of target regions
865  * n_reg     - Size of the above list
866  * curr_reg  - List index of the current region of search
867  * curr_intv - Interval index inside the current region; points to a (beg, end)
868  * end       - Used for CRAM files, to preserve the max end coordinate
869  *
870  * multi     (0) - single-region mode
871  * tid       - Reference id of the target region
872  * beg       - Start position of the target region
873  * end       - End position of the target region
874  *
875  * Common fields:
876  * off        - List of file offsets computed from the index
877  * n_off      - Size of the above list
878  * i          - List index of the current file offset
879  * curr_off   - File offset for the next file read
880  * curr_tid   - Reference id of the current alignment
881  * curr_beg   - Start position of the current alignment
882  * curr_end   - End position of the current alignment
883  * nocoor_off - File offset where the unmapped reads start
884  *
885  * readrec    - File specific function that reads an alignment
886  * seek       - File specific function for changing the file offset
887  * tell       - File specific function for indicating the file offset
888  */
889 
890 struct hts_itr_t // @suppress(dscanner.style.phobos_naming_convention)
891 {
892     import std.bitmanip : bitfields;
893 
894     mixin(bitfields!(
895         uint, "read_rest", 1,
896         uint, "finished", 1,
897         uint, "is_cram", 1,
898         uint, "nocoor", 1,
899         uint, "multi", 1,
900         uint, "dummy", 27));
901 
902     int tid;
903     int n_off;
904     int i;
905     int n_reg;
906     hts_pos_t beg;
907     hts_pos_t end;
908     hts_reglist_t* reg_list;
909     int curr_tid;
910     int curr_reg;
911     int curr_intv;
912     hts_pos_t curr_beg;
913     hts_pos_t curr_end;
914     ulong curr_off;
915     ulong nocoor_off;
916     hts_pair64_max_t* off;
917     int function() readrec;
918     int function() seek;
919     long function() tell;
920 
921     struct Bins
922     {
923         int n;
924         int m;
925         int* a;
926     }
927 
928     Bins bins;
929 }
930 /// ? index key
931 struct aux_key_t { // @suppress(dscanner.style.phobos_naming_convention)
932     int key;    /// ???
933     /// ???
934     ulong min_off, max_off;
935 }
936 
937 alias hts_itr_multi_t = hts_itr_t;
938 
939 pragma(inline, true)
940 extern (D) auto hts_bin_first(T)(auto ref T l)
941 {
942     return ((1 << ((l << 1) + l)) - 1) / 7;
943 }
944 
945 pragma(inline, true)
946 extern (D) auto hts_bin_parent(T)(auto ref T b)
947 {
948     return (b - 1) >> 3;
949 }
950 
951 ///////////////////////////////////////////////////////////
952 // Low-level API for building indexes.
953 
954 /// Create a BAI/CSI/TBI type index structure
955 /** @param n          Initial number of targets
956     @param fmt        Format, one of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI
957     @param offset0    Initial file offset
958     @param min_shift  Number of bits for the minimal interval
959     @param n_lvls     Number of levels in the binning index
960     @return An initialised hts_idx_t struct on success; NULL on failure
961 
962 The struct returned by a successful call should be freed via hts_idx_destroy()
963 when it is no longer needed.
964 */
965 hts_idx_t* hts_idx_init(
966     int n,
967     int fmt,
968     ulong offset0,
969     int min_shift,
970     int n_lvls);
971 
972 /// Free a BAI/CSI/TBI type index
973 /** @param idx   Index structure to free
974  */
975 void hts_idx_destroy(hts_idx_t* idx);
976 
977 /// Push an index entry
978 /** @param idx        Index
979     @param tid        Target id
980     @param beg        Range start (zero-based)
981     @param end        Range end (zero-based, half-open)
982     @param offset     File offset
983     @param is_mapped  Range corresponds to a mapped read
984     @return 0 on success; -1 on failure
985 
986 The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts
987 stored in the meta-data bin.
988  */
989 int hts_idx_push(
990     hts_idx_t* idx,
991     int tid,
992     hts_pos_t beg,
993     hts_pos_t end,
994     ulong offset,
995     int is_mapped);
996 
997 /// Finish building an index
998 /** @param idx          Index
999     @param final_offset Last file offset
1000     @return 0 on success; non-zero on failure.
1001 */
1002 int hts_idx_finish(hts_idx_t* idx, ulong final_offset);
1003 
1004 /// Returns index format
1005 /** @param idx   Index
1006     @return One of HTS_FMT_CSI, HTS_FMT_BAI or HTS_FMT_TBI
1007 */
1008 int hts_idx_fmt(hts_idx_t* idx);
1009 
1010 /// Add name to TBI index meta-data
1011 /** @param idx   Index
1012     @param tid   Target identifier
1013     @param name  Target name
1014     @return Index number of name in names list on success; -1 on failure.
1015 */
1016 int hts_idx_tbi_name(hts_idx_t* idx, int tid, const(char)* name);
1017 
1018 // Index loading and saving
1019 
1020 /// Save an index to a file
1021 /** @param idx  Index to be written
1022     @param fn   Input BAM/BCF/etc filename, to which .bai/.csi/etc will be added
1023     @param fmt  One of the HTS_FMT_* index formats
1024     @return  0 if successful, or negative if an error occurred.
1025 */
1026 int hts_idx_save(const(hts_idx_t)* idx, const(char)* fn, int fmt);
1027 
1028 /// Save an index to a specific file
1029 /** @param idx    Index to be written
1030     @param fn     Input BAM/BCF/etc filename
1031     @param fnidx  Output filename, or NULL to add .bai/.csi/etc to @a fn
1032     @param fmt    One of the HTS_FMT_* index formats
1033     @return  0 if successful, or negative if an error occurred.
1034 */
1035 int hts_idx_save_as(
1036     const(hts_idx_t)* idx,
1037     const(char)* fn,
1038     const(char)* fnidx,
1039     int fmt);
1040 
1041 /// Load an index file
1042 /** @param fn   BAM/BCF/etc filename, to which .bai/.csi/etc will be added or
1043                 the extension substituted, to search for an existing index file.
1044                 In case of a non-standard naming, the file name can include the
1045                 name of the index file delimited with HTS_IDX_DELIM.
1046     @param fmt  One of the HTS_FMT_* index formats
1047     @return  The index, or NULL if an error occurred.
1048 
1049 If @p fn contains the string "##idx##" (HTS_IDX_DELIM), the part before
1050 the delimiter will be used as the name of the data file and the part after
1051 it will be used as the name of the index.
1052 
1053 Otherwise, this function tries to work out the index name as follows:
1054 
1055   It will try appending ".csi" to @p fn
1056   It will try substituting an existing suffix (e.g. .bam, .vcf) with ".csi"
1057   Then, if @p fmt is HTS_FMT_BAI:
1058     It will try appending ".bai" to @p fn
1059     To will substituting the existing suffix (e.g. .bam) with ".bai"
1060   else if @p fmt is HTS_FMT_TBI:
1061     It will try appending ".tbi" to @p fn
1062     To will substituting the existing suffix (e.g. .vcf) with ".tbi"
1063 
1064 If the index file is remote (served over a protocol like https), first a check
1065 is made to see is a locally cached copy is available.  This is done for all
1066 of the possible names listed above.  If a cached copy is not available then
1067 the index will be downloaded and stored in the current working directory,
1068 with the same name as the remote index.
1069 
1070     Equivalent to hts_idx_load3(fn, NULL, fmt, HTS_IDX_SAVE_REMOTE);
1071 */
1072 hts_idx_t* hts_idx_load(const(char)* fn, int fmt);
1073 
1074 /// Load a specific index file
1075 /** @param fn     Input BAM/BCF/etc filename
1076     @param fnidx  The input index filename
1077     @return  The index, or NULL if an error occurred.
1078 
1079     Equivalent to hts_idx_load3(fn, fnidx, 0, 0);
1080 
1081     This function will not attempt to save index files locally.
1082 */
1083 hts_idx_t* hts_idx_load2(const(char)* fn, const(char)* fnidx);
1084 
1085 /// Load a specific index file
1086 /** @param fn     Input BAM/BCF/etc filename
1087     @param fnidx  The input index filename
1088     @param fmt    One of the HTS_FMT_* index formats
1089     @param flags  Flags to alter behaviour (see description)
1090     @return  The index, or NULL if an error occurred.
1091 
1092     If @p fnidx is NULL, the index name will be derived from @p fn in the
1093     same way as hts_idx_load().
1094 
1095     If @p fnidx is not NULL, @p fmt is ignored.
1096 
1097     The @p flags parameter can be set to a combination of the following
1098     values:
1099 
1100         HTS_IDX_SAVE_REMOTE   Save a local copy of any remote indexes
1101         HTS_IDX_SILENT_FAIL   Fail silently if the index is not present
1102 
1103     The index struct returned by a successful call should be freed
1104     via hts_idx_destroy() when it is no longer needed.
1105 */
1106 hts_idx_t* hts_idx_load3(
1107     const(char)* fn,
1108     const(char)* fnidx,
1109     int fmt,
1110     int flags);
1111 
1112 /// Flags for hts_idx_load3() ( and also sam_idx_load3(), tbx_idx_load3() )
1113 enum HTS_IDX_FLAG : int 
1114 {
1115     HTS_IDX_SAVE_REMOTE = 1,
1116     HTS_IDX_SILENT_FAIL = 2
1117 }
1118 ///////////////////////////////////////////////////////////
1119 // Functions for accessing meta-data stored in indexes
1120 
1121 alias hts_id2name_f = const(char)* function(void*, int);
1122 
1123 /// Get extra index meta-data
1124 /** @param idx    The index
1125     @param l_meta Pointer to where the length of the extra data is stored
1126     @return Pointer to the extra data if present; NULL otherwise
1127 
1128     Indexes (both .tbi and .csi) made by tabix include extra data about
1129     the indexed file.  The returns a pointer to this data.  Note that the
1130     data is stored exactly as it is in the index.  Callers need to interpret
1131     the results themselves, including knowing what sort of data to expect;
1132     byte swapping etc.
1133 */
1134 ubyte* hts_idx_get_meta(hts_idx_t* idx, uint* l_meta);
1135 
1136 /// Set extra index meta-data
1137 /** @param idx     The index
1138     @param l_meta  Length of data
1139     @param meta    Pointer to the extra data
1140     @param is_copy If not zero, a copy of the data is taken
1141     @return 0 on success; -1 on failure (out of memory).
1142 
1143     Sets the data that is returned by hts_idx_get_meta().
1144 
1145     If is_copy != 0, a copy of the input data is taken.  If not, ownership of
1146     the data pointed to by *meta passes to the index.
1147 */
1148 int hts_idx_set_meta(hts_idx_t* idx, uint l_meta, ubyte* meta, int is_copy);
1149 
1150 /// Get number of mapped and unmapped reads from an index
1151 /** @param      idx      Index
1152     @param      tid      Target ID
1153     @param[out] mapped   Location to store number of mapped reads
1154     @param[out] unmapped Location to store number of unmapped reads
1155     @return 0 on success; -1 on failure (data not available)
1156 
1157     BAI and CSI indexes store information on the number of reads for each
1158     target that were mapped or unmapped (unmapped reads will generally have
1159     a paired read that is mapped to the target).  This function returns this
1160     information if it is available.
1161 
1162     @note Cram CRAI indexes do not include this information.
1163 */
1164 int hts_idx_get_stat(
1165     const(hts_idx_t)* idx,
1166     int tid,
1167     ulong* mapped,
1168     ulong* unmapped);
1169 
1170 /// Return the number of unplaced reads from an index
1171 /** @param idx    Index
1172     @return Unplaced reads count
1173 
1174     Unplaced reads are not linked to any reference (e.g. RNAME is '*' in SAM
1175     files).
1176 */
1177 ulong hts_idx_get_n_no_coor(const(hts_idx_t)* idx);
1178 
1179 /// Return a list of target names from an index
1180 /** @param      idx    Index
1181     @param[out] n      Location to store the number of targets
1182     @param      getid  Callback function to get the name for a target ID
1183     @param      hdr    Header from indexed file
1184     @return An array of pointers to the names on success; NULL on failure
1185 
1186     @note The names are pointers into the header data structure.  When cleaning
1187     up, only the array should be freed, not the names.
1188  */
1189 const(char *)* hts_idx_seqnames(const(hts_idx_t)* idx, int* n, hts_id2name_f getid, void* hdr); // free only the array, not the values
1190 
1191 /// Return the number of targets from an index
1192 /** @param      idx    Index
1193     @return The number of targets
1194  */
1195 int hts_idx_nseq(const(hts_idx_t) *idx);
1196 
1197 ///////////////////////////////////////////////////////////
1198 // Region parsing
1199 
1200 enum HTS_PARSE_FLAGS : int 
1201 {
1202     HTS_PARSE_THOUSANDS_SEP = 1, ///< Ignore ',' separators within numbers
1203     HTS_PARSE_ONE_COORD = 2, ///< chr:pos means chr:pos-pos and not chr:pos-end
1204     HTS_PARSE_LIST = 4, ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP)
1205 }
1206 
1207 /// Parse a numeric string
1208 /** The number may be expressed in scientific notation, and optionally may
1209     contain commas in the integer part (before any decimal point or E notation).
1210     @param str     String to be parsed
1211     @param strend  If non-NULL, set on return to point to the first character
1212                    in @a str after those forming the parsed number
1213     @param flags   Or'ed-together combination of HTS_PARSE_* flags
1214     @return  Integer value of the parsed number, or 0 if no valid number
1215 
1216     The input string is parsed as: optional whitespace; an optional '+' or
1217     '-' sign; decimal digits possibly including ',' characters (if @a flags
1218     includes HTS_PARSE_THOUSANDS_SEP) and a '.' decimal point; and an optional
1219     case-insensitive suffix, which may be either 'k', 'M', 'G', or scientific
1220     notation consisting of 'e'/'E' followed by an optional '+' or '-' sign and
1221     decimal digits. To be considered a valid numeric value, the main part (not
1222     including any suffix or scientific notation) must contain at least one
1223     digit (either before or after the decimal point).
1224 
1225     When @a strend is NULL, @a str is expected to contain only (optional
1226     whitespace followed by) the numeric value. A warning will be printed
1227     (if hts_verbose is HTS_LOG_WARNING or more) if no valid parsable number
1228     is found or if there are any unused characters after the number.
1229 
1230     When @a strend is non-NULL, @a str starts with (optional whitespace
1231     followed by) the numeric value. On return, @a strend is set to point
1232     to the first unused character after the numeric value, or to @a str
1233     if no valid parsable number is found.
1234 */
1235 long hts_parse_decimal(const(char)* str, char** strend, HTS_PARSE_FLAGS flags);
1236 
1237 alias hts_name2id_f = int function(void*, const(char)*) *;
1238 
1239 /// Parse a "CHR:START-END"-style region string
1240 /** @param str  String to be parsed
1241     @param beg  Set on return to the 0-based start of the region
1242     @param end  Set on return to the 1-based end of the region
1243     @return  Pointer to the colon or '\0' after the reference sequence name,
1244              or NULL if @a str could not be parsed.
1245 
1246     NOTE: For compatibility with hts_parse_reg only.
1247     Please use hts_parse_region instead.
1248 */
1249 const(char)* hts_parse_reg64(const(char)* str, hts_pos_t* beg, hts_pos_t* end);
1250 
1251 /// Parse a "CHR:START-END"-style region string
1252 /** @param str  String to be parsed
1253     @param beg  Set on return to the 0-based start of the region
1254     @param end  Set on return to the 1-based end of the region
1255     @return  Pointer to the colon or '\0' after the reference sequence name,
1256              or NULL if @a str could not be parsed.
1257 */
1258 const(char)* hts_parse_reg(const(char)* str, int* beg, int* end);
1259 
1260 /// Parse a "CHR:START-END"-style region string
1261 /** @param str   String to be parsed
1262     @param tid   Set on return (if not NULL) to be reference index (-1 if invalid)
1263     @param beg   Set on return to the 0-based start of the region
1264     @param end   Set on return to the 1-based end of the region
1265     @param getid Function pointer.  Called if not NULL to set tid.
1266     @param hdr   Caller data passed to getid.
1267     @param flags Bitwise HTS_PARSE_* flags listed above.
1268     @return      Pointer to the byte after the end of the entire region
1269                  specifier (including any trailing comma) on success,
1270                  or NULL if @a str could not be parsed.
1271 
1272     A variant of hts_parse_reg which is reference-id aware.  It uses
1273     the iterator name2id callbacks to validate the region tokenisation works.
1274 
1275     This is necessary due to GRCh38 HLA additions which have reference names
1276     like "HLA-DRB1*12:17".
1277 
1278     To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
1279     are reference names, quote using curly braces.
1280     Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
1281 
1282     Flags are used to control how parsing works, and can be one of the below.
1283 
1284     HTS_PARSE_THOUSANDS_SEP:
1285         Ignore commas in numbers.  For example with this flag 1,234,567
1286         is interpreted as 1234567.
1287 
1288     HTS_PARSE_LIST:
1289         If present, the region is assmed to be a comma separated list and
1290         position parsing will not contain commas (this implicitly
1291         clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
1292         On success the return pointer will be the start of the next region, ie
1293         the character after the comma.  (If *ret != '\0' then the caller can
1294         assume another region is present in the list.)
1295 
1296         If not set then positions may contain commas.  In this case the return
1297         value should point to the end of the string, or NULL on failure.
1298 
1299     HTS_PARSE_ONE_COORD:
1300         If present, X:100 is treated as the single base pair region X:100-100.
1301         In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
1302         (This is the standard bcftools region convention.)
1303 
1304         When not set X:100 is considered to be X:100-<end> where <end> is
1305         the end of chromosome X (set to INT_MAX here).  X:100- and X:-100 are
1306         invalid.
1307         (This is the standard samtools region convention.)
1308 
1309     Note the supplied string expects 1 based inclusive coordinates, but the
1310     returned coordinates start from 0 and are half open, so pos0 is valid
1311     for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
1312 
1313     If NULL is returned, the value in tid mat give additional information
1314     about the error:
1315 
1316         -2   Failed to parse @p hdr; or out of memory
1317         -1   The reference in @p str has mismatched braces, or does not
1318              exist in @p hdr
1319         >= 0 The specified range in @p str could not be parsed
1320 */
1321 const(char)* hts_parse_region(
1322     const(char)* s,
1323     int* tid,
1324     hts_pos_t* beg,
1325     hts_pos_t* end,
1326     hts_name2id_f getid,
1327     void* hdr,
1328     HTS_PARSE_FLAGS flags);
1329 
1330 ///////////////////////////////////////////////////////////
1331 // Generic iterators
1332 //
1333 // These functions provide the low-level infrastructure for iterators.
1334 // Wrappers around these are used to make iterators for specific file types.
1335 // See:
1336 //     htslib/sam.h  for SAM/BAM/CRAM iterators
1337 //     htslib/vcf.h  for VCF/BCF iterators
1338 //     htslib/tbx.h  for files indexed by tabix
1339 
1340 /// Create a single-region iterator
1341 /** @param idx      Index
1342     @param tid      Target ID
1343     @param beg      Start of region
1344     @param end      End of region
1345     @param readrec  Callback to read a record from the input file
1346     @return An iterator on success; NULL on failure
1347 
1348     The iterator struct returned by a successful call should be freed
1349     via hts_itr_destroy() when it is no longer needed.
1350  */
1351 hts_itr_t* hts_itr_query(
1352     const(hts_idx_t)* idx,
1353     int tid,
1354     hts_pos_t beg,
1355     hts_pos_t end,
1356     hts_readrec_func readrec);
1357 
1358 /// Free an iterator
1359 /** @param iter   Iterator to free
1360  */
1361 void hts_itr_destroy(hts_itr_t* iter);
1362 
1363 alias hts_itr_query_func = hts_itr_t* function(const(hts_idx_t)* idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func readrec);
1364 
1365 /// Create a single-region iterator from a text region specification
1366 /** @param idx       Index
1367     @param reg       Region specifier
1368     @param getid     Callback function to return the target ID for a name
1369     @param hdr       Input file header
1370     @param itr_query Callback function returning an iterator for a numeric tid,
1371                      start and end position
1372     @param readrec   Callback to read a record from the input file
1373     @return An iterator on success; NULL on error
1374 
1375     The iterator struct returned by a successful call should be freed
1376     via hts_itr_destroy() when it is no longer needed.
1377  */
1378 hts_itr_t* hts_itr_querys(
1379     const(hts_idx_t)* idx,
1380     const(char)* reg,
1381     hts_name2id_f getid,
1382     void* hdr,
1383     hts_itr_query_func itr_query,
1384     hts_readrec_func readrec);
1385 
1386 /// Return the next record from an iterator
1387 /** @param fp      Input file handle
1388     @param iter    Iterator
1389     @param r       Pointer to record placeholder
1390     @param data    Data passed to the readrec callback
1391     @return >= 0 on success, -1 when there is no more data, < -1 on error
1392  */
1393 int hts_itr_next(BGZF* fp, hts_itr_t* iter, void* r, void* data);
1394 
1395 
1396 /**********************************
1397  * Iterator with multiple regions *
1398  **********************************/
1399 
1400 alias hts_itr_multi_query_func = int function(const(hts_idx_t)* idx, hts_itr_t* itr);
1401 int hts_itr_multi_bam(const(hts_idx_t)* idx, hts_itr_t* iter);
1402 int hts_itr_multi_cram(const(hts_idx_t)* idx, hts_itr_t* iter);
1403 
1404 /// Create a multi-region iterator from a region list
1405 /** @param idx          Index
1406     @param reglist      Region list
1407     @param count        Number of items in region list
1408     @param getid        Callback to convert names to target IDs
1409     @param hdr          Indexed file header (passed to getid)
1410     @param itr_specific Filetype-specific callback function
1411     @param readrec      Callback to read an input file record
1412     @param seek         Callback to seek in the input file
1413     @param tell         Callback to return current input file location
1414     @return An iterator on success; NULL on failure
1415 
1416     The iterator struct returned by a successful call should be freed
1417     via hts_itr_destroy() when it is no longer needed.
1418  */
1419 hts_itr_t* hts_itr_regions(
1420     const(hts_idx_t)* idx,
1421     hts_reglist_t* reglist,
1422     int count,
1423     hts_name2id_f getid,
1424     void* hdr,
1425     hts_itr_multi_query_func itr_specific,
1426     hts_readrec_func readrec,
1427     hts_seek_func seek,
1428     hts_tell_func tell);
1429 
1430 /// Return the next record from an iterator
1431 /** @param fp      Input file handle
1432     @param iter    Iterator
1433     @param r       Pointer to record placeholder
1434     @return >= 0 on success, -1 when there is no more data, < -1 on error
1435  */
1436 int hts_itr_multi_next(htsFile* fd, hts_itr_t* iter, void* r);
1437 
1438 /// Create a region list from a char array
1439 /** @param argv      Char array of target:interval elements, e.g. chr1:2500-3600, chr1:5100, chr2
1440     @param argc      Number of items in the array
1441     @param r_count   Pointer to the number of items in the resulting region list
1442     @param hdr       Header for the sam/bam/cram file
1443     @param getid     Callback to convert target names to target ids.
1444     @return  A region list on success, NULL on failure
1445 
1446     The hts_reglist_t struct returned by a successful call should be freed
1447     via hts_reglist_free() when it is no longer needed.
1448  */
1449 hts_reglist_t* hts_reglist_create(
1450     char** argv,
1451     int argc,
1452     int* r_count,
1453     void* hdr,
1454     hts_name2id_f getid);
1455 
1456 /// Free a region list
1457 /** @param reglist    Region list
1458     @param count      Number of items in the list
1459  */
1460 void hts_reglist_free(hts_reglist_t* reglist, int count);
1461 
1462 /// Free a multi-region iterator
1463 /** @param iter   Iterator to free
1464  */
1465 alias hts_itr_multi_destroy = hts_itr_destroy;
1466 
1467 /**
1468  * hts_file_type() - Convenience function to determine file type
1469  * DEPRECATED:  This function has been replaced by hts_detect_format().
1470  * It and these FT_* macros will be removed in a future HTSlib release.
1471  */
1472 enum FT_UNKN = 0;
1473 enum FT_GZ = 1;
1474 enum FT_VCF = 2;
1475 enum FT_VCF_GZ = FT_GZ | FT_VCF;
1476 enum FT_BCF = 1 << 2;
1477 enum FT_BCF_GZ = FT_GZ | FT_BCF;
1478 enum FT_STDIN = 1 << 3;
1479 int hts_file_type(const(char)* fname);
1480 
1481 /***************************
1482  * Revised MAQ error model *
1483  ***************************/
1484 
1485 struct errmod_t;
1486 
1487 errmod_t* errmod_init(double depcorr);
1488 void errmod_destroy(errmod_t* em);
1489 
1490 /*
1491     n: number of bases
1492     m: maximum base
1493     bases[i]: qual:6, strand:1, base:4
1494     q[i*m+j]: phred-scaled likelihood of (i,j)
1495  */
1496 int errmod_cal(const(errmod_t)* em, int n, int m, ushort* bases, float* q);
1497 
1498 /*****************************************************
1499  * Probabilistic banded glocal alignment             *
1500  * See https://doi.org/10.1093/bioinformatics/btr076 *
1501  *****************************************************/
1502 
1503 struct probaln_par_t
1504 {
1505     float d;
1506     float e;
1507     int bw;
1508 }
1509 
1510 /// Perform probabilistic banded glocal alignment
1511 /** @param      ref     Reference sequence
1512     @param      l_ref   Length of reference
1513     @param      query   Query sequence
1514     @param      l_query Length of query sequence
1515     @param      iqual   Query base qualities
1516     @param      c       Alignment parameters
1517     @param[out] state   Output alignment
1518     @param[out] q    Phred scaled posterior probability of state[i] being wrong
1519     @return     Phred-scaled likelihood score, or INT_MIN on failure.
1520 
1521 The reference and query sequences are coded using integers 0,1,2,3,4 for
1522 bases A,C,G,T,N respectively (N here is for any ambiguity code).
1523 
1524 On output, state and q are arrays of length l_query. The higher 30
1525 bits give the reference position the query base is matched to and the
1526 lower two bits can be 0 (an alignment match) or 1 (an
1527 insertion). q[i] gives the phred scaled posterior probability of
1528 state[i] being wrong.
1529 
1530 On failure, errno will be set to EINVAL if the values of l_ref or l_query
1531 were invalid; or ENOMEM if a memory allocation failed.
1532 */
1533 
1534 int probaln_glocal(
1535     const(ubyte)* ref_,
1536     int l_ref,
1537     const(ubyte)* query,
1538     int l_query,
1539     const(ubyte)* iqual,
1540     const(probaln_par_t)* c,
1541     int* state,
1542     ubyte* q);
1543 
1544 /**********************
1545  * MD5 implementation *
1546  **********************/
1547 
1548 struct hts_md5_context;
1549 
1550 /*! @abstract   Initialises an MD5 context.
1551  *  @discussion
1552  *    The expected use is to allocate an hts_md5_context using
1553  *    hts_md5_init().  This pointer is then passed into one or more calls
1554  *    of hts_md5_update() to compute successive internal portions of the
1555  *    MD5 sum, which can then be externalised as a full 16-byte MD5sum
1556  *    calculation by calling hts_md5_final().  This can then be turned
1557  *    into ASCII via hts_md5_hex().
1558  *
1559  *    To dealloate any resources created by hts_md5_init() call the
1560  *    hts_md5_destroy() function.
1561  *
1562  *  @return     hts_md5_context pointer on success, NULL otherwise.
1563  */
1564 hts_md5_context* hts_md5_init();
1565 
1566 /*! @abstract Updates the context with the MD5 of the data. */
1567 void hts_md5_update(hts_md5_context* ctx, const(void)* data, c_ulong size);
1568 
1569 /*! @abstract Computes the final 128-bit MD5 hash from the given context */
1570 void hts_md5_final(ubyte* digest, hts_md5_context* ctx);
1571 
1572 /*! @abstract Resets an md5_context to the initial state, as returned
1573  *            by hts_md5_init().
1574  */
1575 void hts_md5_reset(hts_md5_context* ctx);
1576 
1577 /*! @abstract Converts a 128-bit MD5 hash into a 33-byte nul-termninated
1578  *            hex string.
1579  */
1580 void hts_md5_hex(char* hex, const(ubyte)* digest);
1581 
1582 /*! @abstract Deallocates any memory allocated by hts_md5_init. */
1583 void hts_md5_destroy(hts_md5_context* ctx);
1584 
1585 pragma(inline,true)
1586 long hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls)
1587 {
1588     int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7;
1589     for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l))
1590         if (beg>>s == end>>s) return t + (beg>>s);
1591     return 0;
1592 }
1593 
1594 /// Compute the level of a bin in a binning index
1595 pragma(inline, true)
1596 int hts_bin_level(int bin) {
1597     int l, b;
1598     for (l = 0, b = bin; b; ++l){ b = hts_bin_parent(b);}
1599     return l;
1600 }
1601 
1602 //! Compute the corresponding entry into the linear index of a given bin from
1603 //! a binning index
1604 /*!
1605  *  @param bin    The bin number
1606  *  @param n_lvls The index depth (number of levels - 0 based)
1607  *  @return       The integer offset into the linear index
1608  *
1609  *  Explanation of the return value formula:
1610  *  Each bin on level l covers exp(2, (n_lvls - l)*3 + min_shift) base pairs.
1611  *  A linear index entry covers exp(2, min_shift) base pairs.
1612  */
1613 
1614 // compute the level of bin
1615 pragma(inline, true)
1616 int hts_bin_bot(int bin, int n_lvls)
1617 {
1618     int l = hts_bin_level(bin);
1619     return (bin - hts_bin_first(l)) << (n_lvls - l) * 3;
1620 }
1621 
1622 /**************
1623  * Endianness *
1624  **************/
1625 pragma(inline, true)
1626 int ed_is_big()
1627 {
1628     long one= 1;
1629     return !(*(cast(char *)(&one)));
1630 }
1631 pragma(inline, true)
1632 ushort ed_swap_2(ushort v)
1633 {
1634     return cast(ushort)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8));
1635 }
1636 pragma(inline, true)
1637 void *ed_swap_2p(void *x)
1638 {
1639     *cast(ushort*)x = ed_swap_2(*cast(ushort*)x);
1640     return x;
1641 }
1642 pragma(inline, true)
1643 uint ed_swap_4(uint v)
1644 {
1645     v = ((v & 0x0000FFFFU) << 16) | (v >> 16);
1646     return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8);
1647 }
1648 pragma(inline, true)
1649 void *ed_swap_4p(void *x)
1650 {
1651     *cast(uint*)x = ed_swap_4(*cast(uint*)x);
1652     return x;
1653 }
1654 pragma(inline, true)
1655 ulong ed_swap_8(ulong v)
1656 {
1657     v = ((v & 0x00000000FFFFFFFFLU) << 32) | (v >> 32);
1658     v = ((v & 0x0000FFFF0000FFFFLU) << 16) | ((v & 0xFFFF0000FFFF0000LU) >> 16);
1659     return ((v & 0x00FF00FF00FF00FFLU) << 8) | ((v & 0xFF00FF00FF00FF00LU) >> 8);
1660 }
1661 pragma(inline, true)
1662 void *ed_swap_8p(void *x)
1663 {
1664     *cast(ulong*)x = ed_swap_8(*cast(ulong*)x);
1665     return x;
1666 }
1667