1 /// D bindings to htslib-1.9 vcf 2 /// Copyright 2018 James S Blachly, MD 3 /// Changes are MIT licensed 4 /// Section numbers refer to VCF Specification v4.2: https://samtools.github.io/hts-specs/VCFv4.2.pdf 5 module dhtslib.htslib.vcf; 6 7 import std.bitmanip; 8 import std.string: toStringz; 9 10 extern (C): 11 12 /// @file htslib/vcf.h 13 /// High-level VCF/BCF variant calling file operations. 14 /* 15 Copyright (C) 2012, 2013 Broad Institute. 16 Copyright (C) 2012-2014 Genome Research Ltd. 17 18 Author: Heng Li <lh3@sanger.ac.uk> 19 20 Permission is hereby granted, free of charge, to any person obtaining a copy 21 of this software and associated documentation files (the "Software"), to deal 22 in the Software without restriction, including without limitation the rights 23 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 24 copies of the Software, and to permit persons to whom the Software is 25 furnished to do so, subject to the following conditions: 26 27 The above copyright notice and this permission notice shall be included in 28 all copies or substantial portions of the Software. 29 30 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 31 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 32 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 33 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 34 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 35 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 36 DEALINGS IN THE SOFTWARE. */ 37 38 /* 39 todo: 40 - make the function names consistent 41 - provide calls to abstract away structs as much as possible 42 */ 43 44 import core.stdc.stdint; 45 import core.stdc.limits; 46 import core.stdc.assert_; 47 import dhtslib.htslib.hts; 48 import dhtslib.htslib.kstring: __kstring_t, kstring_t; 49 import dhtslib.htslib.bgzf: BGZF; // normally typedefed as opaque struct in hts.h 50 //#include "hts_defs.h" 51 //#include "hts_endian.h" 52 53 /***************** 54 * Header struct * 55 *****************/ 56 enum { 57 BCF_HL_FLT = 0, /// header line: FILTER 58 BCF_HL_INFO = 1, /// header line: INFO 59 BCF_HL_FMT = 2, /// header line: FORMAT 60 BCF_HL_CTG = 3, /// header line: contig 61 BCF_HL_STR = 4, /// header line: structured header line TAG=<A=..,B=..> 62 BCF_HL_GEN = 5, /// header line: generic header line 63 64 BCF_HT_FLAG = 0, /// header type: FLAG 65 BCF_HT_INT = 1, /// header type: INTEGER 66 BCF_HT_REAL = 2, /// header type: REAL 67 BCF_HT_STR = 3, /// header type: STRING 68 69 BCF_VL_FIXED= 0, /// variable length: fixed (?) 70 BCF_VL_VAR = 1, /// variable length: variable 71 BCF_VL_A = 2, /// variable length: ? 72 BCF_VL_G = 3, /// variable length: ? 73 BCF_VL_R = 4 /// variable length: ? 74 } 75 76 /* === Dictionary === 77 78 The header keeps three dictonaries. The first keeps IDs in the 79 "FILTER/INFO/FORMAT" lines, the second keeps the sequence names and lengths 80 in the "contig" lines and the last keeps the sample names. bcf_hdr_t::dict[] 81 is the actual hash table, which is opaque to the end users. In the hash 82 table, the key is the ID or sample name as a C string and the value is a 83 bcf_idinfo_t struct. bcf_hdr_t::id[] points to key-value pairs in the hash 84 table in the order that they appear in the VCF header. bcf_hdr_t::n[] is the 85 size of the hash table or, equivalently, the length of the id[] arrays. 86 */ 87 88 enum int BCF_DT_ID = 0; /// dictionary type: ID 89 enum int BCF_DT_CTG = 1; /// dictionary type: CONTIG 90 enum int BCF_DT_SAMPLE = 2; /// dictionary type: SAMPLE 91 92 /// Structured representation of a header line (§1.2) 93 struct bcf_hrec_t { // @suppress(dscanner.style.phobos_naming_convention) 94 int type; /// One of the BCF_HL_* type 95 char *key; /// The part before '=', i.e. FILTER/INFO/FORMAT/contig/fileformat etc. 96 char *value; /// Set only for generic lines, NULL for FILTER/INFO, etc. 97 int nkeys; /// Number of structured fields 98 char **keys; /// The key=value pairs 99 char **vals; /// The key=value pairs 100 } 101 102 /// ID Dictionary entry 103 struct bcf_idinfo_t { // @suppress(dscanner.style.phobos_naming_convention) 104 uint32_t[3] info; /** stores Number:20, var:4, Type:4, ColType:4 in info[0..2] 105 for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG */ 106 bcf_hrec_t *[3] hrec; /// pointers to header lines for [FILTER, INFO, FORMAT] in order 107 int id; /// primary key 108 } 109 110 /// ID Dictionary k/v 111 struct bcf_idpair_t { // @suppress(dscanner.style.phobos_naming_convention) 112 const(char) *key; /// header dictionary FILTER/INFO/FORMAT ID key 113 const bcf_idinfo_t *val;/// header dictionary FILTER/INFO/FORMAT ID entry 114 } 115 116 /// Structured repreentation of VCF header (§1.2) 117 /// Note that bcf_hdr_t structs must always be created via bcf_hdr_init() 118 struct bcf_hdr_t { // @suppress(dscanner.style.phobos_naming_convention) 119 int32_t[3] n; /// n:the size of the dictionary block in use, (allocated size, m, is below to preserve ABI) 120 bcf_idpair_t *[3] id; /// ID dictionary {FILTER/INFO/FORMAT, contig, sample} ID key/entry 121 void *[3] dict; /// hash table 122 char **samples; /// ?list of samples 123 bcf_hrec_t **hrec; /// Structured representation of this header line 124 int nhrec; /// # of header records 125 int dirty; /// ? 126 int ntransl; /// for bcf_translate() 127 int *[2] transl; /// for bcf_translate() 128 int nsamples_ori; /// for bcf_hdr_set_samples() 129 uint8_t *keep_samples; /// ? 130 kstring_t mem; /// ? 131 int32_t[3] m; /// m: allocated size of the dictionary block in use (see n above) 132 } 133 134 /// Lookup table used in bcf_record_check 135 /// MAINTAINER: in C header is [] 136 extern __gshared uint8_t[16] bcf_type_shift; 137 138 /************** 139 * VCF record * 140 **************/ 141 142 enum int BCF_BT_NULL = 0; /// null 143 enum int BCF_BT_INT8 = 1; /// int8 144 enum int BCF_BT_INT16 = 2; /// int16 145 enum int BCF_BT_INT32 = 3; /// int32 146 enum int BCF_BT_FLOAT = 5; /// float (32?) 147 enum int BCF_BT_CHAR = 7; /// char (8 bit) 148 149 enum int VCF_REF =0; /// ref (e.g. in a gVCF) 150 enum int VCF_SNP =1; /// SNP 151 enum int VCF_MNP =2; /// MNP 152 enum int VCF_INDEL=4; /// INDEL 153 enum int VCF_OTHER=8; /// other (e.g. SV) 154 enum int VCF_BND =16; /// breakend 155 156 /// variant type record embedded in bcf_dec_t 157 /// variant type and the number of bases affected, negative for deletions 158 struct variant_t { // @suppress(dscanner.style.phobos_naming_convention) 159 int type; /// variant type and the number of bases affected, negative for deletions 160 int n; /// variant type and the number of bases affected, negative for deletions 161 } 162 163 /// FORMAT field data (§1.4.2 Genotype fields) 164 struct bcf_fmt_t { // @suppress(dscanner.style.phobos_naming_convention) 165 int id; /// id: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$id].key 166 int n; /// n: number of values per-sample; size: number of bytes per-sample; type: one of BCF_BT_* types 167 int size; /// size: number of bytes per-sample; type: one of BCF_BT_* types 168 int type; /// type: one of BCF_BT_* types 169 uint8_t *p; /// same as vptr and vptr_* in bcf_info_t below 170 uint32_t p_len; /// ? 171 172 /// ??? 173 mixin(bitfields!( 174 uint, "p_off", 31, 175 bool, "p_free", 1)); /// ? 176 } 177 178 /// INFO field data (§1.4.1 Fixed fields, (8) INFO) 179 struct bcf_info_t { // @suppress(dscanner.style.phobos_naming_convention) 180 int key; /// key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key 181 int type; /// type: one of BCF_BT_* types; len: vector length, 1 for scalars 182 int len; /// type: one of BCF_BT_* types; len: vector length, 1 for scalars 183 /// Stores a numeric value iff this INFO field is a scalar 184 union V1 { 185 int32_t i; /// integer value 186 float f; /// float value 187 } 188 V1 v1; /// only set if $len==1; for easier access 189 uint8_t *vptr; /// pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes 190 uint32_t vptr_len; /// length of the vptr block or, when set, of the vptr_mod block, excluding offset 191 192 /// vptr offset, i.e., the size of the INFO key plus size+type bytes 193 /// indicates that vptr-vptr_off must be freed; set only when modified and the new 194 /// data block is bigger than the original 195 mixin(bitfields!( 196 uint, "vptr_off", 31, 197 bool, "vptr_free", 1)); 198 } 199 200 201 enum int BCF1_DIRTY_ID = 1; /// ID was edited 202 enum int BCF1_DIRTY_ALS = 2; /// Allele(s) was edited 203 enum int BCF1_DIRTY_FLT = 4; /// FILTER was edited 204 enum int BCF1_DIRTY_INF = 8; /// INFO was edited 205 206 /// Variable-length data from a VCF record 207 struct bcf_dec_t { // @suppress(dscanner.style.phobos_naming_convention) 208 /// allocated size (high-water mark); do not change 209 int m_fmt, m_info, m_id, m_als, m_allele, m_flt; 210 int n_flt; /// Number of FILTER fields 211 int *flt; /// FILTER keys in the dictionary 212 char *id; /// ID 213 char *als; /// REF+ALT block (\0-seperated) 214 char **allele; /// allele[0] is the REF (allele[] pointers to the als block); all null terminated 215 bcf_info_t *info; /// INFO 216 bcf_fmt_t *fmt; /// FORMAT and individual sample 217 variant_t *var; /// $var and $var_type set only when set_variant_types called 218 int n_var; /// variant number(???) 219 int var_type; /// variant type (TODO: make enum) 220 int shared_dirty; /// if set, shared.s must be recreated on BCF output (TODO: make enum) 221 int indiv_dirty; /// if set, indiv.s must be recreated on BCF output (TODO: make enum) 222 } 223 224 225 enum int BCF_ERR_CTG_UNDEF = 1; /// BCF error: undefined contig 226 enum int BCF_ERR_TAG_UNDEF = 2; /// BCF error: undefined tag 227 enum int BCF_ERR_NCOLS = 4; /// BCF error: 228 enum int BCF_ERR_LIMITS = 8; /// BCF error: 229 enum int BCF_ERR_CHAR = 16; /// BCF error: 230 enum int BCF_ERR_CTG_INVALID = 32; /// BCF error: 231 enum int BCF_ERR_TAG_INVALID = 64; /// BCF error: 232 233 /** 234 The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file 235 is slower because the string is first to be parsed, packed into BCF line 236 (done in vcf_parse), then unpacked into internal bcf1_t structure. If it 237 is known in advance that some of the fields will not be required (notably 238 the sample columns), parsing of these can be skipped by setting max_unpack 239 appropriately. 240 Similarly, it is fast to output a BCF line because the columns (kept in 241 shared.s, indiv.s, etc.) are written directly by bcf_write, whereas a VCF 242 line must be formatted in vcf_format. 243 */ 244 struct bcf1_t { // @suppress(dscanner.style.phobos_naming_convention) 245 int32_t rid; /// CHROM 246 int32_t pos; /// POS 247 int32_t rlen; /// length of REF 248 float qual; /// QUAL 249 250 mixin(bitfields!( 251 uint, "n_info", 16, /// For whatever reason, windows doesn't like uint32_t 252 uint, "n_allele", 16)); 253 254 mixin(bitfields!( 255 uint, "n_fmt", 8, /// For whatever reason, windows doesn't like uint32_t 256 uint, "n_sample", 24)); 257 258 kstring_t _shared; /// ??? (name mangled due to D reserved keyword shared) 259 kstring_t indiv; /// ??? 260 bcf_dec_t d; /// lazy evaluation: $d is not generated by bcf_read(), but by explicitly calling bcf_unpack() 261 int max_unpack; /// Set to BCF_UN_STR, BCF_UN_FLT, or BCF_UN_INFO to boost performance of vcf_parse when some of the fields won't be needed 262 int unpacked; /// remember what has been unpacked to allow calling bcf_unpack() repeatedly without redoing the work 263 int[3] unpack_size; /// the original block size of ID, REF+ALT and FILTER 264 int errcode; /// one of BCF_ERR_* codes (TODO: make enum) 265 } 266 267 /******* 268 * API * 269 *******/ 270 271 /*********************************************************************** 272 * BCF and VCF I/O 273 * 274 * A note about naming conventions: htslib internally represents VCF 275 * records as bcf1_t data structures, therefore most functions are 276 * prefixed with bcf_. There are a few exceptions where the functions must 277 * be aware of both BCF and VCF worlds, such as bcf_parse vs vcf_parse. In 278 * these cases, functions prefixed with bcf_ are more general and work 279 * with both BCF and VCF. 280 * 281 ***********************************************************************/ 282 283 /** These macros are defined only for consistency with other parts of htslib */ 284 alias bcf_init1 = bcf_init; 285 alias bcf_read1 = bcf_read; 286 alias vcf_read1 = vcf_read; 287 alias bcf_write1 = bcf_write; 288 alias vcf_write1 = vcf_write; 289 alias bcf_destroy1 = bcf_destroy; 290 alias bcf_empty1 = bcf_empty; 291 alias vcf_parse1 = vcf_parse; 292 alias bcf_clear1 = bcf_clear; 293 alias vcf_format1 = vcf_format; 294 295 /** 296 * bcf_hdr_init() - create an empty BCF header. 297 * @param mode "r" or "w" 298 * 299 * When opened for writing, the mandatory fileFormat and 300 * FILTER=PASS lines are added automatically. 301 */ 302 bcf_hdr_t *bcf_hdr_init(const(char) *mode); 303 304 /** Destroy a BCF header struct */ 305 void bcf_hdr_destroy(bcf_hdr_t *h); 306 307 /** Initialize a bcf1_t object; equivalent to calloc(1, sizeof(bcf1_t)) */ 308 bcf1_t *bcf_init(); 309 310 /** Deallocate a bcf1_t object */ 311 void bcf_destroy(bcf1_t *v); 312 313 /** 314 * Same as bcf_destroy() but frees only the memory allocated by bcf1_t, 315 * not the bcf1_t object itself. 316 */ 317 void bcf_empty(bcf1_t *v); 318 319 /** 320 * Make the bcf1_t object ready for next read. Intended mostly for 321 * internal use, the user should rarely need to call this function 322 * directly. 323 */ 324 void bcf_clear(bcf1_t *v); 325 326 327 /** bcf_open and vcf_open mode: please see hts_open() in hts.h */ 328 alias vcfFile = htsFile; 329 alias bcf_open = hts_open; 330 alias vcf_open = hts_open; 331 alias bcf_close = hts_close; 332 alias vcf_close = hts_close; 333 334 /** Reads VCF or BCF header */ 335 bcf_hdr_t *bcf_hdr_read(htsFile *fp); 336 337 /** 338 * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed 339 * @samples: samples to include or exclude from file or as a comma-separated string. 340 * LIST|FILE .. select samples in list/file 341 * ^LIST|FILE .. exclude samples from list/file 342 * - .. include all samples 343 * NULL .. exclude all samples 344 * @is_file: @samples is a file (1) or a comma-separated list (0) 345 * 346 * The bottleneck of VCF reading is parsing of genotype fields. If the 347 * reader knows in advance that only subset of samples is needed (possibly 348 * no samples at all), the performance of bcf_read() can be significantly 349 * improved by calling bcf_hdr_set_samples after bcf_hdr_read(). 350 * The function bcf_read() will subset the VCF/BCF records automatically 351 * with the notable exception when reading records via bcf_itr_next(). 352 * In this case, bcf_subset_format() must be called explicitly, because 353 * bcf_readrec() does not see the header. 354 * 355 * Returns 0 on success, -1 on error or a positive integer if the list 356 * contains samples not present in the VCF header. In such a case, the 357 * return value is the index of the offending sample. 358 */ 359 int bcf_hdr_set_samples(bcf_hdr_t *hdr, const(char) *samples, int is_file); 360 /// ditto 361 int bcf_subset_format(const(bcf_hdr_t) *hdr, bcf1_t *rec); 362 363 364 /** Writes VCF or BCF header */ 365 int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h); 366 367 /** 368 * Parse VCF line contained in kstring and populate the bcf1_t struct 369 * The line must not end with \n or \r characters. 370 */ 371 int vcf_parse(kstring_t *s, const(bcf_hdr_t) *h, bcf1_t *v); 372 373 /** The opposite of vcf_parse. It should rarely be called directly, see vcf_write */ 374 int vcf_format(const(bcf_hdr_t) *h, const(bcf1_t) *v, kstring_t *s); 375 376 /** 377 * bcf_read() - read next VCF or BCF record 378 * 379 * Returns -1 on critical errors, 0 otherwise. On errors which are not 380 * critical for reading, such as missing header definitions, v->errcode is 381 * set to one of BCF_ERR* code and must be checked before calling 382 * vcf_write(). 383 */ 384 int bcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); 385 386 /** 387 * bcf_unpack() - unpack/decode a BCF record (fills the bcf1_t::d field) 388 * 389 * Note that bcf_unpack() must be called even when reading VCF. It is safe 390 * to call the function repeatedly, it will not unpack the same field 391 * twice. 392 */ 393 int bcf_unpack(bcf1_t *b, int which); 394 enum int BCF_UN_STR = 1; /// up to ALT inclusive 395 enum int BCF_UN_FLT = 2; /// up to FILTER 396 enum int BCF_UN_INFO = 4; /// up to INFO 397 enum int BCF_UN_SHR = (BCF_UN_STR|BCF_UN_FLT|BCF_UN_INFO); /// all shared information 398 enum int BCF_UN_FMT = 8; /// unpack format and each sample 399 alias BCF_UN_IND = BCF_UN_FMT; // a synonymo of BCF_UN_FMT 400 enum int BCF_UN_ALL = (BCF_UN_SHR|BCF_UN_FMT); /// everything 401 402 /** 403 * bcf_dup() - create a copy of BCF record. 404 * 405 * Note that bcf_unpack() must be called on the returned copy as if it was 406 * obtained from bcf_read(). Also note that bcf_dup() calls bcf_sync1(src) 407 * internally to reflect any changes made by bcf_update_* functions. 408 */ 409 bcf1_t *bcf_dup(bcf1_t *src); 410 /// ditto 411 bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src); 412 413 /** 414 * bcf_write() - write one VCF or BCF record. The type is determined at the open() call. 415 */ 416 int bcf_write(htsFile *fp, bcf_hdr_t *h, bcf1_t *v); 417 418 /** 419 * The following functions work only with VCFs and should rarely be called 420 * directly. Usually one wants to use their bcf_* alternatives, which work 421 * transparently with both VCFs and BCFs. 422 */ 423 bcf_hdr_t *vcf_hdr_read(htsFile *fp); 424 /// ditto 425 int vcf_hdr_write(htsFile *fp, const(bcf_hdr_t) *h); 426 /// ditto 427 int vcf_read(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); 428 /// ditto 429 int vcf_write(htsFile *fp, const(bcf_hdr_t) *h, bcf1_t *v); 430 431 /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ 432 /** NOTE: C API second parameter called "null", mangled here as _null */ 433 int bcf_readrec(BGZF *fp, void *_null, void *v, int *tid, int *beg, int *end); 434 435 436 437 /************************************************************************** 438 * Header querying and manipulation routines 439 **************************************************************************/ 440 441 /** Create a new header using the supplied template */ 442 bcf_hdr_t *bcf_hdr_dup(const(bcf_hdr_t) *hdr); 443 444 /** 445 * Copy header lines from src to dst if not already present in dst. See also bcf_translate(). 446 * Returns 0 on success or sets a bit on error: 447 * 1 .. conflicting definitions of tag length 448 * // todo 449 */ 450 deprecated("Please use bcf_hdr_merge instead") 451 int bcf_hdr_combine(bcf_hdr_t *dst, const(bcf_hdr_t) *src); 452 453 /** 454 * bcf_hdr_merge() - copy header lines from src to dst, see also bcf_translate() 455 * @param dst: the destination header to be merged into, NULL on the first pass 456 * @param src: the source header 457 * 458 * Notes: 459 * - use as: 460 * bcf_hdr_t *dst = NULL; 461 * for (i=0; i<nsrc; i++) dst = bcf_hdr_merge(dst,src[i]); 462 * 463 * - bcf_hdr_merge() replaces bcf_hdr_combine() which had a problem when 464 * combining multiple BCF headers. The current bcf_hdr_combine() 465 * does not have this problem, but became slow when used for many files. 466 */ 467 bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const(bcf_hdr_t) *src); 468 469 /** 470 * bcf_hdr_add_sample() - add a new sample. 471 * @param sample: sample name to be added 472 */ 473 int bcf_hdr_add_sample(bcf_hdr_t *hdr, const(char) *sample); 474 475 /** Read VCF header from a file and update the header */ 476 int bcf_hdr_set(bcf_hdr_t *hdr, const(char) *fname); 477 478 /// Appends formatted header text to _str_. 479 /** If _is_bcf_ is zero, `IDX` fields are discarded. 480 * @return 0 if successful, or negative if an error occurred 481 * @since 1.4 482 */ 483 int bcf_hdr_format(const(bcf_hdr_t) *hdr, int is_bcf, kstring_t *str); 484 485 /** Returns formatted header (newly allocated string) and its length, 486 * excluding the terminating \0. If is_bcf parameter is unset, IDX 487 * fields are discarded. 488 * @deprecated Use bcf_hdr_format() instead as it can handle huge headers. 489 */ 490 deprecated("use bcf_hdr_format() instead") 491 char *bcf_hdr_fmt_text(const(bcf_hdr_t) *hdr, int is_bcf, int *len); 492 493 /** Append new VCF header line, returns 0 on success */ 494 int bcf_hdr_append(bcf_hdr_t *h, const(char) *line); 495 /// ditto 496 int bcf_hdr_printf(bcf_hdr_t *h, const(char) *format, ...); 497 498 /** VCF version, e.g. VCFv4.2 */ 499 const(char) *bcf_hdr_get_version(const(bcf_hdr_t) *hdr); 500 /// Ditto 501 /// NB: mangled second parameter to _version 502 void bcf_hdr_set_version(bcf_hdr_t *hdr, const(char) *_version); 503 504 /** 505 * bcf_hdr_remove() - remove VCF header tag 506 * @param type: one of BCF_HL_* 507 * @param key: tag name or NULL to remove all tags of the given type 508 */ 509 void bcf_hdr_remove(bcf_hdr_t *h, int type, const(char) *key); 510 511 /** 512 * bcf_hdr_subset() - creates a new copy of the header removing unwanted samples 513 * @param n: number of samples to keep 514 * @param samples: names of the samples to keep 515 * @param imap: mapping from index in @samples to the sample index in the original file 516 * 517 * Sample names not present in h0 are ignored. The number of unmatched samples can be checked 518 * by comparing n and bcf_hdr_nsamples(out_hdr). 519 * This function can be used to reorder samples. 520 * See also bcf_subset() which subsets individual records. 521 */ 522 /// NOTE: char *const* samples really exmplifies what I hate about C pointers 523 /// My interpretation of this is it is equivalent to char **samples, but that the outer pointer is const 524 /// which in D would be const(char *)*samples. I don't know what it implies about constancy of *samples or samples. 525 bcf_hdr_t *bcf_hdr_subset(const(bcf_hdr_t) *h0, int n, const(char *)*samples, int *imap); 526 //bcf_hdr_t *bcf_hdr_subset(const(bcf_hdr_t) *h0, int n, char *const* samples, int *imap); 527 528 /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */ 529 const(char) **bcf_hdr_seqnames(const(bcf_hdr_t) *h, int *nseqs); 530 531 /** Get number of samples */ 532 pragma(inline, true) auto bcf_hdr_nsamples(bcf_hdr_t *hdr) { return hdr.n[BCF_DT_SAMPLE]; } 533 //#define bcf_hdr_nsamples(hdr) (hdr)->n[BCF_DT_SAMPLE] 534 535 536 /** The following functions are for internal use and should rarely be called directly */ 537 int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt); 538 /// ditto 539 int bcf_hdr_sync(bcf_hdr_t *h); 540 /// ditto 541 bcf_hrec_t *bcf_hdr_parse_line(const(bcf_hdr_t) *h, const(char) *line, int *len); 542 /// ditto 543 void bcf_hrec_format(const(bcf_hrec_t) *hrec, kstring_t *str); 544 /// ditto 545 int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec); 546 547 /** 548 * bcf_hdr_get_hrec() - get header line info 549 * @param type: one of the BCF_HL_* types: FLT,INFO,FMT,CTG,STR,GEN 550 * @param key: the header key for generic lines (e.g. "fileformat"), any field 551 * for structured lines, typically "ID". 552 * @param value: the value which pairs with key. Can be be NULL for BCF_HL_GEN 553 * @param str_class: the class of BCF_HL_STR line (e.g. "ALT" or "SAMPLE"), otherwise NULL 554 */ 555 bcf_hrec_t *bcf_hdr_get_hrec(const(bcf_hdr_t) *hdr, int type, const(char) *key, const(char) *value, 556 const(char) *str_class); 557 /// Duplicate header record 558 bcf_hrec_t *bcf_hrec_dup(bcf_hrec_t *hrec); 559 /// Add key to header record 560 void bcf_hrec_add_key(bcf_hrec_t *hrec, const(char) *str, int len); 561 /// Set key's value for header record 562 void bcf_hrec_set_val(bcf_hrec_t *hrec, int i, const(char) *str, int len, int is_quoted); 563 /// Lookup header record by key 564 int bcf_hrec_find_key(bcf_hrec_t *hrec, const(char) *key); 565 /// Index header record 566 void hrec_add_idx(bcf_hrec_t *hrec, int idx); 567 /// Deallocate header record 568 void bcf_hrec_destroy(bcf_hrec_t *hrec); 569 570 571 572 /************************************************************************** 573 * Individual record querying and manipulation routines 574 **************************************************************************/ 575 576 /** See the description of bcf_hdr_subset() */ 577 int bcf_subset(const(bcf_hdr_t) *h, bcf1_t *v, int n, int *imap); 578 579 /** 580 * bcf_translate() - translate tags ids to be consistent with different header. This function 581 * is useful when lines from multiple VCF need to be combined. 582 * @dst_hdr: the destination header, to be used in bcf_write(), see also bcf_hdr_combine() 583 * @src_hdr: the source header, used in bcf_read() 584 * @src_line: line obtained by bcf_read() 585 */ 586 int bcf_translate(const(bcf_hdr_t) *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); 587 588 /** 589 * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc 590 */ 591 int bcf_get_variant_types(bcf1_t *rec); 592 /// ditto 593 int bcf_get_variant_type(bcf1_t *rec, int ith_allele); 594 /// ditto (TODO: what is this? should this be bool?) 595 int bcf_is_snp(bcf1_t *v); 596 597 /** 598 * bcf_update_filter() - sets the FILTER column 599 * @flt_ids: The filter IDs to set, numeric IDs returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") 600 * @n: Number of filters. If n==0, all filters are removed 601 * Returns: zero 602 */ 603 int bcf_update_filter(const(bcf_hdr_t) *hdr, bcf1_t *line, int *flt_ids, int n); 604 /** 605 * bcf_add_filter() - adds to the FILTER column 606 * @flt_id: filter ID to add, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") 607 * 608 * If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed. 609 */ 610 int bcf_add_filter(const(bcf_hdr_t) *hdr, bcf1_t *line, int flt_id); 611 /** 612 * bcf_remove_filter() - removes from the FILTER column 613 * @flt_id: filter ID to remove, numeric ID returned by bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS") 614 * @pass: when set to 1 and no filters are present, set to PASS 615 */ 616 int bcf_remove_filter(const(bcf_hdr_t) *hdr, bcf1_t *line, int flt_id, int pass); 617 /** 618 * Returns 1 if present, 0 if absent, or -1 if filter does not exist. "PASS" and "." can be used interchangeably. 619 */ 620 int bcf_has_filter(const(bcf_hdr_t) *hdr, bcf1_t *line, char *filter); 621 /** 622 * bcf_update_alleles() and bcf_update_alleles_str() - update REF and ALLT column 623 * @alleles: Array of alleles 624 * @nals: Number of alleles 625 * @alleles_string: Comma-separated alleles, starting with the REF allele 626 */ 627 int bcf_update_alleles(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) **alleles, int nals); 628 /// ditto 629 int bcf_update_alleles_str(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *alleles_string); 630 631 /** 632 * bcf_update_id() - sets new ID string 633 * bcf_add_id() - adds to the ID string checking for duplicates 634 */ 635 int bcf_update_id(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *id); 636 /// ditto 637 int bcf_add_id(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *id); 638 639 /** 640 * bcf_update_info_*() - functions for updating INFO fields 641 * @hdr: the BCF header 642 * @line: VCF line to be edited 643 * @key: the INFO tag to be updated 644 * @values: pointer to the array of values. Pass NULL to remove the tag. 645 * @n: number of values in the array. When set to 0, the INFO tag is removed 646 * 647 * The @string in bcf_update_info_flag() is optional, @n indicates whether 648 * the flag is set or removed. 649 * 650 * Returns 0 on success or negative value on error. 651 */ 652 int bcf_update_info(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(void) *values, int n, int type); 653 // TODO: Write D template 654 pragma(inline, true) { 655 auto bcf_update_info_int32(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(void) *values, int n) // @suppress(dscanner.style.undocumented_declaration) 656 { return bcf_update_info(hdr, line, key, values, n, BCF_HT_INT); } 657 auto bcf_update_info_float(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(void) *values, int n) // @suppress(dscanner.style.undocumented_declaration) 658 { return bcf_update_info(hdr, line, key, values, n, BCF_HT_REAL); } 659 auto bcf_update_info_flag(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(void) *values, int n) // @suppress(dscanner.style.undocumented_declaration) 660 { return bcf_update_info(hdr, line, key, values, n, BCF_HT_FLAG); } 661 auto bcf_update_info_string(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(void) *values) // @suppress(dscanner.style.undocumented_declaration) 662 { return bcf_update_info(hdr, line, key, values, 1, BCF_HT_STR); } 663 } 664 665 /** 666 * bcf_update_format_*() - functions for updating FORMAT fields 667 * @values: pointer to the array of values, the same number of elements 668 * is expected for each sample. Missing values must be padded 669 * with bcf_*_missing or bcf_*_vector_end values. 670 * @n: number of values in the array. If n==0, existing tag is removed. 671 * 672 * The function bcf_update_format_string() is a higher-level (slower) variant of 673 * bcf_update_format_char(). The former accepts array of \0-terminated strings 674 * whereas the latter requires that the strings are collapsed into a single array 675 * of fixed-length strings. In case of strings with variable length, shorter strings 676 * can be \0-padded. Note that the collapsed strings passed to bcf_update_format_char() 677 * are not \0-terminated. 678 * 679 * Returns 0 on success or negative value on error. 680 */ 681 int bcf_update_format_string(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(char) **values, int n); 682 /// ditto 683 int bcf_update_format(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(void) *values, int n, int type); 684 // TODO: Write D template 685 pragma(inline, true) { 686 auto bcf_update_format_int32(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(int) *values, int n) // @suppress(dscanner.style.undocumented_declaration) 687 { return bcf_update_format(hdr, line, key, values, n, BCF_HT_INT); } 688 auto bcf_update_format_float(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(float) *values, int n) // @suppress(dscanner.style.undocumented_declaration) 689 { return bcf_update_format(hdr, line, key, values, n, BCF_HT_REAL); } 690 auto bcf_update_format_char(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key, const(char) **values, int n) // @suppress(dscanner.style.undocumented_declaration) 691 { return bcf_update_format(hdr, line, key, values, n, BCF_HT_STR); } 692 auto bcf_update_genotypes(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) **gts, int n) // @suppress(dscanner.style.undocumented_declaration) 693 { return bcf_update_format(hdr, line, toStringz("GT"c), gts, n, BCF_HT_INT); } 694 } 695 696 697 /// Macros for setting genotypes correctly, for use with bcf_update_genotypes only; idx corresponds 698 /// to VCF's GT (1-based index to ALT or 0 for the reference allele) and val is the opposite, obtained 699 /// from bcf_get_genotypes() below. 700 // TODO: is int appropriate? 701 pragma(inline, true) { 702 auto bcf_gt_phased(int idx) { return (((idx)+1)<<1|1); } 703 /// ditto 704 auto bcf_gt_unphased(int idx) { return (((idx)+1)<<1); } 705 /// ditto 706 auto bcf_gt_is_missing(int val) { return ((val)>>1 ? 0 : 1);} 707 /// ditto 708 auto bcf_gt_is_phased(int idx) { return ((idx)&1); } 709 /// ditto 710 auto bcf_gt_allele(int val) { return (((val)>>1)-1); } 711 } 712 /// ditto 713 enum int bcf_gt_missing = 0; 714 715 /** Conversion between alleles indexes to Number=G genotype index (assuming diploid, all 0-based) */ 716 pragma(inline, true) { 717 auto bcf_alleles2gt(int a, int b) { return ((a)>(b)?((a)*((a)+1)/2+(b)):((b)*((b)+1)/2+(a))); } 718 /// ditto 719 void bcf_gt2alleles(int igt, int *a, int *b) 720 { 721 int k = 0, dk = 1; // @suppress(dscanner.useless-initializer) 722 while ( k<igt ) { dk++; k += dk; } 723 *b = dk - 1; *a = igt - k + *b; 724 } 725 } 726 727 /** 728 * bcf_get_fmt() - returns pointer to FORMAT's field data 729 * @header: for access to BCF_DT_ID dictionary 730 * @line: VCF line obtained from vcf_parse1 731 * @fmt: one of GT,PL,... 732 * 733 * Returns bcf_fmt_t* if the call succeeded, or returns NULL when the field 734 * is not available. 735 */ 736 bcf_fmt_t *bcf_get_fmt(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key); 737 /// ditto 738 bcf_info_t *bcf_get_info(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *key); 739 740 /** 741 * bcf_get_*_id() - returns pointer to FORMAT/INFO field data given the header index instead of the string ID 742 * @line: VCF line obtained from vcf_parse1 743 * @id: The header index for the tag, obtained from bcf_hdr_id2int() 744 * 745 * Returns bcf_fmt_t* / bcf_info_t*. These functions do not check if the index is valid 746 * as their goal is to avoid the header lookup. 747 */ 748 bcf_fmt_t *bcf_get_fmt_id(bcf1_t *line, const int id); 749 /// ditto 750 bcf_info_t *bcf_get_info_id(bcf1_t *line, const int id); 751 752 /** 753 * bcf_get_info_*() - get INFO values, integers or floats 754 * @hdr: BCF header 755 * @line: BCF record 756 * @tag: INFO tag to retrieve 757 * @dst: *dst is pointer to a memory location, can point to NULL 758 * @ndst: pointer to the size of allocated memory 759 * 760 * Returns negative value on error or the number of written values 761 * (including missing values) on success. bcf_get_info_string() returns 762 * on success the number of characters written excluding the null- 763 * terminating byte. bcf_get_info_flag() returns 1 when flag is set or 0 764 * if not. 765 * 766 * List of return codes: 767 * -1 .. no such INFO tag defined in the header 768 * -2 .. clash between types defined in the header and encountered in the VCF record 769 * -3 .. tag is not present in the VCF record 770 */ 771 int bcf_get_info_values(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst, int type); 772 pragma(inline, true) { 773 auto bcf_get_info_int32(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) 774 { return bcf_get_info_values(hdr, line, tag, cast(void**) dst, ndst, BCF_HT_INT); } 775 auto bcf_get_info_float(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) 776 { return bcf_get_info_values(hdr, line, tag, cast(void**) dst, ndst, BCF_HT_REAL); } 777 auto bcf_get_info_string(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) 778 { return bcf_get_info_values(hdr, line, tag, cast(void**) dst, ndst, BCF_HT_STR); } 779 auto bcf_get_info_flag(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) 780 { return bcf_get_info_values(hdr, line, tag, cast(void**) dst, ndst, BCF_HT_FLAG); } 781 } 782 783 /** 784 * bcf_get_format_*() - same as bcf_get_info*() above 785 * 786 * The function bcf_get_format_string() is a higher-level (slower) variant of bcf_get_format_char(). 787 * see the description of bcf_update_format_string() and bcf_update_format_char() above. 788 * Unlike other bcf_get_format__*() functions, bcf_get_format_string() allocates two arrays: 789 * a single block of \0-terminated strings collapsed into a single array and an array of pointers 790 * to these strings. Both arrays must be cleaned by the user. 791 * 792 * Returns negative value on error or the number of written values on success. 793 * 794 * Use the returned number of written values for accessing valid entries of dst, as ndst is only a 795 * watermark that can be higher than the returned value, i.e. the end of dst can contain carry-over 796 * values from previous calls to bcf_get_format_*() on lines with more values per sample. 797 * 798 * Example: 799 * int ndst = 0; char **dst = NULL; 800 * if ( bcf_get_format_string(hdr, line, "XX", &dst, &ndst) > 0 ) 801 * for (i=0; i<bcf_hdr_nsamples(hdr); i++) printf("%s\n", dst[i]); 802 * free(dst[0]); free(dst); 803 * 804 * Example: 805 * int i, j, ngt, nsmpl = bcf_hdr_nsamples(hdr); 806 * int32_t *gt_arr = NULL, ngt_arr = 0; 807 * 808 * ngt = bcf_get_genotypes(hdr, line, >_arr, &ngt_arr); 809 * if ( ngt<=0 ) return; // GT not present 810 * 811 * int max_ploidy = ngt/nsmpl; 812 * for (i=0; i<nsmpl; i++) 813 * { 814 * int32_t *ptr = gt + i*max_ploidy; 815 * for (j=0; j<max_ploidy; j++) 816 * { 817 * // if true, the sample has smaller ploidy 818 * if ( ptr[j]==bcf_int32_vector_end ) break; 819 * 820 * // missing allele 821 * if ( bcf_gt_is_missing(ptr[j]) ) continue; 822 * 823 * // the VCF 0-based allele index 824 * int allele_index = bcf_gt_allele(ptr[j]); 825 * 826 * // is phased? 827 * int is_phased = bcf_gt_is_phased(ptr[j]); 828 * 829 * // .. do something .. 830 * } 831 * } 832 * free(gt_arr); 833 * 834 */ 835 int bcf_get_format_string(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, char ***dst, int *ndst); 836 /// ditto 837 int bcf_get_format_values(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst, int type); 838 pragma(inline, true) { 839 auto bcf_get_format_int32(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) // @suppress(dscanner.style.long_line) 840 { return bcf_get_format_values(hdr, line, tag, cast(void**) dst, ndst, BCF_HT_INT); } 841 auto bcf_get_format_float(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) // @suppress(dscanner.style.long_line) 842 { return bcf_get_format_values(hdr, line, tag, cast(void**) dst, ndst, BCF_HT_REAL); } 843 auto bcf_get_format_char(const(bcf_hdr_t) *hdr, bcf1_t *line, const(char) *tag, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) // @suppress(dscanner.style.long_line) 844 { return bcf_get_format_values(hdr, line, tag, cast(void**) dst, ndst, BCF_HT_STR); } 845 auto bcf_get_genotypes(const(bcf_hdr_t) *hdr, bcf1_t *line, void **dst, int *ndst) // @suppress(dscanner.style.undocumented_declaration) // @suppress(dscanner.style.long_line) 846 { return bcf_get_format_values(hdr, line, toStringz("GT"c), cast(void**) dst, ndst, BCF_HT_INT); } 847 } 848 849 850 851 /************************************************************************** 852 * Helper functions 853 **************************************************************************/ 854 855 /** 856 * bcf_hdr_id2int() - Translates string into numeric ID 857 * bcf_hdr_int2id() - Translates numeric ID into string 858 * @type: one of BCF_DT_ID, BCF_DT_CTG, BCF_DT_SAMPLE 859 * @id: tag name, such as: PL, DP, GT, etc. 860 * 861 * Returns -1 if string is not in dictionary, otherwise numeric ID which identifies 862 * fields in BCF records. 863 */ 864 int bcf_hdr_id2int(const(bcf_hdr_t) *hdr, int type, const(char) *id); 865 /// ditto 866 pragma(inline, true) 867 auto bcf_hdr_int2id(const(bcf_hdr_t) *hdr, int type, int int_id) 868 { return hdr.id[type][int_id].key; } 869 //#define bcf_hdr_int2id(hdr,type,int_id) ((hdr)->id[type][int_id].key) 870 871 /** 872 * bcf_hdr_name2id() - Translates sequence names (chromosomes) into numeric ID 873 * bcf_hdr_id2name() - Translates numeric ID to sequence name 874 */ 875 pragma(inline, true) int bcf_hdr_name2id(const(bcf_hdr_t) *hdr, const(char) *id) { return bcf_hdr_id2int(hdr, BCF_DT_CTG, id); } // @suppress(dscanner.style.long_line) 876 /// ditto 877 pragma(inline, true) const(char) *bcf_hdr_id2name(const(bcf_hdr_t) *hdr, int rid) { return hdr.id[BCF_DT_CTG][rid].key; } // @suppress(dscanner.style.long_line) 878 /// ditto 879 pragma(inline, true) const(char) *bcf_seqname(const(bcf_hdr_t) *hdr, bcf1_t *rec) { return hdr.id[BCF_DT_CTG][rec.rid].key; } // @suppress(dscanner.style.long_line) 880 881 /** 882 * bcf_hdr_id2*() - Macros for accessing bcf_idinfo_t 883 * @type: one of BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT 884 * @int_id: return value of bcf_hdr_id2int, must be >=0 885 * 886 * The returned values are: 887 * bcf_hdr_id2length .. whether the number of values is fixed or variable, one of BCF_VL_* 888 * bcf_hdr_id2number .. the number of values, 0xfffff for variable length fields 889 * bcf_hdr_id2type .. the field type, one of BCF_HT_* 890 * bcf_hdr_id2coltype .. the column type, one of BCF_HL_* 891 * 892 * Notes: Prior to using the macros, the presence of the info should be 893 * tested with bcf_hdr_idinfo_exists(). 894 */ 895 // TODO: for dict_type and col_type use ENUMs 896 pragma(inline, true) { 897 auto bcf_hdr_id2length(const(bcf_hdr_t) *hdr, int type, int int_id) { return ((hdr).id[BCF_DT_ID][int_id].val.info[type]>>8 & 0xf); } // @suppress(dscanner.style.long_line) 898 /// ditto 899 auto bcf_hdr_id2number(const(bcf_hdr_t) *hdr, int type, int int_id) { return ((hdr).id[BCF_DT_ID][int_id].val.info[type]>>12); } // @suppress(dscanner.style.long_line) 900 /// ditto 901 auto bcf_hdr_id2type(const(bcf_hdr_t) *hdr, int type, int int_id) { return ((hdr).id[BCF_DT_ID][int_id].val.info[type]>>4 & 0xf); } // @suppress(dscanner.style.long_line) 902 /// ditto 903 auto bcf_hdr_id2coltype(const(bcf_hdr_t) *hdr, int type, int int_id){ return ((hdr).id[BCF_DT_ID][int_id].val.info[type] & 0xf); } // @suppress(dscanner.style.long_line) 904 /// ditto 905 auto bcf_hdr_idinfo_exists(const(bcf_hdr_t) *hdr, int type, int int_id) { return ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 0 : 1); } // @suppress(dscanner.style.long_line) 906 /// ditto 907 auto bcf_hdr_id2hrc(const(bcf_hdr_t) *hdr, int dict_type, int col_type, int int_id) 908 { return ((hdr).id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val.hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]); } // @suppress(dscanner.style.long_line) 909 } 910 911 /// Undocumented 912 void bcf_fmt_array(kstring_t *s, int n, int type, void *data); 913 /// ditto 914 uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr); 915 916 /// ditto 917 void bcf_enc_vchar(kstring_t *s, int l, const(char) *a); 918 /// ditto 919 void bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize); 920 /// ditto 921 void bcf_enc_vfloat(kstring_t *s, int n, float *a); 922 923 924 /************************************************************************** 925 * BCF index 926 * 927 * Note that these functions work with BCFs only. See synced_bcf_reader.h 928 * which provides (amongst other things) an API to work transparently with 929 * both indexed BCFs and VCFs. 930 **************************************************************************/ 931 932 alias bcf_itr_destroy = hts_itr_destroy; 933 934 pragma(inline, true) { 935 /// Generate an iterator for an integer-based range query 936 auto bcf_itr_queryi(const(hts_idx_t) *idx, int tid, int beg, int end) 937 { return hts_itr_query(idx, tid, beg, end, &bcf_readrec); } 938 939 /// Generate an iterator for a string-based range query 940 auto bcf_itr_querys(const(hts_idx_t) *idx, const(bcf_hdr_t) *hdr, const(char) *s) 941 { return hts_itr_querys(idx, s, cast(hts_name2id_f) &bcf_hdr_name2id, cast(void *) hdr, 942 &hts_itr_query, &bcf_readrec); } 943 944 /// Iterate through the range 945 /// r should (probably) point to your VCF (BCF) row structure 946 /// TODO: attempt to define parameter r as bcf1_t *, which is what I think it should be 947 auto bcf_itr_next(htsFile *htsfp, hts_itr_t *itr, void *r) 948 { return hts_itr_next(htsfp.fp.bgzf, itr, r, null); } // last param was 0 949 950 /// Load a BCF file's CSI index 951 auto bcf_index_load(const(char) *fn) { return hts_idx_load(fn, HTS_FMT_CSI); } 952 953 /// Get a list (char **) of sequence names from the index -- free only the array, not the values 954 auto bcf_index_seqnames(const(hts_idx_t) *idx, const(bcf_hdr_t) *hdr, int *nptr) 955 { return hts_idx_seqnames(idx, nptr, cast(hts_id2name_f) &bcf_hdr_id2name, cast(void *) hdr); } 956 } 957 958 /** Load BCF index with explicit fn and explicit index fn */ 959 hts_idx_t *bcf_index_load2(const(char) *fn, const(char) *fnidx); 960 961 /** 962 * bcf_index_build() - Generate and save an index file 963 * @fn: Input VCF(compressed)/BCF filename 964 * @min_shift: log2(width of the smallest bin), e.g. a value of 14 965 * imposes a 16k base lower limit on the width of index bins. 966 * Positive to generate CSI, or 0 to generate TBI. However, a small 967 * value of min_shift would create a large index, which would lead to 968 * reduced performance when using the index. A recommended value is 14. 969 * For BCF files, only the CSI index can be generated. 970 * 971 * Returns 0 if successful, or negative if an error occurred. 972 * 973 * List of error codes: 974 * -1 .. indexing failed 975 * -2 .. opening @fn failed 976 * -3 .. format not indexable 977 * -4 .. failed to create and/or save the index 978 */ 979 int bcf_index_build(const(char) *fn, int min_shift); 980 981 /** 982 * bcf_index_build2() - Generate and save an index to a specific file 983 * @fn: Input VCF/BCF filename 984 * @fnidx: Output filename, or NULL to add .csi/.tbi to @fn 985 * @min_shift: Positive to generate CSI, or 0 to generate TBI 986 * 987 * Returns 0 if successful, or negative if an error occurred. 988 * 989 * List of error codes: 990 * -1 .. indexing failed 991 * -2 .. opening @fn failed 992 * -3 .. format not indexable 993 * -4 .. failed to create and/or save the index 994 */ 995 int bcf_index_build2(const(char) *fn, const(char) *fnidx, int min_shift); 996 997 /** 998 * bcf_index_build3() - Generate and save an index to a specific file 999 * @fn: Input VCF/BCF filename 1000 * @fnidx: Output filename, or NULL to add .csi/.tbi to @fn 1001 * @min_shift: Positive to generate CSI, or 0 to generate TBI 1002 * @n_threads: Number of VCF/BCF decoder threads 1003 * 1004 * Returns 0 if successful, or negative if an error occurred. 1005 * 1006 * List of error codes: 1007 * -1 .. indexing failed 1008 * -2 .. opening @fn failed 1009 * -3 .. format not indexable 1010 * -4 .. failed to create and/or save the index 1011 */ 1012 int bcf_index_build3(const(char) *fn, const(char) *fnidx, int min_shift, int n_threads); 1013 1014 /******************* 1015 * Typed value I/O * 1016 *******************/ 1017 1018 /** 1019 Note that in contrast with BCFv2.1 specification, HTSlib implementation 1020 allows missing values in vectors. For integer types, the values 0x80, 1021 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, 1022 0x80000001 as end-of-vector indicators. Similarly for floats, the value of 1023 0x7F800001 is interpreted as a missing value and 0x7F800002 as an 1024 end-of-vector indicator. 1025 Note that the end-of-vector byte is not part of the vector. 1026 1027 This trial BCF version (v2.2) is compatible with the VCF specification and 1028 enables to handle correctly vectors with different ploidy in presence of 1029 missing values. 1030 */ 1031 enum int8_t bcf_int8_vector_end = (INT8_MIN+1); 1032 /// ditto 1033 enum int16_t bcf_int16_vector_end = (INT16_MIN+1); 1034 /// ditto 1035 enum int32_t bcf_int32_vector_end = (INT32_MIN+1); 1036 /// ditto 1037 enum char bcf_str_vector_end = 0; //#define bcf_str_vector_end 0 1038 /// ditto 1039 enum int8_t bcf_int8_missing = INT8_MIN; 1040 /// ditto 1041 enum int16_t bcf_int16_missing = INT16_MIN; 1042 /// ditto 1043 enum int32_t bcf_int32_missing = INT32_MIN; 1044 /// ditto 1045 enum char bcf_str_missing = 0x07; // #define bcf_str_missing 0x07 1046 extern __gshared uint32_t bcf_float_vector_end; /// ditto 1047 extern __gshared uint32_t bcf_float_missing; /// ditto 1048 1049 version(LDC) pragma(inline, true): 1050 version(GNU) pragma(inline, true): 1051 /** u wot */ 1052 void bcf_float_set(float *ptr, uint32_t value) 1053 { 1054 union U { uint32_t i; float f; } 1055 U u; 1056 u.i = value; 1057 *ptr = u.f; 1058 } 1059 1060 /// float vector macros 1061 void bcf_float_set_vector_end(float x) { bcf_float_set(&x, bcf_float_vector_end); } 1062 /// ditto 1063 void bcf_float_set_missing(float x) { bcf_float_set(&x, bcf_float_missing); } 1064 1065 /** u wot */ 1066 int bcf_float_is_missing(float f) 1067 { 1068 union U { uint32_t i; float f; } 1069 U u; 1070 u.f = f; 1071 return u.i==bcf_float_missing ? 1 : 0; 1072 } 1073 /// ditto 1074 int bcf_float_is_vector_end(float f) 1075 { 1076 union U { uint32_t i; float f; } 1077 U u; 1078 u.f = f; 1079 return u.i==bcf_float_vector_end ? 1 : 0; 1080 } 1081 /// ditto 1082 /+void bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) 1083 { 1084 #define BRANCH(type_t, missing, vector_end) { \ 1085 type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \ 1086 int i; \ 1087 for (i=0; i<fmt->n && ptr[i]!=vector_end; i++) \ 1088 { \ 1089 if ( i ) kputc("/|"[ptr[i]&1], str); \ 1090 if ( !(ptr[i]>>1) ) kputc('.', str); \ 1091 else kputw((ptr[i]>>1) - 1, str); \ 1092 } \ 1093 if (i == 0) kputc('.', str); \ 1094 } 1095 switch (fmt->type) { 1096 case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; 1097 case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; 1098 case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; 1099 case BCF_BT_NULL: kputc('.', str); break; 1100 default: hts_log_error("Unexpected type %d", fmt->type); abort(); break; 1101 } 1102 #undef BRANCH 1103 }+/ 1104 /// ditto 1105 /+void bcf_enc_size(kstring_t *s, int size, int type) 1106 { 1107 if (size >= 15) { 1108 kputc(15<<4|type, s); 1109 if (size >= 128) { 1110 if (size >= 32768) { 1111 int32_t x = size; 1112 kputc(1<<4|BCF_BT_INT32, s); 1113 kputsn(cast(char*)&x, 4, s); 1114 } else { 1115 int16_t x = size; 1116 kputc(1<<4|BCF_BT_INT16, s); 1117 kputsn(cast(char*)&x, 2, s); 1118 } 1119 } else { 1120 kputc(1<<4|BCF_BT_INT8, s); 1121 kputc(size, s); 1122 } 1123 } else kputc(size<<4|type, s); 1124 }+/ 1125 /// ditto 1126 int bcf_enc_inttype(long x) 1127 { 1128 if (x <= INT8_MAX && x > bcf_int8_missing) return BCF_BT_INT8; 1129 if (x <= INT16_MAX && x > bcf_int16_missing) return BCF_BT_INT16; 1130 return BCF_BT_INT32; 1131 } 1132 /// ditto 1133 /+void bcf_enc_int1(kstring_t *s, int32_t x) 1134 { 1135 if (x == bcf_int32_vector_end) { 1136 bcf_enc_size(s, 1, BCF_BT_INT8); 1137 kputc(bcf_int8_vector_end, s); 1138 } else if (x == bcf_int32_missing) { 1139 bcf_enc_size(s, 1, BCF_BT_INT8); 1140 kputc(bcf_int8_missing, s); 1141 } else if (x <= INT8_MAX && x > bcf_int8_missing) { 1142 bcf_enc_size(s, 1, BCF_BT_INT8); 1143 kputc(x, s); 1144 } else if (x <= INT16_MAX && x > bcf_int16_missing) { 1145 int16_t z = x; 1146 bcf_enc_size(s, 1, BCF_BT_INT16); 1147 kputsn(cast(char*)&z, 2, s); 1148 } else { 1149 int32_t z = x; 1150 bcf_enc_size(s, 1, BCF_BT_INT32); 1151 kputsn(cast(char*)&z, 4, s); 1152 } 1153 }+/ 1154 /// Need hts_endian 1155 /+ int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) 1156 { 1157 if (type == BCF_BT_INT8) { 1158 *q = cast(uint8_t*)p + 1; 1159 return le_to_i8(p); 1160 } else if (type == BCF_BT_INT16) { 1161 *q = cast(uint8_t*)p + 2; 1162 return le_to_i16(p); 1163 } else { 1164 *q = cast(uint8_t*)p + 4; 1165 return le_to_i32(p); 1166 } 1167 }+/ 1168 /// TODO 1169 /+int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) 1170 { 1171 return bcf_dec_int1(p + 1, *p&0xf, q); 1172 }+/ 1173 /// TODO 1174 /+int32_t bcf_dec_size(const uint8_t *p, uint8_t **q, int *type) 1175 { 1176 *type = *p & 0xf; 1177 if (*p>>4 != 15) { 1178 *q = cast(uint8_t*)p + 1; 1179 return *p>>4; 1180 } else return bcf_dec_typed_int1(p + 1, q); 1181 }+/