1 /// @file htslib/synced_bcf_reader.h 2 /// Stream through multiple VCF files. 3 /* 4 Copyright (C) 2012-2017, 2019-2021 Genome Research Ltd. 5 6 Author: Petr Danecek <pd3@sanger.ac.uk> 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy 9 of this software and associated documentation files (the "Software"), to deal 10 in the Software without restriction, including without limitation the rights 11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 copies of the Software, and to permit persons to whom the Software is 13 furnished to do so, subject to the following conditions: 14 15 The above copyright notice and this permission notice shall be included in 16 all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 21 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 23 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 24 DEALINGS IN THE SOFTWARE. */ 25 26 /* 27 The synced_bcf_reader allows to keep multiple VCFs open and stream them 28 using the next_line iterator in a seamless matter without worrying about 29 chromosomes and synchronizing the sites. This is used by vcfcheck to 30 compare multiple VCFs simultaneously and is used also for merging, 31 creating intersections, etc. 32 33 The synced_bcf_reader also provides API for reading indexed BCF/VCF, 34 hiding differences in BCF/VCF opening, indexing and reading. 35 36 37 Example of usage: 38 39 bcf_srs_t *sr = bcf_sr_init(); 40 bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC, BCF_SR_PAIR_BOTH_REF); 41 bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX); 42 for (i=0; i<nfiles; i++) 43 bcf_sr_add_reader(sr,files[i]); 44 while ( bcf_sr_next_line(sr) ) 45 { 46 for (i=0; i<nfiles; i++) 47 { 48 bcf1_t *line = bcf_sr_get_line(sr,i); 49 ... 50 } 51 } 52 if ( sr->errnum ) error("Error: %s\n", bcf_sr_strerror(sr->errnum)); 53 bcf_sr_destroy(sr); 54 */ 55 module htslib.synced_bcf_reader; 56 57 import core.stdc.stddef; 58 59 import htslib.hts; 60 import htslib.vcf; 61 import htslib.kstring : kstring_t; 62 import htslib.tbx : tbx_t; 63 64 @system: 65 nothrow: 66 @nogc: 67 68 extern (C): 69 70 /* 71 When reading multiple files in parallel, duplicate records within each 72 file will be reordered and offered in intuitive order. For example, 73 when reading two files, each with unsorted SNP and indel record, the 74 reader should return the SNP records together and the indel records 75 together. The logic of compatible records can vary depending on the 76 application and can be set using the PAIR_* defined below. 77 78 The COLLAPSE_* definitions will be deprecated in future versions, please 79 use the PAIR_* definitions instead. 80 */ 81 enum COLLAPSE_NONE = 0; // require the exact same set of alleles in all files 82 enum COLLAPSE_SNPS = 1; // allow different alleles, as long as they all are SNPs 83 enum COLLAPSE_INDELS = 2; // the same as above, but with indels 84 enum COLLAPSE_ANY = 4; // any combination of alleles can be returned by bcf_sr_next_line() 85 enum COLLAPSE_SOME = 8; // at least some of the ALTs must match 86 enum COLLAPSE_BOTH = COLLAPSE_SNPS | COLLAPSE_INDELS; 87 88 enum BCF_SR_PAIR_SNPS = 1 << 0; // allow different alleles, as long as they all are SNPs 89 enum BCF_SR_PAIR_INDELS = 1 << 1; // the same as above, but with indels 90 enum BCF_SR_PAIR_ANY = 1 << 2; // any combination of alleles can be returned by bcf_sr_next_line() 91 enum BCF_SR_PAIR_SOME = 1 << 3; // at least some of multiallelic ALTs must match. Implied by all the others with the exception of EXACT 92 enum BCF_SR_PAIR_SNP_REF = 1 << 4; // allow REF-only records with SNPs 93 enum BCF_SR_PAIR_INDEL_REF = 1 << 5; // allow REF-only records with indels 94 enum BCF_SR_PAIR_EXACT = 1 << 6; // require the exact same set of alleles in all files 95 enum BCF_SR_PAIR_BOTH = BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS; 96 enum BCF_SR_PAIR_BOTH_REF = BCF_SR_PAIR_SNPS | BCF_SR_PAIR_INDELS | BCF_SR_PAIR_SNP_REF | BCF_SR_PAIR_INDEL_REF; 97 98 enum bcf_sr_opt_t 99 { 100 BCF_SR_REQUIRE_IDX = 0, 101 BCF_SR_PAIR_LOGIC = 1, // combination of the PAIR_* values above 102 BCF_SR_ALLOW_NO_IDX = 2, // allow to proceed even if required index is not present (at the user's risk) 103 BCF_SR_REGIONS_OVERLAP = 3, // include overlapping records with POS outside the regions: 0=no, 1=VCF line overlap, 2=true variant overlap [1] 104 BCF_SR_TARGETS_OVERLAP = 4 // include overlapping records with POS outside the targets: 0=no, 1=VCF line overlap, 2=true variant overlap [0] 105 } 106 107 struct bcf_sr_region_t; 108 109 struct bcf_sr_regions_t 110 { 111 // for reading from tabix-indexed file (big data) 112 tbx_t* tbx; // tabix index 113 hts_itr_t* itr; // tabix iterator 114 kstring_t line; // holder of the current line, set only when reading from tabix-indexed files 115 htsFile* file; 116 char* fname; 117 int is_bin; // is open in binary mode (tabix access) 118 char** als; // parsed alleles if targets_als set and _regions_match_alleles called 119 kstring_t als_str; // block of parsed alleles 120 int nals; 121 int mals; // number of set alleles and the size of allocated array 122 int als_type; // alleles type, currently VCF_SNP or VCF_INDEL 123 124 // user handler to deal with skipped regions without a counterpart in VCFs 125 void function(bcf_sr_regions_t*, void*) missed_reg_handler; 126 void* missed_reg_data; 127 128 // for in-memory regions (small data) 129 bcf_sr_region_t* regs; // the regions 130 131 // shared by both tabix-index and in-memory regions 132 void* seq_hash; // keys: sequence names, values: index to seqs 133 char** seq_names; // sequence names 134 int nseqs; // number of sequences (chromosomes) in the file 135 int iseq; // current position: chr name, index to snames 136 hts_pos_t start; 137 hts_pos_t end; // current position: start, end of the region (0-based) 138 int prev_seq; 139 hts_pos_t prev_start; 140 hts_pos_t prev_end; 141 int overlap; // see BCF_SR_REGIONS_OVERLAP/BCF_SR_TARGETS_OVERLAP 142 } 143 144 struct bcf_sr_t 145 { 146 htsFile* file; 147 tbx_t* tbx_idx; 148 hts_idx_t* bcf_idx; 149 bcf_hdr_t* header; 150 hts_itr_t* itr; 151 char* fname; 152 bcf1_t** buffer; // cached VCF records. First is the current record synced across the reader 153 int nbuffer; 154 int mbuffer; // number of cached records (including the current record); number of allocated records 155 int nfilter_ids; 156 int* filter_ids; // -1 for ".", otherwise filter id as returned by bcf_hdr_id2int 157 int* samples; 158 int n_smpl; // list of columns in the order consistent with bcf_srs_t.samples 159 } 160 161 enum bcf_sr_error 162 { 163 open_failed = 0, 164 not_bgzf = 1, 165 idx_load_failed = 2, 166 file_type_error = 3, 167 api_usage_error = 4, 168 header_error = 5, 169 no_eof = 6, 170 no_memory = 7, 171 vcf_parse_error = 8, 172 bcf_read_error = 9, 173 noidx_error = 10 174 } 175 176 struct bcf_srs_t 177 { 178 // Parameters controlling the logic 179 int collapse; // Do not access directly, use bcf_sr_set_pairing_logic() instead 180 char* apply_filters; // If set, sites where none of the FILTER strings is listed 181 // will be skipped. Active only at the time of 182 // initialization, that is during the add_reader() 183 // calls. Therefore, each reader can be initialized with different 184 // filters. 185 int require_index; // Some tools do not need random access 186 int max_unpack; // When reading VCFs and knowing some fields will not be needed, boost performance of vcf_parse1 187 int* has_line; // Corresponds to return value of bcf_sr_next_line but is not limited by sizeof(int). Use bcf_sr_has_line macro to query. 188 bcf_sr_error errnum; 189 190 // Auxiliary data 191 bcf_sr_t* readers; 192 int nreaders; 193 int streaming; // reading mode: index-jumping or streaming 194 int explicit_regs; // was the list of regions se by bcf_sr_set_regions or guessed from tabix index? 195 char** samples; // List of samples 196 bcf_sr_regions_t* regions; 197 bcf_sr_regions_t* targets; // see bcf_sr_set_[targets|regions] for description 198 int targets_als; // subset to targets not only by position but also by alleles? 199 int targets_exclude; 200 kstring_t tmps; 201 int n_smpl; 202 203 int n_threads; // Simple multi-threaded decoding / encoding. 204 htsThreadPool* p; // Our pool, but it can be used by others if needed. 205 void* aux; // Opaque auxiliary data 206 } 207 208 /** Allocate and initialize a bcf_srs_t struct. 209 * 210 * The bcf_srs_t struct returned by a successful call should be freed 211 * via bcf_sr_destroy() when it is no longer needed. 212 */ 213 bcf_srs_t* bcf_sr_init(); 214 215 /** Destroy a bcf_srs_t struct */ 216 void bcf_sr_destroy(bcf_srs_t* readers); 217 218 char* bcf_sr_strerror(int errnum); 219 220 int bcf_sr_set_opt(bcf_srs_t* readers, bcf_sr_opt_t opt, ...); 221 222 /** 223 * bcf_sr_set_threads() - allocates a thread-pool for use by the synced reader. 224 * @n_threads: size of thread pool 225 * 226 * Returns 0 if the call succeeded, or <0 on error. 227 */ 228 int bcf_sr_set_threads(bcf_srs_t* files, int n_threads); 229 230 /** Deallocates thread memory, if owned by us. */ 231 void bcf_sr_destroy_threads(bcf_srs_t* files); 232 233 /** 234 * bcf_sr_add_reader() - open new reader 235 * @readers: holder of the open readers 236 * @fname: the VCF file 237 * 238 * Returns 1 if the call succeeded, or 0 on error. 239 * 240 * See also the bcf_srs_t data structure for parameters controlling 241 * the reader's logic. 242 */ 243 int bcf_sr_add_reader(bcf_srs_t* readers, const(char)* fname); 244 245 void bcf_sr_remove_reader(bcf_srs_t* files, int i); 246 247 /** 248 * bcf_sr_next_line() - the iterator 249 * @readers: holder of the open readers 250 * 251 * Returns the number of readers which have the current line 252 * (bcf_sr_t.buffer[0]) set at this position. Use the bcf_sr_has_line macro to 253 * determine which of the readers are set. 254 */ 255 int bcf_sr_next_line(bcf_srs_t* readers); 256 257 extern (D) auto bcf_sr_has_line(T0, T1)(auto ref T0 readers, auto ref T1 i) 258 { 259 return readers.has_line[i]; 260 } 261 262 extern (D) auto bcf_sr_get_line(T0, T1)(auto ref T0 _readers, auto ref T1 i) 263 { 264 return _readers.has_line[i] ? (_readers.readers[i].buffer[0]) : cast(bcf1_t*) NULL; 265 } 266 267 extern (D) int bcf_sr_region_done(T0, T1)(auto ref T0 _readers, auto ref T1 i) 268 { 269 return !_readers.has_line[i] && !_readers.readers[i].nbuffer ? 1 : 0; 270 } 271 272 extern (D) auto bcf_sr_get_header(T0, T1)(auto ref T0 _readers, auto ref T1 i) 273 { 274 return _readers.readers[i].header; 275 } 276 277 extern (D) auto bcf_sr_get_reader(T0, T1)(auto ref T0 _readers, auto ref T1 i) 278 { 279 return &(_readers.readers[i]); 280 } 281 282 /** 283 * bcf_sr_seek() - set all readers to selected position 284 * @seq: sequence name; NULL to seek to start 285 * @pos: 0-based coordinate 286 */ 287 int bcf_sr_seek(bcf_srs_t* readers, const(char)* seq, hts_pos_t pos); 288 289 /** 290 * bcf_sr_set_samples() - sets active samples 291 * @readers: holder of the open readers 292 * @samples: this can be one of: file name with one sample per line; 293 * or column-separated list of samples; or '-' for a list of 294 * samples shared by all files. If first character is the 295 * exclamation mark, all but the listed samples are included. 296 * @is_file: 0: list of samples; 1: file with sample names 297 * 298 * Returns 1 if the call succeeded, or 0 on error. 299 */ 300 int bcf_sr_set_samples(bcf_srs_t* readers, const(char)* samples, int is_file); 301 302 /** 303 * bcf_sr_set_targets(), bcf_sr_set_regions() - init targets/regions 304 * @readers: holder of the open readers 305 * @targets: list of regions, one-based and inclusive. 306 * @is_fname: 0: targets is a comma-separated list of regions (chr,chr:from-to) 307 * 1: targets is a tabix indexed file with a list of regions 308 * (<chr,pos> or <chr,from,to>) 309 * 310 * Returns 0 if the call succeeded, or -1 on error. 311 * 312 * Both functions behave the same way, unlisted positions will be skipped by 313 * bcf_sr_next_line(). However, there is an important difference: regions use 314 * index to jump to desired positions while targets streams the whole files 315 * and merely skip unlisted positions. 316 * 317 * Moreover, bcf_sr_set_targets() accepts an optional parameter $alleles which 318 * is interpreted as a 1-based column index in the tab-delimited file where 319 * alleles are listed. This in principle enables to perform the COLLAPSE_* 320 * logic also with tab-delimited files. However, the current implementation 321 * considers the alleles merely as a suggestion for prioritizing one of possibly 322 * duplicate VCF lines. It is up to the caller to examine targets->als if 323 * perfect match is sought after. Note that the duplicate positions in targets 324 * file are currently not supported. 325 * Targets (but not regions) can be prefixed with "^" to request logical complement, 326 * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. 327 * 328 * API note: bcf_sr_set_regions/bcf_sr_set_targets MUST be called before the 329 * first call to bcf_sr_add_reader(). 330 */ 331 int bcf_sr_set_targets( 332 bcf_srs_t* readers, 333 const(char)* targets, 334 int is_file, 335 int alleles); 336 337 int bcf_sr_set_regions(bcf_srs_t* readers, const(char)* regions, int is_file); 338 339 /* 340 * bcf_sr_regions_init() 341 * @regions: regions can be either a comma-separated list of regions 342 * (chr|chr:pos|chr:from-to|chr:from-) or VCF, BED, or 343 * tab-delimited file (the default). Uncompressed files 344 * are stored in memory while bgzip-compressed and tabix-indexed 345 * region files are streamed. 346 * @is_file: 0: regions is a comma-separated list of regions 347 * (chr|chr:pos|chr:from-to|chr:from-) 348 * 1: VCF, BED or tab-delimited file 349 * @chr, from, to: 350 * Column indexes of chromosome, start position and end position 351 * in the tab-delimited file. The positions are 1-based and 352 * inclusive. 353 * These parameters are ignored when reading from VCF, BED or 354 * tabix-indexed files. When end position column is not present, 355 * supply 'from' in place of 'to'. When 'to' is negative, first 356 * abs(to) will be attempted and if that fails, 'from' will be used 357 * instead. 358 * 359 * The bcf_sr_regions_t struct returned by a successful call should be freed 360 * via bcf_sr_regions_destroy() when it is no longer needed. 361 */ 362 bcf_sr_regions_t* bcf_sr_regions_init( 363 const(char)* regions, 364 int is_file, 365 int chr, 366 int from, 367 int to); 368 369 void bcf_sr_regions_destroy(bcf_sr_regions_t* regions); 370 371 /* 372 * bcf_sr_regions_seek() - seek to the chromosome block 373 * 374 * Returns 0 on success or -1 on failure. Sets reg->seq appropriately and 375 * reg->start,reg->end to -1. 376 */ 377 int bcf_sr_regions_seek(bcf_sr_regions_t* regions, const(char)* chr); 378 379 /* 380 * bcf_sr_regions_next() - retrieves next region. Returns 0 on success and -1 381 * when all regions have been read. The fields reg->seq, reg->start and 382 * reg->end are filled with the genomic coordinates on success or with 383 * NULL,-1,-1 when no region is available. The coordinates are 0-based, 384 * inclusive. 385 */ 386 int bcf_sr_regions_next(bcf_sr_regions_t* reg); 387 388 /* 389 * bcf_sr_regions_overlap() - checks if the interval <start,end> overlaps any of 390 * the regions, the coordinates are 0-based, inclusive. The coordinate queries 391 * must come in ascending order. 392 * 393 * Returns 0 if the position is in regions; -1 if the position is not in the 394 * regions and more regions exist; -2 if not in the regions and there are no more 395 * regions left. 396 */ 397 int bcf_sr_regions_overlap( 398 bcf_sr_regions_t* reg, 399 const(char)* seq, 400 hts_pos_t start, 401 hts_pos_t end); 402 403 /* 404 * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until 405 * all remaining records are processed. 406 * Returns 0 on success, <0 on error. 407 */ 408 int bcf_sr_regions_flush(bcf_sr_regions_t* regs); 409