/// @file htslib/regidx.h
/// Region indexing.
/*
    Copyright (C) 2014-2019 Genome Research Ltd.

    Author: Petr Danecek <pd3@sanger.ac.uk>

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    THE SOFTWARE.
*/

/*
    Region indexing with an optional payload.

    Example of usage (shown with the C API; in D the payload accessor is
    spelled regitr_payload!(char*)(itr)):

        // Init the parser and print regions. In this example the payload is a
        // pointer to a string. For the description of parse_custom and
        // free_custom functions, see regidx_parse_f and regidx_free_f below,
        // and for working example see test/test-regidx.c.
        regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);

        // Query overlap with chr:beg-end (beg,end are 1-based coordinates)
        regitr_t *itr = regitr_init(idx);
        if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n");

        while ( regitr_overlap(itr) )
        {
            printf("[%"PRIhts_pos",%"PRIhts_pos"] overlaps with [%"PRIhts_pos",%"PRIhts_pos"], payload=%s\n",
                   beg, end, itr->beg+1, itr->end+1, regitr_payload(itr,char*));
        }

        regidx_destroy(idx);
        regitr_destroy(itr);


    Another example, loop over all regions:

        regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL);
        regitr_t *itr = regitr_init(idx);

        while ( regitr_loop(itr) )
            printf("chr=%s beg=%"PRIhts_pos" end=%"PRIhts_pos"\n", itr->seq, itr->beg+1, itr->end+1);

        regidx_destroy(idx);
        regitr_destroy(itr);
*/
module htslib.regidx;

import htslib.hts: hts_pos_t;

@system:
nothrow:
@nogc:

extern (C):

// Maximum regidx position (0-based). Used to represent the end point of
// regions which do not explicitly set one. regidx_push() also limits
// positions passed to it to be no bigger than this.

// Limit is set to ensure some internal values used by regidx keep within 32
// bits and to stop the index from getting too big.

enum REGIDX_MAX = 1UL << 35;

// Opaque index handle; only ever used through a pointer.
struct regidx_t;

// Iterator over regions. beg/end are reported as 1-based by adding 1 in the
// usage examples above, i.e. the stored coordinates are 0-based.
struct regitr_t
{
    hts_pos_t beg;
    hts_pos_t end;
    void* payload;
    char* seq;
    void* itr;
}

// Access the payload of the region the iterator currently points at,
// interpreted as a value of type T. D counterpart of the C usage
// regitr_payload(itr,type_t) shown in the example above; call it as
// regitr_payload!(T)(itr). T must be the type whose size was given to
// regidx_init() as payload_size.
extern (D) ref T regitr_payload(T)(regitr_t* itr)
{
    return *cast(T*) itr.payload;
}

// Old API for backwards compatibility
extern (D) auto REGITR_START(T)(auto ref T itr)
{
    return itr.beg;
}

extern (D) auto REGITR_END(T)(auto ref T itr)
{
    return itr.end;
}

/*
 *  regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
 *  or regidx_parse_tab below. The function is expected to set `chr_beg` and
 *  `chr_end` to point to first and last character of chromosome name and set
 *  coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was
 *  called with non-zero payload_size, the `payload` points to a memory
 *  location of the payload_size and `usr` is the data passed to regidx_init().
 *  Any memory allocated by the function will be freed by regidx_free_f called
 *  by regidx_destroy().
 *
 *  Return value: 0 on success, -1 to skip a record, -2 on fatal error.
 */
alias regidx_parse_f = int function(const(char)* line, char** chr_beg, char** chr_end, hts_pos_t* beg, hts_pos_t* end, void* payload, void* usr);
alias regidx_free_f = void function(void* payload);

/*
 *  A note about the parsers:
 *      - leading spaces are ignored
 *      - lines starting with "#" are ignored
 *
 *  All parsers below share the regidx_parse_f signature.
 */

// CHROM or whitespace-separated CHROM,FROM,TO (0-based,right-open)
int regidx_parse_bed(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);

// CHROM or whitespace-separated CHROM,POS (1-based, inclusive)
int regidx_parse_tab(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);

// CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive)
int regidx_parse_reg(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);

// NOTE(review): input format is not described here; presumably a VCF record
// line, judging by the name - confirm against the implementation.
int regidx_parse_vcf(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);

/*
 *  regidx_init() - creates new index
 *  regidx_init_string() - creates new index, from a string rather than from a file
 *
 *  @param fname:  input file name or NULL if regions will be added one-by-one via regidx_insert()
 *  @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
 *                 the format will be autodetected, currently either regidx_parse_tab (the default) or
 *                 regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
 *                 the exact autodetection algorithm will change.
 *  @param freef:  NULL or see description of regidx_parse_f
 *  @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
 *  @param usr:    optional user data passed to regidx_parse_f
 *
 *  Returns index on success or NULL on error.
 *
 *  The regidx_t index struct returned by a successful call should be freed
 *  via regidx_destroy() when it is no longer needed.
 */
regidx_t* regidx_init(
    const(char)* fname,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);
regidx_t* regidx_init_string(
    const(char)* string,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);

/*
 *  regidx_destroy() - free memory allocated by regidx_init
 */
void regidx_destroy(regidx_t* idx);

/*
 *  regidx_overlap() - check overlap of the location chr:from-to with regions
 *  @param beg,end: 0-based start, end coordinate (inclusive)
 *  @param itr:     pointer to iterator, can be NULL if the overlapping regions
 *                  do not need to be iterated with regitr_overlap
 *
 *  Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
 *  regions can be iterated as shown in the example above.
 */
int regidx_overlap(
    regidx_t* idx,
    const(char)* chr,
    hts_pos_t beg,
    hts_pos_t end,
    regitr_t* itr);

/*
 *  regidx_insert() - add a new region.
 *  regidx_insert_list() - add new regions from a list
 *  regidx_push() - low level insertion of a new region
 *
 *  Returns 0 on success or -1 on error.
 */
int regidx_insert(regidx_t* idx, char* line);
int regidx_insert_list(regidx_t* idx, char* line, char delim);
int regidx_push(
    regidx_t* idx,
    char* chr_beg,
    char* chr_end,
    hts_pos_t beg,
    hts_pos_t end,
    void* payload);

/*
 *  regidx_seq_names() - return list of all sequence names
 */
char** regidx_seq_names(regidx_t* idx, int* n);

/*
 *  regidx_seq_nregs() - number of regions (takes a sequence name `seq`)
 *  regidx_nregs() - total number of regions
 */
int regidx_seq_nregs(regidx_t* idx, const(char)* seq);

int regidx_nregs(regidx_t* idx);

/*
 *  regitr_init() - initialize an iterator. The idx parameter is required only
 *                  with regitr_loop. If only regitr_overlap is called, NULL
 *                  can be given.
 *
 *                  The regitr_t struct returned by a successful regitr_init()
 *                  call should be freed via regitr_destroy() when it is no
 *                  longer needed.
 *
 *  regitr_reset() - initialize an iterator for a repeated regitr_loop cycle.
 *                  Not required with regitr_overlap.
 */
regitr_t* regitr_init(regidx_t* idx);
void regitr_destroy(regitr_t* itr);
void regitr_reset(regidx_t* idx, regitr_t* itr);

/*
 *  regitr_overlap() - next overlapping region
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_overlap(regitr_t* itr);

/*
 *  regitr_loop() - loop over all regions
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_loop(regitr_t* itr);

/*
 *  regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop
 */
void regitr_copy(regitr_t* dst, regitr_t* src);