1 /// @file htslib/regidx.h 2 /// Region indexing. 3 /* 4 Copyright (C) 2014-2019 Genome Research Ltd. 5 6 Author: Petr Danecek <pd3@sanger.ac.uk> 7 8 Permission is hereby granted, free of charge, to any person obtaining a copy 9 of this software and associated documentation files (the "Software"), to deal 10 in the Software without restriction, including without limitation the rights 11 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 copies of the Software, and to permit persons to whom the Software is 13 furnished to do so, subject to the following conditions: 14 15 The above copyright notice and this permission notice shall be included in 16 all copies or substantial portions of the Software. 17 18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 THE SOFTWARE. 25 */ 26 27 /* 28 Region indexing with an optional payload. 29 30 Example of usage: 31 32 // Init the parser and print regions. In this example the payload is a 33 // pointer to a string. For the description of parse_custom and 34 // free_custom functions, see regidx_parse_f and regidx_free_f below, 35 // and for working example see test/test-regidx.c. 
36 regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL); 37 38 // Query overlap with chr:beg-end (beg,end are 1-based coordinates) 39 regitr_t *itr = regitr_init(idx); 40 if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n"); 41 42 while ( regitr_overlap(itr) ) 43 { 44 printf("[%"PRIhts_pos",%"PRIhts_pos"] overlaps with [%"PRIhts_pos",%"PRIhts_pos"], payload=%s\n", 45 beg, end, itr->beg+1, itr->end+1, regitr_payload(itr,char*)); 46 } 47 48 regidx_destroy(idx); 49 regitr_destroy(itr); 50 51 52 Another example, loop over all regions: 53 54 regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL); 55 regitr_t *itr = regitr_init(idx); 56 57 while ( regitr_loop(itr) ) 58 printf("chr=%s beg=%"PRIhts_pos" end=%"PRIhts_pos"\n", itr->seq, itr->beg+1, itr->end+1); 59 60 regidx_destroy(idx); 61 regitr_destroy(itr); 62 */ 63 64 module htslib.regidx; 65 import htslib.hts : hts_pos_t; 66 67 extern (C): 68 69 // maximum regidx position (0-based). Used to represent the end point of 70 // regions which do not explicitly set one. regidx_push() also limits 71 // positions passed to it to be no bigger than this. 72 73 // Limit is set to ensure some internal values used by regidx keep within 32 74 // bits and to stop the index from getting too big. 
enum REGIDX_MAX = 1UL << 35;

// Index type; declared without a body in this file, so it is only ever
// handled through pointers here.
struct regidx_t;

// Iterator state exposed to callers while walking regions with
// regitr_overlap() or regitr_loop().
struct regitr_t
{
    hts_pos_t beg;  // start of the current region (the usage examples print beg+1)
    hts_pos_t end;  // end of the current region (the usage examples print end+1)
    void* payload;  // payload of the current region; its type is chosen by the caller
    char* seq;      // sequence name of the current region (printed with %s in the examples)
    void* itr;      // NOTE(review): presumably private iterator state -- not described in this file
}

// Old API for backwards compatibility

// Returns itr.beg.
extern (D) auto REGITR_START(T)(auto ref T itr)
{
    return itr.beg;
}

// Returns itr.end.
extern (D) auto REGITR_END(T)(auto ref T itr)
{
    return itr.end;
}

/* Omitted due to symbol collision -- dstep does not recognize case sensitivity
#define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload)
#define REGITR_OVERLAP(itr,from,to) regidx_overlap((itr));
*/

/*
 *  regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
 *  or regidx_parse_tab below. The function is expected to set `chr_beg` and
 *  `chr_end` to point to first and last character of chromosome name and set
 *  coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was
 *  called with non-zero payload_size, the `payload` points to a memory
 *  location of the payload_size and `usr` is the data passed to regidx_init().
 *  Any memory allocated by the function will be freed by regidx_free_f called
 *  by regidx_destroy().
 *
 *  Return value: 0 on success, -1 to skip a record, -2 on fatal error.
*/
// Callback type for a line parser; the contract is given in the comment
// directly above.
alias regidx_parse_f = int function(const(char)* line, char** chr_beg, char** chr_end, hts_pos_t* beg, hts_pos_t* end,
        void* payload, void* usr);
// Callback type used to release memory that a parser attached to a payload.
alias regidx_free_f = void function(void* payload);

/*
 *  A note about the parsers:
 *      - leading spaces are ignored
 *      - lines starting with "#" are ignored
 *
 *  All of the parsers below take the same argument list as regidx_parse_f.
 */
int regidx_parse_bed(
    const(char)*,
    char**,
    char**,
    hts_pos_t*,
    hts_pos_t*,
    void*,
    void*); // CHROM or whitespace-separated CHROM,FROM,TO (0-based,right-open)
int regidx_parse_tab(
    const(char)*,
    char**,
    char**,
    hts_pos_t*,
    hts_pos_t*,
    void*,
    void*); // CHROM or whitespace-separated CHROM,POS (1-based, inclusive)
int regidx_parse_reg(
    const(char)*,
    char**,
    char**,
    hts_pos_t*,
    hts_pos_t*,
    void*,
    void*); // CHROM, CHROM:POS, CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive)
int regidx_parse_vcf(
    const(char)*,
    char**,
    char**,
    hts_pos_t*,
    hts_pos_t*,
    void*,
    void*); // NOTE(review): accepted input is not described in this file; presumably VCF records -- confirm

/*
 *  regidx_init() - creates new index
 *  regidx_init_string() - creates new index, from a string rather than from a file
 *
 *  @param fname:  input file name or NULL if regions will be added one-by-one via regidx_insert()
 *  @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
 *                 the format will be autodetected, currently either regidx_parse_tab (the default) or
 *                 regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
 *                 the exact autodetection algorithm will change.
 *  @param freef:  NULL or see description of regidx_parse_f
 *  @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
 *  @param usr:    optional user data passed to regidx_parse_f
 *
 *  Returns index on success or NULL on error.
*
 *  The regidx_t index struct returned by a successful call should be freed
 *  via regidx_destroy() when it is no longer needed.
 */
regidx_t* regidx_init(
    const(char)* fname,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);
regidx_t* regidx_init_string(
    const(char)* string,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);

/*
 *  regidx_destroy() - free memory allocated by regidx_init
 */
void regidx_destroy(regidx_t* idx);

/*
 *  regidx_overlap() - check overlap of the location chr:from-to with regions
 *  @param beg,end:     0-based start, end coordinate (inclusive)
 *  @param itr:         pointer to iterator, can be NULL if regidx_loop not needed
 *
 *  NOTE(review): no regidx_loop is declared in this file; the iterator
 *  functions are regitr_overlap() and regitr_loop(), so presumably
 *  regitr_overlap is meant -- confirm.
 *
 *  Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
 *  regions can be iterated as shown in the example above.
 */
int regidx_overlap(
    regidx_t* idx,
    const(char)* chr,
    hts_pos_t beg,
    hts_pos_t end,
    regitr_t* itr);

/*
 *  regidx_insert() - add a new region.
 *  regidx_insert_list() - add new regions from a list
 *  regidx_push() - low level insertion of a new region
 *
 *  Positions passed to regidx_push() are limited to REGIDX_MAX.
 *
 *  NOTE(review): presumably `line` is text in the format understood by the
 *  index's parser and `delim` separates the entries of the list; presumably
 *  chr_beg/chr_end delimit the sequence name as in regidx_parse_f -- confirm.
 *
 *  Returns 0 on success or -1 on error.
 */
int regidx_insert(regidx_t* idx, char* line);
int regidx_insert_list(regidx_t* idx, char* line, char delim);
int regidx_push(
    regidx_t* idx,
    char* chr_beg,
    char* chr_end,
    hts_pos_t beg,
    hts_pos_t end,
    void* payload);

/*
 *  regidx_seq_names() - return list of all sequence names
 *
 *  NOTE(review): presumably *n receives the number of names returned -- confirm.
 */
char** regidx_seq_names(regidx_t* idx, int* n);

/*
 *  regidx_seq_nregs() - number of regions
 *  regidx_nregs()  - total number of regions
 *
 *  NOTE(review): regidx_seq_nregs presumably counts only the regions on
 *  sequence `seq` -- confirm.
 */
int regidx_seq_nregs(regidx_t* idx, const(char)* seq);

int regidx_nregs(regidx_t* idx);

/*
 *  regitr_init() - initialize an iterator.
*                  The idx parameter is required only
 *                 with regitr_loop. If only regitr_overlap is called, NULL
 *                 can be given.
 *
 *                 The regitr_t struct returned by a successful regitr_init()
 *                 call should be freed via regitr_destroy() when it is no
 *                 longer needed.
 *
 *  regitr_destroy() - free an iterator returned by regitr_init().
 *
 *  regitr_reset() - initialize an iterator for a repeated regitr_loop cycle.
 *                   Not required with regitr_overlap.
 */
regitr_t* regitr_init(regidx_t* idx);
void regitr_destroy(regitr_t* itr);
void regitr_reset(regidx_t* idx, regitr_t* itr);

/*
 *  regitr_overlap() - next overlapping region
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_overlap(regitr_t* itr);

/*
 *  regitr_loop() - loop over all regions
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_loop(regitr_t* itr);

/*
 *  regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop
 *
 *  NOTE(review): going by the parameter names, src is copied into dst;
 *  whether dst must come from regitr_init() is not stated here -- confirm.
 */
void regitr_copy(regitr_t* dst, regitr_t* src);