1 /// @file htslib/regidx.h
2 /// Region indexing.
3 /*
4     Copyright (C) 2014-2019 Genome Research Ltd.
5 
6     Author: Petr Danecek <pd3@sanger.ac.uk>
7 
8     Permission is hereby granted, free of charge, to any person obtaining a copy
9     of this software and associated documentation files (the "Software"), to deal
10     in the Software without restriction, including without limitation the rights
11     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12     copies of the Software, and to permit persons to whom the Software is
13     furnished to do so, subject to the following conditions:
14 
15     The above copyright notice and this permission notice shall be included in
16     all copies or substantial portions of the Software.
17 
18     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24     THE SOFTWARE.
25 */
26 
27 /*
28     Region indexing with an optional payload.
29 
30     Example of usage:
31 
32         // Init the parser and print regions. In this example the payload is a
33         // pointer to a string. For the description of parse_custom and
34         // free_custom functions, see regidx_parse_f and regidx_free_f below,
35         // and for working example see test/test-regidx.c.
36         regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
37 
38         // Query overlap with chr:beg-end (beg,end are 1-based coordinates)
39         regitr_t *itr = regitr_init(idx);
40         if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n");
41 
42         while ( regitr_overlap(itr) )
43         {
44             printf("[%"PRIhts_pos",%"PRIhts_pos"] overlaps with [%"PRIhts_pos",%"PRIhts_pos"], payload=%s\n",
45                    beg, end, itr->beg+1, itr->end+1, regitr_payload(itr,char*));
46         }
47 
48         regidx_destroy(idx);
49         regitr_destroy(itr);
50 
51 
52     Another example, loop over all regions:
53 
54         regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL);
55         regitr_t *itr = regitr_init(idx);
56 
57         while ( regitr_loop(itr) )
            printf("chr=%s  beg=%"PRIhts_pos"  end=%"PRIhts_pos"\n", itr->seq, itr->beg+1, itr->end+1);
59 
60         regidx_destroy(idx);
61         regitr_destroy(itr);
62 */
63 module htslib.regidx;
64 
65 import htslib.hts: hts_pos_t;
66 
67 @system:
68 nothrow:
69 @nogc:
70 
71 extern (C):
72 
// Largest position (0-based) that regidx handles.  It stands in for the end
// coordinate of regions that do not set one explicitly, and regidx_push()
// restricts the positions it is given to be no larger than this value.
//
// The cap keeps some of regidx's internal values within 32 bits and stops
// the index from growing too large.
enum ulong REGIDX_MAX = 1UL << 35;
81 
// Opaque region index; its fields are not declared in this module.  Obtained
// from regidx_init() or regidx_init_string() and released with
// regidx_destroy().
struct regidx_t;
83 
// Iterator over regions.  The usage example at the top of this file reads
// beg, end, seq and the payload after a successful regitr_overlap() or
// regitr_loop() call.
struct regitr_t
{
    hts_pos_t beg, end; // coordinates of the current region (the example adds 1 to print them 1-based)
    void* payload;      // payload attached to the current region
    char* seq;          // sequence (chromosome) name of the current region
    void* itr;          // NOTE(review): presumably iterator-internal state -- confirm before touching
}
92 
// Old API for backwards compatibility
//
// REGITR_START - yields the `beg` member of the iterator; accepts anything
// that exposes a `beg` member (a regitr_t value or a pointer to one).
extern (D) auto REGITR_START(Iter)(auto ref Iter itr) { return itr.beg; }
98 
// REGITR_END - old-API accessor kept for backwards compatibility; yields the
// `end` member of the iterator (a regitr_t value or a pointer to one).
extern (D) auto REGITR_END(Iter)(auto ref Iter itr) { return itr.end; }
103 
/*
 *  regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
 *  or regidx_parse_tab below. The function is expected to set `chr_beg` and
 *  `chr_end` to point to the first and last character of the chromosome name
 *  and to set the coordinates `beg` and `end` (0-based, inclusive). If
 *  regidx_init() was called with a non-zero payload_size, `payload` points to
 *  a memory location of payload_size bytes and `usr` is the data passed to
 *  regidx_init(). Any memory allocated by the function will be freed by
 *  regidx_free_f called by regidx_destroy().
 *
 *  Return value: 0 on success, -1 to skip a record, -2 on fatal error.
 */
alias regidx_parse_f = int function(const(char)* line, char** chr_beg, char** chr_end, hts_pos_t* beg, hts_pos_t* end, void* payload, void* usr);
// regidx_free_f - Function that releases memory a parse function allocated
// for a payload; it is called by regidx_destroy().
alias regidx_free_f = void function(void* payload);
118 
119 /*
120  *  A note about the parsers:
121  *      - leading spaces are ignored
122  *      - lines starting with "#" are ignored
123  */
// Parser for lines holding CHROM alone or whitespace-separated CHROM,FROM,TO
// (0-based, right-open).  Parameters and return value are those of the
// regidx_parse_f function type.
int regidx_parse_bed(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
// Parser for lines holding CHROM alone or whitespace-separated CHROM,POS
// (1-based, inclusive).  Parameters and return value are those of the
// regidx_parse_f function type.
int regidx_parse_tab(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
// Parser for region strings: CHROM, CHROM:POS, CHROM:FROM-TO or CHROM:FROM-
// (1-based, inclusive).  Parameters and return value are those of the
// regidx_parse_f function type.
int regidx_parse_reg(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
// Parser with the regidx_parse_f signature.
// NOTE(review): presumably accepts VCF-style lines; the accepted input format
// is not documented here -- confirm against the htslib implementation.
int regidx_parse_vcf(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
156 
157 /*
158  *  regidx_init() - creates new index
159  *  regidx_init_string() - creates new index, from a string rather than from a file
160  *
161  *  @param fname:  input file name or NULL if regions will be added one-by-one via regidx_insert()
162  *  @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
 *                 the format will be autodetected, currently either regidx_parse_tab (the default) or
164  *                 regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
165  *                 the exact autodetection algorithm will change.
166  *  @param freef:  NULL or see description of regidx_parse_f
167  *  @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
168  *  @param usr:    optional user data passed to regidx_parse_f
169  *
170  *  Returns index on success or NULL on error.
171  *
172  *  The regidx_t index struct returned by a successful call should be freed
173  *  via regidx_destroy() when it is no longer needed.
174  */
// Create a new index from the file `fname`, or an empty one when `fname` is
// NULL.  Returns NULL on error; free the result with regidx_destroy().
regidx_t* regidx_init(
    const(char)* fname,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);
// Create a new index from the text in `string` rather than from a file.
// Returns NULL on error; free the result with regidx_destroy().
regidx_t* regidx_init_string(
    const(char)* string,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);
187 
/*
 *  regidx_destroy() - free memory allocated by regidx_init() or
 *  regidx_init_string(). Memory allocated by the parse function is released
 *  through the regidx_free_f callback supplied at init time.
 */
void regidx_destroy(regidx_t* idx);
192 
/*
 *  regidx_overlap() - check overlap of the location chr:beg-end with regions
 *  @param idx:         index to query
 *  @param chr:         sequence (chromosome) name
 *  @param beg,end:     0-based start, end coordinate (inclusive)
 *  @param itr:         pointer to iterator, can be NULL if the overlapping
 *                      regions do not need to be iterated with regitr_overlap()
 *
 *  Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
 *  regions can be iterated by calling regitr_overlap() on itr, as shown in the
 *  usage example at the top of this file.
 */
int regidx_overlap(
    regidx_t* idx,
    const(char)* chr,
    hts_pos_t beg,
    hts_pos_t end,
    regitr_t* itr);
207 
208 /*
209  *  regidx_insert() - add a new region.
210  *  regidx_insert_list() - add new regions from a list
211  *  regidx_push() - low level insertion of a new region
212  *
213  *  Returns 0 on success or -1 on error.
214  */
// Add one new region given as the text `line`; 0 on success, -1 on error.
// NOTE(review): presumably `line` is parsed with the index's parse function -- confirm.
int regidx_insert(regidx_t* idx, char* line);
// Add new regions from a list held in `line`; 0 on success, -1 on error.
// NOTE(review): presumably `delim` is the character separating list entries -- confirm.
int regidx_insert_list(regidx_t* idx, char* line, char delim);
// Low-level insertion of a new region; 0 on success, -1 on error.  Positions
// passed in are limited to be no bigger than REGIDX_MAX.
// NOTE(review): presumably chr_beg/chr_end point to the first and last
// character of the sequence name, as in regidx_parse_f -- confirm.
int regidx_push(
    regidx_t* idx,
    char* chr_beg,
    char* chr_end,
    hts_pos_t beg,
    hts_pos_t end,
    void* payload);
224 
/*
 *  regidx_seq_names() - return list of all sequence names
 *
 *  NOTE(review): presumably *n receives the number of names in the returned
 *  list; ownership of the returned array is not stated here -- confirm
 *  before freeing or modifying it.
 */
char** regidx_seq_names(regidx_t* idx, int* n);
229 
230 /*
231  *  regidx_seq_nregs() - number of regions
232  *  regidx_nregs()  - total number of regions
233  */
// Number of regions; NOTE(review): presumably counted for sequence `seq` only -- confirm.
int regidx_seq_nregs(regidx_t* idx, const(char)* seq);
235 
// Total number of regions in the index.
int regidx_nregs(regidx_t* idx);
237 
238 /*
239  *  regitr_init() - initialize an iterator. The idx parameter is required only
240  *                  with regitr_loop. If only regitr_overlap is called, NULL
241  *                  can be given.
242  *
243  *                  The regitr_t struct returned by a successful regitr_init()
244  *                  call should be freed via regitr_destroy() when it is no
245  *                  longer needed.
246  *
247  *  regitr_reset() - initialize an iterator for a repeated regitr_loop cycle.
248  *                  Not required with regitr_overlap.
249  */
// Initialize an iterator; `idx` may be NULL when only regitr_overlap() will
// be called.  Free the result with regitr_destroy().
regitr_t* regitr_init(regidx_t* idx);
// Free an iterator returned by regitr_init().
void regitr_destroy(regitr_t* itr);
// Re-initialize `itr` for a repeated regitr_loop() cycle over `idx`; not
// required with regitr_overlap().
void regitr_reset(regidx_t* idx, regitr_t* itr);
253 
/*
 *  regitr_overlap() - advance to the next overlapping region; the usage
 *  example at the top of this file calls it after a successful
 *  regidx_overlap() on the same iterator.
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_overlap(regitr_t* itr);
259 
/*
 *  regitr_loop() - loop over all regions. The iterator must have been
 *  created by regitr_init() with a non-NULL index; use regitr_reset() before
 *  repeating the loop.
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_loop(regitr_t* itr);
265 
/*
 *  regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop
 *  @param dst:  iterator that receives the copy
 *  @param src:  iterator to copy from
 */
void regitr_copy(regitr_t* dst, regitr_t* src);
270