1 /// @file htslib/regidx.h
2 /// Region indexing.
3 /*
4     Copyright (C) 2014-2019 Genome Research Ltd.
5 
6     Author: Petr Danecek <pd3@sanger.ac.uk>
7 
8     Permission is hereby granted, free of charge, to any person obtaining a copy
9     of this software and associated documentation files (the "Software"), to deal
10     in the Software without restriction, including without limitation the rights
11     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12     copies of the Software, and to permit persons to whom the Software is
13     furnished to do so, subject to the following conditions:
14 
15     The above copyright notice and this permission notice shall be included in
16     all copies or substantial portions of the Software.
17 
18     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24     THE SOFTWARE.
25 */
26 
27 /*
28     Region indexing with an optional payload.
29 
30     Example of usage:
31 
32         // Init the parser and print regions. In this example the payload is a
33         // pointer to a string. For the description of parse_custom and
34         // free_custom functions, see regidx_parse_f and regidx_free_f below,
35         // and for working example see test/test-regidx.c.
36         regidx_t *idx = regidx_init(in_fname,parse_custom,free_custom,sizeof(char*),NULL);
37 
38         // Query overlap with chr:beg-end (beg,end are 1-based coordinates)
39         regitr_t *itr = regitr_init(idx);
40         if ( regidx_overlap(idx, chr,beg-1,end-1, itr) ) printf("There is an overlap!\n");
41 
42         while ( regitr_overlap(itr) )
43         {
44             printf("[%"PRIhts_pos",%"PRIhts_pos"] overlaps with [%"PRIhts_pos",%"PRIhts_pos"], payload=%s\n",
45                    beg, end, itr->beg+1, itr->end+1, regitr_payload(itr,char*));
46         }
47 
48         regidx_destroy(idx);
49         regitr_destroy(itr);
50 
51 
52     Another example, loop over all regions:
53 
54         regidx_t *idx = regidx_init(in_fname,NULL,NULL,0,NULL);
55         regitr_t *itr = regitr_init(idx);
56 
57         while ( regitr_loop(itr) )
            printf("chr=%s  beg=%"PRIhts_pos"  end=%"PRIhts_pos"\n", itr->seq, itr->beg+1, itr->end+1);
59 
60         regidx_destroy(idx);
61         regitr_destroy(itr);
62 */
63 
64 module htslib.regidx;
65 import htslib.hts : hts_pos_t;
66 
67 extern (C):
68 
// Largest position (0-based) that regidx works with.  It stands in for the
// end point of regions that do not explicitly set one, and regidx_push()
// also limits the positions passed to it to at most this value.
//
// The cap ensures that some values used internally by regidx stay within
// 32 bits and stops the index from getting too big.

enum ulong REGIDX_MAX = 1UL << 35;
77 
// Opaque region index.  Instances are obtained from regidx_init() or
// regidx_init_string() and released with regidx_destroy(); the layout is
// not exposed by this module, so it is only ever handled through pointers.
struct regidx_t;
79 
// Region iterator.  Obtained from regitr_init() and released with
// regitr_destroy(); regitr_overlap() and regitr_loop() set it to one region
// at a time.
struct regitr_t
{
    hts_pos_t beg; // start of the current region, 0-based (the usage example prints beg+1)
    hts_pos_t end; // end of the current region, 0-based (the usage example prints end+1)
    void* payload; // payload of the current region (read via regitr_payload in the usage example)
    char* seq; // sequence name of the current region (printed as chr= in the loop example)
    void* itr; // NOTE(review): presumably private iterator state owned by the library - confirm
}
88 
/*
 *  regitr_payload() - access the payload of the region the iterator is set to
 *
 *  D counterpart of the regitr_payload(itr,type_t) macro used in the usage
 *  example at the top of this module; call it as regitr_payload!T(itr).
 *  itr.payload is reinterpreted as a pointer to T and dereferenced. The
 *  result is returned by reference, so it can be read as well as assigned.
 *
 *  T must be the type that was stored as the payload (its size is the
 *  payload_size given to regidx_init()), and itr.payload must not be NULL.
 */
extern (D) ref T regitr_payload(T)(regitr_t* itr)
{
    return *cast(T*) itr.payload;
}

// Old API for backwards compatibility

// #define REGITR_START(itr) - start coordinate of the iterator's region
extern (D) auto REGITR_START(T)(auto ref T itr)
{
    return itr.beg;
}

// #define REGITR_END(itr) - end coordinate of the iterator's region
extern (D) auto REGITR_END(T)(auto ref T itr)
{
    return itr.end;
}

// #define REGITR_PAYLOAD(itr,type_t) ((type_t*)(itr).payload)
// Returns the payload pointer typed as type_t*, without dereferencing it.
// Call as REGITR_PAYLOAD!type_t(itr). D identifiers are case sensitive, so
// this does not clash with regitr_payload above.
extern (D) auto REGITR_PAYLOAD(type_t, T)(auto ref T itr)
{
    return cast(type_t*) itr.payload;
}

/* Still omitted: the macro body passes a single argument, whereas
   regidx_overlap() takes five, so it has no usable translation.
#define REGITR_OVERLAP(itr,from,to) regidx_overlap((itr));
*/
104 
/*
 *  regidx_parse_f - Function to parse one input line, such as regidx_parse_bed
 *  or regidx_parse_tab below. The function is expected to set `chr_beg` and
 *  `chr_end` to point to first and last character of chromosome name and set
 *  coordinates `beg` and `end` (0-based, inclusive). If regidx_init() was
 *  called with non-zero payload_size, the `payload` points to a memory
 *  location of the payload_size and `usr` is the data passed to regidx_init().
 *  Any memory allocated by the function will be freed by regidx_free_f called
 *  by regidx_destroy().
 *
 *  Return value: 0 on success, -1 to skip a record, -2 on fatal error.
 *
 *  regidx_free_f - Function that releases whatever the parser allocated for
 *  one payload; it receives the pointer to that payload.
 *
 *  Both aliases are declared under `extern (C):`, so the callbacks handed to
 *  regidx_init() must themselves be extern (C) functions.
 */
alias regidx_parse_f = int function(const(char)* line, char** chr_beg, char** chr_end, hts_pos_t* beg, hts_pos_t* end,
    void* payload, void* usr);
alias regidx_free_f = void function(void* payload);
120 
121 /*
122  *  A note about the parsers:
123  *      - leading spaces are ignored
124  *      - lines starting with "#" are ignored
125  */
// regidx_parse_bed - parser for lines holding CHROM or whitespace-separated
// CHROM,FROM,TO (0-based,right-open). Parameters and return value follow
// regidx_parse_f.
int regidx_parse_bed(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
// regidx_parse_tab - parser for lines holding CHROM or whitespace-separated
// CHROM,POS (1-based, inclusive). Parameters and return value follow
// regidx_parse_f.
int regidx_parse_tab(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
// regidx_parse_reg - parser for region strings of the form CHROM, CHROM:POS,
// CHROM:FROM-TO, CHROM:FROM- (1-based, inclusive). Parameters and return
// value follow regidx_parse_f.
int regidx_parse_reg(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
// regidx_parse_vcf - parser with the regidx_parse_f signature.
// NOTE(review): no format description accompanies this declaration; judging
// by the name it presumably parses VCF records - confirm against htslib.
int regidx_parse_vcf(
    const(char)* line,
    char** chr_beg,
    char** chr_end,
    hts_pos_t* beg,
    hts_pos_t* end,
    void* payload,
    void* usr);
158 
/*
 *  regidx_init() - creates new index
 *  regidx_init_string() - creates new index, from a string rather than from a file
 *
 *  @param fname:  input file name or NULL if regions will be added one-by-one via regidx_insert()
 *  @param string: (regidx_init_string only) the text to read the regions from, in place of fname
 *  @param parsef: regidx_parse_bed, regidx_parse_tab or see description of regidx_parse_f. If NULL,
 *                 the format will be autodetected, currently either regidx_parse_tab (the default) or
 *                 regidx_parse_bed (file must be named 'bed' or 'bed.gz') will be used. Note that
 *                 the exact autodetection algorithm will change.
 *  @param freef:  NULL or see description of regidx_parse_f
 *  @param payload_size: 0 with regidx_parse_bed, regidx_parse_tab or see regidx_parse_f
 *  @param usr:    optional user data passed to regidx_parse_f
 *
 *  Returns index on success or NULL on error.
 *
 *  The regidx_t index struct returned by a successful call should be freed
 *  via regidx_destroy() when it is no longer needed.
 */
regidx_t* regidx_init(
    const(char)* fname,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);
regidx_t* regidx_init_string(
    const(char)* string,
    regidx_parse_f parsef,
    regidx_free_f freef,
    size_t payload_size,
    void* usr);
189 
/*
 *  regidx_destroy() - free memory allocated by regidx_init() or
 *  regidx_init_string()
 *
 *  Memory allocated by the parse function for payloads is freed through the
 *  regidx_free_f callback that was given when the index was created.
 */
void regidx_destroy(regidx_t* idx);
194 
/*
 *  regidx_overlap() - check overlap of the location chr:beg-end with regions
 *  @param idx:         index to search
 *  @param chr:         sequence name of the query location
 *  @param beg,end:     0-based start, end coordinate (inclusive)
 *  @param itr:         pointer to iterator, can be NULL if the overlapping
 *                      regions do not need to be iterated afterwards.
 *                      NOTE(review): the original text said "if regidx_loop
 *                      not needed"; no such function is declared in this
 *                      module, presumably regitr_overlap() is meant - confirm.
 *
 *  Returns 0 if there is no overlap or 1 if overlap is found. The overlapping
 *  regions can be iterated with regitr_overlap() as shown in the first usage
 *  example at the top of this module.
 */
int regidx_overlap(
    regidx_t* idx,
    const(char)* chr,
    hts_pos_t beg,
    hts_pos_t end,
    regitr_t* itr);
209 
/*
 *  regidx_insert() - add a new region.
 *  regidx_insert_list() - add new regions from a list
 *  regidx_push() - low level insertion of a new region
 *
 *  NOTE(review): `line` is presumably run through the parse function the
 *  index was created with, and for regidx_insert_list() `delim` is presumably
 *  the character separating the list items - confirm against htslib.
 *
 *  regidx_push() takes the region already split into its parts; chr_beg and
 *  chr_end presumably delimit the chromosome name the same way as in
 *  regidx_parse_f (TODO confirm). Positions passed to regidx_push() are
 *  limited to be no bigger than REGIDX_MAX.
 *
 *  Returns 0 on success or -1 on error.
 */
int regidx_insert(regidx_t* idx, char* line);
int regidx_insert_list(regidx_t* idx, char* line, char delim);
int regidx_push(
    regidx_t* idx,
    char* chr_beg,
    char* chr_end,
    hts_pos_t beg,
    hts_pos_t end,
    void* payload);
226 
/*
 *  regidx_seq_names() - return list of all sequence names
 *
 *  NOTE(review): `n` is presumably set to the number of names in the
 *  returned list; ownership of the list is not documented here - confirm
 *  against htslib before freeing or modifying it.
 */
char** regidx_seq_names(regidx_t* idx, int* n);
231 
/*
 *  regidx_seq_nregs() - number of regions (NOTE(review): presumably those on
 *                       the sequence named by `seq` - confirm)
 *  regidx_nregs()  - total number of regions
 */
int regidx_seq_nregs(regidx_t* idx, const(char)* seq);

int regidx_nregs(regidx_t* idx);
239 
/*
 *  regitr_init() - initialize an iterator. The idx parameter is required only
 *                  with regitr_loop. If only regitr_overlap is called, NULL
 *                  can be given.
 *
 *                  The regitr_t struct returned by a successful regitr_init()
 *                  call should be freed via regitr_destroy() when it is no
 *                  longer needed.
 *
 *  regitr_destroy() - free an iterator obtained from regitr_init().
 *
 *  regitr_reset() - initialize an iterator for a repeated regitr_loop cycle.
 *                  Not required with regitr_overlap.
 */
regitr_t* regitr_init(regidx_t* idx);
void regitr_destroy(regitr_t* itr);
void regitr_reset(regidx_t* idx, regitr_t* itr);
255 
/*
 *  regitr_overlap() - next overlapping region
 *
 *  Used after regidx_overlap() has been called with this iterator, as in the
 *  first usage example at the top of this module.
 *
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_overlap(regitr_t* itr);
261 
/*
 *  regitr_loop() - loop over all regions
 *
 *  The iterator must have been created by regitr_init() with a non-NULL
 *  index; use regitr_reset() before starting another loop cycle.
 *
 *  Returns 0 when done or 1 when itr is set to next region
 */
int regitr_loop(regitr_t* itr);
267 
/*
 *  regitr_copy() - create a copy of an iterator for a repeated iteration with regitr_loop
 *
 *  @param dst:  iterator receiving the copy. NOTE(review): presumably it must
 *               already have been obtained from regitr_init() - confirm.
 *  @param src:  iterator to copy from
 */
void regitr_copy(regitr_t* dst, regitr_t* src);
272