1 /** Coordinates and Coordinate Systems
2 
3     Interval include `start` and `end`, but no reference sequence id (chr, contig, etc.)
4 
5     The `Coordinate` type is templated on `CoordSystem` enum, so that the actual coordinate
6     system in use -- and by this, we mean zero- or one-based, and half-open vs. closed --
7     is encoded within the type itself. e.g. `Coordinate!(CoordSystem.zbho)(0, 100)`
8 
9     In this way, dhtslib functions that take type `Coordinate` can enforce safety checks,
10     or optionally, interconversions, to avoid off-by-one errors when dealing with different
11     systems, as is common in different HTS/NGS file formats.
12 
13     In general, zero-based half-open (ZBHO) coordinates are easy to compute on, whereas
14     many data sources intended to be read and understood by humans (including NCBI) are
15     one-based closed systems. You may enjoy reading the following articles:
16 
17     https://www.biostars.org/p/84686/
18     https://www.biostars.org/p/6373/
19     http://genome.ucsc.edu/blog/the-ucsc-genome-browser-coordinate-counting-systems/
20     http://genomewiki.ucsc.edu/index.php/Coordinate_Transforms
21 
22     Zero-based, half open:  BED, BAM
23     Zero-based, closed:     HTSlib function faidx_fetch_seq , strangely
24     One-based, half open:   ?
25     One-based, closed:      GFF3, SAM (text; not BAM), VCF (text; not BCF)
26 */
27 
28 /*
29 
30 Translation table for my reference:
31     zbho    zbc     obho    obc
32 zbho -       0,-1   +1,+1   +1, 0
33 zbc  0,+1   -       +1,+2   +1,+1
34 obho -1,-1  -1,-2   -        0,-1
35 obc  -1,0   -1,-1    0,+1    -
36 
37 */
38 
39 module dhtslib.coordinates;
40 import std.traits : isIntegral;
41 import std.conv : to;
42 
43 /// Represents 0-based vs 1-based coordinate types
44 enum Basis
45 {
46     zero = 0,
47     one
48 }
49 
50 /// Represents whether a coordinate set's end 
51 /// coordinate is open or closed
52 enum End
53 {
54     open = 0,
55     closed
56 }
57 
58 /// Coordinate type for a single position with in a Coordinate set,
59 /// where the type itself encodes the coordinate system details
60 /// (zero or one-based)
61 struct Coordinate(Basis bs)
62 {
63     @safe @nogc nothrow pure:
64     /// Coordinate value
65     long pos;
66     alias pos this;
67 
68     invariant
69     {
70         assert(pos >= 0);
71         // in one based systems, pos cannot be zero
72         static if(bs == Basis.one)
73         {
74             assert(pos != 0);
75         }
76     }
77     
78     /// Convert coordinate to another based system 
79     auto to(Basis tobs)()
80     {
81         // return same Base type
82         static if (bs == tobs) return Coordinate!bs(pos);
83         // one to zero base, decrement
84         else static if (tobs == Basis.zero) return Coordinate!bs(pos - 1);
85         // zero to one base, increment
86         else static if (tobs == Basis.one) return Coordinate!bs(pos + 1);
87         else static assert(0, "Coordinate Type error");
88     }
89 
90     /// Convert coordinate to another based system using shortcuts
91     auto to(T: ZB)()
92     {
93         return this.to!(Basis.zero);
94     }
95 
96     auto to(T: OB)()
97     {
98         return this.to!(Basis.one);
99     }
100 
101     /// make a new coordinate with a value of this.pos + off
102     /// this.pos + off
103     auto offset(T)(T off)
104     if(isIntegral!T)
105     {
106         return Coordinate!(bs)(cast(long)(this.pos + off));
107     }
108 
109     /// opbinary + or - to use offset
110     auto opBinary(string op, T)(T off)
111     if ((op == "+" || op == "-") && isIntegral!T)
112     {
113         static if(op == "+")
114             return this.offset(off);
115         else static if(op == "-")
116             return this.offset(-off);
117         else static assert(0,"Operator "~op~" not implemented");
118     }
119 }
120 
121 alias ZB = Coordinate!(Basis.zero);
122 alias OB = Coordinate!(Basis.one);
123 
124 alias ZeroBased = Coordinate!(Basis.zero);
125 alias OneBased = Coordinate!(Basis.one);
126 
127 /// template to convert Basis, End enum combination to 
128 /// respective CoordinateSystem enum
129 template getCoordinateSystem(Basis bs, End es){
130     static if(bs == Basis.zero){
131         static if(es == End.open)
132             enum CoordSystem getCoordinateSystem = CoordSystem.zbho;
133         else static if(es == End.closed)
134             enum CoordSystem getCoordinateSystem = CoordSystem.zbc;
135     }
136     else static if(bs == Basis.one){
137         static if(es == End.open)
138             enum CoordSystem getCoordinateSystem = CoordSystem.obho;
139         else static if(es == End.closed)
140             enum CoordSystem getCoordinateSystem = CoordSystem.obc;
141     }
142 }
143 
144 // just sanity checking
145 static assert(getCoordinateSystem!(Basis.zero, End.open) == CoordSystem.zbho);
146 static assert(getCoordinateSystem!(Basis.zero, End.closed) == CoordSystem.zbc);
147 static assert(getCoordinateSystem!(Basis.one, End.open) == CoordSystem.obho);
148 static assert(getCoordinateSystem!(Basis.one, End.closed) == CoordSystem.obc);
149 
150 /// template to convert CoordinateSystem enum to 
151 /// respective Basis enum
152 template coordinateSystemToBasis(CoordSystem cs){
153     static if(cs == CoordSystem.zbho){
154         enum Basis coordinateSystemToBasis = Basis.zero;
155     }
156     else static if(cs == CoordSystem.zbc){
157         enum Basis coordinateSystemToBasis = Basis.zero;
158     }
159     else static if(cs == CoordSystem.obho){
160         enum Basis coordinateSystemToBasis = Basis.one;
161     }
162     else static if(cs == CoordSystem.obc){
163         enum Basis coordinateSystemToBasis = Basis.one;
164     }
165 }
166 
167 // just sanity checking
168 static assert(coordinateSystemToBasis!(CoordSystem.zbho) == Basis.zero);
169 static assert(coordinateSystemToBasis!(CoordSystem.zbc) == Basis.zero);
170 static assert(coordinateSystemToBasis!(CoordSystem.obho) == Basis.one);
171 static assert(coordinateSystemToBasis!(CoordSystem.obc) == Basis.one);
172 
173 /// template to convert CoordinateSystem enum to 
174 /// respective End enum
175 template coordinateSystemToEnd(CoordSystem cs){
176     static if(cs == CoordSystem.zbho){
177         enum End coordinateSystemToEnd = End.open;
178     }
179     else static if(cs == CoordSystem.zbc){
180         enum End coordinateSystemToEnd = End.closed;
181     }
182     else static if(cs == CoordSystem.obho){
183         enum End coordinateSystemToEnd = End.open;
184     }
185     else static if(cs == CoordSystem.obc){
186         enum End coordinateSystemToEnd = End.closed;
187     }
188 }
189 
190 // just sanity checking
191 static assert(coordinateSystemToEnd!(CoordSystem.zbho) == End.open);
192 static assert(coordinateSystemToEnd!(CoordSystem.zbc) == End.closed);
193 static assert(coordinateSystemToEnd!(CoordSystem.obho) == End.open);
194 static assert(coordinateSystemToEnd!(CoordSystem.obc) == End.closed);
195 
196 /// Coordsytem types
197 enum CoordSystem
198 {
199     zbho = 0,   /// zero-based, half-open (BED, BAM)
200     zbc,        /// zero-based, closed (HTSlib.faidx_fetch_seq)
201     obho,       /// one-based, half-open
202     obc         /// one-based, closed (GFF3, VCF [not BCF], SAM [not BAM])
203 }
204 
205 /// Labels for each CoordSystem Type
206 enum CoordSystemLabels = __traits(allMembers, CoordSystem);
207 
208 /// The (start, end) coordinates within a coordinate system,
209 /// where the type itself encodes the coordinate system details
210 /// (zero or one-based; half-open vs. closed)
211 struct Interval(CoordSystem cs)
212 {
213     @safe nothrow pure:
214 
215     /// alias Basis and End enums for this Coordsystem type
216     alias basetype = coordinateSystemToBasis!cs;
217     alias endtype = coordinateSystemToEnd!cs;
218 
219     /// Starting coordinate
220     Coordinate!basetype start;
221     /// Ending coordinate
222     Coordinate!basetype end;
223 
224     /// long constructor
225     this(long start, long end) @nogc
226     {
227         this.start.pos = start;
228         this.end.pos = end;
229     }
230 
231     invariant
232     {
233         // In half-open systems, ensure start strictly less than end,
234         // unless the zero-length range is truly desired, in which case use (0,0)
235         static if (cs == CoordSystem.zbho || cs == CoordSystem.obho)
236             assert(this.start < this.end || (this.start == 0 && this.end == 0));
237         else
238             assert(this.start <= this.end);
239     }
240 
241     /// Return the size of the interval spanned by start and end
242     long size() @nogc
243     {
244         static if (cs == CoordSystem.zbho || cs == CoordSystem.obho)
245             return end - start;
246         else static if (cs == CoordSystem.zbc || cs == CoordSystem.obc)
247             return end - start + 1;
248         else
249             static assert(0, "Coordinate Type error");
250     }
251     
252     /// Convert coordinates to another coordinate system 
253     auto to(CoordSystem tocs)() @nogc
254     {
255 
256         /// alias Basis and End enums for the tocs Coordsystem type
257         alias tobasetype = coordinateSystemToBasis!tocs;
258         alias toendtype = coordinateSystemToEnd!tocs;
259 
260         // new coordinates
261         auto newStart = this.start;
262         auto newEnd = this.end;
263 
264         // return same CoordinateSystem type
265         static if (cs == tocs)
266             return Interval!(tocs)(newStart, newEnd);
267         else{
268             // convert coordinates to new base type
269             newStart = newStart.to!tobasetype;
270             newEnd = newEnd.to!tobasetype;
271 
272             // if going between end types
273             static if (endtype != toendtype){
274                 // open to closed end, decrement end
275                 // closed to open end, increment end
276                 static if(toendtype == End.closed){
277                     newEnd--;
278                 }else{
279                     newEnd++;
280                 }
281             }
282             return Interval!(tocs)(newStart, newEnd);
283         }
284     }
285 
286     /// Convert coordinate to another based system using shortcuts
287     auto to(T: ZBHO)() @nogc
288     {
289         return this.to!(CoordSystem.zbho);
290     }
291 
292     /// Convert coordinate to another based system using shortcuts
293     auto to(T: OBHO)() @nogc
294     {
295         return this.to!(CoordSystem.obho);
296     }
297 
298     /// Convert coordinate to another based system using shortcuts
299     auto to(T: ZBC)() @nogc
300     {
301         return this.to!(CoordSystem.zbc);
302     }
303     
304     /// Convert coordinate to another based system using shortcuts
305     auto to(T: OBC)() @nogc
306     {
307         return this.to!(CoordSystem.obc);
308     }
309 
310     /// make a new coordinate pair with a value of 
311     /// this.start + off and this.end + off
312     auto offset(T)(T off) @nogc
313     if(isIntegral!T)
314     {
315         return Interval!(cs)(cast(long)(this.start + off), cast(long)(this.end + off));
316     }
317 
318     /// intersection of two regions
319     auto intersectImpl(Interval!cs other) @nogc
320     {
321         if(!this.isOverlap(other)){
322             return Interval!(cs).init;
323         }
324         return Interval!(cs)(
325             this.getMaxStart(other),
326             this.getMinEnd(other)
327             );
328     }
329 
330     /// union of two regions
331     /// specifically computes the convex hull of the two intervals
332     /// if Intervals don't overlap returns the init Interval
333     /// 
334     /// https://en.wikipedia.org/wiki/Convex_hull
335     ///
336     /// Thanks Arne!
337     /// https://forum.dlang.org/post/duzdclxvvrifoastdztv@forum.dlang.org
338     auto unionImpl(Interval!cs other) @nogc
339     {
340         if(!this.isOverlap(other)){
341             return Interval!(cs).init;
342         }
343         return Interval!(cs)(
344             this.getMinStart(other),
345             this.getMaxEnd(other)
346             );
347     }
348 
349     /// does this interval overlap
350     /// with the other interval?
351     auto isOverlap(Interval!cs other) @nogc
352     {
353         static if(endtype == End.closed){
354             return this.getMaxStart(other) <= this.getMinEnd(other);
355         }else{
356             return this.getMaxStart(other) < this.getMinEnd(other);
357         }
358     }
359 
360     /// get minimum start value between this interval
361     /// and other interval
362     auto getMinStart(Interval!cs other) @nogc
363     {
364         return this.start < other.start ? this.start : other.start;
365     }
366 
367     /// get maximum start value between this interval
368     /// and other interval
369     auto getMaxStart(Interval!cs other) @nogc
370     {
371         return this.start > other.start ? this.start : other.start;
372     }
373 
374     /// get minimum end value between this interval
375     /// and other interval
376     auto getMinEnd(Interval!cs other) @nogc
377     {
378         return this.end < other.end ? this.end : other.end;
379     }
380     
381     /// get maximum end value between this interval
382     /// and other interval
383     auto getMaxEnd(Interval!cs other) @nogc
384     {
385         return this.end > other.end ? this.end : other.end;
386     }
387 
388     /// Set operators for interval intersection (|) and union (&).
389     /// If the intervals are non-overlapping, an empty interval is returned i.e ZBHO.init
390     /// union is not a true union but returns the convex hull.
391     auto opBinary(string op)(Interval!cs other) @nogc
392     if(op == "|" || op == "&")
393     {
394         static if(op == "|") return unionImpl(other);
395         else static if(op == "&") return intersectImpl(other);
396         else static assert(0,"Operator "~op~" not implemented");
397     }
398 
399     /// opbinary + or - to use offset
400     auto opBinary(string op, T)(T off) @nogc
401     if ((op == "+" || op == "-") && isIntegral!T)
402     {
403         static if(op == "+")
404             return this.offset(off);
405         else static if(op == "-")
406             return this.offset(-off);
407         else static assert(0,"Operator "~op~" not implemented");
408     }
409 
410     /// Get string representation for printing
411     string toString() const{
412         return "[" ~ CoordSystemLabels[cs] ~ "] " ~ this.start.pos.to!string ~ "-" ~ this.end.pos.to!string;
413     }
414 }
415 
416 alias ZBHO = Interval!(CoordSystem.zbho);
417 alias OBHO = Interval!(CoordSystem.obho);
418 alias ZBC = Interval!(CoordSystem.zbc);
419 alias OBC = Interval!(CoordSystem.obc);
420 
421 alias ZeroBasedHalfOpen = Interval!(CoordSystem.zbho);
422 alias OneBasedHalfOpen = Interval!(CoordSystem.obho);
423 alias ZeroBasedClosed = Interval!(CoordSystem.zbc);
424 alias OneBasedClosed = Interval!(CoordSystem.obc);
425 
426 
427 debug(dhtslib_unittest) @safe @nogc nothrow pure unittest
428 {
429     auto c0 = Interval!(CoordSystem.zbho)(0, 100);
430     assert(c0.size == 100);
431 
432     auto c1 = c0.to!(CoordSystem.zbc);
433     auto c2 = c0.to!(CoordSystem.obc);
434     auto c3 = c0.to!(CoordSystem.obho);
435     auto c4 = c0.to!(CoordSystem.zbho);
436     
437     assert(c1 == ZBC(0, 99));
438     assert(c2 == OBC(1, 100));
439     assert(c3 == OBHO(1, 101));
440     assert(c4 == ZBHO(0, 100));
441     // ...
442 }
443 
444 debug(dhtslib_unittest) @safe @nogc nothrow pure unittest
445 {
446     ZB coord = ZB(1);
447     assert(coord + 1 == ZB(2));
448     assert(coord - 1 == ZB(0));
449 
450     ZBHO coords = ZBHO(1, 2);
451     assert(coords + 1 == ZBHO(2, 3));
452     assert(coords - 1 == ZBHO(0, 1));
453     // ...
454 }
455 
456 debug(dhtslib_unittest) @safe @nogc nothrow pure unittest
457 {
458     ZBHO coords = ZBHO(1, 5);
459     assert((coords & ZBHO(2, 6)) == ZBHO(2, 5));
460     assert((coords | ZBHO(3, 10)) == ZBHO(1, 10));
461     assert((coords | ZBHO(6, 10)) == ZBHO(0, 0));
462     // ...
463 }
464 
465 
466 debug(dhtslib_unittest) @safe @nogc nothrow pure unittest
467 {
468     auto c0 = Interval!(CoordSystem.zbho)(0, 100);
469     assert(c0.size == 100);
470 
471     auto c1 = c0.to!(CoordSystem.zbc);
472     auto c2 = c0.to!(CoordSystem.obc);
473     auto c3 = c0.to!(CoordSystem.obho);
474     auto c4 = c0.to!(CoordSystem.zbho);
475     
476     assert(c1 == ZBC(0, 99));
477     assert(c2 == OBC(1, 100));
478     assert(c3 == OBHO(1, 101));
479     assert(c4 == ZBHO(0, 100));
480 }
481 
482 debug(dhtslib_unittest) @safe @nogc nothrow pure unittest
483 {
484     auto c0 = ZBHO(0, 100);
485     assert(c0.size == 100);
486 
487     auto c1 = c0.to!ZBC;
488     auto c2 = c0.to!OBC;
489     auto c3 = c0.to!OBHO;
490     auto c4 = c0.to!ZBHO;
491     
492     assert(c1 == ZBC(0, 99));
493     assert(c2 == OBC(1, 100));
494     assert(c3 == OBHO(1, 101));
495     assert(c4 == ZBHO(0, 100));
496 }
497 
498 debug(dhtslib_unittest) @safe @nogc nothrow pure unittest
499 {
500     auto c0 = Interval!(CoordSystem.obho)(1, 101);
501     assert(c0.size == 100);
502 
503     auto c1 = c0.to!(CoordSystem.zbc);
504     auto c2 = c0.to!(CoordSystem.obc);
505     auto c3 = c0.to!(CoordSystem.obho);
506     auto c4 = c0.to!(CoordSystem.zbho);
507     
508     assert(c1 == ZBC(0, 99));
509     assert(c2 == OBC(1, 100));
510     assert(c3 == OBHO(1, 101));
511     assert(c4 == ZBHO(0, 100));
512     // ...
513 }
514 
515 debug(dhtslib_unittest) @safe @nogc nothrow pure unittest
516 {
517     ZBHO c0 = ZBHO(0, 100);
518     assert(c0.size == 100);
519 
520     auto c1 = c0.offset(50);
521     assert(c1 == ZBHO(50, 150));
522     assert((c0 & c1) == ZBHO(50, 100));
523     assert((c0 | c1) == ZBHO(0, 150));
524 
525     c1 = c0.offset(99);
526     assert(c1 == ZBHO(99, 199));
527 
528     assert((c0 & c1) == ZBHO(99, 100));
529     assert((c0 | c1) == ZBHO(0, 199));
530 }
531 
532 debug(dhtslib_unittest) unittest
533 {
534     OBC c0 = OBC(1, 100);
535     assert(c0.size == 100);
536 
537     auto c1 = c0.offset(50);
538     assert(c1 == OBC(51, 150));
539 
540     assert((c0 & c1) == OBC(51, 100));
541     assert((c0 | c1) == OBC(1, 150));
542 
543     c1 = c0.offset(99);
544     assert(c1 == OBC(100, 199));
545     import std.stdio;
546     writeln((c0 & c1));
547     assert((c0 & c1) == OBC(100, 100));
548     assert((c0 | c1) == OBC(1, 199));
549 }
550 
551 ///
552 // debug(dhtslib_unittest) unittest
553 // {
554 //     import dhtslib.sam;
555 //     import htslib.hts_log;
556 //     import std.path : buildPath, dirName;
557 //     import std.string : fromStringz;
558 //     import std.array : array; 
559 
560 //     hts_set_log_level(htsLogLevel.HTS_LOG_WARNING);
561 //     hts_log_info(__FUNCTION__, "Testing SAMFile & SAMRecord");
562 //     hts_log_info(__FUNCTION__, "Loading test file");
563 //     auto sam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))),"htslib","test","auxf#values.sam"), 0);
564     
565 //     auto reg = getIntervalFromString("sheila:1-10",sam.header);
566 //     assert(reg.tid == 0);
567 //     assert(reg.interval == ZBHO(0,11));
568 // }