1 /**
2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records.
3 */
4 module dhtslib.sam.cigar;
5 
6 import std.stdio;
7 import std.bitmanip : bitfields;
8 import std.array : join;
9 import std.algorithm : map;
10 import std.algorithm.iteration : each;
11 import std.conv : to;
12 import std.range : array;
13 import std.traits : isIntegral;
14 
15 import htslib.hts_log;
16 import dhtslib.sam.record : SAMRecord;
17 import dhtslib.coordinates;
18 import dhtslib.memory;
19 import htslib.sam : bam_get_cigar;
20 
21 /// Represents a CIGAR string
22 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6
23 struct Cigar
24 {
25     /// array of distinct CIGAR ops 
26     /// private now to force fixing some reference issues
27     private CigarOp[] ops;
28     private Bam1 b;
29     /// Construct Cigar from SAMRecord
30     this(Bam1 b)
31     {
32         this.b = b;
33         auto cigar = bam_get_cigar(this.b);
34         this(cigar, this.b.core.n_cigar);
35     }
36 
37     /// Construct Cigar from raw data
38     this(uint* cigar, uint length)
39     {
40         this((cast(CigarOp*) cigar)[0 .. length]);
41     }
42 
43     /// Construct Cigar from an array of CIGAR ops
44     this(CigarOp[] ops)
45     {
46         this.ops = ops;
47     }
48 
49     /// null CIGAR -- don't read!
50     bool is_null()
51     {
52         return this.ops.length == 0;
53     }
54 
55     /// Format Cigar struct as CIGAR string in accordance with SAM spec
56     string toString()
57     {
58         return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join;
59     }
60 
61     /// get length of cigar
62     auto length(){
63         return this.ops.length;
64     }
65     
66     /// set length of cigar
67     void length(T)(T len)
68     if(isIntegral!T)
69     {
70         this.ops.length = len;
71     }
72 
73     /// copy cigar
74     auto const dup(){
75         return Cigar(this.ops.dup);
76     }
77 
78     /// return the alignment length expressed by this Cigar
79     @property int refBasesCovered()
80     {
81         int len;
82         foreach (op; this.ops)
83         {
84             // look ma no branching (ignore the above foreach)
85             // add op.length if reference consuming
86             // mask is 0 if not reference consuming
87             // mask is -1 if reference consuming
88             len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1));
89         }
90         return len;
91     }
92 
93     deprecated("Use camelCase names instead")
94     alias ref_bases_covered = refBasesCovered;
95     /// previous alignedLength function had a bug and 
96     /// it is just a duplicate of ref_bases_covered
97     alias alignedLength = refBasesCovered;
98 
99     /// allow foreach on Cigar
100     int opApply(int delegate(ref CigarOp) dg) {
101         int result = 0;
102 
103         foreach (op; ops) {
104             result = dg(op);
105 
106             if (result) {
107                 break;
108             }
109         }
110 
111         return result;
112     }
113     
114     /// Get a Slice of cigar ops from a range in the Cigar string
115     auto opSlice()
116     {
117         return ops;
118     }
119 
120     /// Get a Slice of cigar ops from a range in the Cigar string
121     auto opSlice(T)(T start, T end) if (isIntegral!T)
122     {
123         return ops[start .. end];
124     }
125 
126     /// Get a cigar op from a single position in the Cigar string
127     ref auto opIndex(ulong i)
128     {
129         return ops[i];
130     }
131 
132     /// Assign a cigar op at a single position in the Cigar string
133     auto opIndexAssign(CigarOp value, size_t index)
134     {
135         return ops[index] = value;
136     }
137 
138     /// Assign a range cigar ops over a range in the Cigar string
139     auto opSliceAssign(T)(CigarOp[] values, T start, T end)
140     {
141         assert(end - start == values.length);
142         return ops[start .. end] = values;
143     }
144 
145     auto opAssign(T: CigarOp[])(T value)
146     {
147         this.ops = value;
148         return this;
149     }
150 
151     auto opAssign(T: Cigar)(T value)
152     {
153         this.ops = value.ops;
154         return this;
155     }
156 
157     size_t opDollar()
158     {
159         return length;
160     }
161 
162     auto opBinary(string op)(const Cigar rhs) const
163     if(op == "~")
164     {
165         return Cigar(this.dup[] ~ rhs.dup[]);
166     }
167 }
168 
169 // Each pair of bits has first bit set iff the operation is query consuming,
170 // and second bit set iff it is reference consuming.
171 //                                            X  =  P  H  S  N  D  I  M
172 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
173 
174 /// Represents a distinct cigar operation
175 union CigarOp
176 {
177     /// raw opcode
178     uint raw;
179 
180     mixin(bitfields!( //lower 4 bits store op
181             Ops, "op", 4, //higher 28 bits store length
182             uint, "length", 28));
183 
184     /// construct Op from raw opcode
185     this(uint raw)
186     {
187         this.raw = raw;
188     }
189 
190     /// construct Op from an operator and operand (length)
191     this(uint len, Ops op)
192     {
193         this.op = op;
194         this.length = len;
195     }
196     /// Credit to Biod for this code below
197     /// https://github.com/biod/BioD from their bam.cigar module
198     /// True iff operation is one of M, =, X, I, S
199     bool isQueryConsuming() @property const nothrow @nogc
200     {
201         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
202     }
203 
204     /// True iff operation is one of M, =, X, D, N
205     bool isReferenceConsuming() @property const nothrow @nogc
206     {
207         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
208     }
209 
210     /// True iff operation is one of M, =, X
211     bool isMatchOrMismatch() @property const nothrow @nogc
212     {
213         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
214     }
215 
216     /// True iff operation is one of 'S', 'H'
217     bool isClipping() @property const nothrow @nogc
218     {
219         return ((raw & 0xF) >> 1) == 2; // 4 or 5
220     }
221 
222     deprecated("Use camelCase names instead")
223     alias is_query_consuming = isQueryConsuming;
224     deprecated("Use camelCase names instead")
225     alias is_reference_consuming = isReferenceConsuming;
226     deprecated("Use camelCase names instead")
227     alias is_match_or_mismatch = isMatchOrMismatch;
228     deprecated("Use camelCase names instead")
229     alias is_clipping = isClipping;
230 
231     string toString(){
232         return this.length.to!string ~ CIGAR_STR[this.op];
233     }
234 }
235 
236 /**
237 Represents all ops
238 #define BAM_CIGAR_STR   "MIDNSHP=XB"
239 #define BAM_CMATCH      0
240 #define BAM_CINS        1
241 #define BAM_CDEL        2
242 #define BAM_CREF_SKIP   3
243 #define BAM_CSOFT_CLIP  4
244 #define BAM_CHARD_CLIP  5
245 #define BAM_CPAD        6
246 #define BAM_CEQUAL      7
247 #define BAM_CDIFF       8
248 #define BAM_CBACK       9
249 */
250 string CIGAR_STR = "MIDNSHP=XB";
251 /// ditto
252 enum Ops
253 {
254     MATCH = 0,
255     INS = 1,
256     DEL = 2,
257     REF_SKIP = 3,
258     SOFT_CLIP = 4,
259     HARD_CLIP = 5,
260     PAD = 6,
261     EQUAL = 7,
262     DIFF = 8,
263     BACK = 9
264 }
265 
266 debug (dhtslib_unittest) unittest
267 {
268     writeln();
269     import dhtslib.sam;
270     import htslib.hts_log;
271     import std.path : buildPath, dirName;
272 
273     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
274     hts_log_info(__FUNCTION__, "Testing cigar");
275     hts_log_info(__FUNCTION__, "Loading test file");
276     auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
277             "test", "range.bam"), 0);
278     auto readrange = bam["CHROMOSOME_I", ZB(914)];
279     hts_log_info(__FUNCTION__, "Getting read 1");
280     assert(readrange.empty == false);
281     auto read = readrange.front();
282     auto cigar = read.cigar;
283     writeln(read.queryName);
284     hts_log_info(__FUNCTION__, "Cigar:" ~ cigar.toString());
285     writeln(cigar.toString());
286     assert(cigar.toString() == "78M1D22M");
287     assert(cigar[0] == CigarOp(78, Ops.MATCH));
288     assert(Cigar(cigar[0 .. 2]).toString == "78M1D");
289     cigar[0] = CigarOp(4,Ops.HARD_CLIP);
290     cigar[1..3] = [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)];
291     assert(cigar[1 .. $] == [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]);
292     assert(cigar.toString() == "4H2I21M");
293 }
294 
295 /// return Cigar struct for a given CIGAR string (e.g. from SAM line)
296 Cigar cigarFromString(string cigar)
297 {
298     import std.regex;
299 
300     return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint,
301             m[2].to!char.charToOp)).array);
302 }
303 
304 /// Convert single char representing a CIGAR Op to Op union
305 Ops charToOp(char c)
306 {
307     foreach (i, o; CIGAR_STR)
308     {
309         if (c == o)
310         {
311             return cast(Ops) i;
312         }
313     }
314     return cast(Ops) 9;
315 }
316 
317 debug (dhtslib_unittest) unittest
318 {
319     writeln();
320     hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming");
321     string c = "130M2D40M";
322     auto cig = cigarFromString(c);
323     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
324     assert(cig.toString() == c);
325     assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming);
326     assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming);
327 }
328 
329 /// Range-based iteration of a Cigar string
330 /// Returns a range of Ops that is the length of all op lengths
331 /// e.g. if the Cigar is 4M5D2I4M
332 /// CigarItr will return a range of MMMMDDDDDIIMMMM
333 /// Range is of the Ops enum not chars
334 struct CigarItr
335 {
336     Cigar cigar;
337     CigarOp current;
338 
339     this(Cigar c)
340     {
341         // Copy the cigar
342         cigar = c.dup;
343         current = cigar[0];
344         current.length = current.length - 1;
345     }
346 
347     Ops front()
348     {
349         return current.op;
350     }
351 
352     void popFront()
353     {
354         // import std.stdio;
355         // writeln(current);
356         // writeln(cigar);
357         if(cigar.length == 1 && current.length == 0)
358             cigar.length = 0;
359         else if (current.length == 0)
360         {
361             cigar = cigar[1 .. $];
362             current = cigar[0];
363             current.length = current.length - 1;
364         }
365         else if (current.length != 0)
366             current.length = current.length - 1;
367     }
368 
369     bool empty()
370     {
371         return cigar.length == 0;
372     }
373 }
374 
375 debug (dhtslib_unittest) unittest
376 {
377     import std.algorithm : map;
378 
379     writeln();
380     hts_log_info(__FUNCTION__, "Testing CigarItr");
381     string c = "7M2D4M";
382     auto cig = cigarFromString(c);
383     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
384     auto itr = CigarItr(cig);
385     assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM");
386 }
387 
388 /// Coordinate pair representing aligned query and reference positions,
389 /// and the CIGAR op at or effecting their alignment.
390 struct AlignedCoordinate
391 {
392     Coordinate!(Basis.zero) qpos, rpos;
393     Ops cigar_op;
394 }
395 
396 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string
397 struct AlignedCoordinatesItr
398 {
399     CigarItr itr;
400     AlignedCoordinate current;
401 
402     this(Cigar cigar)
403     {
404         itr = CigarItr(cigar);
405         current.qpos = current.rpos = Coordinate!(Basis.zero)(-1);
406         current.cigar_op = itr.front;
407         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
408         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
409     }
410 
411     /// InputRange interface
412     AlignedCoordinate front()
413     {
414         return current;
415     }
416 
417     /// InputRange interface
418     void popFront()
419     {
420         itr.popFront;
421         current.cigar_op = itr.front;
422         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
423         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
424     }
425 
426     /// InputRange interface
427     bool empty()
428     {
429         return itr.empty;
430     }
431 }
432 
433 debug (dhtslib_unittest) unittest
434 {
435     writeln();
436     import dhtslib.sam;
437     import htslib.hts_log;
438     import std.path : buildPath, dirName;
439     import std.algorithm : map;
440     import std.array : array;
441 
442     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
443     hts_log_info(__FUNCTION__, "Testing cigar");
444     hts_log_info(__FUNCTION__, "Loading test file");
445     auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
446             "test", "range.bam"), 0);
447     auto readrange = bam["CHROMOSOME_I", ZB(914)];
448     hts_log_info(__FUNCTION__, "Getting read 1");
449     assert(readrange.empty == false);
450     auto read = readrange.front();
451     auto coords = read.getAlignedCoordinates(77, 82);
452     assert(coords.map!(x => x.rpos).array == [77,78,79,80,81]);
453 }