1 /**
2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records.
3 */
4 module dhtslib.sam.cigar;
5 
6 import std.stdio;
7 import std.bitmanip : bitfields;
8 import std.array : join;
9 import std.algorithm : map, each, sum;
10 import std.conv : to;
11 import std.range : array;
12 import std.traits : isIntegral;
13 
14 import htslib.hts_log;
15 import dhtslib.sam.record : SAMRecord;
16 import dhtslib.coordinates;
17 import dhtslib.memory;
18 import htslib.sam : bam_get_cigar;
19 
20 
21 /**
22 Represents all cigar ops
23 as a friendlier alternative to BAM_C htslib enums
24 */
25 enum Ops
26 {
27     MATCH = 0,
28     INS = 1,
29     DEL = 2,
30     REF_SKIP = 3,
31     SOFT_CLIP = 4,
32     HARD_CLIP = 5,
33     PAD = 6,
34     EQUAL = 7,
35     DIFF = 8,
36     BACK = 9
37 }
38 
39 /**
40 string to aid in conversion from CigarOp to string
41 */
42 static immutable string CIGAR_STR = "MIDNSHP=XB";
43 
44 /// Credit to Biod for this code below
45 /// https://github.com/biod/BioD from their bam.cigar module
46 /** 
47     uint to aid in identifying if a cigar op is query consuming 
48     or reference consuming.
49 
50     Each pair of bits has first bit set iff the operation is query consuming,
51     and second bit set iff it is reference consuming.
52                                              X  =  P  H  S  N  D  I  M
53 */                                            
54 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
55 
56 /// True iff operation is one of M, =, X, I, S
57 bool isQueryConsuming(T)(T op) nothrow @nogc
58 {
59     return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 1) != 0;
60 }
61 
62 /// True iff operation is one of M, =, X, D, N
63 bool isReferenceConsuming(T)(T op)nothrow @nogc
64 {
65     return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 2) != 0;
66 }
67 
68 /// True iff operation is one of M, =, X
69 bool isMatchOrMismatch(T)(T op) nothrow @nogc
70 {
71     return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 3) == 3;
72 }
73 
74 /// True iff operation is one of 'S', 'H'
75 bool isClipping(T)(T op)nothrow @nogc
76 {
77     return ((op & 0xF) >> 1) == 2; // 4 or 5
78 }
79 
80 /// Represents a singular cigar operation
81 /// as a uint
82 union CigarOp
83 {
84     /// raw opcode
85     uint raw;
86 
87     mixin(bitfields!( //lower 4 bits store op
88             Ops, "op", 4, //higher 28 bits store length
89             uint, "length", 28));
90 
91     /// construct Op from raw opcode
92     this(uint raw)
93     {
94         this.raw = raw;
95     }
96 
97     /// construct Op from an operator and operand (length)
98     this(uint len, Ops op)
99     {
100         this.op = op;
101         this.length = len;
102     }
103 
104     string toString() const
105     {
106         return this.length.to!string ~ CIGAR_STR[this.op];
107     }
108 }
109 
110 /** 
111     Represents a CIGAR string as an array of CigarOps. 
112 
113     https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6
114 
115     In many cases will represent a slice or reference to 
116     underlying cigar data in the bam record 
117     UNLESS it is copied with .dup or one of the assignment
118     methods.    
119 */ 
120 struct Cigar
121 {
122     /// array of distinct CIGAR ops 
123     /// private now to force fixing some reference issues
124     private CigarOp[] ops;
125     private Bam1 b;
126     /// Construct Cigar from SAMRecord
127     this(Bam1 b)
128     {
129         this.b = b;
130         auto cigar = bam_get_cigar(this.b);
131         this(cigar, this.b.core.n_cigar);
132     }
133 
134     /// Construct Cigar from raw data
135     this(uint* cigar, uint length)
136     {
137         this((cast(CigarOp*) cigar)[0 .. length]);
138     }
139 
140     /// Construct Cigar from an array of CIGAR ops
141     this(CigarOp[] ops)
142     {
143         this.ops = ops;
144     }
145 
146     /// null CIGAR -- don't read!
147     bool is_null()
148     {
149         return this.ops.length == 0;
150     }
151 
152     /// Format Cigar struct as CIGAR string in accordance with SAM spec
153     string toString() const
154     {
155         return ops.map!(x => x.toString).array.join;
156     }
157 
158     /// get length of cigar
159     auto length(){
160         return this.ops.length;
161     }
162     
163     /// set length of cigar
164     void length(T)(T len)
165     if(isIntegral!T)
166     {
167         this.ops.length = len;
168     }
169 
170     /// copy cigar
171     auto const dup(){
172         return Cigar(this.ops.dup);
173     }
174 
175     /// return the alignment length expressed by this Cigar
176     @property int refBasesCovered()
177     {
178         int len;
179         foreach (op; this.ops)
180         {
181             // look ma no branching (ignore the above foreach)
182             // add op.length if reference consuming
183             // mask is 0 if not reference consuming
184             // mask is -1 if reference consuming
185             len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1));
186         }
187         return len;
188     }
189 
190     deprecated("Use camelCase names instead")
191     alias ref_bases_covered = refBasesCovered;
192     /// previous alignedLength function had a bug and 
193     /// it is just a duplicate of ref_bases_covered
194     alias alignedLength = refBasesCovered;
195 
196     /// allow foreach on Cigar
197     int opApply(int delegate(ref CigarOp) dg) {
198         int result = 0;
199 
200         foreach (op; ops) {
201             result = dg(op);
202 
203             if (result) {
204                 break;
205             }
206         }
207 
208         return result;
209     }
210     
211     /// Get a Slice of cigar ops from a range in the Cigar string
212     auto opSlice()
213     {
214         return ops;
215     }
216 
217     /// Get a Slice of cigar ops from a range in the Cigar string
218     auto opSlice(T)(T start, T end) if (isIntegral!T)
219     {
220         return ops[start .. end];
221     }
222 
223     /// Get a cigar op from a single position in the Cigar string
224     ref auto opIndex(ulong i)
225     {
226         return ops[i];
227     }
228 
229     /// Assign a cigar op at a single position in the Cigar string
230     auto opIndexAssign(CigarOp value, size_t index)
231     {
232         return ops[index] = value;
233     }
234 
235     /// Assign a range cigar ops over a range in the Cigar string
236     auto opSliceAssign(T)(CigarOp[] values, T start, T end)
237     {
238         assert(end - start == values.length);
239         return ops[start .. end] = values;
240     }
241 
242     /// Assign a Cigar with a CigarOp slice
243     /// Note: this is not a deep copy
244     auto opAssign(T: CigarOp[])(T value)
245     {
246         this.ops = value;
247         return this;
248     }
249 
250     /// Assign a Cigar with a Cigar
251     /// Note: this is not a deep copy
252     auto opAssign(T: Cigar)(T value)
253     {
254         this.ops = value.ops;
255         return this;
256     }
257 
258     /// use $ with Cigar slicing
259     size_t opDollar()
260     {
261         return length;
262     }
263 
264     /// combine Cigars
265     auto opBinary(string op)(const Cigar rhs) const
266     if(op == "~")
267     {
268         return Cigar(this.dup[] ~ rhs.dup[]);
269     }
270 }
271 
272 
273 debug (dhtslib_unittest) unittest
274 {
275     writeln();
276     import dhtslib.sam;
277     import htslib.hts_log;
278     import std.path : buildPath, dirName;
279 
280     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
281     hts_log_info(__FUNCTION__, "Testing cigar");
282     hts_log_info(__FUNCTION__, "Loading test file");
283     auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
284             "test", "range.bam"), 0);
285     auto readrange = bam["CHROMOSOME_I", ZB(914)];
286     hts_log_info(__FUNCTION__, "Getting read 1");
287     assert(readrange.empty == false);
288     auto read = readrange.front();
289     auto cigar = read.cigar;
290     writeln(read.queryName);
291     hts_log_info(__FUNCTION__, "Cigar:" ~ cigar.toString());
292     writeln(cigar.toString());
293     assert(cigar.toString() == "78M1D22M");
294     assert(cigar[0] == CigarOp(78, Ops.MATCH));
295     assert(Cigar(cigar[0 .. 2]).toString == "78M1D");
296     cigar[0] = CigarOp(4,Ops.HARD_CLIP);
297     cigar[1..3] = [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)];
298     assert(cigar[1 .. $] == [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]);
299     assert(cigar.toString() == "4H2I21M");
300 }
301 
302 /// return Cigar struct for a given CIGAR string (e.g. from SAM line)
303 Cigar cigarFromString(string cigar)
304 {
305     import std.regex;
306 
307     return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint,
308             m[2].to!char.charToOp)).array);
309 }
310 
311 /// Convert single char representing a CIGAR Op to Op union
312 Ops charToOp(char c)
313 {
314     foreach (i, o; CIGAR_STR)
315     {
316         if (c == o)
317         {
318             return cast(Ops) i;
319         }
320     }
321     return cast(Ops) 9;
322 }
323 
324 debug (dhtslib_unittest) unittest
325 {
326     writeln();
327     hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming");
328     string c = "130M2D40M";
329     auto cig = cigarFromString(c);
330     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
331     assert(cig.toString() == c);
332     assert(cig.ops[0].op.isQueryConsuming && cig.ops[0].op.isReferenceConsuming);
333     assert(!cig.ops[1].op.isQueryConsuming && cig.ops[1].op.isReferenceConsuming);
334 }
335 
336 /// Range-based iteration of a Cigar string
337 /// Returns a range of Ops that is the length of all op lengths
338 /// e.g. if the Cigar is 4M5D2I4M
339 /// CigarItr will return a range of MMMMDDDDDIIMMMM
340 /// Range is of the Ops enum not chars
341 struct CigarItr
342 {
343     Cigar cigar;
344     CigarOp[] ops;
345     CigarOp current;
346 
347     this(Cigar c)
348     {
349         // Copy the cigar
350         cigar = c;
351         ops = cigar[];
352         current = ops[0];
353         current.length = current.length - 1;
354     }
355 
356     Ops front()
357     {
358         return current.op;
359     }
360 
361     void popFront()
362     {
363         // import std.stdio;
364         // writeln(current);
365         // writeln(cigar);
366         if(ops.length == 1 && current.length == 0)
367             ops = [];
368         else if (current.length == 0)
369         {
370             ops = ops[1 .. $];
371             current = ops[0];
372             current.length = current.length - 1;
373         }
374         else if (current.length != 0)
375             current.length = current.length - 1;
376     }
377 
378     bool empty()
379     {
380         return ops.length == 0;
381     }
382 }
383 
384 debug (dhtslib_unittest) unittest
385 {
386     import std.algorithm : map;
387 
388     writeln();
389     hts_log_info(__FUNCTION__, "Testing CigarItr");
390     string c = "7M2D4M";
391     auto cig = cigarFromString(c);
392     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
393     auto itr = CigarItr(cig);
394     assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM");
395 }
396 
397 /// Coordinate pair representing aligned query and reference positions,
398 /// and the CIGAR op at or effecting their alignment.
399 struct AlignedCoordinate
400 {
401     Coordinate!(Basis.zero) qpos, rpos;
402     Ops cigar_op;
403 }
404 
405 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string
406 struct AlignedCoordinatesItr
407 {
408     CigarItr itr;
409     AlignedCoordinate current;
410 
411     this(Cigar cigar)
412     {
413         itr = CigarItr(cigar);
414         current.qpos = current.rpos = Coordinate!(Basis.zero)(-1);
415         current.cigar_op = itr.front;
416         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
417         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
418     }
419 
420     /// InputRange interface
421     AlignedCoordinate front()
422     {
423         return current;
424     }
425 
426     /// InputRange interface
427     void popFront()
428     {
429         itr.popFront;
430         current.cigar_op = itr.front;
431         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
432         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
433     }
434 
435     /// InputRange interface
436     bool empty()
437     {
438         return itr.empty;
439     }
440 }
441 
442 debug (dhtslib_unittest) unittest
443 {
444     writeln();
445     import dhtslib.sam;
446     import htslib.hts_log;
447     import std.path : buildPath, dirName;
448     import std.algorithm : map;
449     import std.array : array;
450 
451     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
452     hts_log_info(__FUNCTION__, "Testing cigar");
453     hts_log_info(__FUNCTION__, "Loading test file");
454     auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
455             "test", "range.bam"), 0);
456     auto readrange = bam["CHROMOSOME_I", ZB(914)];
457     hts_log_info(__FUNCTION__, "Getting read 1");
458     assert(readrange.empty == false);
459     auto read = readrange.front();
460     auto coords = read.getAlignedCoordinates(77, 82);
461     assert(coords.map!(x => x.rpos).array == [77,78,79,80,81]);
462 }