1 /**
2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records.
3 */
4 module dhtslib.sam.cigar;
5 
6 import std.stdio;
7 import std.bitmanip : bitfields;
8 import std.array : join;
9 import std.algorithm : map, each, sum;
10 import std.conv : to;
11 import std.range : array;
12 import std.traits : isIntegral;
13 
14 import htslib.hts_log;
15 import dhtslib.sam.record : SAMRecord;
16 import dhtslib.coordinates;
17 import dhtslib.memory;
18 import htslib.sam : bam_get_cigar;
19 
20 
21 /**
22 Represents all cigar ops
23 as a friendlier alternative to BAM_C htslib enums
24 */
25 enum Ops
26 {
27     MATCH = 0,
28     INS = 1,
29     DEL = 2,
30     REF_SKIP = 3,
31     SOFT_CLIP = 4,
32     HARD_CLIP = 5,
33     PAD = 6,
34     EQUAL = 7,
35     DIFF = 8,
36     BACK = 9
37 }
38 
39 /**
40 string to aid in conversion from CigarOp to string
41 */
42 static immutable string CIGAR_STR = "MIDNSHP=XB";
43 
44 /// Credit to Biod for this code below
45 /// https://github.com/biod/BioD from their bam.cigar module
46 /** 
47     uint to aid in identifying if a cigar op is query consuming 
48     or reference consuming.
49 
50     Each pair of bits has first bit set iff the operation is query consuming,
51     and second bit set iff it is reference consuming.
52                                              X  =  P  H  S  N  D  I  M
53 */                                            
54 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
55 
56 /// True iff operation is one of M, =, X, I, S
57 bool isQueryConsuming(T)(T op) nothrow @nogc
58 {
59     return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 1) != 0;
60 }
61 
62 /// True iff operation is one of M, =, X, D, N
63 bool isReferenceConsuming(T)(T op)nothrow @nogc
64 {
65     return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 2) != 0;
66 }
67 
68 /// True iff operation is one of M, =, X
69 bool isMatchOrMismatch(T)(T op) nothrow @nogc
70 {
71     return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 3) == 3;
72 }
73 
74 /// True iff operation is one of 'S', 'H'
75 bool isClipping(T)(T op)nothrow @nogc
76 {
77     return ((op & 0xF) >> 1) == 2; // 4 or 5
78 }
79 
80 /// Represents a singular cigar operation
81 /// as a uint
82 union CigarOp
83 {
84     /// raw opcode
85     uint raw;
86 
87     mixin(bitfields!( //lower 4 bits store op
88             Ops, "op", 4, //higher 28 bits store length
89             uint, "length", 28));
90 
91     /// construct Op from raw opcode
92     this(uint raw)
93     {
94         this.raw = raw;
95     }
96 
97     /// construct Op from an operator and operand (length)
98     this(uint len, Ops op)
99     {
100         this.op = op;
101         this.length = len;
102     }
103 
104     string toString() const
105     {
106         return this.length.to!string ~ CIGAR_STR[this.op];
107     }
108 }
109 
110 /** 
111     Represents a CIGAR string as an array of CigarOps. 
112 
113     https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6
114 
115     In many cases will represent a slice or reference to 
116     underlying cigar data in the bam record 
117     UNLESS it is copied with .dup or one of the assignment
118     methods.    
119 */ 
120 struct Cigar
121 {
122     /// array of distinct CIGAR ops 
123     /// private now to force fixing some reference issues
124     private CigarOp[] ops;
125     private Bam1 b;
126     /// Construct Cigar from SAMRecord
127     this(Bam1 b)
128     {
129         this.b = b;
130         auto cigar = bam_get_cigar(this.b);
131         this(cigar, this.b.core.n_cigar);
132     }
133 
134     /// Construct Cigar from raw data
135     this(uint* cigar, uint length)
136     {
137         this((cast(CigarOp*) cigar)[0 .. length]);
138     }
139 
140     /// Construct Cigar from an array of CIGAR ops
141     this(CigarOp[] ops)
142     {
143         this.ops = ops;
144     }
145 
146     /// Explicit postblit to avoid 
147     /// https://github.com/blachlylab/dhtslib/issues/122
148     this(this) nothrow
149     {
150         this.b = b;
151         this.ops = ops;   
152     }
153 
154     /// null CIGAR -- don't read!
155     bool is_null()
156     {
157         return this.ops.length == 0;
158     }
159 
160     /// Format Cigar struct as CIGAR string in accordance with SAM spec
161     string toString() const
162     {
163         return ops.map!(x => x.toString).array.join;
164     }
165 
166     /// get length of cigar
167     auto length(){
168         return this.ops.length;
169     }
170     
171     /// set length of cigar
172     void length(T)(T len)
173     if(isIntegral!T)
174     {
175         this.ops.length = len;
176     }
177 
178     /// copy cigar
179     auto const dup(){
180         return Cigar(this.ops.dup);
181     }
182 
183     /// return the alignment length expressed by this Cigar
184     @property int refBasesCovered()
185     {
186         int len;
187         foreach (op; this.ops)
188         {
189             // look ma no branching (ignore the above foreach)
190             // add op.length if reference consuming
191             // mask is 0 if not reference consuming
192             // mask is -1 if reference consuming
193             len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1));
194         }
195         return len;
196     }
197 
198     deprecated("Use camelCase names instead")
199     alias ref_bases_covered = refBasesCovered;
200     /// previous alignedLength function had a bug and 
201     /// it is just a duplicate of ref_bases_covered
202     alias alignedLength = refBasesCovered;
203 
204     /// allow foreach on Cigar
205     int opApply(int delegate(ref CigarOp) dg) {
206         int result = 0;
207 
208         foreach (op; ops) {
209             result = dg(op);
210 
211             if (result) {
212                 break;
213             }
214         }
215 
216         return result;
217     }
218     
219     /// Get a Slice of cigar ops from a range in the Cigar string
220     auto opSlice()
221     {
222         return ops;
223     }
224 
225     /// Get a Slice of cigar ops from a range in the Cigar string
226     auto opSlice(T)(T start, T end) if (isIntegral!T)
227     {
228         return ops[start .. end];
229     }
230 
231     /// Get a cigar op from a single position in the Cigar string
232     ref auto opIndex(ulong i)
233     {
234         return ops[i];
235     }
236 
237     /// Assign a cigar op at a single position in the Cigar string
238     auto opIndexAssign(CigarOp value, size_t index)
239     {
240         return ops[index] = value;
241     }
242 
243     /// Assign a range cigar ops over a range in the Cigar string
244     auto opSliceAssign(T)(CigarOp[] values, T start, T end)
245     {
246         assert(end - start == values.length);
247         return ops[start .. end] = values;
248     }
249 
250     /// Assign a Cigar with a CigarOp slice
251     /// Note: this is not a deep copy
252     auto opAssign(T: CigarOp[])(T value)
253     {
254         this.ops = value;
255         return this;
256     }
257 
258     /// Assign a Cigar with a Cigar
259     /// Note: this is not a deep copy
260     auto opAssign(T: Cigar)(T value)
261     {
262         this.ops = value.ops;
263         return this;
264     }
265 
266     /// use $ with Cigar slicing
267     size_t opDollar()
268     {
269         return length;
270     }
271 
272     /// combine Cigars
273     auto opBinary(string op)(const Cigar rhs) const
274     if(op == "~")
275     {
276         return Cigar(this.dup[] ~ rhs.dup[]);
277     }
278 }
279 
280 
281 debug (dhtslib_unittest) unittest
282 {
283     writeln();
284     import dhtslib.sam;
285     import htslib.hts_log;
286     import std.path : buildPath, dirName;
287 
288     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
289     hts_log_info(__FUNCTION__, "Testing cigar");
290     hts_log_info(__FUNCTION__, "Loading test file");
291     auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
292             "test", "range.bam"), 0);
293     auto readrange = bam["CHROMOSOME_I", ZB(914)];
294     hts_log_info(__FUNCTION__, "Getting read 1");
295     assert(readrange.empty == false);
296     auto read = readrange.front();
297     auto cigar = read.cigar;
298     writeln(read.queryName);
299     hts_log_info(__FUNCTION__, "Cigar:" ~ cigar.toString());
300     writeln(cigar.toString());
301     assert(cigar.toString() == "78M1D22M");
302     assert(cigar[0] == CigarOp(78, Ops.MATCH));
303     assert(Cigar(cigar[0 .. 2]).toString == "78M1D");
304     cigar[0] = CigarOp(4,Ops.HARD_CLIP);
305     cigar[1..3] = [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)];
306     assert(cigar[1 .. $] == [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]);
307     assert(cigar.toString() == "4H2I21M");
308 }
309 
310 /// return Cigar struct for a given CIGAR string (e.g. from SAM line)
311 Cigar cigarFromString(string cigar)
312 {
313     import std.regex;
314 
315     return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint,
316             m[2].to!char.charToOp)).array);
317 }
318 
319 /// Convert single char representing a CIGAR Op to Op union
320 Ops charToOp(char c)
321 {
322     foreach (i, o; CIGAR_STR)
323     {
324         if (c == o)
325         {
326             return cast(Ops) i;
327         }
328     }
329     return cast(Ops) 9;
330 }
331 
332 debug (dhtslib_unittest) unittest
333 {
334     writeln();
335     hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming");
336     string c = "130M2D40M";
337     auto cig = cigarFromString(c);
338     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
339     assert(cig.toString() == c);
340     assert(cig.ops[0].op.isQueryConsuming && cig.ops[0].op.isReferenceConsuming);
341     assert(!cig.ops[1].op.isQueryConsuming && cig.ops[1].op.isReferenceConsuming);
342 }
343 
344 /// Range-based iteration of a Cigar string
345 /// Returns a range of Ops that is the length of all op lengths
346 /// e.g. if the Cigar is 4M5D2I4M
347 /// CigarItr will return a range of MMMMDDDDDIIMMMM
348 /// Range is of the Ops enum not chars
349 struct CigarItr
350 {
351     Cigar cigar;
352     CigarOp[] ops;
353     CigarOp current;
354 
355     this(Cigar c)
356     {
357         // Copy the cigar
358         cigar = c;
359         ops = cigar[];
360         current = ops[0];
361         current.length = current.length - 1;
362     }
363 
364     Ops front()
365     {
366         return current.op;
367     }
368 
369     void popFront()
370     {
371         // import std.stdio;
372         // writeln(current);
373         // writeln(cigar);
374         if(ops.length == 1 && current.length == 0)
375             ops = [];
376         else if (current.length == 0)
377         {
378             ops = ops[1 .. $];
379             current = ops[0];
380             current.length = current.length - 1;
381         }
382         else if (current.length != 0)
383             current.length = current.length - 1;
384     }
385 
386     bool empty()
387     {
388         return ops.length == 0;
389     }
390 }
391 
392 debug (dhtslib_unittest) unittest
393 {
394     import std.algorithm : map;
395 
396     writeln();
397     hts_log_info(__FUNCTION__, "Testing CigarItr");
398     string c = "7M2D4M";
399     auto cig = cigarFromString(c);
400     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
401     auto itr = CigarItr(cig);
402     assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM");
403 }
404 
405 /// Coordinate pair representing aligned query and reference positions,
406 /// and the CIGAR op at or effecting their alignment.
407 struct AlignedCoordinate
408 {
409     Coordinate!(Basis.zero) qpos, rpos;
410     Ops cigar_op;
411 }
412 
413 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string
414 struct AlignedCoordinatesItr
415 {
416     CigarItr itr;
417     AlignedCoordinate current;
418 
419     this(Cigar cigar)
420     {
421         itr = CigarItr(cigar);
422         current.qpos = current.rpos = Coordinate!(Basis.zero)(-1);
423         current.cigar_op = itr.front;
424         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
425         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
426     }
427 
428     /// InputRange interface
429     AlignedCoordinate front()
430     {
431         return current;
432     }
433 
434     /// InputRange interface
435     void popFront()
436     {
437         itr.popFront;
438         current.cigar_op = itr.front;
439         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
440         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
441     }
442 
443     /// InputRange interface
444     bool empty()
445     {
446         return itr.empty;
447     }
448 }
449 
450 debug (dhtslib_unittest) unittest
451 {
452     writeln();
453     import dhtslib.sam;
454     import htslib.hts_log;
455     import std.path : buildPath, dirName;
456     import std.algorithm : map;
457     import std.array : array;
458 
459     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
460     hts_log_info(__FUNCTION__, "Testing cigar");
461     hts_log_info(__FUNCTION__, "Loading test file");
462     auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
463             "test", "range.bam"), 0);
464     auto readrange = bam["CHROMOSOME_I", ZB(914)];
465     hts_log_info(__FUNCTION__, "Getting read 1");
466     assert(readrange.empty == false);
467     auto read = readrange.front();
468     auto coords = read.getAlignedCoordinates(77, 82);
469     assert(coords.map!(x => x.rpos).array == [77,78,79,80,81]);
470 }