1 /**
2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records.
3 */
4 module dhtslib.cigar;
5 
6 import std.stdio;
7 import std.bitmanip : bitfields;
8 import std.array : join;
9 import std.algorithm : map;
10 import std.algorithm.iteration : each;
11 import std.conv : to;
12 import std.range : array;
13 
14 import htslib.hts_log;
15 import dhtslib.sam : SAMRecord;
16 
17 /// Represents a CIGAR string
18 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6
19 struct Cigar
20 {
21     /// array of distinct CIGAR ops 
22     CigarOp[] ops;
23 
24     /// Construct Cigar from raw data
25     this(uint* cigar, uint length)
26     {
27         ops = (cast(CigarOp*) cigar)[0 .. length];
28     }
29 
30     /// Construct Cigar from an array of CIGAR ops
31     this(CigarOp[] ops)
32     {
33         this.ops = ops;
34     }
35 
36     bool is_null()
37     {
38         return ops.length == 1 && ops[0].raw == '*';
39     }
40 
41     /// Format Cigar struct as CIGAR string in accordance with SAM spec
42     string toString()
43     {
44         return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join;
45     }
46 
47     /// return the alignment length expressed by this Cigar
48     /// TODO: use CIGAR_TYPE to get rid of switch statement for less branching
49     @property int refBasesCovered()
50     {
51         int len;
52         foreach (op; this.ops)
53         {
54             switch (op.op)
55             {
56             case Ops.MATCH:
57             case Ops.EQUAL:
58             case Ops.DIFF:
59             case Ops.DEL:
60             case Ops.REF_SKIP:
61                 len += op.length;
62                 break;
63             default:
64                 break;
65             }
66         }
67         return len;
68     }
69 
70     alias ref_bases_covered = refBasesCovered;
71     /// previous alignedLength function had a bug and 
72     /// it is just a duplicate of ref_bases_covered
73     alias alignedLength = refBasesCovered;
74 }
75 
76 // Each pair of bits has first bit set iff the operation is query consuming,
77 // and second bit set iff it is reference consuming.
78 //                                            X  =  P  H  S  N  D  I  M
79 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
80 
81 /// Represents a distinct cigar operation
82 union CigarOp
83 {
84     /// raw opcode
85     uint raw;
86 
87     mixin(bitfields!( //lower 4 bits store op
88             Ops, "op", 4, //higher 28 bits store length
89             uint, "length", 28));
90 
91     /// construct Op from raw opcode
92     this(uint raw)
93     {
94         this.raw = raw;
95     }
96 
97     /// construct Op from an operator and operand (length)
98     this(uint len, Ops op)
99     {
100         this.op = op;
101         this.length = len;
102     }
103     /// Credit to Biod for this code below
104     /// https://github.com/biod/BioD from their bam.cigar module
105     /// True iff operation is one of M, =, X, I, S
106     bool isQueryConsuming() @property const nothrow @nogc
107     {
108         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
109     }
110 
111     /// True iff operation is one of M, =, X, D, N
112     bool isReferenceConsuming() @property const nothrow @nogc
113     {
114         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
115     }
116 
117     /// True iff operation is one of M, =, X
118     bool isMatchOrMismatch() @property const nothrow @nogc
119     {
120         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
121     }
122 
123     /// True iff operation is one of 'S', 'H'
124     bool isClipping() @property const nothrow @nogc
125     {
126         return ((raw & 0xF) >> 1) == 2; // 4 or 5
127     }
128 
129     alias is_query_consuming = isQueryConsuming;
130     alias is_reference_consuming = isReferenceConsuming;
131     alias is_match_or_mismatch = isMatchOrMismatch;
132     alias is_clipping = isClipping;
133 }
134 
135 /**
136 Represents all ops
137 #define BAM_CIGAR_STR   "MIDNSHP=XB"
138 #define BAM_CMATCH      0
139 #define BAM_CINS        1
140 #define BAM_CDEL        2
141 #define BAM_CREF_SKIP   3
142 #define BAM_CSOFT_CLIP  4
143 #define BAM_CHARD_CLIP  5
144 #define BAM_CPAD        6
145 #define BAM_CEQUAL      7
146 #define BAM_CDIFF       8
147 #define BAM_CBACK       9
148 */
149 string CIGAR_STR = "MIDNSHP=XB";
150 /// ditto
151 enum Ops
152 {
153     MATCH = 0,
154     INS = 1,
155     DEL = 2,
156     REF_SKIP = 3,
157     SOFT_CLIP = 4,
158     HARD_CLIP = 5,
159     PAD = 6,
160     EQUAL = 7,
161     DIFF = 8,
162     BACK = 9
163 }
164 
165 debug (dhtslib_unittest) unittest
166 {
167     writeln();
168     import dhtslib.sam;
169     import htslib.hts_log;
170     import std.path : buildPath, dirName;
171 
172     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
173     hts_log_info(__FUNCTION__, "Testing cigar");
174     hts_log_info(__FUNCTION__, "Loading test file");
175     auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))), "htslib",
176             "test", "range.bam"), 0);
177     auto readrange = bam["CHROMOSOME_I", 914];
178     hts_log_info(__FUNCTION__, "Getting read 1");
179     auto read = readrange.front();
180     writeln(read.queryName);
181     hts_log_info(__FUNCTION__, "Cigar:" ~ read.cigar.toString());
182     writeln(read.cigar.toString());
183     assert(read.cigar.toString() == "78M1D22M");
184 }
185 
186 /// return Cigar struct for a given CIGAR string (e.g. from SAM line)
187 Cigar cigarFromString(string cigar)
188 {
189     import std.regex;
190 
191     return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint,
192             m[2].to!char.charToOp)).array);
193 }
194 
195 /// Convert single char representing a CIGAR Op to Op union
196 Ops charToOp(char c)
197 {
198     foreach (i, o; CIGAR_STR)
199     {
200         if (c == o)
201         {
202             return cast(Ops) i;
203         }
204     }
205     return cast(Ops) 9;
206 }
207 
208 debug (dhtslib_unittest) unittest
209 {
210     writeln();
211     hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming");
212     string c = "130M2D40M";
213     auto cig = cigarFromString(c);
214     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
215     assert(cig.toString() == c);
216     assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming);
217     assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming);
218 }
219 
220 /// Range-based iteration of a Cigar string
221 /// Returns a range of Ops that is the length of all op lengths
222 /// e.g. if the Cigar is 4M5D2I4M
223 /// CigarItr will return a range of MMMMDDDDDIIMMMM
224 /// Range is of the Ops enum not chars
225 struct CigarItr
226 {
227     Cigar cigar;
228     CigarOp current;
229 
230     this(Cigar c)
231     {
232         // Copy the cigar
233         cigar.ops = c.ops.dup;
234         current = cigar.ops[0];
235         current.length = current.length - 1;
236     }
237 
238     Ops front()
239     {
240         return current.op;
241     }
242 
243     void popFront()
244     {
245         if (current.length == 0)
246         {
247             cigar.ops = cigar.ops[1 .. $];
248             if (!empty)
249                 current = cigar.ops[0];
250         }
251         if (current.length != 0)
252             current.length = current.length - 1;
253     }
254 
255     bool empty()
256     {
257         return cigar.ops.length == 0;
258     }
259 }
260 
261 debug (dhtslib_unittest) unittest
262 {
263     import std.algorithm : map;
264 
265     writeln();
266     hts_log_info(__FUNCTION__, "Testing CigarItr");
267     string c = "7M2D4M";
268     auto cig = cigarFromString(c);
269     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
270     auto itr = CigarItr(cig);
271     assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM");
272 }
273 
274 struct AlignedCoordinate
275 {
276     int qpos, rpos;
277     Ops cigar_op;
278 }
279 
280 struct AlignedCoordinatesItr
281 {
282     CigarItr itr;
283     AlignedCoordinate current;
284 
285     this(Cigar cigar)
286     {
287         itr = CigarItr(cigar);
288         current.qpos = current.rpos = -1;
289         current.cigar_op = itr.front;
290         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
291         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
292     }
293 
294     AlignedCoordinate front()
295     {
296         return current;
297     }
298 
299     void popFront()
300     {
301         itr.popFront;
302         current.cigar_op = itr.front;
303         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
304         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
305     }
306 
307     bool empty()
308     {
309         return itr.empty;
310     }
311 }
312 
313 unittest
314 {
315     writeln();
316     import dhtslib.sam;
317     import htslib.hts_log;
318     import std.path : buildPath, dirName;
319 
320     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
321     hts_log_info(__FUNCTION__, "Testing cigar");
322     hts_log_info(__FUNCTION__, "Loading test file");
323     auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))), "htslib",
324             "test", "range.bam"), 0);
325     auto readrange = bam["CHROMOSOME_I", 914];
326     hts_log_info(__FUNCTION__, "Getting read 1");
327     auto read = readrange.front();
328     writeln(read.getAlignedCoordinates(read.pos + 77, read.pos + 82));
329 }