1 /**
2     This module simplifies working with CIGAR strings/ops in SAM/BAM/CRAM format.
3 */
4 module dhtslib.cigar;
5 
6 import std.stdio;
7 import std.bitmanip : bitfields;
8 import std.array : join;
9 import std.algorithm : map;
10 import std.algorithm.iteration : each;
11 import std.conv : to;
12 import std.range : array;
13 
14 import dhtslib.htslib.hts_log;
15 import dhtslib.sam: SAMRecord;
16 
17 /// Represents a CIGAR string
18 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6
19 struct Cigar
20 {
21     /// array of distinct CIGAR ops 
22     CigarOp[] ops;
23 
24     /// Construct Cigar from raw data
25     this(uint* cigar, int length)
26     {
27         ops=(cast(CigarOp *)cigar)[0..length];
28     }
29 
30     /// Construct Cigar from an array of CIGAR ops
31     this(CigarOp[] ops)
32     {
33         this.ops = ops;
34     }
35 
36     bool is_null(){
37         return ops.length == 1 && ops[0].raw == '*';
38     }
39 
40     /// Format Cigar struct as CIGAR string in accordance with SAM spec
41     string toString()
42     {
43         return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join;
44     }
45 
46     /// return the alignment length expressed by this Cigar
47     /// TODO: use CIGAR_TYPE to get rid of switch statement for less branching
48     @property int ref_bases_covered(){
49         int len;
50         foreach(op;this.ops){
51             switch (op.op)
52             {
53             case Ops.MATCH:
54             case Ops.EQUAL:
55             case Ops.DIFF:
56             case Ops.DEL:
57             case Ops.REF_SKIP:
58                 len += op.length;
59                 break;
60             default:
61                 break;
62             }
63         }
64         return len;
65     }
66     
67     /// previous alignedLength function had a bug and 
68     /// it is just a duplicate of ref_bases_covered
69     alias alignedLength = ref_bases_covered;
70 }
71 
72 // Each pair of bits has first bit set iff the operation is query consuming,
73 // and second bit set iff it is reference consuming.
74 //                                            X  =  P  H  S  N  D  I  M
75 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11;
76 
77 
78 /// Represents a distinct cigar operation
79 union CigarOp
80 {
81     /// raw opcode
82     uint raw;
83 
84     mixin(bitfields!(//lower 4 bits store op
85             Ops, "op", 4,//higher 28 bits store length
86             uint, "length", 28));
87 
88     /// construct Op from raw opcode
89     this(uint raw)
90     {
91         this.raw = raw;
92     }
93 
94     /// construct Op from an operator and operand (length)
95     this(uint len, Ops op)
96     {
97         this.op = op;
98         this.length = len;
99     }
100     /// Credit to Biod for this code below
101     /// https://github.com/biod/BioD from their bam.cigar module
102     /// True iff operation is one of M, =, X, I, S
103     bool is_query_consuming() @property const nothrow @nogc {
104         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0;
105     }
106 
107     /// True iff operation is one of M, =, X, D, N
108     bool is_reference_consuming() @property const nothrow @nogc {
109         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0;
110     }
111 
112     /// True iff operation is one of M, =, X
113     bool is_match_or_mismatch() @property const nothrow @nogc {
114         return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3;
115     }
116 
117     /// True iff operation is one of 'S', 'H'
118     bool is_clipping() @property const nothrow @nogc {
119         return ((raw & 0xF) >> 1) == 2; // 4 or 5
120     }
121 }
122 
123 /**
124 Represents all ops
125 #define BAM_CIGAR_STR   "MIDNSHP=XB"
126 #define BAM_CMATCH      0
127 #define BAM_CINS        1
128 #define BAM_CDEL        2
129 #define BAM_CREF_SKIP   3
130 #define BAM_CSOFT_CLIP  4
131 #define BAM_CHARD_CLIP  5
132 #define BAM_CPAD        6
133 #define BAM_CEQUAL      7
134 #define BAM_CDIFF       8
135 #define BAM_CBACK       9
136 */
137 string CIGAR_STR = "MIDNSHP=XB";
138 /// ditto
139 enum Ops
140 {
141     MATCH = 0,
142     INS = 1,
143     DEL = 2,
144     REF_SKIP = 3,
145     SOFT_CLIP = 4,
146     HARD_CLIP = 5,
147     PAD = 6,
148     EQUAL = 7,
149     DIFF = 8,
150     BACK = 9
151 }
152 debug(dhtslib_unittest)
153 unittest
154 {
155     writeln();
156     import dhtslib.sam;
157     import dhtslib.htslib.hts_log;
158     import std.path:buildPath,dirName;
159     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
160     hts_log_info(__FUNCTION__, "Testing cigar");
161     hts_log_info(__FUNCTION__, "Loading test file");
162     auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","range.bam"), 0);
163     auto readrange = bam["CHROMOSOME_I", 914];
164     hts_log_info(__FUNCTION__, "Getting read 1");
165     auto read = readrange.front();
166     writeln(read.queryName);
167     hts_log_info(__FUNCTION__, "Cigar:" ~ read.cigar.toString());
168     assert(read.cigar.toString() == "78M1D22M");
169 }
170 
171 /// return Cigar struct for a given CIGAR string (e.g. from SAM line)
172 Cigar cigarFromString(string cigar)
173 {
174     import std.regex;
175 
176     return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint,
177             m[2].to!char.charToOp)).array);
178 }
179 
180 /// Convert single char representing a CIGAR Op to Op union
181 Ops charToOp(char c)
182 {
183     foreach (i, o; CIGAR_STR)
184     {
185         if (c == o)
186         {
187             return cast(Ops) i;
188         }
189     }
190     return cast(Ops) 9;
191 }
192 debug(dhtslib_unittest)
193 unittest
194 {
195     writeln();
196     hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming");
197     string c = "130M2D40M";
198     auto cig = cigarFromString(c);
199     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
200     assert(cig.toString() == c);
201     assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming);
202     assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming);
203 }
204 
205 /// Range-based iteration of a Cigar string
206 /// Returns a range of Ops that is the length of all op lengths
207 /// e.g. if the Cigar is 4M5D2I4M
208 /// CigarItr will return a range of MMMMDDDDDIIMMMM
209 /// Range is of the Ops enum not chars
210 struct CigarItr
211 {
212     Cigar cigar;
213     CigarOp current;
214 
215     this(Cigar c)
216     {
217         // Copy the cigar
218         cigar.ops=c.ops.dup;
219         current = cigar.ops[0];
220         current.length=current.length-1;
221     }
222     
223     Ops front()
224     {
225         return current.op;
226     }
227     
228     void popFront()
229     {
230         if(current.length==0){
231             cigar.ops=cigar.ops[1..$];
232             if(!empty) current = cigar.ops[0];
233         }
234         if(current.length != 0) current.length=current.length-1;
235     }
236 
237     bool empty()
238     {
239         return cigar.ops.length==0;
240     }
241 }
242 
243 debug(dhtslib_unittest)
244 unittest
245 {
246     import std.algorithm:map;
247     writeln();
248     hts_log_info(__FUNCTION__, "Testing CigarItr");
249     string c = "7M2D4M";
250     auto cig = cigarFromString(c);
251     hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString());
252     auto itr=CigarItr(cig);
253     assert(itr.map!(x=>CIGAR_STR[x]).array.idup=="MMMMMMMDDMMMM");
254 }
255 
256 struct AlignedCoordinate{
257     int qpos, rpos;
258     Ops cigar_op;
259 }
260 
261 
262 struct AlignedCoordinatesItr{
263     CigarItr itr;
264     AlignedCoordinate current;
265 
266     this(Cigar cigar){ 
267         itr = CigarItr(cigar);
268         current.qpos = current.rpos = -1;
269         current.cigar_op = itr.front;
270         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
271         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
272     }
273 
274     AlignedCoordinate front(){
275         return current;
276     }
277 
278     void popFront(){
279         itr.popFront;
280         current.cigar_op = itr.front;
281         current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1);
282         current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1);
283     }
284 
285     bool empty(){
286         return itr.empty;
287     }
288 }
289 
290 unittest
291 {
292     writeln();
293     import dhtslib.sam;
294     import dhtslib.htslib.hts_log;
295     import std.path:buildPath,dirName;
296     hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
297     hts_log_info(__FUNCTION__, "Testing cigar");
298     hts_log_info(__FUNCTION__, "Loading test file");
299     auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","range.bam"), 0);
300     auto readrange = bam["CHROMOSOME_I", 914];
301     hts_log_info(__FUNCTION__, "Getting read 1");
302     auto read = readrange.front();
303     writeln(read.getAlignedCoordinates(read.pos + 77, read.pos + 82));   
304 }