1 /** 2 This module simplifies working with CIGAR strings/ops in SAM/BAM/CRAM format. 3 */ 4 module dhtslib.cigar; 5 6 import std.stdio; 7 import std.bitmanip : bitfields; 8 import std.array : join; 9 import std.algorithm : map; 10 import std.algorithm.iteration : each; 11 import std.conv : to; 12 import std.range : array; 13 14 import dhtslib.htslib.hts_log; 15 import dhtslib.sam: SAMRecord; 16 17 /// Represents a CIGAR string 18 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6 19 struct Cigar 20 { 21 /// array of distinct CIGAR ops 22 CigarOp[] ops; 23 24 /// Construct Cigar from raw data 25 this(uint* cigar, int length) 26 { 27 ops=(cast(CigarOp *)cigar)[0..length]; 28 } 29 30 /// Construct Cigar from an array of CIGAR ops 31 this(CigarOp[] ops) 32 { 33 this.ops = ops; 34 } 35 36 bool is_null(){ 37 return ops.length == 1 && ops[0].raw == '*'; 38 } 39 40 /// Format Cigar struct as CIGAR string in accordance with SAM spec 41 string toString() 42 { 43 return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join; 44 } 45 46 /// return the alignment length expressed by this Cigar 47 /// TODO: use CIGAR_TYPE to get rid of switch statement for less branching 48 @property int ref_bases_covered(){ 49 int len; 50 foreach(op;this.ops){ 51 switch (op.op) 52 { 53 case Ops.MATCH: 54 case Ops.EQUAL: 55 case Ops.DIFF: 56 case Ops.DEL: 57 case Ops.REF_SKIP: 58 len += op.length; 59 break; 60 default: 61 break; 62 } 63 } 64 return len; 65 } 66 67 /// previous alignedLength function had a bug and 68 /// it is just a duplicate of ref_bases_covered 69 alias alignedLength = ref_bases_covered; 70 } 71 72 // Each pair of bits has first bit set iff the operation is query consuming, 73 // and second bit set iff it is reference consuming. 74 // X = P H S N D I M 75 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; 76 77 78 /// Represents a distinct cigar operation 79 union CigarOp 80 { 81 /// raw opcode 82 uint raw; 83 84 mixin(bitfields!(//lower 4 bits store op 85 Ops, "op", 4,//higher 28 bits store length 86 uint, "length", 28)); 87 88 /// construct Op from raw opcode 89 this(uint raw) 90 { 91 this.raw = raw; 92 } 93 94 /// construct Op from an operator and operand (length) 95 this(uint len, Ops op) 96 { 97 this.op = op; 98 this.length = len; 99 } 100 /// Credit to Biod for this code below 101 /// https://github.com/biod/BioD from their bam.cigar module 102 /// True iff operation is one of M, =, X, I, S 103 bool is_query_consuming() @property const nothrow @nogc { 104 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0; 105 } 106 107 /// True iff operation is one of M, =, X, D, N 108 bool is_reference_consuming() @property const nothrow @nogc { 109 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0; 110 } 111 112 /// True iff operation is one of M, =, X 113 bool is_match_or_mismatch() @property const nothrow @nogc { 114 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3; 115 } 116 117 /// True iff operation is one of 'S', 'H' 118 bool is_clipping() @property const nothrow @nogc { 119 return ((raw & 0xF) >> 1) == 2; // 4 or 5 120 } 121 } 122 123 /** 124 Represents all ops 125 #define BAM_CIGAR_STR "MIDNSHP=XB" 126 #define BAM_CMATCH 0 127 #define BAM_CINS 1 128 #define BAM_CDEL 2 129 #define BAM_CREF_SKIP 3 130 #define BAM_CSOFT_CLIP 4 131 #define BAM_CHARD_CLIP 5 132 #define BAM_CPAD 6 133 #define BAM_CEQUAL 7 134 #define BAM_CDIFF 8 135 #define BAM_CBACK 9 136 */ 137 string CIGAR_STR = "MIDNSHP=XB"; 138 /// ditto 139 enum Ops 140 { 141 MATCH = 0, 142 INS = 1, 143 DEL = 2, 144 REF_SKIP = 3, 145 SOFT_CLIP = 4, 146 HARD_CLIP = 5, 147 PAD = 6, 148 EQUAL = 7, 149 DIFF = 8, 150 BACK = 9 151 } 152 debug(dhtslib_unittest) 153 unittest 154 { 155 writeln(); 156 import dhtslib.sam; 157 import dhtslib.htslib.hts_log; 158 import std.path:buildPath,dirName; 159 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 160 hts_log_info(__FUNCTION__, "Testing cigar"); 161 hts_log_info(__FUNCTION__, "Loading test file"); 162 auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","range.bam"), 0); 163 auto readrange = bam["CHROMOSOME_I", 914]; 164 hts_log_info(__FUNCTION__, "Getting read 1"); 165 auto read = readrange.front(); 166 writeln(read.queryName); 167 hts_log_info(__FUNCTION__, "Cigar:" ~ read.cigar.toString()); 168 assert(read.cigar.toString() == "78M1D22M"); 169 } 170 171 /// return Cigar struct for a given CIGAR string (e.g. from SAM line) 172 Cigar cigarFromString(string cigar) 173 { 174 import std.regex; 175 176 return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint, 177 m[2].to!char.charToOp)).array); 178 } 179 180 /// Convert single char representing a CIGAR Op to Op union 181 Ops charToOp(char c) 182 { 183 foreach (i, o; CIGAR_STR) 184 { 185 if (c == o) 186 { 187 return cast(Ops) i; 188 } 189 } 190 return cast(Ops) 9; 191 } 192 debug(dhtslib_unittest) 193 unittest 194 { 195 writeln(); 196 hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming"); 197 string c = "130M2D40M"; 198 auto cig = cigarFromString(c); 199 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 200 assert(cig.toString() == c); 201 assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming); 202 assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming); 203 } 204 205 /// Range-based iteration of a Cigar string 206 /// Returns a range of Ops that is the length of all op lengths 207 /// e.g. if the Cigar is 4M5D2I4M 208 /// CigarItr will return a range of MMMMDDDDDIIMMMM 209 /// Range is of the Ops enum not chars 210 struct CigarItr 211 { 212 Cigar cigar; 213 CigarOp current; 214 215 this(Cigar c) 216 { 217 // Copy the cigar 218 cigar.ops=c.ops.dup; 219 current = cigar.ops[0]; 220 current.length=current.length-1; 221 } 222 223 Ops front() 224 { 225 return current.op; 226 } 227 228 void popFront() 229 { 230 if(current.length==0){ 231 cigar.ops=cigar.ops[1..$]; 232 if(!empty) current = cigar.ops[0]; 233 } 234 if(current.length != 0) current.length=current.length-1; 235 } 236 237 bool empty() 238 { 239 return cigar.ops.length==0; 240 } 241 } 242 243 debug(dhtslib_unittest) 244 unittest 245 { 246 import std.algorithm:map; 247 writeln(); 248 hts_log_info(__FUNCTION__, "Testing CigarItr"); 249 string c = "7M2D4M"; 250 auto cig = cigarFromString(c); 251 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 252 auto itr=CigarItr(cig); 253 assert(itr.map!(x=>CIGAR_STR[x]).array.idup=="MMMMMMMDDMMMM"); 254 } 255 256 struct AlignedCoordinate{ 257 int qpos, rpos; 258 Ops cigar_op; 259 } 260 261 262 struct AlignedCoordinatesItr{ 263 CigarItr itr; 264 AlignedCoordinate current; 265 266 this(Cigar cigar){ 267 itr = CigarItr(cigar); 268 current.qpos = current.rpos = -1; 269 current.cigar_op = itr.front; 270 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 271 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 272 } 273 274 AlignedCoordinate front(){ 275 return current; 276 } 277 278 void popFront(){ 279 itr.popFront; 280 current.cigar_op = itr.front; 281 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 282 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 283 } 284 285 bool empty(){ 286 return itr.empty; 287 } 288 } 289 290 unittest 291 { 292 writeln(); 293 import dhtslib.sam; 294 import dhtslib.htslib.hts_log; 295 import std.path:buildPath,dirName; 296 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 297 hts_log_info(__FUNCTION__, "Testing cigar"); 298 hts_log_info(__FUNCTION__, "Loading test file"); 299 auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","range.bam"), 0); 300 auto readrange = bam["CHROMOSOME_I", 914]; 301 hts_log_info(__FUNCTION__, "Getting read 1"); 302 auto read = readrange.front(); 303 writeln(read.getAlignedCoordinates(read.pos + 77, read.pos + 82)); 304 }