1 /** 2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records. 3 */ 4 module dhtslib.cigar; 5 6 import std.stdio; 7 import std.bitmanip : bitfields; 8 import std.array : join; 9 import std.algorithm : map; 10 import std.algorithm.iteration : each; 11 import std.conv : to; 12 import std.range : array; 13 14 import htslib.hts_log; 15 import dhtslib.sam : SAMRecord; 16 17 /// Represents a CIGAR string 18 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6 19 struct Cigar 20 { 21 /// array of distinct CIGAR ops 22 CigarOp[] ops; 23 24 /// Construct Cigar from raw data 25 this(uint* cigar, uint length) 26 { 27 ops = (cast(CigarOp*) cigar)[0 .. length]; 28 } 29 30 /// Construct Cigar from an array of CIGAR ops 31 this(CigarOp[] ops) 32 { 33 this.ops = ops; 34 } 35 36 bool is_null() 37 { 38 return ops.length == 1 && ops[0].raw == '*'; 39 } 40 41 /// Format Cigar struct as CIGAR string in accordance with SAM spec 42 string toString() 43 { 44 return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join; 45 } 46 47 /// return the alignment length expressed by this Cigar 48 /// TODO: use CIGAR_TYPE to get rid of switch statement for less branching 49 @property int refBasesCovered() 50 { 51 int len; 52 foreach (op; this.ops) 53 { 54 switch (op.op) 55 { 56 case Ops.MATCH: 57 case Ops.EQUAL: 58 case Ops.DIFF: 59 case Ops.DEL: 60 case Ops.REF_SKIP: 61 len += op.length; 62 break; 63 default: 64 break; 65 } 66 } 67 return len; 68 } 69 70 alias ref_bases_covered = refBasesCovered; 71 /// previous alignedLength function had a bug and 72 /// it is just a duplicate of ref_bases_covered 73 alias alignedLength = refBasesCovered; 74 } 75 76 // Each pair of bits has first bit set iff the operation is query consuming, 77 // and second bit set iff it is reference consuming. 78 // X = P H S N D I M 79 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; 80 81 /// Represents a distinct cigar operation 82 union CigarOp 83 { 84 /// raw opcode 85 uint raw; 86 87 mixin(bitfields!( //lower 4 bits store op 88 Ops, "op", 4, //higher 28 bits store length 89 uint, "length", 28)); 90 91 /// construct Op from raw opcode 92 this(uint raw) 93 { 94 this.raw = raw; 95 } 96 97 /// construct Op from an operator and operand (length) 98 this(uint len, Ops op) 99 { 100 this.op = op; 101 this.length = len; 102 } 103 /// Credit to Biod for this code below 104 /// https://github.com/biod/BioD from their bam.cigar module 105 /// True iff operation is one of M, =, X, I, S 106 bool isQueryConsuming() @property const nothrow @nogc 107 { 108 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0; 109 } 110 111 /// True iff operation is one of M, =, X, D, N 112 bool isReferenceConsuming() @property const nothrow @nogc 113 { 114 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0; 115 } 116 117 /// True iff operation is one of M, =, X 118 bool isMatchOrMismatch() @property const nothrow @nogc 119 { 120 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3; 121 } 122 123 /// True iff operation is one of 'S', 'H' 124 bool isClipping() @property const nothrow @nogc 125 { 126 return ((raw & 0xF) >> 1) == 2; // 4 or 5 127 } 128 129 alias is_query_consuming = isQueryConsuming; 130 alias is_reference_consuming = isReferenceConsuming; 131 alias is_match_or_mismatch = isMatchOrMismatch; 132 alias is_clipping = isClipping; 133 } 134 135 /** 136 Represents all ops 137 #define BAM_CIGAR_STR "MIDNSHP=XB" 138 #define BAM_CMATCH 0 139 #define BAM_CINS 1 140 #define BAM_CDEL 2 141 #define BAM_CREF_SKIP 3 142 #define BAM_CSOFT_CLIP 4 143 #define BAM_CHARD_CLIP 5 144 #define BAM_CPAD 6 145 #define BAM_CEQUAL 7 146 #define BAM_CDIFF 8 147 #define BAM_CBACK 9 148 */ 149 string CIGAR_STR = "MIDNSHP=XB"; 150 /// ditto 151 enum Ops 152 { 153 MATCH = 0, 154 INS = 1, 155 DEL = 2, 156 REF_SKIP = 3, 157 SOFT_CLIP = 4, 158 HARD_CLIP = 5, 159 PAD = 6, 160 EQUAL = 7, 161 DIFF = 8, 162 BACK = 9 163 } 164 165 debug (dhtslib_unittest) unittest 166 { 167 writeln(); 168 import dhtslib.sam; 169 import htslib.hts_log; 170 import std.path : buildPath, dirName; 171 172 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 173 hts_log_info(__FUNCTION__, "Testing cigar"); 174 hts_log_info(__FUNCTION__, "Loading test file"); 175 auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))), "htslib", 176 "test", "range.bam"), 0); 177 auto readrange = bam["CHROMOSOME_I", 914]; 178 hts_log_info(__FUNCTION__, "Getting read 1"); 179 auto read = readrange.front(); 180 writeln(read.queryName); 181 hts_log_info(__FUNCTION__, "Cigar:" ~ read.cigar.toString()); 182 writeln(read.cigar.toString()); 183 assert(read.cigar.toString() == "78M1D22M"); 184 } 185 186 /// return Cigar struct for a given CIGAR string (e.g. from SAM line) 187 Cigar cigarFromString(string cigar) 188 { 189 import std.regex; 190 191 return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint, 192 m[2].to!char.charToOp)).array); 193 } 194 195 /// Convert single char representing a CIGAR Op to Op union 196 Ops charToOp(char c) 197 { 198 foreach (i, o; CIGAR_STR) 199 { 200 if (c == o) 201 { 202 return cast(Ops) i; 203 } 204 } 205 return cast(Ops) 9; 206 } 207 208 debug (dhtslib_unittest) unittest 209 { 210 writeln(); 211 hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming"); 212 string c = "130M2D40M"; 213 auto cig = cigarFromString(c); 214 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 215 assert(cig.toString() == c); 216 assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming); 217 assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming); 218 } 219 220 /// Range-based iteration of a Cigar string 221 /// Returns a range of Ops that is the length of all op lengths 222 /// e.g. if the Cigar is 4M5D2I4M 223 /// CigarItr will return a range of MMMMDDDDDIIMMMM 224 /// Range is of the Ops enum not chars 225 struct CigarItr 226 { 227 Cigar cigar; 228 CigarOp current; 229 230 this(Cigar c) 231 { 232 // Copy the cigar 233 cigar.ops = c.ops.dup; 234 current = cigar.ops[0]; 235 current.length = current.length - 1; 236 } 237 238 Ops front() 239 { 240 return current.op; 241 } 242 243 void popFront() 244 { 245 if (current.length == 0) 246 { 247 cigar.ops = cigar.ops[1 .. $]; 248 if (!empty) 249 current = cigar.ops[0]; 250 } 251 if (current.length != 0) 252 current.length = current.length - 1; 253 } 254 255 bool empty() 256 { 257 return cigar.ops.length == 0; 258 } 259 } 260 261 debug (dhtslib_unittest) unittest 262 { 263 import std.algorithm : map; 264 265 writeln(); 266 hts_log_info(__FUNCTION__, "Testing CigarItr"); 267 string c = "7M2D4M"; 268 auto cig = cigarFromString(c); 269 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 270 auto itr = CigarItr(cig); 271 assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM"); 272 } 273 274 struct AlignedCoordinate 275 { 276 int qpos, rpos; 277 Ops cigar_op; 278 } 279 280 struct AlignedCoordinatesItr 281 { 282 CigarItr itr; 283 AlignedCoordinate current; 284 285 this(Cigar cigar) 286 { 287 itr = CigarItr(cigar); 288 current.qpos = current.rpos = -1; 289 current.cigar_op = itr.front; 290 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 291 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 292 } 293 294 AlignedCoordinate front() 295 { 296 return current; 297 } 298 299 void popFront() 300 { 301 itr.popFront; 302 current.cigar_op = itr.front; 303 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 304 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 305 } 306 307 bool empty() 308 { 309 return itr.empty; 310 } 311 } 312 313 unittest 314 { 315 writeln(); 316 import dhtslib.sam; 317 import htslib.hts_log; 318 import std.path : buildPath, dirName; 319 320 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 321 hts_log_info(__FUNCTION__, "Testing cigar"); 322 hts_log_info(__FUNCTION__, "Loading test file"); 323 auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))), "htslib", 324 "test", "range.bam"), 0); 325 auto readrange = bam["CHROMOSOME_I", 914]; 326 hts_log_info(__FUNCTION__, "Getting read 1"); 327 auto read = readrange.front(); 328 writeln(read.getAlignedCoordinates(read.pos + 77, read.pos + 82)); 329 }