1 /** 2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records. 3 */ 4 module dhtslib.sam.cigar; 5 6 import std.stdio; 7 import std.bitmanip : bitfields; 8 import std.array : join; 9 import std.algorithm : map; 10 import std.algorithm.iteration : each; 11 import std.conv : to; 12 import std.range : array; 13 import std.traits : isIntegral; 14 15 import htslib.hts_log; 16 import dhtslib.sam.record : SAMRecord; 17 import dhtslib.coordinates; 18 import dhtslib.memory; 19 import htslib.sam : bam_get_cigar; 20 21 /// Represents a CIGAR string 22 /// https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6 23 struct Cigar 24 { 25 /// array of distinct CIGAR ops 26 /// private now to force fixing some reference issues 27 private CigarOp[] ops; 28 private Bam1 b; 29 /// Construct Cigar from SAMRecord 30 this(Bam1 b) 31 { 32 this.b = b; 33 auto cigar = bam_get_cigar(this.b); 34 this(cigar, this.b.core.n_cigar); 35 } 36 37 /// Construct Cigar from raw data 38 this(uint* cigar, uint length) 39 { 40 this((cast(CigarOp*) cigar)[0 .. length]); 41 } 42 43 /// Construct Cigar from an array of CIGAR ops 44 this(CigarOp[] ops) 45 { 46 this.ops = ops; 47 } 48 49 /// null CIGAR -- don't read! 50 bool is_null() 51 { 52 return this.ops.length == 0; 53 } 54 55 /// Format Cigar struct as CIGAR string in accordance with SAM spec 56 string toString() 57 { 58 return ops.map!(x => x.length.to!string ~ CIGAR_STR[x.op]).array.join; 59 } 60 61 /// get length of cigar 62 auto length(){ 63 return this.ops.length; 64 } 65 66 /// set length of cigar 67 void length(T)(T len) 68 if(isIntegral!T) 69 { 70 this.ops.length = len; 71 } 72 73 /// copy cigar 74 auto const dup(){ 75 return Cigar(this.ops.dup); 76 } 77 78 /// return the alignment length expressed by this Cigar 79 @property int refBasesCovered() 80 { 81 int len; 82 foreach (op; this.ops) 83 { 84 // look ma no branching (ignore the above foreach) 85 // add op.length if reference consuming 86 // mask is 0 if not reference consuming 87 // mask is -1 if reference consuming 88 len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1)); 89 } 90 return len; 91 } 92 93 deprecated("Use camelCase names instead") 94 alias ref_bases_covered = refBasesCovered; 95 /// previous alignedLength function had a bug and 96 /// it is just a duplicate of ref_bases_covered 97 alias alignedLength = refBasesCovered; 98 99 /// allow foreach on Cigar 100 int opApply(int delegate(ref CigarOp) dg) { 101 int result = 0; 102 103 foreach (op; ops) { 104 result = dg(op); 105 106 if (result) { 107 break; 108 } 109 } 110 111 return result; 112 } 113 114 /// Get a Slice of cigar ops from a range in the Cigar string 115 auto opSlice() 116 { 117 return ops; 118 } 119 120 /// Get a Slice of cigar ops from a range in the Cigar string 121 auto opSlice(T)(T start, T end) if (isIntegral!T) 122 { 123 return ops[start .. end]; 124 } 125 126 /// Get a cigar op from a single position in the Cigar string 127 ref auto opIndex(ulong i) 128 { 129 return ops[i]; 130 } 131 132 /// Assign a cigar op at a single position in the Cigar string 133 auto opIndexAssign(CigarOp value, size_t index) 134 { 135 return ops[index] = value; 136 } 137 138 /// Assign a range cigar ops over a range in the Cigar string 139 auto opSliceAssign(T)(CigarOp[] values, T start, T end) 140 { 141 assert(end - start == values.length); 142 return ops[start .. end] = values; 143 } 144 145 auto opAssign(T: CigarOp[])(T value) 146 { 147 this.ops = value; 148 return this; 149 } 150 151 auto opAssign(T: Cigar)(T value) 152 { 153 this.ops = value.ops; 154 return this; 155 } 156 157 size_t opDollar() 158 { 159 return length; 160 } 161 162 auto opBinary(string op)(const Cigar rhs) const 163 if(op == "~") 164 { 165 return Cigar(this.dup[] ~ rhs.dup[]); 166 } 167 } 168 169 // Each pair of bits has first bit set iff the operation is query consuming, 170 // and second bit set iff it is reference consuming. 171 // X = P H S N D I M 172 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; 173 174 /// Represents a distinct cigar operation 175 union CigarOp 176 { 177 /// raw opcode 178 uint raw; 179 180 mixin(bitfields!( //lower 4 bits store op 181 Ops, "op", 4, //higher 28 bits store length 182 uint, "length", 28)); 183 184 /// construct Op from raw opcode 185 this(uint raw) 186 { 187 this.raw = raw; 188 } 189 190 /// construct Op from an operator and operand (length) 191 this(uint len, Ops op) 192 { 193 this.op = op; 194 this.length = len; 195 } 196 /// Credit to Biod for this code below 197 /// https://github.com/biod/BioD from their bam.cigar module 198 /// True iff operation is one of M, =, X, I, S 199 bool isQueryConsuming() @property const nothrow @nogc 200 { 201 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 1) != 0; 202 } 203 204 /// True iff operation is one of M, =, X, D, N 205 bool isReferenceConsuming() @property const nothrow @nogc 206 { 207 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 2) != 0; 208 } 209 210 /// True iff operation is one of M, =, X 211 bool isMatchOrMismatch() @property const nothrow @nogc 212 { 213 return ((CIGAR_TYPE >> ((raw & 0xF) * 2)) & 3) == 3; 214 } 215 216 /// True iff operation is one of 'S', 'H' 217 bool isClipping() @property const nothrow @nogc 218 { 219 return ((raw & 0xF) >> 1) == 2; // 4 or 5 220 } 221 222 deprecated("Use camelCase names instead") 223 alias is_query_consuming = isQueryConsuming; 224 deprecated("Use camelCase names instead") 225 alias is_reference_consuming = isReferenceConsuming; 226 deprecated("Use camelCase names instead") 227 alias is_match_or_mismatch = isMatchOrMismatch; 228 deprecated("Use camelCase names instead") 229 alias is_clipping = isClipping; 230 231 string toString(){ 232 return this.length.to!string ~ CIGAR_STR[this.op]; 233 } 234 } 235 236 /** 237 Represents all ops 238 #define BAM_CIGAR_STR "MIDNSHP=XB" 239 #define BAM_CMATCH 0 240 #define BAM_CINS 1 241 #define BAM_CDEL 2 242 #define BAM_CREF_SKIP 3 243 #define BAM_CSOFT_CLIP 4 244 #define BAM_CHARD_CLIP 5 245 #define BAM_CPAD 6 246 #define BAM_CEQUAL 7 247 #define BAM_CDIFF 8 248 #define BAM_CBACK 9 249 */ 250 string CIGAR_STR = "MIDNSHP=XB"; 251 /// ditto 252 enum Ops 253 { 254 MATCH = 0, 255 INS = 1, 256 DEL = 2, 257 REF_SKIP = 3, 258 SOFT_CLIP = 4, 259 HARD_CLIP = 5, 260 PAD = 6, 261 EQUAL = 7, 262 DIFF = 8, 263 BACK = 9 264 } 265 266 debug (dhtslib_unittest) unittest 267 { 268 writeln(); 269 import dhtslib.sam; 270 import htslib.hts_log; 271 import std.path : buildPath, dirName; 272 273 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 274 hts_log_info(__FUNCTION__, "Testing cigar"); 275 hts_log_info(__FUNCTION__, "Loading test file"); 276 auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib", 277 "test", "range.bam"), 0); 278 auto readrange = bam["CHROMOSOME_I", ZB(914)]; 279 hts_log_info(__FUNCTION__, "Getting read 1"); 280 assert(readrange.empty == false); 281 auto read = readrange.front(); 282 auto cigar = read.cigar; 283 writeln(read.queryName); 284 hts_log_info(__FUNCTION__, "Cigar:" ~ cigar.toString()); 285 writeln(cigar.toString()); 286 assert(cigar.toString() == "78M1D22M"); 287 assert(cigar[0] == CigarOp(78, Ops.MATCH)); 288 assert(Cigar(cigar[0 .. 2]).toString == "78M1D"); 289 cigar[0] = CigarOp(4,Ops.HARD_CLIP); 290 cigar[1..3] = [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]; 291 assert(cigar[1 .. $] == [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]); 292 assert(cigar.toString() == "4H2I21M"); 293 } 294 295 /// return Cigar struct for a given CIGAR string (e.g. from SAM line) 296 Cigar cigarFromString(string cigar) 297 { 298 import std.regex; 299 300 return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint, 301 m[2].to!char.charToOp)).array); 302 } 303 304 /// Convert single char representing a CIGAR Op to Op union 305 Ops charToOp(char c) 306 { 307 foreach (i, o; CIGAR_STR) 308 { 309 if (c == o) 310 { 311 return cast(Ops) i; 312 } 313 } 314 return cast(Ops) 9; 315 } 316 317 debug (dhtslib_unittest) unittest 318 { 319 writeln(); 320 hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming"); 321 string c = "130M2D40M"; 322 auto cig = cigarFromString(c); 323 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 324 assert(cig.toString() == c); 325 assert(cig.ops[0].is_query_consuming && cig.ops[0].is_reference_consuming); 326 assert(!cig.ops[1].is_query_consuming && cig.ops[1].is_reference_consuming); 327 } 328 329 /// Range-based iteration of a Cigar string 330 /// Returns a range of Ops that is the length of all op lengths 331 /// e.g. if the Cigar is 4M5D2I4M 332 /// CigarItr will return a range of MMMMDDDDDIIMMMM 333 /// Range is of the Ops enum not chars 334 struct CigarItr 335 { 336 Cigar cigar; 337 CigarOp current; 338 339 this(Cigar c) 340 { 341 // Copy the cigar 342 cigar = c.dup; 343 current = cigar[0]; 344 current.length = current.length - 1; 345 } 346 347 Ops front() 348 { 349 return current.op; 350 } 351 352 void popFront() 353 { 354 // import std.stdio; 355 // writeln(current); 356 // writeln(cigar); 357 if(cigar.length == 1 && current.length == 0) 358 cigar.length = 0; 359 else if (current.length == 0) 360 { 361 cigar = cigar[1 .. $]; 362 current = cigar[0]; 363 current.length = current.length - 1; 364 } 365 else if (current.length != 0) 366 current.length = current.length - 1; 367 } 368 369 bool empty() 370 { 371 return cigar.length == 0; 372 } 373 } 374 375 debug (dhtslib_unittest) unittest 376 { 377 import std.algorithm : map; 378 379 writeln(); 380 hts_log_info(__FUNCTION__, "Testing CigarItr"); 381 string c = "7M2D4M"; 382 auto cig = cigarFromString(c); 383 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 384 auto itr = CigarItr(cig); 385 assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM"); 386 } 387 388 /// Coordinate pair representing aligned query and reference positions, 389 /// and the CIGAR op at or effecting their alignment. 390 struct AlignedCoordinate 391 { 392 Coordinate!(Basis.zero) qpos, rpos; 393 Ops cigar_op; 394 } 395 396 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string 397 struct AlignedCoordinatesItr 398 { 399 CigarItr itr; 400 AlignedCoordinate current; 401 402 this(Cigar cigar) 403 { 404 itr = CigarItr(cigar); 405 current.qpos = current.rpos = Coordinate!(Basis.zero)(-1); 406 current.cigar_op = itr.front; 407 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 408 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 409 } 410 411 /// InputRange interface 412 AlignedCoordinate front() 413 { 414 return current; 415 } 416 417 /// InputRange interface 418 void popFront() 419 { 420 itr.popFront; 421 current.cigar_op = itr.front; 422 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 423 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 424 } 425 426 /// InputRange interface 427 bool empty() 428 { 429 return itr.empty; 430 } 431 } 432 433 debug (dhtslib_unittest) unittest 434 { 435 writeln(); 436 import dhtslib.sam; 437 import htslib.hts_log; 438 import std.path : buildPath, dirName; 439 import std.algorithm : map; 440 import std.array : array; 441 442 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 443 hts_log_info(__FUNCTION__, "Testing cigar"); 444 hts_log_info(__FUNCTION__, "Loading test file"); 445 auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib", 446 "test", "range.bam"), 0); 447 auto readrange = bam["CHROMOSOME_I", ZB(914)]; 448 hts_log_info(__FUNCTION__, "Getting read 1"); 449 assert(readrange.empty == false); 450 auto read = readrange.front(); 451 auto coords = read.getAlignedCoordinates(77, 82); 452 assert(coords.map!(x => x.rpos).array == [77,78,79,80,81]); 453 }