1 /** 2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records. 3 */ 4 module dhtslib.sam.cigar; 5 6 import std.stdio; 7 import std.bitmanip : bitfields; 8 import std.array : join; 9 import std.algorithm : map, each, sum; 10 import std.conv : to; 11 import std.range : array; 12 import std.traits : isIntegral; 13 14 import htslib.hts_log; 15 import dhtslib.sam.record : SAMRecord; 16 import dhtslib.coordinates; 17 import dhtslib.memory; 18 import htslib.sam : bam_get_cigar; 19 20 21 /** 22 Represents all cigar ops 23 as a friendlier alternative to BAM_C htslib enums 24 */ 25 enum Ops 26 { 27 MATCH = 0, 28 INS = 1, 29 DEL = 2, 30 REF_SKIP = 3, 31 SOFT_CLIP = 4, 32 HARD_CLIP = 5, 33 PAD = 6, 34 EQUAL = 7, 35 DIFF = 8, 36 BACK = 9 37 } 38 39 /** 40 string to aid in conversion from CigarOp to string 41 */ 42 static immutable string CIGAR_STR = "MIDNSHP=XB"; 43 44 /// Credit to Biod for this code below 45 /// https://github.com/biod/BioD from their bam.cigar module 46 /** 47 uint to aid in identifying if a cigar op is query consuming 48 or reference consuming. 49 50 Each pair of bits has first bit set iff the operation is query consuming, 51 and second bit set iff it is reference consuming. 52 X = P H S N D I M 53 */ 54 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; 55 56 /// True iff operation is one of M, =, X, I, S 57 bool isQueryConsuming(T)(T op) nothrow @nogc 58 { 59 return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 1) != 0; 60 } 61 62 /// True iff operation is one of M, =, X, D, N 63 bool isReferenceConsuming(T)(T op)nothrow @nogc 64 { 65 return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 2) != 0; 66 } 67 68 /// True iff operation is one of M, =, X 69 bool isMatchOrMismatch(T)(T op) nothrow @nogc 70 { 71 return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 3) == 3; 72 } 73 74 /// True iff operation is one of 'S', 'H' 75 bool isClipping(T)(T op)nothrow @nogc 76 { 77 return ((op & 0xF) >> 1) == 2; // 4 or 5 78 } 79 80 /// Represents a singular cigar operation 81 /// as a uint 82 union CigarOp 83 { 84 /// raw opcode 85 uint raw; 86 87 mixin(bitfields!( //lower 4 bits store op 88 Ops, "op", 4, //higher 28 bits store length 89 uint, "length", 28)); 90 91 /// construct Op from raw opcode 92 this(uint raw) 93 { 94 this.raw = raw; 95 } 96 97 /// construct Op from an operator and operand (length) 98 this(uint len, Ops op) 99 { 100 this.op = op; 101 this.length = len; 102 } 103 104 string toString() const 105 { 106 return this.length.to!string ~ CIGAR_STR[this.op]; 107 } 108 } 109 110 /** 111 Represents a CIGAR string as an array of CigarOps. 112 113 https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6 114 115 In many cases will represent a slice or reference to 116 underlying cigar data in the bam record 117 UNLESS it is copied with .dup or one of the assignment 118 methods. 119 */ 120 struct Cigar 121 { 122 /// array of distinct CIGAR ops 123 /// private now to force fixing some reference issues 124 private CigarOp[] ops; 125 private Bam1 b; 126 /// Construct Cigar from SAMRecord 127 this(Bam1 b) 128 { 129 this.b = b; 130 auto cigar = bam_get_cigar(this.b); 131 this(cigar, this.b.core.n_cigar); 132 } 133 134 /// Construct Cigar from raw data 135 this(uint* cigar, uint length) 136 { 137 this((cast(CigarOp*) cigar)[0 .. length]); 138 } 139 140 /// Construct Cigar from an array of CIGAR ops 141 this(CigarOp[] ops) 142 { 143 this.ops = ops; 144 } 145 146 /// Explicit postblit to avoid 147 /// https://github.com/blachlylab/dhtslib/issues/122 148 this(this) nothrow 149 { 150 this.b = b; 151 this.ops = ops; 152 } 153 154 /// null CIGAR -- don't read! 155 bool is_null() 156 { 157 return this.ops.length == 0; 158 } 159 160 /// Format Cigar struct as CIGAR string in accordance with SAM spec 161 string toString() const 162 { 163 return ops.map!(x => x.toString).array.join; 164 } 165 166 /// get length of cigar 167 auto length(){ 168 return this.ops.length; 169 } 170 171 /// set length of cigar 172 void length(T)(T len) 173 if(isIntegral!T) 174 { 175 this.ops.length = len; 176 } 177 178 /// copy cigar 179 auto const dup(){ 180 return Cigar(this.ops.dup); 181 } 182 183 /// return the alignment length expressed by this Cigar 184 @property int refBasesCovered() 185 { 186 int len; 187 foreach (op; this.ops) 188 { 189 // look ma no branching (ignore the above foreach) 190 // add op.length if reference consuming 191 // mask is 0 if not reference consuming 192 // mask is -1 if reference consuming 193 len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1)); 194 } 195 return len; 196 } 197 198 deprecated("Use camelCase names instead") 199 alias ref_bases_covered = refBasesCovered; 200 /// previous alignedLength function had a bug and 201 /// it is just a duplicate of ref_bases_covered 202 alias alignedLength = refBasesCovered; 203 204 /// allow foreach on Cigar 205 int opApply(int delegate(ref CigarOp) dg) { 206 int result = 0; 207 208 foreach (op; ops) { 209 result = dg(op); 210 211 if (result) { 212 break; 213 } 214 } 215 216 return result; 217 } 218 219 /// Get a Slice of cigar ops from a range in the Cigar string 220 auto opSlice() 221 { 222 return ops; 223 } 224 225 /// Get a Slice of cigar ops from a range in the Cigar string 226 auto opSlice(T)(T start, T end) if (isIntegral!T) 227 { 228 return ops[start .. end]; 229 } 230 231 /// Get a cigar op from a single position in the Cigar string 232 ref auto opIndex(ulong i) 233 { 234 return ops[i]; 235 } 236 237 /// Assign a cigar op at a single position in the Cigar string 238 auto opIndexAssign(CigarOp value, size_t index) 239 { 240 return ops[index] = value; 241 } 242 243 /// Assign a range cigar ops over a range in the Cigar string 244 auto opSliceAssign(T)(CigarOp[] values, T start, T end) 245 { 246 assert(end - start == values.length); 247 return ops[start .. end] = values; 248 } 249 250 /// Assign a Cigar with a CigarOp slice 251 /// Note: this is not a deep copy 252 auto opAssign(T: CigarOp[])(T value) 253 { 254 this.ops = value; 255 return this; 256 } 257 258 /// Assign a Cigar with a Cigar 259 /// Note: this is not a deep copy 260 auto opAssign(T: Cigar)(T value) 261 { 262 this.ops = value.ops; 263 return this; 264 } 265 266 /// use $ with Cigar slicing 267 size_t opDollar() 268 { 269 return length; 270 } 271 272 /// combine Cigars 273 auto opBinary(string op)(const Cigar rhs) const 274 if(op == "~") 275 { 276 return Cigar(this.dup[] ~ rhs.dup[]); 277 } 278 } 279 280 281 debug (dhtslib_unittest) unittest 282 { 283 writeln(); 284 import dhtslib.sam; 285 import htslib.hts_log; 286 import std.path : buildPath, dirName; 287 288 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 289 hts_log_info(__FUNCTION__, "Testing cigar"); 290 hts_log_info(__FUNCTION__, "Loading test file"); 291 auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib", 292 "test", "range.bam"), 0); 293 auto readrange = bam["CHROMOSOME_I", ZB(914)]; 294 hts_log_info(__FUNCTION__, "Getting read 1"); 295 assert(readrange.empty == false); 296 auto read = readrange.front(); 297 auto cigar = read.cigar; 298 writeln(read.queryName); 299 hts_log_info(__FUNCTION__, "Cigar:" ~ cigar.toString()); 300 writeln(cigar.toString()); 301 assert(cigar.toString() == "78M1D22M"); 302 assert(cigar[0] == CigarOp(78, Ops.MATCH)); 303 assert(Cigar(cigar[0 .. 2]).toString == "78M1D"); 304 cigar[0] = CigarOp(4,Ops.HARD_CLIP); 305 cigar[1..3] = [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]; 306 assert(cigar[1 .. $] == [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]); 307 assert(cigar.toString() == "4H2I21M"); 308 } 309 310 /// return Cigar struct for a given CIGAR string (e.g. from SAM line) 311 Cigar cigarFromString(string cigar) 312 { 313 import std.regex; 314 315 return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint, 316 m[2].to!char.charToOp)).array); 317 } 318 319 /// Convert single char representing a CIGAR Op to Op union 320 Ops charToOp(char c) 321 { 322 foreach (i, o; CIGAR_STR) 323 { 324 if (c == o) 325 { 326 return cast(Ops) i; 327 } 328 } 329 return cast(Ops) 9; 330 } 331 332 debug (dhtslib_unittest) unittest 333 { 334 writeln(); 335 hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming"); 336 string c = "130M2D40M"; 337 auto cig = cigarFromString(c); 338 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 339 assert(cig.toString() == c); 340 assert(cig.ops[0].op.isQueryConsuming && cig.ops[0].op.isReferenceConsuming); 341 assert(!cig.ops[1].op.isQueryConsuming && cig.ops[1].op.isReferenceConsuming); 342 } 343 344 /// Range-based iteration of a Cigar string 345 /// Returns a range of Ops that is the length of all op lengths 346 /// e.g. if the Cigar is 4M5D2I4M 347 /// CigarItr will return a range of MMMMDDDDDIIMMMM 348 /// Range is of the Ops enum not chars 349 struct CigarItr 350 { 351 Cigar cigar; 352 CigarOp[] ops; 353 CigarOp current; 354 355 this(Cigar c) 356 { 357 // Copy the cigar 358 cigar = c; 359 ops = cigar[]; 360 current = ops[0]; 361 current.length = current.length - 1; 362 } 363 364 Ops front() 365 { 366 return current.op; 367 } 368 369 void popFront() 370 { 371 // import std.stdio; 372 // writeln(current); 373 // writeln(cigar); 374 if(ops.length == 1 && current.length == 0) 375 ops = []; 376 else if (current.length == 0) 377 { 378 ops = ops[1 .. $]; 379 current = ops[0]; 380 current.length = current.length - 1; 381 } 382 else if (current.length != 0) 383 current.length = current.length - 1; 384 } 385 386 bool empty() 387 { 388 return ops.length == 0; 389 } 390 } 391 392 debug (dhtslib_unittest) unittest 393 { 394 import std.algorithm : map; 395 396 writeln(); 397 hts_log_info(__FUNCTION__, "Testing CigarItr"); 398 string c = "7M2D4M"; 399 auto cig = cigarFromString(c); 400 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 401 auto itr = CigarItr(cig); 402 assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM"); 403 } 404 405 /// Coordinate pair representing aligned query and reference positions, 406 /// and the CIGAR op at or effecting their alignment. 407 struct AlignedCoordinate 408 { 409 Coordinate!(Basis.zero) qpos, rpos; 410 Ops cigar_op; 411 } 412 413 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string 414 struct AlignedCoordinatesItr 415 { 416 CigarItr itr; 417 AlignedCoordinate current; 418 419 this(Cigar cigar) 420 { 421 itr = CigarItr(cigar); 422 current.qpos = current.rpos = Coordinate!(Basis.zero)(-1); 423 current.cigar_op = itr.front; 424 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 425 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 426 } 427 428 /// InputRange interface 429 AlignedCoordinate front() 430 { 431 return current; 432 } 433 434 /// InputRange interface 435 void popFront() 436 { 437 itr.popFront; 438 current.cigar_op = itr.front; 439 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 440 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 441 } 442 443 /// InputRange interface 444 bool empty() 445 { 446 return itr.empty; 447 } 448 } 449 450 debug (dhtslib_unittest) unittest 451 { 452 writeln(); 453 import dhtslib.sam; 454 import htslib.hts_log; 455 import std.path : buildPath, dirName; 456 import std.algorithm : map; 457 import std.array : array; 458 459 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 460 hts_log_info(__FUNCTION__, "Testing cigar"); 461 hts_log_info(__FUNCTION__, "Loading test file"); 462 auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib", 463 "test", "range.bam"), 0); 464 auto readrange = bam["CHROMOSOME_I", ZB(914)]; 465 hts_log_info(__FUNCTION__, "Getting read 1"); 466 assert(readrange.empty == false); 467 auto read = readrange.front(); 468 auto coords = read.getAlignedCoordinates(77, 82); 469 assert(coords.map!(x => x.rpos).array == [77,78,79,80,81]); 470 }