1 /** 2 This module simplifies working with CIGAR strings/ops from SAM/BAM/CRAM alignment records. 3 */ 4 module dhtslib.sam.cigar; 5 6 import std.stdio; 7 import std.bitmanip : bitfields; 8 import std.array : join; 9 import std.algorithm : map, each, sum; 10 import std.conv : to; 11 import std.range : array; 12 import std.traits : isIntegral; 13 14 import htslib.hts_log; 15 import dhtslib.sam.record : SAMRecord; 16 import dhtslib.coordinates; 17 import dhtslib.memory; 18 import htslib.sam : bam_get_cigar; 19 20 21 /** 22 Represents all cigar ops 23 as a friendlier alternative to BAM_C htslib enums 24 */ 25 enum Ops 26 { 27 MATCH = 0, 28 INS = 1, 29 DEL = 2, 30 REF_SKIP = 3, 31 SOFT_CLIP = 4, 32 HARD_CLIP = 5, 33 PAD = 6, 34 EQUAL = 7, 35 DIFF = 8, 36 BACK = 9 37 } 38 39 /** 40 string to aid in conversion from CigarOp to string 41 */ 42 static immutable string CIGAR_STR = "MIDNSHP=XB"; 43 44 /// Credit to Biod for this code below 45 /// https://github.com/biod/BioD from their bam.cigar module 46 /** 47 uint to aid in identifying if a cigar op is query consuming 48 or reference consuming. 49 50 Each pair of bits has first bit set iff the operation is query consuming, 51 and second bit set iff it is reference consuming. 52 X = P H S N D I M 53 */ 54 private static immutable uint CIGAR_TYPE = 0b11_11_00_00_01_10_10_01_11; 55 56 /// True iff operation is one of M, =, X, I, S 57 bool isQueryConsuming(T)(T op) nothrow @nogc 58 { 59 return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 1) != 0; 60 } 61 62 /// True iff operation is one of M, =, X, D, N 63 bool isReferenceConsuming(T)(T op)nothrow @nogc 64 { 65 return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 2) != 0; 66 } 67 68 /// True iff operation is one of M, =, X 69 bool isMatchOrMismatch(T)(T op) nothrow @nogc 70 { 71 return ((CIGAR_TYPE >> ((op & 0xF) * 2)) & 3) == 3; 72 } 73 74 /// True iff operation is one of 'S', 'H' 75 bool isClipping(T)(T op)nothrow @nogc 76 { 77 return ((op & 0xF) >> 1) == 2; // 4 or 5 78 } 79 80 /// Represents a singular cigar operation 81 /// as a uint 82 union CigarOp 83 { 84 /// raw opcode 85 uint raw; 86 87 mixin(bitfields!( //lower 4 bits store op 88 Ops, "op", 4, //higher 28 bits store length 89 uint, "length", 28)); 90 91 /// construct Op from raw opcode 92 this(uint raw) 93 { 94 this.raw = raw; 95 } 96 97 /// construct Op from an operator and operand (length) 98 this(uint len, Ops op) 99 { 100 this.op = op; 101 this.length = len; 102 } 103 104 string toString() const 105 { 106 return this.length.to!string ~ CIGAR_STR[this.op]; 107 } 108 } 109 110 /** 111 Represents a CIGAR string as an array of CigarOps. 112 113 https://samtools.github.io/hts-specs/SAMv1.pdf §1.4.6 114 115 In many cases will represent a slice or reference to 116 underlying cigar data in the bam record 117 UNLESS it is copied with .dup or one of the assignment 118 methods. 119 */ 120 struct Cigar 121 { 122 /// array of distinct CIGAR ops 123 /// private now to force fixing some reference issues 124 private CigarOp[] ops; 125 private Bam1 b; 126 /// Construct Cigar from SAMRecord 127 this(Bam1 b) 128 { 129 this.b = b; 130 auto cigar = bam_get_cigar(this.b); 131 this(cigar, this.b.core.n_cigar); 132 } 133 134 /// Construct Cigar from raw data 135 this(uint* cigar, uint length) 136 { 137 this((cast(CigarOp*) cigar)[0 .. length]); 138 } 139 140 /// Construct Cigar from an array of CIGAR ops 141 this(CigarOp[] ops) 142 { 143 this.ops = ops; 144 } 145 146 /// null CIGAR -- don't read! 147 bool is_null() 148 { 149 return this.ops.length == 0; 150 } 151 152 /// Format Cigar struct as CIGAR string in accordance with SAM spec 153 string toString() const 154 { 155 return ops.map!(x => x.toString).array.join; 156 } 157 158 /// get length of cigar 159 auto length(){ 160 return this.ops.length; 161 } 162 163 /// set length of cigar 164 void length(T)(T len) 165 if(isIntegral!T) 166 { 167 this.ops.length = len; 168 } 169 170 /// copy cigar 171 auto const dup(){ 172 return Cigar(this.ops.dup); 173 } 174 175 /// return the alignment length expressed by this Cigar 176 @property int refBasesCovered() 177 { 178 int len; 179 foreach (op; this.ops) 180 { 181 // look ma no branching (ignore the above foreach) 182 // add op.length if reference consuming 183 // mask is 0 if not reference consuming 184 // mask is -1 if reference consuming 185 len += op.length & (uint.init - (((CIGAR_TYPE >> ((op.op & 0xF) << 1)) & 2) >> 1)); 186 } 187 return len; 188 } 189 190 deprecated("Use camelCase names instead") 191 alias ref_bases_covered = refBasesCovered; 192 /// previous alignedLength function had a bug and 193 /// it is just a duplicate of ref_bases_covered 194 alias alignedLength = refBasesCovered; 195 196 /// allow foreach on Cigar 197 int opApply(int delegate(ref CigarOp) dg) { 198 int result = 0; 199 200 foreach (op; ops) { 201 result = dg(op); 202 203 if (result) { 204 break; 205 } 206 } 207 208 return result; 209 } 210 211 /// Get a Slice of cigar ops from a range in the Cigar string 212 auto opSlice() 213 { 214 return ops; 215 } 216 217 /// Get a Slice of cigar ops from a range in the Cigar string 218 auto opSlice(T)(T start, T end) if (isIntegral!T) 219 { 220 return ops[start .. end]; 221 } 222 223 /// Get a cigar op from a single position in the Cigar string 224 ref auto opIndex(ulong i) 225 { 226 return ops[i]; 227 } 228 229 /// Assign a cigar op at a single position in the Cigar string 230 auto opIndexAssign(CigarOp value, size_t index) 231 { 232 return ops[index] = value; 233 } 234 235 /// Assign a range cigar ops over a range in the Cigar string 236 auto opSliceAssign(T)(CigarOp[] values, T start, T end) 237 { 238 assert(end - start == values.length); 239 return ops[start .. end] = values; 240 } 241 242 /// Assign a Cigar with a CigarOp slice 243 /// Note: this is not a deep copy 244 auto opAssign(T: CigarOp[])(T value) 245 { 246 this.ops = value; 247 return this; 248 } 249 250 /// Assign a Cigar with a Cigar 251 /// Note: this is not a deep copy 252 auto opAssign(T: Cigar)(T value) 253 { 254 this.ops = value.ops; 255 return this; 256 } 257 258 /// use $ with Cigar slicing 259 size_t opDollar() 260 { 261 return length; 262 } 263 264 /// combine Cigars 265 auto opBinary(string op)(const Cigar rhs) const 266 if(op == "~") 267 { 268 return Cigar(this.dup[] ~ rhs.dup[]); 269 } 270 } 271 272 273 debug (dhtslib_unittest) unittest 274 { 275 writeln(); 276 import dhtslib.sam; 277 import htslib.hts_log; 278 import std.path : buildPath, dirName; 279 280 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 281 hts_log_info(__FUNCTION__, "Testing cigar"); 282 hts_log_info(__FUNCTION__, "Loading test file"); 283 auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib", 284 "test", "range.bam"), 0); 285 auto readrange = bam["CHROMOSOME_I", ZB(914)]; 286 hts_log_info(__FUNCTION__, "Getting read 1"); 287 assert(readrange.empty == false); 288 auto read = readrange.front(); 289 auto cigar = read.cigar; 290 writeln(read.queryName); 291 hts_log_info(__FUNCTION__, "Cigar:" ~ cigar.toString()); 292 writeln(cigar.toString()); 293 assert(cigar.toString() == "78M1D22M"); 294 assert(cigar[0] == CigarOp(78, Ops.MATCH)); 295 assert(Cigar(cigar[0 .. 2]).toString == "78M1D"); 296 cigar[0] = CigarOp(4,Ops.HARD_CLIP); 297 cigar[1..3] = [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]; 298 assert(cigar[1 .. $] == [CigarOp(2, Ops.INS), CigarOp(21, Ops.MATCH)]); 299 assert(cigar.toString() == "4H2I21M"); 300 } 301 302 /// return Cigar struct for a given CIGAR string (e.g. from SAM line) 303 Cigar cigarFromString(string cigar) 304 { 305 import std.regex; 306 307 return Cigar(match(cigar, regex(`(\d+)([A-Z=])`, "g")).map!(m => CigarOp(m[1].to!uint, 308 m[2].to!char.charToOp)).array); 309 } 310 311 /// Convert single char representing a CIGAR Op to Op union 312 Ops charToOp(char c) 313 { 314 foreach (i, o; CIGAR_STR) 315 { 316 if (c == o) 317 { 318 return cast(Ops) i; 319 } 320 } 321 return cast(Ops) 9; 322 } 323 324 debug (dhtslib_unittest) unittest 325 { 326 writeln(); 327 hts_log_info(__FUNCTION__, "Testing is_query_consuming and is_reference_consuming"); 328 string c = "130M2D40M"; 329 auto cig = cigarFromString(c); 330 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 331 assert(cig.toString() == c); 332 assert(cig.ops[0].op.isQueryConsuming && cig.ops[0].op.isReferenceConsuming); 333 assert(!cig.ops[1].op.isQueryConsuming && cig.ops[1].op.isReferenceConsuming); 334 } 335 336 /// Range-based iteration of a Cigar string 337 /// Returns a range of Ops that is the length of all op lengths 338 /// e.g. if the Cigar is 4M5D2I4M 339 /// CigarItr will return a range of MMMMDDDDDIIMMMM 340 /// Range is of the Ops enum not chars 341 struct CigarItr 342 { 343 Cigar cigar; 344 CigarOp[] ops; 345 CigarOp current; 346 347 this(Cigar c) 348 { 349 // Copy the cigar 350 cigar = c; 351 ops = cigar[]; 352 current = ops[0]; 353 current.length = current.length - 1; 354 } 355 356 Ops front() 357 { 358 return current.op; 359 } 360 361 void popFront() 362 { 363 // import std.stdio; 364 // writeln(current); 365 // writeln(cigar); 366 if(ops.length == 1 && current.length == 0) 367 ops = []; 368 else if (current.length == 0) 369 { 370 ops = ops[1 .. $]; 371 current = ops[0]; 372 current.length = current.length - 1; 373 } 374 else if (current.length != 0) 375 current.length = current.length - 1; 376 } 377 378 bool empty() 379 { 380 return ops.length == 0; 381 } 382 } 383 384 debug (dhtslib_unittest) unittest 385 { 386 import std.algorithm : map; 387 388 writeln(); 389 hts_log_info(__FUNCTION__, "Testing CigarItr"); 390 string c = "7M2D4M"; 391 auto cig = cigarFromString(c); 392 hts_log_info(__FUNCTION__, "Cigar:" ~ cig.toString()); 393 auto itr = CigarItr(cig); 394 assert(itr.map!(x => CIGAR_STR[x]).array.idup == "MMMMMMMDDMMMM"); 395 } 396 397 /// Coordinate pair representing aligned query and reference positions, 398 /// and the CIGAR op at or effecting their alignment. 399 struct AlignedCoordinate 400 { 401 Coordinate!(Basis.zero) qpos, rpos; 402 Ops cigar_op; 403 } 404 405 /// Iterator yielding all AlignedCoordinate pairs for a given CIGAR string 406 struct AlignedCoordinatesItr 407 { 408 CigarItr itr; 409 AlignedCoordinate current; 410 411 this(Cigar cigar) 412 { 413 itr = CigarItr(cigar); 414 current.qpos = current.rpos = Coordinate!(Basis.zero)(-1); 415 current.cigar_op = itr.front; 416 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 417 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 418 } 419 420 /// InputRange interface 421 AlignedCoordinate front() 422 { 423 return current; 424 } 425 426 /// InputRange interface 427 void popFront() 428 { 429 itr.popFront; 430 current.cigar_op = itr.front; 431 current.qpos += ((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 1); 432 current.rpos += (((CIGAR_TYPE >> ((current.cigar_op & 0xF) << 1)) & 2) >> 1); 433 } 434 435 /// InputRange interface 436 bool empty() 437 { 438 return itr.empty; 439 } 440 } 441 442 debug (dhtslib_unittest) unittest 443 { 444 writeln(); 445 import dhtslib.sam; 446 import htslib.hts_log; 447 import std.path : buildPath, dirName; 448 import std.algorithm : map; 449 import std.array : array; 450 451 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE); 452 hts_log_info(__FUNCTION__, "Testing cigar"); 453 hts_log_info(__FUNCTION__, "Loading test file"); 454 auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib", 455 "test", "range.bam"), 0); 456 auto readrange = bam["CHROMOSOME_I", ZB(914)]; 457 hts_log_info(__FUNCTION__, "Getting read 1"); 458 assert(readrange.empty == false); 459 auto read = readrange.front(); 460 auto coords = read.getAlignedCoordinates(77, 82); 461 assert(coords.map!(x => x.rpos).array == [77,78,79,80,81]); 462 }