1 /** 2 Module to deal with SAM records' `MD` auxillary tag. 3 4 This tag is a string encoding mismatched and deleted reference bases, 5 used in conjunction with CIGAR and SEQ fields to reconstruct the bases 6 of the reference sequence interval to which the alignment has been mapped. 7 This can enable variant calling without require access to the entire original reference. 8 9 Reference: https://samtools.github.io/hts-specs/SAMtags.pdf 10 */ 11 module dhtslib.md; 12 13 import dhtslib.sam : SAMRecord; 14 import dhtslib.cigar; 15 import htslib.hts_log; 16 import std.regex; 17 import std.traits : ReturnType; 18 import std.conv : to; 19 20 /// regex to extract MD string groups 21 /// ex: "11A3^G" -> [(11, "A"), (3, "^G")] 22 auto MDREGEX = regex(`(\d+)([\^ATGCN]*)`); 23 24 struct MDPair 25 { 26 int numMatches; 27 string __mismatch; 28 29 string mismatch() const 30 { 31 if (isDel) 32 return __mismatch[1 .. $]; 33 else 34 return __mismatch; 35 } 36 37 bool isDel() const 38 { 39 return !isLast && __mismatch[0] == '^'; 40 } 41 42 bool isLast() const 43 { 44 return __mismatch.length == 0; 45 } 46 47 invariant 48 { 49 if (__mismatch.length > 1) 50 assert(__mismatch[0] == '^'); 51 if (__mismatch.length == 1) 52 assert(__mismatch[0] != '^'); 53 } 54 55 string toString() 56 { 57 return numMatches.to!string ~ __mismatch; 58 } 59 } 60 61 auto getMDPairs(SAMRecord rec) 62 { 63 struct MDPairs 64 { 65 string md_string; 66 ReturnType!generateMDmatches matches; 67 68 this(string md) 69 { 70 md_string = md; 71 matches = generateMDmatches(); 72 } 73 74 auto generateMDmatches() 75 { 76 return matchAll(md_string, MDREGEX); 77 } 78 79 MDPair front() 80 { 81 return MDPair(matches.front[1].to!int, matches.front[2]); 82 } 83 84 bool empty() 85 { 86 return matches.empty; 87 } 88 89 void popFront() 90 { 91 matches.popFront; 92 } 93 94 } 95 96 auto tag = rec["MD"]; 97 debug if (!tag.exists) 98 hts_log_error(__FUNCTION__, "MD tag not present"); 99 return MDPairs(tag.toString); 100 } 101 102 struct MDItr 103 { 104 ReturnType!getMDPairs mdPairs; 105 int currentlen; 106 string currentSeq; 107 bool isLast; 108 109 this(SAMRecord rec) 110 { 111 mdPairs = getMDPairs(rec); 112 currentlen = mdPairs.front.numMatches; 113 currentSeq = mdPairs.front.mismatch; 114 isLast = mdPairs.front.isLast; 115 mdPairs.popFront; 116 } 117 118 char front() 119 { 120 if (currentlen) 121 return '='; 122 else 123 return currentSeq[0]; 124 } 125 126 void popFront() 127 { 128 if (currentlen) 129 currentlen--; 130 else if (currentSeq.length > 1) 131 currentSeq = currentSeq[1 .. $]; 132 else 133 { 134 currentlen = mdPairs.front.numMatches; 135 currentSeq = mdPairs.front.mismatch; 136 isLast = mdPairs.front.isLast; 137 mdPairs.popFront; 138 } 139 } 140 141 bool empty() 142 { 143 return isLast && currentlen == 0; 144 } 145 146 } 147 148 debug (dhtslib_unittest) unittest 149 { 150 import std.stdio; 151 import dhtslib.sam; 152 import std.array : array; 153 import std.path : buildPath, dirName; 154 155 auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))), "htslib", 156 "test", "range.bam"), 0); 157 auto read = bam.all_records.front; 158 read["MD"] = "2G11^GATC7T6^A11"; 159 assert(MDItr(read).array == "==G===========GATC=======T======A==========="); 160 }