1 /** 2 Module to deal with SAM records' `MD` auxillary tag. 3 4 This tag is a string encoding mismatched and deleted reference bases, 5 used in conjunction with CIGAR and SEQ fields to reconstruct the bases 6 of the reference sequence interval to which the alignment has been mapped. 7 This can enable variant calling without requiring access to the entire original reference. 8 9 "For example, a string `10A5^AC6` means from the leftmost reference base in the alignment, there are 10 matches followed by an A on the reference which is different from the aligned read base; the next 5 reference bases are matches followed by a 2bp deletion from the reference; the deleted sequence is AC; the last 6 bases are matches." 10 11 Reference: https://samtools.github.io/hts-specs/SAMtags.pdf 12 */ 13 module dhtslib.sam.md; 14 15 import dhtslib.sam.record : SAMRecord; 16 import dhtslib.sam.cigar; 17 import htslib.hts_log; 18 import std.regex; 19 import std.traits : ReturnType; 20 import std.conv : to; 21 22 /// regex to extract MD string groups 23 /// ex: "11A3^G" -> [(11, "A"), (3, "^G")] 24 auto MDREGEX = regex(`(\d+)([\^ATGCN]*)`); 25 26 struct MDPair 27 { 28 int numMatches; 29 string __mismatch; 30 31 string mismatch() const 32 { 33 if (isDel) 34 return __mismatch[1 .. $]; 35 else 36 return __mismatch; 37 } 38 39 bool isDel() const 40 { 41 return !isLast && __mismatch[0] == '^'; 42 } 43 44 bool isLast() const 45 { 46 return __mismatch.length == 0; 47 } 48 49 invariant 50 { 51 if (__mismatch.length > 1) 52 assert(__mismatch[0] == '^'); 53 if (__mismatch.length == 1) 54 assert(__mismatch[0] != '^'); 55 } 56 57 string toString() 58 { 59 return numMatches.to!string ~ __mismatch; 60 } 61 } 62 63 /// (?) For SAM record `rec`, return ForwardRange over read's MD tag data 64 auto getMDPairs(SAMRecord rec) 65 { 66 struct MDPairs 67 { 68 string md_string; 69 ReturnType!generateMDmatches matches; 70 71 this(string md) 72 { 73 md_string = md; 74 matches = generateMDmatches(); 75 } 76 77 auto generateMDmatches() 78 { 79 return matchAll(md_string, MDREGEX); 80 } 81 82 MDPair front() 83 { 84 return MDPair(matches.front[1].to!int, matches.front[2]); 85 } 86 87 bool empty() 88 { 89 return matches.empty; 90 } 91 92 void popFront() 93 { 94 matches.popFront; 95 } 96 97 } 98 99 auto tag = rec["MD"]; 100 import std.stdio; 101 debug if (!tag.exists) 102 hts_log_error(__FUNCTION__, "MD tag not present"); 103 return MDPairs(tag.toString); 104 } 105 106 /// (?) Iterator yielding mismatched base in query; or '=' if equal to reference 107 struct MDItr 108 { 109 ReturnType!getMDPairs mdPairs; 110 int currentlen; 111 string currentSeq; 112 bool isLast; 113 114 this(SAMRecord rec) 115 { 116 mdPairs = getMDPairs(rec); 117 currentlen = mdPairs.front.numMatches; 118 currentSeq = mdPairs.front.mismatch; 119 isLast = mdPairs.front.isLast; 120 mdPairs.popFront; 121 } 122 123 char front() 124 { 125 if (currentlen) 126 return '='; 127 else 128 return currentSeq[0]; 129 } 130 131 void popFront() 132 { 133 if (currentlen) 134 currentlen--; 135 else if (currentSeq.length > 1) 136 currentSeq = currentSeq[1 .. $]; 137 else 138 { 139 currentlen = mdPairs.front.numMatches; 140 currentSeq = mdPairs.front.mismatch; 141 isLast = mdPairs.front.isLast; 142 mdPairs.popFront; 143 } 144 } 145 146 bool empty() 147 { 148 return isLast && currentlen == 0; 149 } 150 151 } 152 153 debug (dhtslib_unittest) unittest 154 { 155 import std.stdio; 156 import dhtslib.sam; 157 import std.array : array; 158 import std.path : buildPath, dirName; 159 160 auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib", 161 "test", "range.bam"), 0); 162 auto ar = bam.allRecords; 163 assert(ar.empty == false); 164 auto read = ar.front; 165 read["MD"] = "2G11^GATC7T6^A11"; 166 assert(MDItr(read).array == "==G===========GATC=======T======A==========="); 167 }