1 module dhtslib.md; 2 3 import dhtslib.sam: SAMRecord; 4 import dhtslib.cigar; 5 import dhtslib.htslib.hts_log; 6 import std.regex; 7 import std.traits: ReturnType; 8 import std.conv: to; 9 10 /// regex to extract MD string groups 11 /// ex: "11A3^G" -> [(11, "A"), (3, "^G")] 12 auto MDREGEX = regex(`(\d+)([\^ATGCN]*)`); 13 14 struct MDPair 15 { 16 int numMatches; 17 string __mismatch; 18 19 string mismatch() const { 20 if(isDel) 21 return __mismatch[1..$]; 22 else 23 return __mismatch; 24 } 25 bool isDel() const { 26 return !isLast && __mismatch[0]=='^'; 27 } 28 29 bool isLast() const { 30 return __mismatch.length == 0; 31 } 32 33 invariant 34 { 35 if(__mismatch.length > 1) 36 assert(__mismatch[0] == '^'); 37 if(__mismatch.length == 1) 38 assert(__mismatch[0] != '^'); 39 } 40 string toString(){ 41 return numMatches.to!string~__mismatch; 42 } 43 } 44 45 auto getMDPairs(SAMRecord rec){ 46 struct MDPairs 47 { 48 string md_string; 49 ReturnType!generateMDmatches matches; 50 51 this(string md){ 52 md_string = md; 53 matches = generateMDmatches(); 54 } 55 56 auto generateMDmatches(){ 57 return matchAll(md_string, MDREGEX); 58 } 59 60 MDPair front(){ 61 return MDPair(matches.front[1].to!int,matches.front[2]); 62 } 63 64 bool empty(){ 65 return matches.empty; 66 } 67 68 void popFront(){ 69 matches.popFront; 70 } 71 72 } 73 auto tag = rec["MD"]; 74 debug if(!tag.exists) hts_log_error(__FUNCTION__,"MD tag not present"); 75 return MDPairs(tag.toString); 76 } 77 78 struct MDItr { 79 ReturnType!getMDPairs mdPairs; 80 int currentlen; 81 string currentSeq; 82 bool isLast; 83 84 this(SAMRecord rec){ 85 mdPairs = getMDPairs(rec); 86 currentlen = mdPairs.front.numMatches; 87 currentSeq = mdPairs.front.mismatch; 88 isLast = mdPairs.front.isLast; 89 mdPairs.popFront; 90 } 91 92 char front(){ 93 if(currentlen) return '='; 94 else return currentSeq[0]; 95 } 96 97 void popFront(){ 98 if(currentlen) currentlen--; 99 else if(currentSeq.length > 1) currentSeq = currentSeq[1..$]; 100 else 101 { 102 currentlen = mdPairs.front.numMatches; 103 currentSeq = mdPairs.front.mismatch; 104 isLast = mdPairs.front.isLast; 105 mdPairs.popFront; 106 } 107 } 108 109 bool empty(){ 110 return isLast && currentlen == 0; 111 } 112 113 } 114 115 debug(dhtslib_unittest) 116 unittest 117 { 118 import std.stdio; 119 import dhtslib.sam; 120 import std.array: array; 121 import std.path:buildPath,dirName; 122 auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","range.bam"), 0); 123 auto read=bam.all_records.front; 124 read["MD"] = "2G11^GATC7T6^A11"; 125 assert(MDItr(read).array == "==G===========GATC=======T======A==========="); 126 }