1 module dhtslib.md;
2 
3 import dhtslib.sam: SAMRecord;
4 import dhtslib.cigar;
5 import dhtslib.htslib.hts_log;
6 import std.regex;
7 import std.traits: ReturnType;
8 import std.conv: to;
9 
10 /// regex to extract MD string groups
11 /// ex: "11A3^G" -> [(11, "A"), (3, "^G")] 
12 auto MDREGEX = regex(`(\d+)([\^ATGCN]*)`);
13 
14 struct MDPair 
15 {
16     int numMatches;
17     string __mismatch;
18 
19     string mismatch() const {
20         if(isDel)
21             return __mismatch[1..$];
22         else
23             return __mismatch;
24     }
25     bool isDel() const { 
26         return !isLast && __mismatch[0]=='^';
27     }
28     
29     bool isLast() const { 
30         return __mismatch.length == 0; 
31     }
32 
33     invariant 
34     {
35         if(__mismatch.length > 1)
36             assert(__mismatch[0] == '^');
37         if(__mismatch.length == 1)
38             assert(__mismatch[0] != '^');
39     }
40     string toString(){
41         return numMatches.to!string~__mismatch;
42     }
43 }
44 
45 auto getMDPairs(SAMRecord rec){
46     struct MDPairs
47     {
48         string md_string;
49         ReturnType!generateMDmatches matches;
50 
51         this(string md){
52             md_string = md;
53             matches = generateMDmatches();
54         }
55         
56         auto generateMDmatches(){
57             return matchAll(md_string, MDREGEX);
58         }
59 
60         MDPair front(){
61             return MDPair(matches.front[1].to!int,matches.front[2]);
62         }
63 
64         bool empty(){
65             return matches.empty;
66         }
67 
68         void popFront(){
69             matches.popFront;
70         }
71 
72     }
73     auto tag = rec["MD"];
74     debug if(!tag.exists) hts_log_error(__FUNCTION__,"MD tag not present");
75     return MDPairs(tag.toString);
76 }
77 
78 struct MDItr {
79     ReturnType!getMDPairs mdPairs;
80     int currentlen;
81     string currentSeq;
82     bool isLast;
83 
84     this(SAMRecord rec){
85         mdPairs = getMDPairs(rec);
86         currentlen = mdPairs.front.numMatches;
87         currentSeq = mdPairs.front.mismatch;
88         isLast = mdPairs.front.isLast;
89         mdPairs.popFront;
90     }
91 
92     char front(){
93         if(currentlen) return '=';
94         else return currentSeq[0];
95     }
96 
97     void popFront(){
98         if(currentlen) currentlen--;
99         else if(currentSeq.length > 1) currentSeq = currentSeq[1..$];
100         else
101         {
102             currentlen = mdPairs.front.numMatches;
103             currentSeq = mdPairs.front.mismatch;
104             isLast = mdPairs.front.isLast;
105             mdPairs.popFront;
106         }
107     }
108 
109     bool empty(){
110         return isLast && currentlen == 0;
111     }
112 
113 }
114 
115 debug(dhtslib_unittest) 
116 unittest
117 {
118     import std.stdio;
119     import dhtslib.sam;
120     import std.array: array;
121     import std.path:buildPath,dirName;
122     auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))),"htslib","test","range.bam"), 0);
123     auto read=bam.all_records.front;
124     read["MD"] = "2G11^GATC7T6^A11";
125     assert(MDItr(read).array == "==G===========GATC=======T======A===========");
126 }