1 /**
2 Module to deal with SAM records' `MD` auxillary tag.
3 
4 This tag is a string encoding mismatched and deleted reference bases,
5 used in conjunction with CIGAR and SEQ fields to reconstruct the bases
6 of the reference sequence interval to which the alignment has been mapped.
7 This can enable variant calling without require access to the entire original reference.
8 
9 Reference: https://samtools.github.io/hts-specs/SAMtags.pdf
10 */
11 module dhtslib.md;
12 
13 import dhtslib.sam : SAMRecord;
14 import dhtslib.cigar;
15 import htslib.hts_log;
16 import std.regex;
17 import std.traits : ReturnType;
18 import std.conv : to;
19 
20 /// regex to extract MD string groups
21 /// ex: "11A3^G" -> [(11, "A"), (3, "^G")] 
22 auto MDREGEX = regex(`(\d+)([\^ATGCN]*)`);
23 
24 struct MDPair
25 {
26     int numMatches;
27     string __mismatch;
28 
29     string mismatch() const
30     {
31         if (isDel)
32             return __mismatch[1 .. $];
33         else
34             return __mismatch;
35     }
36 
37     bool isDel() const
38     {
39         return !isLast && __mismatch[0] == '^';
40     }
41 
42     bool isLast() const
43     {
44         return __mismatch.length == 0;
45     }
46 
47     invariant
48     {
49         if (__mismatch.length > 1)
50             assert(__mismatch[0] == '^');
51         if (__mismatch.length == 1)
52             assert(__mismatch[0] != '^');
53     }
54 
55     string toString()
56     {
57         return numMatches.to!string ~ __mismatch;
58     }
59 }
60 
61 auto getMDPairs(SAMRecord rec)
62 {
63     struct MDPairs
64     {
65         string md_string;
66         ReturnType!generateMDmatches matches;
67 
68         this(string md)
69         {
70             md_string = md;
71             matches = generateMDmatches();
72         }
73 
74         auto generateMDmatches()
75         {
76             return matchAll(md_string, MDREGEX);
77         }
78 
79         MDPair front()
80         {
81             return MDPair(matches.front[1].to!int, matches.front[2]);
82         }
83 
84         bool empty()
85         {
86             return matches.empty;
87         }
88 
89         void popFront()
90         {
91             matches.popFront;
92         }
93 
94     }
95 
96     auto tag = rec["MD"];
97     debug if (!tag.exists)
98         hts_log_error(__FUNCTION__, "MD tag not present");
99     return MDPairs(tag.toString);
100 }
101 
102 struct MDItr
103 {
104     ReturnType!getMDPairs mdPairs;
105     int currentlen;
106     string currentSeq;
107     bool isLast;
108 
109     this(SAMRecord rec)
110     {
111         mdPairs = getMDPairs(rec);
112         currentlen = mdPairs.front.numMatches;
113         currentSeq = mdPairs.front.mismatch;
114         isLast = mdPairs.front.isLast;
115         mdPairs.popFront;
116     }
117 
118     char front()
119     {
120         if (currentlen)
121             return '=';
122         else
123             return currentSeq[0];
124     }
125 
126     void popFront()
127     {
128         if (currentlen)
129             currentlen--;
130         else if (currentSeq.length > 1)
131             currentSeq = currentSeq[1 .. $];
132         else
133         {
134             currentlen = mdPairs.front.numMatches;
135             currentSeq = mdPairs.front.mismatch;
136             isLast = mdPairs.front.isLast;
137             mdPairs.popFront;
138         }
139     }
140 
141     bool empty()
142     {
143         return isLast && currentlen == 0;
144     }
145 
146 }
147 
148 debug (dhtslib_unittest) unittest
149 {
150     import std.stdio;
151     import dhtslib.sam;
152     import std.array : array;
153     import std.path : buildPath, dirName;
154 
155     auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))), "htslib",
156             "test", "range.bam"), 0);
157     auto read = bam.all_records.front;
158     read["MD"] = "2G11^GATC7T6^A11";
159     assert(MDItr(read).array == "==G===========GATC=======T======A===========");
160 }