1 /**
2 Module to deal with SAM records' `MD` auxillary tag.
3 
4 This tag is a string encoding mismatched and deleted reference bases,
5 used in conjunction with CIGAR and SEQ fields to reconstruct the bases
6 of the reference sequence interval to which the alignment has been mapped.
7 This can enable variant calling without requiring access to the entire original reference.
8 
9 "For example,  a string `10A5^AC6` means from the leftmost reference base in the alignment, there are 10 matches followed by an A on the reference which is different from the aligned read base; the next 5 reference bases are matches followed by a 2bp deletion from the reference; the deleted sequence is AC; the last 6 bases are matches."
10 
11 Reference: https://samtools.github.io/hts-specs/SAMtags.pdf
12 */
13 module dhtslib.sam.md;
14 
15 import dhtslib.sam.record : SAMRecord;
16 import dhtslib.sam.cigar;
17 import htslib.hts_log;
18 import std.regex;
19 import std.traits : ReturnType;
20 import std.conv : to;
21 
22 /// regex to extract MD string groups
23 /// ex: "11A3^G" -> [(11, "A"), (3, "^G")] 
24 auto MDREGEX = regex(`(\d+)([\^ATGCN]*)`);
25 
26 struct MDPair
27 {
28     int numMatches;
29     string __mismatch;
30 
31     string mismatch() const
32     {
33         if (isDel)
34             return __mismatch[1 .. $];
35         else
36             return __mismatch;
37     }
38 
39     bool isDel() const
40     {
41         return !isLast && __mismatch[0] == '^';
42     }
43 
44     bool isLast() const
45     {
46         return __mismatch.length == 0;
47     }
48 
49     invariant
50     {
51         if (__mismatch.length > 1)
52             assert(__mismatch[0] == '^');
53         if (__mismatch.length == 1)
54             assert(__mismatch[0] != '^');
55     }
56 
57     string toString()
58     {
59         return numMatches.to!string ~ __mismatch;
60     }
61 }
62 
63 /// (?) For SAM record `rec`, return ForwardRange over read's MD tag data
64 auto getMDPairs(SAMRecord rec)
65 {
66     struct MDPairs
67     {
68         string md_string;
69         ReturnType!generateMDmatches matches;
70 
71         this(string md)
72         {
73             md_string = md;
74             matches = generateMDmatches();
75         }
76 
77         auto generateMDmatches()
78         {
79             return matchAll(md_string, MDREGEX);
80         }
81 
82         MDPair front()
83         {
84             return MDPair(matches.front[1].to!int, matches.front[2]);
85         }
86 
87         bool empty()
88         {
89             return matches.empty;
90         }
91 
92         void popFront()
93         {
94             matches.popFront;
95         }
96 
97     }
98 
99     auto tag = rec["MD"];
100     import std.stdio;
101     debug if (!tag.exists)
102         hts_log_error(__FUNCTION__, "MD tag not present");
103     return MDPairs(tag.toString);
104 }
105 
106 /// (?) Iterator yielding mismatched base in query; or '=' if equal to reference
107 struct MDItr
108 {
109     ReturnType!getMDPairs mdPairs;
110     int currentlen;
111     string currentSeq;
112     bool isLast;
113 
114     this(SAMRecord rec)
115     {
116         mdPairs = getMDPairs(rec);
117         currentlen = mdPairs.front.numMatches;
118         currentSeq = mdPairs.front.mismatch;
119         isLast = mdPairs.front.isLast;
120         mdPairs.popFront;
121     }
122 
123     char front()
124     {
125         if (currentlen)
126             return '=';
127         else
128             return currentSeq[0];
129     }
130 
131     void popFront()
132     {
133         if (currentlen)
134             currentlen--;
135         else if (currentSeq.length > 1)
136             currentSeq = currentSeq[1 .. $];
137         else
138         {
139             currentlen = mdPairs.front.numMatches;
140             currentSeq = mdPairs.front.mismatch;
141             isLast = mdPairs.front.isLast;
142             mdPairs.popFront;
143         }
144     }
145 
146     bool empty()
147     {
148         return isLast && currentlen == 0;
149     }
150 
151 }
152 
153 debug (dhtslib_unittest) unittest
154 {
155     import std.stdio;
156     import dhtslib.sam;
157     import std.array : array;
158     import std.path : buildPath, dirName;
159 
160     auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
161             "test", "range.bam"), 0);
162     auto ar = bam.allRecords;
163     assert(ar.empty == false);
164     auto read = ar.front;
165     read["MD"] = "2G11^GATC7T6^A11";
166     assert(MDItr(read).array == "==G===========GATC=======T======A===========");
167 }