1 /**
Module provides a parser for SAM/BAM record auxiliary tags.
3 
4 Reference: https://samtools.github.io/hts-specs/SAMtags.pdf
5 */
6 module dhtslib.tagvalue;
7 
import std.conv : to;
import std.meta : AliasSeq, staticIndexOf;
import std.stdio;
import std.string : fromStringz;

import htslib.hts_log;
import htslib.sam : bam_aux_get, bam1_t, bam_aux2i;
14 
/// D types that correspond one-to-one to the tag type codes in `TypeChars`.
alias Types = AliasSeq!(byte, ubyte, short, ushort, int, uint, float, string, char);
/// Index of `T` within `Types`; -1 when `T` is not one of the listed types.
enum TypeIndex(T) = staticIndexOf!(T, Types);
/// Tag type codes, index-aligned with `Types` (`TypeChars[TypeIndex!T]` is the
/// code for `T`). Immutable because it is a pure lookup table, and sized from
/// `Types` so that the two lists cannot silently drift apart.
/// See https://samtools.github.io/hts-specs/SAMv1.pdf sec 1.5
immutable char[Types.length] TypeChars = ['c', 'C', 's', 'S', 'i', 'I', 'f', 'Z', 'A'];
19 
/**

This represents a SAM/BAM record tag value, as outlined in the SAM specs §1.5.

The struct itself stores only a pointer to the tag's type byte and has member
functions that reinterpret the bytes that follow it as one of the tag types.

NOTE: the `to!T` conversions do NOT verify that the stored type code matches
`T`; they reinterpret the bytes unconditionally. Call `check!T` first when the
tag type is not known in advance. `toInt`, `toIntArray` and `toFloatArray`
inspect the type code themselves.

Primary Types:
A   Printable character
i   Signed integer (see specs §1.5 footnote on size)
f   Single-precision float
Z   Printable string, including space
H   Byte array in the Hex format (network byte order / big-endian)
B   Integer or numeric array

Array (B) element types:
c   byte
C   ubyte
s   short
S   ushort
i   int32
I   uint32
f   float (spec does not indicate precision)

Memory layout
pipes delimit byte boundaries in an array
8/9 are example values
2 is a count of the array
the ubyte * starts at the type char
c | 8|
s |  | 8|
i |  |  |  | 8|
B |i |  |  |  | 2|  |  |  | 8|  |  |  | 9|


Alias seq allows us to have an enum of types.
https://forum.dlang.org/post/kmdjfzpugudmwfrdgson@forum.dlang.org
Thanks Paul!

Usage: auto t = TagValue(b, "XX") where b is bam1_t* BAM record and XX is tag
*/
struct TagValue
{
    /// Points at the tag's type byte; null when the tag was not found.
    private ubyte* data;

    /** Constructor

    Looks the tag up with `bam_aux_get`. In debug builds a warning is logged
    when the lookup returns null.

    Usage: auto t = TagValue(b, "XX") where b is bam1_t* BAM record and XX is tag
    */
    this(bam1_t* b, char[2] tag)
    {
        data = bam_aux_get(b, tag);
        debug
        {
            if (data is null)
                hts_log_warning(__FUNCTION__, (tag ~ " doesn't exist for this record").idup);
        }
    }

    /// check if empty/exists/null
    /// Returns: true when the tag pointer is non-null.
    @property
    bool exists() const
    {
        return data !is null;
    }

    /// Convert tag value to a string.
    /// Reads the NUL-terminated bytes after the type byte and returns a copy.
    /// The type code is not checked.
    string to(T : string)()
    {
        assert(this.data !is null);
        return fromStringz(cast(char*)&data[1]).idup;
    }
    /// Convert tag value to scalar type `T`.
    /// Reinterprets the `T.sizeof` bytes after the type byte as a `T`.
    /// The type code is not checked; use `check!T` first if unsure.
    T to(T)()
    {
        assert(this.data !is null);
        return *cast(T*) data[1 .. T.sizeof + 1].ptr;
    }
    /// Convert tag value to an array (instantiate as `to!(E[])`).
    /// Layout assumed: type byte, subtype byte, 4-byte element count, elements.
    /// The result is a slice over the tag's own bytes (no copy is made), so it
    /// refers to the same memory `data` points into.
    /// The type code is not checked; use `check!(E[])` first if unsure.
    T[] to(T : T[])()
    {
        assert(this.data !is null);
        // The count is an unsigned 32-bit value; reading it as unsigned avoids
        // a negative value being sign-extended into a huge slice length.
        const uint count = *cast(uint*)(data + 2);
        return (cast(T*)(data + 6))[0 .. count];
    }
    /// Check if tag type is type T
    bool check(T)()
    {
        assert(this.data !is null);
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }
    /// Check if tag type is type T
    ///
    /// Kept as a dedicated overload: `string` is itself an array type, so
    /// without this specialisation `check!string` would be captured by the
    /// `T[]` overload below and tested as a B-array.
    bool check(T : string)()
    {
        assert(this.data !is null);
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }
    /// Check if tag is a B-array whose element type is `T`
    /// (instantiate as `check!(E[])`).
    bool check(T : T[])()
    {
        assert(this.data !is null);
        return (cast(char) data[0] == 'B') && (TypeChars[TypeIndex!T] == cast(char) data[1]);
    }
    /// Convert tag value to string
    /// Returns: a copy of the string for 'Z' tags; "" for a null tag or any
    /// other tag type.
    string toString() const
    {
        if (data !is null && cast(char) data[0] == 'Z')
        {
            return fromStringz(cast(char*)&data[1]).idup;
        }
        return "";
    }
    /// Convert tag value to integer
    /// Returns: the value widened to long for any integer type code
    /// (c, C, s, S, i, I); `long.min` for every other type code.
    long toInt()
    {
        assert(this.data !is null);
        switch (cast(char) data[0])
        {
        case 'c':
            return cast(long)(to!byte);
        case 'C':
            return cast(long)(to!ubyte);
        case 's':
            return cast(long)(to!short);
        case 'S':
            return cast(long)(to!ushort);
        case 'i':
            return cast(long)(to!int);
        case 'I':
            return cast(long)(to!uint);
        default:
            return long.min;
        }
    }
    /// Convert tag value to integer array
    /// Returns: a newly allocated long[] for integer B-arrays; an empty array
    /// when the tag is not a B-array or its element type is not an integer.
    long[] toIntArray()
    {
        assert(this.data !is null);
        // data[1] is a subtype code only for B-arrays. For scalar tags it is
        // the first value byte and may coincidentally equal 'c', 'S', ...
        if (cast(char) data[0] != 'B')
            return [];
        switch (cast(char) data[1])
        {
        case 'c':
            return (to!(byte[]).to!(long[]));
        case 'C':
            return (to!(ubyte[]).to!(long[]));
        case 's':
            return (to!(short[]).to!(long[]));
        case 'S':
            return (to!(ushort[]).to!(long[]));
        case 'i':
            return (to!(int[]).to!(long[]));
        case 'I':
            return (to!(uint[]).to!(long[]));
        default:
            return [];
        }
    }
    /// Convert tag value to float array
    /// Returns: a slice over the tag's float elements for `B:f` tags; an empty
    /// array for any other tag type.
    float[] toFloatArray()
    {
        assert(this.data !is null);
        if (!check!(float[]))
            return [];
        return to!(float[]);
    }
}

debug (dhtslib_unittest) unittest
{
    // Regression: a scalar 'C' tag whose value byte is 'c' (99) must not be
    // interpreted as a B-array with subtype 'c'.
    ubyte[8] scalarTag;
    scalarTag[0] = cast(ubyte) 'C';
    scalarTag[1] = cast(ubyte) 'c';
    TagValue t;
    assert(!t.exists);
    t.data = scalarTag.ptr;
    assert(t.exists);
    assert(t.toInt == 99);
    assert(t.toIntArray.length == 0);
    assert(t.toFloatArray.length == 0);
}
184 
debug (dhtslib_unittest) unittest
{
    // Exercises TagValue against hand-built tag buffers (no BAM file needed).
    TagValue v;
    assert(!v.exists);
    assert(v.toString == "");

    // B-array of three ushort values: type byte, subtype byte, 4-byte count,
    // then the elements. The subtype must be 'S' (ushort) to agree with the
    // 2-byte elements written below and with the to!(ushort[]) read.
    ubyte[12] testdata;
    testdata[0] = cast(ubyte) 'B';
    testdata[1] = cast(ubyte) 'S';
    *cast(int*) testdata[2 .. 6].ptr = 3;
    // Low byte of each 2-byte element; high bytes stay zero
    // (assumes a little-endian host, as the original fixture did).
    testdata[6] = 1;
    testdata[8] = 2;
    testdata[10] = 3;
    v.data = testdata.ptr;
    writeln("testing array");
    assert(v.exists);
    assert(v.check!(ushort[]));
    assert(v.to!(ushort[]) == [1, 2, 3]);
    assert(v.toIntArray == [1, 2, 3]);

    // Scalar 32-bit signed integer tag: type byte followed by 4 value bytes.
    ubyte[5] testdata2;
    testdata2[0] = cast(ubyte) 'i';
    *cast(int*) testdata2[1 .. 5].ptr = 3;
    v.data = testdata2.ptr;
    writeln("testing int");
    assert(v.check!int);
    assert(v.to!int == 3);
    assert(v.toInt == 3);
}
205 
debug (dhtslib_unittest) unittest
{
    // End-to-end check of TagValue against the htslib fixture
    // "auxf#values.sam": scalar tags on the first record, array tags on the
    // second.
    import dhtslib.sam; // @suppress(dscanner.suspicious.local_imports)
    import htslib.hts_log : hts_log_info;
    import std.path : buildPath, dirName;

    hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
    hts_log_info(__FUNCTION__, "Testing tagvalue");
    hts_log_info(__FUNCTION__, "Loading test file");

    // The fixture path is resolved relative to this source file.
    string projectRoot = dirName(dirName(dirName(__FILE__)));
    string fixture = buildPath(projectRoot, "htslib", "test", "auxf#values.sam");
    auto samfile = SAMFile(fixture, 0);

    hts_log_info(__FUNCTION__, "Getting read 1");
    auto records = samfile.all_records(); // @suppress(dscanner.suspicious.unmodified)
    auto rec = records.front;

    // --- string ---
    hts_log_info(__FUNCTION__, "Testing string");
    assert(rec["RG"].to!string == "ID");

    // --- char ---
    hts_log_info(__FUNCTION__, "Testing char");
    assert(rec["A!"].to!char == '!');
    assert(rec["Ac"].to!char == 'c');
    assert(rec["AC"].to!char == 'C');

    // --- integers, read with the exact stored width ---
    hts_log_info(__FUNCTION__, "Testing int");
    assert(rec["I0"].to!ubyte == 0);
    assert(rec["I1"].to!ubyte == 1);
    assert(rec["I2"].to!ubyte == 127);
    assert(rec["I3"].to!ubyte == 128);
    assert(rec["I4"].to!ubyte == 255);
    assert(rec["I5"].to!ushort == 256);
    assert(rec["I6"].to!ushort == 32_767);
    assert(rec["I7"].to!ushort == 32_768);
    assert(rec["I8"].to!ushort == 65_535);
    assert(rec["I9"].to!uint == 65_536);
    assert(rec["IA"].to!uint == 2_147_483_647);
    assert(rec["i1"].to!byte == -1);
    assert(rec["i2"].to!byte == -127);
    assert(rec["i3"].to!byte == -128);
    assert(rec["i4"].to!short == -255);
    assert(rec["i5"].to!short == -256);
    assert(rec["i6"].to!short == -32_767);
    assert(rec["i7"].to!short == -32_768);
    assert(rec["i8"].to!int == -65_535);
    assert(rec["i9"].to!int == -65_536);
    assert(rec["iA"].to!int == -2_147_483_647);
    assert(rec["iB"].to!int == -2_147_483_648);

    // --- the same integers through the width-agnostic toInt ---
    assert(rec["I0"].toInt == 0);
    assert(rec["I1"].toInt == 1);
    assert(rec["I2"].toInt == 127);
    assert(rec["I3"].toInt == 128);
    assert(rec["I4"].toInt == 255);
    assert(rec["I5"].toInt == 256);
    assert(rec["I6"].toInt == 32_767);
    assert(rec["I7"].toInt == 32_768);
    assert(rec["I8"].toInt == 65_535);
    assert(rec["I9"].toInt == 65_536);
    assert(rec["IA"].toInt == 2_147_483_647);
    assert(rec["i1"].toInt == -1);
    assert(rec["i2"].toInt == -127);
    assert(rec["i3"].toInt == -128);
    assert(rec["i4"].toInt == -255);
    assert(rec["i5"].toInt == -256);
    assert(rec["i6"].toInt == -32_767);
    assert(rec["i7"].toInt == -32_768);
    assert(rec["i8"].toInt == -65_535);
    assert(rec["i9"].toInt == -65_536);
    assert(rec["iA"].toInt == -2_147_483_647);
    assert(rec["iB"].toInt == -2_147_483_648);

    // --- float ---
    hts_log_info(__FUNCTION__, "Testing float");
    assert(rec["F0"].to!float == -1.0);
    assert(rec["F1"].to!float == 0.0);
    assert(rec["F2"].to!float == 1.0);

    // --- scalar type codes ---
    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(rec["I0"].check!ubyte);
    assert(rec["I5"].check!ushort);
    assert(rec["I9"].check!uint);
    assert(rec["i1"].check!byte);
    assert(rec["i4"].check!short);
    assert(rec["i8"].check!int);
    assert(rec["F0"].check!float);

    // Advance to the second record, which carries the array tags.
    records.popFront;
    rec = records.front;

    // --- arrays, read with the exact element type ---
    hts_log_info(__FUNCTION__, "Testing arrays");
    assert(rec["Bs"].to!(short[]) == [-32_768, -32_767, 0, 32_767]);
    assert(rec["Bi"].to!(int[]) == [
            -2_147_483_648, -2_147_483_647, 0, 2_147_483_647
            ]);
    assert(rec["BS"].to!(ushort[]) == [0, 32_767, 32_768, 65_535]);
    assert(rec["BI"].to!(uint[]) == [
            0, 2_147_483_647, 2_147_483_648, 4_294_967_295
            ]);

    // --- the same arrays widened to long[] ---
    writeln(rec["Bs"].toIntArray);
    assert(rec["Bs"].toIntArray == [-32_768, -32_767, 0, 32_767]);
    assert(rec["Bi"].toIntArray == [
            -2_147_483_648, -2_147_483_647, 0, 2_147_483_647
            ]);
    assert(rec["BS"].toIntArray == [0, 32_767, 32_768, 65_535]);
    assert(rec["BI"].toIntArray == [
            0, 2_147_483_647, 2_147_483_648, 4_294_967_295
            ]);

    // --- array type codes ---
    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(rec["Bs"].check!(short[]));
    assert(rec["Bi"].check!(int[]));
    assert(rec["BS"].check!(ushort[]));
    assert(rec["BI"].check!(uint[]));
}