1 /**
Module provides a parser for SAM/BAM record auxiliary tags.
3 
4 Reference: https://samtools.github.io/hts-specs/SAMtags.pdf
5 */
6 module dhtslib.sam.tagvalue;
7 
8 import std.stdio;
9 import std.meta : AliasSeq, staticIndexOf;
10 import std.string : fromStringz;
11 import htslib.sam : bam_aux_get, bam1_t, bam_aux2i;
12 import htslib.hts_log;
13 import std.conv : to;
14 import std.exception : enforce, assertThrown;
15 import std.math : approxEqual;
16 import dhtslib.memory;
17 
/// D types that a tag value can be parsed into. The position of each type in
/// this sequence is the index of its SAM/BAM type character in `TypeChars`.
alias Types = AliasSeq!(byte, ubyte, short, ushort, int, uint, float, string, char);
/// Index of `T` within `Types`, or -1 when `T` is not a supported tag type.
enum TypeIndex(T) = staticIndexOf!(T, Types);
/// SAM/BAM type characters, in the same order as `Types`.
/// See https://samtools.github.io/hts-specs/SAMv1.pdf sec 1.5
///
/// Declared immutable: this is a read-only lookup table and must never be
/// modified at runtime.
immutable char[9] TypeChars = ['c', 'C', 's', 'S', 'i', 'I', 'f', 'Z', 'A'];

// Every entry in Types needs exactly one type character.
static assert(TypeChars.length == Types.length,
        "Types and TypeChars must have the same number of entries");
22 
23 /**
24 
25 This represents a SAM/BAM record tag value, as outlined in the SAM specs §1.5.
26 
The struct stores a pointer to the tag data together with the record it was
looked up in, and has member functions to parse the value into a D type.
Each conversion enforces that the stored tag type matches the requested type.
29 
30 Primary Types:
31 A   Printable character
32 i   Signed integer (see specs §1.5 footnote on size)
33 f   Single-precision float
34 Z   Printable string, including space
35 H   Byte array in the Hex format (network byte order / big-endian) //unknown if still supported
36 B   Integer or numeric array
37 
38 Byte-array (B) types:
39 c   byte
40 C   ubyte
41 s   short
42 S   ushort
43 i   int32
44 I   uint32
45 f   float (spec does not indicate precision)
46 
47 Memory layout
48 pipes delimit byte boundaries in an array
49 8/9 are example values
50 2 is a count of the array
51 the ubyte * starts at the type char
52 c | 8|
53 s |  | 8|
54 i |  |  |  | 8|
55 B |i |  |  |  | 2|  |  |  | 8|  |  |  | 9|
56 
57 
58 Alias seq allows us to have an enum of types.
59 https://forum.dlang.org/post/kmdjfzpugudmwfrdgson@forum.dlang.org
60 Thanks Paul!
61 
Usage: auto t = TagValue(b, "XX") where b is a Bam1 BAM record and XX is the two-character tag name
63 */
struct TagValue
{

    /// Pointer to the tag's type character; the value bytes follow it.
    /// Null when the lookup found no such tag or the struct was default-constructed.
    private ubyte* data;

    /// The record the tag was looked up in.
    /// NOTE(review): presumably held so the record's memory outlives `data` -- confirm against dhtslib.memory.
    private Bam1 b;

    /** Constructor

    Looks up `tag` in record `b` via `bam_aux_get` and stores the resulting pointer.

    Usage: auto t = TagValue(b, "XX") where b is a Bam1 BAM record and XX is the tag name

    Params:
        b   = record to search
        tag = two-character tag name
    */
    this(Bam1 b, char[2] tag)
    {
        this.b = b;
        data = bam_aux_get(b, tag);
    }

    /// True when the tag was found, i.e. the data pointer is not null
    @property
    bool exists()
    {
        return this.data !is null;
    }

    /* Tag type checking */

    /// Check if the tag's type character is the one associated with scalar type T
    /// Throws: Exception if the tag doesn't exist
    bool check(T)()
    {
        enforce(this.exists,"Tag doesn't exist");
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }
    /// Check if the tag is a string ('Z') tag.
    /// Same body as the generic overload; kept as an explicit specialization,
    /// presumably so that `string` (itself an array type) is not matched by the
    /// `T[]` overload below.
    /// Throws: Exception if the tag doesn't exist
    bool check(T : string)()
    {
        enforce(this.exists,"Tag doesn't exist");
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }
    /// Check if the tag is a 'B' array whose element type is T
    /// (instantiate as `check!(T[])`; inside the template T is the element type)
    /// Throws: Exception if the tag doesn't exist
    bool check(T : T[])()
    {
        enforce(this.exists,"Tag doesn't exist");
        return (cast(char) data[0] == 'B') && (TypeChars[TypeIndex!T] == cast(char) data[1]);
    }

    /// Check if the tag is a 'B' array of any element type
    /// Throws: Exception if the tag doesn't exist
    bool checkArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        return cast(char) data[0] == 'B';
    }

    /// Check if the tag is a hex byte array ('H')
    /// Throws: Exception if the tag doesn't exist
    bool checkHexByteArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        return cast(char) data[0] == 'H';
    }

    /* Tag conversion */

    /// Convert a 'Z' or 'H' tag value to a D string (a copy of the
    /// NUL-terminated text that follows the type character)
    /// Throws: Exception if the tag doesn't exist or is not type Z or H
    string to(T : string)()
    {
        enforce(this.check!string || this.checkHexByteArray,"Tag is not type Z or H");
        return fromStringz(cast(char*)&data[1]).idup;
    }
    /// Convert a scalar tag value to D type T
    /// Throws: Exception if the tag doesn't exist or its type is not T
    T to(T)()
    {
        enforce(this.check!T,"Tag is not type " ~ T.stringof);
        // The value starts right after the type character and is read in
        // host byte order.
        return *cast(T*) data[1 .. T.sizeof + 1].ptr;
    }
    /// Convert an array tag value to a D array.
    /// The returned slice refers to the tag's own bytes; it is not a copy.
    /// Throws: Exception if the tag doesn't exist or is not a 'B' array of T
    T[] to(T : T[])()
    {
        enforce(this.check!(T[]),"Tag is not type " ~ T.stringof ~ "[]");
        // Layout: 'B' | element type | 32-bit element count | elements.
        // The count is unsigned in the BAM spec; reading it as a signed int
        // would turn a large count into a bogus slice length.
        uint n = *cast(uint*) data[2 .. 6].ptr;
        return (cast(T*)(data + 6))[0 .. n];
    }
    

    /// Convert any tag value to string
    /// Throws: Exception if the tag doesn't exist or has an unknown type character
    string toString()
    {
        enforce(this.exists,"Tag doesn't exist");
        switch (cast(char) data[0])
        {
        case 'A':
            // Character tags were previously missing here and were reported as malformed.
            return to!char.to!string;
        case 'c':
            return to!byte.to!string;
        case 'C':
            return to!ubyte.to!string;
        case 's':
            return to!short.to!string;
        case 'S':
            return to!ushort.to!string;
        case 'i':
            return to!int.to!string;
        case 'I':
            return to!uint.to!string;
        case 'f':
            return to!float.to!string;
        case 'Z':
        case 'H':
            return to!string;
        case 'B':
            // Array tag: dispatch on the element type character.
            switch (cast(char) data[1])
            {
            case 'c':
                return to!(byte[]).to!string;
            case 'C':
                return to!(ubyte[]).to!string;
            case 's':
                return to!(short[]).to!string;
            case 'S':
                return to!(ushort[]).to!string;
            case 'i':
                return to!(int[]).to!string;
            case 'I':
                return to!(uint[]).to!string;
            case 'f':
                return to!(float[]).to!string;
            default:
                throw new Exception("Array Tag malformed");    
            }
        default:
            throw new Exception("Tag malformed");
        }
    }
    /// Convert any integer tag value (c, C, s, S, i, I) to long
    /// Throws: Exception if the tag doesn't exist or is not an integer tag
    long toInt()
    {
        enforce(this.exists,"Tag doesn't exist");
        switch (cast(char) data[0])
        {
        case 'c':
            return cast(long)(to!byte);
        case 'C':
            return cast(long)(to!ubyte);
        case 's':
            return cast(long)(to!short);
        case 'S':
            return cast(long)(to!ushort);
        case 'i':
            return cast(long)(to!int);
        case 'I':
            return cast(long)(to!uint);
        default:
            throw new Exception("Tag is not numeric or is malformed");
        }
    }
    /// Convert any integer array tag value to long[] (a converted copy)
    /// Throws: Exception if the tag doesn't exist, is not an array, or is
    /// not an array of an integer type
    long[] toIntArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        enforce(this.checkArray,"Tag is not a numeric array");
        switch (cast(char) data[1])
        {
        case 'c':
            return (to!(byte[]).to!(long[]));
        case 'C':
            return (to!(ubyte[]).to!(long[]));
        case 's':
            return (to!(short[]).to!(long[]));
        case 'S':
            return (to!(ushort[]).to!(long[]));
        case 'i':
            return (to!(int[]).to!(long[]));
        case 'I':
            return (to!(uint[]).to!(long[]));
        default:
            // A float array is well-formed, just not an integer array.
            throw new Exception("Tag is not an integer array or is malformed");
        }
    }
    /// Convert a float array tag value to float[]
    /// Throws: Exception if the tag doesn't exist or is not a float array
    float[] toFloatArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        enforce(this.checkArray,"Tag is not an array");
        enforce(this.check!(float[]),"Tag is not a float array");
        return to!(float[]);
    }
}
247 
debug (dhtslib_unittest) unittest
{
    // A default-initialised TagValue has a null data pointer: it reports
    // that it does not exist and every conversion must throw.
    TagValue tv;
    assert(!tv.exists);
    assertThrown(tv.toIntArray);
    assertThrown(tv.toInt);
    assertThrown(tv.toString);

    // Hand-built array tag: 'B', element type 'S' (ushort), a 32-bit element
    // count of 3, then three 2-byte elements with only their first byte set.
    ubyte[12] arrayTag;
    arrayTag[0] = cast(ubyte) 'B';
    arrayTag[1] = cast(ubyte) 'S';
    *cast(int*)(&arrayTag[2]) = 3;
    immutable ubyte[3] firstBytes = [1, 2, 3];
    foreach (slot, value; firstBytes)
        arrayTag[6 + slot * 2] = value;
    tv.data = arrayTag.ptr;
    writeln("testing array");
    assert(tv.to!(ushort[]) == [1, 2, 3]);

    // Hand-built scalar tag: 'i' followed by a 4-byte int.
    ubyte[5] intTag;
    intTag[0] = cast(ubyte) 'i';
    *cast(int*)(&intTag[1]) = 3;
    tv.data = intTag.ptr;
    writeln("testing int");
    assert(tv.to!int == 3);
}
272 
// Exercises TagValue against tags read from the htslib test file
// "auxf#values.sam": type checks, typed conversions, toString, toInt and the
// array conversions, followed by round-trips of arrays assigned to the record.
debug (dhtslib_unittest) unittest
{
    import dhtslib.sam; // @suppress(dscanner.suspicious.local_imports)
    import htslib.hts_log : hts_log_info;
    import std.path : buildPath, dirName;

    hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
    hts_log_info(__FUNCTION__, "Testing tagvalue");
    hts_log_info(__FUNCTION__, "Loading test file");
    // The path is built by applying dirName four times to this file's path and
    // appending htslib/test/auxf#values.sam.
    // NOTE(review): the meaning of the trailing 0 argument is not visible here -- confirm against SAMFile.
    auto bam = SAMFile(buildPath(dirName(dirName(dirName(dirName(__FILE__)))), "htslib",
            "test", "auxf#values.sam"), 0);

    hts_log_info(__FUNCTION__, "Getting read 1");
    auto readrange = bam.allRecords(); // @suppress(dscanner.suspicious.unmodified)
    assert(readrange.empty == false);
    // The first record carries the scalar tags tested below.
    auto read = readrange.front;

    hts_log_info(__FUNCTION__, "Testing string");
    assert(read["RG"].to!string == "ID");

    hts_log_info(__FUNCTION__, "Testing char");
    assert(read["A!"].to!char == '!');
    assert(read["Ac"].to!char == 'c');
    assert(read["AC"].to!char == 'C');

    // Each integer tag is expected to be stored with the type asserted here.
    hts_log_info(__FUNCTION__, "Testing integral checks");
    assert(read["I0"].check!ubyte);
    assert(read["I1"].check!ubyte);
    assert(read["I2"].check!ubyte);
    assert(read["I3"].check!ubyte);
    assert(read["I4"].check!ubyte);
    assert(read["I5"].check!ushort);
    assert(read["I6"].check!ushort);
    assert(read["I7"].check!ushort);
    assert(read["I8"].check!ushort);
    assert(read["I9"].check!uint);
    assert(read["IA"].check!uint);
    assert(read["i1"].check!byte);
    assert(read["i2"].check!byte);
    assert(read["i3"].check!byte);
    assert(read["i4"].check!short);
    assert(read["i5"].check!short);
    assert(read["i6"].check!short);
    assert(read["i7"].check!short);
    assert(read["i8"].check!int);
    assert(read["i9"].check!int);
    assert(read["iA"].check!int);
    assert(read["iB"].check!int);

    // Typed conversion; the expected values sit on the boundaries of each integer width.
    hts_log_info(__FUNCTION__, "Testing integral conversion");
    assert(read["I0"].to!ubyte == 0);
    assert(read["I1"].to!ubyte == 1);
    assert(read["I2"].to!ubyte == 127);
    assert(read["I3"].to!ubyte == 128);
    assert(read["I4"].to!ubyte == 255);
    assert(read["I5"].to!ushort == 256);
    assert(read["I6"].to!ushort == 32_767);
    assert(read["I7"].to!ushort == 32_768);
    assert(read["I8"].to!ushort == 65_535);
    assert(read["I9"].to!uint == 65_536);
    assert(read["IA"].to!uint == 2_147_483_647);
    assert(read["i1"].to!byte == -1);
    assert(read["i2"].to!byte == -127);
    assert(read["i3"].to!byte == -128);
    assert(read["i4"].to!short == -255);
    assert(read["i5"].to!short == -256);
    assert(read["i6"].to!short == -32_767);
    assert(read["i7"].to!short == -32_768);
    assert(read["i8"].to!int == -65_535);
    assert(read["i9"].to!int == -65_536);
    assert(read["iA"].to!int == -2_147_483_647);
    assert(read["iB"].to!int == -2_147_483_648);

    // Same tags through the type-agnostic toString.
    hts_log_info(__FUNCTION__, "Testing integral toString");
    assert(read["I0"].toString == "0");
    assert(read["I1"].toString == "1");
    assert(read["I2"].toString == "127");
    assert(read["I3"].toString == "128");
    assert(read["I4"].toString == "255");
    assert(read["I5"].toString == "256");
    assert(read["I6"].toString == "32767");
    assert(read["I7"].toString == "32768");
    assert(read["I8"].toString == "65535");
    assert(read["I9"].toString == "65536");
    assert(read["IA"].toString == "2147483647");
    assert(read["i1"].toString == "-1");
    assert(read["i2"].toString == "-127");
    assert(read["i3"].toString == "-128");
    assert(read["i4"].toString == "-255");
    assert(read["i5"].toString == "-256");
    assert(read["i6"].toString == "-32767");
    assert(read["i7"].toString == "-32768");
    assert(read["i8"].toString == "-65535");
    assert(read["i9"].toString == "-65536");
    assert(read["iA"].toString == "-2147483647");
    assert(read["iB"].toString == "-2147483648");

    // Same tags through toInt, which widens every integer type to long.
    hts_log_info(__FUNCTION__, "Testing integral toInt");
    assert(read["I0"].toInt == 0);
    assert(read["I1"].toInt == 1);
    assert(read["I2"].toInt == 127);
    assert(read["I3"].toInt == 128);
    assert(read["I4"].toInt == 255);
    assert(read["I5"].toInt == 256);
    assert(read["I6"].toInt == 32_767);
    assert(read["I7"].toInt == 32_768);
    assert(read["I8"].toInt == 65_535);
    assert(read["I9"].toInt == 65_536);
    assert(read["IA"].toInt == 2_147_483_647);
    assert(read["i1"].toInt == -1);
    assert(read["i2"].toInt == -127);
    assert(read["i3"].toInt == -128);
    assert(read["i4"].toInt == -255);
    assert(read["i5"].toInt == -256);
    assert(read["i6"].toInt == -32_767);
    assert(read["i7"].toInt == -32_768);
    assert(read["i8"].toInt == -65_535);
    assert(read["i9"].toInt == -65_536);
    assert(read["iA"].toInt == -2_147_483_647);
    assert(read["iB"].toInt == -2_147_483_648);

    hts_log_info(__FUNCTION__, "Testing float checks");

    assert(read["F0"].check!float);
    assert(read["F1"].check!float);
    assert(read["F2"].check!float);

    hts_log_info(__FUNCTION__, "Testing float conversion");
    assert(read["F0"].to!float == -1.0);
    assert(read["F1"].to!float == 0.0);
    assert(read["F2"].to!float == 1.0);

    hts_log_info(__FUNCTION__, "Testing float toString");

    // Round-trip through text, so compare approximately rather than exactly.
    assert(approxEqual(read["F0"].toString.to!float, -1.0));
    assert(approxEqual(read["F1"].toString.to!float, 0.0));
    assert(approxEqual(read["F2"].toString.to!float, 1.0));

    // Repeats one check per scalar type, this time comparing explicitly to true.
    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(read["I0"].check!ubyte == true);
    assert(read["I5"].check!ushort == true);
    assert(read["I9"].check!uint == true);
    assert(read["i1"].check!byte == true);
    assert(read["i4"].check!short == true);
    assert(read["i8"].check!int == true);
    assert(read["F0"].check!float == true);
    // Advance to the second record, which carries the array ('B') tags.
    readrange.popFront;
    read = readrange.front;
    hts_log_info(__FUNCTION__, "Testing arrays");
    assert(read["Bs"].to!(short[]) == [-32_768, -32_767, 0, 32_767]);
    assert(read["Bi"].to!(int[]) == [
            -2_147_483_648, -2_147_483_647, 0, 2_147_483_647
            ]);
    assert(read["BS"].to!(ushort[]) == [0, 32_767, 32_768, 65_535]);
    assert(read["BI"].to!(uint[]) == [
            0, 2_147_483_647, 2_147_483_648, 4_294_967_295
            ]);
    
    hts_log_info(__FUNCTION__, "Testing array toString");
    assert(read["Bs"].toString == "[-32768, -32767, 0, 32767]");
    assert(read["Bi"].toString == "[-2147483648, -2147483647, 0, 2147483647]");
    assert(read["BS"].toString == "[0, 32767, 32768, 65535]");
    assert(read["BI"].toString == "[0, 2147483647, 2147483648, 4294967295]");

    // toIntArray widens every integer element type to long.
    writeln(read["Bs"].toIntArray);
    assert(read["Bs"].toIntArray == [-32_768, -32_767, 0, 32_767]);
    assert(read["Bi"].toIntArray == [
            -2_147_483_648, -2_147_483_647, 0, 2_147_483_647
            ]);
    assert(read["BS"].toIntArray == [0, 32_767, 32_768, 65_535]);
    assert(read["BI"].toIntArray == [
            0, 2_147_483_647, 2_147_483_648, 4_294_967_295
            ]);
    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(read["Bs"].check!(short[]) == true);
    assert(read["Bi"].check!(int[]) == true);
    assert(read["BS"].check!(ushort[]) == true);
    assert(read["BI"].check!(uint[]) == true);

    // The remaining sections assign an array to a tag on the record and read it back.
    hts_log_info(__FUNCTION__, "Testing float Array");
    float[] arr = [10.0,11.0,12.1];
    read["fA"] = arr;
    assert(read["fA"].to!(float[]) == arr);
    assert(read["fA"].toFloatArray == arr);
    assert(read["fA"].toString == "[10, 11, 12.1]");

    hts_log_info(__FUNCTION__, "Testing byte Array");
    byte[] arr2 = [10, -10]; 
    read["cA"] = arr2;
    assert(read["cA"].to!(byte[]) == arr2);
    assert(read["cA"].toIntArray == arr2.to!(long[]));
    assert(read["cA"].toString == "[10, -10]");

    hts_log_info(__FUNCTION__, "Testing ubyte Array");
    ubyte[] arr3 = [10, 11]; 
    read["CA"] = arr3;
    assert(read["CA"].to!(ubyte[]) == arr3);
    assert(read["CA"].toIntArray == arr3.to!(long[]));
    assert(read["CA"].toString == "[10, 11]");

}