1 /**
Module provides a parser for SAM/BAM record auxiliary tags.
3 
4 Reference: https://samtools.github.io/hts-specs/SAMtags.pdf
5 */
6 module dhtslib.sam.tagvalue;
7 
8 import std.stdio;
9 import std.meta : AliasSeq, staticIndexOf;
10 import std.string : fromStringz;
11 import htslib.sam : bam_aux_get, bam1_t, bam_aux2i;
12 import htslib.hts_log;
13 import std.conv : to;
14 import std.exception : enforce, assertThrown;
15 import std.math : approxEqual;
16 import dhtslib.memory;
17 
/// D types a tag value can be parsed into. The position of each type in this
/// sequence is the position of its SAM type character in `TypeChars`.
alias Types = AliasSeq!(
    byte, ubyte,
    short, ushort,
    int, uint,
    float,
    string,
    char
);

/// Compile-time position of `T` within `Types` (the result of `staticIndexOf`).
template TypeIndex(T)
{
    enum TypeIndex = staticIndexOf!(T, Types);
}

/// SAM type characters, index-aligned with `Types`.
/// See https://samtools.github.io/hts-specs/SAMv1.pdf sec 1.5
char[9] TypeChars = [
    'c', // byte
    'C', // ubyte
    's', // short
    'S', // ushort
    'i', // int
    'I', // uint
    'f', // float
    'Z', // string
    'A', // char
];
22 
23 /**
24 
25 This represents a SAM/BAM record tag value, as outlined in the SAM specs §1.5.
26 
27 The struct itself stores only a pointer to the tag, and has member functions
28 to parse into any of the tag types (but only if the tag matches that type) (TODO: is this true?)
29 
30 Primary Types:
31 A   Printable character
32 i   Signed integer (see specs §1.5 footnote on size)
33 f   Single-precision float
34 Z   Printable string, including space
35 H   Byte array in the Hex format (network byte order / big-endian) //unknown if still supported
36 B   Integer or numeric array
37 
38 Byte-array (B) types:
39 c   byte
40 C   ubyte
41 s   short
42 S   ushort
43 i   int32
44 I   uint32
45 f   float (spec does not indicate precision)
46 
47 Memory layout
48 pipes delimit byte boundaries in an array
49 8/9 are example values
50 2 is a count of the array
51 the ubyte * starts at the type char
52 c | 8|
53 s |  | 8|
54 i |  |  |  | 8|
55 B |i |  |  |  | 2|  |  |  | 8|  |  |  | 9|
56 
57 
58 Alias seq allows us to have an enum of types.
59 https://forum.dlang.org/post/kmdjfzpugudmwfrdgson@forum.dlang.org
60 Thanks Paul!
61 
Usage: auto t = TagValue(b, "XX") where b is the Bam1 record and "XX" is the two-character tag name
63 */
struct TagValue
{

    /// Pointer returned by `bam_aux_get`: points at the tag's type character,
    /// or is null when the lookup found nothing (or the struct is default-initialised).
    private ubyte* data;

    /// The record the tag was looked up in.
    /// NOTE(review): presumably held so the memory `data` points into stays
    /// valid for the lifetime of this value -- confirm against dhtslib.memory.
    private Bam1 b;

    /** Constructor

    Usage: auto t = TagValue(b, "XX") where b is the Bam1 record and "XX" is the two-character tag name

    Params:
        b   = record to search
        tag = two-character tag name
    */
    this(Bam1 b, char[2] tag)
    {
        this.b = b;
        data = bam_aux_get(b, tag);
    }

    /// Explicit postblit to avoid 
    /// https://github.com/blachlylab/dhtslib/issues/122
    ///
    /// The self-assignments are deliberate; do not remove them without
    /// re-checking the linked issue.
    this(this)
    {
        this.data = data;
        this.b = b;   
    }

    /// check if empty/exists/null
    /// Returns: true when a tag pointer is held, false when it is null.
    @property
    bool exists()
    {
        return data !is null;
    }

    /* Tag type checking */

    /// Check if tag type is scalar type T (T must be a member of `Types`).
    /// Throws: Exception if the tag doesn't exist.
    bool check(T)()
    {
        enforce(this.exists,"Tag doesn't exist");
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }
    /// Check if tag type is a string ('Z').
    /// Kept as a separate overload so that `string` is not treated as an
    /// array-of-char request by the `T[]` overload below.
    /// Throws: Exception if the tag doesn't exist.
    bool check(T : string)()
    {
        enforce(this.exists,"Tag doesn't exist");
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }
    /// Check if tag is a 'B' array whose element type is T.
    /// Throws: Exception if the tag doesn't exist.
    bool check(T : T[])()
    {
        enforce(this.exists,"Tag doesn't exist");
        return (cast(char) data[0] == 'B') && (TypeChars[TypeIndex!T] == cast(char) data[1]);
    }

    /// Check if tag is a 'B' array of any element type.
    /// Throws: Exception if the tag doesn't exist.
    bool checkArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        return cast(char) data[0] == 'B';
    }

    /// Check if tag is an 'H' hex byte array.
    /// Throws: Exception if the tag doesn't exist.
    bool checkHexByteArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        return cast(char) data[0] == 'H';
    }

    /* Tag conversion */

    /// Convert tag value to D string.
    /// Accepts 'Z' and 'H' tags; the NUL-terminated payload is copied.
    /// Throws: Exception if the tag doesn't exist or is neither 'Z' nor 'H'.
    string to(T : string)()
    {
        enforce(this.check!string || this.checkHexByteArray,"Tag is not type Z or H");
        // Payload starts right after the type character.
        return fromStringz(cast(char*)(data + 1)).idup;
    }
    /// Convert tag value to scalar D type T.
    /// Throws: Exception if the tag doesn't exist or its type character
    /// does not correspond to T.
    T to(T)()
    {
        enforce(this.check!T,"Tag is not type " ~ T.stringof);
        // Value is stored immediately after the type character.
        return *cast(T*)(data + 1);
    }
    /// Convert array tag value to a D array.
    ///
    /// The returned slice is a view over the tag's own bytes (no copy is made).
    /// Throws: Exception if the tag doesn't exist or is not a 'B' array of T.
    T[] to(T : T[])()
    {
        // Inside this template T is the *element* type, so spell out the
        // array type in the message.
        enforce(this.check!(T[]),"Tag is not type " ~ T.stringof ~ "[]");
        // Layout: 'B' | element type char | 32-bit element count | elements...
        // The count is unsigned in the SAM specification; reading it as a
        // signed int would turn large counts negative.
        immutable uint count = *cast(uint*)(data + 2);
        return (cast(T*)(data + 6))[0 .. count];
    }
    

    /// Convert any tag value to string.
    /// Scalars and arrays are formatted with std.conv; 'Z'/'H' are returned
    /// as text; 'A' yields a one-character string.
    /// Throws: Exception if the tag doesn't exist or has an unknown type character.
    string toString()
    {
        enforce(this.exists,"Tag doesn't exist");
        switch (cast(char) data[0])
        {
        case 'A':
            // Printable character: previously unhandled and reported as malformed.
            return to!char.to!string;
        case 'c':
            return to!byte.to!string;
        case 'C':
            return to!ubyte.to!string;
        case 's':
            return to!short.to!string;
        case 'S':
            return to!ushort.to!string;
        case 'i':
            return to!int.to!string;
        case 'I':
            return to!uint.to!string;
        case 'f':
            return to!float.to!string;
        case 'Z':
        case 'H':
            return to!string;
        case 'B':
            // Second byte holds the element type of the array.
            switch (cast(char) data[1])
            {
            case 'c':
                return to!(byte[]).to!string;
            case 'C':
                return to!(ubyte[]).to!string;
            case 's':
                return to!(short[]).to!string;
            case 'S':
                return to!(ushort[]).to!string;
            case 'i':
                return to!(int[]).to!string;
            case 'I':
                return to!(uint[]).to!string;
            case 'f':
                return to!(float[]).to!string;
            default:
                throw new Exception("Array Tag malformed");    
            }
        default:
            throw new Exception("Tag malformed");
        }
    }
    /// Convert tag value to integer.
    /// Any of the integer tag types (c, C, s, S, i, I) is widened to long.
    /// Throws: Exception if the tag doesn't exist or is not an integer type.
    long toInt()
    {
        enforce(this.exists,"Tag doesn't exist");
        switch (cast(char) data[0])
        {
        case 'c':
            return cast(long)(to!byte);
        case 'C':
            return cast(long)(to!ubyte);
        case 's':
            return cast(long)(to!short);
        case 'S':
            return cast(long)(to!ushort);
        case 'i':
            return cast(long)(to!int);
        case 'I':
            return cast(long)(to!uint);
        default:
            throw new Exception("Tag is not numeric or is malformed");
        }
    }
    /// Convert tag value to integer array.
    /// Any integer 'B' array is converted element-wise to long (a new array).
    /// Throws: Exception if the tag doesn't exist, is not a 'B' array, or the
    /// element type is not an integer type.
    long[] toIntArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        enforce(this.checkArray,"Tag is not a numeric array");
        switch (cast(char) data[1])
        {
        case 'c':
            return (to!(byte[]).to!(long[]));
        case 'C':
            return (to!(ubyte[]).to!(long[]));
        case 's':
            return (to!(short[]).to!(long[]));
        case 'S':
            return (to!(ushort[]).to!(long[]));
        case 'i':
            return (to!(int[]).to!(long[]));
        case 'I':
            return (to!(uint[]).to!(long[]));
        default:
            throw new Exception("Tag is malformed");
        }
    }
    /// Convert tag value to float array.
    /// Throws: Exception if the tag doesn't exist, is not a 'B' array, or the
    /// element type is not float.
    float[] toFloatArray()
    {
        enforce(this.exists,"Tag doesn't exist");
        enforce(this.checkArray,"Tag is not an array");
        enforce(this.check!(float[]),"Tag is not a float array");
        return to!(float[]);
    }
}
255 
debug (dhtslib_unittest) unittest
{
    // A default-constructed value holds no tag: every accessor must throw.
    TagValue tag;
    assert(!tag.exists);
    assertThrown(tag.toIntArray);
    assertThrown(tag.toInt);
    assertThrown(tag.toString);

    // Hand-built 'B' array of three ushort values:
    // 'B' | 'S' | 32-bit count | 3 x 16-bit elements
    ubyte[12] arrayBytes;
    arrayBytes[0] = cast(ubyte) 'B';
    arrayBytes[1] = cast(ubyte) 'S';
    *cast(int*)(arrayBytes.ptr + 2) = 3;
    arrayBytes[6] = 1;
    arrayBytes[8] = 2;
    arrayBytes[10] = 3;

    tag.data = arrayBytes.ptr;
    writeln("testing array");
    assert(tag.to!(ushort[]) == [1, 2, 3]);

    // Hand-built scalar: 'i' followed by a 32-bit value.
    ubyte[5] intBytes;
    intBytes[0] = cast(ubyte) 'i';
    *cast(int*)(intBytes.ptr + 1) = 3;

    tag.data = intBytes.ptr;
    writeln("testing int");
    assert(tag.to!int == 3);
}
280 
debug (dhtslib_unittest) unittest
{
    import dhtslib.sam; // @suppress(dscanner.suspicious.local_imports)
    import htslib.hts_log : hts_log_info;
    import std.path : buildPath, dirName;

    hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
    hts_log_info(__FUNCTION__, "Testing tagvalue");
    hts_log_info(__FUNCTION__, "Loading test file");

    // Test fixture lives four directory levels above this source file.
    auto samPath = __FILE__.dirName.dirName.dirName.dirName
        .buildPath("htslib", "test", "auxf#values.sam");
    auto samFile = SAMFile(samPath, 0);

    hts_log_info(__FUNCTION__, "Getting read 1");
    auto records = samFile.allRecords(); // @suppress(dscanner.suspicious.unmodified)
    assert(!records.empty);
    auto rec = records.front;

    hts_log_info(__FUNCTION__, "Testing string");
    assert(rec["RG"].to!string == "ID");

    hts_log_info(__FUNCTION__, "Testing char");
    assert(rec["A!"].to!char == '!');
    assert(rec["Ac"].to!char == 'c');
    assert(rec["AC"].to!char == 'C');

    hts_log_info(__FUNCTION__, "Testing integral checks");
    // unsigned tags
    assert(rec["I0"].check!ubyte);
    assert(rec["I1"].check!ubyte);
    assert(rec["I2"].check!ubyte);
    assert(rec["I3"].check!ubyte);
    assert(rec["I4"].check!ubyte);
    assert(rec["I5"].check!ushort);
    assert(rec["I6"].check!ushort);
    assert(rec["I7"].check!ushort);
    assert(rec["I8"].check!ushort);
    assert(rec["I9"].check!uint);
    assert(rec["IA"].check!uint);
    // signed tags
    assert(rec["i1"].check!byte);
    assert(rec["i2"].check!byte);
    assert(rec["i3"].check!byte);
    assert(rec["i4"].check!short);
    assert(rec["i5"].check!short);
    assert(rec["i6"].check!short);
    assert(rec["i7"].check!short);
    assert(rec["i8"].check!int);
    assert(rec["i9"].check!int);
    assert(rec["iA"].check!int);
    assert(rec["iB"].check!int);

    hts_log_info(__FUNCTION__, "Testing integral conversion");
    // unsigned tags
    assert(rec["I0"].to!ubyte == 0);
    assert(rec["I1"].to!ubyte == 1);
    assert(rec["I2"].to!ubyte == 127);
    assert(rec["I3"].to!ubyte == 128);
    assert(rec["I4"].to!ubyte == 255);
    assert(rec["I5"].to!ushort == 256);
    assert(rec["I6"].to!ushort == 32_767);
    assert(rec["I7"].to!ushort == 32_768);
    assert(rec["I8"].to!ushort == 65_535);
    assert(rec["I9"].to!uint == 65_536);
    assert(rec["IA"].to!uint == 2_147_483_647);
    // signed tags
    assert(rec["i1"].to!byte == -1);
    assert(rec["i2"].to!byte == -127);
    assert(rec["i3"].to!byte == -128);
    assert(rec["i4"].to!short == -255);
    assert(rec["i5"].to!short == -256);
    assert(rec["i6"].to!short == -32_767);
    assert(rec["i7"].to!short == -32_768);
    assert(rec["i8"].to!int == -65_535);
    assert(rec["i9"].to!int == -65_536);
    assert(rec["iA"].to!int == -2_147_483_647);
    assert(rec["iB"].to!int == -2_147_483_648);

    hts_log_info(__FUNCTION__, "Testing integral toString");
    // unsigned tags
    assert(rec["I0"].toString == "0");
    assert(rec["I1"].toString == "1");
    assert(rec["I2"].toString == "127");
    assert(rec["I3"].toString == "128");
    assert(rec["I4"].toString == "255");
    assert(rec["I5"].toString == "256");
    assert(rec["I6"].toString == "32767");
    assert(rec["I7"].toString == "32768");
    assert(rec["I8"].toString == "65535");
    assert(rec["I9"].toString == "65536");
    assert(rec["IA"].toString == "2147483647");
    // signed tags
    assert(rec["i1"].toString == "-1");
    assert(rec["i2"].toString == "-127");
    assert(rec["i3"].toString == "-128");
    assert(rec["i4"].toString == "-255");
    assert(rec["i5"].toString == "-256");
    assert(rec["i6"].toString == "-32767");
    assert(rec["i7"].toString == "-32768");
    assert(rec["i8"].toString == "-65535");
    assert(rec["i9"].toString == "-65536");
    assert(rec["iA"].toString == "-2147483647");
    assert(rec["iB"].toString == "-2147483648");

    hts_log_info(__FUNCTION__, "Testing integral toInt");
    // unsigned tags
    assert(rec["I0"].toInt == 0);
    assert(rec["I1"].toInt == 1);
    assert(rec["I2"].toInt == 127);
    assert(rec["I3"].toInt == 128);
    assert(rec["I4"].toInt == 255);
    assert(rec["I5"].toInt == 256);
    assert(rec["I6"].toInt == 32_767);
    assert(rec["I7"].toInt == 32_768);
    assert(rec["I8"].toInt == 65_535);
    assert(rec["I9"].toInt == 65_536);
    assert(rec["IA"].toInt == 2_147_483_647);
    // signed tags
    assert(rec["i1"].toInt == -1);
    assert(rec["i2"].toInt == -127);
    assert(rec["i3"].toInt == -128);
    assert(rec["i4"].toInt == -255);
    assert(rec["i5"].toInt == -256);
    assert(rec["i6"].toInt == -32_767);
    assert(rec["i7"].toInt == -32_768);
    assert(rec["i8"].toInt == -65_535);
    assert(rec["i9"].toInt == -65_536);
    assert(rec["iA"].toInt == -2_147_483_647);
    assert(rec["iB"].toInt == -2_147_483_648);

    hts_log_info(__FUNCTION__, "Testing float checks");

    assert(rec["F0"].check!float);
    assert(rec["F1"].check!float);
    assert(rec["F2"].check!float);

    hts_log_info(__FUNCTION__, "Testing float conversion");
    assert(rec["F0"].to!float == -1.0);
    assert(rec["F1"].to!float == 0.0);
    assert(rec["F2"].to!float == 1.0);

    hts_log_info(__FUNCTION__, "Testing float toString");

    // Round-trip through the string form and compare approximately.
    assert(approxEqual(rec["F0"].toString.to!float, -1.0));
    assert(approxEqual(rec["F1"].toString.to!float, 0.0));
    assert(approxEqual(rec["F2"].toString.to!float, 1.0));

    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(rec["I0"].check!ubyte);
    assert(rec["I5"].check!ushort);
    assert(rec["I9"].check!uint);
    assert(rec["i1"].check!byte);
    assert(rec["i4"].check!short);
    assert(rec["i8"].check!int);
    assert(rec["F0"].check!float);

    // Advance to the second record, which carries the array tags.
    records.popFront;
    rec = records.front;

    hts_log_info(__FUNCTION__, "Testing arrays");
    assert(rec["Bs"].to!(short[]) == [-32_768, -32_767, 0, 32_767]);
    assert(rec["Bi"].to!(int[]) == [-2_147_483_648, -2_147_483_647, 0, 2_147_483_647]);
    assert(rec["BS"].to!(ushort[]) == [0, 32_767, 32_768, 65_535]);
    assert(rec["BI"].to!(uint[]) == [0, 2_147_483_647, 2_147_483_648, 4_294_967_295]);
    
    hts_log_info(__FUNCTION__, "Testing array toString");
    assert(rec["Bs"].toString == "[-32768, -32767, 0, 32767]");
    assert(rec["Bi"].toString == "[-2147483648, -2147483647, 0, 2147483647]");
    assert(rec["BS"].toString == "[0, 32767, 32768, 65535]");
    assert(rec["BI"].toString == "[0, 2147483647, 2147483648, 4294967295]");

    writeln(rec["Bs"].toIntArray);
    assert(rec["Bs"].toIntArray == [-32_768, -32_767, 0, 32_767]);
    assert(rec["Bi"].toIntArray == [-2_147_483_648, -2_147_483_647, 0, 2_147_483_647]);
    assert(rec["BS"].toIntArray == [0, 32_767, 32_768, 65_535]);
    assert(rec["BI"].toIntArray == [0, 2_147_483_647, 2_147_483_648, 4_294_967_295]);

    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(rec["Bs"].check!(short[]));
    assert(rec["Bi"].check!(int[]));
    assert(rec["BS"].check!(ushort[]));
    assert(rec["BI"].check!(uint[]));

    hts_log_info(__FUNCTION__, "Testing float Array");
    float[] floatValues = [10.0,11.0,12.1];
    rec["fA"] = floatValues;
    assert(rec["fA"].to!(float[]) == floatValues);
    assert(rec["fA"].toFloatArray == floatValues);
    assert(rec["fA"].toString == "[10, 11, 12.1]");

    hts_log_info(__FUNCTION__, "Testing byte Array");
    byte[] byteValues = [10, -10]; 
    rec["cA"] = byteValues;
    assert(rec["cA"].to!(byte[]) == byteValues);
    assert(rec["cA"].toIntArray == byteValues.to!(long[]));
    assert(rec["cA"].toString == "[10, -10]");

    hts_log_info(__FUNCTION__, "Testing ubyte Array");
    ubyte[] ubyteValues = [10, 11]; 
    rec["CA"] = ubyteValues;
    assert(rec["CA"].to!(ubyte[]) == ubyteValues);
    assert(rec["CA"].toIntArray == ubyteValues.to!(long[]));
    assert(rec["CA"].toString == "[10, 11]");

}