/**
Module provides a parser for SAM/BAM record auxiliary tags.

Reference: https://samtools.github.io/hts-specs/SAMtags.pdf
*/
module dhtslib.tagvalue;

import std.stdio;
import std.meta : AliasSeq, staticIndexOf;
import std.string : fromStringz;
import htslib.sam : bam_aux_get, bam1_t, bam_aux2i;
import htslib.hts_log;
import std.conv : to;

/// D types a tag value can be read as. The order must stay in sync with `TypeChars`.
alias Types = AliasSeq!(byte, ubyte, short, ushort, int, uint, float, string, char);
/// Index of `T` within `Types`; used to look up the matching entry of `TypeChars`.
enum TypeIndex(T) = staticIndexOf!(T, Types);
/// SAM/BAM type characters, one per entry of `Types` (same order).
/// See https://samtools.github.io/hts-specs/SAMv1.pdf sec 1.5
char[9] TypeChars = ['c', 'C', 's', 'S', 'i', 'I', 'f', 'Z', 'A'];

/**

This represents a SAM/BAM record tag value, as outlined in the SAM specs §1.5.

The struct itself stores only a pointer to the tag, and has member functions
to parse into any of the tag types. The `to!T` conversions do NOT verify that
the stored type character matches `T`; call `check!T` first when the type is
not already known.

Primary Types:
A   Printable character
i   Signed integer (see specs §1.5 footnote on size)
f   Single-precision float
Z   Printable string, including space
H   Byte array in the Hex format (network byte order / big-endian)
B   Integer or numeric array

Byte-array (B) types:
c   byte
C   ubyte
s   short
S   ushort
i   int32
I   uint32
f   float (spec does not indicate precision)

Memory layout
pipes delimit byte boundaries in an array
8/9 are example values
2 is a count of the array
the ubyte * starts at the type char
c | 8|
s |  | 8|
i |  |  |  | 8|
B |i |  |  |  | 2|  |  |  | 8|  |  |  | 9|


Alias seq allows us to have an enum of types.
https://forum.dlang.org/post/kmdjfzpugudmwfrdgson@forum.dlang.org
Thanks Paul!

Usage: auto t = TagValue(b, "XX") where b is bam1_t* BAM record and XX is tag
*/
struct TagValue
{
    /// Pointer to the tag's type character; the value bytes follow it.
    /// Null when the tag was not found in the record.
    private ubyte* data;

    /** Constructor

    Looks the tag up with `bam_aux_get` and stores the returned pointer.
    In debug builds a warning is logged when the tag is absent.

    Usage: auto t = TagValue(b, "XX") where b is bam1_t* BAM record and XX is tag
    */
    this(bam1_t* b, char[2] tag)
    {
        data = bam_aux_get(b, tag);
        debug
        {
            if (data is null)
                hts_log_warning(__FUNCTION__, (tag ~ " doesn't exist for this record").idup);
        }
    }

    /// check if empty/exists/null
    /// Returns: `true` when the tag pointer is non-null, `false` otherwise.
    @property
    bool exists()
    {
        return this.data !is null;
    }

    /// Convert tag value to a string.
    /// Reads the NUL-terminated bytes following the type character and
    /// returns an immutable copy. The type character is not checked.
    string to(T : string)()
    {
        assert(this.data !is null);
        return fromStringz(cast(char*)&data[1]).idup;
    }

    /// Convert tag value to scalar type `T`.
    /// Reinterprets the `T.sizeof` bytes following the type character as a `T`.
    /// The type character is not checked.
    T to(T)()
    {
        assert(this.data !is null);
        return *cast(T*) data[1 .. T.sizeof + 1].ptr;
    }

    /// Convert tag value to an array (`B` layout).
    /// Byte 0 is the type character, byte 1 the element subtype, bytes 2..6 the
    /// element count and the elements start at byte 6. The returned slice
    /// points directly at the tag data; no copy is made and neither the type
    /// character nor the subtype is checked.
    T[] to(T : T[])()
    {
        assert(this.data !is null);
        // Within this specialization `T` is the element type.
        int n = *cast(int*) data[2 .. 6].ptr;
        return (cast(T*)(data[6 .. T.sizeof + 6].ptr))[0 .. n];
    }

    /// Check if tag type is type T
    /// Compares the stored type character with the `TypeChars` entry for `T`.
    bool check(T)()
    {
        assert(this.data !is null);
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }

    /// Check if tag type is type T (string specialization)
    bool check(T : string)()
    {
        assert(this.data !is null);
        return TypeChars[TypeIndex!T] == cast(char) data[0];
    }

    /// Check if tag type is an array whose element type is T
    /// The type character must be 'B' and the subtype byte must match the
    /// `TypeChars` entry for the element type.
    bool check(T : T[])()
    {
        assert(this.data !is null);
        return (cast(char) data[0] == 'B') && (TypeChars[TypeIndex!T] == cast(char) data[1]);
    }

    /// Convert tag value to string
    /// Returns: a copy of the string for 'Z' tags; the empty string when the
    /// tag is missing or has any other type.
    string toString() const
    {
        if (data !is null && cast(char) data[0] == 'Z')
        {
            // `data` is const here, so keep const-ness through the cast.
            return fromStringz(cast(const(char)*)&data[1]).idup;
        }
        return "";
    }

    /// Convert tag value to integer
    /// Dispatches on the stored type character and widens the value to `long`.
    /// Returns: the value, or `long.min` when the tag is not one of the
    /// integer types c, C, s, S, i, I.
    long toInt()
    {
        assert(this.data !is null);
        switch (cast(char) data[0])
        {
        case 'c':
            return cast(long)(to!byte);
        case 'C':
            return cast(long)(to!ubyte);
        case 's':
            return cast(long)(to!short);
        case 'S':
            return cast(long)(to!ushort);
        case 'i':
            return cast(long)(to!int);
        case 'I':
            return cast(long)(to!uint);
        default:
            return long.min;
        }
    }

    /// Convert tag value to integer array
    /// Dispatches on the array subtype byte (data[1]) and converts every
    /// element to `long`. The leading type character is not checked.
    /// Returns: the converted array, or an empty array when the subtype is
    /// not one of c, C, s, S, i, I.
    long[] toIntArray()
    {
        assert(this.data !is null);
        switch (cast(char) data[1])
        {
        case 'c':
            return (to!(byte[]).to!(long[]));
        case 'C':
            return (to!(ubyte[]).to!(long[]));
        case 's':
            return (to!(short[]).to!(long[]));
        case 'S':
            return (to!(ushort[]).to!(long[]));
        case 'i':
            return (to!(int[]).to!(long[]));
        case 'I':
            return (to!(uint[]).to!(long[]));
        default:
            return [];
        }
    }

    /// Convert tag value to float array
    /// Neither the type character nor the subtype is checked.
    float[] toFloatArray()
    {
        assert(this.data !is null);
        return to!(float[]);
    }
}

debug (dhtslib_unittest) unittest
{
    TagValue v;

    // Hand-built 'B' array tag: type, subtype, 4-byte count, then 3 ushorts.
    ubyte[12] testdata;
    testdata[0] = cast(ubyte) 'B';
    testdata[1] = cast(ubyte) 'S'; // subtype must match the ushort payload
    *cast(int*) testdata[2 .. 6].ptr = 3;
    testdata[6] = 1;
    testdata[8] = 2;
    testdata[10] = 3;
    v.data = testdata.ptr;
    writeln("testing array");
    assert(v.check!(ushort[]));
    assert(v.to!(ushort[]) == [1, 2, 3]);

    // Hand-built scalar 'i' tag: type character followed by a 4-byte int.
    ubyte[5] testdata2;
    testdata2[0] = cast(ubyte) 'i';
    *cast(int*) testdata2[1 .. 5].ptr = 3;
    v.data = testdata2.ptr;
    writeln("testing int");
    assert(v.to!int == 3);
}

debug (dhtslib_unittest) unittest
{
    import dhtslib.sam; // @suppress(dscanner.suspicious.local_imports)
    import htslib.hts_log : hts_log_info;
    import std.path : buildPath, dirName;

    hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
    hts_log_info(__FUNCTION__, "Testing tagvalue");
    hts_log_info(__FUNCTION__, "Loading test file");
    auto bam = SAMFile(buildPath(dirName(dirName(dirName(__FILE__))), "htslib",
            "test", "auxf#values.sam"), 0);
    hts_log_info(__FUNCTION__, "Getting read 1");
    auto readrange = bam.all_records(); // @suppress(dscanner.suspicious.unmodified)
    auto read = readrange.front;
    hts_log_info(__FUNCTION__, "Testing string");
    assert(read["RG"].to!string == "ID");
    hts_log_info(__FUNCTION__, "Testing char");
    assert(read["A!"].to!char == '!');
    assert(read["Ac"].to!char == 'c');
    assert(read["AC"].to!char == 'C');
    hts_log_info(__FUNCTION__, "Testing int");
    assert(read["I0"].to!ubyte == 0);
    assert(read["I1"].to!ubyte == 1);
    assert(read["I2"].to!ubyte == 127);
    assert(read["I3"].to!ubyte == 128);
    assert(read["I4"].to!ubyte == 255);
    assert(read["I5"].to!ushort == 256);
    assert(read["I6"].to!ushort == 32_767);
    assert(read["I7"].to!ushort == 32_768);
    assert(read["I8"].to!ushort == 65_535);
    assert(read["I9"].to!uint == 65_536);
    assert(read["IA"].to!uint == 2_147_483_647);
    assert(read["i1"].to!byte == -1);
    assert(read["i2"].to!byte == -127);
    assert(read["i3"].to!byte == -128);
    assert(read["i4"].to!short == -255);
    assert(read["i5"].to!short == -256);
    assert(read["i6"].to!short == -32_767);
    assert(read["i7"].to!short == -32_768);
    assert(read["i8"].to!int == -65_535);
    assert(read["i9"].to!int == -65_536);
    assert(read["iA"].to!int == -2_147_483_647);
    assert(read["iB"].to!int == -2_147_483_648);
    assert(read["I0"].toInt == 0);
    assert(read["I1"].toInt == 1);
    assert(read["I2"].toInt == 127);
    assert(read["I3"].toInt == 128);
    assert(read["I4"].toInt == 255);
    assert(read["I5"].toInt == 256);
    assert(read["I6"].toInt == 32_767);
    assert(read["I7"].toInt == 32_768);
    assert(read["I8"].toInt == 65_535);
    assert(read["I9"].toInt == 65_536);
    assert(read["IA"].toInt == 2_147_483_647);
    assert(read["i1"].toInt == -1);
    assert(read["i2"].toInt == -127);
    assert(read["i3"].toInt == -128);
    assert(read["i4"].toInt == -255);
    assert(read["i5"].toInt == -256);
    assert(read["i6"].toInt == -32_767);
    assert(read["i7"].toInt == -32_768);
    assert(read["i8"].toInt == -65_535);
    assert(read["i9"].toInt == -65_536);
    assert(read["iA"].toInt == -2_147_483_647);
    assert(read["iB"].toInt == -2_147_483_648);
    hts_log_info(__FUNCTION__, "Testing float");
    assert(read["F0"].to!float == -1.0);
    assert(read["F1"].to!float == 0.0);
    assert(read["F2"].to!float == 1.0);
    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(read["I0"].check!ubyte == true);
    assert(read["I5"].check!ushort == true);
    assert(read["I9"].check!uint == true);
    assert(read["i1"].check!byte == true);
    assert(read["i4"].check!short == true);
    assert(read["i8"].check!int == true);
    assert(read["F0"].check!float == true);
    readrange.popFront;
    read = readrange.front;
    hts_log_info(__FUNCTION__, "Testing arrays");
    assert(read["Bs"].to!(short[]) == [-32_768, -32_767, 0, 32_767]);
    assert(read["Bi"].to!(int[]) == [
            -2_147_483_648, -2_147_483_647, 0, 2_147_483_647
            ]);
    assert(read["BS"].to!(ushort[]) == [0, 32_767, 32_768, 65_535]);
    assert(read["BI"].to!(uint[]) == [
            0, 2_147_483_647, 2_147_483_648, 4_294_967_295
            ]);
    writeln(read["Bs"].toIntArray);
    assert(read["Bs"].toIntArray == [-32_768, -32_767, 0, 32_767]);
    assert(read["Bi"].toIntArray == [
            -2_147_483_648, -2_147_483_647, 0, 2_147_483_647
            ]);
    assert(read["BS"].toIntArray == [0, 32_767, 32_768, 65_535]);
    assert(read["BI"].toIntArray == [
            0, 2_147_483_647, 2_147_483_648, 4_294_967_295
            ]);
    hts_log_info(__FUNCTION__, "Running tag checking");
    assert(read["Bs"].check!(short[]) == true);
    assert(read["Bi"].check!(int[]) == true);
    assert(read["BS"].check!(ushort[]) == true);
    assert(read["BI"].check!(uint[]) == true);
}