1 /**
2 
3 This module provides a struct that encapsulates a SAMHeader
4 
5 `SAMHeader` encapsulates and owns a `sam_hdr_t*`,
6 and provides convenience functions to read and write header records.
7 
8 */
9 module dhtslib.sam.header;
10 
11 import htslib.sam;
12 import htslib.kstring;
13 import dhtslib.memory;
14 
15 import core.stdc.stdlib : free;
16 import core.stdc.string : memcpy;
17 
18 import std.conv : to;
19 import std.string : toStringz, fromStringz;
20 import std.traits : isSomeString;
21 
22 /// SAM specifications Section 1.3
23 /// Each header line begins with the character '@' followed by one of the
24 /// two-letter header record type codes defined in this section.
25 enum RecordType : immutable(char)[2]
26 {
27     HD = "HD",
28     SQ = "SQ",
29     RG = "RG",
30     PG = "PG",
31     CO = "CO",
32 }
33 
34 /** SAMHeader encapsulates `sam_hdr_t*`
35     and provides convenience wrappers to manipulate the header metadata/records.
36 */
37 struct SAMHeader
38 {
39     BamHdr h; /// rc bam_hdr_t * wrapper
40 
41     /// Construct from existing pointer
42     this(bam_hdr_t * h)
43     {
44         this.h = BamHdr(h);
45     }
46 
47     bool isNull(){
48         return this.h is null;
49     }
50 
51     /// Copy a SAMHeader
52     auto dup(){
53         return SAMHeader(sam_hdr_dup(this.h));
54     }
55 
56     /* contig info */
57     /// number of reference sequences; from bam_hdr_t
58     @property int nTargets() const
59     {
60         return this.h.n_targets;
61     }
62 
63     /// length of specific reference sequence by number (`tid`)
64     uint targetLength(int target) const
65     in (target >= 0)
66     in (target < this.nTargets)
67     {
68         return this.h.target_len[target];
69     }
70 
71     /// lengths of the reference sequences
72     @property uint[] targetLengths() const
73     {
74         return this.h.target_len[0 .. this.nTargets].dup;
75     }
76 
77     /// names of the reference sequences
78     @property string[] targetNames() const
79     {
80         string[] names;
81         names.length = this.nTargets;
82         foreach (i; 0 .. this.nTargets)
83         {
84             names[i] = fromStringz(this.h.target_name[i]).idup;
85         }
86         return names;
87     }
88 
89     /// reference contig name to integer id
90     int targetId(string name)
91     {
92         return sam_hdr_name2tid(this.h, toStringz(name));
93     }
94 
95     /// reference contig name to integer id
96     const(char)[] targetName(int tid)
97     {
98         return fromStringz(sam_hdr_tid2name(this.h, tid));
99     }
100 
101     /* Array-like indexing */
102 
103     /// 'in' membership operator.
104     /// usage: RecordType.SQ in hdr; => <bool>
105     bool opBinaryRight(string op)(RecordType lhs)
106     if (op == "in")
107     {
108         if (numRecords(lhs)) return true;
109         return false;
110     }
111 
112     /// For position-based lookups of key,
113     /// e.g. a sample-name lookup in Pysam is ["RG"][0]["SM"] ,
114     /// while in dhtslib:
115     /// [RecordType.RG, 0, "SN"]
116     const(char)[] opIndex(RecordType rt, size_t pos, const(char)[] key)
117     {
118         return this.valueByPos(rt, pos, key);
119     }
120 
121     /// number of records (lines) of type e.g. SQ, RG, etc.
122     size_t numRecords(RecordType rt)
123     {
124         return sam_hdr_count_lines(this.h, rt.ptr);
125     }
126 
127     /* ==== Line level methods ==== */
128 
129     /// add multiple \n-terminated full SAM header records, eg "@SQ\tSN:foo\tLN:100"
130     /// (single line or last of multiple does not require terminal \n)
131     auto addLines(const(char)[][] lines)
132     {
133         if (lines.length == 0)
134             return 0;   // (sam_hdr_add_lines also returns 0 for nothing added)
135         else if (lines.length == 1)
136             return sam_hdr_add_lines(this.h, lines[0].ptr, lines[0].length);
137         else {
138             char[] buf;
139             foreach(line; lines)
140                 buf ~= line ~ '\n';
141             return sam_hdr_add_lines(this.h, buf.ptr, buf.length);
142         }
143     }
144 
145     /// Add a single line to an existing header
146     auto addLine(T...)(RecordType type, T kvargs)
147     if(kvargs.length > 0 && isSomeString!(T[0]))
148     {
149         static assert (kvargs.length %2 == 0);   // K-V pairs => even number of variadic args
150 /*
151         // NOTE: both (runtime) type safe variadic params, and compile-time variadic templates
152         // use dynamic arrays, which cannot be passed to C variadic functions no matter what.
153         // complicating this, we also need to convert to char*. The below won't work period;
154         // the analogous variadic template won't work either without the mixin foolishness below.
155         const(char)*[] varargs;
156         varargs.length = kvargs.length + 1;
157         for(int i=0; i < kvargs.length; i++)
158             varargs[i] = kvargs[i].ptr;
159         varargs[$-1] = null;  // last vararg param null signals to sam_hdr_add_line end of list
160         
161         return sam_hdr_add_line(this.h, type.ptr, varargs.ptr);
162 */
163         string varargMagic(size_t len)
164         {
165             string args = "sam_hdr_add_line(this.h, type.ptr, ";
166             for(int i=0; i<len; i++)
167                 args ~= "toStringz(kvargs[" ~ i.to!string ~ "]), ";
168             args ~= "null)";
169             return args;
170         }
171 
172         // if mixin result is "toStringz(kvargs[0], ..." error is:
173         // Error: Using the result of a comma expression is not allowed
174         //return sam_hdr_add_line(this.h, type.ptr, mixin(varargMagic(kvargs.length)) );
175         return mixin(varargMagic(kvargs.length));
176     }
177 
178     /// Return a complete line of formatted text for a given type and ID,
179     /// or if no ID, first line matching type.
180     ///
181     /// Parameters:
182     ///     * type      - enum
183     ///     * id_key    - may be empty, in which case the first line matching type is returned
184     ///     * id_val    - may be empty IFF id_key empty; otherwise must be value for key
185     const(char)[] lineById(RecordType type, string id_key = "", string id_val = "")
186     in (id_key.length == 0 ? id_val.length == 0 : id_val.length > 0)
187     {
188 
189         kstring_t ks_line;
190         
191         // looking at samtools header.c sam_hrecs_Find_type_id (called by sam_hdr_find_line_id),
192         // passing non-null terminated two-char char* appears safe
193         auto res = sam_hdr_find_line_id(this.h, type.ptr,
194                                         id_key == "" ? null : id_key.ptr,
195                                         id_val == "" ? null : id_val.ptr,
196                                         &ks_line);
197 
198         // 0: success, -1: no match found, -2: error
199         if (res < 0)
200             return "";
201 
202         char[] line;
203         line.length = ks_line.l;
204         memcpy(line.ptr, ks_line.s, ks_line.l);
205         free(ks_line.s);
206         return line;
207     }
208 
209     /*
210 int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type,
211                           int pos, kstring_t *ks);
212     */
213 
214     /* int sam_hdr_remove_line_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value); */
215 
216     /* int sam_hdr_remove_line_pos(sam_hdr_t *h, const char *type, int position); */
217 
218     /* int sam_hdr_update_line(sam_hdr_t *h, const char *type,
219         const char *ID_key, const char *ID_value, ...); */
220 
221     /* int sam_hdr_remove_except(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value); */
222 
223     /* int sam_hdr_remove_lines(sam_hdr_t *h, const char *type, const char *id, void *rh); */
224 
225     /+
226     int sam_hdr_count_lines(sam_hdr_t *h, const char *type);
227     int sam_hdr_line_index(sam_hdr_t *bh, const char *type, const char *key);
228     const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos);
229     +/
230 
231     //// //// int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key, kstring_t *ks);
232 
233 
234     /// Return the value associated with a key for a header line identified by position
235     const(char)[] valueByPos(RecordType type, size_t pos, const(char)[] key)
236     in (pos <= int.max)
237     in (key.length > 0)
238     {
239         kstring_t ks;
240         auto res = sam_hdr_find_tag_pos(this.h, type.ptr, cast(int)pos, toStringz(key), &ks);
241         
242         // 0: success, -1: tag DNE, -2: error
243         if (res < 0)
244             return "";
245 
246         char[] ret;
247         ret.length = ks.l;
248         memcpy(ret.ptr, ks.s, ks.l);
249         free(ks.s);
250         return ret;
251     }
252 }
253 /// Example
254 debug (dhtslib_unittest) unittest
255 {
256     import std;
257 
258     auto h = sam_hdr_init();
259     auto hdr = SAMHeader(h);
260 
261     assert(!(RecordType.RG in hdr));
262 
263     //sam_hdr_add_line(h, RecordType.RG.ptr, "ID".ptr, "001".ptr, "SM".ptr, "sample1".ptr, null);
264     hdr.addLine(RecordType.RG, "ID", "001", "SM", "sample1");
265 
266     assert(RecordType.RG in hdr);
267 
268     auto line = hdr.lineById(RecordType.RG, "ID", "001");
269     assert(line == "@RG	ID:001	SM:sample1");
270 
271     auto val = hdr.valueByPos(RecordType.RG, 0, "SM");
272     assert(val == "sample1");
273     assert(hdr[RecordType.RG, 0, "SM"] == "sample1");
274     assert(hdr[RecordType.RG, 0, "XX"] == "");
275 }