1 /**
2 
3 This module provides a struct that encapsulates a SAMHeader
4 
5 `SAMHeader` encapsulates and owns a `sam_hdr_t*`,
6 and provides convenience functions to read and write header records.
7 
8 */
9 module dhtslib.sam.header;
10 
11 import htslib.sam;
12 import htslib.kstring;
13 import dhtslib.memory;
14 
15 import core.stdc.stdlib : free;
16 import core.stdc.string : memcpy;
17 
18 import std.conv : to;
19 import std.string : toStringz, fromStringz;
20 import std.traits : isSomeString;
21 
22 /// SAM specifications Section 1.3
23 /// Each header line begins with the character '@' followed by one of the
24 /// two-letter header record type codes defined in this section.
25 enum RecordType : immutable(char)[2]
26 {
27     HD = "HD",
28     SQ = "SQ",
29     RG = "RG",
30     PG = "PG",
31     CO = "CO",
32 }
33 
34 /** SAMHeader encapsulates `sam_hdr_t*`
35     and provides convenience wrappers to manipulate the header metadata/records.
36 */
37 struct SAMHeader
38 {
39     BamHdr h; /// rc bam_hdr_t * wrapper
40 
41     /// Construct from existing pointer
42     this(bam_hdr_t * h)
43     {
44         this.h = BamHdr(h);
45     }
46 
47     /// Explicit postblit to avoid 
48     /// https://github.com/blachlylab/dhtslib/issues/122
49     this(this)
50     {
51         this.h = h;   
52     }
53 
54     bool isNull(){
55         return this.h is null;
56     }
57 
58     /// Copy a SAMHeader
59     auto dup(){
60         return SAMHeader(sam_hdr_dup(this.h));
61     }
62 
63     /* contig info */
64     /// number of reference sequences; from bam_hdr_t
65     @property int nTargets() const
66     {
67         return this.h.n_targets;
68     }
69 
70     /// length of specific reference sequence by number (`tid`)
71     uint targetLength(int target) const
72     in (target >= 0)
73     in (target < this.nTargets)
74     {
75         return this.h.target_len[target];
76     }
77 
78     /// lengths of the reference sequences
79     @property uint[] targetLengths() const
80     {
81         return this.h.target_len[0 .. this.nTargets].dup;
82     }
83 
84     /// names of the reference sequences
85     @property string[] targetNames() const
86     {
87         string[] names;
88         names.length = this.nTargets;
89         foreach (i; 0 .. this.nTargets)
90         {
91             names[i] = fromStringz(this.h.target_name[i]).idup;
92         }
93         return names;
94     }
95 
96     /// reference contig name to integer id
97     int targetId(string name)
98     {
99         return sam_hdr_name2tid(this.h, toStringz(name));
100     }
101 
102     /// reference contig name to integer id
103     const(char)[] targetName(int tid)
104     {
105         return fromStringz(sam_hdr_tid2name(this.h, tid));
106     }
107 
108     /* Array-like indexing */
109 
110     /// 'in' membership operator.
111     /// usage: RecordType.SQ in hdr; => <bool>
112     bool opBinaryRight(string op)(RecordType lhs)
113     if (op == "in")
114     {
115         if (numRecords(lhs)) return true;
116         return false;
117     }
118 
119     /// For position-based lookups of key,
120     /// e.g. a sample-name lookup in Pysam is ["RG"][0]["SM"] ,
121     /// while in dhtslib:
122     /// [RecordType.RG, 0, "SN"]
123     const(char)[] opIndex(RecordType rt, size_t pos, const(char)[] key)
124     {
125         return this.valueByPos(rt, pos, key);
126     }
127 
128     /// number of records (lines) of type e.g. SQ, RG, etc.
129     size_t numRecords(RecordType rt)
130     {
131         return sam_hdr_count_lines(this.h, rt.ptr);
132     }
133 
134     /* ==== Line level methods ==== */
135 
136     /// add multiple \n-terminated full SAM header records, eg "@SQ\tSN:foo\tLN:100"
137     /// (single line or last of multiple does not require terminal \n)
138     auto addLines(const(char)[][] lines)
139     {
140         if (lines.length == 0)
141             return 0;   // (sam_hdr_add_lines also returns 0 for nothing added)
142         else if (lines.length == 1)
143             return sam_hdr_add_lines(this.h, lines[0].ptr, lines[0].length);
144         else {
145             char[] buf;
146             foreach(line; lines)
147                 buf ~= line ~ '\n';
148             return sam_hdr_add_lines(this.h, buf.ptr, buf.length);
149         }
150     }
151 
152     /// Add a single line to an existing header
153     auto addLine(T...)(RecordType type, T kvargs)
154     if(kvargs.length > 0 && isSomeString!(T[0]))
155     {
156         static assert (kvargs.length %2 == 0);   // K-V pairs => even number of variadic args
157 /*
158         // NOTE: both (runtime) type safe variadic params, and compile-time variadic templates
159         // use dynamic arrays, which cannot be passed to C variadic functions no matter what.
160         // complicating this, we also need to convert to char*. The below won't work period;
161         // the analogous variadic template won't work either without the mixin foolishness below.
162         const(char)*[] varargs;
163         varargs.length = kvargs.length + 1;
164         for(int i=0; i < kvargs.length; i++)
165             varargs[i] = kvargs[i].ptr;
166         varargs[$-1] = null;  // last vararg param null signals to sam_hdr_add_line end of list
167         
168         return sam_hdr_add_line(this.h, type.ptr, varargs.ptr);
169 */
170         string varargMagic(size_t len)
171         {
172             string args = "sam_hdr_add_line(this.h, type.ptr, ";
173             for(int i=0; i<len; i++)
174                 args ~= "toStringz(kvargs[" ~ i.to!string ~ "]), ";
175             args ~= "null)";
176             return args;
177         }
178 
179         // if mixin result is "toStringz(kvargs[0], ..." error is:
180         // Error: Using the result of a comma expression is not allowed
181         //return sam_hdr_add_line(this.h, type.ptr, mixin(varargMagic(kvargs.length)) );
182         return mixin(varargMagic(kvargs.length));
183     }
184 
185     /// Return a complete line of formatted text for a given type and ID,
186     /// or if no ID, first line matching type.
187     ///
188     /// Parameters:
189     ///     * type      - enum
190     ///     * id_key    - may be empty, in which case the first line matching type is returned
191     ///     * id_val    - may be empty IFF id_key empty; otherwise must be value for key
192     const(char)[] lineById(RecordType type, string id_key = "", string id_val = "")
193     in (id_key.length == 0 ? id_val.length == 0 : id_val.length > 0)
194     {
195 
196         kstring_t ks_line;
197         
198         // looking at samtools header.c sam_hrecs_Find_type_id (called by sam_hdr_find_line_id),
199         // passing non-null terminated two-char char* appears safe
200         auto res = sam_hdr_find_line_id(this.h, type.ptr,
201                                         id_key == "" ? null : id_key.ptr,
202                                         id_val == "" ? null : id_val.ptr,
203                                         &ks_line);
204 
205         // 0: success, -1: no match found, -2: error
206         if (res < 0)
207             return "";
208 
209         char[] line;
210         line.length = ks_line.l;
211         memcpy(line.ptr, ks_line.s, ks_line.l);
212         free(ks_line.s);
213         return line;
214     }
215 
216     /*
217 int sam_hdr_find_line_pos(sam_hdr_t *h, const char *type,
218                           int pos, kstring_t *ks);
219     */
220 
221     /* int sam_hdr_remove_line_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value); */
222 
223     /* int sam_hdr_remove_line_pos(sam_hdr_t *h, const char *type, int position); */
224 
225     /* int sam_hdr_update_line(sam_hdr_t *h, const char *type,
226         const char *ID_key, const char *ID_value, ...); */
227 
228     /* int sam_hdr_remove_except(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value); */
229 
230     /* int sam_hdr_remove_lines(sam_hdr_t *h, const char *type, const char *id, void *rh); */
231 
232     /+
233     int sam_hdr_count_lines(sam_hdr_t *h, const char *type);
234     int sam_hdr_line_index(sam_hdr_t *bh, const char *type, const char *key);
235     const char *sam_hdr_line_name(sam_hdr_t *bh, const char *type, int pos);
236     +/
237 
238     //// //// int sam_hdr_find_tag_id(sam_hdr_t *h, const char *type, const char *ID_key, const char *ID_value, const char *key, kstring_t *ks);
239 
240 
241     /// Return the value associated with a key for a header line identified by position
242     const(char)[] valueByPos(RecordType type, size_t pos, const(char)[] key)
243     in (pos <= int.max)
244     in (key.length > 0)
245     {
246         kstring_t ks;
247         auto res = sam_hdr_find_tag_pos(this.h, type.ptr, cast(int)pos, toStringz(key), &ks);
248         
249         // 0: success, -1: tag DNE, -2: error
250         if (res < 0)
251             return "";
252 
253         char[] ret;
254         ret.length = ks.l;
255         memcpy(ret.ptr, ks.s, ks.l);
256         free(ks.s);
257         return ret;
258     }
259 }
260 /// Example
261 debug (dhtslib_unittest) unittest
262 {
263     import std;
264 
265     auto h = sam_hdr_init();
266     auto hdr = SAMHeader(h);
267 
268     assert(!(RecordType.RG in hdr));
269 
270     //sam_hdr_add_line(h, RecordType.RG.ptr, "ID".ptr, "001".ptr, "SM".ptr, "sample1".ptr, null);
271     hdr.addLine(RecordType.RG, "ID", "001", "SM", "sample1");
272 
273     assert(RecordType.RG in hdr);
274 
275     auto line = hdr.lineById(RecordType.RG, "ID", "001");
276     assert(line == "@RG	ID:001	SM:sample1");
277 
278     auto val = hdr.valueByPos(RecordType.RG, 0, "SM");
279     assert(val == "sample1");
280     assert(hdr[RecordType.RG, 0, "SM"] == "sample1");
281     assert(hdr[RecordType.RG, 0, "XX"] == "");
282 }