1 module dhtslib.gff.record;
2
3 import std.stdio;
4 import std.algorithm.iteration: splitter;
5 import std.stdio: chunks, File;
6 import std.algorithm: filter,map;
7 import std.traits: ReturnType;
8 import std.array: replace;
9 import std.conv: to;
10 import std.string;
11 import std.file: mkdir;
12 import std.algorithm.searching: countUntil;
13 import std.range: drop, ForwardRange, chain;
14 import std.format : format;
15 import std.array : split;
16 import std.math : isNaN;
17
18 import dhtslib.coordinates;
19
20 enum GFFVersion
21 {
22 GFF2 = 0,
23 GTF = 0,
24 GFF3 = 1
25 }
26
27 alias GTFRecord = GFFRecord!(GFFVersion.GTF);
28 alias GFF2Record = GFFRecord!(GFFVersion.GFF2);
29 alias GFF3Record = GFFRecord!(GFFVersion.GFF3);
30
31 /** GFF Generic Record
32
33 Format Documentation:
34 * http://gmod.org/wiki/GFF2#The_GFF2_File_Format
35 * https://useast.ensembl.org/info/website/upload/gff.html
36 *
37 * Represents a GFF record. Version selected only changes formatting of
38 * String representations and how the record is parsed. Implementation
39 * is mostly the same. This record has getters and setters and
40 * should be able to be built from scratch.
41 */
42 struct GFFRecord(GFFVersion ver)
43 {
44 private ubyte[] raw;
45
46 private bool unpackedFields = true;
47 private bool unpackedAttributes = true;
48 private string[9] fields;
49 private string[string] kvmap;
50
51 /// constructor (raw ubytes)
52 this(ubyte[]data){
53 this.raw = data;
54 this.unpackedFields = false;
55 this.unpackedAttributes = false;
56 }
57
58 /// constructor (string)
59 this(string data){
60 this(cast(ubyte[])data);
61 }
62
63 /// unpack fields of gff line for mutability
64 private void unpack()
65 {
66 if(this.unpackedFields) return;
67 this.fields = (cast(string) this.raw).split('\t');
68 assert(this.fields.length == 9);
69 this.unpackedFields = true;
70 }
71
72 /// unpack attributes of gff3 line for mutability
73 private void unpackGFF3Attributes(){
74 if(this.unpackedAttributes) return;
75 unpack;
76 auto kvpairs = this.fields[8].split(';');
77 foreach (string kv; kvpairs)
78 {
79 auto kvfields = kv.split('=');
80 assert(kvfields.length == 2);
81 this.kvmap[kvfields[0]] = kvfields[0];
82 }
83 this.unpackedAttributes = true;
84 }
85
86 /// unpack attributes of gff2 line for mutability
87 private void unpackGFF2Attributes(){
88 if(this.unpackedAttributes) return;
89 unpack;
90 auto kvpairs = this.fields[8].split(';');
91 foreach (string kv; kvpairs)
92 {
93 auto kvfields = kv.strip.split(' ');
94 assert(kvfields.length == 2);
95 this.kvmap[kvfields[0].strip] = kvfields[0].strip;
96 }
97 this.unpackedAttributes = true;
98 }
99
100 /// TODO: Not implemented; (almost) always true
101 @property bool isValid()
102 {
103 //hts_log_trace(__FUNCTION__, format("raw.length %d", raw.length));
104 return (raw.length >= 0 ? true : false);
105 }
106
107 /// Column 1: seqid (aka contig); basis for the coordinate system
108 /// getter
109 @property seqid() const
110 {
111 if(unpackedFields) return this.fields[0];
112 return cast(string)this.raw.splitter('\t').front;
113 }
114
115 /// Column 1: seqid (aka contig); basis for the coordinate system
116 /// setter
117 @property seqid(string chr)
118 {
119 unpack;
120 this.fields[0] = chr;
121 }
122
123 /// ditto
124 @property contig() const
125 {
126 return seqid;
127 }
128
129 /// ditto
130 @property contig(string chr)
131 {
132 seqid(chr);
133 }
134
135 /// Column 2: source; software, procedure, or database originating the record
136 /// getter
137 @property source() const
138 {
139 if(unpackedFields) return this.fields[1];
140 return cast(string)this.raw.splitter('\t').drop(1).front;
141 }
142
143 /// Column 2: source; software, procedure, or database originating the record
144 /// setter
145 @property source(string src)
146 {
147 unpack;
148 this.fields[1] = src;
149 }
150
151 /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number
152 /// getter
153 @property type() const
154 {
155 if(unpackedFields) return this.fields[2];
156 return cast(string)this.raw.splitter('\t').drop(2).front;
157 }
158
159 /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number
160 /// setter
161 @property type(string typ)
162 {
163 unpack;
164 this.fields[2] = typ;
165 }
166
167 /// Columns 4 & 5: returns Coordinate set: OBC format
168 /// getter
169 @property coordinates() const
170 {
171 long start, end;
172 if(unpackedFields){
173 start = this.fields[3].to!long;
174 end = this.fields[4].to!long;
175 }else{
176 start = (cast(string)this.raw.splitter('\t').drop(3).front).to!long;
177 end = (cast(string)this.raw.splitter('\t').drop(4).front).to!long;
178 }
179 return OBC(start, end);
180 }
181
182 /// Columns 4 & 5: returns Coordinate set: OBC format
183 /// setter
184 @property coordinates(CoordSystem cs)(Interval!cs coords)
185 {
186 unpack;
187 auto newCoords = coords.to!(CoordSystem.obc);
188 this.fields[3] = newCoords.start.pos.to!string;
189 this.fields[4] = newCoords.end.pos.to!string;
190 }
191
192 /// Columns 4: start; 1-based integer start position of the feature
193 @property start() const { return this.coordinates.start; }
194 /// Column 5: end; closed coordinate integer ending nucleotide position of the feature
195 @property end() const { return this.coordinates.end; }
196
197 /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined."
198 /// Tragically, score can be either a float, or not present (".")
199 /// Totally arbitrarily, we will represent absent as -1
200 /// getter
201 @property score() const
202 {
203 string val;
204 if(unpackedFields) val = this.fields[5];
205 else val = cast(string)this.raw.splitter('\t').drop(5).front;
206 if(val=="."){
207 return -1.0;
208 }
209 return val.to!float;
210 }
211
212 /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined."
213 /// Tragically, score can be either a float, or not present (".")
214 /// Totally arbitrarily, we will represent absent as -1
215 /// setter
216 @property score(float s)
217 {
218 unpack;
219 if(s.isNaN) this.fields[5] = ".";
220 else this.fields[5] = s.to!string;
221 }
222
223 /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown)
224 // getter
225 @property strand() const
226 {
227 if(unpackedFields) return cast(char)this.fields[6][0];
228 return cast(char)this.raw.splitter('\t').drop(6).front[0];
229 }
230
231 /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown)
232 //setter
233 @property strand(char s)
234 {
235 unpack;
236 assert(s == '+' || s == '-' || s == '.');
237 this.fields[6] = [s].idup;
238 }
239
240 /** Column 8: phase;
241 For features of type "CDS", the phase indicates where the feature begins with
242 reference to the reading frame. The phase is one of the integers 0, 1, or 2,
243 indicating the number of bases that should be removed from the beginning of
244 this feature to reach the first base of the next codon. In other words, a
245 phase of "0" indicates that the next codon begins at the first base of the
246 region described by the current line, a phase of "1" indicates that the next
247 codon begins at the second base of this region, and a phase of "2" indicates
248 that the codon begins at the third base of this region. This is NOT to be
249 confused with the frame, which is simply start modulo 3.
250
251 For forward strand features, phase is counted from the start field.
252 For reverse strand features, phase is counted from the end field.
253
254 The phase is REQUIRED for all CDS features.
255
256 Tragically, phase can be either an integer (0, 1, 2), or not present (".")
257 Totally arbitrarily, we will represent absent as -1
258 **/
259 @property phase() const
260 {
261 string val;
262 if(unpackedFields) val = this.fields[7];
263 else val = cast(string)this.raw.splitter('\t').drop(7).front;
264 if(val=="."){
265 return -1;
266 }
267 return val.to!int;
268 }
269
270 @property phase(long p)
271 {
272 unpack;
273 assert(p == 0 || p == 1 || p == 2 || p == -1);
274 if(p == -1) this.fields[7] = ".";
275 else this.fields[7] = p.to!string;
276 }
277
278 /// Column 9: backwards compatible GFF2 group field
279 @property group() const
280 {
281 if(unpackedAttributes){
282 static if(ver == GFFVersion.GFF3)
283 return ("\t" ~ this.kvmap.byKeyValue.map!(a => a.key ~ "=" ~ a.value).join(";"));
284 else
285 return ("\t" ~ (this.kvmap.byKeyValue.map!(a => " " ~ a.key ~ " " ~ a.value ~ " ").join(";"))[1 .. $]);
286 }
287 if(unpackedFields) return this.fields[8];
288 return cast(string)this.raw.splitter('\t').drop(8).front;
289 }
290
291 /// Column 9: backwards compatible GFF2 group field
292 @property group(string g)
293 {
294 unpack;
295 this.fields[8] = g;
296
297 /// clear hashmap as values have been overwritten
298 if(this.unpackedAttributes) this.kvmap.clear;
299 this.unpackedAttributes = false;
300 }
301
302 /// Column 9: attributes; A list of ;-separated feature attributes in key=value form
303 string attributes(const string field){ return this.opIndex(field); }
304
305 /// Provides map key lookup semantics for column 9 attributes
306 string opIndex(string field) const
307 {
308 if(unpackedAttributes) return this.kvmap[field];
309 static if(ver != GFFVersion.GFF3){
310 auto attrs=this.raw.splitter('\t').drop(8).front.splitter(";");
311 auto val = attrs // actualy a Range of key=val
312 .filter!(kv => ((cast(string) kv).strip[0 .. (cast(string) kv).strip.countUntil(' ')]) == field);
313 //.front; // -- AssertError if range is empty
314 if (!val.empty) return ((cast(string) val.front).strip[(cast(string) val.front).strip.countUntil(' ') + 1..$]).strip.strip("\"");
315 else return "";
316 }
317 else
318 {
319 auto attrs=this.raw.splitter('\t').drop(8).front.splitter(";");
320 auto val = attrs // actualy a Range of key=val
321 .filter!(kv => cast(string)(kv[0 .. kv.countUntil('=')]) == field);
322 //.front; // -- AssertError if range is empty
323 if (!val.empty) return cast(string) (val.front[val.front.countUntil('=')+1..$]);
324 else return "";
325 }
326
327 /+ Alternative impl -- benchmark (also pull field ~ "=" out of the filter and combine it once upfront)
328 auto vals = attrs
329 .filter!(kv => kv.startsWith(field ~ "="))
330 .map!(kv => kv[kv.countUntil('=')+1 .. $]);
331 if (!vals.empty) return cast(string) val.front;
332 else return "";
333 +/
334 }
335
336 /// Provides map key assignment semantics for column 9 attributes
337 void opIndexAssign(string val, string field)
338 {
339 static if(ver == GFFVersion.GFF3) unpackGFF3Attributes;
340 else unpackGFF2Attributes;
341 static if(ver != GFFVersion.GFF3) val = "\"" ~ val ~ "\"";
342 this.kvmap[field] = val;
343 }
344
345 /// Column 9 attributes may also include a comma-sep list of tags: (key:tag)={t1,t2,t3,...}
346 bool hasTag(string tagName)()
347 {
348 return this["tag"].splitter(",").filter!(t => t == tagName).count > 0;
349 }
350
351 /// Computed feature length
352 @property length() const { return this.end - (this.start-1); }
353 /// Relative start === 1
354 @property relativeStart() const { return OB(1); }
355 /// Relative start === the feature length
356 @property relativeEnd() const { return OB(this.length - 1); }
357
358 /// Genomic coordinate at offset into feature, taking strandedness into account
359 @property coordinateAtOffset(long offset) const
360 {
361 // GFF3 features are 1-based coordinates
362 assert(offset >= 0);
363
364 // for - strand count from end; for +, ., and ? strand count from start
365 immutable auto begin = (this.strand == '-' ? this.end : this.start);
366
367 // for - strand count backward; for +, ., and ? strand count forward
368 immutable auto direction = (this.strand == '-' ? -1 : 1);
369
370 return OB(begin + (direction * offset));
371 }
372 /// Genomic coordinate at beginning of feature, taking strandedness into account
373 @property coordinateAtBegin() const
374 {
375 return this.coordinateAtOffset(0);
376 }
377 /// Genomic coordinate at end of feature, taking strandedness into account
378 @property coordinateAtEnd() const
379 {
380 return this.coordinateAtOffset(this.length - 1);
381 }
382
383 string toString() const
384 {
385 if(!unpackedFields)
386 return cast(string) this.raw;
387 string ret;
388 if(unpackedAttributes){
389 ret = this.fields[0..8].join("\t");
390 static if(ver == GFFVersion.GFF3)
391 ret ~= ("\t" ~ this.kvmap.byKeyValue.map!(a => a.key ~ "=" ~ a.value).join(";"));
392 else
393 ret ~= ("\t" ~ (this.kvmap.byKeyValue.map!(a => " " ~ a.key ~ " " ~ a.value ~ " ").join(";"))[1 .. $]);
394 }else{
395 ret = this.fields[].join("\t");
396 }
397 return ret;
398 }
399
400 /// Returns a string with the canonical "chr:start-end" representation
401 @property string canonicalRepresentation() const
402 {
403 return this.seqid~":"~this.start.pos.to!string~"-"~this.end.pos.to!string;
404 }
405 /// Return the seqURI representation
406 @property string seqURI() const {
407 return format("seq:unk/%s", this.canonicalRepresentation);
408 }
409
410 }