1 module dhtslib.gff.record;
2 
3 import std.stdio;
4 import std.algorithm.iteration: splitter;
5 import std.stdio: chunks, File;
6 import std.algorithm: filter,map;
7 import std.traits: ReturnType;
8 import std.array: replace;
9 import std.conv: to;
10 import std.string;
11 import std.file: mkdir;
12 import std.algorithm.searching: countUntil;
13 import std.range: drop, ForwardRange, chain;
14 import std.format : format;
15 import std.array : split;
16 import std.math : isNaN;
17 
18 import dhtslib.coordinates;
19 
20 enum GFFVersion
21 {
22     GFF2 = 0,
23     GTF = 0,
24     GFF3 = 1
25 }
26 
27 alias GTFRecord = GFFRecord!(GFFVersion.GTF);
28 alias GFF2Record = GFFRecord!(GFFVersion.GFF2);
29 alias GFF3Record = GFFRecord!(GFFVersion.GFF3);
30 
31 /** GFF Generic Record
32 
33 Format Documentation:
34  *  http://gmod.org/wiki/GFF2#The_GFF2_File_Format
35  *  https://useast.ensembl.org/info/website/upload/gff.html
36  *   
37  *  Represents a GFF record. Version selected only changes formatting of 
38  *  String representations and how the record is parsed. Implementation  
39  *  is mostly the same. This record has getters and setters and 
40  *  should be able to be built from scratch.
41  */
42 struct GFFRecord(GFFVersion ver)
43 {
44     private ubyte[] raw;
45 
46     private bool unpackedFields = true;
47     private bool unpackedAttributes = true;
48     private string[9] fields;
49     private string[string] kvmap;
50 
51     /// constructor (raw ubytes)
52     this(ubyte[]data){
53         this.raw = data;
54         this.unpackedFields = false;
55         this.unpackedAttributes = false;
56     }
57 
58     /// constructor (string)
59     this(string data){
60         this(cast(ubyte[])data);
61     }
62 
63     /// unpack fields of gff line for mutability
64     private void unpack()
65     {
66         if(this.unpackedFields) return;
67         this.fields = (cast(string) this.raw).split('\t');
68         assert(this.fields.length == 9);
69         this.unpackedFields = true;
70     }
71 
72     /// unpack attributes of gff3 line for mutability
73     private void unpackGFF3Attributes(){
74         if(this.unpackedAttributes) return;
75         unpack;
76         auto kvpairs = this.fields[8].split(';');
77         foreach (string kv; kvpairs)
78         {
79             auto kvfields = kv.split('=');
80             assert(kvfields.length == 2);
81             this.kvmap[kvfields[0]] = kvfields[0];
82         }
83         this.unpackedAttributes = true;
84     }
85 
86     /// unpack attributes of gff2 line for mutability
87     private void unpackGFF2Attributes(){
88         if(this.unpackedAttributes) return;
89         unpack;
90         auto kvpairs = this.fields[8].split(';');
91         foreach (string kv; kvpairs)
92         {
93             auto kvfields = kv.strip.split(' ');
94             assert(kvfields.length == 2);
95             this.kvmap[kvfields[0].strip] = kvfields[0].strip;
96         }
97         this.unpackedAttributes = true;
98     }
99 
100     /// TODO: Not implemented; (almost) always true
101     @property bool isValid()
102     {
103         //hts_log_trace(__FUNCTION__, format("raw.length %d", raw.length));
104         return (raw.length >= 0 ? true : false);
105     }
106 
107     /// Column 1: seqid (aka contig); basis for the coordinate system
108     /// getter
109     @property seqid() const
110     {
111         if(unpackedFields) return this.fields[0];
112         return cast(string)this.raw.splitter('\t').front; 
113     }
114 
115     /// Column 1: seqid (aka contig); basis for the coordinate system
116     /// setter
117     @property seqid(string chr)
118     {
119         unpack;
120         this.fields[0] = chr; 
121     }
122 
123     /// ditto
124     @property contig() const
125     {
126         return seqid;
127     }
128 
129     /// ditto
130     @property contig(string chr)
131     {
132         seqid(chr); 
133     }
134 
135     /// Column 2: source; software, procedure, or database originating the record
136     /// getter
137     @property source() const
138     {
139         if(unpackedFields) return this.fields[1];
140         return cast(string)this.raw.splitter('\t').drop(1).front;
141     }
142 
143     /// Column 2: source; software, procedure, or database originating the record
144     /// setter
145     @property source(string src)
146     {
147         unpack;
148         this.fields[1] = src; 
149     }
150 
151     /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number
152     /// getter
153     @property type() const
154     {
155         if(unpackedFields) return this.fields[2];
156         return cast(string)this.raw.splitter('\t').drop(2).front;
157     }
158     
159     /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number
160     /// setter
161     @property type(string typ)
162     {
163         unpack;
164         this.fields[2] = typ; 
165     }
166 
167     /// Columns 4 & 5: returns Coordinate set: OBC format
168     /// getter
169     @property coordinates() const
170     {
171         long start, end;
172         if(unpackedFields){
173             start = this.fields[3].to!long;
174             end = this.fields[4].to!long;
175         }else{
176             start = (cast(string)this.raw.splitter('\t').drop(3).front).to!long;
177             end = (cast(string)this.raw.splitter('\t').drop(4).front).to!long;
178         }
179         return OBC(start, end);
180     }
181 
182     /// Columns 4 & 5: returns Coordinate set: OBC format
183     /// setter
184     @property coordinates(CoordSystem cs)(Interval!cs coords)
185     {
186         unpack;
187         auto newCoords = coords.to!(CoordSystem.obc);
188         this.fields[3] = newCoords.start.pos.to!string;
189         this.fields[4] = newCoords.end.pos.to!string; 
190     }
191 
192     /// Columns 4: start; 1-based integer start position of the feature
193     @property start() const { return this.coordinates.start; }
194     /// Column 5: end; closed coordinate integer ending nucleotide position of the feature
195     @property end() const { return this.coordinates.end; }
196 
197     /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined."
198     /// Tragically, score can be either a float, or not present (".")
199     /// Totally arbitrarily, we will represent absent as -1
200     /// getter
201     @property score() const
202     {
203         string val;
204         if(unpackedFields) val = this.fields[5];
205         else val = cast(string)this.raw.splitter('\t').drop(5).front;
206         if(val=="."){
207             return -1.0;
208         }
209         return val.to!float;
210     }
211 
212     /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined."
213     /// Tragically, score can be either a float, or not present (".")
214     /// Totally arbitrarily, we will represent absent as -1
215     /// setter
216     @property score(float s)
217     {
218         unpack;
219         if(s.isNaN) this.fields[5] = ".";
220         else this.fields[5] = s.to!string;
221     }
222 
223     /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown)
224     // getter
225     @property strand() const
226     {
227         if(unpackedFields) return cast(char)this.fields[6][0];
228         return cast(char)this.raw.splitter('\t').drop(6).front[0];
229     }
230 
231     /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown)
232     //setter
233     @property strand(char s)
234     {
235         unpack;
236         assert(s == '+' || s == '-' || s == '.');
237         this.fields[6] = [s].idup;
238     }
239 
240     /** Column 8: phase;
241     For features of type "CDS", the phase indicates where the feature begins with
242     reference to the reading frame. The phase is one of the integers 0, 1, or 2,
243     indicating the number of bases that should be removed from the beginning of
244     this feature to reach the first base of the next codon. In other words, a
245     phase of "0" indicates that the next codon begins at the first base of the
246     region described by the current line, a phase of "1" indicates that the next
247     codon begins at the second base of this region, and a phase of "2" indicates
248     that the codon begins at the third base of this region. This is NOT to be
249     confused with the frame, which is simply start modulo 3.
250 
251     For forward strand features, phase is counted from the start field.
252     For reverse strand features, phase is counted from the end field.
253 
254     The phase is REQUIRED for all CDS features.
255     
256     Tragically, phase can be either an integer (0, 1, 2), or not present (".")
257     Totally arbitrarily, we will represent absent as -1
258     **/
259     @property phase() const
260     {
261         string val;
262         if(unpackedFields) val = this.fields[7];
263         else val = cast(string)this.raw.splitter('\t').drop(7).front;
264         if(val=="."){
265             return -1;
266         }
267         return val.to!int;
268     }
269 
270     @property phase(long p)
271     {
272         unpack;
273         assert(p == 0 || p == 1 || p == 2 || p == -1);
274         if(p == -1) this.fields[7] = ".";
275         else this.fields[7] = p.to!string;
276     }
277 
278     /// Column 9: backwards compatible GFF2 group field
279     @property group() const
280     {
281         if(unpackedAttributes){
282             static if(ver == GFFVersion.GFF3) 
283                 return ("\t" ~ this.kvmap.byKeyValue.map!(a => a.key ~ "=" ~ a.value).join(";"));
284             else
285                 return ("\t" ~ (this.kvmap.byKeyValue.map!(a => " " ~ a.key ~ " " ~ a.value ~ " ").join(";"))[1 .. $]);
286         }
287         if(unpackedFields) return this.fields[8];
288         return cast(string)this.raw.splitter('\t').drop(8).front;
289     }
290 
291     /// Column 9: backwards compatible GFF2 group field
292     @property group(string g)
293     {
294         unpack;
295         this.fields[8] = g;
296 
297         /// clear hashmap as values have been overwritten
298         if(this.unpackedAttributes) this.kvmap.clear;
299         this.unpackedAttributes = false;
300     }
301 
302     /// Column 9: attributes; A list of ;-separated feature attributes in key=value form
303     string attributes(const string field){ return this.opIndex(field); }
304 
305     /// Provides map key lookup semantics for column 9 attributes
306     string opIndex(string field) const
307     {
308         if(unpackedAttributes) return this.kvmap[field];
309         static if(ver != GFFVersion.GFF3){
310             auto attrs=this.raw.splitter('\t').drop(8).front.splitter(";");
311             auto val = attrs    // actualy a Range of key=val
312                 .filter!(kv => ((cast(string) kv).strip[0 .. (cast(string) kv).strip.countUntil(' ')]) == field);
313                 //.front; // -- AssertError if range is empty
314             if (!val.empty) return ((cast(string) val.front).strip[(cast(string) val.front).strip.countUntil(' ') + 1..$]).strip.strip("\"");
315             else return "";
316         }
317         else
318         {
319             auto attrs=this.raw.splitter('\t').drop(8).front.splitter(";");
320             auto val = attrs    // actualy a Range of key=val
321                 .filter!(kv => cast(string)(kv[0 .. kv.countUntil('=')]) == field);
322                 //.front; // -- AssertError if range is empty
323             if (!val.empty) return cast(string) (val.front[val.front.countUntil('=')+1..$]);
324             else return "";
325         }
326         
327         /+ Alternative impl -- benchmark (also pull field ~ "=" out of the filter and combine it once upfront)
328         auto vals = attrs
329                     .filter!(kv => kv.startsWith(field ~ "="))
330                     .map!(kv => kv[kv.countUntil('=')+1 .. $]);
331         if (!vals.empty) return cast(string) val.front;
332         else return "";
333         +/
334     }
335 
336     /// Provides map key assignment semantics for column 9 attributes
337     void opIndexAssign(string val, string field)
338     {
339         static if(ver == GFFVersion.GFF3) unpackGFF3Attributes;
340         else unpackGFF2Attributes;
341         static if(ver != GFFVersion.GFF3) val = "\"" ~ val ~ "\"";
342         this.kvmap[field] = val;
343     }
344 
345     /// Column 9 attributes may also include a comma-sep list of tags: (key:tag)={t1,t2,t3,...}
346     bool hasTag(string tagName)()
347     {
348         return this["tag"].splitter(",").filter!(t => t == tagName).count > 0;
349     }
350 
351     /// Computed feature length
352     @property length() const { return this.end - (this.start-1); }
353     /// Relative start === 1
354     @property relativeStart() const { return OB(1); }
355     /// Relative start === the feature length
356     @property relativeEnd() const { return OB(this.length - 1); }
357 
358     /// Genomic coordinate at offset into feature, taking strandedness into account
359     @property coordinateAtOffset(long offset) const
360     {
361         // GFF3 features are 1-based coordinates
362         assert(offset >= 0);
363         
364         // for - strand count from end; for +, ., and ? strand count from start
365         immutable auto begin = (this.strand == '-' ? this.end : this.start);
366 
367         // for - strand count backward; for +, ., and ? strand count forward
368         immutable auto direction = (this.strand == '-' ? -1 : 1);
369 
370         return OB(begin + (direction * offset));
371     }
372     /// Genomic coordinate at beginning of feature, taking strandedness into account
373     @property coordinateAtBegin() const
374     {
375         return this.coordinateAtOffset(0);
376     }
377     /// Genomic coordinate at end of feature, taking strandedness into account
378     @property coordinateAtEnd() const
379     {
380         return this.coordinateAtOffset(this.length - 1);
381     }
382 
383     string toString() const
384     {
385         if(!unpackedFields) 
386             return cast(string) this.raw;
387         string ret;
388         if(unpackedAttributes){
389             ret = this.fields[0..8].join("\t");
390             static if(ver == GFFVersion.GFF3) 
391                 ret ~= ("\t" ~ this.kvmap.byKeyValue.map!(a => a.key ~ "=" ~ a.value).join(";"));
392             else
393                 ret ~= ("\t" ~ (this.kvmap.byKeyValue.map!(a => " " ~ a.key ~ " " ~ a.value ~ " ").join(";"))[1 .. $]);
394         }else{
395             ret = this.fields[].join("\t");
396         }
397         return ret;
398     }
399 
400     /// Returns a string with the canonical "chr:start-end" representation
401     @property string canonicalRepresentation() const
402     {
403         return this.seqid~":"~this.start.pos.to!string~"-"~this.end.pos.to!string;
404     }
405     /// Return the seqURI representation
406     @property string seqURI() const {
407         return format("seq:unk/%s", this.canonicalRepresentation);
408     }
409 
410 }