1 module dhtslib.gff.record; 2 3 import std.stdio; 4 import std.algorithm.iteration: splitter; 5 import std.stdio: chunks, File; 6 import std.algorithm: filter,map; 7 import std.traits: ReturnType; 8 import std.array: replace; 9 import std.conv: to; 10 import std.string; 11 import std.file: mkdir; 12 import std.algorithm.searching: countUntil; 13 import std.range: drop, ForwardRange, chain; 14 import std.format : format; 15 import std.array : split; 16 import std.math : isNaN; 17 18 import dhtslib.coordinates; 19 20 enum GFFVersion 21 { 22 GFF2 = 0, 23 GTF = 0, 24 GFF3 = 1 25 } 26 27 alias GTFRecord = GFFRecord!(GFFVersion.GTF); 28 alias GFF2Record = GFFRecord!(GFFVersion.GFF2); 29 alias GFF3Record = GFFRecord!(GFFVersion.GFF3); 30 31 /** GFF Generic Record 32 33 Format Documentation: 34 * http://gmod.org/wiki/GFF2#The_GFF2_File_Format 35 * https://useast.ensembl.org/info/website/upload/gff.html 36 * 37 * Represents a GFF record. Version selected only changes formatting of 38 * String representations and how the record is parsed. Implementation 39 * is mostly the same. This record has getters and setters and 40 * should be able to be built from scratch. 41 */ 42 struct GFFRecord(GFFVersion ver) 43 { 44 private ubyte[] raw; 45 46 private bool unpackedFields = true; 47 private bool unpackedAttributes = true; 48 private string[9] fields; 49 private string[string] kvmap; 50 51 /// constructor (raw ubytes) 52 this(ubyte[]data){ 53 this.raw = data; 54 this.unpackedFields = false; 55 this.unpackedAttributes = false; 56 } 57 58 /// constructor (string) 59 this(string data){ 60 this(cast(ubyte[])data); 61 } 62 63 /// unpack fields of gff line for mutability 64 private void unpack() 65 { 66 if(this.unpackedFields) return; 67 this.fields = (cast(string) this.raw).split('\t'); 68 assert(this.fields.length == 9); 69 this.unpackedFields = true; 70 } 71 72 /// unpack attributes of gff3 line for mutability 73 private void unpackGFF3Attributes(){ 74 if(this.unpackedAttributes) return; 75 unpack; 76 auto kvpairs = this.fields[8].split(';'); 77 foreach (string kv; kvpairs) 78 { 79 auto kvfields = kv.split('='); 80 assert(kvfields.length == 2); 81 this.kvmap[kvfields[0]] = kvfields[0]; 82 } 83 this.unpackedAttributes = true; 84 } 85 86 /// unpack attributes of gff2 line for mutability 87 private void unpackGFF2Attributes(){ 88 if(this.unpackedAttributes) return; 89 unpack; 90 auto kvpairs = this.fields[8].split(';'); 91 foreach (string kv; kvpairs) 92 { 93 auto kvfields = kv.strip.split(' '); 94 assert(kvfields.length == 2); 95 this.kvmap[kvfields[0].strip] = kvfields[0].strip; 96 } 97 this.unpackedAttributes = true; 98 } 99 100 /// TODO: Not implemented; (almost) always true 101 @property bool isValid() 102 { 103 //hts_log_trace(__FUNCTION__, format("raw.length %d", raw.length)); 104 return (raw.length >= 0 ? true : false); 105 } 106 107 /// Column 1: seqid (aka contig); basis for the coordinate system 108 /// getter 109 @property seqid() const 110 { 111 if(unpackedFields) return this.fields[0]; 112 return cast(string)this.raw.splitter('\t').front; 113 } 114 115 /// Column 1: seqid (aka contig); basis for the coordinate system 116 /// setter 117 @property seqid(string chr) 118 { 119 unpack; 120 this.fields[0] = chr; 121 } 122 123 /// ditto 124 @property contig() const 125 { 126 return seqid; 127 } 128 129 /// ditto 130 @property contig(string chr) 131 { 132 seqid(chr); 133 } 134 135 /// Column 2: source; software, procedure, or database originating the record 136 /// getter 137 @property source() const 138 { 139 if(unpackedFields) return this.fields[1]; 140 return cast(string)this.raw.splitter('\t').drop(1).front; 141 } 142 143 /// Column 2: source; software, procedure, or database originating the record 144 /// setter 145 @property source(string src) 146 { 147 unpack; 148 this.fields[1] = src; 149 } 150 151 /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number 152 /// getter 153 @property type() const 154 { 155 if(unpackedFields) return this.fields[2]; 156 return cast(string)this.raw.splitter('\t').drop(2).front; 157 } 158 159 /// Column 3: feature type; sequence ontology (SO) defined type, or SO accession number 160 /// setter 161 @property type(string typ) 162 { 163 unpack; 164 this.fields[2] = typ; 165 } 166 167 /// Columns 4 & 5: returns Coordinate set: OBC format 168 /// getter 169 @property coordinates() const 170 { 171 long start, end; 172 if(unpackedFields){ 173 start = this.fields[3].to!long; 174 end = this.fields[4].to!long; 175 }else{ 176 start = (cast(string)this.raw.splitter('\t').drop(3).front).to!long; 177 end = (cast(string)this.raw.splitter('\t').drop(4).front).to!long; 178 } 179 return OBC(start, end); 180 } 181 182 /// Columns 4 & 5: returns Coordinate set: OBC format 183 /// setter 184 @property coordinates(CoordSystem cs)(Interval!cs coords) 185 { 186 unpack; 187 auto newCoords = coords.to!(CoordSystem.obc); 188 this.fields[3] = newCoords.start.pos.to!string; 189 this.fields[4] = newCoords.end.pos.to!string; 190 } 191 192 /// Columns 4: start; 1-based integer start position of the feature 193 @property start() const { return this.coordinates.start; } 194 /// Column 5: end; closed coordinate integer ending nucleotide position of the feature 195 @property end() const { return this.coordinates.end; } 196 197 /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined." 198 /// Tragically, score can be either a float, or not present (".") 199 /// Totally arbitrarily, we will represent absent as -1 200 /// getter 201 @property score() const 202 { 203 string val; 204 if(unpackedFields) val = this.fields[5]; 205 else val = cast(string)this.raw.splitter('\t').drop(5).front; 206 if(val=="."){ 207 return -1.0; 208 } 209 return val.to!float; 210 } 211 212 /// Column 6: score; float. From the standard: "the semantics of the score are ill-defined." 213 /// Tragically, score can be either a float, or not present (".") 214 /// Totally arbitrarily, we will represent absent as -1 215 /// setter 216 @property score(float s) 217 { 218 unpack; 219 if(s.isNaN) this.fields[5] = "."; 220 else this.fields[5] = s.to!string; 221 } 222 223 /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown) 224 // getter 225 @property strand() const 226 { 227 if(unpackedFields) return cast(char)this.fields[6][0]; 228 return cast(char)this.raw.splitter('\t').drop(6).front[0]; 229 } 230 231 /// Column 7: strand; '+', '-', or '.' (or '?' for relevant but unknown) 232 //setter 233 @property strand(char s) 234 { 235 unpack; 236 assert(s == '+' || s == '-' || s == '.'); 237 this.fields[6] = [s].idup; 238 } 239 240 /** Column 8: phase; 241 For features of type "CDS", the phase indicates where the feature begins with 242 reference to the reading frame. The phase is one of the integers 0, 1, or 2, 243 indicating the number of bases that should be removed from the beginning of 244 this feature to reach the first base of the next codon. In other words, a 245 phase of "0" indicates that the next codon begins at the first base of the 246 region described by the current line, a phase of "1" indicates that the next 247 codon begins at the second base of this region, and a phase of "2" indicates 248 that the codon begins at the third base of this region. This is NOT to be 249 confused with the frame, which is simply start modulo 3. 250 251 For forward strand features, phase is counted from the start field. 252 For reverse strand features, phase is counted from the end field. 253 254 The phase is REQUIRED for all CDS features. 255 256 Tragically, phase can be either an integer (0, 1, 2), or not present (".") 257 Totally arbitrarily, we will represent absent as -1 258 **/ 259 @property phase() const 260 { 261 string val; 262 if(unpackedFields) val = this.fields[7]; 263 else val = cast(string)this.raw.splitter('\t').drop(7).front; 264 if(val=="."){ 265 return -1; 266 } 267 return val.to!int; 268 } 269 270 @property phase(long p) 271 { 272 unpack; 273 assert(p == 0 || p == 1 || p == 2 || p == -1); 274 if(p == -1) this.fields[7] = "."; 275 else this.fields[7] = p.to!string; 276 } 277 278 /// Column 9: backwards compatible GFF2 group field 279 @property group() const 280 { 281 if(unpackedAttributes){ 282 static if(ver == GFFVersion.GFF3) 283 return ("\t" ~ this.kvmap.byKeyValue.map!(a => a.key ~ "=" ~ a.value).join(";")); 284 else 285 return ("\t" ~ (this.kvmap.byKeyValue.map!(a => " " ~ a.key ~ " " ~ a.value ~ " ").join(";"))[1 .. $]); 286 } 287 if(unpackedFields) return this.fields[8]; 288 return cast(string)this.raw.splitter('\t').drop(8).front; 289 } 290 291 /// Column 9: backwards compatible GFF2 group field 292 @property group(string g) 293 { 294 unpack; 295 this.fields[8] = g; 296 297 /// clear hashmap as values have been overwritten 298 if(this.unpackedAttributes) this.kvmap.clear; 299 this.unpackedAttributes = false; 300 } 301 302 /// Column 9: attributes; A list of ;-separated feature attributes in key=value form 303 string attributes(const string field){ return this.opIndex(field); } 304 305 /// Provides map key lookup semantics for column 9 attributes 306 string opIndex(string field) const 307 { 308 if(unpackedAttributes) return this.kvmap[field]; 309 static if(ver != GFFVersion.GFF3){ 310 auto attrs=this.raw.splitter('\t').drop(8).front.splitter(";"); 311 auto val = attrs // actualy a Range of key=val 312 .filter!(kv => ((cast(string) kv).strip[0 .. (cast(string) kv).strip.countUntil(' ')]) == field); 313 //.front; // -- AssertError if range is empty 314 if (!val.empty) return ((cast(string) val.front).strip[(cast(string) val.front).strip.countUntil(' ') + 1..$]).strip.strip("\""); 315 else return ""; 316 } 317 else 318 { 319 auto attrs=this.raw.splitter('\t').drop(8).front.splitter(";"); 320 auto val = attrs // actualy a Range of key=val 321 .filter!(kv => cast(string)(kv[0 .. kv.countUntil('=')]) == field); 322 //.front; // -- AssertError if range is empty 323 if (!val.empty) return cast(string) (val.front[val.front.countUntil('=')+1..$]); 324 else return ""; 325 } 326 327 /+ Alternative impl -- benchmark (also pull field ~ "=" out of the filter and combine it once upfront) 328 auto vals = attrs 329 .filter!(kv => kv.startsWith(field ~ "=")) 330 .map!(kv => kv[kv.countUntil('=')+1 .. $]); 331 if (!vals.empty) return cast(string) val.front; 332 else return ""; 333 +/ 334 } 335 336 /// Provides map key assignment semantics for column 9 attributes 337 void opIndexAssign(string val, string field) 338 { 339 static if(ver == GFFVersion.GFF3) unpackGFF3Attributes; 340 else unpackGFF2Attributes; 341 static if(ver != GFFVersion.GFF3) val = "\"" ~ val ~ "\""; 342 this.kvmap[field] = val; 343 } 344 345 /// Column 9 attributes may also include a comma-sep list of tags: (key:tag)={t1,t2,t3,...} 346 bool hasTag(string tagName)() 347 { 348 return this["tag"].splitter(",").filter!(t => t == tagName).count > 0; 349 } 350 351 /// Computed feature length 352 @property length() const { return this.end - (this.start-1); } 353 /// Relative start === 1 354 @property relativeStart() const { return OB(1); } 355 /// Relative start === the feature length 356 @property relativeEnd() const { return OB(this.length - 1); } 357 358 /// Genomic coordinate at offset into feature, taking strandedness into account 359 @property coordinateAtOffset(long offset) const 360 { 361 // GFF3 features are 1-based coordinates 362 assert(offset >= 0); 363 364 // for - strand count from end; for +, ., and ? strand count from start 365 immutable auto begin = (this.strand == '-' ? this.end : this.start); 366 367 // for - strand count backward; for +, ., and ? strand count forward 368 immutable auto direction = (this.strand == '-' ? -1 : 1); 369 370 return OB(begin + (direction * offset)); 371 } 372 /// Genomic coordinate at beginning of feature, taking strandedness into account 373 @property coordinateAtBegin() const 374 { 375 return this.coordinateAtOffset(0); 376 } 377 /// Genomic coordinate at end of feature, taking strandedness into account 378 @property coordinateAtEnd() const 379 { 380 return this.coordinateAtOffset(this.length - 1); 381 } 382 383 string toString() const 384 { 385 if(!unpackedFields) 386 return cast(string) this.raw; 387 string ret; 388 if(unpackedAttributes){ 389 ret = this.fields[0..8].join("\t"); 390 static if(ver == GFFVersion.GFF3) 391 ret ~= ("\t" ~ this.kvmap.byKeyValue.map!(a => a.key ~ "=" ~ a.value).join(";")); 392 else 393 ret ~= ("\t" ~ (this.kvmap.byKeyValue.map!(a => " " ~ a.key ~ " " ~ a.value ~ " ").join(";"))[1 .. $]); 394 }else{ 395 ret = this.fields[].join("\t"); 396 } 397 return ret; 398 } 399 400 /// Returns a string with the canonical "chr:start-end" representation 401 @property string canonicalRepresentation() const 402 { 403 return this.seqid~":"~this.start.pos.to!string~"-"~this.end.pos.to!string; 404 } 405 /// Return the seqURI representation 406 @property string seqURI() const { 407 return format("seq:unk/%s", this.canonicalRepresentation); 408 } 409 410 }