VCFRecord

Wrapper around bcf1_t

Because it uses bcf1_t internally, it must conform to the BCF2 part of the VCFv4.2 specs, rather than the loosey-goosey VCF specs. i.e., INFO, CONTIG, FILTER records must exist in the header.

TODO: Does this need to be kept in a consistent state? Ideally, VCFWriter would reject invalid ones, but we are informed that it is invalid (e.g. if contig not found) while building this struct; bcf_write1 will actually segfault, unfortunately. I'd like to avoid expensive validate() calls for every record before writing if possible, which means keeping this consistent. However, not sure what to do if error occurs when using the setters herein?

2019-01-23 struct->class to mirror SAMRecord -- faster if reference type?

2019-01-23 WIP: getters for chrom, pos, id, ref, alt are complete (untested)

After parsing a BCF or VCF line, bcf1_t must be UnpackLeveled. (not applicable when building bcf1_t from scratch) Depending on information needed, this can be done to various levels with performance tradeoff. Unpacking symbols: UNPACK.ALL: all UNPACK.SHR: all shared information (UNPACK.ALT|UNPACK.FILT|UNPACK.INFO)

UnpackLevel.ALT / UnpackLevel.FILT | / UnpackLevel.INFO | | / ____________________________ UnpackLevel.FMT V V V / | | | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 ...

struct VCFRecord {}

Constructors

this
this(T h, bcf1_t* b, UnpackLevel MAX_UNPACK)
this(VCFHeader vcfhdr, string chrom, int pos, string id, string _ref, string alt, float qual, SS filter)
this(VCFHeader vcfhdr, string line, UnpackLevel MAX_UNPACK)

VCFRecord

Members

Functions

add
void add(const(char)[] tag, T data)

add INFO or FORMAT key:value pairs to a record add a single datapoint OR vector of values, OR, values to each sample (if lineType == FORMAT)

addFilter
int addFilter(string f)

Add a filter; from htslib: "If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed."

addFormat
void addFormat(string tag, T data)
void addFormat(string tag, T[] data)

Update FORMAT (sample info; column 9+) * * Templated on data type, calls one of bc_update_format_{int32,float,string,flag}

addID
int addID(const(char)[] id)

Append an ID (column 3) to the record.

addInfo
void addInfo(string tag, T data)

Update INFO (pan-sample info; column 8) * * Add a tag:value to the INFO column * NOTE: tag must already exist in the header * * Templated on data type, calls one of bcf_update_info_{int32,float,string,flag} * Both singletons and arrays are supported.

getFormats
FormatField[string] getFormats()

returns a hashmap of format data by field

getInfos
InfoField[string] getInfos()

returns a hashmap of info data by field

hasFilter
bool hasFilter(string filter)

Determine whether FILTER is present. log warning if filter does not exist. "PASS" and "." can be used interchangeably.

removeAllFilters
void removeAllFilters()

Remove all entries in FILTER

removeFilter
int removeFilter(string f)

Remove a filter by name

removeFilter
int removeFilter(int fid)

Remove a filter by numeric id

removeFormat
void removeFormat(string tag)

remove a format section

removeInfo
void removeInfo(string tag)

remove an info section

setAlleles
void setAlleles(string _ref, string alt)

Set alleles; alt can be comma separated

setAlleles
void setAlleles(string _ref, string alt, ...)

Set alleles; min. 2 alleles (ref, alt1); unlimited alts may be specified

setRefAllele
void setRefAllele(string r)

Set REF allele only

toString
string toString()

Return a string representation of the VCFRecord (i.e. as would appear in .vcf)

unpack
void unpack(UnpackLevel unpackLevel)

ensure that vcf variable length data is unpacked to at least desired level

Properties

alleles
string alleles [@property setter]

Set alleles; comma-separated list

alleles
string[] alleles [@property setter]

Set alleles; array

allelesAsArray
string[] allelesAsArray [@property getter]

All alleles getter (array)

altAllelesAsArray
string[] altAllelesAsArray [@property getter]

Alternate alleles getter version 1: ["A", "ACTG", ...]

altAllelesAsString
string altAllelesAsString [@property getter]

Alternate alleles getter version 2: "A,ACTG,..."

chrom
string chrom [@property getter]

Get chromosome (CHROM)

chrom
const(char)[] chrom [@property setter]

Set chromosome (CHROM)

coordinates
ZBHO coordinates [@property getter]

Coordinate range of the reference allele

filter
string filter [@property getter]

Get FILTER column (nothing in htslib sadly)

filter
string filter [@property setter]

Set the FILTER column to f

filter
string[] filter [@property setter]

Set the FILTER column to f0,f1,f2... TODO: determine definitiely whether "." is replaced with "PASS"

id
string id [@property getter]

Get ID string

id
const(char)[] id [@property setter]

Sets new ID string; comma-separated list allowed but no dup checking performed

pos
ZB pos [@property getter]

Get position (POS, column 2) * * NB: internally BCF is uzing 0 based coordinates; we only show +1 when printing a VCF line with toString (which calls vcf_format)

pos
ZB pos [@property setter]

Set position (POS, column 2)

qual
float qual [@property getter]

Get variant quality (QUAL, column 6)

qual
float qual [@property setter]

Set variant quality (QUAL, column 6)

refAllele
string refAllele [@property getter]

Reference allele getter

refLen
long refLen [@property getter]

REF allele length

Variables

line
Bcf1 line;

htslib structured record TODO: change to 'b' for better internal consistency? (vcf.h/c actually use line quite a bit in fn params)

vcfheader
VCFHeader vcfheader;

corresponding header (required);

Examples

1 import std.exception: assertThrown;
2 import std.stdio: writeln, writefln;
3 import dhtslib.vcf.writer;
4 
5 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
6 
7 
8 auto vw = VCFWriter("/dev/null");
9 
10 vw.addHeaderLineRaw("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
11 vw.addHeaderLineKV("INFO", "<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
12 // ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
13 vw.vcfhdr.addHeaderLine!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data");
14 vw.addHeaderLineRaw("##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>"); // @suppress(dscanner.style.long_line)
15 vw.addHeaderLineRaw("##FILTER=<ID=q10,Description=\"Quality below 10\">");
16 HeaderRecord hrec;
17 hrec.setHeaderRecordType(HeaderRecordType.Format);
18 hrec.setID("AF");
19 hrec.setLength(HeaderLengths.OnePerAltAllele);
20 hrec.setValueType(HeaderTypes.Float);
21 hrec.setDescription("Allele Freq");
22 vw.vcfhdr.addHeaderRecord(hrec);
23 
24 assert(vw.vcfhdr.getHeaderRecord(HeaderRecordType.Format, "ID","AF")["ID"] == "AF");
25 vw.addHeaderLineRaw("##FORMAT=<ID=CH,Number=3,Type=String,Description=\"test\">");
26 
27 // Exercise header
28 assert(vw.vcfhdr.nsamples == 0);
29 vw.addSample("NA12878");
30 assert(vw.vcfhdr.nsamples == 1);
31 
32 vw.writeHeader();
33 
34 auto r = new VCFRecord(vw.vcfhdr, bcf_init1());
35 
36 r.chrom = "20";
37 assert(r.chrom == "20");
38 assertThrown(r.chrom = "chr20");   // headerline chromosome is "20" not "chr20"
39 
40 r.pos = Coordinate!(Basis.zero)(999_999);
41 assert(r.pos == Coordinate!(Basis.zero)(999_999));
42 r.pos = Coordinate!(Basis.zero)(62_435_964 + 1);     // Exceeds contig length
43 
44 // Test ID field
45 // note that in an empty freshly initialized bcf1_t, the ID field is "" rather than "."
46 // Will consider adding initialization code
47 //writefln("ID: %s", r.id);
48 r.id = "";
49 assert(r.id == ".");
50 r.id = ".";
51 assert(r.id == ".");
52 r.id = "rs001";
53 assert(r.id == "rs001");
54 r.addID("rs999");
55 assert(r.id == "rs001;rs999");
56 r.addID("rs001");   // test duplicate checking
57 assert(r.id == "rs001;rs999");
58 
59 // Test REF/ALT allele setters and getters
60 // many overloads to test for good coverage
61 // Test matrix: Set {zero, one, two, three} alleles * {set by string, set by array} = 8 test cases
62 //  * also, there is setAlleles(ref, alt) and setAlleles(ref, alt1, ...) 
63 //  * TODO: will also need to retreive alt alleles as array and string
64 
65 string[] alleles_array;
66 
67 // Zero alleles
68 r.alleles("");
69 const auto zs = r.allelesAsArray;
70 const auto zsr = r.refAllele;
71 assert(zs.length == 0);
72 assert(zsr == "");
73 
74 r.alleles(alleles_array);
75 const auto za = r.allelesAsArray;
76 const auto zar = r.refAllele;
77 assert(za.length == 0);
78 assert(zar == "");
79 
80 // One allele
81 r.alleles("C");
82 const auto os = r.allelesAsArray;
83 const auto osr = r.refAllele;
84 assert(os.length == 1 && os[0] == "C");
85 assert(osr == "C");
86 
87 r.alleles(["C"]);
88 const auto oa = r.allelesAsArray;
89 const auto oar = r.refAllele;
90 assert(oa.length == 1 && oa[0] == "C");
91 assert(oar == "C");
92 
93 // Two alleles
94 r.alleles("C,T");
95 const auto ts = r.allelesAsArray;
96 const auto tsr= r.refAllele;
97 assert(ts == ["C", "T"]);
98 assert(tsr== "C");
99 
100 r.alleles(["C", "T"]);
101 const auto ta = r.allelesAsArray;
102 const auto tar= r.refAllele;
103 assert(ta == ["C", "T"]);
104 assert(tar== "C");
105 
106 const taaa = r.altAllelesAsArray;
107 const taas = r.altAllelesAsString;
108 assert(taaa == ["T"]);
109 assert(taas == "T");
110 
111 // alternate setter for >= 2 alleles
112 r.setAlleles("A", "G");
113 assert(r.allelesAsArray == ["A", "G"]);
114 
115 // Three alleles
116 r.alleles("A,C,T");
117 assert(r.allelesAsArray == ["A", "C", "T"]);
118 assert(r.refAllele == "A");
119 assert(r.altAllelesAsString == "C,T");
120 
121 // alternate the alleles for testing purposes
122 r.alleles(["G", "A", "C"]);
123 assert(r.allelesAsArray == ["G", "A", "C"]);
124 assert(r.refAllele == "G");
125 assert(r.altAllelesAsString == "A,C");
126 
127 r.setAlleles("A", "C", "T");
128 assert(r.allelesAsArray == ["A", "C", "T"]);
129 assert(r.refAllele == "A");
130 assert(r.altAllelesAsString == "C,T");
131 
132 r.setRefAllele("G");
133 assert(r.allelesAsArray == ["G", "C", "T"]);
134 assert(r.refAllele == "G");
135 assert(r.altAllelesAsString == "C,T");
136 
137 r.setRefAllele("A");
138 assert(r.allelesAsArray == ["A", "C", "T"]);
139 assert(r.refAllele == "A");
140 assert(r.altAllelesAsString == "C,T");
141 
142 
143 
144 // Test QUAL
145 r.qual = 1.0;
146 assert(r.qual == 1.0);
147 // now test setting qual without UnpackLeveling
148 // TODO: see https://forum.dlang.org/post/hebouvswxlslqhovzaia@forum.dlang.org, once resolved (or once factory function written),
149 //  add template value param UnpackLevel.ALT
150 auto rr = new VCFRecord(vw.vcfhdr, "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0.017\n"); // @suppress(dscanner.style.long_line)
151 rr.qual = 3.0;
152 assert(rr.qual == 3.0);
153 
154 
155 // Test FILTER
156 assert(r.hasFilter("PASS"));
157 assert(r.hasFilter("."));
158 // add q10 filter (is in header)
159 r.filter = "q10";
160 assert(r.filter == "q10");
161 assert(!r.hasFilter("PASS"));   // if has another filter, no longer PASS (unless added explicitly)
162 assert(!r.hasFilter("."));      // if has another filter, no longer has . (which is PASS, or no filter [spec conflict?])
163 
164 // add q30 filteR (not in header)
165 r.filter = "q30";
166 assert(r.filter == "q10");  // i.e., unchanged
167 
168 assert(r.hasFilter("q10"));
169 assert(!r.hasFilter("q30"));
170 
171 r.removeAllFilters();
172 assert(r.hasFilter("."));
173 r.addFilter("q10");
174 assert(r.hasFilter("q10"));
175 r.removeFilter("q10");
176 assert(r.hasFilter("PASS"));
177 assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\n");
178 
179 r.addFormat("AF",[0.1, 0.5]);
180 assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\tAF\t0.1,0.5\n");
181 
182 rr.addFormat("AF",[0.1]);
183 writeln(rr.toString);
184 assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0\tAF\t0.1\n");
185 
186 
187 assert(rr.getFormats["AF"].to!float[0][0] == 0.1f);
188 
189 rr.addInfo("AF",0.5);
190 assert(rr.getInfos["AF"].to!float == 0.5f);
191 
192 rr.removeInfo("AF");
193 
194 rr.removeFormat("AF");
195 
196 rr.addFormat("CH", "test");
197 
198 
199 writeln(rr.toString);
200 writeln(rr.getFormats["CH"].to!string);
201 assert(rr.getFormats["CH"].to!string[0][0] == "test");
202 
203 assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11\tCH\ttest\n");
204 
205 assert(rr.coordinates == ZBHO(17329,17330));
206 vw.writeRecord(*r);
207 vw.writeRecord(vw.vcfhdr.hdr, r.line);
208 
209 
210 // Finally, print the records:
211 writefln("VCF records via toString:\n%s%s", (*r), (*rr));
212 
213 writeln("\ndhtslib.vcf: all tests passed\n");

Meta