VCFRecord

Wrapper around bcf1_t

Because it uses bcf1_t internally, it must conform to the BCF2 part of the VCFv4.2 specs, rather than the loosey-goosey VCF specs. i.e., INFO, CONTIG, FILTER records must exist in the header.

TODO: Does this need to be kept in a consistent state? Ideally, VCFWriter would reject invalid ones, but we are informed that it is invalid (e.g. if contig not found) while building this struct; bcf_write1 will actually segfault, unfortunately. I'd like to avoid expensive validate() calls for every record before writing if possible, which means keeping this consistent. However, not sure what to do if error occurs when using the setters herein?

2019-01-23 struct->class to mirror SAMRecord -- faster if reference type?

2019-01-23 WIP: getters for chrom, pos, id, ref, alt are complete (untested)

After parsing a BCF or VCF line, bcf1_t must be UnpackLeveled. (not applicable when building bcf1_t from scratch) Depending on information needed, this can be done to various levels with performance tradeoff. Unpacking symbols: UNPACK.ALL: all UNPACK.SHR: all shared information (UNPACK.ALT|UNPACK.FILT|UNPACK.INFO)

UnpackLevel.ALT / UnpackLevel.FILT | / UnpackLevel.INFO | | / ____________________________ UnpackLevel.FMT V V V / | | | #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 ...

struct VCFRecord {}

Constructors

this
this(T h, bcf1_t* b, UnpackLevel MAX_UNPACK)
this(VCFHeader vcfhdr, string chrom, int pos, string id, string _ref, string alt, float qual, SS filter)
this(VCFHeader vcfhdr, string line, UnpackLevel MAX_UNPACK)

VCFRecord

Postblit

this(this)
this(this)

Explicit postblit to avoid https://github.com/blachlylab/dhtslib/issues/122

Members

Functions

add
void add(const(char)[] tag, T data)

add INFO or FORMAT key:value pairs to a record add a single datapoint OR vector of values, OR, values to each sample (if lineType == FORMAT)

addFilter
int addFilter(string f)

Add a filter; from htslib: "If flt_id is PASS, all existing filters are removed first. If other than PASS, existing PASS is removed."

addFormat
void addFormat(string tag, T data)
void addFormat(string tag, T[] data)

Update FORMAT (sample info; column 9+) * * Templated on data type, calls one of bc_update_format_{int32,float,string,flag}

addID
int addID(const(char)[] id)

Append an ID (column 3) to the record.

addInfo
void addInfo(string tag, T data)

Update INFO (pan-sample info; column 8) * * Add a tag:value to the INFO column * NOTE: tag must already exist in the header * * Templated on data type, calls one of bcf_update_info_{int32,float,string,flag} * Both singletons and arrays are supported.

getFormats
FormatField[string] getFormats()

returns a hashmap of format data by field

getGenotype
Genotype getGenotype(int sampleIdx)
Undocumented in source. Be warned that the author may not have intended to support it.
getGenotypes
Genotype[] getGenotypes()
Undocumented in source. Be warned that the author may not have intended to support it.
getInfos
InfoField[string] getInfos()

returns a hashmap of info data by field

hasFilter
bool hasFilter(string filter)

Determine whether FILTER is present. log warning if filter does not exist. "PASS" and "." can be used interchangeably.

removeAllFilters
void removeAllFilters()

Remove all entries in FILTER

removeFilter
int removeFilter(string f)

Remove a filter by name

removeFilter
int removeFilter(int fid)

Remove a filter by numeric id

removeFormat
void removeFormat(string tag)

remove a format section

removeInfo
void removeInfo(string tag)

remove an info section

setAlleles
void setAlleles(string _ref, string alt)

Set alleles; alt can be comma separated

setAlleles
void setAlleles(string _ref, string alt, ...)

Set alleles; min. 2 alleles (ref, alt1); unlimited alts may be specified

setRefAllele
void setRefAllele(string r)

Set REF allele only

toString
string toString()

Return a string representation of the VCFRecord (i.e. as would appear in .vcf)

unpack
void unpack(UnpackLevel unpackLevel)

ensure that vcf variable length data is unpacked to at least desired level

Properties

alleles
string alleles [@property setter]

Set alleles; comma-separated list

alleles
string[] alleles [@property setter]

Set alleles; array

allelesAsArray
string[] allelesAsArray [@property getter]

All alleles getter (array)

altAllelesAsArray
string[] altAllelesAsArray [@property getter]

Alternate alleles getter version 1: ["A", "ACTG", ...]

altAllelesAsString
string altAllelesAsString [@property getter]

Alternate alleles getter version 2: "A,ACTG,..."

chrom
string chrom [@property getter]

Get chromosome (CHROM)

chrom
const(char)[] chrom [@property setter]

Set chromosome (CHROM)

coordinates
ZBHO coordinates [@property getter]

Coordinate range of the reference allele

filter
string filter [@property getter]

Get FILTER column (nothing in htslib sadly)

filter
string filter [@property setter]

Set the FILTER column to f

filter
string[] filter [@property setter]

Set the FILTER column to f0,f1,f2... TODO: determine definitiely whether "." is replaced with "PASS"

id
string id [@property getter]

Get ID string

id
const(char)[] id [@property setter]

Sets new ID string; comma-separated list allowed but no dup checking performed

pos
ZB pos [@property getter]

Get position (POS, column 2) * * NB: internally BCF is uzing 0 based coordinates; we only show +1 when printing a VCF line with toString (which calls vcf_format)

pos
ZB pos [@property setter]

Set position (POS, column 2)

qual
float qual [@property getter]

Get variant quality (QUAL, column 6)

qual
float qual [@property setter]

Set variant quality (QUAL, column 6)

refAllele
string refAllele [@property getter]

Reference allele getter

refLen
long refLen [@property getter]

REF allele length

Variables

line
Bcf1 line;

htslib structured record TODO: change to 'b' for better internal consistency? (vcf.h/c actually use line quite a bit in fn params)

vcfheader
VCFHeader vcfheader;

corresponding header (required);

Examples

1 import std.exception: assertThrown;
2 import std.stdio: writeln, writefln;
3 import dhtslib.vcf.writer;
4 
5 hts_set_log_level(htsLogLevel.HTS_LOG_TRACE);
6 
7 
8 auto vw = VCFWriter("/dev/null", VCFWriterTypes.VCF);
9 
10 vw.addHeaderLineRaw("##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">");
11 vw.addHeaderLineKV("INFO", "<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">");
12 vw.addHeaderLineKV("INFO", "<ID=DP2,Number=2,Type=Float,Description=\"Total Depth\">");
13 // ##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
14 vw.vcfhdr.addHeaderLine!(HeaderRecordType.Info)("AF", HeaderLengths.OnePerAltAllele, HeaderTypes.Integer, "Number of Samples With Data");
15 vw.addHeaderLineRaw("##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d618ff66beb2da,species=\"Homo sapiens\",taxonomy=x>"); // @suppress(dscanner.style.long_line)
16 vw.addHeaderLineRaw("##FILTER=<ID=q10,Description=\"Quality below 10\">");
17 HeaderRecord hrec;
18 hrec.setHeaderRecordType(HeaderRecordType.Format);
19 hrec.setID("AF");
20 hrec.setLength(HeaderLengths.OnePerAltAllele);
21 hrec.setValueType(HeaderTypes.Float);
22 hrec.setDescription("Allele Freq");
23 vw.vcfhdr.addHeaderRecord(hrec);
24 
25 assert(vw.vcfhdr.getHeaderRecord(HeaderRecordType.Format, "ID","AF")["ID"] == "AF");
26 vw.addHeaderLineRaw("##FORMAT=<ID=CH,Number=3,Type=String,Description=\"test\">");
27 
28 // Exercise header
29 assert(vw.vcfhdr.nsamples == 0);
30 vw.addSample("NA12878");
31 assert(vw.vcfhdr.nsamples == 1);
32 
33 vw.writeHeader();
34 
35 auto r = new VCFRecord(vw.vcfhdr, bcf_init1());
36 
37 r.chrom = "20";
38 assert(r.chrom == "20");
39 assertThrown(r.chrom = "chr20");   // headerline chromosome is "20" not "chr20"
40 
41 r.pos = Coordinate!(Basis.zero)(999_999);
42 assert(r.pos == Coordinate!(Basis.zero)(999_999));
43 r.pos = Coordinate!(Basis.zero)(62_435_964 + 1);     // Exceeds contig length
44 
45 // Test ID field
46 // note that in an empty freshly initialized bcf1_t, the ID field is "" rather than "."
47 // Will consider adding initialization code
48 //writefln("ID: %s", r.id);
49 r.id = "";
50 assert(r.id == ".");
51 r.id = ".";
52 assert(r.id == ".");
53 r.id = "rs001";
54 assert(r.id == "rs001");
55 r.addID("rs999");
56 assert(r.id == "rs001;rs999");
57 r.addID("rs001");   // test duplicate checking
58 assert(r.id == "rs001;rs999");
59 
60 // Test REF/ALT allele setters and getters
61 // many overloads to test for good coverage
62 // Test matrix: Set {zero, one, two, three} alleles * {set by string, set by array} = 8 test cases
63 //  * also, there is setAlleles(ref, alt) and setAlleles(ref, alt1, ...) 
64 //  * TODO: will also need to retreive alt alleles as array and string
65 
66 string[] alleles_array;
67 
68 // Zero alleles
69 r.alleles("");
70 const auto zs = r.allelesAsArray;
71 const auto zsr = r.refAllele;
72 assert(zs.length == 0);
73 assert(zsr == "");
74 
75 r.alleles(alleles_array);
76 const auto za = r.allelesAsArray;
77 const auto zar = r.refAllele;
78 assert(za.length == 0);
79 assert(zar == "");
80 
81 // One allele
82 r.alleles("C");
83 const auto os = r.allelesAsArray;
84 const auto osr = r.refAllele;
85 assert(os.length == 1 && os[0] == "C");
86 assert(osr == "C");
87 
88 r.alleles(["C"]);
89 const auto oa = r.allelesAsArray;
90 const auto oar = r.refAllele;
91 assert(oa.length == 1 && oa[0] == "C");
92 assert(oar == "C");
93 
94 // Two alleles
95 r.alleles("C,T");
96 const auto ts = r.allelesAsArray;
97 const auto tsr= r.refAllele;
98 assert(ts == ["C", "T"]);
99 assert(tsr== "C");
100 
101 r.alleles(["C", "T"]);
102 const auto ta = r.allelesAsArray;
103 const auto tar= r.refAllele;
104 assert(ta == ["C", "T"]);
105 assert(tar== "C");
106 
107 const taaa = r.altAllelesAsArray;
108 const taas = r.altAllelesAsString;
109 assert(taaa == ["T"]);
110 assert(taas == "T");
111 
112 // alternate setter for >= 2 alleles
113 r.setAlleles("A", "G");
114 assert(r.allelesAsArray == ["A", "G"]);
115 
116 // Three alleles
117 r.alleles("A,C,T");
118 assert(r.allelesAsArray == ["A", "C", "T"]);
119 assert(r.refAllele == "A");
120 assert(r.altAllelesAsString == "C,T");
121 
122 // alternate the alleles for testing purposes
123 r.alleles(["G", "A", "C"]);
124 assert(r.allelesAsArray == ["G", "A", "C"]);
125 assert(r.refAllele == "G");
126 assert(r.altAllelesAsString == "A,C");
127 
128 r.setAlleles("A", "C", "T");
129 assert(r.allelesAsArray == ["A", "C", "T"]);
130 assert(r.refAllele == "A");
131 assert(r.altAllelesAsString == "C,T");
132 
133 r.setRefAllele("G");
134 assert(r.allelesAsArray == ["G", "C", "T"]);
135 assert(r.refAllele == "G");
136 assert(r.altAllelesAsString == "C,T");
137 
138 r.setRefAllele("A");
139 assert(r.allelesAsArray == ["A", "C", "T"]);
140 assert(r.refAllele == "A");
141 assert(r.altAllelesAsString == "C,T");
142 
143 
144 
145 // Test QUAL
146 r.qual = 1.0;
147 assert(r.qual == 1.0);
148 // now test setting qual without UnpackLeveling
149 // TODO: see https://forum.dlang.org/post/hebouvswxlslqhovzaia@forum.dlang.org, once resolved (or once factory function written),
150 //  add template value param UnpackLevel.ALT
151 auto rr = new VCFRecord(vw.vcfhdr, "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0.017\n"); // @suppress(dscanner.style.long_line)
152 rr.qual = 3.0;
153 assert(rr.qual == 3.0);
154 
155 
156 // Test FILTER
157 assert(r.hasFilter("PASS"));
158 assert(r.hasFilter("."));
159 // add q10 filter (is in header)
160 r.filter = "q10";
161 assert(r.filter == "q10");
162 assert(!r.hasFilter("PASS"));   // if has another filter, no longer PASS (unless added explicitly)
163 assert(!r.hasFilter("."));      // if has another filter, no longer has . (which is PASS, or no filter [spec conflict?])
164 
165 // add q30 filteR (not in header)
166 r.filter = "q30";
167 assert(r.filter == "q10");  // i.e., unchanged
168 
169 assert(r.hasFilter("q10"));
170 assert(!r.hasFilter("q30"));
171 
172 r.removeAllFilters();
173 assert(r.hasFilter("."));
174 r.addFilter("q10");
175 assert(r.hasFilter("q10"));
176 r.removeFilter("q10");
177 assert(r.hasFilter("PASS"));
178 assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\n");
179 
180 r.addFormat("AF",[0.1, 0.5]);
181 assert(r.toString == "20\t62435966\trs001;rs999\tA\tC,T\t1\t.\t.\tAF\t0.1,0.5\n");
182 
183 rr.addFormat("AF",[0.1]);
184 writeln(rr.toString);
185 assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;AF=0\tAF\t0.1\n");
186 
187 
188 assert(rr.getFormats["AF"].to!float[0][0] == 0.1f);
189 
190 rr.addInfo("AF",0.5);
191 assert(rr.getInfos["AF"].to!float == 0.5f);
192 
193 rr.addInfo("DP2",[0.5f, 0.4f]);
194 assert(rr.getInfos["DP2"].to!(float[]) == [0.5f, 0.4f]);
195 
196 rr.removeInfo("AF");
197 
198 rr.removeFormat("AF");
199 
200 rr.addFormat("CH", "test");
201 
202 
203 writeln(rr.toString);
204 writeln(rr.getFormats["CH"].to!string);
205 assert(rr.getFormats["CH"].to!string[0][0] == "test");
206 
207 assert(rr.toString == "20\t17330\t.\tT\tA\t3\t.\tNS=3;DP=11;DP2=0.5,0.4\tCH\ttest\n");
208 
209 assert(rr.coordinates == ZBHO(17329,17330));
210 vw.writeRecord(*r);
211 vw.writeRecord(vw.vcfhdr.hdr, r.line);
212 
213 
214 // Finally, print the records:
215 writefln("VCF records via toString:\n%s%s", (*r), (*rr));
216 
217 writeln("\ndhtslib.vcf: all tests passed\n");

Meta