package htsjdk.variant.vcf;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.LazyGenotypesContext;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.IntGenotypeFieldAccessors;
import java.lang.reflect.Array;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
/**
* Functions specific to encoding VCF records.
*/
public class VCFEncoder {
/**
* The encoding used for VCF files: ISO-8859-1
*/
public static final Charset VCF_CHARSET = Charset.forName("ISO-8859-1");
private static final String QUAL_FORMAT_STRING = "%.2f";
private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00";
private final IntGenotypeFieldAccessors GENOTYPE_FIELD_ACCESSORS = new IntGenotypeFieldAccessors();
private VCFHeader header;
private boolean allowMissingFieldsInHeader = false;
private boolean outputTrailingFormatFields = false;
/**
* Prepare a VCFEncoder that will encode records appropriate to the given VCF header, optionally
* allowing missing fields in the header.
*/
public VCFEncoder(final VCFHeader header, final boolean allowMissingFieldsInHeader, final boolean outputTrailingFormatFields) {
if (header == null) throw new NullPointerException("The VCF header must not be null.");
this.header = header;
this.allowMissingFieldsInHeader = allowMissingFieldsInHeader;
this.outputTrailingFormatFields = outputTrailingFormatFields;
}
/**
* Please see the notes in the default constructor
*/
@Deprecated
public void setVCFHeader(final VCFHeader header) {
this.header = header;
}
/**
* Please see the notes in the default constructor
*/
@Deprecated
public void setAllowMissingFieldsInHeader(final boolean allow) {
this.allowMissingFieldsInHeader = allow;
}
public String encode(final VariantContext context) {
if (this.header == null) {
throw new NullPointerException("The header field must be set on the VCFEncoder before encoding records.");
}
final StringBuilder stringBuilder = new StringBuilder();
// CHROM
stringBuilder.append(context.getChr()).append(VCFConstants.FIELD_SEPARATOR);
// POS
stringBuilder.append(String.valueOf(context.getStart())).append(VCFConstants.FIELD_SEPARATOR);
// ID
stringBuilder.append(context.getID()).append(VCFConstants.FIELD_SEPARATOR);
// REF
stringBuilder.append(context.getReference().getDisplayString()).append(VCFConstants.FIELD_SEPARATOR);
// ALT
if ( context.isVariant() ) {
Allele altAllele = context.getAlternateAllele(0);
String alt = altAllele.getDisplayString();
stringBuilder.append(alt);
for (int i = 1; i < context.getAlternateAlleles().size(); i++) {
altAllele = context.getAlternateAllele(i);
alt = altAllele.getDisplayString();
stringBuilder.append(",");
stringBuilder.append(alt);
}
} else {
stringBuilder.append(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
}
stringBuilder.append(VCFConstants.FIELD_SEPARATOR);
// QUAL
if ( ! context.hasLog10PError()) stringBuilder.append(VCFConstants.MISSING_VALUE_v4);
else stringBuilder.append(formatQualValue(context.getPhredScaledQual()));
stringBuilder.append(VCFConstants.FIELD_SEPARATOR);
// FILTER
stringBuilder.append(getFilterString(context)).append(VCFConstants.FIELD_SEPARATOR);
// INFO
final Map<String, String> infoFields = new TreeMap<String, String>();
for (final Map.Entry<String, Object> field : context.getAttributes().entrySet() ) {
if ( ! this.header.hasInfoLine(field.getKey())) fieldIsMissingFromHeaderError(context, field.getKey(), "INFO");
final String outputValue = formatVCFField(field.getValue());
if (outputValue != null) infoFields.put(field.getKey(), outputValue);
}
writeInfoString(infoFields, stringBuilder);
// FORMAT
final GenotypesContext gc = context.getGenotypes();
if (gc.isLazyWithData() && ((LazyGenotypesContext) gc).getUnparsedGenotypeData() instanceof String) {
stringBuilder.append(VCFConstants.FIELD_SEPARATOR);
stringBuilder.append(((LazyGenotypesContext) gc).getUnparsedGenotypeData().toString());
} else {
final List<String> genotypeAttributeKeys = context.calcVCFGenotypeKeys(this.header);
if ( ! genotypeAttributeKeys.isEmpty()) {
for (final String format : genotypeAttributeKeys)
if ( ! this.header.hasFormatLine(format))
fieldIsMissingFromHeaderError(context, format, "FORMAT");
final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);
stringBuilder.append(VCFConstants.FIELD_SEPARATOR);
stringBuilder.append(genotypeFormatString);
final Map<Allele, String> alleleStrings = buildAlleleStrings(context);
addGenotypeData(context, alleleStrings, genotypeAttributeKeys, stringBuilder);
}
}
return stringBuilder.toString();
}
VCFHeader getVCFHeader() {
return this.header;
}
boolean getAllowMissingFieldsInHeader() {
return this.allowMissingFieldsInHeader;
}
private String getFilterString(final VariantContext vc) {
if (vc.isFiltered()) {
for (final String filter : vc.getFilters()) {
if ( ! this.header.hasFilterLine(filter)) fieldIsMissingFromHeaderError(vc, filter, "FILTER");
}
return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()));
}
else if (vc.filtersWereApplied()) return VCFConstants.PASSES_FILTERS_v4;
else return VCFConstants.UNFILTERED;
}
private String formatQualValue(final double qual) {
String s = String.format(QUAL_FORMAT_STRING, qual);
if ( s.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM) )
s = s.substring(0, s.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length());
return s;
}
private void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) {
if ( ! allowMissingFieldsInHeader)
throw new IllegalStateException("Key " + id + " found in VariantContext field " + field
+ " at " + vc.getChr() + ":" + vc.getStart()
+ " but this key isn't defined in the VCFHeader. We require all VCFs to have"
+ " complete VCF headers by default.");
}
String formatVCFField(final Object val) {
final String result;
if ( val == null )
result = VCFConstants.MISSING_VALUE_v4;
else if ( val instanceof Double )
result = formatVCFDouble((Double) val);
else if ( val instanceof Boolean )
result = (Boolean)val ? "" : null; // empty string for true, null for false
else if ( val instanceof List ) {
result = formatVCFField(((List)val).toArray());
} else if ( val.getClass().isArray() ) {
final int length = Array.getLength(val);
if ( length == 0 )
return formatVCFField(null);
final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0)));
for ( int i = 1; i < length; i++) {
sb.append(",");
sb.append(formatVCFField(Array.get(val, i)));
}
result = sb.toString();
} else
result = val.toString();
return result;
}
/**
* Takes a double value and pretty prints it to a String for display
*
* Large doubles => gets %.2f style formatting
* Doubles < 1 / 10 but > 1/100 </>=> get %.3f style formatting
* Double < 1/100 => %.3e formatting
* @param d
* @return
*/
public static String formatVCFDouble(final double d) {
final String format;
if ( d < 1 ) {
if ( d < 0.01 ) {
if ( Math.abs(d) >= 1e-20 )
format = "%.3e";
else {
// return a zero format
return "0.00";
}
} else {
format = "%.3f";
}
} else {
format = "%.2f";
}
return String.format(format, d);
}
static int countOccurrences(final char c, final String s) {
int count = 0;
for (int i = 0; i < s.length(); i++) {
count += s.charAt(i) == c ? 1 : 0;
}
return count;
}
static boolean isMissingValue(final String s) {
// we need to deal with the case that it's a list of missing values
return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());
}
/*
* Add the genotype data
*/
public void addGenotypeData(final VariantContext vc, final Map<Allele, String> alleleMap, final List<String> genotypeFormatKeys, final StringBuilder builder) {
final int ploidy = vc.getMaxPloidy(2);
for (final String sample : this.header.getGenotypeSamples()) {
builder.append(VCFConstants.FIELD_SEPARATOR);
Genotype g = vc.getGenotype(sample);
if (g == null) g = GenotypeBuilder.createMissing(sample, ploidy);
final List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
for (final String field : genotypeFormatKeys) {
if (field.equals(VCFConstants.GENOTYPE_KEY)) {
if ( ! g.isAvailable()) {
throw new IllegalStateException("GTs cannot be missing for some samples if they are available for others in the record");
}
writeAllele(g.getAllele(0), alleleMap, builder);
for (int i = 1; i < g.getPloidy(); i++) {
builder.append(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
writeAllele(g.getAllele(i), alleleMap, builder);
}
continue;
} else {
final String outputValue;
if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) {
outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4;
} else {
final IntGenotypeFieldAccessors.Accessor accessor = GENOTYPE_FIELD_ACCESSORS.getAccessor(field);
if ( accessor != null ) {
final int[] intValues = accessor.getValues(g);
if ( intValues == null )
outputValue = VCFConstants.MISSING_VALUE_v4;
else if ( intValues.length == 1 ) // fast path
outputValue = Integer.toString(intValues[0]);
else {
final StringBuilder sb = new StringBuilder();
sb.append(intValues[0]);
for ( int i = 1; i < intValues.length; i++) {
sb.append(",");
sb.append(intValues[i]);
}
outputValue = sb.toString();
}
} else {
Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;
final VCFFormatHeaderLine metaData = this.header.getFormatHeaderLine(field);
if ( metaData != null ) {
final int numInFormatField = metaData.getCount(vc);
if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
// If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
// For example, if Number=2, the string has to be ".,."
final StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
for ( int i = 1; i < numInFormatField; i++ ) {
sb.append(",");
sb.append(VCFConstants.MISSING_VALUE_v4);
}
val = sb.toString();
}
}
// assume that if key is absent, then the given string encoding suffices
outputValue = formatVCFField(val);
}
}
if ( outputValue != null )
attrs.add(outputValue);
}
}
// strip off trailing missing values
if (!outputTrailingFormatFields) {
for (int i = attrs.size() - 1; i >= 0; i--) {
if (isMissingValue(attrs.get(i))) attrs.remove(i);
else break;
}
}
for (int i = 0; i < attrs.size(); i++) {
if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY)) {
builder.append(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
}
builder.append(attrs.get(i));
}
}
}
/*
* Create the info string; assumes that no values are null
*/
private void writeInfoString(final Map<String, String> infoFields, final StringBuilder builder) {
if ( infoFields.isEmpty() ) {
builder.append(VCFConstants.EMPTY_INFO_FIELD);
return;
}
boolean isFirst = true;
for (final Map.Entry<String, String> entry : infoFields.entrySet()) {
if (isFirst) isFirst = false;
else builder.append(VCFConstants.INFO_FIELD_SEPARATOR);
builder.append(entry.getKey());
if ( ! entry.getValue().equals("")) {
final VCFInfoHeaderLine metaData = this.header.getInfoHeaderLine(entry.getKey());
if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) {
builder.append("=");
builder.append(entry.getValue());
}
}
}
}
public Map<Allele, String> buildAlleleStrings(final VariantContext vc) {
final Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size()+1);
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
final List<Allele> alleles = vc.getAlleles();
for ( int i = 0; i < alleles.size(); i++ ) {
alleleMap.put(alleles.get(i), String.valueOf(i));
}
return alleleMap;
}
private void writeAllele(final Allele allele, final Map<Allele, String> alleleMap, final StringBuilder builder) {
final String encoding = alleleMap.get(allele);
if ( encoding == null )
throw new RuntimeException("Allele " + allele + " is not an allele in the variant context");
builder.append(encoding);
}
}