/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.variantcontext;
import htsjdk.samtools.util.Lazy;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class VariantContextUtils {
private static Set<String> MISSING_KEYS_WARNED_ABOUT = new HashSet<String>();
/** Use a {@link Lazy} {@link JexlEngine} instance to avoid class-loading issues. (Applications that access this class are otherwise
* forced to build a {@link JexlEngine} instance, which depends on some apache logging libraries that mightn't be packaged.) */
final public static Lazy<JexlEngine> engine = new Lazy<JexlEngine>(new Lazy.LazyInitializer<JexlEngine>() {
@Override
public JexlEngine make() {
final JexlEngine jexl = new JexlEngine();
jexl.setSilent(false); // will throw errors now for selects that don't evaluate properly
jexl.setLenient(false);
jexl.setDebug(false);
return jexl;
}
});
private final static boolean ASSUME_MISSING_FIELDS_ARE_STRINGS = false;
/**
* Computes the alternate allele frequency at the provided {@link VariantContext} by dividing its "AN" by its "AC".
* @param vc The variant whose alternate allele frequency is computed
* @return The alternate allele frequency in [0, 1]
* @throws AssertionError When either annotation is missing, or when the compuated frequency is outside the expected range
*/
public static double calculateAltAlleleFrequency(final VariantContext vc) {
if (!vc.hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) || !vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY))
throw new AssertionError(String.format(
"Cannot compute the provided variant's alt allele frequency because it does not have both %s and %s annotations: %s",
VCFConstants.ALLELE_NUMBER_KEY,
VCFConstants.ALLELE_COUNT_KEY,
vc));
final double altAlleleCount = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
final double totalCount = vc.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0);
final double aaf = altAlleleCount / totalCount;
if (aaf > 1 || aaf < 0)
throw new AssertionError(String.format("Expected a minor allele frequency in the range [0, 1], but got %s. vc=%s", aaf, vc));
return aaf;
}
/**
* Update the attributes of the attributes map given the VariantContext to reflect the
* proper chromosome-based VCF tags
*
* @param vc the VariantContext
* @param attributes the attributes map to populate; must not be null; may contain old values
* @param removeStaleValues should we remove stale values from the mapping?
* @return the attributes map provided as input, returned for programming convenience
*/
public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes, boolean removeStaleValues) {
return calculateChromosomeCounts(vc, attributes, removeStaleValues, new HashSet<String>(0));
}
/**
* Update the attributes of the attributes map given the VariantContext to reflect the
* proper chromosome-based VCF tags
*
* @param vc the VariantContext
* @param attributes the attributes map to populate; must not be null; may contain old values
* @param removeStaleValues should we remove stale values from the mapping?
* @param founderIds - Set of founders Ids to take into account. AF and FC will be calculated over the founders.
* If empty or null, counts are generated for all samples as unrelated individuals
* @return the attributes map provided as input, returned for programming convenience
*/
public static Map<String, Object> calculateChromosomeCounts(VariantContext vc, Map<String, Object> attributes, boolean removeStaleValues, final Set<String> founderIds) {
final int AN = vc.getCalledChrCount();
// if everyone is a no-call, remove the old attributes if requested
if ( AN == 0 && removeStaleValues ) {
if ( attributes.containsKey(VCFConstants.ALLELE_COUNT_KEY) )
attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
if ( attributes.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) )
attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
if ( attributes.containsKey(VCFConstants.ALLELE_NUMBER_KEY) )
attributes.remove(VCFConstants.ALLELE_NUMBER_KEY);
return attributes;
}
if ( vc.hasGenotypes() ) {
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, AN);
// if there are alternate alleles, record the relevant tags
if ( vc.getAlternateAlleles().size() > 0 ) {
ArrayList<Double> alleleFreqs = new ArrayList<Double>();
ArrayList<Integer> alleleCounts = new ArrayList<Integer>();
ArrayList<Integer> foundersAlleleCounts = new ArrayList<Integer>();
double totalFoundersChromosomes = (double)vc.getCalledChrCount(founderIds);
int foundersAltChromosomes;
for ( Allele allele : vc.getAlternateAlleles() ) {
foundersAltChromosomes = vc.getCalledChrCount(allele,founderIds);
alleleCounts.add(vc.getCalledChrCount(allele));
foundersAlleleCounts.add(foundersAltChromosomes);
if ( AN == 0 ) {
alleleFreqs.add(0.0);
} else {
final Double freq = (double)foundersAltChromosomes / totalFoundersChromosomes;
alleleFreqs.add(freq);
}
}
attributes.put(VCFConstants.ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);
} else {
// if there's no alt AC and AF shouldn't be present
attributes.remove(VCFConstants.ALLELE_COUNT_KEY);
attributes.remove(VCFConstants.ALLELE_FREQUENCY_KEY);
}
}
return attributes;
}
/**
* Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
* chromosome-based VCF tags based on the current VC produced by builder.make()
*
* @param builder the VariantContextBuilder we are updating
* @param removeStaleValues should we remove stale values from the mapping?
*/
public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues) {
VariantContext vc = builder.make();
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, new HashSet<String>(0)));
}
/**
* Update the attributes of the attributes map in the VariantContextBuilder to reflect the proper
* chromosome-based VCF tags based on the current VC produced by builder.make()
*
* @param builder the VariantContextBuilder we are updating
* @param founderIds - Set of founders to take into account. AF and FC will be calculated over the founders only.
* If empty or null, counts are generated for all samples as unrelated individuals
* @param removeStaleValues should we remove stale values from the mapping?
*/
public static void calculateChromosomeCounts(VariantContextBuilder builder, boolean removeStaleValues, final Set<String> founderIds) {
VariantContext vc = builder.make();
builder.attributes(calculateChromosomeCounts(vc, new HashMap<String, Object>(vc.getAttributes()), removeStaleValues, founderIds));
}
public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) {
VCFCompoundHeaderLine metaData = header.getFormatHeaderLine(field);
if ( metaData == null ) metaData = header.getInfoHeaderLine(field);
if ( metaData == null ) {
if ( ASSUME_MISSING_FIELDS_ARE_STRINGS ) {
if ( ! MISSING_KEYS_WARNED_ABOUT.contains(field) ) {
MISSING_KEYS_WARNED_ABOUT.add(field);
if ( GeneralUtils.DEBUG_MODE_ENABLED )
System.err.println("Field " + field + " missing from VCF header, assuming it is an unbounded string type");
}
return new VCFInfoHeaderLine(field, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Auto-generated string header for " + field);
}
else
throw new TribbleException("Fully decoding VariantContext requires header line for all fields, but none was found for " + field);
}
return metaData;
}
/**
* A simple but common wrapper for matching VariantContext objects using JEXL expressions
*/
public static class JexlVCMatchExp {
public String name;
public Expression exp;
/**
* Create a new matcher expression with name and JEXL expression exp
* @param name name
* @param exp expression
*/
public JexlVCMatchExp(String name, Expression exp) {
this.name = name;
this.exp = exp;
}
}
/**
* Method for creating JexlVCMatchExp from input walker arguments names and exps. These two arrays contain
* the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
* a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to
* match() below.
*
* @param names names
* @param exps expressions
* @return list of matches
*/
public static List<JexlVCMatchExp> initializeMatchExps(String[] names, String[] exps) {
if ( names == null || exps == null )
throw new IllegalArgumentException("BUG: neither names nor exps can be null: names " + Arrays.toString(names) + " exps=" + Arrays.toString(exps) );
if ( names.length != exps.length )
throw new IllegalArgumentException("Inconsistent number of provided filter names and expressions: names=" + Arrays.toString(names) + " exps=" + Arrays.toString(exps));
Map<String, String> map = new HashMap<String, String>();
for ( int i = 0; i < names.length; i++ ) { map.put(names[i], exps[i]); }
return VariantContextUtils.initializeMatchExps(map);
}
public static List<JexlVCMatchExp> initializeMatchExps(ArrayList<String> names, ArrayList<String> exps) {
String[] nameArray = new String[names.size()];
String[] expArray = new String[exps.size()];
return initializeMatchExps(names.toArray(nameArray), exps.toArray(expArray));
}
/**
* Method for creating JexlVCMatchExp from input walker arguments mapping from names to exps. These two arrays contain
* the name associated with each JEXL expression. initializeMatchExps will parse each expression and return
* a list of JexlVCMatchExp, in order, that correspond to the names and exps. These are suitable input to
* match() below.
*
* @param names_and_exps mapping of names to expressions
* @return list of matches
*/
public static List<JexlVCMatchExp> initializeMatchExps(Map<String, String> names_and_exps) {
List<JexlVCMatchExp> exps = new ArrayList<JexlVCMatchExp>();
for ( Map.Entry<String, String> elt : names_and_exps.entrySet() ) {
String name = elt.getKey();
String expStr = elt.getValue();
if ( name == null || expStr == null ) throw new IllegalArgumentException("Cannot create null expressions : " + name + " " + expStr);
try {
Expression exp = engine.get().createExpression(expStr);
exps.add(new JexlVCMatchExp(name, exp));
} catch (Exception e) {
throw new IllegalArgumentException("Argument " + name + "has a bad value. Invalid expression used (" + expStr + "). Please see the JEXL docs for correct syntax.") ;
}
}
return exps;
}
/**
* Returns true if exp match VC. See collection<> version for full docs.
* @param vc variant context
* @param exp expression
* @return true if there is a match
*/
public static boolean match(VariantContext vc, JexlVCMatchExp exp) {
return match(vc,Arrays.asList(exp)).get(exp);
}
/**
* Matches each JexlVCMatchExp exp against the data contained in vc, and returns a map from these
* expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL
* expressions to VariantContext records. Use initializeMatchExps() to create the list of JexlVCMatchExp
* expressions.
*
* @param vc variant context
* @param exps expressions
* @return true if there is a match
*/
public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Collection<JexlVCMatchExp> exps) {
return new JEXLMap(exps,vc);
}
/**
* Returns true if exp match VC/g. See collection<> version for full docs.
* @param vc variant context
* @param g genotype
* @param exp expression
* @return true if there is a match
*/
public static boolean match(VariantContext vc, Genotype g, JexlVCMatchExp exp) {
return match(vc,g,Arrays.asList(exp)).get(exp);
}
/**
* Matches each JexlVCMatchExp exp against the data contained in vc/g, and returns a map from these
* expressions to true (if they matched) or false (if they didn't). This the best way to apply JEXL
* expressions to VariantContext records/genotypes. Use initializeMatchExps() to create the list of JexlVCMatchExp
* expressions.
*
* @param vc variant context
* @param g genotype
* @param exps expressions
* @return true if there is a match
*/
public static Map<JexlVCMatchExp, Boolean> match(VariantContext vc, Genotype g, Collection<JexlVCMatchExp> exps) {
return new JEXLMap(exps,vc,g);
}
/**
* Returns a newly allocated VC that is the same as VC, but without genotypes
* @param vc variant context
* @return new VC without genotypes
*/
public static VariantContext sitesOnlyVariantContext(VariantContext vc) {
return new VariantContextBuilder(vc).noGenotypes().make();
}
/**
* Returns a newly allocated list of VC, where each VC is the same as the input VCs, but without genotypes
* @param vcs collection of VCs
* @return new VCs without genotypes
*/
public static Collection<VariantContext> sitesOnlyVariantContexts(Collection<VariantContext> vcs) {
List<VariantContext> r = new ArrayList<VariantContext>();
for ( VariantContext vc : vcs )
r.add(sitesOnlyVariantContext(vc));
return r;
}
public static int getSize( VariantContext vc ) {
return vc.getEnd() - vc.getStart() + 1;
}
public static Set<String> genotypeNames(final Collection<Genotype> genotypes) {
final Set<String> names = new HashSet<String>(genotypes.size());
for ( final Genotype g : genotypes )
names.add(g.getSampleName());
return names;
}
/**
* Compute the end position for this VariantContext from the alleles themselves
*
* In the trivial case this is a single BP event and end = start (open intervals)
* In general the end is start + ref length - 1, handling the case where ref length == 0
* However, if alleles contains a symbolic allele then we use endForSymbolicAllele in all cases
*
* @param alleles the list of alleles to consider. The reference allele must be the first one
* @param start the known start position of this event
* @param endForSymbolicAlleles the end position to use if any of the alleles is symbolic. Can be -1
* if no is expected but will throw an error if one is found
* @return this builder
*/
public static int computeEndFromAlleles(final List<Allele> alleles, final int start, final int endForSymbolicAlleles) {
final Allele ref = alleles.get(0);
if ( ref.isNonReference() )
throw new IllegalStateException("computeEndFromAlleles requires first allele to be reference");
if ( VariantContext.hasSymbolicAlleles(alleles) ) {
if ( endForSymbolicAlleles == -1 )
throw new IllegalStateException("computeEndFromAlleles found a symbolic allele but endForSymbolicAlleles was provided");
return endForSymbolicAlleles;
} else {
return start + Math.max(ref.length() - 1, 0);
}
}
}