/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.variantcontext;
import htsjdk.tribble.Feature;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.util.ParsingUtils;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Class VariantContext
*
* == High-level overview ==
*
* The VariantContext object is a single general class system for representing genetic variation data composed of:
*
* * Allele: representing single genetic haplotypes (A, T, ATC, -) (note that null alleles are used here for illustration; see the Allele class for how to represent indels)
* * Genotype: an assignment of alleles for each chromosome of a single named sample at a particular locus
* * VariantContext: an abstract class holding all segregating alleles at a locus as well as genotypes
* for multiple individuals containing alleles at that locus
*
* The class system works by defining segregating alleles, creating a variant context representing the segregating
* information at a locus, and potentially creating and associating genotypes with individuals in the context.
*
* All of the classes are highly validating -- call validate() if you modify them -- so you can rely on the
* self-consistency of the data once you have a VariantContext in hand. The system has a rich set of assessor
* and manipulator routines, as well as more complex static support routines in VariantContextUtils.
*
* The VariantContext (and Genotype) objects are attributed (supporting addition of arbitrary key/value pairs) and
* filtered (can represent a variation that is viewed as suspect).
*
* VariantContexts are dynamically typed, so whether a VariantContext is a SNP, Indel, or NoVariant depends
* on the properties of the alleles in the context. See the detailed documentation on the Type parameter below.
*
* It's also easy to create subcontexts based on selected genotypes.
*
* == Working with Variant Contexts ==
* By default, VariantContexts are immutable. In order to access (in the rare circumstances where you need them)
* setter routines, you need to create MutableVariantContexts and MutableGenotypes.
*
* === Some example data ===
*
* Allele A, Aref, T, Tref;
* Allele del, delRef, ATC, ATCref;
*
* A [ref] / T at 10
* GenomeLoc snpLoc = GenomeLocParser.createGenomeLoc("chr1", 10, 10);
*
* A / ATC [ref] from 20-23
* GenomeLoc delLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 22);
*
* // A [ref] / ATC immediately after 20
* GenomeLoc insLoc = GenomeLocParser.createGenomeLoc("chr1", 20, 20);
*
* === Alleles ===
*
* See the documentation in the Allele class itself
*
* What are they?
*
* Alleles can be either reference or non-reference
*
* Examples of alleles used here:
*
* A = new Allele("A");
* Aref = new Allele("A", true);
* T = new Allele("T");
* ATC = new Allele("ATC");
*
* === Creating variant contexts ===
*
* ==== By hand ====
*
* Here's an example of a A/T polymorphism with the A being reference:
*
* <pre>
* VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref, T));
* </pre>
*
* If you want to create a non-variant site, just put in a single reference allele
*
* <pre>
* VariantContext vc = new VariantContext(name, snpLoc, Arrays.asList(Aref));
* </pre>
*
* A deletion is just as easy:
*
* <pre>
* VariantContext vc = new VariantContext(name, delLoc, Arrays.asList(ATCref, del));
* </pre>
*
* The only thing that distinguishes between an insertion and deletion is which is the reference allele.
* An insertion has a reference allele that is smaller than the non-reference allele, and vice versa for deletions.
*
* <pre>
* VariantContext vc = new VariantContext("name", insLoc, Arrays.asList(delRef, ATC));
* </pre>
*
* ==== Converting rods and other data structures to VCs ====
*
* You can convert many common types into VariantContexts using the general function:
*
* <pre>
* VariantContextAdaptors.convertToVariantContext(name, myObject)
* </pre>
*
* dbSNP and VCFs, for example, can be passed in as myObject and a VariantContext corresponding to that
* object will be returned. A null return type indicates that the type isn't yet supported. This is the best
* and easiest way to create contexts using RODs.
*
*
* === Working with genotypes ===
*
* <pre>
* List<Allele> alleles = Arrays.asList(Aref, T);
* Genotype g1 = new Genotype(Arrays.asList(Aref, Aref), "g1", 10);
* Genotype g2 = new Genotype(Arrays.asList(Aref, T), "g2", 10);
* Genotype g3 = new Genotype(Arrays.asList(T, T), "g3", 10);
* VariantContext vc = new VariantContext(snpLoc, alleles, Arrays.asList(g1, g2, g3));
* </pre>
*
* At this point we have 3 genotypes in our context, g1-g3.
*
* You can assess a good deal of information about the genotypes through the VariantContext:
*
* <pre>
* vc.hasGenotypes()
* vc.isMonomorphicInSamples()
* vc.isPolymorphicInSamples()
* vc.getSamples().size()
*
* vc.getGenotypes()
* vc.getGenotypes().get("g1")
* vc.hasGenotype("g1")
*
* vc.getCalledChrCount()
* vc.getCalledChrCount(Aref)
* vc.getCalledChrCount(T)
* </pre>
*
* === NO_CALL alleles ===
*
* The system allows one to create Genotypes carrying special NO_CALL alleles that aren't present in the
* set of context alleles and that represent undetermined alleles in a genotype:
*
* Genotype g4 = new Genotype(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), "NO_DATA_FOR_SAMPLE", 10);
*
*
* === subcontexts ===
* It's also very easy get subcontext based only the data in a subset of the genotypes:
*
* <pre>
* VariantContext vc12 = vc.subContextFromGenotypes(Arrays.asList(g1,g2));
* VariantContext vc1 = vc.subContextFromGenotypes(Arrays.asList(g1));
* </pre>
*
* <s3>
* Fully decoding. Currently VariantContexts support some fields, particularly those
* stored as generic attributes, to be of any type. For example, a field AB might
* be naturally a floating point number, 0.51, but when it's read into a VC its
* not decoded into the Java presentation but left as a string "0.51". A fully
* decoded VariantContext is one where all values have been converted to their
* corresponding Java object types, based on the types declared in a VCFHeader.
*
* The fullyDecode() takes a header object and creates a new fully decoded VariantContext
* where all fields are converted to their true java representation. The VCBuilder
* can be told that all fields are fully decoded, in which case no work is done when
* asking for a fully decoded version of the VC.
* </s3>
*
* @author depristo
*/
public class VariantContext implements Feature { // to enable tribble integration
private final static boolean WARN_ABOUT_BAD_END = true;
private final static int MAX_ALLELE_SIZE_FOR_NON_SV = 150;
private boolean fullyDecoded = false;
protected CommonInfo commonInfo = null;
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
public final static Set<String> PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet<String>());
/** The location of this VariantContext */
final protected String contig;
final protected long start;
final protected long stop;
private final String ID;
/** The type (cached for performance reasons) of this context */
protected Type type = null;
/** A set of the alleles segregating in this context */
final protected List<Allele> alleles;
/** A mapping from sampleName -> genotype objects for all genotypes associated with this context */
protected GenotypesContext genotypes = null;
/** Counts for each of the possible Genotype types in this context */
protected int[] genotypeCounts = null;
public final static GenotypesContext NO_GENOTYPES = GenotypesContext.NO_GENOTYPES;
// a fast cached access point to the ref / alt alleles for biallelic case
private Allele REF = null;
// set to the alt allele when biallelic, otherwise == null
private Allele ALT = null;
/* cached monomorphic value: null -> not yet computed, False, True */
private Boolean monomorphic = null;
/*
* Determine which genotype fields are in use in the genotypes in VC
* @return an ordered list of genotype fields in use in VC. If vc has genotypes this will always include GT first
*/
public List<String> calcVCFGenotypeKeys(final VCFHeader header) {
final Set<String> keys = new HashSet<String>();
boolean sawGoodGT = false;
boolean sawGoodQual = false;
boolean sawGenotypeFilter = false;
boolean sawDP = false;
boolean sawAD = false;
boolean sawPL = false;
for (final Genotype g : this.getGenotypes()) {
keys.addAll(g.getExtendedAttributes().keySet());
if ( g.isAvailable() ) sawGoodGT = true;
if ( g.hasGQ() ) sawGoodQual = true;
if ( g.hasDP() ) sawDP = true;
if ( g.hasAD() ) sawAD = true;
if ( g.hasPL() ) sawPL = true;
if (g.isFiltered()) sawGenotypeFilter = true;
}
if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
if ( sawDP ) keys.add(VCFConstants.DEPTH_KEY);
if ( sawAD ) keys.add(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY);
if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY);
List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));
// make sure the GT is first
if (sawGoodGT) {
final List<String> newList = new ArrayList<String>(sortedList.size()+1);
newList.add(VCFConstants.GENOTYPE_KEY);
newList.addAll(sortedList);
sortedList = newList;
}
if (sortedList.isEmpty() && header.hasGenotypingData()) {
// this needs to be done in case all samples are no-calls
return Collections.singletonList(VCFConstants.GENOTYPE_KEY);
} else {
return sortedList;
}
}
// ---------------------------------------------------------------------------------------------------------
//
// validation mode
//
// ---------------------------------------------------------------------------------------------------------
public enum Validation {
ALLELES,
GENOTYPES
}
private final static EnumSet<Validation> NO_VALIDATION = EnumSet.noneOf(Validation.class);
// ---------------------------------------------------------------------------------------------------------
//
// constructors: see VariantContextBuilder
//
// ---------------------------------------------------------------------------------------------------------
/**
* Copy constructor
*
* @param other the VariantContext to copy
*/
protected VariantContext(VariantContext other) {
this(other.getSource(), other.getID(), other.getChr(), other.getStart(), other.getEnd(),
other.getAlleles(), other.getGenotypes(), other.getLog10PError(),
other.getFiltersMaybeNull(),
other.getAttributes(),
other.fullyDecoded, NO_VALIDATION);
}
/**
* the actual constructor. Private access only
*
* @param source source
* @param contig the contig
* @param start the start base (one based)
* @param stop the stop reference base (one based)
* @param alleles alleles
* @param genotypes genotypes map
* @param log10PError qual
* @param filters filters: use null for unfiltered and empty set for passes filters
* @param attributes attributes
* @param validationToPerform set of validation steps to take
*/
protected VariantContext(final String source,
final String ID,
final String contig,
final long start,
final long stop,
final Collection<Allele> alleles,
final GenotypesContext genotypes,
final double log10PError,
final Set<String> filters,
final Map<String, Object> attributes,
final boolean fullyDecoded,
final EnumSet<Validation> validationToPerform ) {
if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); }
this.contig = contig;
this.start = start;
this.stop = stop;
// intern for efficiency. equals calls will generate NPE if ID is inappropriately passed in as null
if ( ID == null || ID.equals("") ) throw new IllegalArgumentException("ID field cannot be the null or the empty string");
this.ID = ID.equals(VCFConstants.EMPTY_ID_FIELD) ? VCFConstants.EMPTY_ID_FIELD : ID;
this.commonInfo = new CommonInfo(source, log10PError, filters, attributes);
if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); }
// we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles
this.alleles = makeAlleles(alleles);
if ( genotypes == null || genotypes == NO_GENOTYPES ) {
this.genotypes = NO_GENOTYPES;
} else {
this.genotypes = genotypes.immutable();
}
// cache the REF and ALT alleles
int nAlleles = alleles.size();
for ( Allele a : alleles ) {
if ( a.isReference() ) {
REF = a;
} else if ( nAlleles == 2 ) { // only cache ALT when biallelic
ALT = a;
}
}
this.fullyDecoded = fullyDecoded;
if ( ! validationToPerform.isEmpty() ) {
validate(validationToPerform);
}
}
// ---------------------------------------------------------------------------------------------------------
//
// Selectors
//
// ---------------------------------------------------------------------------------------------------------
/**
* This method subsets down to a set of samples.
*
* At the same time returns the alleles to just those in use by the samples,
* if rederiveAllelesFromGenotypes is true, otherwise the full set of alleles
* in this VC is returned as the set of alleles in the subContext, even if
* some of those alleles aren't in the samples
*
* WARNING: BE CAREFUL WITH rederiveAllelesFromGenotypes UNLESS YOU KNOW WHAT YOU ARE DOING?
*
* @param sampleNames the sample names
* @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples, true should be default
* @return new VariantContext subsetting to just the given samples
*/
public VariantContext subContextFromSamples(Set<String> sampleNames, final boolean rederiveAllelesFromGenotypes ) {
if ( sampleNames.containsAll(getSampleNames()) && ! rederiveAllelesFromGenotypes ) {
return this; // fast path when you don't have any work to do
} else {
VariantContextBuilder builder = new VariantContextBuilder(this);
GenotypesContext newGenotypes = genotypes.subsetToSamples(sampleNames);
if ( rederiveAllelesFromGenotypes ) {
Set<Allele> allelesFromGenotypes = allelesOfGenotypes(newGenotypes);
// ensure original order of genotypes
List<Allele> rederivedAlleles = new ArrayList<Allele>(allelesFromGenotypes.size());
for (Allele allele : alleles)
if (allelesFromGenotypes.contains(allele))
rederivedAlleles.add(allele);
builder.alleles(rederivedAlleles);
}
else {
builder.alleles(alleles);
}
return builder.genotypes(newGenotypes).make();
}
}
/**
* @see #subContextFromSamples(java.util.Set, boolean) with rederiveAllelesFromGenotypes = true
*
* @param sampleNames
* @return
*/
public VariantContext subContextFromSamples(final Set<String> sampleNames) {
return subContextFromSamples(sampleNames, true);
}
public VariantContext subContextFromSample(String sampleName) {
return subContextFromSamples(Collections.singleton(sampleName));
}
/**
* helper routine for subcontext
* @param genotypes genotypes
* @return allele set
*/
private final Set<Allele> allelesOfGenotypes(Collection<Genotype> genotypes) {
final Set<Allele> alleles = new HashSet<Allele>();
boolean addedref = false;
for ( final Genotype g : genotypes ) {
for ( final Allele a : g.getAlleles() ) {
addedref = addedref || a.isReference();
if ( a.isCalled() )
alleles.add(a);
}
}
if ( ! addedref ) alleles.add(getReference());
return alleles;
}
// ---------------------------------------------------------------------------------------------------------
//
// type operations
//
// ---------------------------------------------------------------------------------------------------------
/**
* see: http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=handbook&part=ch5&rendertype=table&id=ch5.ch5_t3
*
* Format:
* dbSNP variation class
* Rules for assigning allele classes
* Sample allele definition
*
* Single Nucleotide Polymorphisms (SNPs)a
* Strictly defined as single base substitutions involving A, T, C, or G.
* A/T
*
* Deletion/Insertion Polymorphisms (DIPs)
* Designated using the full sequence of the insertion as one allele, and either a fully
* defined string for the variant allele or a '-' character to specify the deleted allele.
* This class will be assigned to a variation if the variation alleles are of different lengths or
* if one of the alleles is deleted ('-').
* T/-/CCTA/G
*
* No-variation
* Reports may be submitted for segments of sequence that are assayed and determined to be invariant
* in the sample.
* (NoVariation)
*
* Mixed
* Mix of other classes
*
* Also supports NO_VARIATION type, used to indicate that the site isn't polymorphic in the population
*
*
* Not currently supported:
*
* Heterozygous sequence
* The term heterozygous is used to specify a region detected by certain methods that do not
* resolve the polymorphism into a specific sequence motif. In these cases, a unique flanking
* sequence must be provided to define a sequence context for the variation.
* (heterozygous)
*
* Microsatellite or short tandem repeat (STR)
* Alleles are designated by providing the repeat motif and the copy number for each allele.
* Expansion of the allele repeat motif designated in dbSNP into full-length sequence will
* be only an approximation of the true genomic sequence because many microsatellite markers are
* not fully sequenced and are resolved as size variants only.
* (CAC)8/9/10/11
*
* Named variant
* Applies to insertion/deletion polymorphisms of longer sequence features, such as retroposon
* dimorphism for Alu or line elements. These variations frequently include a deletion '-' indicator
* for the absent allele.
* (alu) / -
*
* Multi-Nucleotide Polymorphism (MNP)
* Assigned to variations that are multi-base variations of a single, common length
* GGA/AGT
*/
public enum Type {
NO_VARIATION,
SNP,
MNP, // a multi-nucleotide polymorphism
INDEL,
SYMBOLIC,
MIXED,
}
/**
* Determines (if necessary) and returns the type of this variation by examining the alleles it contains.
*
* @return the type of this VariantContext
**/
public Type getType() {
if ( type == null )
determineType();
return type;
}
/**
* convenience method for SNPs
*
* @return true if this is a SNP, false otherwise
*/
public boolean isSNP() { return getType() == Type.SNP; }
/**
* convenience method for variants
*
* @return true if this is a variant allele, false if it's reference
*/
public boolean isVariant() { return getType() != Type.NO_VARIATION; }
/**
* convenience method for point events
*
* @return true if this is a SNP or ref site, false if it's an indel or mixed event
*/
public boolean isPointEvent() { return isSNP() || !isVariant(); }
/**
* convenience method for indels
*
* @return true if this is an indel, false otherwise
*/
public boolean isIndel() { return getType() == Type.INDEL; }
/**
* @return true if the alleles indicate a simple insertion (i.e., the reference allele is Null)
*/
public boolean isSimpleInsertion() {
// can't just call !isSimpleDeletion() because of complex indels
return isSimpleIndel() && getReference().length() == 1;
}
/**
* @return true if the alleles indicate a simple deletion (i.e., a single alt allele that is Null)
*/
public boolean isSimpleDeletion() {
// can't just call !isSimpleInsertion() because of complex indels
return isSimpleIndel() && getAlternateAllele(0).length() == 1;
}
/**
* @return true if the alleles indicate a simple indel, false otherwise.
*/
public boolean isSimpleIndel() {
return getType() == Type.INDEL // allelic lengths differ
&& isBiallelic() // exactly 2 alleles
&& getReference().length() > 0 // ref is not null or symbolic
&& getAlternateAllele(0).length() > 0 // alt is not null or symbolic
&& getReference().getBases()[0] == getAlternateAllele(0).getBases()[0] // leading bases match for both alleles
&& (getReference().length() == 1 || getAlternateAllele(0).length() == 1);
}
/**
* @return true if the alleles indicate neither a simple deletion nor a simple insertion
*/
public boolean isComplexIndel() {
return isIndel() && !isSimpleDeletion() && !isSimpleInsertion();
}
public boolean isSymbolic() {
return getType() == Type.SYMBOLIC;
}
public boolean isStructuralIndel() {
if ( getType() == Type.INDEL ) {
List<Integer> sizes = getIndelLengths();
if ( sizes != null ) {
for ( Integer length : sizes ) {
if ( length > MAX_ALLELE_SIZE_FOR_NON_SV ) {
return true;
}
}
}
}
return false;
}
/**
*
* @return true if the variant is symbolic or a large indel
*/
public boolean isSymbolicOrSV() {
return isSymbolic() || isStructuralIndel();
}
public boolean isMNP() {
return getType() == Type.MNP;
}
/**
* convenience method for indels
*
* @return true if this is an mixed variation, false otherwise
*/
public boolean isMixed() { return getType() == Type.MIXED; }
// ---------------------------------------------------------------------------------------------------------
//
// Generic accessors
//
// ---------------------------------------------------------------------------------------------------------
public boolean hasID() {
return getID() != VCFConstants.EMPTY_ID_FIELD;
}
public boolean emptyID() {
return ! hasID();
}
public String getID() {
return ID;
}
// ---------------------------------------------------------------------------------------------------------
//
// get routines to access context info fields
//
// ---------------------------------------------------------------------------------------------------------
public String getSource() { return commonInfo.getName(); }
public Set<String> getFiltersMaybeNull() { return commonInfo.getFiltersMaybeNull(); }
public Set<String> getFilters() { return commonInfo.getFilters(); }
public boolean isFiltered() { return commonInfo.isFiltered(); }
public boolean isNotFiltered() { return commonInfo.isNotFiltered(); }
public boolean filtersWereApplied() { return commonInfo.filtersWereApplied(); }
public boolean hasLog10PError() { return commonInfo.hasLog10PError(); }
public double getLog10PError() { return commonInfo.getLog10PError(); }
public double getPhredScaledQual() { return commonInfo.getPhredScaledQual(); }
public Map<String, Object> getAttributes() { return commonInfo.getAttributes(); }
public boolean hasAttribute(String key) { return commonInfo.hasAttribute(key); }
public Object getAttribute(String key) { return commonInfo.getAttribute(key); }
public Object getAttribute(String key, Object defaultValue) {
return commonInfo.getAttribute(key, defaultValue);
}
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
public CommonInfo getCommonInfo() {
return commonInfo;
}
// ---------------------------------------------------------------------------------------------------------
//
// Working with alleles
//
// ---------------------------------------------------------------------------------------------------------
/**
* @return the reference allele for this context
*/
public Allele getReference() {
Allele ref = REF;
if ( ref == null )
throw new IllegalStateException("BUG: no reference allele found at " + this);
return ref;
}
/**
* @return true if the context is strictly bi-allelic
*/
public boolean isBiallelic() {
return getNAlleles() == 2;
}
/**
* @return The number of segregating alleles in this context
*/
public int getNAlleles() {
return alleles.size();
}
/**
* Returns the maximum ploidy of all samples in this VC, or default if there are no genotypes
*
* This function is caching, so it's only expensive on the first call
*
* @param defaultPloidy the default ploidy, if all samples are no-called
* @return default, or the max ploidy
*/
public int getMaxPloidy(final int defaultPloidy) {
return genotypes.getMaxPloidy(defaultPloidy);
}
/**
* @return The allele sharing the same bases as this String. A convenience method; better to use byte[]
*/
public Allele getAllele(String allele) {
return getAllele(allele.getBytes());
}
/**
* @return The allele sharing the same bases as this byte[], or null if no such allele is present.
*/
public Allele getAllele(byte[] allele) {
return Allele.getMatchingAllele(getAlleles(), allele);
}
/**
* @return True if this context contains Allele allele, or false otherwise
*/
public boolean hasAllele(final Allele allele) {
return hasAllele(allele, false, true);
}
public boolean hasAllele(final Allele allele, final boolean ignoreRefState) {
return hasAllele(allele, ignoreRefState, true);
}
public boolean hasAlternateAllele(final Allele allele) {
return hasAllele(allele, false, false);
}
public boolean hasAlternateAllele(final Allele allele, final boolean ignoreRefState) {
return hasAllele(allele, ignoreRefState, false);
}
private boolean hasAllele(final Allele allele, final boolean ignoreRefState, final boolean considerRefAllele) {
if ( (considerRefAllele && allele == REF) || allele == ALT ) // optimization for cached cases
return true;
final List<Allele> allelesToConsider = considerRefAllele ? getAlleles() : getAlternateAlleles();
for ( Allele a : allelesToConsider ) {
if ( a.equals(allele, ignoreRefState) )
return true;
}
return false;
}
/**
* Gets the alleles. This method should return all of the alleles present at the location,
* including the reference allele. There are no constraints imposed on the ordering of alleles
* in the set. If the reference is not an allele in this context it will not be included.
*
* @return the set of alleles
*/
public List<Allele> getAlleles() { return alleles; }
/**
* Gets the alternate alleles. This method should return all the alleles present at the location,
* NOT including the reference allele. There are no constraints imposed on the ordering of alleles
* in the set.
*
* @return the set of alternate alleles
*/
public List<Allele> getAlternateAlleles() {
return alleles.subList(1, alleles.size());
}
/**
* Gets the sizes of the alternate alleles if they are insertion/deletion events, and returns a list of their sizes
*
* @return a list of indel lengths ( null if not of type indel or mixed )
*/
public List<Integer> getIndelLengths() {
if ( getType() != Type.INDEL && getType() != Type.MIXED ) {
return null;
}
List<Integer> lengths = new ArrayList<Integer>();
for ( Allele a : getAlternateAlleles() ) {
lengths.add(a.length() - getReference().length());
}
return lengths;
}
/**
* @param i -- the ith allele (from 0 to n - 2 for a context with n alleles including a reference allele)
* @return the ith non-reference allele in this context
* @throws IllegalArgumentException if i is invalid
*/
public Allele getAlternateAllele(int i) {
return alleles.get(i+1);
}
/**
* @param other VariantContext whose alleles to compare against
* @return true if this VariantContext has the same alleles (both ref and alts) as other,
* regardless of ordering. Otherwise returns false.
*/
public boolean hasSameAllelesAs ( final VariantContext other ) {
return hasSameAlternateAllelesAs(other) && other.getReference().equals(getReference(), false);
}
/**
* @param other VariantContext whose alternate alleles to compare against
* @return true if this VariantContext has the same alternate alleles as other,
* regardless of ordering. Otherwise returns false.
*/
public boolean hasSameAlternateAllelesAs ( final VariantContext other ) {
List<Allele> thisAlternateAlleles = getAlternateAlleles();
List<Allele> otherAlternateAlleles = other.getAlternateAlleles();
if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) {
return false;
}
for ( Allele allele : thisAlternateAlleles ) {
if ( ! otherAlternateAlleles.contains(allele) ) {
return false;
}
}
return true;
}
// ---------------------------------------------------------------------------------------------------------
//
// Working with genotypes
//
// ---------------------------------------------------------------------------------------------------------
/**
* @return the number of samples in the context
*/
public int getNSamples() {
return genotypes.size();
}
/**
* @return true if the context has associated genotypes
*/
public boolean hasGenotypes() {
return ! genotypes.isEmpty();
}
public boolean hasGenotypes(Collection<String> sampleNames) {
return genotypes.containsSamples(sampleNames);
}
/**
* @return set of all Genotypes associated with this context
*/
public GenotypesContext getGenotypes() {
return genotypes;
}
public Iterable<Genotype> getGenotypesOrderedByName() {
return genotypes.iterateInSampleNameOrder();
}
public Iterable<Genotype> getGenotypesOrderedBy(Iterable<String> sampleOrdering) {
return genotypes.iterateInSampleNameOrder(sampleOrdering);
}
/**
* Returns a map from sampleName -> Genotype for the genotype associated with sampleName. Returns a map
* for consistency with the multi-get function.
*
* @param sampleName the sample name
* @return mapping from sample name to genotype
* @throws IllegalArgumentException if sampleName isn't bound to a genotype
*/
public GenotypesContext getGenotypes(String sampleName) {
return getGenotypes(Collections.singleton(sampleName));
}
/**
* Returns a map from sampleName -> Genotype for each sampleName in sampleNames. Returns a map
* for consistency with the multi-get function.
*
* For testing convenience only
*
* @param sampleNames a unique list of sample names
* @return subsetting genotypes context
* @throws IllegalArgumentException if sampleName isn't bound to a genotype
*/
protected GenotypesContext getGenotypes(Collection<String> sampleNames) {
return getGenotypes().subsetToSamples(new HashSet<String>(sampleNames));
}
public GenotypesContext getGenotypes(Set<String> sampleNames) {
return getGenotypes().subsetToSamples(sampleNames);
}
/**
* @return the set of all sample names in this context, not ordered
*/
public Set<String> getSampleNames() {
return getGenotypes().getSampleNames();
}
public List<String> getSampleNamesOrderedByName() {
return getGenotypes().getSampleNamesOrderedByName();
}
/**
* @param sample the sample name
*
* @return the Genotype associated with the given sample in this context or null if the sample is not in this context
*/
public Genotype getGenotype(String sample) {
return getGenotypes().get(sample);
}
public boolean hasGenotype(String sample) {
return getGenotypes().containsSample(sample);
}
public Genotype getGenotype(int ith) {
return genotypes.get(ith);
}
/**
* Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS)
*
* @return chromosome count
*/
public int getCalledChrCount() {
final Set<String> noSamples = Collections.emptySet();
return getCalledChrCount(noSamples);
}
/**
* Returns the number of chromosomes carrying any allele in the genotypes (i.e., excluding NO_CALLS)
*
* @param sampleIds IDs of samples to take into account. If empty then all samples are included.
* @return chromosome count
*/
public int getCalledChrCount(Set<String> sampleIds) {
int n = 0;
GenotypesContext genotypes = sampleIds.isEmpty() ? getGenotypes() : getGenotypes(sampleIds);
for ( final Genotype g : genotypes) {
for ( final Allele a : g.getAlleles() )
n += a.isNoCall() ? 0 : 1;
}
return n;
}
/**
* Returns the number of chromosomes carrying allele A in the genotypes
*
* @param a allele
* @return chromosome count
*/
public int getCalledChrCount(Allele a) {
return getCalledChrCount(a,new HashSet<String>(0));
}
/**
* Returns the number of chromosomes carrying allele A in the genotypes
*
* @param a allele
* @param sampleIds - IDs of samples to take into account. If empty then all samples are included.
* @return chromosome count
*/
public int getCalledChrCount(Allele a, Set<String> sampleIds) {
int n = 0;
GenotypesContext genotypes = sampleIds.isEmpty() ? getGenotypes() : getGenotypes(sampleIds);
for ( final Genotype g : genotypes ) {
n += g.countAllele(a);
}
return n;
}
/**
* Genotype-specific functions -- are the genotypes monomorphic w.r.t. to the alleles segregating at this
* site? That is, is the number of alternate alleles among all fo the genotype == 0?
*
* @return true if it's monomorphic
*/
public boolean isMonomorphicInSamples() {
if ( monomorphic == null )
monomorphic = ! isVariant() || (hasGenotypes() && getCalledChrCount(getReference()) == getCalledChrCount());
return monomorphic;
}
/**
* Genotype-specific functions -- are the genotypes polymorphic w.r.t. to the alleles segregating at this
* site? That is, is the number of alternate alleles among all fo the genotype > 0?
*
* @return true if it's polymorphic
*/
public boolean isPolymorphicInSamples() {
return ! isMonomorphicInSamples();
}
private void calculateGenotypeCounts() {
if ( genotypeCounts == null ) {
genotypeCounts = new int[GenotypeType.values().length];
for ( final Genotype g : getGenotypes() ) {
genotypeCounts[g.getType().ordinal()]++;
}
}
}
/**
* Genotype-specific functions -- how many no-calls are there in the genotypes?
*
* @return number of no calls
*/
public int getNoCallCount() {
calculateGenotypeCounts();
return genotypeCounts[GenotypeType.NO_CALL.ordinal()];
}
/**
* Genotype-specific functions -- how many hom ref calls are there in the genotypes?
*
* @return number of hom ref calls
*/
public int getHomRefCount() {
calculateGenotypeCounts();
return genotypeCounts[GenotypeType.HOM_REF.ordinal()];
}
/**
* Genotype-specific functions -- how many het calls are there in the genotypes?
*
* @return number of het calls
*/
public int getHetCount() {
calculateGenotypeCounts();
return genotypeCounts[GenotypeType.HET.ordinal()];
}
/**
* Genotype-specific functions -- how many hom var calls are there in the genotypes?
*
* @return number of hom var calls
*/
public int getHomVarCount() {
calculateGenotypeCounts();
return genotypeCounts[GenotypeType.HOM_VAR.ordinal()];
}
/**
* Genotype-specific functions -- how many mixed calls are there in the genotypes?
*
* @return number of mixed calls
*/
public int getMixedCount() {
calculateGenotypeCounts();
return genotypeCounts[GenotypeType.MIXED.ordinal()];
}
// ---------------------------------------------------------------------------------------------------------
//
// validation: extra-strict validation routines for paranoid users
//
// ---------------------------------------------------------------------------------------------------------
/**
* Run all extra-strict validation tests on a Variant Context object
*
* @param reportedReference the reported reference allele
* @param observedReference the actual reference allele
* @param rsIDs the true dbSNP IDs
*/
public void extraStrictValidation(final Allele reportedReference, final Allele observedReference, final Set<String> rsIDs) {
// validate the reference
validateReferenceBases(reportedReference, observedReference);
// validate the RS IDs
validateRSIDs(rsIDs);
// validate the altenate alleles
validateAlternateAlleles();
// validate the AN and AC fields
validateChromosomeCounts();
// TODO: implement me
//checkReferenceTrack();
}
public void validateReferenceBases(final Allele reportedReference, final Allele observedReference) {
if ( reportedReference != null && !reportedReference.basesMatch(observedReference) ) {
throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), observedReference.getBaseString(), reportedReference.getBaseString()));
}
}
public void validateRSIDs(Set<String> rsIDs) {
if ( rsIDs != null && hasID() ) {
for ( String id : getID().split(VCFConstants.ID_FIELD_SEPARATOR) ) {
if ( id.startsWith("rs") && !rsIDs.contains(id) )
throw new TribbleException.InternalCodecException(String.format("the rsID %s for the record at position %s:%d is not in dbSNP", id, getChr(), getStart()));
}
}
}
public void validateAlternateAlleles() {
if ( !hasGenotypes() )
return;
// maintain a list of non-symbolic alleles reported in the REF and ALT fields of the record
// (we exclude symbolic alleles because it's commonly expected that they don't show up in the genotypes, e.g. with GATK gVCFs)
final List<Allele> reportedAlleles = new ArrayList<Allele>();
for ( final Allele allele : getAlleles() ) {
if ( !allele.isSymbolic() )
reportedAlleles.add(allele);
}
// maintain a list of non-symbolic alleles observed in the genotypes
final Set<Allele> observedAlleles = new HashSet<Allele>();
observedAlleles.add(getReference());
for ( final Genotype g : getGenotypes() ) {
if ( g.isCalled() ) {
for ( final Allele allele : g.getAlleles() ) {
if ( !allele.isSymbolic() )
observedAlleles.add(allele);
}
}
}
if ( observedAlleles.contains(Allele.NO_CALL) )
observedAlleles.remove(Allele.NO_CALL);
if ( reportedAlleles.size() != observedAlleles.size() )
throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart()));
int originalSize = reportedAlleles.size();
// take the intersection and see if things change
observedAlleles.retainAll(reportedAlleles);
if ( observedAlleles.size() != originalSize )
throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart()));
}
public void validateChromosomeCounts() {
if ( !hasGenotypes() )
return;
// AN
if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) {
int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString());
int observedAN = getCalledChrCount();
if ( reportedAN != observedAN )
throw new TribbleException.InternalCodecException(String.format("the Allele Number (AN) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedAN, observedAN));
}
// AC
if ( hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) {
ArrayList<Integer> observedACs = new ArrayList<Integer>();
// if there are alternate alleles, record the relevant tags
if ( getAlternateAlleles().size() > 0 ) {
for ( Allele allele : getAlternateAlleles() ) {
observedACs.add(getCalledChrCount(allele));
}
}
else { // otherwise, set them to 0
observedACs.add(0);
}
if ( getAttribute(VCFConstants.ALLELE_COUNT_KEY) instanceof List ) {
final List reportedACs = (List)getAttribute(VCFConstants.ALLELE_COUNT_KEY);
if ( observedACs.size() != reportedACs.size() )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.size(), observedACs.size()));
for (int i = 0; i < observedACs.size(); i++) {
// need to cast to int to make sure we don't have an issue below with object equals (earlier bug) - EB
final int reportedAC = Integer.valueOf(reportedACs.get(i).toString());
if ( reportedAC != observedACs.get(i) )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getChr(), getStart(), reportedAC, observedACs.get(i)));
}
} else {
if ( observedACs.size() != 1 )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have enough values for the record at position %s:%d", getChr(), getStart()));
int reportedAC = Integer.valueOf(getAttribute(VCFConstants.ALLELE_COUNT_KEY).toString());
if ( reportedAC != observedACs.get(0) )
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedAC, observedACs.get(0)));
}
}
}
// ---------------------------------------------------------------------------------------------------------
//
// validation: the normal validation routines are called automatically upon creation of the VC
//
// ---------------------------------------------------------------------------------------------------------
private boolean validate(final EnumSet<Validation> validationToPerform) {
validateStop();
for (final Validation val : validationToPerform ) {
switch (val) {
case ALLELES: validateAlleles(); break;
case GENOTYPES: validateGenotypes(); break;
default: throw new IllegalArgumentException("Unexpected validation mode " + val);
}
}
return true;
}
/**
* Check that getEnd() == END from the info field, if it's present
*/
private void validateStop() {
if ( hasAttribute(VCFConstants.END_KEY) ) {
final int end = getAttributeAsInt(VCFConstants.END_KEY, -1);
assert end != -1;
if ( end != getEnd() ) {
final String message = "Badly formed variant context at location " + getChr() + ":"
+ getStart() + "; getEnd() was " + getEnd()
+ " but this VariantContext contains an END key with value " + end;
if ( GeneralUtils.DEBUG_MODE_ENABLED && WARN_ABOUT_BAD_END ) {
System.err.println(message);
}
else {
throw new TribbleException(message);
}
}
} else {
final long length = (stop - start) + 1;
if ( ! hasSymbolicAlleles() && length != getReference().length() ) {
throw new IllegalStateException("BUG: GenomeLoc " + contig + ":" + start + "-" + stop + " has a size == " + length + " but the variation reference allele has length " + getReference().length() + " this = " + this);
}
}
}
private void validateAlleles() {
boolean alreadySeenRef = false;
for ( final Allele allele : alleles ) {
// make sure there's only one reference allele
if ( allele.isReference() ) {
if ( alreadySeenRef ) throw new IllegalArgumentException("BUG: Received two reference tagged alleles in VariantContext " + alleles + " this=" + this);
alreadySeenRef = true;
}
if ( allele.isNoCall() ) {
throw new IllegalArgumentException("BUG: Cannot add a no call allele to a variant context " + alleles + " this=" + this);
}
}
// make sure there's one reference allele
if ( ! alreadySeenRef )
throw new IllegalArgumentException("No reference allele found in VariantContext");
}
private void validateGenotypes() {
if ( this.genotypes == null ) throw new IllegalStateException("Genotypes is null");
for ( final Genotype g : this.genotypes ) {
if ( g.isAvailable() ) {
for ( Allele gAllele : g.getAlleles() ) {
if ( ! hasAllele(gAllele) && gAllele.isCalled() )
throw new IllegalStateException("Allele in genotype " + gAllele + " not in the variant context " + alleles);
}
}
}
}
// ---------------------------------------------------------------------------------------------------------
//
// utility routines
//
// ---------------------------------------------------------------------------------------------------------
private void determineType() {
if ( type == null ) {
switch ( getNAlleles() ) {
case 0:
throw new IllegalStateException("Unexpected error: requested type of VariantContext with no alleles!" + this);
case 1:
// note that this doesn't require a reference allele. You can be monomorphic independent of having a
// reference allele
type = Type.NO_VARIATION;
break;
default:
determinePolymorphicType();
}
}
}
private void determinePolymorphicType() {
type = null;
// do a pairwise comparison of all alleles against the reference allele
for ( Allele allele : alleles ) {
if ( allele == REF )
continue;
// find the type of this allele relative to the reference
Type biallelicType = typeOfBiallelicVariant(REF, allele);
// for the first alternate allele, set the type to be that one
if ( type == null ) {
type = biallelicType;
}
// if the type of this allele is different from that of a previous one, assign it the MIXED type and quit
else if ( biallelicType != type ) {
type = Type.MIXED;
return;
}
}
}
private static Type typeOfBiallelicVariant(Allele ref, Allele allele) {
if ( ref.isSymbolic() )
throw new IllegalStateException("Unexpected error: encountered a record with a symbolic reference allele");
if ( allele.isSymbolic() )
return Type.SYMBOLIC;
if ( ref.length() == allele.length() ) {
if ( allele.length() == 1 )
return Type.SNP;
else
return Type.MNP;
}
// Important note: previously we were checking that one allele is the prefix of the other. However, that's not an
// appropriate check as can be seen from the following example:
// REF = CTTA and ALT = C,CT,CA
// This should be assigned the INDEL type but was being marked as a MIXED type because of the prefix check.
// In truth, it should be absolutely impossible to return a MIXED type from this method because it simply
// performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type
// is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point
// in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL.
return Type.INDEL;
// old incorrect logic:
// if (oneIsPrefixOfOther(ref, allele))
// return Type.INDEL;
// else
// return Type.MIXED;
}
public String toString() {
// Note: passing genotypes to String.format() will implicitly decode the genotypes
// This may not be desirable, so don't decode by default
return genotypes.isLazyWithData() ? toStringUnparsedGenotypes() : toStringDecodeGenotypes();
}
public String toStringDecodeGenotypes() {
return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s GT=%s",
getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop),
hasLog10PError() ? String.format("%.2f", getPhredScaledQual()) : ".",
this.getType(),
ParsingUtils.sortList(this.getAlleles()),
ParsingUtils.sortedString(this.getAttributes()),
this.getGenotypes());
}
private String toStringUnparsedGenotypes() {
return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s GT=%s",
getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop),
hasLog10PError() ? String.format("%.2f", getPhredScaledQual()) : ".",
this.getType(),
ParsingUtils.sortList(this.getAlleles()),
ParsingUtils.sortedString(this.getAttributes()),
((LazyGenotypesContext)this.genotypes).getUnparsedGenotypeData());
}
public String toStringWithoutGenotypes() {
return String.format("[VC %s @ %s Q%s of type=%s alleles=%s attr=%s",
getSource(), contig + ":" + (start - stop == 0 ? start : start + "-" + stop),
hasLog10PError() ? String.format("%.2f", getPhredScaledQual()) : ".",
this.getType(),
ParsingUtils.sortList(this.getAlleles()),
ParsingUtils.sortedString(this.getAttributes()));
}
// protected basic manipulation routines
private static List<Allele> makeAlleles(Collection<Allele> alleles) {
final List<Allele> alleleList = new ArrayList<Allele>(alleles.size());
boolean sawRef = false;
for ( final Allele a : alleles ) {
for ( final Allele b : alleleList ) {
if ( a.equals(b, true) )
throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a);
}
// deal with the case where the first allele isn't the reference
if ( a.isReference() ) {
if ( sawRef )
throw new IllegalArgumentException("Alleles for a VariantContext must contain at most one reference allele: " + alleles);
alleleList.add(0, a);
sawRef = true;
}
else
alleleList.add(a);
}
if ( alleleList.isEmpty() )
throw new IllegalArgumentException("Cannot create a VariantContext with an empty allele list");
if ( alleleList.get(0).isNonReference() )
throw new IllegalArgumentException("Alleles for a VariantContext must contain at least one reference allele: " + alleles);
return alleleList;
}
// ---------------------------------------------------------------------------------------------------------
//
// Fully decode
//
// ---------------------------------------------------------------------------------------------------------
/**
* Return a VC equivalent to this one but where all fields are fully decoded
*
* See VariantContext document about fully decoded
*
* @param header containing types about all fields in this VC
* @return a fully decoded version of this VC
*/
public VariantContext fullyDecode(final VCFHeader header, final boolean lenientDecoding) {
if ( isFullyDecoded() )
return this;
else {
// TODO -- warning this is potentially very expensive as it creates copies over and over
final VariantContextBuilder builder = new VariantContextBuilder(this);
fullyDecodeInfo(builder, header, lenientDecoding);
fullyDecodeGenotypes(builder, header);
builder.fullyDecoded(true);
return builder.make();
}
}
/**
* See VariantContext document about fully decoded
* @return true if this is a fully decoded VC
*/
public boolean isFullyDecoded() {
return fullyDecoded;
}
private final void fullyDecodeInfo(final VariantContextBuilder builder, final VCFHeader header, final boolean lenientDecoding) {
builder.attributes(fullyDecodeAttributes(getAttributes(), header, lenientDecoding));
}
private final Map<String, Object> fullyDecodeAttributes(final Map<String, Object> attributes,
final VCFHeader header,
final boolean lenientDecoding) {
final Map<String, Object> newAttributes = new HashMap<String, Object>(10);
for ( final Map.Entry<String, Object> attr : attributes.entrySet() ) {
final String field = attr.getKey();
if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY) )
continue; // gross, FT is part of the extended attributes
final VCFCompoundHeaderLine format = VariantContextUtils.getMetaDataForField(header, field);
final Object decoded = decodeValue(field, attr.getValue(), format);
if ( decoded != null &&
! lenientDecoding
&& format.getCountType() != VCFHeaderLineCount.UNBOUNDED
&& format.getType() != VCFHeaderLineType.Flag ) { // we expect exactly the right number of elements
final int obsSize = decoded instanceof List ? ((List) decoded).size() : 1;
final int expSize = format.getCount(this);
if ( obsSize != expSize ) {
throw new TribbleException.InvalidHeader("Discordant field size detected for field " +
field + " at " + getChr() + ":" + getStart() + ". Field had " + obsSize + " values " +
"but the header says this should have " + expSize + " values based on header record " +
format);
}
}
newAttributes.put(field, decoded);
}
return newAttributes;
}
private final Object decodeValue(final String field, final Object value, final VCFCompoundHeaderLine format) {
if ( value instanceof String ) {
if ( field.equals(VCFConstants.GENOTYPE_PL_KEY) )
return GenotypeLikelihoods.fromPLField((String)value);
final String string = (String)value;
if ( string.indexOf(",") != -1 ) {
final String[] splits = string.split(",");
final List<Object> values = new ArrayList<Object>(splits.length);
for ( int i = 0; i < splits.length; i++ )
values.add(decodeOne(field, splits[i], format));
return values;
} else {
return decodeOne(field, string, format);
}
} else if ( value instanceof List && (((List) value).get(0)) instanceof String ) {
final List<String> asList = (List<String>)value;
final List<Object> values = new ArrayList<Object>(asList.size());
for ( final String s : asList )
values.add(decodeOne(field, s, format));
return values;
} else {
return value;
}
// allowMissingValuesComparedToHeader
}
private final Object decodeOne(final String field, final String string, final VCFCompoundHeaderLine format) {
try {
if ( string.equals(VCFConstants.MISSING_VALUE_v4) )
return null;
else {
switch ( format.getType() ) {
case Character: return string;
case Flag:
final boolean b = Boolean.valueOf(string) || string.equals("1");
if ( b == false )
throw new TribbleException("VariantContext FLAG fields " + field + " cannot contain false values"
+ " as seen at " + getChr() + ":" + getStart());
return b;
case String: return string;
case Integer: return Integer.valueOf(string);
case Float: return Double.valueOf(string);
default: throw new TribbleException("Unexpected type for field" + field);
}
}
} catch (NumberFormatException e) {
throw new TribbleException("Could not decode field " + field + " with value " + string + " of declared type " + format.getType());
}
}
private final void fullyDecodeGenotypes(final VariantContextBuilder builder, final VCFHeader header) {
final GenotypesContext gc = new GenotypesContext();
for ( final Genotype g : getGenotypes() ) {
gc.add(fullyDecodeGenotypes(g, header));
}
builder.genotypesNoValidation(gc);
}
private final Genotype fullyDecodeGenotypes(final Genotype g, final VCFHeader header) {
final Map<String, Object> map = fullyDecodeAttributes(g.getExtendedAttributes(), header, true);
return new GenotypeBuilder(g).attributes(map).make();
}
// ---------------------------------------------------------------------------------------------------------
//
// tribble integration routines -- not for public consumption
//
// ---------------------------------------------------------------------------------------------------------
public String getChr() {
return contig;
}
public int getStart() {
return (int)start;
}
public int getEnd() {
return (int)stop;
}
public boolean hasSymbolicAlleles() {
return hasSymbolicAlleles(getAlleles());
}
public static boolean hasSymbolicAlleles( final List<Allele> alleles ) {
for ( final Allele a: alleles ) {
if (a.isSymbolic()) {
return true;
}
}
return false;
}
public Allele getAltAlleleWithHighestAlleleCount() {
// optimization: for bi-allelic sites, just return the 1only alt allele
if ( isBiallelic() )
return getAlternateAllele(0);
Allele best = null;
int maxAC1 = 0;
for ( Allele a : getAlternateAlleles() ) {
final int ac = getCalledChrCount(a);
if ( ac >= maxAC1 ) {
maxAC1 = ac;
best = a;
}
}
return best;
}
/**
* Lookup the index of allele in this variant context
*
* @param allele the allele whose index we want to get
* @return the index of the allele into getAlleles(), or -1 if it cannot be found
*/
public int getAlleleIndex(final Allele allele) {
return getAlleles().indexOf(allele);
}
/**
* Return the allele index #getAlleleIndex for each allele in alleles
*
* @param alleles the alleles we want to look up
* @return a list of indices for each allele, in order
*/
public List<Integer> getAlleleIndices(final Collection<Allele> alleles) {
final List<Integer> indices = new LinkedList<Integer>();
for ( final Allele allele : alleles )
indices.add(getAlleleIndex(allele));
return indices;
}
public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) {
final int index = getAlleleIndex(targetAllele);
if ( index == -1 ) throw new IllegalArgumentException("Allele " + targetAllele + " not in this VariantContex " + this);
return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index);
}
}