/*
* Copyright (c) 2012 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package htsjdk.variant.variantcontext;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.utils.GeneralUtils;
import htsjdk.variant.vcf.VCFConstants;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
public class GenotypeLikelihoods {
private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5;
private final static int NUM_LIKELIHOODS_CACHE_PLOIDY = 10;
// caching numAlleles up to 5 and ploidy up to 10
private final static int[][] numLikelihoodCache = new int[NUM_LIKELIHOODS_CACHE_N_ALLELES][NUM_LIKELIHOODS_CACHE_PLOIDY];
public final static int MAX_PL = Integer.MAX_VALUE;
//
// There are two objects here because we are lazy in creating both representations
// for this object: a vector of log10 Probs and the PL phred-scaled string. Supports
// having one set during initializating, and dynamic creation of the other, if needed
//
private double[] log10Likelihoods = null;
private String likelihoodsAsString_PLs = null;
/**
* initialize num likelihoods cache
*/
static {
// must be done before PLIndexToAlleleIndex
for ( int numAlleles = 1; numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES; numAlleles++ ) {
for ( int ploidy = 1; ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY; ploidy++ ) {
numLikelihoodCache[numAlleles][ploidy] = calcNumLikelihoods(numAlleles, ploidy);
}
}
}
/**
* The maximum number of alleles that we can represent as genotype likelihoods
*/
public final static int MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED = 50;
/*
* a cache of the PL index to the 2 alleles it represents over all possible numbers of alternate alleles
*/
private final static GenotypeLikelihoodsAllelePair[] PLIndexToAlleleIndex = calculatePLcache(MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED);
public final static GenotypeLikelihoods fromPLField(String PLs) {
return new GenotypeLikelihoods(PLs);
}
@Deprecated
public final static GenotypeLikelihoods fromGLField(String GLs) {
return new GenotypeLikelihoods(parseDeprecatedGLString(GLs));
}
public final static GenotypeLikelihoods fromLog10Likelihoods(double[] log10Likelihoods) {
return new GenotypeLikelihoods(log10Likelihoods);
}
public final static GenotypeLikelihoods fromPLs(final int[] pls) {
return new GenotypeLikelihoods(PLsToGLs(pls));
}
//
// You must use the factory methods now
//
private GenotypeLikelihoods(String asString) {
likelihoodsAsString_PLs = asString;
}
private GenotypeLikelihoods(double[] asVector) {
log10Likelihoods = asVector;
}
/**
* Returns the genotypes likelihoods in negative log10 vector format. pr{AA} = x, this
* vector returns math.log10(x) for each of the genotypes. Can return null if the
* genotype likelihoods are "missing".
*
* @return
*/
public double[] getAsVector() {
// assumes one of the likelihoods vector or the string isn't null
if ( log10Likelihoods == null ) {
// make sure we create the GL string if it doesn't already exist
log10Likelihoods = parsePLsIntoLikelihoods(likelihoodsAsString_PLs);
}
return log10Likelihoods;
}
public int[] getAsPLs() {
final double[] GLs = getAsVector();
return GLs == null ? null : GLsToPLs(GLs);
}
public String toString() {
return getAsString();
}
public String getAsString() {
if ( likelihoodsAsString_PLs == null ) {
// todo -- should we accept null log10Likelihoods and set PLs as MISSING?
if ( log10Likelihoods == null )
throw new TribbleException("BUG: Attempted to get likelihoods as strings and neither the vector nor the string is set!");
likelihoodsAsString_PLs = convertLikelihoodsToPLString(log10Likelihoods);
}
return likelihoodsAsString_PLs;
}
@Override public boolean equals(Object aThat) {
//check for self-comparison
if ( this == aThat ) return true;
if ( !(aThat instanceof GenotypeLikelihoods) ) return false;
GenotypeLikelihoods that = (GenotypeLikelihoods)aThat;
// now a proper field-by-field evaluation can be made.
// GLs are considered equal if the corresponding PLs are equal
return Arrays.equals(getAsPLs(), that.getAsPLs());
}
//Return genotype likelihoods as an EnumMap with Genotypes as keys and likelihoods as values
//Returns null in case of missing likelihoods
public EnumMap<GenotypeType,Double> getAsMap(boolean normalizeFromLog10){
//Make sure that the log10likelihoods are set
double[] likelihoods = normalizeFromLog10 ? GeneralUtils.normalizeFromLog10(getAsVector()) : getAsVector();
if(likelihoods == null)
return null;
EnumMap<GenotypeType,Double> likelihoodsMap = new EnumMap<GenotypeType, Double>(GenotypeType.class);
likelihoodsMap.put(GenotypeType.HOM_REF,likelihoods[GenotypeType.HOM_REF.ordinal()-1]);
likelihoodsMap.put(GenotypeType.HET,likelihoods[GenotypeType.HET.ordinal()-1]);
likelihoodsMap.put(GenotypeType.HOM_VAR, likelihoods[GenotypeType.HOM_VAR.ordinal() - 1]);
return likelihoodsMap;
}
//Return the neg log10 Genotype Quality (GQ) for the given genotype
//Returns Double.NEGATIVE_INFINITY in case of missing genotype
/**
* This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context.
* Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List<Allele>) in place of it.
*
* If you **know** you're biallelic, use getGQLog10FromLikelihoods directly.
* @param genotype - actually a genotype type (no call, hom ref, het, hom var)
* @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best).
*/
@Deprecated
public double getLog10GQ(GenotypeType genotype){
return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector());
}
private double getLog10GQ(List<Allele> genotypeAlleles,List<Allele> contextAlleles) {
int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0));
int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1));
int plIndex = calculatePLindex(allele1Index,allele2Index);
return getGQLog10FromLikelihoods(plIndex,getAsVector());
}
public double getLog10GQ(Genotype genotype, List<Allele> vcAlleles ) {
return getLog10GQ(genotype.getAlleles(),vcAlleles);
}
public double getLog10GQ(Genotype genotype, VariantContext context) {
return getLog10GQ(genotype,context.getAlleles());
}
public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){
if(likelihoods == null)
return Double.NEGATIVE_INFINITY;
double qual = Double.NEGATIVE_INFINITY;
for (int i=0; i < likelihoods.length; i++) {
if (i==iOfChoosenGenotype)
continue;
if (likelihoods[i] >= qual)
qual = likelihoods[i];
}
// qual contains now max(likelihoods[k]) for all k != bestGTguess
qual = likelihoods[iOfChoosenGenotype] - qual;
if (qual < 0) {
// QUAL can be negative if the chosen genotype is not the most likely one individually.
// In this case, we compute the actual genotype probability and QUAL is the likelihood of it not being the chosen one
double[] normalized = GeneralUtils.normalizeFromLog10(likelihoods);
double chosenGenotype = normalized[iOfChoosenGenotype];
return Math.log10(1.0 - chosenGenotype);
} else {
// invert the size, as this is the probability of making an error
return -1 * qual;
}
}
private final static double[] parsePLsIntoLikelihoods(String likelihoodsAsString_PLs) {
if ( !likelihoodsAsString_PLs.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = likelihoodsAsString_PLs.split(",");
double[] likelihoodsAsVector = new double[strings.length];
try {
for ( int i = 0; i < strings.length; i++ ) {
likelihoodsAsVector[i] = Integer.parseInt(strings[i]) / -10.0;
}
} catch (NumberFormatException e) {
throw new TribbleException("The GL/PL tag contains non-integer values: " + likelihoodsAsString_PLs);
}
return likelihoodsAsVector;
} else
return null;
}
/**
* Back-compatibility function to read old style GL formatted genotype likelihoods in VCF format
* @param GLString
* @return
*/
private final static double[] parseDeprecatedGLString(String GLString) {
if ( !GLString.equals(VCFConstants.MISSING_VALUE_v4) ) {
String[] strings = GLString.split(",");
double[] likelihoodsAsVector = new double[strings.length];
for ( int i = 0; i < strings.length; i++ ) {
likelihoodsAsVector[i] = Double.parseDouble(strings[i]);
}
return likelihoodsAsVector;
}
return null;
}
private final static String convertLikelihoodsToPLString(final double[] GLs) {
if ( GLs == null )
return VCFConstants.MISSING_VALUE_v4;
final StringBuilder s = new StringBuilder();
boolean first = true;
for ( final int pl : GLsToPLs(GLs) ) {
if ( ! first )
s.append(",");
else
first = false;
s.append(pl);
}
return s.toString();
}
private final static int[] GLsToPLs(final double[] GLs) {
final int[] pls = new int[GLs.length];
final double adjust = maxPL(GLs);
for ( int i = 0; i < GLs.length; i++ ) {
pls[i] = (int)Math.round(Math.min(-10 * (GLs[i] - adjust), MAX_PL));
}
return pls;
}
private final static double maxPL(final double[] GLs) {
double adjust = Double.NEGATIVE_INFINITY;
for ( double l : GLs ) adjust = Math.max(adjust, l);
return adjust;
}
private final static double[] PLsToGLs(final int pls[]) {
double[] likelihoodsAsVector = new double[pls.length];
for ( int i = 0; i < pls.length; i++ ) {
likelihoodsAsVector[i] = pls[i] / -10.0;
}
return likelihoodsAsVector;
}
// -------------------------------------------------------------------------------------
//
// Static conversion utilities, going from GL/PL index to allele index and vice versa.
//
// -------------------------------------------------------------------------------------
/*
* Class representing the 2 alleles (or rather their indexes into VariantContext.getAllele()) corresponding to a specific PL index.
* Note that the reference allele is always index=0.
*/
public static class GenotypeLikelihoodsAllelePair {
public final int alleleIndex1, alleleIndex2;
public GenotypeLikelihoodsAllelePair(final int alleleIndex1, final int alleleIndex2) {
this.alleleIndex1 = alleleIndex1;
this.alleleIndex2 = alleleIndex2;
}
}
private static GenotypeLikelihoodsAllelePair[] calculatePLcache(final int altAlleles) {
final int numLikelihoods = numLikelihoods(1 + altAlleles, 2);
final GenotypeLikelihoodsAllelePair[] cache = new GenotypeLikelihoodsAllelePair[numLikelihoods];
// for all possible combinations of 2 alleles
for ( int allele1 = 0; allele1 <= altAlleles; allele1++ ) {
for ( int allele2 = allele1; allele2 <= altAlleles; allele2++ ) {
cache[calculatePLindex(allele1, allele2)] = new GenotypeLikelihoodsAllelePair(allele1, allele2);
}
}
// a bit of sanity checking
for ( int i = 0; i < cache.length; i++ ) {
if ( cache[i] == null )
throw new IllegalStateException("BUG: cache entry " + i + " is unexpected null");
}
return cache;
}
// -------------------------------------------------------------------------------------
//
// num likelihoods given number of alleles and ploidy
//
// -------------------------------------------------------------------------------------
/**
* Actually does the computation in @see #numLikelihoods
*
* @param numAlleles
* @param ploidy
* @return
*/
private static final int calcNumLikelihoods(final int numAlleles, final int ploidy) {
if (numAlleles == 1)
return 1;
else if (ploidy == 1)
return numAlleles;
else {
int acc =0;
for (int k=0; k <= ploidy; k++ )
acc += calcNumLikelihoods(numAlleles - 1, ploidy - k);
return acc;
}
}
/**
* Compute how many likelihood elements are associated with the given number of alleles
* Equivalent to asking in how many ways N non-negative integers can add up to P is S(N,P)
* where P = ploidy (number of chromosomes) and N = total # of alleles.
* Each chromosome can be in one single state (0,...,N-1) and there are P of them.
* Naive solution would be to store N*P likelihoods, but this is not necessary because we can't distinguish chromosome states, but rather
* only total number of alt allele counts in all chromosomes.
*
* For example, S(3,2) = 6: For alleles A,B,C, on a diploid organism we have six possible genotypes:
* AA,AB,BB,AC,BC,CC.
* Another way of expressing is with vector (#of A alleles, # of B alleles, # of C alleles)
* which is then, for ordering above, (2,0,0), (1,1,0), (0,2,0), (1,1,0), (0,1,1), (0,0,2)
* In general, for P=2 (regular biallelic), then S(N,2) = N*(N+1)/2
*
* Note this method caches the value for most common num Allele / ploidy combinations for efficiency
*
* Recursive implementation:
* S(N,P) = sum_{k=0}^P S(N-1,P-k)
* because if we have N integers, we can condition 1 integer to be = k, and then N-1 integers have to sum to P-K
* With initial conditions
* S(N,1) = N (only way to have N integers add up to 1 is all-zeros except one element with a one. There are N of these vectors)
* S(1,P) = 1 (only way to have 1 integer add to P is with that integer P itself).
*
* @param numAlleles Number of alleles (including ref)
* @param ploidy Ploidy, or number of chromosomes in set
* @return Number of likelihood elements we need to hold.
*/
public static int numLikelihoods(final int numAlleles, final int ploidy) {
if ( numAlleles < NUM_LIKELIHOODS_CACHE_N_ALLELES
&& ploidy < NUM_LIKELIHOODS_CACHE_PLOIDY )
return numLikelihoodCache[numAlleles][ploidy];
else {
// have to calculate on the fly
return calcNumLikelihoods(numAlleles, ploidy);
}
}
// As per the VCF spec: "the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j.
// In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc."
// Assumes that allele1Index < allele2Index
public static int calculatePLindex(final int allele1Index, final int allele2Index) {
return (allele2Index * (allele2Index+1) / 2) + allele1Index;
}
/**
* get the allele index pair for the given PL
*
* @param PLindex the PL index
* @return the allele index pair
*/
public static GenotypeLikelihoodsAllelePair getAllelePair(final int PLindex) {
// make sure that we've cached enough data
if ( PLindex >= PLIndexToAlleleIndex.length )
throw new IllegalStateException("Internal limitation: cannot genotype more than " + MAX_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + " alleles");
return PLIndexToAlleleIndex[PLindex];
}
// An index conversion from the deprecated PL ordering to the new VCF-based ordering for up to 3 alternate alleles
protected static final int[] PLindexConversion = new int[]{0, 1, 3, 6, 2, 4, 7, 5, 8, 9};
/**
* get the allele index pair for the given PL using the deprecated PL ordering:
* AA,AB,AC,AD,BB,BC,BD,CC,CD,DD instead of AA,AB,BB,AC,BC,CC,AD,BD,CD,DD.
* Although it's painful to keep this conversion around, our DiploidSNPGenotypeLikelihoods class uses the deprecated
* ordering and I know with certainty that external users have built code on top of it; changing it now would
* cause a whole lot of heartache for our collaborators, so for now at least there's a standard conversion method.
* This method assumes at most 3 alternate alleles.
*
* @param PLindex the PL index
* @return the allele index pair
*/
@Deprecated
public static GenotypeLikelihoodsAllelePair getAllelePairUsingDeprecatedOrdering(final int PLindex) {
return getAllelePair(PLindexConversion[PLindex]);
}
/**
* get the PL indexes (AA, AB, BB) for the given allele pair; assumes allele1Index <= allele2Index.
*
* @param allele1Index the index in VariantContext.getAllele() of the first allele
* @param allele2Index the index in VariantContext.getAllele() of the second allele
* @return the PL indexes
*/
public static int[] getPLIndecesOfAlleles(final int allele1Index, final int allele2Index) {
final int[] indexes = new int[3];
indexes[0] = calculatePLindex(allele1Index, allele1Index);
indexes[1] = calculatePLindex(allele1Index, allele2Index);
indexes[2] = calculatePLindex(allele2Index, allele2Index);
return indexes;
}
}