/* * Copyright 2015 Google. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.cloud.genomics.dataflow.functions.verifybamid; import com.google.cloud.genomics.dataflow.model.ReadCounts; import com.google.cloud.genomics.dataflow.model.ReadQualityCount; import com.google.cloud.genomics.dataflow.model.ReadQualityCount.Base; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.genomics.v1.Position; import org.apache.commons.math3.analysis.UnivariateFunction; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.Map; /** * Implementation of the likelihood function in equation (2) in * G. Jun, M. Flickinger, K. N. Hetrick, Kurt, J. M. Romm, K. F. Doheny, * G. Abecasis, M. Boehnke,and H. M. Kang, Detecting and Estimating * Contamination of Human DNA Samples in Sequencing and Array-Based Genotype * Data, American journal of human genetics doi:10.1016/j.ajhg.2012.09.004 * (volume 91 issue 5 pp.839 - 848) * http://www.sciencedirect.com/science/article/pii/S0002929712004788 */ public class LikelihoodFn implements UnivariateFunction { /** Possible genotypes for a SNP with a single alternate */ enum Genotype { REF_HOMOZYGOUS, HETEROZYGOUS, NONREF_HOMOZYGOUS } /** Possible error statuses for a base in a read */ enum ReadStatus { CORRECT, ERROR } static int toTableIndex(Base observed, Genotype trueGenotype, ReadStatus status) { return observed.ordinal() + Base.values().length * (status.ordinal() + ReadStatus.values().length * trueGenotype.ordinal()); } /* * P_OBS_GIVEN_TRUTH contains the probability of observing a particular * base (reference, non-reference, or other) given the true genotype * and the error status of the read. See Table 1 of Jun et al. */ private static final ImmutableList<Double> P_OBS_GIVEN_TRUTH; static { final ImmutableList<Double> pTable = ImmutableList.of( // Observed base // REF NONREF OTHER 1.0, 0.0, 0.0, // P(base | REF_HOMOZYGOUS, CORRECT) 0.0, 1.0 / 3.0, 2.0 / 3.0, // P(base | REF_HOMOZYGOUS, ERROR) 0.5, 0.5, 0.0, // P(base | HETEROZYGOUS, CORRECT) 1.0 / 6.0, 1.0 / 6.0, 2.0 / 3.0, // P(base | HETEROZYGOUS, ERROR) 0.0, 1.0, 0.0, // P(base | NONREF_HOMOZYGOUS, CORRECT) 1.0 / 3.0, 0.0, 2.0 / 3.0); // P(base | NONREF_HOMOZYGOUS, ERROR) Iterator<Double> itProb = pTable.iterator(); ArrayList<Double> pCond = new ArrayList<>(); pCond.addAll(Collections.nCopies( Base.values().length * ReadStatus.values().length * Genotype.values().length, 0.0)); for ( Genotype g : ImmutableList.of(Genotype.REF_HOMOZYGOUS, Genotype.HETEROZYGOUS, Genotype.NONREF_HOMOZYGOUS)) { for (ReadStatus r : ImmutableList.of(ReadStatus.CORRECT, ReadStatus.ERROR)) { for (Base b : ImmutableList.of(Base.REF, Base.NONREF, Base.OTHER)) { pCond.set(toTableIndex(b, g, r), itProb.next()); } } } P_OBS_GIVEN_TRUTH = ImmutableList.copyOf(pCond); } private final Map<Position, ReadCounts> readCounts; /** * Create a new LikelihoodFn instance for a given set of read counts. * * @param readCounts counts of reads by quality for each position of interest */ public LikelihoodFn(Map<Position, ReadCounts> readCounts) { // copy the map so the counts don't get changed out from under us this.readCounts = ImmutableMap.copyOf(readCounts); } /** * Compute the probability of a genotype given the reference allele probability. */ private static double pGenotype(Genotype g, double refProb) { switch(g) { case REF_HOMOZYGOUS: return refProb * refProb; case HETEROZYGOUS: return refProb * (1.0 - refProb); case NONREF_HOMOZYGOUS: return (1.0 - refProb) * (1.0 - refProb); default: throw new IllegalArgumentException("Illegal genotype"); } } /** * Look up the probability of an observation conditioned on the underlying state. */ private static double probObsGivenTruth(Base observed, Genotype trueGenotype, ReadStatus trueStatus) { return P_OBS_GIVEN_TRUTH.get(toTableIndex(observed, trueGenotype, trueStatus)); } /** * Compute the likelihood of a contaminant fraction alpha. * * <p>See equation (2) in Jun et al. */ @Override public double value(double alpha) { double logLikelihood = 0.0; for (ReadCounts rc : readCounts.values()) { double refProb = rc.getRefFreq(); double pPosition = 0.0; for (Genotype trueGenotype1 : Genotype.values()) { double pGenotype1 = pGenotype(trueGenotype1, refProb); for (Genotype trueGenotype2 : Genotype.values()) { double pGenotype2 = pGenotype(trueGenotype2, refProb); double pObsGivenGenotype = 1.0; for (ReadQualityCount rqc : rc.getReadQualityCounts()) { Base base = rqc.getBase(); double pErr = phredToProb(rqc.getQuality()); double pObs = ((1.0 - alpha) * probObsGivenTruth(base, trueGenotype1, ReadStatus.CORRECT) + (alpha) * probObsGivenTruth(base, trueGenotype2, ReadStatus.CORRECT) ) * (1.0 - pErr) + ((1.0 - alpha) * probObsGivenTruth(base, trueGenotype1, ReadStatus.ERROR) + (alpha) * probObsGivenTruth(base, trueGenotype2, ReadStatus.ERROR) ) * pErr; pObsGivenGenotype *= Math.pow(pObs, rqc.getCount()); } pPosition += pObsGivenGenotype * pGenotype1 * pGenotype2; } } logLikelihood += Math.log(pPosition); } return logLikelihood; } /** * Convert a Phred score to a probability. */ private static double phredToProb(int phred) { return Math.pow(10.0, -(double) phred / 10.0); } }