/*
 * The MIT License
 *
 * Copyright (c) 2015 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.analysis;

import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.Log;
import picard.PicardException;
import picard.util.MathUtil;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

/**
 * Utilities for computing the theoretical sensitivity of het SNP calling as a function of
 * the depth distribution and the base quality distribution.
 *
 * Created by David Benjamin on 5/13/15.
 */
public class TheoreticalSensitivity {

    private static final Log log = Log.getInstance(TheoreticalSensitivity.class);
    private static final int SAMPLING_MAX = 600; // prevent 'infinite' loops in RouletteWheel.draw()
    private static final int MAX_CONSIDERED_DEPTH = 1000; // no point in looking any deeper than this; otherwise GC overhead is too high

    /**
     * @param depthDistribution   the probability of depth n is depthDistribution[n] for n = 0, 1, ..., N - 1
     * @param qualityDistribution the probability of quality q is qualityDistribution[q] for q = 0, 1, ..., Q
     * @param sampleSize          the number of random sums of quality scores drawn for each m
     * @param logOddsThreshold    the log_10 of the likelihood ratio required to call a SNP,
     *                            for example 5 if the variant likelihood must be 10^5 times greater
     */
    public static double hetSNPSensitivity(final double[] depthDistribution, final double[] qualityDistribution,
                                           final int sampleSize, final double logOddsThreshold) {
        return hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, logOddsThreshold, true);
    }
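    // Illustrative usage sketch (the distributions and parameter values below are invented
    // for the example, not part of this class): with all bases at Q30 and depth concentrated
    // around 4x, this computes the probability of detecting a het SNP at a likelihood-ratio
    // threshold of 10^3, using 10,000 Monte Carlo samples:
    //
    //   final double[] depthDist = {0.05, 0.10, 0.15, 0.20, 0.20, 0.15, 0.10, 0.05}; // P(depth = n)
    //   final double[] qualityDist = new double[31];
    //   qualityDist[30] = 1.0;                                                       // all bases Q30
    //   final double sensitivity =
    //           TheoreticalSensitivity.hetSNPSensitivity(depthDist, qualityDist, 10000, 3);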
    /**
     * @param depthDistribution   the probability of depth n is depthDistribution[n] for n = 0, 1, ..., N - 1
     * @param qualityDistribution the probability of quality q is qualityDistribution[q] for q = 0, 1, ..., Q
     * @param sampleSize          the number of random sums of quality scores drawn for each m
     * @param logOddsThreshold    the log_10 of the likelihood ratio required to call a SNP,
     *                            for example 5 if the variant likelihood must be 10^5 times greater.
     * @param withLogging         true to output log messages, false otherwise.
     */
    public static double hetSNPSensitivity(final double[] depthDistribution, final double[] qualityDistribution,
                                           final int sampleSize, final double logOddsThreshold, final boolean withLogging) {
        final int N = Math.min(depthDistribution.length, MAX_CONSIDERED_DEPTH + 1);

        if (withLogging) log.info("Creating Roulette Wheel");
        final RouletteWheel qualitySampler = new RouletteWheel(qualityDistribution);

        // qualitySums[m] is a random sample of sums of m quality scores, for m = 0, 1, ..., N - 1
        if (withLogging) log.info("Calculating quality sums from quality sampler");
        final List<ArrayList<Integer>> qualitySums = qualitySampler.sampleCumulativeSums(N, sampleSize, withLogging);

        // if a quality sum of m qualities exceeds the quality sum threshold for n total reads, a SNP is called;
        // the n * log10(2) term offsets the (1/2)^n factor in the het likelihood
        final ArrayList<Double> qualitySumThresholds = new ArrayList<>(N);
        final double LOG_10_OF_2 = Math.log10(2);
        for (int n = 0; n < N; n++) qualitySumThresholds.add(10 * (n * LOG_10_OF_2 + logOddsThreshold));

        // probabilityToExceedThreshold[m][n] is the probability that the sum of m quality scores
        // exceeds the nth quality sum threshold
        if (withLogging) log.info("Calculating theoretical het sensitivity");
        final List<ArrayList<Double>> probabilityToExceedThreshold = proportionsAboveThresholds(qualitySums, qualitySumThresholds);
        final List<ArrayList<Double>> altDepthDistribution = hetAltDepthDistribution(N);

        // marginalize over the total depth n and the number of alt-supporting reads m
        double result = 0.0;
        for (int n = 0; n < N; n++) {
            for (int m = 0; m <= n; m++) {
                result += depthDistribution[n] * altDepthDistribution.get(n).get(m) * probabilityToExceedThreshold.get(m).get(n);
            }
        }
        return result;
    }

    // given L lists and N thresholds, compute the proportion of each list that lies above each threshold
    public static List<ArrayList<Double>> proportionsAboveThresholds(final List<ArrayList<Integer>> lists, final List<Double> thresholds) {
        final ArrayList<ArrayList<Double>> result = new ArrayList<>();

        for (final ArrayList<Integer> list : lists) {
            final ArrayList<Double> newRow = new ArrayList<>(Collections.nCopies(thresholds.size(), 0.0));
            Collections.sort(list);
            int n = 0;
            int j = 0; // index within the sorted sample
            while (n < thresholds.size() && j < list.size()) {
                if (thresholds.get(n) > list.get(j)) j++;
                else newRow.set(n++, (double) (list.size() - j) / list.size());
            }
            result.add(newRow);
        }
        return result;
    }

    // Utility function for making a table of binomial distribution probabilities nCm * (0.5)^n
    // for n = 0, 1, ..., N - 1 and m = 0, 1, ..., n
    public static List<ArrayList<Double>> hetAltDepthDistribution(final int N) {
        final List<ArrayList<Double>> table = new ArrayList<>();
        for (int n = 0; n < N; n++) {
            final ArrayList<Double> nthRow = new ArrayList<>();

            // add the 0th element, then elements 1 through n - 1, then the nth.
            // Note that nCm = (n-1)C(m-1) * (n/m)
            nthRow.add(Math.pow(0.5, n));
            for (int m = 1; m < n; m++) nthRow.add((n * 0.5 / m) * table.get(n - 1).get(m - 1));
            if (n > 0) nthRow.add(nthRow.get(0));

            table.add(nthRow);
        }
        return table;
    }
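    // A worked check of the recurrence above (the values follow directly from Binomial(n, 1/2)):
    // hetAltDepthDistribution(4) yields
    //   n = 0: [1.0]
    //   n = 1: [0.5, 0.5]
    //   n = 2: [0.25, 0.5, 0.25]
    //   n = 3: [0.125, 0.375, 0.375, 0.125]
    // where row n gives the probability of m = 0, 1, ..., n alt-supporting reads out of
    // n total reads overlapping a het site.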
    /*
     * Perform random draws from {0, 1, ..., N - 1} according to a list of relative probabilities.
     *
     * We use an O(1) stochastic acceptance algorithm -- see Physica A, Volume 391, Page 2193 (2012) --
     * which works well when the ratio of maximum weight to average weight is not large.
     */
    public static class RouletteWheel {
        private final List<Double> probabilities;
        private final int N;
        private int count = 0;
        private final Random rng;

        RouletteWheel(final double[] weights) {
            rng = new Random(51);
            N = weights.length;

            probabilities = new ArrayList<>();
            final double wMax = MathUtil.max(weights);
            if (wMax == 0) {
                throw new PicardException("Quality score distribution is empty.");
            }
            for (final double w : weights) {
                probabilities.add(w / wMax);
            }
        }

        public int draw() {
            while (true) {
                // stochastic acceptance: pick an index uniformly, accept it with probability w_n / wMax
                final int n = (int) (N * rng.nextDouble());
                count++;
                if (rng.nextDouble() < probabilities.get(n)) {
                    count = 0;
                    return n;
                } else if (count >= SAMPLING_MAX) {
                    // fail-safe: after too many consecutive rejections, give up and return 0
                    count = 0;
                    return 0;
                }
            }
        }

        // get samples of sums of 0, 1, 2, ..., N - 1 draws
        public List<ArrayList<Integer>> sampleCumulativeSums(final int maxNumberOfSummands, final int sampleSize, final boolean withLogging) {
            final List<ArrayList<Integer>> result = new ArrayList<>();
            for (int m = 0; m < maxNumberOfSummands; m++) result.add(new ArrayList<>());

            for (int iteration = 0; iteration < sampleSize; iteration++) {
                int cumulativeSum = 0;
                for (int m = 0; m < maxNumberOfSummands; m++) {
                    result.get(m).add(cumulativeSum);
                    cumulativeSum += draw();
                }
                if (withLogging && iteration % 1000 == 0) {
                    log.info(iteration + " sampling iterations completed");
                }
            }
            return result;
        }
    }

    public static double[] normalizeHistogram(final Histogram<Integer> histogram) {
        if (histogram == null) throw new PicardException("Histogram is null and cannot be normalized");

        final double histogramSumOfValues = histogram.getSumOfValues();
        final double[] normalizedHistogram = new double[histogram.size()];

        // assumes the histogram is binned by consecutive integers starting at 0
        for (int i = 0; i < histogram.size(); i++) {
            if (histogram.get(i) != null) {
                normalizedHistogram[i] = histogram.get(i).getValue() / histogramSumOfValues;
            }
        }
        return normalizedHistogram;
    }
}
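// Illustrative end-to-end sketch (hypothetical variable names; in Picard the depth and base
// quality histograms typically come from a metrics collector such as CollectWgsMetrics):
//
//   final Histogram<Integer> depthHistogram = ...;   // counts of loci at each depth
//   final Histogram<Integer> qualityHistogram = ...; // counts of bases at each quality
//   final double[] depthDist = TheoreticalSensitivity.normalizeHistogram(depthHistogram);
//   final double[] qualityDist = TheoreticalSensitivity.normalizeHistogram(qualityHistogram);
//   final double hetSensitivity =
//           TheoreticalSensitivity.hetSNPSensitivity(depthDist, qualityDist, 10000, 5);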