/* * The MIT License * * Copyright (c) 2017 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam.markduplicates; import java.util.stream.Collectors; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.QualityUtil; import picard.util.MathUtil; /** * Metrics that are calculated during the process of marking duplicates * within a stream of SAMRecords using the UmiAwareDuplicateSetIterator. */ public class UmiMetrics extends MetricBase { private final Histogram<String> observedUmis = new Histogram<>(); private final Histogram<String> inferredUmis = new Histogram<>(); private long observedUmiBases = 0; /** Number of bases in each UMI */ public double MEAN_UMI_LENGTH = 0.0; /** Number of different UMI sequences observed */ public long OBSERVED_UNIQUE_UMIS = 0; /** Number of different inferred UMI sequences derived */ public long INFERRED_UNIQUE_UMIS = 0; /** Number of errors inferred by comparing the observed and inferred UMIs */ public long OBSERVED_BASE_ERRORS = 0; /** Number of duplicate sets found before taking UMIs into account */ public long DUPLICATE_SETS_IGNORING_UMI = 0; /** Number of duplicate sets found after taking UMIs into account */ public long DUPLICATE_SETS_WITH_UMI = 0; /** * Entropy (in base 4) of the observed UMI sequences, indicating the * effective number of bases in the UMIs. If this is significantly * smaller than UMI_LENGTH, it indicates that the UMIs are not * distributed uniformly. */ public double OBSERVED_UMI_ENTROPY = 0; /** Entropy (in base 4) of the inferred UMI sequences, indicating the * effective number of bases in the inferred UMIs. If this is significantly * smaller than UMI_LENGTH, it indicates that the UMIs are not * distributed uniformly. */ public double INFERRED_UMI_ENTROPY = 0; /** Estimation of Phred scaled quality scores for UMIs */ public double UMI_BASE_QUALITIES = 0.0; public UmiMetrics() {} public UmiMetrics(final double length, final int observedUniqueUmis, final int inferredUniqueUmis, final int observedBaseErrors, final int duplicateSetsWithoutUmi, final int duplicateSetsWithUmi, final double effectiveLengthOfInferredUmis, final double effectiveLengthOfObservedUmis, final double estimatedBaseQualityOfUmis) { MEAN_UMI_LENGTH = length; OBSERVED_UNIQUE_UMIS = observedUniqueUmis; INFERRED_UNIQUE_UMIS = inferredUniqueUmis; OBSERVED_BASE_ERRORS = observedBaseErrors; DUPLICATE_SETS_IGNORING_UMI = duplicateSetsWithoutUmi; DUPLICATE_SETS_WITH_UMI = duplicateSetsWithUmi; INFERRED_UMI_ENTROPY = effectiveLengthOfInferredUmis; OBSERVED_UMI_ENTROPY = effectiveLengthOfObservedUmis; UMI_BASE_QUALITIES = estimatedBaseQualityOfUmis; } public void calculateDerivedFields() { OBSERVED_UNIQUE_UMIS = observedUmis.size(); INFERRED_UNIQUE_UMIS = inferredUmis.size(); OBSERVED_UMI_ENTROPY = effectiveNumberOfBases(observedUmis); INFERRED_UMI_ENTROPY = effectiveNumberOfBases(inferredUmis); UMI_BASE_QUALITIES = QualityUtil.getPhredScoreFromErrorProbability((double) OBSERVED_BASE_ERRORS / (double) observedUmiBases); } /** * Add an observation of a UMI to the metrics * @param observedUmi String containing the observed UMI * @param inferredUmi String containing the UMI inferred after error correcting the observed UMI */ public void addUmiObservation(String observedUmi, String inferredUmi) { observedUmis.increment(observedUmi); inferredUmis.increment(inferredUmi); observedUmiBases += observedUmi.length(); } private double effectiveNumberOfBases(Histogram<?> observations) { double totalObservations = observations.getSumOfValues(); // Convert to log base 4 so that the entropy is now a measure // of the effective number of DNA bases. If we used log(2.0) // our result would be in bits. double entropyBaseE = observations.values().stream().collect(Collectors.summingDouble( v -> {double p = v.getValue() / totalObservations; return -p * Math.log(p);})); return entropyBaseE / MathUtil.LOG_4_BASE_E; } }