/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.analysis;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.SecondaryAlignmentFilter;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFileWalker;
import htsjdk.samtools.util.*;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Metrics;
import picard.filter.CountingDuplicateFilter;
import picard.filter.CountingFilter;
import picard.filter.CountingMapQFilter;
import picard.filter.CountingPairedFilter;
import picard.util.MathUtil;
import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
/**
* Computes a number of metrics that are useful for evaluating coverage and performance of whole genome sequencing experiments.
* Two algorithms are available for these metrics: default and fast. The fast algorithm is enabled by the USE_FAST_ALGORITHM option.
* The fast algorithm works better for regions of the BAM file with coverage of at least 10 reads per locus;
* for lower coverage the two algorithms perform the same.
* @author tfennell
*/
@CommandLineProgramProperties(
usage = CollectWgsMetrics.USAGE_SUMMARY + CollectWgsMetrics.USAGE_DETAILS,
usageShort = CollectWgsMetrics.USAGE_SUMMARY,
programGroup = Metrics.class
)
public class CollectWgsMetrics extends CommandLineProgram {
static final String USAGE_SUMMARY = "Collect metrics about coverage and performance of whole genome sequencing (WGS) experiments.";
static final String USAGE_DETAILS = "<p>This tool collects metrics about the fractions of reads that pass base- and mapping-quality "+
"filters as well as coverage (read-depth) levels for WGS analyses. Both minimum base- and mapping-quality values as well as the maximum "+
"read depths (coverage cap) are user defined.</p>" +
"<p>Note: Metrics labeled as percentages are actually expressed as fractions!</p>"+
"<h4>Usage Example:</h4>"+
"<pre>" +
"java -jar picard.jar CollectWgsMetrics \\<br /> " +
" I=input.bam \\<br /> "+
" O=collect_wgs_metrics.txt \\<br /> " +
" R=reference_sequence.fasta " +
"</pre>" +
"Please see "+
"<a href='https://broadinstitute.github.io/picard/picard-metric-definitions.html#CollectWgsMetrics.WgsMetrics'>CollectWgsMetrics</a> "+
"for detailed explanations of the output metrics." +
"<hr />"
;
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM or BAM file.")
public File INPUT;
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output metrics file.")
public File OUTPUT;
@Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "The reference sequence fasta aligned to.")
public File REFERENCE_SEQUENCE;
@Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
public int MINIMUM_MAPPING_QUALITY = 20;
@Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage. N bases will be treated as having a base quality " +
"of negative infinity and will therefore be excluded from coverage regardless of the value of this parameter.", overridable = true)
public int MINIMUM_BASE_QUALITY = 20;
@Option(shortName = "CAP", doc = "Treat positions with coverage exceeding this value as if they had coverage at this value (but calculate the difference for PCT_EXC_CAPPED).", overridable = true)
public int COVERAGE_CAP = 250;
@Option(doc="At positions with coverage exceeding this value, completely ignore reads that accumulate beyond this value (so that they will not be considered for PCT_EXC_CAPPED). Used to keep memory consumption in check, but could create bias if set too low", overridable = true)
public int LOCUS_ACCUMULATION_CAP = 100000;
@Option(doc = "For debugging purposes, stop after processing this many genomic bases.")
public long STOP_AFTER = -1;
@Option(doc = "Determines whether to include the base quality histogram in the metrics file.")
public boolean INCLUDE_BQ_HISTOGRAM = false;
@Option(doc="If true, count unpaired reads, and paired reads with one end unmapped")
public boolean COUNT_UNPAIRED = false;
@Option(doc="Sample Size used for Theoretical Het Sensitivity sampling. Default is 10000.", optional = true)
public int SAMPLE_SIZE=10000;
@Option(doc = "If true, fast algorithm is used.")
public boolean USE_FAST_ALGORITHM = false;
@Option(doc = "Average read length in the file. Default is 150.", optional = true)
public int READ_LENGTH = 150;
@Option(doc = "An interval list file that contains the positions to restrict the assessment. Please note that " +
"all bases of reads that overlap these intervals will be considered, even if some of those bases extend beyond the boundaries of " +
"the interval. The ideal use case for this argument is to use it to restrict the calculation to a subset of (whole) contigs.",
optional = true, overridable = true)
public File INTERVALS = null;
private SAMFileHeader header = null;
private final Log log = Log.getInstance(CollectWgsMetrics.class);
private static final double LOG_ODDS_THRESHOLD = 3.0;
/** Metrics for evaluating the performance of whole genome sequencing experiments. */
public static class WgsMetrics extends MergeableMetricBase {
/** The intervals over which this metric was computed. */
@MergingIsManual
protected IntervalList intervals;
/** Count of sites with a given depth of coverage. Excludes bases with quality below MINIMUM_BASE_QUALITY (default 20) */
@MergingIsManual
protected final Histogram<Integer> highQualityDepthHistogram;
/** Count of sites with a given depth of coverage. Includes all but quality 2 bases */
@MergingIsManual
protected final Histogram<Integer> unfilteredDepthHistogram;
/** The count of bases observed with a given base quality. */
@MergingIsManual
protected final Histogram<Integer> unfilteredBaseQHistogram;
/** The maximum depth/coverage to consider. */
@MergeByAssertEquals
protected final int coverageCap;
/** The sample size used for theoretical het sensitivity. */
@NoMergingKeepsValue
protected final int theoreticalHetSensitivitySampleSize;
/**
 * Create an instance of this metric that is not mergeable.
 * All histogram/interval fields are left null and the caps are set to -1 as
 * sentinel values marking the instance as non-mergeable.
 */
public WgsMetrics() {
    coverageCap = -1;
    theoreticalHetSensitivitySampleSize = -1;
    intervals = null;
    highQualityDepthHistogram = null;
    unfilteredDepthHistogram = null;
    unfilteredBaseQHistogram = null;
}
/**
 * Create an instance of this metric that is mergeable.
 *
 * @param intervals the intervals over which this metric was computed; stored uniqued.
 * @param highQualityDepthHistogram the count of genomic positions observed for each observed depth. Excludes bases with quality below MINIMUM_BASE_QUALITY.
 * @param unfilteredDepthHistogram the depth histogram that includes all but quality 2 bases.
 * @param pctExcludedByMapq the fraction of aligned bases that were filtered out because they were in reads with low mapping quality.
 * @param pctExcludedByDupes the fraction of aligned bases that were filtered out because they were in reads marked as duplicates.
 * @param pctExcludedByPairing the fraction of bases that were filtered out because they were in reads without a mapped mate pair.
 * @param pctExcludedByBaseq the fraction of aligned bases that were filtered out because they were of low base quality.
 * @param pctExcludedByOverlap the fraction of aligned bases that were filtered out because they were the second observation from an insert with overlapping reads.
 * @param pctExcludedByCapping the fraction of aligned bases that were filtered out because they would have raised coverage above the capped value.
 * @param pctExcludeTotal the fraction of bases excluded across all filters.
 * @param coverageCap Treat positions with coverage exceeding this value as if they had coverage at this value.
 * @param unfilteredBaseQHistogram the count of bases observed with a given quality. Includes all but quality 2 bases.
 * @param theoreticalHetSensitivitySampleSize the sample size used for theoretical het sensitivity sampling.
 */
public WgsMetrics(final IntervalList intervals,
                  final Histogram<Integer> highQualityDepthHistogram,
                  final Histogram<Integer> unfilteredDepthHistogram,
                  final double pctExcludedByMapq,
                  final double pctExcludedByDupes,
                  final double pctExcludedByPairing,
                  final double pctExcludedByBaseq,
                  final double pctExcludedByOverlap,
                  final double pctExcludedByCapping,
                  final double pctExcludeTotal,
                  final int coverageCap,
                  final Histogram<Integer> unfilteredBaseQHistogram,
                  final int theoreticalHetSensitivitySampleSize) {
    // Unique the intervals up front so merge() can assume uniqued interval lists.
    this.intervals = intervals.uniqued();
    this.highQualityDepthHistogram = highQualityDepthHistogram;
    this.unfilteredDepthHistogram = unfilteredDepthHistogram;
    this.unfilteredBaseQHistogram = unfilteredBaseQHistogram;
    this.coverageCap = coverageCap;
    this.theoreticalHetSensitivitySampleSize = theoreticalHetSensitivitySampleSize;
    PCT_EXC_MAPQ = pctExcludedByMapq;
    PCT_EXC_DUPE = pctExcludedByDupes;
    PCT_EXC_UNPAIRED = pctExcludedByPairing;
    PCT_EXC_BASEQ = pctExcludedByBaseq;
    PCT_EXC_OVERLAP = pctExcludedByOverlap;
    PCT_EXC_CAPPED = pctExcludedByCapping;
    PCT_EXC_TOTAL = pctExcludeTotal;
    // Populate GENOME_TERRITORY, MEAN_COVERAGE, the PCT_nX fields, etc. from the histograms.
    calculateDerivedFields();
}
/** The number of non-N bases in the genome reference over which coverage will be evaluated. */
@NoMergingIsDerived
public long GENOME_TERRITORY;
/** The mean coverage in bases of the genome territory, after all filters are applied. */
@NoMergingIsDerived
public double MEAN_COVERAGE;
/** The standard deviation of coverage of the genome after all filters are applied. */
@NoMergingIsDerived
public double SD_COVERAGE;
/** The median coverage in bases of the genome territory, after all filters are applied. */
@NoMergingIsDerived
public double MEDIAN_COVERAGE;
/** The median absolute deviation of coverage of the genome after all filters are applied. */
@NoMergingIsDerived
public double MAD_COVERAGE;
/** The fraction of aligned bases that were filtered out because they were in reads with low mapping quality (default is < 20). */
@NoMergingIsDerived
public double PCT_EXC_MAPQ;
/** The fraction of aligned bases that were filtered out because they were in reads marked as duplicates. */
@NoMergingIsDerived
public double PCT_EXC_DUPE;
/** The fraction of aligned bases that were filtered out because they were in reads without a mapped mate pair. */
@NoMergingIsDerived
public double PCT_EXC_UNPAIRED;
/** The fraction of aligned bases that were filtered out because they were of low base quality (default is < 20). */
@NoMergingIsDerived
public double PCT_EXC_BASEQ;
/** The fraction of aligned bases that were filtered out because they were the second observation from an insert with overlapping reads. */
@NoMergingIsDerived
public double PCT_EXC_OVERLAP;
/** The fraction of aligned bases that were filtered out because they would have raised coverage above the capped value (default cap = 250x). */
@NoMergingIsDerived
public double PCT_EXC_CAPPED;
/** The total fraction of aligned bases excluded due to all filters. */
@NoMergingIsDerived
public double PCT_EXC_TOTAL;
/** The fraction of bases that attained at least 1X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_1X;
/** The fraction of bases that attained at least 5X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_5X;
/** The fraction of bases that attained at least 10X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_10X;
/** The fraction of bases that attained at least 15X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_15X;
/** The fraction of bases that attained at least 20X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_20X;
/** The fraction of bases that attained at least 25X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_25X;
/** The fraction of bases that attained at least 30X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_30X;
/** The fraction of bases that attained at least 40X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_40X;
/** The fraction of bases that attained at least 50X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_50X;
/** The fraction of bases that attained at least 60X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_60X;
/** The fraction of bases that attained at least 70X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_70X;
/** The fraction of bases that attained at least 80X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_80X;
/** The fraction of bases that attained at least 90X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_90X;
/** The fraction of bases that attained at least 100X sequence coverage in post-filtering bases. */
@NoMergingIsDerived
public double PCT_100X;
/** The theoretical HET SNP sensitivity. */
@NoMergingIsDerived
public double HET_SNP_SENSITIVITY;
/** The Phred Scaled Q Score of the theoretical HET SNP sensitivity. */
@NoMergingIsDerived
public double HET_SNP_Q;
/**
 * Merges the various PCT_EXC_* metrics.
 * @param other metric to merge into this one.
 *
 * @return result of merging, also known as "this"
 * @throws PicardException if either side lacks a depth histogram, or if the two interval sets overlap.
 */
@Override
public MergeableMetricBase merge(final MergeableMetricBase other) {
    final WgsMetrics otherMetric = (WgsMetrics) other;
    // Merging requires both depth histograms on both sides; non-mergeable instances have them null.
    if (highQualityDepthHistogram == null || otherMetric.highQualityDepthHistogram == null ||
            unfilteredDepthHistogram == null || otherMetric.unfilteredDepthHistogram == null) {
        throw new PicardException("Depth histogram is required when deriving metrics.");
    }
    // Union the intervals over which bases are called. They should have no overlaps!
    // NB: interval lists are already uniqued.
    final long genomeTerritory = this.intervals.getBaseCount() + otherMetric.intervals.getBaseCount();
    this.intervals.addall(otherMetric.intervals.getIntervals());
    this.intervals = this.intervals.uniqued();
    // If uniquing the union shrank the base count, the two interval sets overlapped somewhere.
    if (this.intervals.getBaseCount() != genomeTerritory) {
        throw new PicardException("Trying to merge WgsMetrics calculated on intervals that overlap.");
    }
    // NB:
    // Since: PCT_EXC_TOTAL = (totalWithExcludes - thisMetricTotal) / totalWithExcludes;
    // Thus: totalWithExcludes = total / (1 - PCT_EXC_TOTAL)
    // Proof: Exercise is left to the reader.
    final long thisMetricTotal = (long) highQualityDepthHistogram.getSum();
    final long otherMetricTotal = (long) otherMetric.highQualityDepthHistogram.getSum();
    final long total = thisMetricTotal + otherMetricTotal;
    // Reconstruct each side's pre-filtering base count from its post-filtering total and PCT_EXC_TOTAL.
    final long thisTotalWithExcludes = (long) (thisMetricTotal / (1.0 - PCT_EXC_TOTAL));
    final long otherTotalWithExcludes = (long) (otherMetricTotal / (1.0 - otherMetric.PCT_EXC_TOTAL));
    final double totalWithExcludes = thisTotalWithExcludes + otherTotalWithExcludes;
    if (0 < totalWithExcludes) {
        // Each exclusion fraction is re-weighted by its side's pre-filtering base count.
        PCT_EXC_DUPE = (PCT_EXC_DUPE * thisTotalWithExcludes + otherMetric.PCT_EXC_DUPE * otherTotalWithExcludes) / totalWithExcludes;
        PCT_EXC_MAPQ = (PCT_EXC_MAPQ * thisTotalWithExcludes + otherMetric.PCT_EXC_MAPQ * otherTotalWithExcludes) / totalWithExcludes;
        PCT_EXC_UNPAIRED = (PCT_EXC_UNPAIRED * thisTotalWithExcludes + otherMetric.PCT_EXC_UNPAIRED * otherTotalWithExcludes) / totalWithExcludes;
        PCT_EXC_BASEQ = (PCT_EXC_BASEQ * thisTotalWithExcludes + otherMetric.PCT_EXC_BASEQ * otherTotalWithExcludes) / totalWithExcludes;
        PCT_EXC_OVERLAP = (PCT_EXC_OVERLAP * thisTotalWithExcludes + otherMetric.PCT_EXC_OVERLAP * otherTotalWithExcludes) / totalWithExcludes;
        PCT_EXC_CAPPED = (PCT_EXC_CAPPED * thisTotalWithExcludes + otherMetric.PCT_EXC_CAPPED * otherTotalWithExcludes) / totalWithExcludes;
        PCT_EXC_TOTAL = (totalWithExcludes - total) / totalWithExcludes;
    }
    // do any merging that are dictated by the annotations.
    super.merge(other);
    // merge the histograms
    highQualityDepthHistogram.addHistogram(otherMetric.highQualityDepthHistogram);
    unfilteredDepthHistogram.addHistogram(otherMetric.unfilteredDepthHistogram);
    // The baseQ histogram is optional; merge it only when both sides carry one.
    if (unfilteredBaseQHistogram != null && otherMetric.unfilteredBaseQHistogram != null)
        unfilteredBaseQHistogram.addHistogram(otherMetric.unfilteredBaseQHistogram);
    return this;
}
/**
 * Recomputes all derived (NoMergingIsDerived) fields from the histograms:
 * genome territory, coverage summary statistics, the PCT_nX ladder, and (when a
 * baseQ histogram is present) the theoretical het sensitivity.
 *
 * @throws PicardException if the depth histograms are absent, or if a baseQ histogram
 *         is present without a positive sample size.
 */
@Override
public void calculateDerivedFields() {
    if (highQualityDepthHistogram == null || unfilteredDepthHistogram == null) throw new PicardException("Depth histogram is required when deriving metrics.");
    if (unfilteredBaseQHistogram != null && theoreticalHetSensitivitySampleSize <= 0) {
        throw new PicardException("Sample size is required when a baseQ histogram is given when deriving metrics.");
    }
    // Flatten the histogram into an array indexed by depth, folding depths above coverageCap into the top bin.
    final long[] depthHistogramArray = new long[coverageCap+1];
    for (final Histogram.Bin<Integer> bin : highQualityDepthHistogram.values()) {
        final int depth = Math.min((int) bin.getIdValue(), coverageCap);
        depthHistogramArray[depth] += bin.getValue();
    }
    // Histogram values are counts of loci, so their sum is the number of assessed positions.
    GENOME_TERRITORY = (long) highQualityDepthHistogram.getSumOfValues();
    MEAN_COVERAGE = highQualityDepthHistogram.getMean();
    SD_COVERAGE = highQualityDepthHistogram.getStandardDeviation();
    MEDIAN_COVERAGE = highQualityDepthHistogram.getMedian();
    MAD_COVERAGE = highQualityDepthHistogram.getMedianAbsoluteDeviation();
    // PCT_nX = fraction of territory with depth >= n (summing the tail of the depth array).
    PCT_1X = MathUtil.sum(depthHistogramArray, 1, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_5X = MathUtil.sum(depthHistogramArray, 5, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_10X = MathUtil.sum(depthHistogramArray, 10, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_15X = MathUtil.sum(depthHistogramArray, 15, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_20X = MathUtil.sum(depthHistogramArray, 20, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_25X = MathUtil.sum(depthHistogramArray, 25, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_30X = MathUtil.sum(depthHistogramArray, 30, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_40X = MathUtil.sum(depthHistogramArray, 40, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_50X = MathUtil.sum(depthHistogramArray, 50, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_60X = MathUtil.sum(depthHistogramArray, 60, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_70X = MathUtil.sum(depthHistogramArray, 70, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_80X = MathUtil.sum(depthHistogramArray, 80, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_90X = MathUtil.sum(depthHistogramArray, 90, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    PCT_100X = MathUtil.sum(depthHistogramArray, 100, depthHistogramArray.length) / (double) GENOME_TERRITORY;
    // Get Theoretical Het SNP Sensitivity
    // (The unfilteredDepthHistogram null check is already guaranteed by the guard at the top of this method.)
    if (unfilteredBaseQHistogram != null && unfilteredDepthHistogram != null) {
        final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(unfilteredDepthHistogram);
        final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(unfilteredBaseQHistogram);
        HET_SNP_SENSITIVITY = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, LOG_ODDS_THRESHOLD);
        HET_SNP_Q = QualityUtil.getPhredScoreFromErrorProbability((1 - HET_SNP_SENSITIVITY));
    }
}
}
/** Stock main method: delegates to the standard Picard command-line machinery. */
public static void main(final String[] args) {
    final CollectWgsMetrics program = new CollectWgsMetrics();
    program.instanceMainWithExit(args);
}
/**
 * Gets the SamReader from which records will be examined. This also caches the file header
 * so that it is subsequently available via {@link #getSamFileHeader()}.
 *
 * @return an open SamReader over INPUT; the caller is responsible for closing it.
 */
protected SamReader getSamReader() {
    final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
    this.header = in.getFileHeader(); // cache for getSamFileHeader()
    return in;
}
/**
 * Main work method: validates inputs, builds the locus iterator and collector according to the
 * configured algorithm, processes every locus, and writes the resulting metrics file.
 *
 * @return 0 on success.
 */
@Override
protected int doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    if (INTERVALS != null) {
        IOUtil.assertFileIsReadable(INTERVALS);
    }
    // it doesn't make sense for the locus accumulation cap to be lower than the coverage cap
    if (LOCUS_ACCUMULATION_CAP < COVERAGE_CAP) {
        log.warn("Setting the LOCUS_ACCUMULATION_CAP to be equal to the COVERAGE_CAP (" + COVERAGE_CAP + ") because it should not be lower");
        LOCUS_ACCUMULATION_CAP = COVERAGE_CAP;
    }
    // Setup all the inputs
    final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci");
    final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final SamReader in = getSamReader();
    try {
        final AbstractLocusIterator iterator = getLocusIterator(in);
        final List<SamRecordFilter> filters = new ArrayList<>();
        final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY);
        final CountingFilter dupeFilter = new CountingDuplicateFilter();
        final CountingPairedFilter pairFilter = new CountingPairedFilter();
        // The order in which filters are added matters!
        filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice
        filters.add(mapqFilter);
        filters.add(dupeFilter);
        if (!COUNT_UNPAIRED) {
            filters.add(pairFilter);
        }
        iterator.setSamFilters(filters);
        iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases
        iterator.setIncludeNonPfReads(false);
        final AbstractWgsMetricsCollector collector = getCollector(COVERAGE_CAP, getIntervalsToExamine());
        final WgsMetricsProcessor processor = getWgsMetricsProcessor(progress, refWalker, iterator, collector);
        processor.processFile();
        final MetricsFile<WgsMetrics, Integer> out = getMetricsFile();
        processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, mapqFilter, pairFilter);
        out.write(OUTPUT);
        return 0;
    } finally {
        // Fix: the SAM reader and reference walker were previously never closed (resource leak).
        // CloserUtil swallows the checked IOException thrown by close().
        CloserUtil.close(in);
        CloserUtil.close(refWalker);
    }
}
/**
 * Builds the processor that drives the locus iterator, walks the reference in step with it,
 * and feeds each locus to the given collector.
 */
private <T extends AbstractRecordAndOffset> WgsMetricsProcessorImpl<T> getWgsMetricsProcessor(
        final ProgressLogger progress, final ReferenceSequenceFileWalker refWalker,
        final AbstractLocusIterator<T, AbstractLocusInfo<T>> iterator, final AbstractWgsMetricsCollector<T> collector) {
    final WgsMetricsProcessorImpl<T> processor =
            new WgsMetricsProcessorImpl<>(iterator, refWalker, collector, progress);
    return processor;
}
/**
 * Gets the intervals over which we will calculate metrics: the INTERVALS file when one was
 * supplied, otherwise one interval spanning each contig in the input's sequence dictionary.
 */
protected IntervalList getIntervalsToExamine() {
    if (INTERVALS != null) {
        // An explicit interval list restricts the assessment to those positions.
        IOUtil.assertFileIsReadable(INTERVALS);
        return IntervalList.fromFile(INTERVALS);
    }
    // No intervals supplied: assess every position of every contig in the dictionary.
    final IntervalList wholeGenome = new IntervalList(this.header);
    for (final SAMSequenceRecord sequenceRecord : this.header.getSequenceDictionary().getSequences()) {
        wholeGenome.add(new Interval(sequenceRecord.getSequenceName(), 1, sequenceRecord.getSequenceLength()));
    }
    return wholeGenome;
}
/**
 * Returns the header of the input file. This method should only be called after
 * {@link #getSamReader()} has been called, since that is what populates the header.
 *
 * @throws IllegalStateException if the header has not yet been read.
 */
protected SAMFileHeader getSamFileHeader() {
    if (this.header == null) throw new IllegalStateException("getSamFileHeader() was called but this.header is null");
    return this.header;
}
/**
 * Creates a {@link WgsMetrics} instance from already-computed exclusion fractions.
 * Subclasses may override this to return a metrics subclass.
 *
 * @param intervals the intervals over which the metric was computed.
 * @param highQualityDepthHistogram depth histogram excluding bases below MINIMUM_BASE_QUALITY.
 * @param unfilteredDepthHistogram depth histogram including all but quality 2 bases.
 * @param pctExcludedByMapq fraction of aligned bases excluded for low mapping quality.
 * @param pctExcludedByDupes fraction of aligned bases excluded as duplicates.
 * @param pctExcludedByPairing fraction of bases excluded for lacking a mapped mate pair.
 * @param pctExcludedByBaseq fraction of aligned bases excluded for low base quality.
 * @param pctExcludedByOverlap fraction of aligned bases excluded as overlapping-mate observations.
 * @param pctExcludedByCapping fraction of aligned bases excluded by the coverage cap.
 * @param pctTotal total fraction of aligned bases excluded across all filters.
 * @param coverageCap the maximum depth/coverage to consider.
 * @param unfilteredBaseQHistogram base-quality histogram including all but quality 2 bases.
 * @param theoreticalHetSensitivitySampleSize sample size for theoretical het sensitivity sampling.
 * @return a freshly constructed (mergeable) WgsMetrics.
 */
protected WgsMetrics generateWgsMetrics(final IntervalList intervals,
                                        final Histogram<Integer> highQualityDepthHistogram,
                                        final Histogram<Integer> unfilteredDepthHistogram,
                                        final double pctExcludedByMapq,
                                        final double pctExcludedByDupes,
                                        final double pctExcludedByPairing,
                                        final double pctExcludedByBaseq,
                                        final double pctExcludedByOverlap,
                                        final double pctExcludedByCapping,
                                        final double pctTotal,
                                        final int coverageCap,
                                        final Histogram<Integer> unfilteredBaseQHistogram,
                                        final int theoreticalHetSensitivitySampleSize) {
    return new WgsMetrics(
            intervals,
            highQualityDepthHistogram,
            unfilteredDepthHistogram,
            pctExcludedByMapq,
            pctExcludedByDupes,
            pctExcludedByPairing,
            pctExcludedByBaseq,
            pctExcludedByOverlap,
            pctExcludedByCapping,
            pctTotal,
            coverageCap,
            unfilteredBaseQHistogram,
            theoreticalHetSensitivitySampleSize
    );
}
/**
 * Creates a {@link WgsMetrics} from raw excluded-base counts, converting each count into a
 * fraction of the total bases observed (retained + excluded across all filters).
 * Delegates construction to the fraction-based overload.
 *
 * @param intervals the intervals over which the metric was computed.
 * @param highQualityDepthHistogram depth histogram excluding bases below MINIMUM_BASE_QUALITY.
 * @param unfilteredDepthHistogram depth histogram including all but quality 2 bases.
 * @param basesExcludedByMapq count of aligned bases excluded for low mapping quality.
 * @param basesExcludedByDupes count of aligned bases excluded as duplicates.
 * @param basesExcludedByPairing count of bases excluded for lacking a mapped mate pair.
 * @param basesExcludedByBaseq count of aligned bases excluded for low base quality.
 * @param basesExcludedByOverlap count of aligned bases excluded as overlapping-mate observations.
 * @param basesExcludedByCapping count of aligned bases excluded by the coverage cap.
 * @param coverageCap the maximum depth/coverage to consider.
 * @param unfilteredBaseQHistogram base-quality histogram including all but quality 2 bases.
 * @param theoreticalHetSensitivitySampleSize sample size for theoretical het sensitivity sampling.
 * @return a freshly constructed (mergeable) WgsMetrics.
 */
WgsMetrics generateWgsMetrics(final IntervalList intervals,
                              final Histogram<Integer> highQualityDepthHistogram,
                              final Histogram<Integer> unfilteredDepthHistogram,
                              final long basesExcludedByMapq,
                              final long basesExcludedByDupes,
                              final long basesExcludedByPairing,
                              final long basesExcludedByBaseq,
                              final long basesExcludedByOverlap,
                              final long basesExcludedByCapping,
                              final int coverageCap,
                              final Histogram<Integer> unfilteredBaseQHistogram,
                              final int theoreticalHetSensitivitySampleSize) {
    final double total = highQualityDepthHistogram.getSum();
    final double totalWithExcludes = total + basesExcludedByDupes + basesExcludedByMapq + basesExcludedByPairing +
            basesExcludedByBaseq + basesExcludedByOverlap + basesExcludedByCapping;
    return generateWgsMetrics(
            intervals,
            highQualityDepthHistogram,
            unfilteredDepthHistogram,
            fractionOf(basesExcludedByMapq, totalWithExcludes),
            fractionOf(basesExcludedByDupes, totalWithExcludes),
            fractionOf(basesExcludedByPairing, totalWithExcludes),
            fractionOf(basesExcludedByBaseq, totalWithExcludes),
            fractionOf(basesExcludedByOverlap, totalWithExcludes),
            fractionOf(basesExcludedByCapping, totalWithExcludes),
            fractionOf(totalWithExcludes - total, totalWithExcludes),
            coverageCap,
            unfilteredBaseQHistogram,
            theoreticalHetSensitivitySampleSize
    );
}

/** Returns numerator/denominator, or 0 when the denominator is 0 (avoids NaN when no bases were observed). */
private static double fractionOf(final double numerator, final double denominator) {
    return denominator == 0 ? 0 : numerator / denominator;
}
/**
 * If INTERVALS is specified, this will count bases beyond the interval list when the read overlaps the intervals and extends beyond the
 * edge. Ideally INTERVALS should only include regions that have hard edges without reads that could extend beyond the boundary (such as a whole contig).
 *
 * @param filter the counting filter to interrogate.
 * @return the number of aligned bases that the given filter removed from consideration.
 */
protected long getBasesExcludedBy(final CountingFilter filter) {
    return filter.getFilteredBases();
}
/**
 * Creates the {@link htsjdk.samtools.util.AbstractLocusIterator} implementation matching the
 * {@link #USE_FAST_ALGORITHM} setting.
 *
 * @param in inner {@link htsjdk.samtools.SamReader}
 * @return an {@link htsjdk.samtools.util.EdgeReadIterator} when {@link #USE_FAST_ALGORITHM} is enabled,
 *         otherwise a {@link htsjdk.samtools.util.SamLocusIterator} configured for the default algorithm.
 */
protected AbstractLocusIterator getLocusIterator(final SamReader in) {
    final IntervalList intervalList = (INTERVALS == null) ? null : IntervalList.fromFile(INTERVALS);
    if (USE_FAST_ALGORITHM) {
        return (intervalList == null) ? new EdgeReadIterator(in) : new EdgeReadIterator(in, intervalList);
    }
    final SamLocusIterator locusIterator =
            (intervalList == null) ? new SamLocusIterator(in) : new SamLocusIterator(in, intervalList);
    // Bound per-locus accumulation to keep memory in check; base/mapping quality filtering
    // is handled by the counting filters and the collector, so the iterator's cutoff is 0.
    locusIterator.setMaxReadsToAccumulatePerLocus(LOCUS_ACCUMULATION_CAP);
    locusIterator.setEmitUncoveredLoci(true);
    locusIterator.setQualityScoreCutoff(0);
    return locusIterator;
}
/**
 * Creates the {@link picard.analysis.AbstractWgsMetricsCollector} implementation matching the
 * {@link #USE_FAST_ALGORITHM} setting.
 *
 * @param coverageCap the maximum depth/coverage to consider.
 * @param intervals the intervals over which metrics are collected.
 * @return a {@link picard.analysis.FastWgsMetricsCollector} when {@link #USE_FAST_ALGORITHM} is enabled,
 *         otherwise a {@link picard.analysis.CollectWgsMetrics.WgsMetricsCollector}.
 */
protected AbstractWgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) {
    if (USE_FAST_ALGORITHM) {
        return new FastWgsMetricsCollector(this, coverageCap, intervals);
    }
    return new WgsMetricsCollector(this, coverageCap, intervals);
}
/**
 * Default (non-fast) per-locus collector: tallies depth from a SamLocusIterator pileup while
 * excluding low-quality bases, overlapping mates, and depth above the coverage cap.
 */
protected static class WgsMetricsCollector extends AbstractWgsMetricsCollector<SamLocusIterator.RecordAndOffset> {
    public WgsMetricsCollector(final CollectWgsMetrics metrics, final int coverageCap, final IntervalList intervals) {
        super(metrics, coverageCap, intervals);
    }
    @Override
    public void addInfo(final AbstractLocusInfo<SamLocusIterator.RecordAndOffset> info, final ReferenceSequence ref, boolean referenceBaseN) {
        // Reference N positions are outside the genome territory and contribute nothing.
        if (referenceBaseN) {
            return;
        }
        // Figure out the coverage while not counting overlapping reads twice, and excluding various things
        final HashSet<String> readNames = new HashSet<>(info.getRecordAndOffsets().size());
        int pileupSize = 0;      // high-quality, non-overlapping depth at this locus
        int unfilteredDepth = 0; // "raw" depth: counts every base with quality > 2, up to the cap
        for (final SamLocusIterator.RecordAndOffset recs : info.getRecordAndOffsets()) {
            // Quality <= 2 bases are excluded from everything, including the unfiltered counts.
            if (recs.getBaseQuality() <= 2) { ++basesExcludedByBaseq; continue; }
            // we add to the base quality histogram any bases that have quality > 2
            // the raw depth may exceed the coverageCap before the high-quality depth does. So stop counting once we reach the coverage cap.
            if (unfilteredDepth < coverageCap) {
                unfilteredBaseQHistogramArray[recs.getRecord().getBaseQualities()[recs.getOffset()]]++;
                unfilteredDepth++;
            }
            // Below-threshold and no-call bases are excluded from the high-quality counts only.
            if (recs.getBaseQuality() < collectWgsMetrics.MINIMUM_BASE_QUALITY ||
                    SequenceUtil.isNoCall(recs.getReadBase())) { ++basesExcludedByBaseq; continue; }
            // A repeated read name means the mate overlaps this locus; count the base only once.
            if (!readNames.add(recs.getRecord().getReadName())) { ++basesExcludedByOverlap; continue; }
            pileupSize++;
        }
        // Cap the high-quality depth and account for the bases the cap discarded.
        final int highQualityDepth = Math.min(pileupSize, coverageCap);
        if (highQualityDepth < pileupSize) basesExcludedByCapping += pileupSize - coverageCap;
        highQualityDepthHistogramArray[highQualityDepth]++;
        unfilteredDepthHistogramArray[unfilteredDepth]++;
    }
}
}