package picard.vcf;
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextUtils;
import htsjdk.variant.vcf.VCFHeader;
import picard.util.DbSnpBitSetUtil;
import picard.vcf.processor.VariantProcessor;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static picard.vcf.CollectVariantCallingMetrics.VariantCallingDetailMetrics;
import static picard.vcf.CollectVariantCallingMetrics.VariantCallingSummaryMetrics;
/**
* Collects variants and generates metrics about them. To use, construct, call {@link #setup(VCFHeader)} once, then
* {@link #accumulate(htsjdk.variant.variantcontext.VariantContext)} as desired, then call {@link #result()}.
*
* @author mccowan
*/
public class CallingMetricAccumulator implements VariantProcessor.Accumulator<CallingMetricAccumulator.Result> {
public static class Result {
final VariantCallingSummaryMetrics summary;
final Collection<VariantCallingDetailMetrics> details;
Result(final VariantCallingSummaryMetrics summary, final Collection<VariantCallingDetailMetrics> details) {
this.summary = summary;
this.details = details;
}
public static Result merge(final Collection<Result> results) {
final Collection<VariantCallingDetailMetrics> details = new ArrayList<>();
final Collection<VariantCallingSummaryMetrics> summaries = new ArrayList<>();
results.stream().forEach(result -> {
summaries.add(result.summary);
details.addAll(result.details);
});
final Map<String, List<VariantCallingDetailMetrics>> sampleDetailsMap =
details.stream().collect(Collectors.groupingBy(vcDetailMetrics -> vcDetailMetrics.SAMPLE_ALIAS));
final Collection<CollectVariantCallingMetrics.VariantCallingDetailMetrics> collapsedDetails = new ArrayList<>();
sampleDetailsMap.values().stream().forEach(sampleDetails -> {
final VariantCallingDetailMetrics collapsed = new VariantCallingDetailMetrics();
VariantCallingDetailMetrics.foldInto(collapsed, sampleDetails);
collapsedDetails.add(collapsed);
collapsed.calculateDerivedFields();
});
final VariantCallingSummaryMetrics collapsedSummary = new VariantCallingSummaryMetrics();
VariantCallingSummaryMetrics.foldInto(collapsedSummary, summaries);
collapsedSummary.calculateDerivedFields();
return new Result(collapsedSummary, collapsedDetails);
}
}
private static final Log LOG = Log.getInstance(CallingMetricAccumulator.class);
private static final ProgressLogger progress = new ProgressLogger(LOG, 10000);
private final DbSnpBitSetUtil.DbSnpBitSets dbsnp;
private final VariantCallingSummaryMetrics summaryMetric = new VariantCallingSummaryMetrics();
/**
* A map of sample names to metrics. If .get() for a not-yet-existing sample name, a metric is generated, inserted into the map,
* then returned.
*/
private final CollectionUtil.DefaultingMap<String, VariantCallingDetailMetrics> sampleMetricsMap =
new CollectionUtil.DefaultingMap<>(
sampleName -> {
final VariantCallingDetailMetrics detail = new VariantCallingDetailMetrics();
detail.SAMPLE_ALIAS = sampleName;
return detail;
}, true);
public CallingMetricAccumulator(final DbSnpBitSetUtil.DbSnpBitSets dbsnp) {
this.dbsnp = dbsnp;
}
public void setup(final VCFHeader vcfHeader) {
//noop.
}
/** Incorporates the provided variant's data into the metric analysis. */
@Override
public void accumulate(final VariantContext vc) {
progress.record(vc.getContig(), vc.getStart());
if (!isVariantExcluded(vc)) {
final String singletonSample = getSingletonSample(vc);
updateSummaryMetric(summaryMetric, null, vc, singletonSample != null); // The summary metric has no genotype.
vc.getSampleNames().stream()
.filter(sampleName -> !vc.getGenotype(sampleName).isHomRef())
.forEach(sampleName ->
updateDetailMetric(sampleMetricsMap.get(sampleName), vc.getGenotype(sampleName), vc,
sampleName.equals(singletonSample)));
}
}
/**
* @return Sample name if there is only one sample that contains alternate allele(s), else null if either multiple samples that
* are not homref, or no samples that are not homref.
*/
protected static String getSingletonSample(final VariantContext vc) {
// peek can only change effectively final variables...workaround
final String[] sampleName = new String[1];
if (vc.getGenotypes()
.stream()
// look at het or homVar genotypes
.filter(genotype -> genotype.isHet() || genotype.isHomVar())
// two such genotypes will be enough
.limit(2)
//get any of the sample names
.peek(genotype -> sampleName[0] = genotype.getSampleName())
//map to the number of variant chromosomes
.mapToInt(genotype -> genotype.isHet() ? 1 : 2)
//add them up
.reduce(Integer::sum)
// compare to 1 with 0 as default
.orElse(0) == 1) {
return sampleName[0];
} else {
return null;
}
}
public Result result() {
final Collection<VariantCallingDetailMetrics> values = sampleMetricsMap.values();
values.forEach(CollectVariantCallingMetrics.VariantCallingDetailMetrics::calculateDerivedFields);
summaryMetric.calculateDerivedFields();
return new Result(summaryMetric, values);
}
/** Returns true if the variant is --NOT-- interesting enough to be included in metrics calculations. */
static private boolean isVariantExcluded(final VariantContext vc) {
// If the entire record is not a variant, exclude it
return !vc.isVariant() || vc.getGenotypes().stream().allMatch(Genotype::isHomRef);
}
private void updateDetailMetric(final VariantCallingDetailMetrics metric,
final Genotype genotype,
final VariantContext vc,
final boolean hasSingletonSample) {
updateSummaryMetric(metric, genotype, vc, hasSingletonSample);
if (genotype != null && !vc.isFiltered()) {
if (genotype.getGQ() == 0) {
++metric.TOTAL_GQ0_VARIANTS;
}
if (genotype.isHet()) {
++metric.numHets;
} else if (genotype.isHomVar()) {
++metric.numHomVar;
}
}
}
/** Amends the provided metric with the data in the provided variant. Also amends the summary metric re: reference bias. */
private void updateSummaryMetric(final VariantCallingSummaryMetrics metric,
final Genotype genotype,
final VariantContext vc,
final boolean hasSingletonSample) {
// If this sample's genotype doesn't have any variation, exclude it
if (genotype != null && genotype.isNoCall()) return;
// Tally up the filtered SNPs & indels, then exit. The other metrics shouldn't be
// computed on low-confidence calls.
if (vc.isFiltered()) {
if (vc.isSNP()) metric.FILTERED_SNPS++;
else if (vc.isIndel()) metric.FILTERED_INDELS++;
return;
}
if (hasSingletonSample) {
++metric.NUM_SINGLETONS;
}
if (vc.isBiallelic() && vc.isSNP()) {
// Biallelic SNPs
final boolean isInDbSnp = dbsnp.snps.isDbSnpSite(vc.getContig(), vc.getStart());
final boolean isTransition = VariantContextUtils.isTransition(vc);
metric.TOTAL_SNPS++;
if (isInDbSnp) {
metric.NUM_IN_DB_SNP++;
if (isTransition) metric.dbSnpTransitions++;
else metric.dbSnpTransversions++;
} else {
if (isTransition) metric.novelTransitions++;
else metric.novelTransversions++;
}
// Calculate reference bias numbers. Note: genotype == null for summary metric, so this block won't be called when metric ==
// summaryMetric.
if (genotype != null && genotype.isHet()) { //
final int[] alleleDepths = genotype.getAD();
/*
* Null check: work around GATK issue in which some biallelic sites are missing allele depth. This should affect only ~1%
* of samples and should not have a significant impact on the reference bias calculation.
*/
if (alleleDepths != null) {
final int indexOfRef = vc.getAlleleIndex(vc.getReference());
final int indexOfAlt = (indexOfRef + 1) % 2;
metric.refAlleleObs += alleleDepths[indexOfRef];
metric.altAlleleObs += alleleDepths[indexOfAlt];
// Always count these values for summary metrics.
summaryMetric.refAlleleObs += alleleDepths[indexOfRef];
summaryMetric.altAlleleObs += alleleDepths[indexOfAlt];
} else {
LOG.debug("Skipping aggregation of genotype due to missing allele depth data: ", genotype, ".");
}
}
} else if (vc.isSNP() && vc.getAlternateAlleles().size() > 1) {
// Multiallelic SNPs
metric.TOTAL_MULTIALLELIC_SNPS++;
if (dbsnp.snps.isDbSnpSite(vc.getContig(), vc.getStart())) metric.NUM_IN_DB_SNP_MULTIALLELIC++;
} else if (vc.isIndel() && !vc.isComplexIndel()) {
// Simple Indels
final boolean isInDbSnp = dbsnp.indels.isDbSnpSite(vc.getContig(), vc.getStart());
final boolean isInsertion = vc.isSimpleInsertion();
metric.TOTAL_INDELS++;
if (isInDbSnp) {
metric.NUM_IN_DB_SNP_INDELS++;
if (isInsertion) metric.dbSnpInsertions++;
else metric.dbSnpDeletions++;
} else {
if (isInsertion) metric.novelInsertions++;
else {
metric.novelDeletions++;
}
}
} else if (vc.isComplexIndel()) {
// Complex Indels
metric.TOTAL_COMPLEX_INDELS++;
if (dbsnp.indels.isDbSnpSite(vc.getContig(), vc.getStart())) metric.NUM_IN_DB_SNP_COMPLEX_INDELS++;
}
}
}