/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.vcf; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.Log; import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; import htsjdk.variant.vcf.VCFFileReader; import htsjdk.variant.vcf.VCFHeader; import picard.analysis.MergeableMetricBase; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.Metrics; import picard.util.DbSnpBitSetUtil; import picard.vcf.processor.VariantProcessor; import java.io.File; import java.util.Collection; import java.util.Optional; /** Collects summary and per-sample metrics about variant calls in a VCF file. */ @CommandLineProgramProperties( usage = "Collects per-sample and aggregate (spanning all samples) metrics from the provided VCF file.", usageShort = "Collects per-sample and aggregate (spanning all samples) metrics from the provided VCF file", programGroup = Metrics.class ) public class CollectVariantCallingMetrics extends CommandLineProgram { @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input vcf file for analysis") public File INPUT; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Path (except for the file extension) of output metrics files " + "to write.") public File OUTPUT; @Option(doc = "Reference dbSNP file in dbSNP or VCF format.") public File DBSNP; @Option(shortName = "TI", doc = "Target intervals to restrict analysis to.", optional = true) public File TARGET_INTERVALS; @Option(shortName = StandardOptionDefinitions.SEQUENCE_DICTIONARY_SHORT_NAME, optional = true, doc = "If present, speeds loading of dbSNP file, will look for dictionary in vcf if not present here.") public File SEQUENCE_DICTIONARY = null; @Option(doc = "Set to true if running on a single-sample gvcf.", optional = true) public boolean GVCF_INPUT = false; @Option public int THREAD_COUNT = 1; private final Log log = Log.getInstance(CollectVariantCallingMetrics.class); public static void main(final String[] args) { new CollectVariantCallingMetrics().instanceMainWithExit(args); } @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsReadable(DBSNP); if (TARGET_INTERVALS != null) IOUtil.assertFileIsReadable(TARGET_INTERVALS); if (SEQUENCE_DICTIONARY != null) IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY); final boolean requiresIndex = this.TARGET_INTERVALS != null || this.THREAD_COUNT > 1; final VCFFileReader variantReader = new VCFFileReader(INPUT, requiresIndex); final VCFHeader vcfHeader = variantReader.getFileHeader(); CloserUtil.close(variantReader); final SAMSequenceDictionary sequenceDictionary = SAMSequenceDictionaryExtractor.extractDictionary(SEQUENCE_DICTIONARY == null ? INPUT : SEQUENCE_DICTIONARY); final IntervalList targetIntervals = (TARGET_INTERVALS == null) ? null : IntervalList.fromFile(TARGET_INTERVALS).uniqued(); log.info("Loading dbSNP file ..."); final DbSnpBitSetUtil.DbSnpBitSets dbsnp = DbSnpBitSetUtil.createSnpAndIndelBitSets(DBSNP, sequenceDictionary, targetIntervals, Optional.of(log)); log.info("Starting iteration of variants."); final VariantProcessor.Builder<CallingMetricAccumulator, CallingMetricAccumulator.Result> builder = VariantProcessor.Builder .generatingAccumulatorsBy(() -> { CallingMetricAccumulator accumulator = GVCF_INPUT ? new GvcfMetricAccumulator(dbsnp) : new CallingMetricAccumulator(dbsnp); accumulator.setup(vcfHeader); return accumulator; }) .combiningResultsBy(CallingMetricAccumulator.Result::merge) .withInput(INPUT) .multithreadingBy(THREAD_COUNT); if (targetIntervals != null) { builder.limitingProcessedRegionsTo(targetIntervals); } final CallingMetricAccumulator.Result result = builder.build().process(); // Fetch and write the metrics. final MetricsFile<CollectVariantCallingMetrics.VariantCallingDetailMetrics, Integer> detail = getMetricsFile(); final MetricsFile<CollectVariantCallingMetrics.VariantCallingSummaryMetrics, Integer> summary = getMetricsFile(); summary.addMetric(result.summary); result.details.forEach(detail::addMetric); final String outputPrefix = OUTPUT.getAbsolutePath() + "."; detail.write(new File(outputPrefix + CollectVariantCallingMetrics.VariantCallingDetailMetrics.getFileExtension())); summary.write(new File(outputPrefix + CollectVariantCallingMetrics.VariantCallingSummaryMetrics.getFileExtension())); return 0; } /** A collection of metrics relating to snps and indels within a variant-calling file (VCF). */ public static class VariantCallingSummaryMetrics extends MergeableMetricBase { /** The number of passing bi-allelic SNPs calls (i.e. non-reference genotypes) that were examined */ @MergeByAdding public long TOTAL_SNPS; /** The number of passing bi-allelic SNPs found in dbSNP */ @MergeByAdding public long NUM_IN_DB_SNP; /** The number of passing bi-allelic SNPS called that were not found in dbSNP */ @MergeByAdding public long NOVEL_SNPS; /** The number of SNPs that are filtered */ @MergeByAdding public long FILTERED_SNPS; /** The fraction of passing bi-allelic SNPs in dbSNP */ @NoMergingIsDerived public float PCT_DBSNP; /** The Transition/Transversion ratio of the passing bi-allelic SNP calls made at dbSNP sites */ @NoMergingIsDerived public double DBSNP_TITV; /** The Transition/Transversion ratio of the passing bi-allelic SNP calls made at non-dbSNP sites */ @NoMergingIsDerived public double NOVEL_TITV; /** The number of passing indel calls that were examined */ @MergeByAdding public long TOTAL_INDELS; /** The number of passing indels called that were not found in dbSNP */ @MergeByAdding public long NOVEL_INDELS; /** The number of indels that are filtered */ @MergeByAdding public long FILTERED_INDELS; /** The fraction of passing indels in dbSNP */ @NoMergingIsDerived public float PCT_DBSNP_INDELS; /** The number of passing indels found in dbSNP */ @MergeByAdding public long NUM_IN_DB_SNP_INDELS; /** The Insertion/Deletion ratio of the indel calls made at dbSNP sites */ @NoMergingIsDerived public double DBSNP_INS_DEL_RATIO; /** The Insertion/Deletion ratio of the indel calls made at non-dbSNP sites */ @NoMergingIsDerived public double NOVEL_INS_DEL_RATIO; /** The number of passing multi-allelic SNP calls that were examined */ @MergeByAdding public double TOTAL_MULTIALLELIC_SNPS; /** The number of passing multi-allelic SNPs found in dbSNP */ @MergeByAdding public double NUM_IN_DB_SNP_MULTIALLELIC; /** The number of passing complex indel calls that were examined */ @MergeByAdding public double TOTAL_COMPLEX_INDELS; /** The number of passing complex indels found in dbSNP */ @MergeByAdding public double NUM_IN_DB_SNP_COMPLEX_INDELS; /** The rate at which reference bases are observed at ref/alt heterozygous SNP sites. */ @NoMergingIsDerived public double SNP_REFERENCE_BIAS; /** * For summary metrics, the number of variants that appear in only one sample. * For detail metrics, the number of variants that appear only in the current sample. */ @MergeByAdding public long NUM_SINGLETONS; /** Hidden fields that are not propagated to the metrics file, but are used to house temporary values. */ @MergeByAdding long refAlleleObs, altAlleleObs, novelDeletions, novelInsertions, novelTransitions, novelTransversions, dbSnpDeletions, dbSnpInsertions, dbSnpTransitions, dbSnpTransversions; public static String getFileExtension() { return "variant_calling_summary_metrics"; } public void calculateDerivedFields() { this.PCT_DBSNP = this.NUM_IN_DB_SNP / (float) this.TOTAL_SNPS; this.NOVEL_SNPS = this.TOTAL_SNPS - this.NUM_IN_DB_SNP; this.SNP_REFERENCE_BIAS = (this.refAlleleObs) / (double) (this.refAlleleObs + this.altAlleleObs); if (this.dbSnpTransversions > 0) this.DBSNP_TITV = (double) this.dbSnpTransitions / (double) this.dbSnpTransversions; if (this.novelTransversions > 0) this.NOVEL_TITV = this.novelTransitions / (double) this.novelTransversions; this.PCT_DBSNP_INDELS = this.NUM_IN_DB_SNP_INDELS / (float) this.TOTAL_INDELS; this.NOVEL_INDELS = this.TOTAL_INDELS - this.NUM_IN_DB_SNP_INDELS; if (this.dbSnpDeletions > 0) this.DBSNP_INS_DEL_RATIO = this.dbSnpInsertions / (double) this.dbSnpDeletions; if (this.novelDeletions > 0) this.NOVEL_INS_DEL_RATIO = this.novelInsertions / (double) this.novelDeletions; } public void calculateFromDerivedFields(final long totalHetDepth) { dbSnpTransversions = invertFromRatio(NUM_IN_DB_SNP, DBSNP_TITV); dbSnpTransitions = NUM_IN_DB_SNP - dbSnpTransversions; novelTransversions = invertFromRatio(NOVEL_SNPS, NOVEL_TITV); novelTransitions = NOVEL_SNPS - novelTransversions; dbSnpDeletions = invertFromRatio(NUM_IN_DB_SNP_INDELS, DBSNP_INS_DEL_RATIO); dbSnpInsertions = NUM_IN_DB_SNP_INDELS - dbSnpDeletions; novelDeletions = invertFromRatio(NOVEL_INDELS, NOVEL_INS_DEL_RATIO); novelInsertions = NOVEL_INDELS - novelDeletions; refAlleleObs = Double.isNaN(SNP_REFERENCE_BIAS) ? 0L : Math.round(totalHetDepth * SNP_REFERENCE_BIAS); altAlleleObs = totalHetDepth - refAlleleObs; } public static <T extends VariantCallingSummaryMetrics> void foldInto(final T target, final Collection<T> metrics) { metrics.forEach(target::merge); } } /** * Given the ratio (X/Y) and the sum (X+Y), returns Y. * * @param sum X+Y * @param ratio X/Y * @return Y as a long */ private static long invertFromRatio(final long sum, final Double ratio) { return ratio.isNaN() ? 0L : Math.round(sum / (ratio + 1.0)); } /** A collection of metrics relating to snps and indels within a variant-calling file (VCF) for a given sample. */ public static class VariantCallingDetailMetrics extends CollectVariantCallingMetrics.VariantCallingSummaryMetrics { /** The name of the sample being assayed */ @MergeByAssertEquals public String SAMPLE_ALIAS; /** * (count of hets)/(count of homozygous non-ref) for this sample */ @NoMergingIsDerived public double HET_HOMVAR_RATIO; /** * The percentage of variants in a particular sample that have a GQ score of 0. */ @NoMergingIsDerived public double PCT_GQ0_VARIANTS; /** * The total number of variants in a particular sample that have a GQ score of 0. */ @MergeByAdding public long TOTAL_GQ0_VARIANTS; /** * total number of reads (from AD field) for passing bi-allelic SNP hets for this sample */ @NoMergingIsDerived public long TOTAL_HET_DEPTH; /** * Hidden fields not propagated to the metrics file. */ @MergeByAdding long numHets, numHomVar; public static String getFileExtension() { return "variant_calling_detail_metrics"; } @Override public void calculateDerivedFields() { super.calculateDerivedFields(); // Divide by zero should be OK -- NaN should get propagated to metrics file. HET_HOMVAR_RATIO = numHets / (double) numHomVar; PCT_GQ0_VARIANTS = TOTAL_GQ0_VARIANTS / (double) (numHets + numHomVar); TOTAL_HET_DEPTH = refAlleleObs + altAlleleObs; } public void calculateFromDerivedFields() { numHomVar = invertFromRatio(TOTAL_SNPS, HET_HOMVAR_RATIO); numHets = TOTAL_SNPS - numHomVar; calculateFromDerivedFields(TOTAL_HET_DEPTH); } } }