/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.vcf; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.*; import htsjdk.tribble.Tribble; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; import htsjdk.variant.vcf.*; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.VcfOrBcf; import picard.vcf.GenotypeConcordanceStates.CallState; import picard.vcf.GenotypeConcordanceStates.ContingencyState; import picard.vcf.GenotypeConcordanceStates.TruthAndCallStates; import picard.vcf.GenotypeConcordanceStates.TruthState; import picard.vcf.PairedVariantSubContextIterator.VcfTuple; import java.io.File; import java.util.*; import static htsjdk.variant.variantcontext.VariantContext.Type.*; import static htsjdk.variant.vcf.VCFConstants.MISSING_VALUE_v4; import static picard.vcf.GenotypeConcordanceStateCodes.*; /** * Calculates the concordance between genotype data for two samples in two different VCFs - one being considered the truth (or reference) * the other being the call. The concordance is broken into separate results sections for SNPs and indels. Summary and detailed statistics * are reported * * @author Tim Fennell * @author George Grant */ @CommandLineProgramProperties( usage = GenotypeConcordance.USAGE_SUMMARY + GenotypeConcordance.USAGE_DETAILS, usageShort = GenotypeConcordance.USAGE_SUMMARY, programGroup = VcfOrBcf.class ) public class GenotypeConcordance extends CommandLineProgram { static final String USAGE_SUMMARY = "Evaluate genotype concordance between callsets."; static final String USAGE_DETAILS = "This tool evaluates the concordance between genotype calls for samples in different " + "callsets where one is being considered as the truth (aka standard, or reference) and the other as the call that is being " + "evaluated for accuracy. <br />" + "<h4>Usage example:</h4>" + "<pre>" + "java -jar picard.jar GenotypeConcordance \\<br />" + " CALL_VCF=input.vcf \\<br />" + " CALL_SAMPLE=sample_name \\<br />" + " O=gc_concordance.vcf \\<br />" + " TRUTH_VCF=truth_set.vcf \\<br />" + " TRUTH_SAMPLE=truth_sample#" + "</pre>" + "" + "<h4>Output Metrics:</h4>" + "<ul>" + "<li>Output metrics include GenotypeConcordanceContingencyMetrics, GenotypeConcordanceSummaryMetrics, and " + "GenotypeConcordanceDetailMetrics. For each set of metrics, the data is broken into separate sections for " + "SNPs and INDELs. Note that only SNP and INDEL variants are considered, MNP, Symbolic, and Mixed classes" + " of variants are not included. </li>" + "<li>GenotypeConcordanceContingencyMetrics enumerate the constituents of each contingent in a callset " + "including true-positive (TP), true-negative (TN), false-positive (FP), and false-negative (FN) calls. See " + "http://broadinstitute.github.io/picard/picard-metric-definitions.html#GenotypeConcordanceContingencyMetrics" + " for more details.</li>" + "<li>GenotypeConcordanceDetailMetrics include the numbers of SNPs and INDELs for each contingent genotype as well " + "as the number of validated genotypes. See " + "http://broadinstitute.github.io/picard/picard-metric-definitions.html#GenotypeConcordanceDetailMetrics for more details.</li>" + "<li>GenotypeConcordanceSummaryMetrics provide specific details for the variant caller performance on a callset including: " + "values for sensitivity, specificity, and positive predictive values. See " + "http://broadinstitute.github.io/picard/picard-metric-definitions.html#GenotypeConcordanceSummaryMetrics for more details.</li>" + "</ul>" + "<br /><br />" + "Useful definitions applicable to alleles and genotypes:<br /> " + "<ul>"+ "<li>Truthset - A callset (typically in VCF format) containing variant calls and genotypes that have been cross-validated " + "with multiple technologies e.g. Genome In A Bottle Consortium (GIAB) (https://sites.stanford.edu/abms/giab)</li>" + "<li>TP - True positives are variant calls that match a 'truthset'</li>" + "<li>FP - False-positives are reference sites miscalled as variant</li>" + "<li>FN - False-negatives are variant sites miscalled as reference</li>" + "<li>TN - True negatives are correctly called reference sites</li>" + "<li>Validated genotypes - are TP sites where the exact genotype (HET or HOM-VAR) has been validated </li> " + "</ul>"+ "" + "<h4>VCF Output:</h4>" + "<ul>" + "<li>The concordance state will be stored in the \"CONC_ST\" tag in the INFO field.</li>" + "<li>The truth sample name will be \"truth\" and call sample name will be \"call\".</li>" + "</ul>" + "<hr />" ; @Option(shortName = "TV", doc="The VCF containing the truth sample") public File TRUTH_VCF; @Option(shortName = "CV", doc="The VCF containing the call sample") public File CALL_VCF; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Basename for the two metrics files that are to be written." + " Resulting files will be <OUTPUT>" + SUMMARY_METRICS_FILE_EXTENSION + " and <OUTPUT>" + DETAILED_METRICS_FILE_EXTENSION + ".") public File OUTPUT; @Option(doc = "Output a VCF annotated with concordance information.") public boolean OUTPUT_VCF = false; @Option(shortName = "TS", doc="The name of the truth sample within the truth VCF. Not required if only one sample exists.", optional = true) public String TRUTH_SAMPLE = null; @Option(shortName = "CS", doc="The name of the call sample within the call VCF. Not required if only one sample exists.", optional = true) public String CALL_SAMPLE = null; @Option(doc="One or more interval list files that will be used to limit the genotype concordance. Note - if intervals are specified, the VCF files must be indexed.") public List<File> INTERVALS; @Option(doc="If true, multiple interval lists will be intersected. If false multiple lists will be unioned.") public boolean INTERSECT_INTERVALS = true; @Option(doc="Genotypes below this genotype quality will have genotypes classified as LowGq.") public int MIN_GQ = 0; @Option(doc="Genotypes below this depth will have genotypes classified as LowDp.") public int MIN_DP = 0; @Option(doc="If true, output all rows in detailed statistics even when count == 0. When false only output rows with non-zero counts.") public boolean OUTPUT_ALL_ROWS = false; @Option(doc="If true, use the VCF index, else iterate over the entire VCF.", optional = true) public boolean USE_VCF_INDEX = false; @Option(shortName = "MISSING_HOM", doc="Default is false, which follows the GA4GH Scheme. If true, missing sites in the truth set will be " + "treated as HOM_REF sites and sites missing in both the truth and call sets will be true negatives. Useful when hom ref sites are left out of the truth set. " + "This flag can only be used with a high confidence interval list.") public boolean MISSING_SITES_HOM_REF = false; private final Log log = Log.getInstance(GenotypeConcordance.class); private final ProgressLogger progress = new ProgressLogger(log, 10000, "checked", "variants"); public static final String SUMMARY_METRICS_FILE_EXTENSION = ".genotype_concordance_summary_metrics"; public static final String DETAILED_METRICS_FILE_EXTENSION = ".genotype_concordance_detail_metrics"; public static final String CONTINGENCY_METRICS_FILE_EXTENSION = ".genotype_concordance_contingency_metrics"; public static final String OUTPUT_VCF_FILE_EXTENSION = ".genotype_concordance.vcf.gz"; protected GenotypeConcordanceCounts snpCounter; public GenotypeConcordanceCounts getSnpCounter() { return snpCounter; } protected GenotypeConcordanceCounts indelCounter; public GenotypeConcordanceCounts getIndelCounter() { return indelCounter; } public static final String CONTINGENCY_STATE_TAG = "CONC_ST"; public static final VCFHeaderLine CONTINGENCY_STATE_HEADER_LINE = new VCFInfoHeaderLine(CONTINGENCY_STATE_TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "The genotype concordance contingency state(s)"); public static final String OUTPUT_VCF_TRUTH_SAMPLE_NAME = "truth"; public static final String OUTPUT_VCF_CALL_SAMPLE_NAME = "call"; public static void main(final String[] args) { new GenotypeConcordance().instanceMainWithExit(args); } @Override protected String[] customCommandLineValidation() { // Note - If the user specifies to use INTERVALS, the code will fail if the vcfs are not indexed, so we set USE_VCF_INDEX to true and check that the vcfs are indexed. IOUtil.assertFileIsReadable(TRUTH_VCF); IOUtil.assertFileIsReadable(CALL_VCF); final boolean usingIntervals = this.INTERVALS != null && !this.INTERVALS.isEmpty(); final List<String> errors = new ArrayList<String>(); if (usingIntervals) { USE_VCF_INDEX = true; } if (USE_VCF_INDEX) { // Index file is required either because we are using intervals, or because user-set parameter if (!indexExists(TRUTH_VCF)) { errors.add("The index file was not found for the TRUTH VCF. Note that if intervals are specified, the VCF files must be indexed."); } if (!indexExists(CALL_VCF)) { errors.add("The index file was not found for the CALL VCF. Note that if intervals are specified, the VCF files must be indexed."); } } if (MISSING_SITES_HOM_REF) { //If you are using this flag you must include a high confidence interval list where missing sites are hom_ref. if (!usingIntervals) { errors.add("You cannot use the MISSING_HOM option without also supplying an interval list over which missing " + "sites are considered confident homozygous reference calls."); } } if (errors.isEmpty()) { return null; } else { return errors.toArray(new String[errors.size()]); } } /** * Determines whether an index file exists for the given vcf file using standard extension suffixes * * @param vcf the vcf file to investigate * @return true if an index file exists, false otherwise */ private boolean indexExists(final File vcf) { return Tribble.indexFile(vcf).exists() || Tribble.tabixIndexFile(vcf).exists(); } @Override protected int doWork() { final File summaryMetricsFile = new File(OUTPUT + SUMMARY_METRICS_FILE_EXTENSION); final File detailedMetricsFile = new File(OUTPUT + DETAILED_METRICS_FILE_EXTENSION); final File contingencyMetricsFile = new File(OUTPUT + CONTINGENCY_METRICS_FILE_EXTENSION); IOUtil.assertFileIsWritable(summaryMetricsFile); IOUtil.assertFileIsWritable(detailedMetricsFile); IOUtil.assertFileIsWritable(contingencyMetricsFile); final GenotypeConcordanceSchemeFactory schemeFactory = new GenotypeConcordanceSchemeFactory(); final GenotypeConcordanceScheme scheme = schemeFactory.getScheme(MISSING_SITES_HOM_REF); final boolean usingIntervals = this.INTERVALS != null && !this.INTERVALS.isEmpty(); IntervalList intervals = null; SAMSequenceDictionary intervalsSamSequenceDictionary = null; if (usingIntervals) { log.info("Starting to load intervals list(s)."); long genomeBaseCount = 0; for (final File f : INTERVALS) { IOUtil.assertFileIsReadable(f); final IntervalList tmpIntervalList = IntervalList.fromFile(f); if (genomeBaseCount == 0) { // Don't count the reference length more than once. intervalsSamSequenceDictionary = tmpIntervalList.getHeader().getSequenceDictionary(); genomeBaseCount = intervalsSamSequenceDictionary.getReferenceLength(); } if (intervals == null) intervals = tmpIntervalList; else if (INTERSECT_INTERVALS) intervals = IntervalList.intersection(intervals, tmpIntervalList); else intervals = IntervalList.union(intervals, tmpIntervalList); } if (intervals != null) { intervals = intervals.uniqued(); } log.info("Finished loading up intervals list(s)."); } final VCFFileReader truthReader = new VCFFileReader(TRUTH_VCF, USE_VCF_INDEX); final VCFFileReader callReader = new VCFFileReader(CALL_VCF, USE_VCF_INDEX); if (TRUTH_SAMPLE == null) { if (truthReader.getFileHeader().getNGenotypeSamples() > 1) { throw new PicardException("TRUTH_SAMPLE is required when the TRUTH_VCF has more than one sample"); } TRUTH_SAMPLE = truthReader.getFileHeader().getGenotypeSamples().get(0); } if (CALL_SAMPLE == null) { if (callReader.getFileHeader().getNGenotypeSamples() > 1) { throw new PicardException("CALL_SAMPLE is required when the CALL_VCF has more than one sample"); } CALL_SAMPLE = callReader.getFileHeader().getGenotypeSamples().get(0); } // Check that the samples actually exist in the files! if (!truthReader.getFileHeader().getGenotypeSamples().contains(TRUTH_SAMPLE)) { throw new PicardException("File " + TRUTH_VCF.getAbsolutePath() + " does not contain genotypes for sample " + TRUTH_SAMPLE); } if (!callReader.getFileHeader().getGenotypeSamples().contains(CALL_SAMPLE)) { throw new PicardException("File " + CALL_VCF.getAbsolutePath() + " does not contain genotypes for sample " + CALL_SAMPLE); } // Verify that both VCFs have the same Sequence Dictionary SequenceUtil.assertSequenceDictionariesEqual(truthReader.getFileHeader().getSequenceDictionary(), callReader.getFileHeader().getSequenceDictionary()); final Optional<VariantContextWriter> writer = getVariantContextWriter(truthReader, callReader); if (usingIntervals) { // If using intervals, verify that the sequence dictionaries agree with those of the VCFs SequenceUtil.assertSequenceDictionariesEqual(intervalsSamSequenceDictionary, truthReader.getFileHeader().getSequenceDictionary()); } // Build the pair of iterators over the regions of interest final Iterator<VariantContext> truthIterator, callIterator; if (usingIntervals) { truthIterator = new ByIntervalListVariantContextIterator(truthReader, intervals); callIterator = new ByIntervalListVariantContextIterator(callReader, intervals); } else { truthIterator = truthReader.iterator(); callIterator = callReader.iterator(); } // Now do the iteration and count things up final PairedVariantSubContextIterator pairedIterator = new PairedVariantSubContextIterator(truthIterator, TRUTH_SAMPLE, callIterator, CALL_SAMPLE, truthReader.getFileHeader().getSequenceDictionary()); snpCounter = new GenotypeConcordanceCounts(); indelCounter = new GenotypeConcordanceCounts(); // A map to keep track of the count of Truth/Call States which we could not successfully classify final Map<String, Integer> unClassifiedStatesMap = new HashMap<String, Integer>(); log.info("Starting iteration over variants."); while (pairedIterator.hasNext()) { final VcfTuple tuple = pairedIterator.next(); final VariantContext.Type truthVariantContextType = tuple.leftVariantContext.map(VariantContext::getType).orElse(NO_VARIATION); final VariantContext.Type callVariantContextType = tuple.rightVariantContext.map(VariantContext::getType).orElse(NO_VARIATION); final boolean stateClassified = classifyVariants(tuple.leftVariantContext, TRUTH_SAMPLE, tuple.rightVariantContext, CALL_SAMPLE, Optional.of(snpCounter), Optional.of(indelCounter), MIN_GQ, MIN_DP); if (!stateClassified) { final String condition = truthVariantContextType + " " + callVariantContextType; final Integer count = unClassifiedStatesMap.getOrDefault(condition, 0) + 1; unClassifiedStatesMap.put(condition, count); } // write to the output VCF writer.ifPresent(w -> writeVcfTuple(tuple, w, scheme)); //final VariantContext variantContextForLogging = tuple.leftVariantContext.orElseGet(tuple.rightVariantContext::get); // FIXME final VariantContext variantContextForLogging = tuple.leftVariantContext.isPresent() ? tuple.leftVariantContext.get() : tuple.rightVariantContext.get(); progress.record(variantContextForLogging.getContig(), variantContextForLogging.getStart()); } //snp counter add in X number of missing-missing hom ref's (truth and call state) //missing missing is total interval size minus number of iterations in while loop if (MISSING_SITES_HOM_REF) { // need to know size of region called over (intervals or whole genome) to add missing-missing sites for NIST schema. final long baseCount = (intervals != null) ? intervals.getBaseCount() : truthReader.getFileHeader().getSequenceDictionary().getReferenceLength(); addMissingTruthAndMissingCallStates(snpCounter.getCounterSize(), baseCount, snpCounter); addMissingTruthAndMissingCallStates(indelCounter.getCounterSize(), baseCount, indelCounter); } // Calculate and store the summary-level metrics final MetricsFile<GenotypeConcordanceSummaryMetrics,?> genotypeConcordanceSummaryMetricsFile = getMetricsFile(); GenotypeConcordanceSummaryMetrics summaryMetrics = new GenotypeConcordanceSummaryMetrics(SNP, snpCounter, TRUTH_SAMPLE, CALL_SAMPLE, MISSING_SITES_HOM_REF); genotypeConcordanceSummaryMetricsFile.addMetric(summaryMetrics); summaryMetrics = new GenotypeConcordanceSummaryMetrics(INDEL, indelCounter, TRUTH_SAMPLE, CALL_SAMPLE, MISSING_SITES_HOM_REF); genotypeConcordanceSummaryMetricsFile.addMetric(summaryMetrics); genotypeConcordanceSummaryMetricsFile.write(summaryMetricsFile); // Calculate and store the detailed metrics for both SNP and indels final MetricsFile<GenotypeConcordanceDetailMetrics,?> genotypeConcordanceDetailMetrics = getMetricsFile(); outputDetailMetricsFile(SNP, genotypeConcordanceDetailMetrics, snpCounter, TRUTH_SAMPLE, CALL_SAMPLE, MISSING_SITES_HOM_REF, OUTPUT_ALL_ROWS); outputDetailMetricsFile(INDEL, genotypeConcordanceDetailMetrics, indelCounter, TRUTH_SAMPLE, CALL_SAMPLE, MISSING_SITES_HOM_REF, OUTPUT_ALL_ROWS); genotypeConcordanceDetailMetrics.write(detailedMetricsFile); // Calculate and score the contingency metrics final MetricsFile<GenotypeConcordanceContingencyMetrics,?> genotypeConcordanceContingencyMetricsFile = getMetricsFile(); GenotypeConcordanceContingencyMetrics contingencyMetrics = new GenotypeConcordanceContingencyMetrics(SNP, snpCounter, TRUTH_SAMPLE, CALL_SAMPLE, MISSING_SITES_HOM_REF); genotypeConcordanceContingencyMetricsFile.addMetric(contingencyMetrics); contingencyMetrics = new GenotypeConcordanceContingencyMetrics(INDEL, indelCounter, TRUTH_SAMPLE, CALL_SAMPLE, MISSING_SITES_HOM_REF); genotypeConcordanceContingencyMetricsFile.addMetric(contingencyMetrics); genotypeConcordanceContingencyMetricsFile.write(contingencyMetricsFile); for (final String condition : unClassifiedStatesMap.keySet()) { log.info("Uncovered truth/call Variant Context Type Counts: " + condition + " " + unClassifiedStatesMap.get(condition)); } CloserUtil.close(callReader); CloserUtil.close(truthReader); writer.ifPresent(VariantContextWriter::close); return 0; } /** Gets the variant context writer if the output VCF is to be written, otherwise empty. */ private Optional<VariantContextWriter> getVariantContextWriter(final VCFFileReader truthReader, final VCFFileReader callReader) { if (OUTPUT_VCF) { final File outputVcfFile = new File(OUTPUT + OUTPUT_VCF_FILE_EXTENSION); final VariantContextWriterBuilder builder = new VariantContextWriterBuilder() .setOutputFile(outputVcfFile) .setReferenceDictionary(callReader.getFileHeader().getSequenceDictionary()) .setOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER) .setOption(Options.INDEX_ON_THE_FLY); final VariantContextWriter writer = builder.build(); // create the output header final List<String> sampleNames = Arrays.asList(OUTPUT_VCF_CALL_SAMPLE_NAME, OUTPUT_VCF_TRUTH_SAMPLE_NAME); final Set<VCFHeaderLine> headerLines = new HashSet<>(); headerLines.addAll(callReader.getFileHeader().getMetaDataInInputOrder()); headerLines.addAll(truthReader.getFileHeader().getMetaDataInInputOrder()); headerLines.add(CONTINGENCY_STATE_HEADER_LINE); writer.writeHeader(new VCFHeader(headerLines, sampleNames)); return Optional.of(writer); } else { return Optional.empty(); } } private void writeVcfTuple(final VcfTuple tuple, final VariantContextWriter writer, final GenotypeConcordanceScheme scheme) { VariantContext truthContext = null, callContext = null; final List<Genotype> genotypes = new ArrayList<>(2); // get the variant contexts and initialize the output variant context builder if (tuple.leftVariantContext.isPresent()) { truthContext = tuple.leftVariantContext.get(); } if (tuple.rightVariantContext.isPresent()) { callContext = tuple.rightVariantContext.get(); } // Get the alleles for each genotype. No alleles will be extracted for a genotype if the genotype is // mixed, filtered, or missing. final Alleles alleles = normalizeAlleles(truthContext, TRUTH_SAMPLE, callContext, CALL_SAMPLE); // There will be no alleles if both genotypes are one of mixed, filtered, or missing. Do not output any // variant context in this case. if (!alleles.allAlleles.isEmpty()) { if (truthContext == null && callContext == null) { throw new IllegalStateException("Both truth and call contexts are null!"); } final VariantContextBuilder builder; final List<Allele> allAlleles = alleles.asList(); final List<Allele> truthAlleles = alleles.truthAlleles(); final List<Allele> callAlleles = alleles.callAlleles(); // Get the alleles present at this site for both samples to use for the output variant context. final Set<Allele> siteAlleles = new HashSet<>(); if (truthContext != null) { siteAlleles.addAll(truthContext.getAlleles()); } if (callContext != null) { siteAlleles.addAll(callContext.getAlleles()); } // Initialize the variant context builder final VariantContext initialContext = (callContext == null) ? truthContext : callContext; builder = new VariantContextBuilder(initialContext.getSource(), initialContext.getContig(), initialContext.getStart(), initialContext.getEnd(), siteAlleles); builder.computeEndFromAlleles(allAlleles, initialContext.getStart()); builder.log10PError(initialContext.getLog10PError()); // Add the genotypes addToGenotypes(genotypes, truthContext, TRUTH_SAMPLE, OUTPUT_VCF_TRUTH_SAMPLE_NAME, allAlleles, truthAlleles, MISSING_SITES_HOM_REF); addToGenotypes(genotypes, callContext, CALL_SAMPLE, OUTPUT_VCF_CALL_SAMPLE_NAME, allAlleles, callAlleles, false); // set the alleles and genotypes builder.genotypes(genotypes); // set the concordance state attribute final TruthAndCallStates state = GenotypeConcordance.determineState(truthContext, TRUTH_SAMPLE, callContext, CALL_SAMPLE, MIN_GQ, MIN_DP); final ContingencyState[] stateArray = scheme.getConcordanceStateArray(state.truthState, state.callState); builder.attribute(CONTINGENCY_STATE_TAG, Arrays.asList(stateArray)); // write it writer.add(builder.make()); } } /** Adds a new genotype to the provided list of genotypes. If the given variant context is null or has no alleles * (ex. a no-call), then add a no-call, otherwise make a copy of the genotype for the sample with the input * sample name within the provided context. In the former case, if missingSitesHomRef is true, a homozygous * reference genotype is created instead of a no-call. In the latter case, the new genotype will have the output * sample name. */ private void addToGenotypes(final List<Genotype> genotypes, final VariantContext ctx, final String inputSampleName, final String outputSampleName, final List<Allele> allAlleles, final List<Allele> ctxAlleles, final boolean missingSitesHomRef) { if (ctx != null && !ctxAlleles.isEmpty()) { final Genotype genotype = ctx.getGenotype(inputSampleName); final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype); genotypeBuilder.name(outputSampleName); genotypeBuilder.alleles(ctxAlleles); if (!genotype.hasAnyAttribute(VCFConstants.GENOTYPE_KEY)) { genotypeBuilder.attribute(VCFConstants.GENOTYPE_KEY, MISSING_VALUE_v4); } genotypes.add(genotypeBuilder.make()); } else { final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(outputSampleName); if (missingSitesHomRef) { genotypeBuilder.alleles(Arrays.asList(allAlleles.get(0), allAlleles.get(0))); } else { genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); } genotypes.add(genotypeBuilder.make()); } } public static boolean classifyVariants(final Optional<VariantContext> truthContext, final String truthSample, final Optional<VariantContext> callContext, final String callSample, final int minGq, final int minDp) { return classifyVariants(truthContext, truthSample, callContext, callSample, Optional.empty(), Optional.empty(), minGq, minDp); } /** * Attempts to determine the concordance state given the truth and all variant context and optionally increments the genotype concordance * count for the given variant type (SNP or INDEL). This will ignore cases where an indel was found in the truth and a SNP was found in * the call, and vice versa. We typically fail to classify Mixed, Symbolic variants, or MNPs. * * @param truthContext A variant context representing truth * @param truthSample The name of the truth sample * @param callContext A variant context representing the call * @param callSample The name of the call sample * @param snpCounter optionally a place to increment the counts for SNP truth/call states * @param indelCounter optionally a place to increment the counts for INDEL truth/call states * @param minGq Threshold for filtering by genotype attribute GQ * @param minDp Threshold for filtering by genotype attribute DP * @return true if the concordance state could be classified. */ public static boolean classifyVariants(final Optional<VariantContext> truthContext, final String truthSample, final Optional<VariantContext> callContext, final String callSample, final Optional<GenotypeConcordanceCounts> snpCounter, final Optional<GenotypeConcordanceCounts> indelCounter, final int minGq, final int minDp) { final VariantContext.Type truthVariantContextType = truthContext.map(VariantContext::getType).orElse(NO_VARIATION); final VariantContext.Type callVariantContextType = callContext.map(VariantContext::getType).orElse(NO_VARIATION); // A flag to keep track of whether we have been able to successfully classify the Truth/Call States. // Unclassified include MIXED/MNP/Symbolic... final TruthAndCallStates truthAndCallStates = determineState(truthContext.orElse(null), truthSample, callContext.orElse(null), callSample, minGq, minDp); if (truthVariantContextType == SNP) { if ((callVariantContextType == SNP) || (callVariantContextType == MIXED) || (callVariantContextType == NO_VARIATION)) { // Note. If truth is SNP and call is MIXED, the event will be logged in the snpCounter, with row = MIXED snpCounter.ifPresent(counter -> counter.increment(truthAndCallStates)); return true; } } else if (truthVariantContextType == INDEL) { // Note. If truth is Indel and call is MIXED, the event will be logged in the indelCounter, with row = MIXED if ((callVariantContextType == INDEL) || (callVariantContextType == MIXED) || (callVariantContextType == NO_VARIATION)) { indelCounter.ifPresent(counter -> counter.increment(truthAndCallStates)); return true; } } else if (truthVariantContextType == MIXED) { // Note. If truth is MIXED and call is SNP, the event will be logged in the snpCounter, with column = MIXED if (callVariantContextType == SNP) { snpCounter.ifPresent(counter -> counter.increment(truthAndCallStates)); return true; } // Note. If truth is MIXED and call is INDEL, the event will be logged in the indelCounter, with column = MIXED else if (callVariantContextType == INDEL) { indelCounter.ifPresent(counter -> counter.increment(truthAndCallStates)); return true; } } else if (truthVariantContextType == NO_VARIATION) { if (callVariantContextType == SNP) { snpCounter.ifPresent(counter -> counter.increment(truthAndCallStates)); return true; } else if (callVariantContextType == INDEL) { indelCounter.ifPresent(counter -> counter.increment(truthAndCallStates)); return true; } } return false; } /** * Method to add missing sites that are KNOWN to be HOM_REF in the case of the NIST truth data set. */ public static void addMissingTruthAndMissingCallStates(final double numVariants, final long intervalBaseCount, final GenotypeConcordanceCounts counter) { final double countMissingMissing = intervalBaseCount-numVariants; final TruthAndCallStates missingMissing = new TruthAndCallStates(TruthState.MISSING, CallState.MISSING); counter.increment(missingMissing, countMissingMissing); } /** * Outputs the detailed statistics tables for SNP and Indel match categories. **/ public static void outputDetailMetricsFile(final VariantContext.Type variantType, final MetricsFile<GenotypeConcordanceDetailMetrics,?> genotypeConcordanceDetailMetricsFile, final GenotypeConcordanceCounts counter, final String truthSampleName, final String callSampleName, final boolean missingSitesHomRef, final boolean outputAllRows) { final GenotypeConcordanceSchemeFactory schemeFactory = new GenotypeConcordanceSchemeFactory(); final GenotypeConcordanceScheme scheme = schemeFactory.getScheme(missingSitesHomRef); scheme.validateScheme(); for (final TruthState truthState : TruthState.values()) { for (final CallState callState : CallState.values()) { final long count = counter.getCount(truthState, callState); final String contingencyValues = scheme.getContingencyStateString(truthState, callState); if (count > 0 || outputAllRows) { final GenotypeConcordanceDetailMetrics detailMetrics = new GenotypeConcordanceDetailMetrics(); detailMetrics.VARIANT_TYPE = variantType; detailMetrics.TRUTH_SAMPLE = truthSampleName; detailMetrics.CALL_SAMPLE = callSampleName; detailMetrics.TRUTH_STATE = truthState; detailMetrics.CALL_STATE = callState; detailMetrics.COUNT = count; detailMetrics.CONTINGENCY_VALUES = contingencyValues; genotypeConcordanceDetailMetricsFile.addMetric(detailMetrics); } } } } /** A simple structure to return the results of getAlleles. NB: truthAllele1/2 and callAllele1/2 may be null if * no first or second allele was present respectively. */ protected static class Alleles { public final OrderedSet<String> allAlleles; public final String truthAllele1; public final String truthAllele2; public final String callAllele1; public final String callAllele2; public Alleles(final OrderedSet<String> allAlleles, final String truthAllele1, final String truthAllele2, final String callAllele1, final String callAllele2) { if (truthAllele1 == null && truthAllele2 != null) { throw new IllegalStateException("truthAllele2 should be null if truthAllele1 is null."); } if (callAllele1 == null && callAllele2 != null) { throw new IllegalStateException("callAllele2 should be null if callAllele1 is null."); } this.allAlleles = allAlleles; this.truthAllele1 = (truthAllele1 == null) ? null : this.allAlleles.get(allAlleles.indexOf(truthAllele1)); this.truthAllele2 = (truthAllele2 == null) ? null : this.allAlleles.get(allAlleles.indexOf(truthAllele2)); this.callAllele1 = (callAllele1 == null) ? null : this.allAlleles.get(allAlleles.indexOf(callAllele1)); this.callAllele2 = (callAllele2 == null) ? null : this.allAlleles.get(allAlleles.indexOf(callAllele2)); } public List<Allele> asList() { if (allAlleles.isEmpty()) { return Collections.emptyList(); } else { final List<Allele> alleles = new ArrayList<>(this.allAlleles.size()); for (int idx = 0; idx < this.allAlleles.size(); idx++) { alleles.add(Allele.create(this.allAlleles.get(idx), idx == 0)); } return alleles; } } public List<Allele> truthAlleles() { if (allAlleles.isEmpty() || this.truthAllele1 == null) { return Collections.emptyList(); } else { final Allele truthAllele1 = Allele.create(this.truthAllele1, this.allAlleles.indexOf(this.truthAllele1) == 0); final Allele truthAllele2 = Allele.create(this.truthAllele2, this.allAlleles.indexOf(this.truthAllele2) == 0); return Arrays.asList(truthAllele1, truthAllele2); } } public List<Allele> callAlleles() { if (allAlleles.isEmpty() || this.callAllele1 == null) { return Collections.emptyList(); } else { final Allele callAllele1 = Allele.create(this.callAllele1, this.allAlleles.indexOf(this.callAllele1) == 0); final Allele callAllele2 = Allele.create(this.callAllele2, this.allAlleles.indexOf(this.callAllele2) == 0); return Arrays.asList(callAllele1, callAllele2); } } } /** Inserts the given string into the destination string at the given index. If the index is past the end of the * destination string, the given string is appended to the destination. */ static String spliceOrAppendString(final String destination, final String toInsert, final int insertIdx) { if (insertIdx <= destination.length()) { return destination.substring(0, insertIdx) + toInsert + destination.substring(insertIdx); } else { return destination + toInsert; } } /** Gets the alleles for the truth and call genotypes. In particular, this handles the case where indels can have different * reference alleles. */ final protected static Alleles normalizeAlleles(final VariantContext truthContext, final String truthSample, final VariantContext callContext, final String callSample) { final Genotype truthGenotype, callGenotype; if (truthContext == null || truthContext.isMixed() || truthContext.isFiltered()) truthGenotype = null; else truthGenotype = truthContext.getGenotype(truthSample); if (callContext == null || callContext.isMixed() || callContext.isFiltered()) callGenotype = null; else callGenotype = callContext.getGenotype(callSample); // initialize the reference String truthRef = (truthGenotype != null) ? truthContext.getReference().getBaseString() : null; String callRef = (callGenotype != null) ? callContext.getReference().getBaseString() : null; String truthAllele1 = null; String truthAllele2 = null; if (null != truthGenotype) { if (truthGenotype.getAlleles().size() != 2) { throw new IllegalStateException("Genotype for Variant Context: " + truthContext + " does not have exactly 2 alleles"); } truthAllele1 = truthGenotype.getAllele(0).getBaseString(); truthAllele2 = truthGenotype.getAllele(1).getBaseString(); } String callAllele1 = null; String callAllele2 = null; if (null != callGenotype) { if (callGenotype.getAlleles().size() != 2) { throw new IllegalStateException("Genotype for Variant Context: " + callContext + " does not have exactly 2 alleles"); } callAllele1 = callGenotype.getAllele(0).getBaseString(); callAllele2 = callGenotype.getAllele(1).getBaseString(); } if ((truthRef != null && callRef != null) && (!truthRef.equals(callRef))) { // We must handle different representations for indels based on their reference alleles. For example: // - Deletion: TCAA*/T versus TCAACAA*/TCAA // - Insertion: TCAA*/TCAAGG versus TCAACAA*/TCAACAAGG // We do the following: // 1. Verify that the shorter reference allele is a prefix of the longer reference allele. // 2. Add the extra reference bases to the variant allele. // 3. Update the shorter reference allele to be the longer reference allele. if (truthRef.length() < callRef.length()) { // Truth reference is shorter than call reference final String suffix = getStringSuffix(callRef, truthRef, "Ref alleles mismatch between: " + truthContext + " and " + callContext); final int insertIdx = truthRef.length(); truthAllele1 = spliceOrAppendString(truthAllele1, suffix, insertIdx); truthAllele2 = spliceOrAppendString(truthAllele2, suffix, insertIdx); truthRef = truthRef + suffix; } else if (truthRef.length() > callRef.length()) { // call reference is shorter than truth: final String suffix = getStringSuffix(truthRef, callRef, "Ref alleles mismatch between: " + truthContext + " and " + callContext); final int insertIdx = callRef.length(); callAllele1 = spliceOrAppendString(callAllele1, suffix, insertIdx); callAllele2 = spliceOrAppendString(callAllele2, suffix, insertIdx); callRef = callRef + suffix; } else { // Same length - so they must just disagree... throw new IllegalStateException("Ref alleles mismatch between: " + truthContext + " and " + callContext); } } final OrderedSet<String> allAlleles = new OrderedSet<String>(); if (truthGenotype != null || callGenotype != null) { // Store the refAllele as the first (0th index) allele in allAlleles (only can do if at least one genotype is non-null) allAlleles.smartAdd(truthGenotype == null ? callRef : truthRef); // zeroth allele; } if (null != truthGenotype) { allAlleles.smartAdd(truthAllele1); allAlleles.smartAdd(truthAllele2); } /** * if either of the call alleles is in allAlleles, with index > 1, we need to make sure that allele has index 1. * this is because of the following situations: * * REF TRUTH CALL-GT TRUTH-STATE CALL-STATE * A C/G C/A HET_VAR1_VAR2 HET_REF_VAR1 * A G/C C/A HET_VAR1_VAR2 HET_REF_VAR1 * A G/C G/A HET_VAR1_VAR2 HET_REF_VAR1 * A G/C G/A HET_VAR1_VAR2 HET_REF_VAR1 * * so, in effect, the order of the alleles in the TRUTH doesn't determine the assignment of allele to Var1 and Var2, * only once the call is known can this assignment be made. */ if (null != callGenotype) { if (allAlleles.indexOf(callAllele1) > 1 || allAlleles.indexOf(callAllele2) > 1) { allAlleles.remove(2); allAlleles.remove(1); allAlleles.smartAdd(truthAllele2); allAlleles.smartAdd(truthAllele1); } allAlleles.smartAdd(callAllele1); allAlleles.smartAdd(callAllele2); } return new Alleles(allAlleles, truthAllele1, truthAllele2, callAllele1, callAllele2); } private static GenotypeConcordanceStateCodes getStateCode(final VariantContext ctx, final String sample, final int minGq, final int minDp) { // Site level checks // Added potential to include missing sites as hom ref. if (ctx == null) return MISSING_CODE; else if (ctx.isMixed()) return IS_MIXED_CODE; else if (ctx.isFiltered()) return VC_FILTERED_CODE; else { // Genotype level checks final Genotype genotype = ctx.getGenotype(sample); if (genotype.isNoCall()) return NO_CALL_CODE; else if (genotype.isFiltered()) return GT_FILTERED_CODE; else if ((genotype.getGQ() != -1) && (genotype.getGQ() < minGq)) return LOW_GQ_CODE; else if ((genotype.getDP() != -1) && (genotype.getDP() < minDp)) return LOW_DP_CODE; // Note. Genotype.isMixed means that it is called on one chromosome and NOT on the other else if ((genotype.isMixed())) return NO_CALL_CODE; } return null; } /** * A method to determine the truth and call states for a pair of variant contexts representing truth and call. * A variety of variant and genotype-level checks are first used to determine if either of the the variant contexts * are filtered and after that a comparison of the called genotype alleles to determine appropriate truth and call state * * Note that this method does NOT check for SNP versus Indel. It is assumed that that check is done by the caller and the results * of this method are stored by SNP/Indel. * Note that if a variant context has BOTH GQ and DP less than the specified threshold, then it will be of Truth/Call State LOW_GQ * * @param truthContext A variant context representing truth * @param truthSample The name of the truth sample * @param callContext A variant context representing the call * @param callSample The name of the call sample * @param minGq Threshold for filtering by genotype attribute GQ * @param minDp Threshold for filtering by genotype attribute DP * @return TruthAndCallStates object containing the TruthState and CallState determined here. */ final public static TruthAndCallStates determineState(final VariantContext truthContext, final String truthSample, final VariantContext callContext, final String callSample, final int minGq, final int minDp) { TruthState truthState = null; CallState callState = null; // TODO: what about getPloidy() // Get truth and call states if they are filtered or are not going to be compared (ex. depth is less than minDP). final GenotypeConcordanceStateCodes truthStateCode = getStateCode(truthContext, truthSample, minGq, minDp); if (null != truthStateCode) { truthState = GenotypeConcordanceStates.truthMap.get(truthStateCode.ordinal()); } final GenotypeConcordanceStateCodes callStateCode = getStateCode(callContext, callSample, minGq, minDp); if (null != callStateCode) { callState = GenotypeConcordanceStates.callMap.get(callStateCode.ordinal()); } final Alleles alleles = normalizeAlleles( truthState == null ? truthContext : null, truthSample, callState == null ? callContext : null, callSample); final OrderedSet<String> allAlleles = alleles.allAlleles; final String truthAllele1 = alleles.truthAllele1; final String truthAllele2 = alleles.truthAllele2; final String callAllele1 = alleles.callAllele1; final String callAllele2 = alleles.callAllele2; // Truth if (null == truthState) { final int allele0idx = allAlleles.indexOf(truthAllele1); final int allele1idx = allAlleles.indexOf(truthAllele2); if (allele0idx == allele1idx) { //HOM truthState = TruthState.getHom(allele0idx); } else { //HET truthState = TruthState.getVar(allele0idx, allele1idx); } } // Call if (null == callState) { final int allele0idx = allAlleles.indexOf(callAllele1); final int allele1idx = allAlleles.indexOf(callAllele2); if (allele0idx == allele1idx) { //HOM callState = CallState.getHom(allele0idx); } else { //HET callState = CallState.getHet(allele0idx, allele1idx); } if (null == callState) { throw new IllegalStateException("This should never happen... Could not classify the call variant: " + callContext.getGenotype(callSample)); } } return new TruthAndCallStates(truthState, callState); } final static String getStringSuffix(final String longerString, final String shorterString, final String errorMsg) { // Truth reference is shorter than call reference if (!longerString.startsWith(shorterString)) { throw new IllegalStateException(errorMsg); } return longerString.substring(shorterString.length()); } } /** like a list, but if you ask for an index of an item, it will first add that item. also, same item cannot be added more than once (like a set) */ class OrderedSet<T> extends ArrayList<T> { public int smartIndexOf(final T o) { smartAdd(o); return super.indexOf(o); } public boolean smartAdd(final T o) { if (!this.contains(o)) { return add(o); } return false; } }