/*
 * The MIT License
 *
 * Copyright (c) 2016 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

package picard.analysis.replicates;

import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.DuplicateSetIterator;
import htsjdk.samtools.QueryInterval;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SAMUtils;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.filter.AggregateFilter;
import htsjdk.samtools.filter.AlignedFilter;
import htsjdk.samtools.filter.FilteringSamIterator;
import htsjdk.samtools.filter.MappingQualityFilter;
import htsjdk.samtools.filter.SamRecordFilter;
import htsjdk.samtools.filter.SecondaryOrSupplementaryFilter;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.ComparableTuple;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.filter.CompoundFilter;
import htsjdk.variant.variantcontext.filter.FilteringVariantContextIterator;
import htsjdk.variant.variantcontext.filter.GenotypeQualityFilter;
import htsjdk.variant.variantcontext.filter.HeterozygosityFilter;
import htsjdk.variant.variantcontext.filter.PassingVariantFilter;
import htsjdk.variant.variantcontext.filter.SnpFilter;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.Alpha;
import picard.filter.CountingPairedFilter;

import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;

/**
 * A CLP that, given a BAM and a VCF with genotypes of the same sample, estimates the rate of independent replication of reads within the BAM.
 * That is, it estimates the fraction of the reads which look like duplicates (in the MarkDuplicates sense of the word) but are actually
 * independent observations of the data. In the presence of Unique Molecular Identifiers (UMIs), various metrics are collected regarding the
 * utility of the UMIs for the purpose of increasing coverage.
 * <p>
 * The estimation is based on duplicate sets of size 2 and 3 and gives a separate estimate from each. The assumption is that the duplication
 * rate (biological or otherwise) is independent of the duplicate-set size. A significant difference between the two rates may be an indication
 * that this assumption is incorrect.
 * <p>
 * The duplicate sets are found using the mate-cigar tag (MC), which is added by {@link picard.sam.MergeBamAlignment} or
 * {@link picard.sam.FixMateInformation}. This program will not work without the MC tag.
 * <p>
 * An explanation of the calculation behind the estimation can be found in the {@link IndependentReplicateMetric} class.
 * <p>
 * The calculation assumes a diploid organism (more accurately, it assumes that only two alleles can appear at a HET site and that
 * these two alleles will appear with equal probability). It requires as input a VCF with genotypes for the sample in question.
 * <p>
 * NOTE: This class is very much in alpha stage, and still under heavy development (feel free to join!)
 *
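 * <h3>Usage example (a hypothetical invocation; file names and the sample name are placeholders)</h3>
 * <pre>
 *     java -jar picard.jar CollectIndependentReplicateMetrics \
 *          I=input.bam \
 *          O=independent_replicates.metrics \
 *          V=genotyped.vcf \
 *          SAMPLE=mySample
 * </pre>
 *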
 * @author Yossi Farjoun
 */
@CommandLineProgramProperties(
        usage = "Estimates the rate of independent replication of reads within a BAM. \n" +
                "That is, it estimates the fraction of the reads which would be marked as duplicates but " +
                "are actually biological replicates, independent observations of the data. ",
        usageShort = "Estimates the rate of independent replication of reads within a BAM.",
        programGroup = Alpha.class
)
public class CollectIndependentReplicateMetrics extends CommandLineProgram {

    private static final int DOUBLETON_SIZE = 2, TRIPLETON_SIZE = 3;

    @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input (indexed) BAM file.")
    public File INPUT;

    @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Write metrics to this file.")
    public File OUTPUT;

    @Option(shortName = "MO", doc = "Write the confusion matrix (of UMIs) to this file.", optional = true)
    public File MATRIX_OUTPUT;

    @Option(shortName = "V", doc = "Input VCF file.")
    public File VCF;

    @Option(shortName = "GQ", doc = "Minimal value for the GQ field in the VCF for a variant site to be used.", optional = true)
    public Integer MINIMUM_GQ = 90;

    @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "Minimal value for the mapping quality of the reads to be used in the estimation.", optional = true)
    public Integer MINIMUM_MQ = 40;

    @Option(shortName = "BQ", doc = "Minimal value for the base quality of a base to be used in the estimation.", optional = true)
    public Integer MINIMUM_BQ = 17;

    @Option(shortName = StandardOptionDefinitions.SAMPLE_ALIAS_SHORT_NAME, doc = "Name of the sample to look at in the VCF. Can be omitted if the VCF contains only one sample.", optional = true)
    public String SAMPLE = null;

    @Option(doc = "Number of sets to examine before stopping.", optional = true)
    public Integer STOP_AFTER = 0;

    @Option(doc = "Barcode SAM tag.", optional = true)
    public String BARCODE_TAG = "RX";

    @Option(doc = "Barcode quality SAM tag.", optional = true)
    public String BARCODE_BQ = "QX";

    @Option(shortName = "MBQ", doc = "Minimal value for the base quality of all the bases in a molecular barcode, for it to be used.", optional = true)
    public Integer MINIMUM_BARCODE_BQ = 30;

    private static final Log log = Log.getInstance(CollectIndependentReplicateMetrics.class);
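    // Overview of doWork() below, in broad strokes:
    //   1. Collect high-confidence, passing, HET SNP sites for SAMPLE from the VCF (see getQueryIntervalsMap).
    //   2. Query the BAM for reads overlapping those sites, filtered to aligned, paired, primary reads with
    //      sufficient mapping quality.
    //   3. Walk the resulting duplicate sets in step with the sorted site intervals, tallying per-locus counts.
    //   4. Merge each finished locus into the overall metric, then emit the metric and histogram files.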
    @Override
    protected int doWork() {

        IOUtil.assertFileIsReadable(VCF);
        IOUtil.assertFileIsReadable(INPUT);
        IOUtil.assertFileIsWritable(OUTPUT);
        if (MATRIX_OUTPUT != null) IOUtil.assertFileIsWritable(MATRIX_OUTPUT);

        final VCFFileReader vcf = new VCFFileReader(VCF, false);
        final VCFHeader vcfFileHeader = vcf.getFileHeader();
        final List<String> samples = vcfFileHeader.getSampleNamesInOrder();

        if (SAMPLE == null) {
            if (samples.size() != 1) {
                throw new IllegalArgumentException("When SAMPLE is null, the VCF must have exactly 1 sample; found " + samples.size());
            } else {
                SAMPLE = samples.get(0);
                log.info("No SAMPLE given, using sample from VCF: ", SAMPLE);
            }
        } else if (!samples.contains(SAMPLE)) {
            throw new IllegalArgumentException("When SAMPLE is not null, the VCF must contain the supplied sample. Cannot find sample " + SAMPLE + " in vcf.");
        }

        final Histogram<ComparableTuple<String, String>> umiConfusionMatrix = new Histogram<>("ConfusionUMI", "Count");
        final Histogram<ComparableTuple<String, String>> umiConfusionMatrixEditDistance = new Histogram<>("ConfusionUMI", "EditDistance");
        final IndependentReplicateMetric metric = new IndependentReplicateMetric();
        final Histogram<Byte> umiEditDistanceInDiffBiDups = new Histogram<>("editDistance", "diffAllelesCount");
        final Histogram<Byte> umiEditDistanceInSameBiDups = new Histogram<>("editDistance", "sameAllelesCount");
        final Histogram<Byte> alleleBalanceCount = new Histogram<>("alleleBalance", "alleleBalanceCount");

        // get the intervals that correspond to het sites in the VCF
        final SortedMap<QueryInterval, List<Allele>> intervalAlleleMap = getQueryIntervalsMap(VCF);
        final Iterator<QueryInterval> queryIntervalIterator = intervalAlleleMap.keySet().iterator();

        log.info("Found " + intervalAlleleMap.size() + " heterozygous sites in VCF.");

        // get an iterator to reads that overlap the heterozygous sites
        final SamReader in = SamReaderFactory.makeDefault().open(INPUT);

        log.info("Querying BAM for sites.");
        final SAMRecordIterator samRecordIterator = in.query(intervalAlleleMap.keySet().toArray(new QueryInterval[intervalAlleleMap.size()]), false);

        final List<SamRecordFilter> samFilters = CollectionUtil.makeList(
                new AlignedFilter(true),
                new CountingPairedFilter(),
                new SecondaryOrSupplementaryFilter(),
                new MappingQualityFilter(MINIMUM_MQ));

        final FilteringSamIterator filteredSamRecordIterator = new FilteringSamIterator(samRecordIterator, new AggregateFilter(samFilters));

        log.info("Queried BAM, getting duplicate sets.");

        // get duplicate iterator from iterator above
        final DuplicateSetIterator duplicateSets = new DuplicateSetIterator(filteredSamRecordIterator, in.getFileHeader());

        QueryInterval queryInterval = null;

        log.info("Starting iteration on reads");
        final ProgressLogger progress = new ProgressLogger(log, 10000000, "examined", "duplicate sets");

        IndependentReplicateMetric locusData = new IndependentReplicateMetric();
        boolean useLocus = true;
        boolean newLocus = false;
        int thirdAlleleInfos = 0;
        Allele badAllele = null;
        String offendingReadName = null;
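        // The loop below sweeps over two coordinate-sorted streams at once: duplicate sets from the BAM and
        // HET-site intervals from the VCF. queryInterval tracks the site currently being tallied, locusData
        // accumulates that site's counts, newLocus marks that locusData holds as-yet-unmerged data, and
        // useLocus is cleared when the locus must be discarded (a third allele was seen, or there is no
        // evidence of both alleles of the het site).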
        set:
        while (duplicateSets.hasNext()) {
            final DuplicateSet set = duplicateSets.next();
            final SAMRecord setRep = set.getRepresentative();
            final QueryInterval setRepsInterval = queryIntervalFromSamRecord(setRep);

            progress.record(setRep);

            // if the current duplicate set no longer overlaps the query interval then null it (and handle it below)
            // also move to the next variant if the previous variant is bad.
            if (!useLocus || (queryInterval != null && isCleanlyBefore(queryInterval, setRepsInterval))) {
                if (!useLocus) {
                    metric.nThreeAllelesSites++;
                    if (++thirdAlleleInfos < 100) {
                        log.debug("Skipping a locus due to third allele: " + badAllele +
                                " but expected " + intervalAlleleMap.get(queryInterval) +
                                " queryInterval " + queryInterval +
                                " offending read name is: " + offendingReadName);
                    }
                }
                queryInterval = null;
            }

            // Iterate until we find the query interval that contains the current duplicate set.
            // Simply polling for the "next" query will not do since the next one might not be covered by any reads, or it may have been
            // covered by past reads (if close enough to previous query interval)
            while (queryIntervalIterator.hasNext() && (queryInterval == null || isCleanlyBefore(queryInterval, setRepsInterval))) {

                // if we haven't seen either the reference or the alternate in the locus (subject to our stringent filters) do not use locus.
                if (locusData.nReferenceReads == 0 || locusData.nAlternateReads == 0) {
                    useLocus = false;
                    log.debug("will not use this locus due to lack of evidence of het site.");
                }

                // Query interval didn't get killed by 3rd alleles and so we combine the results with the tally
                if (useLocus && newLocus) {
                    metric.merge(locusData);
                    log.debug("merging metric. total nSites so far: " + metric.nSites);

                    // calculate allele balance with faux counts
                    final byte alleleBalance = (byte) Math.round(100D * (locusData.nAlternateReads + 0.5) / (locusData.nAlternateReads + locusData.nReferenceReads + 1));
                    alleleBalanceCount.increment(alleleBalance);

                    // we have merged now, no need to merge the old locus data or update the nSites until out of this while.
                    newLocus = false;
                }
                queryInterval = queryIntervalIterator.next();
                locusData = new IndependentReplicateMetric();
                locusData.nSites = 1;
                useLocus = true;
            }

            // we have a new locus, next time we should perhaps merge
            newLocus = true;

            // shouldn't happen, but being safe.
            if (queryInterval == null) break;

            final int setSize = set.size();

            locusData.nTotalReads += setSize;
            if (setSize > 1) locusData.nDuplicateSets++;

            if (setSize == DOUBLETON_SIZE) {
                locusData.nExactlyDouble++;
            } else if (setSize == TRIPLETON_SIZE) {
                locusData.nExactlyTriple++;
            } else if (setSize > TRIPLETON_SIZE) {
                // singletons are only counted in nTotalReads
                locusData.nReadsInBigSets += setSize;
            }

            log.debug("set size is: " + setSize);

            final List<Allele> allelesInVc = intervalAlleleMap.get(queryInterval);
            log.debug("alleles in VC: " + allelesInVc);

            int nRef = 0, nAlt = 0, nOther = 0;
            for (final SAMRecord read : set.getRecords()) {
                // getReadPositionAtReferencePosition gives 1-based offset
                final int offset = read.getReadPositionAtReferencePosition(queryInterval.start) - 1;
                if (offset == -1) {
                    // a labeled continue, watch out!
                    // This could be a deletion OR a clipped end. Get a new set.
                    log.debug("got offset -1, getting new set");
                    continue set;
                }

                // a labeled continue, watch out!
                // need to move to the next set since this set has a low base quality at the locus.
                if (read.getBaseQualities()[offset] <= MINIMUM_BQ) {
                    log.debug("got low read quality, getting new set");
                    continue set;
                }

                final Allele allele = Allele.create(read.getReadBases()[offset]);
                if (allelesInVc.get(0).basesMatch(allele)) {
                    nRef++;
                } else if (allelesInVc.get(1).basesMatch(allele)) {
                    nAlt++;
                } else {
                    nOther++;
                    // if other alleles were found, toss out the whole locus! (but read the reads first)
                    useLocus = false;
                    badAllele = allele;
                    offendingReadName = read.getReadName();
                }
            }

            locusData.nAlternateReads += nAlt;
            locusData.nReferenceReads += nRef;

            if (setSize == 1 || setSize > TRIPLETON_SIZE) continue;

            // From here on there should only be 2 or 3 reads in the set
            final SetClassification classification = classifySet(nRef, nAlt, nOther);
            log.debug("Classification of set is: " + classification);
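            // For doubletons, the two UMIs can be compared directly: the barcode pair is only trusted
            // ("useBarcodes") when every base of both barcode quality strings is at least MINIMUM_BARCODE_BQ,
            // and the UMI comparison is a per-base (Hamming) edit distance between the two barcode strings.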
            if (setSize == DOUBLETON_SIZE) {
                final boolean useBarcodes = !set.getRecords().stream()
                        .map(read -> read.getStringAttribute(BARCODE_BQ))
                        .map(string -> string == null ? "" : string)
                        .map(string -> {
                            final byte[] bytes = SAMUtils.fastqToPhred(string);
                            return IntStream.range(0, bytes.length).map(i -> bytes[i]).anyMatch(q -> q < MINIMUM_BARCODE_BQ);
                        }).anyMatch(a -> a);

                log.debug("using barcodes? " + useBarcodes);

                if (useBarcodes) locusData.nGoodBarcodes++;
                else locusData.nBadBarcodes++;

                final List<String> barcodes = set.getRecords().stream()
                        .map(read -> read.getStringAttribute(BARCODE_TAG))
                        .map(string -> string == null ? "" : string)
                        .collect(Collectors.toList());

                log.debug("found UMIs: " + barcodes);

                final boolean hasMultipleOrientations = set.getRecords().stream()
                        .map(SAMRecord::getFirstOfPairFlag) // must be paired, due to filter on sam iterator
                        .distinct()
                        .count() != 1;

                log.debug("reads have multiple orientations? " + hasMultipleOrientations);

                final byte editDistance = calculateEditDistance(barcodes.get(0), barcodes.get(1));
                log.debug("Edit distance between UMIs: " + editDistance);

                if (useBarcodes && editDistance != 0) {
                    if (hasMultipleOrientations) locusData.nMismatchingUMIsInContraOrientedBiDups++;
                    else locusData.nMismatchingUMIsInCoOrientedBiDups++;
                }

                // sanity check the number of distinct tags
                if (classification == SetClassification.DIFFERENT_ALLELES) {
                    locusData.nDifferentAllelesBiDups++;
                    if (useBarcodes) {
                        umiEditDistanceInDiffBiDups.increment(editDistance);
                        if (editDistance == 0) locusData.nMatchingUMIsInDiffBiDups++;
                        else locusData.nMismatchingUMIsInDiffBiDups++;
                    }
                    // we're going to toss out this locus.
                } else if (classification == SetClassification.MISMATCHING_ALLELE) {
                    locusData.nMismatchingAllelesBiDups++;
                } else {
                    // the classification is either ALTERNATE_ALLELE or REFERENCE_ALLELE if we've reached here
                    if (classification == SetClassification.ALTERNATE_ALLELE) locusData.nAlternateAllelesBiDups++;
                    else locusData.nReferenceAllelesBiDups++;

                    if (useBarcodes) {
                        umiEditDistanceInSameBiDups.increment(editDistance);
                        final ComparableTuple<String, String> key = new ComparableTuple<>(barcodes.get(0), barcodes.get(1));
                        umiConfusionMatrix.increment(key);
                        if (!umiConfusionMatrixEditDistance.containsKey(key)) umiConfusionMatrixEditDistance.increment(key, editDistance);
                        if (editDistance == 0) locusData.nMatchingUMIsInSameBiDups++;
                        else locusData.nMismatchingUMIsInSameBiDups++;
                    }
                }
            }

            if (setSize == TRIPLETON_SIZE) {
                switch (classification) {
                    case MISMATCHING_ALLELE:
                        locusData.nMismatchingAllelesTriDups++;
                        break;
                    case DIFFERENT_ALLELES:
                        locusData.nDifferentAllelesTriDups++;
                        break;
                    case ALTERNATE_ALLELE:
                        locusData.nAlternateAllelesTriDups++;
                        break;
                    case REFERENCE_ALLELE:
                        locusData.nReferenceAllelesTriDups++;
                        break;
                    default:
                        throw new IllegalStateException("Impossible classification: " + classification);
                }
            }
            if (STOP_AFTER > 0 && progress.getCount() > STOP_AFTER) break;
        }

        if (useLocus && newLocus) {
            metric.merge(locusData);
            log.debug("Merged final metric. nSites: " + metric.nSites);
        } else {
            metric.nThreeAllelesSites++;
            log.debug("didn't merge last metric, due to 3rd allele: nThreeAllelesSites = " + metric.nThreeAllelesSites);
        }

        log.info("Iteration done. Emitting metrics.");

        // Emit metrics
        final MetricsFile<IndependentReplicateMetric, Byte> metricsFile = getMetricsFile();
        metric.calculateDerivedFields();

        metricsFile.addMetric(metric);
        metricsFile.addHistogram(alleleBalanceCount);
        metricsFile.addHistogram(umiEditDistanceInDiffBiDups);
        metricsFile.addHistogram(umiEditDistanceInSameBiDups);

        metricsFile.write(OUTPUT);

        final MetricsFile<?, ComparableTuple<String, String>> confusionMetrics = getMetricsFile();
        if (MATRIX_OUTPUT != null) {
            confusionMetrics.addHistogram(umiConfusionMatrix);
            confusionMetrics.addHistogram(umiConfusionMatrixEditDistance);
            confusionMetrics.write(MATRIX_OUTPUT);
        }

        return 0;
    }
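    /**
     * Classification of a duplicate set by the alleles its reads carry at the HET site:
     * MISMATCHING_ALLELE if any read shows a third allele, DIFFERENT_ALLELES if both the reference and the
     * alternate allele are seen, and REFERENCE_ALLELE / ALTERNATE_ALLELE if only one of the two is seen.
     */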
    private enum SetClassification {
        MISMATCHING_ALLELE,
        DIFFERENT_ALLELES,
        REFERENCE_ALLELE,
        ALTERNATE_ALLELE
    }

    /**
     * A small utility to inform if one interval is cleanly before another, meaning that they do not overlap and
     * the first is prior (in genomic order) to the second.
     *
     * @param lhs the "first" {@link QueryInterval}
     * @param rhs the "second" {@link QueryInterval}
     * @return true if the two intervals do not intersect _and_ the first is prior to the second in genomic order
     */
    private static boolean isCleanlyBefore(final QueryInterval lhs, final QueryInterval rhs) {
        return !lhs.overlaps(rhs) && lhs.compareTo(rhs) < 0;
    }

    private static SetClassification classifySet(final int nRef, final int nAlt, final int nOther) {
        // if we found any "other" alleles, this is a mismatching set
        if (nOther != 0) return SetClassification.MISMATCHING_ALLELE;

        // if we found both ref and alt alleles, this is a heterogeneous set
        if (nAlt > 0 && nRef > 0) return SetClassification.DIFFERENT_ALLELES;

        // if we found no reference alleles, this is an "alternate" set
        if (nRef == 0) return SetClassification.ALTERNATE_ALLELE;

        // if we found no alternate alleles, this is a "reference" set.
        if (nAlt == 0) return SetClassification.REFERENCE_ALLELE;

        throw new IllegalStateException("shouldn't be here!");
    }

    private static QueryInterval queryIntervalFromSamRecord(final SAMRecord samRecord) {
        return new QueryInterval(samRecord.getReferenceIndex(), samRecord.getStart(), samRecord.getEnd());
    }

    /**
     * Gives the edit distance between this barcode and another of the same length.
     * This is a per-base (Hamming) distance; for example, the distance between "ACGT" and "ACCT" is 1.
     */
    private static byte calculateEditDistance(final String lhs, final String rhs) {
        assert (lhs.length() == rhs.length());
        byte tmp = 0;
        for (int i = 0; i < rhs.length(); ++i) {
            if (rhs.charAt(i) != lhs.charAt(i)) ++tmp;
        }
        return tmp;
    }
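    /**
     * Reads the VCF and returns a sorted map from the {@link QueryInterval} of each passing, high-GQ,
     * heterozygous SNP site to the alleles of SAMPLE's genotype at that site.
     */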
    private SortedMap<QueryInterval, List<Allele>> getQueryIntervalsMap(final File vcf) {

        final Map<String, Integer> contigIndexMap = new HashMap<>();
        final VCFFileReader vcfReader = new VCFFileReader(vcf, false);

        // We want to look at unfiltered SNP sites for which the sample is genotyped as a het
        // with high quality.
        final CompoundFilter compoundFilter = new CompoundFilter(true);
        compoundFilter.add(new SnpFilter());
        compoundFilter.add(new PassingVariantFilter());
        compoundFilter.add(new GenotypeQualityFilter(MINIMUM_GQ, SAMPLE));
        compoundFilter.add(new HeterozygosityFilter(true, SAMPLE));

        final Iterator<VariantContext> hetIterator = new FilteringVariantContextIterator(vcfReader.iterator(), compoundFilter);

        for (final VCFContigHeaderLine vcfContig : vcfReader.getFileHeader().getContigLines()) {
            contigIndexMap.put(vcfContig.getID(), vcfContig.getContigIndex());
        }

        // return a TreeMap since the keys are comparable, and this will use their order in the iteration
        final SortedMap<QueryInterval, List<Allele>> map = new TreeMap<>();

        while (hetIterator.hasNext()) {
            final VariantContext vc = hetIterator.next();
            map.put(new QueryInterval(contigIndexMap.get(vc.getContig()), vc.getStart(), vc.getEnd()), vc.getGenotype(SAMPLE).getAlleles());
        }
        vcfReader.close();

        return map;
    }
}