/* * The MIT License * * Copyright (c) 2013 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.analysis; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.AlignmentBlock; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.SequenceUtil; import picard.metrics.PerUnitMetricCollector; import picard.metrics.SAMRecordAndReference; import picard.metrics.SAMRecordAndReferenceMultiLevelCollector; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Set; public class RrbsMetricsCollector extends SAMRecordAndReferenceMultiLevelCollector<RrbsMetrics, Comparable<?>> { private final int minReadLength; private final double maxMismatchRate; private final int cQualityThreshold; private final int nextBaseQualityThreshold; public RrbsMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels, final List<SAMReadGroupRecord> samRgRecords, final int cQualityThreshold, final int nextBaseQualityThreshold, final int minReadLength, final double maxMismatchRate) { this.cQualityThreshold = cQualityThreshold; this.nextBaseQualityThreshold = nextBaseQualityThreshold; this.minReadLength = minReadLength; this.maxMismatchRate = maxMismatchRate; setup(accumulationLevels, samRgRecords); } @Override protected PerUnitMetricCollector<RrbsMetrics, Comparable<?>, SAMRecordAndReference> makeChildCollector(final String sample, final String library, final String readGroup) { return new PerUnitRrbsMetricsCollector(sample, library, readGroup); } private class PerUnitRrbsMetricsCollector implements PerUnitMetricCollector<RrbsMetrics, Comparable<?>, SAMRecordAndReference> { final String sample; final String library; final String readGroup; // Counters for CpG & non-CpG seen/converted sites int nCytoConverted = 0; int nCytoTotal = 0; final Histogram<CpgLocation> cpgTotal = new Histogram<CpgLocation>(); final Histogram<CpgLocation> cpgConverted = new Histogram<CpgLocation>(); // Counters for QC filters used in the final metrics int mappedRecordCount = 0; int smallReadCount = 0; int mismatchCount = 0; int noCpgCount = 0; // Final metrics calculated once all the reads are done double cytoConversionRate; double cpgConversionRate; int nCpgSeen; int nCpgConverted; double coverageMean; int coverageMedian; public PerUnitRrbsMetricsCollector(final String sample, final String library, final String readGroup) { this.sample = sample; this.library = library; this.readGroup = readGroup; } public void acceptRecord(final SAMRecordAndReference args) { mappedRecordCount++; final SAMRecord samRecord = args.getSamRecord(); final ReferenceSequence referenceSequence = args.getReferenceSequence(); final byte[] readBases = samRecord.getReadBases(); final byte[] readQualities = samRecord.getBaseQualities(); final byte[] refBases = referenceSequence.getBases(); if (samRecord.getReadLength() < minReadLength) { smallReadCount++; return; } else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) { mismatchCount++; return; } // We only record non-CpG C sites if there was at least one CpG in the read, keep track of // the values for this record and then apply to the global totals if valid int recordCpgs = 0; for (final AlignmentBlock alignmentBlock : samRecord.getAlignmentBlocks()) { final int blockLength = alignmentBlock.getLength(); final int refFragmentStart = alignmentBlock.getReferenceStart() - 1; final int readFragmentStart = alignmentBlock.getReadStart() - 1; final byte[] refFragment = getFragment(refBases, refFragmentStart, blockLength); final byte[] readFragment = getFragment(readBases, readFragmentStart, blockLength); final byte[] readQualityFragment = getFragment(readQualities, readFragmentStart, blockLength); if (samRecord.getReadNegativeStrandFlag()) { // In the case of a negative strand, reverse (and complement for base arrays) the reference, // reads & qualities so that it can be treated as a positive strand for the rest of the process SequenceUtil.reverseComplement(refFragment); SequenceUtil.reverseComplement(readFragment); SequenceUtil.reverseQualities(readQualityFragment); } for (int i=0; i < blockLength-1; i++) { final int curRefIndex = getCurRefIndex(refFragmentStart, blockLength, i, samRecord.getReadNegativeStrandFlag()); // Look at a 2-base window to see if we're on a CpG site, and if so check for conversion // (CG -> TG). We do not consider ourselves to be on a CpG site if we're on the last base of a read if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) && (SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) { // We want to catch the case where there's a CpG in the reference, even if it is not valid // to prevent the C showing up as a non-CpG C down below. Otherwise this could have been all // in one if statement if (isValidCpg(refFragment, readFragment, readQualityFragment, i)) { recordCpgs++; final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex); cpgTotal.increment(curLocation); if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) { cpgConverted.increment(curLocation); } } i++; } else if (isC(refFragment[i], readFragment[i]) && isAboveCytoQcThreshold(readQualities, i) && SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) { // C base in the reference that's not associated with a CpG nCytoTotal++; if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) { nCytoConverted++; } } } } if (recordCpgs == 0) { noCpgCount++; } } public void finish() { cytoConversionRate = nCytoTotal == 0 ? 0 : nCytoConverted / (double)nCytoTotal; nCpgSeen = (int)cpgTotal.getSumOfValues(); nCpgConverted = (int)cpgConverted.getSumOfValues(); cpgConversionRate = nCpgSeen == 0 ? 0 : nCpgConverted / (double)nCpgSeen; coverageMean = cpgTotal.getMeanBinSize(); coverageMedian = (int)cpgTotal.getMedianBinSize(); } @Override public void addMetricsToFile(final MetricsFile<RrbsMetrics, Comparable<?>> metricsFile) { // Create both the summary and detail metrics & add them to the RrbsMetrics container class for // the downstream code to use as desired final RrbsSummaryMetrics summaryMetrics = buildSummaryMetrics(); final List<RrbsCpgDetailMetrics> detailMetrics = buildDetailMetrics(); final RrbsMetrics rrbsMetrics = new RrbsMetrics(summaryMetrics, detailMetrics); metricsFile.addMetric(rrbsMetrics); } private RrbsSummaryMetrics buildSummaryMetrics() { final RrbsSummaryMetrics summaryMetrics = new RrbsSummaryMetrics(); summaryMetrics.SAMPLE = sample; summaryMetrics.READ_GROUP = readGroup; summaryMetrics.LIBRARY = library; summaryMetrics.READS_ALIGNED = mappedRecordCount; summaryMetrics.NON_CPG_BASES = nCytoTotal; summaryMetrics.NON_CPG_CONVERTED_BASES = nCytoConverted; summaryMetrics.PCT_NON_CPG_BASES_CONVERTED = cytoConversionRate; summaryMetrics.CPG_BASES_SEEN = nCpgSeen; summaryMetrics.CPG_BASES_CONVERTED = nCpgConverted; summaryMetrics.PCT_CPG_BASES_CONVERTED = cpgConversionRate; summaryMetrics.MEAN_CPG_COVERAGE = coverageMean; summaryMetrics.MEDIAN_CPG_COVERAGE = coverageMedian; summaryMetrics.READS_IGNORED_SHORT = smallReadCount; summaryMetrics.READS_WITH_NO_CPG = noCpgCount; summaryMetrics.READS_IGNORED_MISMATCHES = mismatchCount; return summaryMetrics; } private List<RrbsCpgDetailMetrics> buildDetailMetrics() { final List<RrbsCpgDetailMetrics> detailMetrics = new ArrayList<RrbsCpgDetailMetrics>(); for (final CpgLocation key : cpgTotal.keySet()) { final RrbsCpgDetailMetrics cpgMetric = new RrbsCpgDetailMetrics(); cpgMetric.SAMPLE = sample; cpgMetric.READ_GROUP = readGroup; cpgMetric.LIBRARY = library; cpgMetric.SEQUENCE_NAME = key.getSequence(); cpgMetric.POSITION = key.getPosition(); cpgMetric.TOTAL_SITES = (int)cpgTotal.get(key).getValue(); cpgMetric.CONVERTED_SITES = cpgConverted.containsKey(key) ? (int)cpgConverted.get(key).getValue() : 0; cpgMetric.PCT_CONVERTED = cpgMetric.CONVERTED_SITES == 0 ? 0 : cpgMetric.CONVERTED_SITES / (double)cpgMetric.TOTAL_SITES; detailMetrics.add(cpgMetric); } return detailMetrics; } } private byte[] getFragment(final byte[] fullArray, final int fragmentStart, final int length) { return Arrays.copyOfRange(fullArray, fragmentStart, fragmentStart + length); } /** * True if there's a C in the reference as well as read (possibly bisulfite converted) */ private boolean isC(final byte refBase, final byte readBase) { return (SequenceUtil.basesEqual(refBase, SequenceUtil.C) && SequenceUtil.bisulfiteBasesEqual(readBase, refBase)); } /** * Checks a pair of bases for CpG status & quality thresholds */ private boolean isValidCpg(final byte[] refBases, final byte[] readBases, final byte[] readQualities, final int index) { return isC(refBases[index], readBases[index]) && SequenceUtil.basesEqual(refBases[index+1], readBases[index+1]) && isAboveCytoQcThreshold(readQualities, index); } /** * Any cyto base (CpG context or not) needs to be above a specified quality threshold. Similarly, the neighboring * base on the 3' side must also be above another threshold unless the cyto base is the final base in the 3' * direction. */ private boolean isAboveCytoQcThreshold(final byte[] readQualities, final int index) { return ((index < readQualities.length - 1) && (readQualities[index] >= cQualityThreshold) && (readQualities[index+1] >= nextBaseQualityThreshold)); } /** * Accounts for the fact that negative strand counts have been reversed */ private int getCurRefIndex(final int refStart, final int blockLength, final int idx, final boolean isNegative) { return isNegative ? refStart + (blockLength - 1) - idx - 1 : refStart + idx; } } /** * Used to keep track of the location of CpG sites */ class CpgLocation implements Comparable<CpgLocation> { private final String sequence; private final Integer position; public CpgLocation(final String sequence, final int position) { this.sequence = sequence; this.position = position; } @Override public int compareTo(final CpgLocation other) { final int seqComp = sequence.compareTo(other.sequence); return seqComp == 0 ? position.compareTo(other.position) : seqComp; } @Override public boolean equals(final Object other) { if (this == other) { return true; } if (other == null || getClass() != other.getClass()) { return false; } final CpgLocation that = (CpgLocation)other; return (sequence.equals(that.sequence)) && (position.equals(that.position)); } @Override public int hashCode() { int result = sequence.hashCode(); result = 31 * result + position; return result; } public String getSequence() { return sequence; } public Integer getPosition() { return position; } }