package picard.analysis.directed;

import htsjdk.samtools.AlignmentBlock;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.metrics.MetricsFile;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.OverlapDetector;
import htsjdk.samtools.util.SequenceUtil;
import picard.PicardException;
import picard.analysis.MetricAccumulationLevel;
import picard.analysis.RnaSeqMetrics;
import picard.annotation.Gene;
import picard.annotation.LocusFunction;
import picard.metrics.PerUnitMetricCollector;
import picard.metrics.SAMRecordMultiLevelCollector;
import picard.util.MathUtil;

import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Collects RnaSeqMetrics at the requested accumulation levels (all reads, sample, library, read group)
 * by classifying each aligned PF base as coding, UTR, intronic, intergenic or ribosomal, and optionally
 * accumulating per-transcript coverage for the normalized-coverage histogram.
 */
public class RnaSeqMetricsCollector extends SAMRecordMultiLevelCollector<RnaSeqMetrics, Integer> {

    public enum StrandSpecificity {NONE, FIRST_READ_TRANSCRIPTION_STRAND, SECOND_READ_TRANSCRIPTION_STRAND}

    private final int minimumLength;
    private final StrandSpecificity strandSpecificity;
    private final double rrnaFragmentPercentage;
    private final Long ribosomalInitialValue;
    private final Set<Integer> ignoredSequenceIndices;
    private final OverlapDetector<Gene> geneOverlapDetector;
    private final OverlapDetector<Interval> ribosomalSequenceOverlapDetector;
    private final boolean collectCoverageStatistics;

    public RnaSeqMetricsCollector(final Set<MetricAccumulationLevel> accumulationLevels,
                                  final List<SAMReadGroupRecord> samRgRecords,
                                  final Long ribosomalBasesInitialValue,
                                  final OverlapDetector<Gene> geneOverlapDetector,
                                  final OverlapDetector<Interval> ribosomalSequenceOverlapDetector,
                                  final HashSet<Integer> ignoredSequenceIndices,
                                  final int minimumLength,
                                  final StrandSpecificity strandSpecificity,
                                  final double rrnaFragmentPercentage,
                                  final boolean collectCoverageStatistics) {
        this.ribosomalInitialValue = ribosomalBasesInitialValue;
        this.ignoredSequenceIndices = ignoredSequenceIndices;
        this.geneOverlapDetector = geneOverlapDetector;
        this.ribosomalSequenceOverlapDetector = ribosomalSequenceOverlapDetector;
        this.minimumLength = minimumLength;
        this.strandSpecificity = strandSpecificity;
        this.rrnaFragmentPercentage = rrnaFragmentPercentage;
        this.collectCoverageStatistics = collectCoverageStatistics;
        setup(accumulationLevels, samRgRecords);
    }

    @Override
    protected PerUnitMetricCollector<RnaSeqMetrics, Integer, SAMRecord> makeChildCollector(final String sample, final String library, final String readGroup) {
        return new PerUnitRnaSeqMetricsCollector(sample, library, readGroup, ribosomalInitialValue);
    }

    public static OverlapDetector<Interval> makeOverlapDetector(final File samFile, final SAMFileHeader header, final File ribosomalIntervalsFile) {
        final OverlapDetector<Interval> ribosomalSequenceOverlapDetector = new OverlapDetector<Interval>(0, 0);
        if (ribosomalIntervalsFile != null) {
            final IntervalList ribosomalIntervals = IntervalList.fromFile(ribosomalIntervalsFile);
            try {
                SequenceUtil.assertSequenceDictionariesEqual(header.getSequenceDictionary(), ribosomalIntervals.getHeader().getSequenceDictionary());
            } catch (SequenceUtil.SequenceListsDifferException e) {
                throw new PicardException("Sequence dictionaries differ in " + samFile.getAbsolutePath() + " and " + ribosomalIntervalsFile.getAbsolutePath(), e);
            }
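            // Collapse the interval list to unique, non-overlapping intervals before loading the detector.
            // This likely matters because acceptRecord() takes the longest overlap with any single ribosomal
            // interval, so merging overlapping intervals avoids undercounting a fragment's ribosomal overlap.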
            ribosomalIntervals.unique();
            final List<Interval> intervals = ribosomalIntervals.getIntervals();
            ribosomalSequenceOverlapDetector.addAll(intervals, intervals);
        }
        return ribosomalSequenceOverlapDetector;
    }

    public static HashSet<Integer> makeIgnoredSequenceIndicesSet(final SAMFileHeader header, final Set<String> ignoredSequence) {
        final HashSet<Integer> ignoredSequenceIndices = new HashSet<Integer>();
        for (final String sequenceName : ignoredSequence) {
            final SAMSequenceRecord sequenceRecord = header.getSequence(sequenceName);
            if (sequenceRecord == null) {
                throw new PicardException("Unrecognized sequence " + sequenceName + " passed as argument to IGNORE_SEQUENCE");
            }
            ignoredSequenceIndices.add(sequenceRecord.getSequenceIndex());
        }
        return ignoredSequenceIndices;
    }

    private class PerUnitRnaSeqMetricsCollector implements PerUnitMetricCollector<RnaSeqMetrics, Integer, SAMRecord> {

        final RnaSeqMetrics metrics = new RnaSeqMetrics();
        private final Map<Gene.Transcript, int[]> coverageByTranscript = new HashMap<Gene.Transcript, int[]>();

        public PerUnitRnaSeqMetricsCollector(final String sample,
                                             final String library,
                                             final String readGroup,
                                             final Long ribosomalBasesInitialValue) {
            this.metrics.SAMPLE = sample;
            this.metrics.LIBRARY = library;
            this.metrics.READ_GROUP = readGroup;
            this.metrics.RIBOSOMAL_BASES = ribosomalBasesInitialValue;
        }

        public void acceptRecord(SAMRecord rec) {
            // Filter out some reads, and collect the total number of PF bases
            if (rec.getReadFailsVendorQualityCheckFlag() || rec.isSecondaryOrSupplementary()) return;

            this.metrics.PF_BASES += rec.getReadLength();
            if (rec.getReadUnmappedFlag()) return;

            if (ignoredSequenceIndices.contains(rec.getReferenceIndex())) {
                ++this.metrics.IGNORED_READS;
                return;
            }

            // Grab information about the alignment and overlapping genes etc.
            final Interval readInterval = new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd());

            // Attempt to get an interval for the entire fragment (if paired read) else just use the read itself.
            // If paired read is chimeric or has one end unmapped, don't create an interval.
            final Interval fragmentInterval;
            if (!rec.getReadPairedFlag()) {
                fragmentInterval = readInterval;
            } else if (rec.getMateUnmappedFlag() || !rec.getReferenceIndex().equals(rec.getMateReferenceIndex())) {
                // Mate is unmapped or on another contig. Note: reference indices are boxed Integers, so they
                // are compared with equals() rather than '!=', which would compare object identity.
                fragmentInterval = null;
            } else {
                final int fragmentStart = Math.min(rec.getAlignmentStart(), rec.getMateAlignmentStart());
                final int fragmentEnd = CoordMath.getEnd(fragmentStart, Math.abs(rec.getInferredInsertSize()));
                fragmentInterval = new Interval(rec.getReferenceName(), fragmentStart, fragmentEnd);
            }

            if (fragmentInterval != null) {
                final Collection<Interval> overlappingRibosomalIntervals = ribosomalSequenceOverlapDetector.getOverlaps(fragmentInterval);
                int intersectionLength = 0;
                for (final Interval overlappingInterval : overlappingRibosomalIntervals) {
                    final int thisIntersectionLength = overlappingInterval.getIntersectionLength(fragmentInterval);
                    intersectionLength = Math.max(intersectionLength, thisIntersectionLength);
                }

                if (intersectionLength / (double) fragmentInterval.length() >= rrnaFragmentPercentage) {
                    // Assume entire read is ribosomal.
                    // TODO: Should count reads, not bases?
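                    // Illustrative example: with rrnaFragmentPercentage = 0.8, a 300 bp fragment is classified
                    // as ribosomal only if its longest overlap with a single ribosomal interval is >= 240 bp.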
                    metrics.RIBOSOMAL_BASES += rec.getReadLength();
                    int numAlignedBases = 0;
                    for (final AlignmentBlock alignmentBlock : rec.getAlignmentBlocks()) {
                        numAlignedBases += alignmentBlock.getLength();
                    }
                    metrics.PF_ALIGNED_BASES += numAlignedBases;
                    return;
                }
            }

            final Collection<Gene> overlappingGenes = geneOverlapDetector.getOverlaps(readInterval);
            final List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
            boolean overlapsExon = false;

            for (final AlignmentBlock alignmentBlock : alignmentBlocks) {
                // Get functional class for each position in the alignment block.
                final LocusFunction[] locusFunctions = new LocusFunction[alignmentBlock.getLength()];

                // By default, if base does not overlap with rRNA or gene, it is intergenic.
                Arrays.fill(locusFunctions, 0, locusFunctions.length, LocusFunction.INTERGENIC);

                for (final Gene gene : overlappingGenes) {
                    for (final Gene.Transcript transcript : gene) {
                        transcript.assignLocusFunctionForRange(alignmentBlock.getReferenceStart(), locusFunctions);

                        // Add coverage to our coverage counter for this transcript. Coverage is only gathered
                        // when collectCoverageStatistics is true; the guard was added because tracking
                        // per-transcript coverage for many units [samples/read groups] can overwhelm memory.
                        if (collectCoverageStatistics) {
                            int[] coverage = this.coverageByTranscript.get(transcript);
                            if (coverage == null) {
                                coverage = new int[transcript.length()];
                                this.coverageByTranscript.put(transcript, coverage);
                            }
                            transcript.addCoverageCounts(alignmentBlock.getReferenceStart(),
                                    CoordMath.getEnd(alignmentBlock.getReferenceStart(), alignmentBlock.getLength()),
                                    coverage);
                        }
                    }
                }

                // Tally the function of each base in the alignment block.
                for (final LocusFunction locusFunction : locusFunctions) {
                    ++metrics.PF_ALIGNED_BASES;
                    switch (locusFunction) {
                        case INTERGENIC: ++metrics.INTERGENIC_BASES; break;
                        case INTRONIC:   ++metrics.INTRONIC_BASES;   break;
                        case UTR:        ++metrics.UTR_BASES;        overlapsExon = true; break;
                        case CODING:     ++metrics.CODING_BASES;     overlapsExon = true; break;
                        case RIBOSOMAL:  ++metrics.RIBOSOMAL_BASES;  break;
                    }
                }
            }

            // Strand-specificity is tallied on a read basis rather than a base at a time. A read that aligns to
            // more than one gene is not counted.
            if (overlapsExon && strandSpecificity != StrandSpecificity.NONE && overlappingGenes.size() == 1) {
                final boolean negativeTranscriptionStrand = overlappingGenes.iterator().next().isNegativeStrand();
                final boolean negativeReadStrand = rec.getReadNegativeStrandFlag();
                final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand;
                final boolean readOneOrUnpaired = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag();
                final boolean firstReadExpectedToAgree = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND;
                final boolean thisReadExpectedToAgree = readOneOrUnpaired == firstReadExpectedToAgree;
                // If the read strand is the same as the strand of the transcript, and the end is the one that is
                // supposed to agree, then the strand specificity for this read is correct.
                // -- OR --
                // If the read strand is not the same as the strand of the transcript, and the end is not the one
                // that is supposed to agree, then the strand specificity for this read is correct.
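                // Example: with FIRST_READ_TRANSCRIPTION_STRAND, read 1 (or an unpaired read) counts as correct
                // when its strand matches the transcript strand, while read 2 counts as correct when its strand
                // is opposite the transcript strand; SECOND_READ_TRANSCRIPTION_STRAND reverses these expectations.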
                if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) {
                    ++metrics.CORRECT_STRAND_READS;
                } else {
                    ++metrics.INCORRECT_STRAND_READS;
                }
            }
        }

        public void finish() {
            if (metrics.PF_ALIGNED_BASES > 0) {
                if (metrics.RIBOSOMAL_BASES != null) {
                    metrics.PCT_RIBOSOMAL_BASES = metrics.RIBOSOMAL_BASES / (double) metrics.PF_ALIGNED_BASES;
                }
                metrics.PCT_CODING_BASES = metrics.CODING_BASES / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_UTR_BASES = metrics.UTR_BASES / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_INTRONIC_BASES = metrics.INTRONIC_BASES / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_INTERGENIC_BASES = metrics.INTERGENIC_BASES / (double) metrics.PF_ALIGNED_BASES;
                metrics.PCT_MRNA_BASES = metrics.PCT_CODING_BASES + metrics.PCT_UTR_BASES;
                metrics.PCT_USABLE_BASES = (metrics.CODING_BASES + metrics.UTR_BASES) / (double) metrics.PF_BASES;
            }

            if (metrics.CORRECT_STRAND_READS > 0 || metrics.INCORRECT_STRAND_READS > 0) {
                metrics.PCT_CORRECT_STRAND_READS = metrics.CORRECT_STRAND_READS / (double) (metrics.CORRECT_STRAND_READS + metrics.INCORRECT_STRAND_READS);
            }
        }

        @Override
        public void addMetricsToFile(final MetricsFile<RnaSeqMetrics, Integer> file) {
            // Compute metrics based on coverage of top 1000 genes
            final Histogram<Integer> normalizedCovByPos = computeCoverageMetrics();
            file.addMetric(metrics);
            file.addHistogram(normalizedCovByPos);
        }

        /**
         * Computes a set of coverage-based metrics on the most highly expressed genes'
         * most highly expressed transcripts.
         */
        private Histogram<Integer> computeCoverageMetrics() {
            final Histogram<Double> cvs = new Histogram<Double>();
            final Histogram<Double> fivePrimeSkews = new Histogram<Double>();
            final Histogram<Double> threePrimeSkews = new Histogram<Double>();
            final Histogram<Double> gapBasesPerKb = new Histogram<Double>();
            final Histogram<Double> fiveToThreeSkews = new Histogram<Double>();

            String prefix = null;
            if (this.metrics.READ_GROUP != null) {
                prefix = this.metrics.READ_GROUP + ".";
            } else if (this.metrics.LIBRARY != null) {
                prefix = this.metrics.LIBRARY + ".";
            } else if (this.metrics.SAMPLE != null) {
                prefix = this.metrics.SAMPLE + ".";
            } else {
                prefix = "All_Reads.";
            }

            final Histogram<Integer> normalizedCoverageByNormalizedPosition = new Histogram<Integer>("normalized_position", prefix + "normalized_coverage");

            final Map<Gene.Transcript, int[]> transcripts = pickTranscripts(coverageByTranscript);
            final double transcriptCount = transcripts.size();

            for (final Map.Entry<Gene.Transcript, int[]> entry : transcripts.entrySet()) {
                final Gene.Transcript tx = entry.getKey();
                final double[] coverage;
                {
                    final double[] tmp = MathUtil.promote(entry.getValue());
                    if (tx.getGene().isPositiveStrand()) coverage = tmp;
                    else coverage = copyAndReverse(tmp);
                }
                final double mean = MathUtil.mean(coverage, 0, coverage.length);

                // Calculate the CV of coverage for this tx
                final double stdev = MathUtil.stddev(coverage, 0, coverage.length, mean);
                final double cv = stdev / mean;
                cvs.increment(cv);

                // Calculate the 5' and 3' biases
                {
                    final int PRIME_BASES = 100;
                    final double fivePrimeCoverage = MathUtil.mean(coverage, 0, PRIME_BASES);
                    final double threePrimeCoverage = MathUtil.mean(coverage, coverage.length - PRIME_BASES, coverage.length);
                    fivePrimeSkews.increment(fivePrimeCoverage / mean);
                    threePrimeSkews.increment(threePrimeCoverage / mean);
                    fiveToThreeSkews.increment(fivePrimeCoverage / threePrimeCoverage);
                }

                // Calculate normalized coverage vs. normalized position
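                // Each integer bin 0..100 averages coverage over a window of +/- 0.5% of the transcript length
                // centered at that normalized position, divides by the transcript's mean coverage, and adds
                // 1/transcriptCount of the result, so the histogram is an average over all selected transcripts.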
                {
                    final int lastIndex = coverage.length - 1;

                    for (int percent = 0; percent <= 100; ++percent) {
                        final double p = percent / 100d;
                        final int start = (int) Math.max(0, lastIndex * (p - 0.005));
                        final int end = (int) Math.min(lastIndex, lastIndex * (p + 0.005));
                        final int length = end - start + 1;

                        double sum = 0;
                        for (int i = start; i <= end; ++i) sum += coverage[i];
                        final double normalized = (sum / length) / mean;
                        normalizedCoverageByNormalizedPosition.increment(percent, normalized / transcriptCount);
                    }
                }

                // Calculate gap bases per kilobase
                // {
                //     int gapBases = 0;
                //     final double minCoverage = mean * 0.1;
                //     for (int i=0; i<coverage.length; ++i) {
                //         if (coverage[i] < minCoverage) ++gapBases;
                //     }
                //     gapBasesPerKb.increment(gapBases / (coverage.length / 1000d));
                // }
            }

            this.metrics.MEDIAN_CV_COVERAGE = cvs.getMedian();
            this.metrics.MEDIAN_5PRIME_BIAS = fivePrimeSkews.getMedian();
            this.metrics.MEDIAN_3PRIME_BIAS = threePrimeSkews.getMedian();
            this.metrics.MEDIAN_5PRIME_TO_3PRIME_BIAS = fiveToThreeSkews.getMedian();

            return normalizedCoverageByNormalizedPosition;
        }

        /** Little method to copy an array and reverse it at the same time. */
        private double[] copyAndReverse(final double[] in) {
            final double[] out = new double[in.length];
            for (int i = 0, j = in.length - 1; i < in.length; ++i, --j) out[j] = in[i];
            return out;
        }

        /** Picks the set of transcripts on which the coverage metrics are to be calculated. */
        public Map<Gene.Transcript, int[]> pickTranscripts(final Map<Gene.Transcript, int[]> transcriptCoverage) {
            final Map<Gene.Transcript, Double> bestPerGene = new HashMap<Gene.Transcript, Double>();

            // Make a map of the best transcript per gene to its mean coverage
            for (final Gene gene : geneOverlapDetector.getAll()) {
                Gene.Transcript best = null;
                double bestMean = 0;

                for (final Gene.Transcript tx : gene) {
                    final int[] cov = transcriptCoverage.get(tx);

                    if (tx.length() < Math.max(minimumLength, 100)) continue;
                    if (cov == null) continue;

                    final double mean = MathUtil.mean(MathUtil.promote(cov), 0, cov.length);
                    if (mean < 1d) continue;
                    if (best == null || mean > bestMean) {
                        best = tx;
                        bestMean = mean;
                    }
                }

                if (best != null) bestPerGene.put(best, bestMean);
            }

            // Find the 1000th best coverage value
            final double[] coverages = new double[bestPerGene.size()];
            int i = 0;
            for (final double d : bestPerGene.values()) coverages[i++] = d;
            Arrays.sort(coverages);
            final double min = coverages.length == 0 ? 0 : coverages[Math.max(0, coverages.length - 1001)];

            // And finally build the output map
            final Map<Gene.Transcript, int[]> retval = new HashMap<Gene.Transcript, int[]>();
            for (final Map.Entry<Gene.Transcript, Double> entry : bestPerGene.entrySet()) {
                final Gene.Transcript tx = entry.getKey();
                final double coverage = entry.getValue();

                if (coverage >= min) {
                    retval.put(tx, transcriptCoverage.get(tx));
                }
            }

            return retval;
        }
    }
}
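// Illustrative usage sketch (not part of this class): one plausible way a caller such as CollectRnaSeqMetrics
// might wire the collector up. It assumes the MultiLevelCollector base class exposes acceptRecord(SAMRecord,
// ReferenceSequence), finish() and addAllLevelsToFile(MetricsFile), and that a gene OverlapDetector is built
// elsewhere (e.g. from a refFlat file). Names such as reader, samFile, ribosomalIntervalsFile and
// geneOverlapDetector are placeholders, not fields of this class.
//
//     final SAMFileHeader header = reader.getFileHeader();
//     final OverlapDetector<Interval> rRnaDetector =
//             RnaSeqMetricsCollector.makeOverlapDetector(samFile, header, ribosomalIntervalsFile);
//     final HashSet<Integer> ignored =
//             RnaSeqMetricsCollector.makeIgnoredSequenceIndicesSet(header, Collections.singleton("MT"));
//     final RnaSeqMetricsCollector collector = new RnaSeqMetricsCollector(
//             Collections.singleton(MetricAccumulationLevel.ALL_READS), header.getReadGroups(),
//             0L, geneOverlapDetector, rRnaDetector, ignored,
//             500, RnaSeqMetricsCollector.StrandSpecificity.NONE, 0.8, true);
//     for (final SAMRecord rec : reader) collector.acceptRecord(rec, null);
//     collector.finish();
//     final MetricsFile<RnaSeqMetrics, Integer> out = new MetricsFile<RnaSeqMetrics, Integer>();
//     collector.addAllLevelsToFile(out);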