/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.illumina; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.Log; import picard.PicardException; import picard.analysis.SinglePassSamProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Illumina; import java.io.File; /** * Program to collect Coverage Summary Metrics of SAM files for data sequenced by Illumina products. * It uses our best approximation of the filters that Illumina uses which means: * <p/> * 1. Examine PF reads only * 2. Examine reads that are not marked as duplicate only * 3. Examine mapped reads only (it is unclear if Illumina does this or not, but without this we cannot do the next point) * 4. Only count bases that are present in two of mated reads, once. (For this we need TLEN from the sam record, which required that reads are mapped) * * Program assumes that all reads in the SAM file are of equal length. * * @author farjoun@broadinstitute.org */ @CommandLineProgramProperties( usage = "Program to collect Coverage Summary Metrics of SAM files for data sequenced by Illumina products. " + "It uses our best approximation of the filters that Illumina uses which means:\n\n" + "1. Examine PF reads only" + "2. Examine reads that are not marked as duplicate only" + "3. Examine mapped reads only (it is unclear if Illumina does this or not, but without this we cannot do the next point)" + "4. Only count bases that are present in two of mated reads, once. (For this we need TLEN from the sam record, which required that reads are mapped)" + "\nProgram assumes that all reads in the SAM file are of equal length, and that if a read is marked as \"mated\", its mate will really exist.\n", usageShort = "Collects summary metrics according to Illumina specifications.", programGroup = Illumina.class ) public class CollectIlluminaSummaryMetrics extends SinglePassSamProgram { @Option(doc = "IntervalList describing the target interval of the sequencing experiment (for calculating average coverage). " + "Use null to use reference from BAM as interval (whole genome)", optional = true) public File TARGET_REGION = null; private final IlluminaSummaryMetrics metrics = new IlluminaSummaryMetrics(); private static final Log log = Log.getInstance(CollectIlluminaSummaryMetrics.class); private Integer readLength = 0; /** Stock main method for a command line program. */ public static void main(final String[] argv) { new CollectIlluminaSummaryMetrics().instanceMainWithExit(argv); } @Override protected void setup(final SAMFileHeader header, final File samFile) { if (TARGET_REGION == null) { metrics.TARGET_TERRITORY = header.getSequenceDictionary().getReferenceLength(); } else { IOUtil.assertFileIsReadable(TARGET_REGION); IOUtil.assertFileIsWritable(OUTPUT); final IntervalList targetInterval = IntervalList.fromFile(TARGET_REGION); metrics.TARGET_TERRITORY = targetInterval.getUniqueBaseCount(); } } @Override protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) { if (readLength == 0) { readLength = rec.getReadLength(); } else { if (rec.getReadLength() != readLength) { final String message=String.format("This program only works with uniform read lengths. First record had length %d. Current record %s, has length %d", readLength, rec.getReadName(), rec.getReadLength()); log.error(message); throw new PicardException(message); } } // Not interested in counting unmapped, secondary or supplemental reads. if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary()) { return; } metrics.TOTAL_READS++; final int length = rec.getReadLength(); metrics.TOTAL_BASES += length; final boolean isPfRead = !rec.getReadFailsVendorQualityCheckFlag(); if (isPfRead) { metrics.PF_READS++; metrics.PF_BASES += length; } else { return; } // From here on read must have passed PF if (rec.getDuplicateReadFlag()) { metrics.DUPLICATE_READS++; return; } // Reads are not duplicated from here on. final int tLen = Math.abs(rec.getInferredInsertSize()); if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && !rec.getReadUnmappedFlag() && tLen != 0) { // If mated,both reads are mapped, and insert-size can be calculated, only examine first in pair, assume equal read lengths, and account for overlaps if (rec.getFirstOfPairFlag()) { // Since read is mated with both mate mapped, and insert-size available, we will only consider pair on first mate. if (rec.getReadNegativeStrandFlag() ^ rec.getMateNegativeStrandFlag()) { // If mated reads are in opposite directions: metrics.TOTAL_ILLUMINA_BASES += Math.min(2 * length, tLen); } else { // If mated reads are in same direction: metrics.TOTAL_ILLUMINA_BASES += length + Math.min(length, tLen); } } } else { // Otherwise add each read on its own. metrics.TOTAL_ILLUMINA_BASES += rec.getReadLength(); } } // Do not need unmapped reads at end of file. @Override protected boolean usesNoRefReads() { return false; } @Override protected void finish() { // Calculate some derived metrics metrics.READ_LENGTH = readLength; metrics.AVERAGE_ILMN_DEPTH = metrics.TOTAL_ILLUMINA_BASES / (double) metrics.TARGET_TERRITORY; // Output the file final MetricsFile<IlluminaSummaryMetrics, Integer> metricsFile = getMetricsFile(); metricsFile.addMetric(metrics); metricsFile.write(OUTPUT); } /** A set of metrics used to describe the general quality of a BAM file */ public static class IlluminaSummaryMetrics extends MetricBase { /** The total number of reads in the input file */ public int TOTAL_READS = 0; /** The number of reads that are PF - pass filter */ public int PF_READS = 0; /** The number of duplicate, PF reads */ public int DUPLICATE_READS = 0; /** The total number of bases in all reads */ public long TOTAL_BASES; /** The total number of bases in all PF reads */ public long PF_BASES = 0; /** The number of bases from PF, unique (non-duplicate) reads, only counting for overlapping bases once */ public long TOTAL_ILLUMINA_BASES = 0; /** The average depth considering only bases from PF, unique (non-duplicate) reads, only counting for overlapping bases once */ public double AVERAGE_ILMN_DEPTH = 0; /** The read length */ public long READ_LENGTH = 0; /** The size of the target region */ public long TARGET_TERRITORY = 0; public IlluminaSummaryMetrics() {} public IlluminaSummaryMetrics(final int TOTAL_READS, final int PF_READS, final int DUPLICATE_READS, final long TOTAL_BASES, final long PF_BASES, final long TOTAL_ILLUMINA_BASES, final double AVERAGE_ILMN_DEPTH, final long READ_LENGTH, final long TARGET_TERRITORY) { this.TOTAL_READS = TOTAL_READS; this.PF_READS = PF_READS; this.DUPLICATE_READS = DUPLICATE_READS; this.TOTAL_BASES = TOTAL_BASES; this.PF_BASES = PF_BASES; this.TOTAL_ILLUMINA_BASES = TOTAL_ILLUMINA_BASES; this.AVERAGE_ILMN_DEPTH = AVERAGE_ILMN_DEPTH; this.READ_LENGTH = READ_LENGTH; this.TARGET_TERRITORY = TARGET_TERRITORY; } } }