/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.analysis; import htsjdk.samtools.Defaults; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamPairUtil.PairOrientation; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.Metrics; import java.io.File; import java.util.EnumSet; import java.util.List; import java.util.Set; /** * A command line tool to read a BAM file and produce standard alignment metrics that would be applicable to any alignment. * Metrics to include, but not limited to: * <ul> * <li>Total number of reads (total, period, no exclusions)</li> * <li>Total number of PF reads (PF == does not fail vendor check flag)</li> * <li>Number of PF noise reads (does not fail vendor check and has noise attr set)</li> * <li>Total aligned PF reads (any PF read that has a sequence and position)</li> * <li>High quality aligned PF reads (high quality == mapping quality >= 20)</li> * <li>High quality aligned PF bases (actual aligned bases, calculate off alignment blocks)</li> * <li>High quality aligned PF Q20 bases (subset of above where base quality >= 20)</li> * <li>Median mismatches in HQ aligned PF reads (how many aligned bases != ref on average)</li> * <li>Reads aligned in pairs (vs. reads aligned with mate unaligned/not present)</li> * <li>Read length (how to handle mixed lengths?)</li> * <li>Bad Cycles - how many machine cycles yielded combined no-call and mismatch rates of >= 80%</li> * <li>Strand balance - reads mapped to positive strand / total mapped reads</li> * </ul> * Metrics are written for the first read of a pair, the second read, and combined for the pair. * * Chimeras are identified if any of the of following criteria are met: * <ul> * <li>the insert size is larger than MAX_INSERT_SIZE</li> * <li>the ends of a pair map to different contigs</li> * <li>the paired end orientation is different that the expected orientation</li> * <li>the read contains an SA tag (chimeric alignment)</li> * </ul> * * @author Doug Voet (dvoet at broadinstitute dot org) */ @CommandLineProgramProperties( usage = CollectAlignmentSummaryMetrics.USAGE_SUMMARY + CollectAlignmentSummaryMetrics.USAGE_DETAILS, usageShort = CollectAlignmentSummaryMetrics.USAGE_SUMMARY, programGroup = Metrics.class ) public class CollectAlignmentSummaryMetrics extends SinglePassSamProgram { static final String USAGE_SUMMARY = "<b>Produces a summary of alignment metrics from a SAM or BAM file.</b> "; static final String USAGE_DETAILS = "This tool takes a SAM/BAM file input and produces metrics detailing the quality of the read " + "alignments as well as the proportion of the reads that passed machine signal-to-noise threshold quality filters. " + "Note that these quality filters are specific to Illumina data; for additional information, please see the corresponding " + "<a href='https://www.broadinstitute.org/gatk/guide/article?id=6329'>GATK Dictionary entry</a>. </p>" + "" + "<p>Note: Metrics labeled as percentages are actually expressed as fractions!</p>" + "<h4>Usage example:</h4>" + "<pre>" + " java -jar picard.jar CollectAlignmentSummaryMetrics \\<br />" + " R=reference_sequence.fasta \\<br />" + " I=input.bam \\<br />" + " O=output.txt" + "</pre>"+ "<p>Please see the CollectAlignmentSummaryMetrics " + "<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#AlignmentSummaryMetrics'>definitions</a> " + "for a complete description of the metrics produced by this tool.</p>" + "<hr />"; private static final Log log = Log.getInstance(CollectAlignmentSummaryMetrics.class); @Option(doc="Paired-end reads above this insert size will be considered chimeric along with inter-chromosomal pairs.") public int MAX_INSERT_SIZE = ChimeraUtil.DEFAULT_INSERT_SIZE_LIMIT; @Option(doc="Paired-end reads that do not have this expected orientation will be considered chimeric.") public Set<PairOrientation> EXPECTED_PAIR_ORIENTATIONS = EnumSet.copyOf(ChimeraUtil.DEFAULT_EXPECTED_ORIENTATIONS); @Option(doc="List of adapter sequences to use when processing the alignment metrics.") public List<String> ADAPTER_SEQUENCE = AdapterUtility.DEFAULT_ADAPTER_SEQUENCE; @Option(shortName="LEVEL", doc="The level(s) at which to accumulate metrics.") public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS); @Option(shortName="BS", doc="Whether the SAM or BAM file consists of bisulfite sequenced reads.") public boolean IS_BISULFITE_SEQUENCED = false; //overridden to make it visible on the commandline and to change the doc. @Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "Reference sequence file. Note that while this argument isn't required, without it only a small subset of the metrics will be calculated. Note also that if a reference sequence is provided, it must be accompanied by a sequence dictionary.", optional = true, overridable = true) public File REFERENCE_SEQUENCE = Defaults.REFERENCE_FASTA; private AlignmentSummaryMetricsCollector collector; /** Required main method implementation. */ public static void main(final String[] argv) { new CollectAlignmentSummaryMetrics().instanceMainWithExit(argv); } /** Silly method that is necessary to give unit test access to call doWork() */ protected final int testDoWork() { return doWork(); } @Override protected void setup(final SAMFileHeader header, final File samFile) { IOUtil.assertFileIsWritable(OUTPUT); if (header.getSequenceDictionary().isEmpty()) { log.warn(INPUT.getAbsoluteFile() + " has no sequence dictionary. If any reads " + "in the file are aligned, then alignment summary metrics collection will fail."); } final boolean doRefMetrics = REFERENCE_SEQUENCE != null; collector = new AlignmentSummaryMetricsCollector(METRIC_ACCUMULATION_LEVEL, header.getReadGroups(), doRefMetrics, ADAPTER_SEQUENCE, MAX_INSERT_SIZE, EXPECTED_PAIR_ORIENTATIONS, IS_BISULFITE_SEQUENCED); } @Override protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) { collector.acceptRecord(rec, ref); } @Override protected void finish() { collector.finish(); final MetricsFile<AlignmentSummaryMetrics, Comparable<?>> file = getMetricsFile(); collector.addAllLevelsToFile(file); file.write(OUTPUT); } }