/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.analysis; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.IOUtil; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Metrics; import picard.metrics.GcBiasMetrics; import picard.util.RExecutor; import java.io.File; import java.text.NumberFormat; import java.util.List; import java.util.Set; /** * Tool to collect information about GC bias in the reads in a given BAM file. Computes * the number of windows (of size specified by SCAN_WINDOW_SIZE) in the genome at each GC% * and counts the number of read starts in each GC bin. What is output and plotted is * the "normalized coverage" in each bin - i.e. the number of reads per window normalized * to the average number of reads per window across the whole genome. * * @author Tim Fennell * edited by Kylee Bergin */ @CommandLineProgramProperties( usage = CollectGcBiasMetrics.USAGE_SUMMARY + CollectGcBiasMetrics.USAGE_DETAILS, usageShort = CollectGcBiasMetrics.USAGE_SUMMARY, programGroup = Metrics.class ) public class CollectGcBiasMetrics extends SinglePassSamProgram { static final String USAGE_SUMMARY = "Collect metrics regarding GC bias. "; static final String USAGE_DETAILS = "This tool collects information about the relative proportions of guanine (G) and cytosine (C)" + " nucleotides in a sample. Regions of high and low G + C content have been shown to interfere with mapping/aligning," + " ultimately leading to fragmented genome assemblies and poor coverage in a phenomenon known as 'GC bias'. " + "Detailed information on the effects of GC bias on the collection and analysis of sequencing data can be found at " + "DOI: 10.1371/journal.pone.0062856/.<br /><br />" + "" + "<p>The GC bias statistics are always output in a detailed long-form version, but a summary can also be produced. Both the " + "detailed metrics and the summary metrics are output as tables '.txt' files) and an accompanying chart that plots the " + "data ('.pdf' file). </p> " + "" + "<h4>Detailed metrics</h4>" + "The table of detailed metrics includes GC percentages for each bin (GC), the percentage of WINDOWS corresponding to each " + "GC bin of the reference sequence, the numbers of reads that start within a particular %GC content bin (READ_STARTS), and " + "the mean base quality of the reads that correspond to a specific GC content distribution window (MEAN_BASE_QUALITY). " + "NORMALIZED_COVERAGE is a relative measure of sequence coverage by the reads at a particular GC content." + "" + "For each run, the corresponding reference sequence is divided into bins or windows based on the percentage of G + C" + " content ranging from 0 - 100%. The percentages of G + C are determined from a defined length of sequence; the default " + "value is set at 100 bases. The mean of the distribution will vary among organisms; human DNA has a mean GC content " + "of 40%, suggesting a slight preponderance of AT-rich regions. <br /><br />" + "" + "<h4>Summary metrics</h4>" + "The table of summary metrics captures run-specific bias information including WINDOW_SIZE, ALIGNED_READS, TOTAL_CLUSTERS, " + "AT_DROPOUT, and GC_DROPOUT. While WINDOW_SIZE refers to the numbers of bases used for the distribution (see above), the " + "ALIGNED_READS and TOTAL_CLUSTERS are the total number of aligned reads and the total number of reads (after filtering) " + "produced in a run. In addition, the tool produces both AT_DROPOUT and GC_DROPOUT metrics, which indicate the percentage of " + "misaligned reads that correlate with low (%-GC is < 50%) or high (%-GC is > 50%) GC content respectively. <br /><br />" + "" + "The percentage of 'coverage' or depth in a GC bin is calculated by dividing the number of reads of a particular GC content " + "by the mean number of reads of all GC bins. A number of 1 represents mean coverage, a number less than 1 represents lower " + "than mean coverage (e.g. 0.5 means half as much coverage as average) while a number greater than 1 represents higher than " + "mean coverage (e.g. 3.1 means this GC bin has 3.1 times more reads per window than average). " + "" + "This tool also tracks mean base-quality scores of the reads within each GC content bin, enabling the user to determine " + "how base quality scores vary with GC content. <br /> <br />"+ "" + "The chart output associated with this data table plots the NORMALIZED_COVERAGE, the distribution of WINDOWs corresponding " + "to GC percentages, and base qualities corresponding to each %GC bin."+ "<p>Note: Metrics labeled as percentages are actually expressed as fractions!</p>" + "<h4>Usage Example:</h4>"+ "<pre>" + "java -jar picard.jar CollectGcBiasMetrics \\<br />"+ " I=input.bam \\<br />"+ " O=gc_bias_metrics.txt \\<br />"+ " CHART=gc_bias_metrics.pdf \\<br />"+ " S=summary_metrics.txt \\<br />"+ " R=reference_sequence.fasta"+ "</pre>"+ "Please see <a href='https://broadinstitute.github.io/picard/picard-metric-definitions.html#GcBiasMetrics'>" + "the GcBiasMetrics documentation</a> for further explanations of each metric." + "<hr />"; /** The location of the R script to do the plotting. */ private static final String R_SCRIPT = "picard/analysis/gcBias.R"; // Usage and parameters @Option(shortName = "CHART", doc = "The PDF file to render the chart to.") public File CHART_OUTPUT; @Option(shortName = "S", doc = "The text file to write summary metrics to.") public File SUMMARY_OUTPUT; @Option(shortName = "WINDOW_SIZE", doc = "The size of the scanning windows on the reference genome that are used to bin reads.") public int SCAN_WINDOW_SIZE = 100; @Option(shortName = "MGF", doc = "For summary metrics, exclude GC windows that include less than this fraction of the genome.") public double MINIMUM_GENOME_FRACTION = 0.00001; @Option(shortName = "BS", doc = "Whether the SAM or BAM file consists of bisulfite sequenced reads.") public boolean IS_BISULFITE_SEQUENCED = false; @Option(shortName = "LEVEL", doc = "The level(s) at which to accumulate metrics.") public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS); @Option(shortName = "ALSO_IGNORE_DUPLICATES", doc = "Use to get additional results without duplicates. This option " + "allows to gain two plots per level at the same time: one is the usual one and the other excludes duplicates.") public boolean ALSO_IGNORE_DUPLICATES = false; // Calculates GcBiasMetrics for all METRIC_ACCUMULATION_LEVELs provided private GcBiasMetricsCollector multiCollector; // Bins for the histograms to track the number of windows at each GC, and the number of read starts // at bins of each GC %. Need 101 to get from 0-100. private static final int BINS = 101; //////////////////////////////////////////////////////////////////////////// // Stock main method //////////////////////////////////////////////////////////////////////////// public static void main(final String[] args) { System.exit(new CollectGcBiasMetrics().instanceMain(args)); } ///////////////////////////////////////////////////////////////////////////// // Setup calculates windowsByGc for the entire reference. Must be done at // startup to avoid missing reference contigs in the case of small files // that may not have reads aligning to every reference contig. ///////////////////////////////////////////////////////////////////////////// @Override protected void setup(final SAMFileHeader header, final File samFile) { IOUtil.assertFileIsWritable(CHART_OUTPUT); IOUtil.assertFileIsWritable(SUMMARY_OUTPUT); IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); //Calculate windowsByGc for the reference sequence final int[] windowsByGc = GcBiasUtils.calculateRefWindowsByGc(BINS, REFERENCE_SEQUENCE, SCAN_WINDOW_SIZE); //Delegate actual collection to GcBiasMetricCollector multiCollector = new GcBiasMetricsCollector(METRIC_ACCUMULATION_LEVEL, windowsByGc, header.getReadGroups(), SCAN_WINDOW_SIZE, IS_BISULFITE_SEQUENCED, ALSO_IGNORE_DUPLICATES); } //////////////////////////////////////////////////////////////////////////// // MultiCollector acceptRead //////////////////////////////////////////////////////////////////////////// @Override protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) { multiCollector.acceptRecord(rec, ref); } ///////////////////////////////////////////////////////////////////////////// // Write out all levels of normalized coverage metrics to a file ///////////////////////////////////////////////////////////////////////////// @Override protected void finish() { multiCollector.finish(); writeResultsToFiles(); } private void writeResultsToFiles() { final MetricsFile<GcBiasMetrics, Integer> file = getMetricsFile(); final MetricsFile<GcBiasDetailMetrics, ?> detailMetricsFile = getMetricsFile(); final MetricsFile<GcBiasSummaryMetrics, ?> summaryMetricsFile = getMetricsFile(); multiCollector.addAllLevelsToFile(file); final List<GcBiasMetrics> gcBiasMetricsList = file.getMetrics(); for (final GcBiasMetrics gcbm : gcBiasMetricsList) { final List<GcBiasDetailMetrics> gcDetailList = gcbm.DETAILS.getMetrics(); for (final GcBiasDetailMetrics d : gcDetailList) { detailMetricsFile.addMetric(d); } summaryMetricsFile.addMetric(gcbm.SUMMARY); } detailMetricsFile.write(OUTPUT); summaryMetricsFile.write(SUMMARY_OUTPUT); final NumberFormat fmt = NumberFormat.getIntegerInstance(); fmt.setGroupingUsed(true); RExecutor.executeFromClasspath(R_SCRIPT, OUTPUT.getAbsolutePath(), SUMMARY_OUTPUT.getAbsolutePath(), CHART_OUTPUT.getAbsolutePath(), String.valueOf(SCAN_WINDOW_SIZE)); } }