/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.analysis; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import java.io.File; import java.util.Arrays; import java.util.List; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Metrics; import picard.util.RExecutor; @CommandLineProgramProperties( usage = CollectBaseDistributionByCycle.USAGE_SUMMARY + CollectBaseDistributionByCycle.USAGE_DETAILS, usageShort = CollectBaseDistributionByCycle.USAGE_SUMMARY, programGroup = Metrics.class ) public class CollectBaseDistributionByCycle extends SinglePassSamProgram { static final String USAGE_SUMMARY = "Chart the nucleotide distribution per cycle in a SAM or BAM file"; static final String USAGE_DETAILS = "This tool produces a chart of the nucleotide distribution per cycle in a SAM or BAM file " + "in order to enable assessment of systematic errors at specific positions in the reads.<br /><br />" + "" + "<h4>Interpretation notes</h4>" + "Increased numbers of miscalled bases will be reflected in base distribution changes and increases in the number of Ns. "+ "In general, we expect that for any given cycle, or position within reads, the relative proportions of A, T, C and G "+ "should reflect the AT:GC content of the organism's genome. Thus, for all four nucleotides, flattish lines would be "+ "expected. Deviations from this expectation, for example a spike of A at a particular cycle (position within reads), "+ "would suggest a systematic sequencing error."+ "" + "<h4>Note on quality trimming</h4>" + "In the past, many sequencing data processing workflows included discarding the low-quality tails of reads by applying "+ "hard-clipping at some arbitrary base quality threshold value. This is no longer useful because most sophisticated "+ "analysis tools (such as the GATK variant discovery tools) are quality-aware, meaning that they are able to take base "+ "quality into account when weighing evidence provided by sequencing reads. Unnecessary clipping may interfere with other "+ "quality control evaluations and may lower the quality of analysis results. For example, trimming reduces the effectiveness "+ "of the Base Recalibration (BQSR) pre-processing step of the "+ "<a href='https://www.broadinstitute.org/gatk/guide/best-practices'>GATK Best Practices for Variant Discovery</a>, "+ "which aims to correct some types of systematic biases that affect the accuracy of base quality scores."+ "<p>Note: Metrics labeled as percentages are actually expressed as fractions!</p>"+ "<h4>Usage example:</h4>" + "<pre>" + "java -jar picard.jar CollectBaseDistributionByCycle \\<br />" + " CHART=collect_base_dist_by_cycle.pdf \\<br />" + " I=input.bam \\<br />" + " O=output.txt" + "</pre>" + "<hr />" ; @Option(shortName = "CHART", doc = "A file (with .pdf extension) to write the chart to.") public File CHART_OUTPUT; @Option(doc = "If set to true, calculate the base distribution over aligned reads only.") public boolean ALIGNED_READS_ONLY = false; @Option(doc = "If set to true, calculate the base distribution over PF reads only (Illumina specific). PF reads are reads that passed the internal quality filters applied by Illumina sequencers.") public boolean PF_READS_ONLY = false; private HistogramGenerator hist; private String plotSubtitle = ""; private final Log log = Log.getInstance(CollectBaseDistributionByCycle.class); public static void main(String[] args) { System.exit(new CollectBaseDistributionByCycle().instanceMain(args)); } @Override protected void setup(final SAMFileHeader header, final File samFile) { IOUtil.assertFileIsWritable(CHART_OUTPUT); final List<SAMReadGroupRecord> readGroups = header.getReadGroups(); if (readGroups.size() == 1) { plotSubtitle = StringUtil.asEmptyIfNull(readGroups.get(0).getLibrary()); } hist = new HistogramGenerator(); } @Override protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) { if ((PF_READS_ONLY) && (rec.getReadFailsVendorQualityCheckFlag())) { return; } if ((ALIGNED_READS_ONLY) && (rec.getReadUnmappedFlag())) { return; } if (rec.isSecondaryOrSupplementary()) { return; } hist.addRecord(rec); } @Override protected void finish() { final MetricsFile<BaseDistributionByCycleMetrics, ?> metrics = getMetricsFile(); hist.addToMetricsFile(metrics); metrics.write(OUTPUT); if (hist.isEmpty()) { log.warn("No valid bases found in input file. No plot will be produced."); } else { final int rResult = RExecutor.executeFromClasspath("picard/analysis/baseDistributionByCycle.R", OUTPUT.getAbsolutePath(), CHART_OUTPUT.getAbsolutePath(), INPUT.getName(), plotSubtitle); if (rResult != 0) { throw new PicardException("R script nucleotideDistributionByCycle.R failed with return code " + rResult); } } } private class HistogramGenerator { private int maxLengthSoFar = 0; final private long[][] firstReadTotalsByCycle = new long[5][maxLengthSoFar]; private long[] firstReadCountsByCycle = new long[maxLengthSoFar]; final private long[][] secondReadTotalsByCycle = new long[5][maxLengthSoFar]; private long[] secondReadCountsByCycle = new long[maxLengthSoFar]; private boolean seenSecondEnd = false; private int baseToInt(final byte base) { switch (base) { case 'A': case 'a': return 0; case 'C': case 'c': return 1; case 'G': case 'g': return 2; case 'T': case 't': return 3; } return 4; } void addRecord(final SAMRecord rec) { final byte[] bases = rec.getReadBases(); if (bases == null) { return; } final int length = bases.length; final boolean rc = rec.getReadNegativeStrandFlag(); ensureArraysBigEnough(length + 1); if ((rec.getReadPairedFlag()) && (rec.getSecondOfPairFlag())) { seenSecondEnd = true; for (int i = 0; i < length; i++) { final int cycle = rc ? length - i : i + 1; secondReadTotalsByCycle[baseToInt(bases[i])][cycle] += 1; secondReadCountsByCycle[cycle] += 1; } } else { for (int i = 0; i < length; i++) { final int cycle = rc ? length - i : i + 1; firstReadTotalsByCycle[baseToInt(bases[i])][cycle] += 1; firstReadCountsByCycle[cycle] += 1; } } } private void ensureArraysBigEnough(final int length) { if (length > maxLengthSoFar) { for (int i = 0; i < 5; i++) { firstReadTotalsByCycle[i] = Arrays.copyOf(firstReadTotalsByCycle[i], length); secondReadTotalsByCycle[i] = Arrays.copyOf(secondReadTotalsByCycle[i], length); } firstReadCountsByCycle = Arrays.copyOf(firstReadCountsByCycle, length); secondReadCountsByCycle = Arrays.copyOf(secondReadCountsByCycle, length); maxLengthSoFar = length; } } boolean isEmpty() { return maxLengthSoFar == 0; } public void addToMetricsFile(final MetricsFile<BaseDistributionByCycleMetrics, ?> metrics) { int firstReadLength = 0; for (int i = 0; i < maxLengthSoFar; i++) { if (0 != firstReadCountsByCycle[i]) { final BaseDistributionByCycleMetrics metric = new BaseDistributionByCycleMetrics(); metric.READ_END = 1; metric.CYCLE = i; metric.PCT_A = (100.0 * firstReadTotalsByCycle[0][i] / firstReadCountsByCycle[i]); metric.PCT_C = (100.0 * firstReadTotalsByCycle[1][i] / firstReadCountsByCycle[i]); metric.PCT_G = (100.0 * firstReadTotalsByCycle[2][i] / firstReadCountsByCycle[i]); metric.PCT_T = (100.0 * firstReadTotalsByCycle[3][i] / firstReadCountsByCycle[i]); metric.PCT_N = (100.0 * firstReadTotalsByCycle[4][i] / firstReadCountsByCycle[i]); metrics.addMetric(metric); firstReadLength = i; } } if (seenSecondEnd) { for (int i = 0; i < maxLengthSoFar; i++) { if (0 != secondReadCountsByCycle[i]) { final BaseDistributionByCycleMetrics metric = new BaseDistributionByCycleMetrics(); metric.READ_END = 2; metric.CYCLE = (i + firstReadLength); metric.PCT_A = (100.0 * secondReadTotalsByCycle[0][i] / secondReadCountsByCycle[i]); metric.PCT_C = (100.0 * secondReadTotalsByCycle[1][i] / secondReadCountsByCycle[i]); metric.PCT_G = (100.0 * secondReadTotalsByCycle[2][i] / secondReadCountsByCycle[i]); metric.PCT_T = (100.0 * secondReadTotalsByCycle[3][i] / secondReadCountsByCycle[i]); metric.PCT_N = (100.0 * secondReadTotalsByCycle[4][i] / secondReadCountsByCycle[i]); metrics.addMetric(metric); } } } } } }