package picard.analysis; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import java.io.File; import java.util.Arrays; import java.util.List; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Metrics; import picard.util.RExecutor; @CommandLineProgramProperties( usage = "Program to chart the nucleotide distribution per cycle in a SAM or BAM file.", usageShort = "Program to chart the nucleotide distribution per cycle in a SAM or BAM file.", programGroup = Metrics.class ) public class CollectBaseDistributionByCycle extends SinglePassSamProgram { @Option(shortName = "CHART", doc = "A file (with .pdf extension) to write the chart to.") public File CHART_OUTPUT; @Option(doc = "If set to true, calculate the base distribution over aligned reads only.") public boolean ALIGNED_READS_ONLY = false; @Option(doc = "If set to true calculate the base distribution over PF reads only.") public boolean PF_READS_ONLY = false; private HistogramGenerator hist; private String plotSubtitle = ""; private final Log log = Log.getInstance(CollectBaseDistributionByCycle.class); public static void main(String[] args) { System.exit(new CollectBaseDistributionByCycle().instanceMain(args)); } @Override protected void setup(final SAMFileHeader header, final File samFile) { IOUtil.assertFileIsWritable(CHART_OUTPUT); final List<SAMReadGroupRecord> readGroups = header.getReadGroups(); if (readGroups.size() == 1) { plotSubtitle = StringUtil.asEmptyIfNull(readGroups.get(0).getLibrary()); } hist = new HistogramGenerator(); } @Override protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) { if ((PF_READS_ONLY) && (rec.getReadFailsVendorQualityCheckFlag())) { return; } if ((ALIGNED_READS_ONLY) && (rec.getReadUnmappedFlag())) { return; } if (rec.isSecondaryOrSupplementary()) { return; } hist.addRecord(rec); } @Override protected void finish() { final MetricsFile<BaseDistributionByCycleMetrics, ?> metrics = getMetricsFile(); hist.addToMetricsFile(metrics); metrics.write(OUTPUT); if (hist.isEmpty()) { log.warn("No valid bases found in input file. No plot will be produced."); } else { final int rResult = RExecutor.executeFromClasspath("picard/analysis/baseDistributionByCycle.R", OUTPUT.getAbsolutePath(), CHART_OUTPUT.getAbsolutePath(), INPUT.getName(), plotSubtitle); if (rResult != 0) { throw new PicardException("R script nucleotideDistributionByCycle.R failed with return code " + rResult); } } } private class HistogramGenerator { private int maxLengthSoFar = 0; final private long[][] firstReadTotalsByCycle = new long[5][maxLengthSoFar]; private long[] firstReadCountsByCycle = new long[maxLengthSoFar]; final private long[][] secondReadTotalsByCycle = new long[5][maxLengthSoFar]; private long[] secondReadCountsByCycle = new long[maxLengthSoFar]; private boolean seenSecondEnd = false; private int baseToInt(final byte base) { switch (base) { case 'A': case 'a': return 0; case 'C': case 'c': return 1; case 'G': case 'g': return 2; case 'T': case 't': return 3; } return 4; } void addRecord(final SAMRecord rec) { final byte[] bases = rec.getReadBases(); if (bases == null) { return; } final int length = bases.length; final boolean rc = rec.getReadNegativeStrandFlag(); ensureArraysBigEnough(length + 1); if ((rec.getReadPairedFlag()) && (rec.getSecondOfPairFlag())) { seenSecondEnd = true; for (int i = 0; i < length; i++) { final int cycle = rc ? length - i : i + 1; secondReadTotalsByCycle[baseToInt(bases[i])][cycle] += 1; secondReadCountsByCycle[cycle] += 1; } } else { for (int i = 0; i < length; i++) { final int cycle = rc ? length - i : i + 1; firstReadTotalsByCycle[baseToInt(bases[i])][cycle] += 1; firstReadCountsByCycle[cycle] += 1; } } } private void ensureArraysBigEnough(final int length) { if (length > maxLengthSoFar) { for (int i = 0; i < 5; i++) { firstReadTotalsByCycle[i] = Arrays.copyOf(firstReadTotalsByCycle[i], length); secondReadTotalsByCycle[i] = Arrays.copyOf(secondReadTotalsByCycle[i], length); } firstReadCountsByCycle = Arrays.copyOf(firstReadCountsByCycle, length); secondReadCountsByCycle = Arrays.copyOf(secondReadCountsByCycle, length); maxLengthSoFar = length; } } boolean isEmpty() { return maxLengthSoFar == 0; } public void addToMetricsFile(final MetricsFile<BaseDistributionByCycleMetrics, ?> metrics) { int firstReadLength = 0; for (int i = 0; i < maxLengthSoFar; i++) { if (0 != firstReadCountsByCycle[i]) { final BaseDistributionByCycleMetrics metric = new BaseDistributionByCycleMetrics(); metric.READ_END = 1; metric.CYCLE = i; metric.PCT_A = (100.0 * firstReadTotalsByCycle[0][i] / firstReadCountsByCycle[i]); metric.PCT_C = (100.0 * firstReadTotalsByCycle[1][i] / firstReadCountsByCycle[i]); metric.PCT_G = (100.0 * firstReadTotalsByCycle[2][i] / firstReadCountsByCycle[i]); metric.PCT_T = (100.0 * firstReadTotalsByCycle[3][i] / firstReadCountsByCycle[i]); metric.PCT_N = (100.0 * firstReadTotalsByCycle[4][i] / firstReadCountsByCycle[i]); metrics.addMetric(metric); firstReadLength = i; } } if (seenSecondEnd) { for (int i = 0; i < maxLengthSoFar; i++) { if (0 != secondReadCountsByCycle[i]) { final BaseDistributionByCycleMetrics metric = new BaseDistributionByCycleMetrics(); metric.READ_END = 2; metric.CYCLE = (i + firstReadLength); metric.PCT_A = (100.0 * secondReadTotalsByCycle[0][i] / secondReadCountsByCycle[i]); metric.PCT_C = (100.0 * secondReadTotalsByCycle[1][i] / secondReadCountsByCycle[i]); metric.PCT_G = (100.0 * secondReadTotalsByCycle[2][i] / secondReadCountsByCycle[i]); metric.PCT_T = (100.0 * secondReadTotalsByCycle[3][i] / secondReadCountsByCycle[i]); metric.PCT_N = (100.0 * secondReadTotalsByCycle[4][i] / secondReadCountsByCycle[i]); metrics.addMetric(metric); } } } } } }