/* * The MIT License * * Copyright (c) 2015 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.analysis; import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; import htsjdk.samtools.filter.DuplicateReadFilter; import htsjdk.samtools.filter.InsertSizeFilter; import htsjdk.samtools.filter.NotPrimaryAlignmentFilter; import htsjdk.samtools.filter.SamRecordFilter; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequenceFileWalker; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.ListMap; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.SamLocusIterator; import htsjdk.samtools.util.SequenceUtil; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.Metrics; import picard.util.DbSnpBitSetUtil; import java.io.File; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import static htsjdk.samtools.util.CodeUtil.getOrElse; import static htsjdk.samtools.util.SequenceUtil.generateAllKmers; import static java.lang.Math.log10; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; /** * Class for trying to quantify the CpCG->CpCA error rate. */ @CommandLineProgramProperties( usage = CollectOxoGMetrics.USAGE_SUMMARY + CollectOxoGMetrics.USAGE_DETAILS, usageShort = CollectOxoGMetrics.USAGE_SUMMARY, programGroup = Metrics.class ) public class CollectOxoGMetrics extends CommandLineProgram { static final String USAGE_SUMMARY = "Collect metrics to assess oxidative artifacts."; static final String USAGE_DETAILS = "This tool collects metrics quantifying the error rate resulting from oxidative artifacts. " + "For a brief primer on oxidative artifacts, see " + "<a href='http://gatkforums.broadinstitute.org/discussion/6328/oxog-oxidative-artifacts'>" + "the GATK Dictionary</a>." + "<br /><br />" + "This tool calculates the Phred-scaled probability that an alternate base call results from an oxidation artifact. This " + "probability score is based on base context, sequencing read orientation, and the characteristic low allelic frequency. " + "Please see the following reference for an in-depth " + "<a href='http://nar.oxfordjournals.org/content/early/2013/01/08/nar.gks1443'>discussion</a>" + " of the OxoG error rate. " + "<p>Lower probability values implicate artifacts resulting from 8-oxoguanine, while higher " + "probability values suggest that an alternate base call is due to either some other type of artifact or is a " + "real variant.</p>" + "<h4>Usage example:</h4>" + "<pre>" + "java -jar picard.jar CollectOxoGMetrics \\<br />" + " I=input.bam \\<br />" + " O=oxoG_metrics.txt \\<br />" + " R=reference_sequence.fasta" + "</pre>" + "" + "<hr />"; @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input BAM file for analysis.") public File INPUT; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Location of output metrics file to write.") public File OUTPUT; @Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "Reference sequence to which BAM is aligned.") public File REFERENCE_SEQUENCE; @Option(doc = "An optional list of intervals to restrict analysis to.", optional = true) public File INTERVALS; @Option(doc = "VCF format dbSNP file, used to exclude regions around known polymorphisms from analysis.", optional = true) public File DB_SNP; @Option(shortName = "Q", doc = "The minimum base quality score for a base to be included in analysis.") public int MINIMUM_QUALITY_SCORE = 20; @Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "The minimum mapping quality score for a base to be included in analysis.") public int MINIMUM_MAPPING_QUALITY = 30; @Option(shortName = "MIN_INS", doc = "The minimum insert size for a read to be included in analysis. Set of 0 to allow unpaired reads.") public int MINIMUM_INSERT_SIZE = 60; @Option(shortName = "MAX_INS", doc = "The maximum insert size for a read to be included in analysis. Set of 0 to allow unpaired reads.") public int MAXIMUM_INSERT_SIZE = 600; @Option(shortName = "NON_PF", doc = "Whether or not to include non-PF reads.") public boolean INCLUDE_NON_PF_READS = true; @Option(doc = "When available, use original quality scores for filtering.") public boolean USE_OQ = true; @Option(doc = "The number of context bases to include on each side of the assayed G/C base.") public int CONTEXT_SIZE = 1; @Option(doc = "The optional set of sequence contexts to restrict analysis to. If not supplied all contexts are analyzed.") public Set<String> CONTEXTS = new HashSet<String>(); @Option(doc = "For debugging purposes: stop after visiting this many sites with at least 1X coverage.") public int STOP_AFTER = Integer.MAX_VALUE; private final Log log = Log.getInstance(CollectOxoGMetrics.class); private static final String UNKNOWN_LIBRARY = "UnknownLibrary"; private static final String UNKNOWN_SAMPLE = "UnknownSample"; /** Metrics class for outputs. */ public static final class CpcgMetrics extends MetricBase { /** The name of the sample being assayed. */ public String SAMPLE_ALIAS; /** The name of the library being assayed. */ public String LIBRARY; /** The sequence context being reported on. */ public String CONTEXT; /** The total number of sites that had at least one base covering them. */ public int TOTAL_SITES; /** The total number of basecalls observed at all sites. */ public long TOTAL_BASES; /** The number of reference alleles observed as C in read 1 and G in read 2. */ public long REF_NONOXO_BASES; /** The number of reference alleles observed as G in read 1 and C in read 2. */ public long REF_OXO_BASES; /** The total number of reference alleles observed */ public long REF_TOTAL_BASES; /** * The count of observed A basecalls at C reference positions and T basecalls * at G reference bases that are correlated to instrument read number in a way * that rules out oxidation as the cause */ public long ALT_NONOXO_BASES; /** * The count of observed A basecalls at C reference positions and T basecalls * at G reference bases that are correlated to instrument read number in a way * that is consistent with oxidative damage. */ public long ALT_OXO_BASES; /** The oxo error rate, calculated as max(ALT_OXO_BASES - ALT_NONOXO_BASES, 1) / TOTAL_BASES */ public double OXIDATION_ERROR_RATE; /** -10 * log10(OXIDATION_ERROR_RATE) */ public double OXIDATION_Q; // Fields below this point are metrics that are calculated to see if there is oxidative damage that is // biased toward the reference base - i.e. occurs more where the reference base is a C vs. a G or vice // versa. /** The number of ref basecalls observed at sites where the genome reference == C. */ public long C_REF_REF_BASES; /** The number of ref basecalls observed at sites where the genome reference == G. */ public long G_REF_REF_BASES; /** The number of alt (A/T) basecalls observed at sites where the genome reference == C. */ public long C_REF_ALT_BASES; /** The number of alt (A/T) basecalls observed at sites where the genome reference == G. */ public long G_REF_ALT_BASES; /** * The rate at which C>A and G>T substitutions are observed at C reference sites above the expected rate if there * were no bias between sites with a C reference base vs. a G reference base. */ public double C_REF_OXO_ERROR_RATE; /** C_REF_OXO_ERROR_RATE expressed as a phred-scaled quality score. */ public double C_REF_OXO_Q; /** * The rate at which C>A and G>T substitutions are observed at G reference sites above the expected rate if there * were no bias between sites with a C reference base vs. a G reference base. */ public double G_REF_OXO_ERROR_RATE; /** G_REF_OXO_ERROR_RATE expressed as a phred-scaled quality score. */ public double G_REF_OXO_Q; } // Stock main method public static void main(final String[] args) { new CollectOxoGMetrics().instanceMainWithExit(args); } @Override protected String[] customCommandLineValidation() { final int size = 1 + 2 * CONTEXT_SIZE; final List<String> messages = new ArrayList<String>(); for (final String ctx : CONTEXTS) { if (ctx.length() != size) { messages.add("Context " + ctx + " is not " + size + " long as implied by CONTEXT_SIZE=" + CONTEXT_SIZE); } else if (ctx.charAt(ctx.length() / 2) != 'C') { messages.add("Middle base of context sequence " + ctx + " must be C"); } } if (MINIMUM_INSERT_SIZE < 0) messages.add("MINIMUM_INSERT_SIZE cannot be negative"); if (MAXIMUM_INSERT_SIZE < 0) messages.add("MAXIMUM_INSERT_SIZE cannot be negative"); if (MAXIMUM_INSERT_SIZE < MINIMUM_INSERT_SIZE) { messages.add("MAXIMUM_INSERT_SIZE cannot be less than MINIMUM_INSERT_SIZE"); } return messages.isEmpty() ? null : messages.toArray(new String[messages.size()]); } @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsWritable(OUTPUT); if (INTERVALS != null) IOUtil.assertFileIsReadable(INTERVALS); IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); final SamReader in = SamReaderFactory.makeDefault().open(INPUT); final Set<String> samples = new HashSet<String>(); final Set<String> libraries = new HashSet<String>(); if (in.getFileHeader().getReadGroups().isEmpty()) { throw new PicardException("This analysis requires a read group entry in the alignment file header"); } for (final SAMReadGroupRecord rec : in.getFileHeader().getReadGroups()) { samples.add(getOrElse(rec.getSample(), UNKNOWN_SAMPLE)); libraries.add(getOrElse(rec.getLibrary(), UNKNOWN_LIBRARY)); } // Setup the calculators final Set<String> contexts = CONTEXTS.isEmpty() ? makeContextStrings(CONTEXT_SIZE) : CONTEXTS; final ListMap<String, Calculator> calculators = new ListMap<String, Calculator>(); for (final String context : contexts) { for (final String library : libraries) { calculators.add(context, new Calculator(library, context)); } } // Load up dbSNP if available log.info("Loading dbSNP File: " + DB_SNP); final DbSnpBitSetUtil dbSnp; if (DB_SNP != null) dbSnp = new DbSnpBitSetUtil(DB_SNP, in.getFileHeader().getSequenceDictionary()); else dbSnp = null; // Make an iterator that will filter out funny looking things final SamLocusIterator iterator; if (INTERVALS != null) { final IntervalList intervals = IntervalList.fromFile(INTERVALS); iterator = new SamLocusIterator(in, intervals.uniqued(), false); } else { iterator = new SamLocusIterator(in); } iterator.setEmitUncoveredLoci(false); iterator.setMappingQualityScoreCutoff(MINIMUM_MAPPING_QUALITY); iterator.setIncludeNonPfReads(INCLUDE_NON_PF_READS); final List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>(); filters.add(new NotPrimaryAlignmentFilter()); filters.add(new DuplicateReadFilter()); if (MINIMUM_INSERT_SIZE > 0 || MAXIMUM_INSERT_SIZE > 0) { filters.add(new InsertSizeFilter(MINIMUM_INSERT_SIZE, MAXIMUM_INSERT_SIZE)); } iterator.setSamFilters(filters); log.info("Starting iteration."); long nextLogTime = 0; int sites = 0; for (final SamLocusIterator.LocusInfo info : iterator) { // Skip dbSNP sites final String chrom = info.getSequenceName(); final int pos = info.getPosition(); final int index = pos - 1; if (dbSnp != null && dbSnp.isDbSnpSite(chrom, pos)) continue; // Skip sites at the end of chromosomes final byte[] bases = refWalker.get(info.getSequenceIndex()).getBases(); if (pos < 3 || pos > bases.length - 3) continue; // Skip non C-G bases final byte base = StringUtil.toUpperCase(bases[index]); if (base != 'C' && base != 'G') continue; // Get the context string final String context; { final String tmp = StringUtil.bytesToString(bases, index - CONTEXT_SIZE, 1 + (2 * CONTEXT_SIZE)).toUpperCase(); if (base == 'C') context = tmp; else /* if G */ context = SequenceUtil.reverseComplement(tmp); } final List<Calculator> calculatorsForContext = calculators.get(context); if (calculatorsForContext == null) continue; // happens if we get ambiguous bases in the reference for (final Calculator calc : calculatorsForContext) calc.accept(info, base); // See if we need to stop if (++sites % 100 == 0) { final long now = System.currentTimeMillis(); if (now > nextLogTime) { log.info("Visited " + sites + " sites of interest. Last site: " + chrom + ":" + pos); nextLogTime = now + 60000; } } if (sites >= STOP_AFTER) break; } final MetricsFile<CpcgMetrics, Integer> file = getMetricsFile(); for (final List<Calculator> calcs : calculators.values()) { for (final Calculator calc : calcs) { final CpcgMetrics m = calc.finish(); m.SAMPLE_ALIAS = StringUtil.join(",", new ArrayList<String>(samples)); file.addMetric(m); } } file.write(OUTPUT); CloserUtil.close(in); return 0; } private Set<String> makeContextStrings(final int contextSize) { final Set<String> contexts = new HashSet<String>(); for (final byte[] kmer : generateAllKmers(2 * contextSize + 1)) { if (kmer[contextSize] == 'C') { contexts.add(StringUtil.bytesToString(kmer)); } } log.info("Generated " + contexts.size() + " context strings."); return contexts; } /** A little class for counting alleles. */ private static class Counts { int controlA; int oxidatedA; int controlC; int oxidatedC; int total() { return controlC + oxidatedC + controlA + oxidatedA; } } /** * Class that calculated CpCG metrics for a specific library. */ private class Calculator { private final String library; private final String context; // Things to be accumulated int sites = 0; long refCcontrolA = 0; long refCoxidatedA = 0; long refCcontrolC = 0; long refCoxidatedC = 0; long refGcontrolA = 0; long refGoxidatedA = 0; long refGcontrolC = 0; long refGoxidatedC = 0; Calculator(final String library, final String context) { this.library = library; this.context = context; } void accept(final SamLocusIterator.LocusInfo info, final byte refBase) { final Counts counts = computeAlleleFraction(info, refBase); if (counts.total() > 0) { // Things calculated on all sites with coverage this.sites++; if (refBase == 'C') { this.refCcontrolA += counts.controlA; this.refCoxidatedA += counts.oxidatedA; this.refCcontrolC += counts.controlC; this.refCoxidatedC += counts.oxidatedC; } else if (refBase == 'G') { this.refGcontrolA += counts.controlA; this.refGoxidatedA += counts.oxidatedA; this.refGcontrolC += counts.controlC; this.refGoxidatedC += counts.oxidatedC; } else { throw new IllegalStateException("Reference bases other than G and C not supported."); } } } CpcgMetrics finish() { final CpcgMetrics m = new CpcgMetrics(); m.LIBRARY = this.library; m.CONTEXT = this.context; m.TOTAL_SITES = this.sites; m.TOTAL_BASES = this.refCcontrolC + this.refCoxidatedC + this.refCcontrolA + this.refCoxidatedA + this.refGcontrolC + this.refGoxidatedC + this.refGcontrolA + this.refGoxidatedA; m.REF_OXO_BASES = this.refCoxidatedC + refGoxidatedC; m.REF_NONOXO_BASES = this.refCcontrolC + this.refGcontrolC; m.REF_TOTAL_BASES = m.REF_OXO_BASES + m.REF_NONOXO_BASES; m.ALT_NONOXO_BASES = this.refCcontrolA + this.refGcontrolA; m.ALT_OXO_BASES = this.refCoxidatedA + this.refGoxidatedA; /** * Why do we calculate the oxo error rate using oxidatedA - controlA you ask? We know that all the * bases counted in oxidatedA are consistent with 8-oxo-G damage during shearing, but not all of them * will have been caused by this. If we assume that C>A errors caused by other factors will occur randomly * with respect to read1/read2, then we should see as many in the 8-oxo-G consistent state as not. So we * assume that controlA is half the story, and remove the other half from oxidatedA. */ m.OXIDATION_ERROR_RATE = Math.max(m.ALT_OXO_BASES - m.ALT_NONOXO_BASES, 1) / (double) m.TOTAL_BASES; m.OXIDATION_Q = -10 * log10(m.OXIDATION_ERROR_RATE); /** Now look for things that have a reference base bias! */ m.C_REF_REF_BASES = this.refCcontrolC + this.refCoxidatedC; m.G_REF_REF_BASES = this.refGcontrolC + this.refGoxidatedC; m.C_REF_ALT_BASES = this.refCcontrolA + this.refCoxidatedA; m.G_REF_ALT_BASES = this.refGcontrolA + this.refGoxidatedA; final double cRefErrorRate = m.C_REF_ALT_BASES / (double) (m.C_REF_ALT_BASES + m.C_REF_REF_BASES); final double gRefErrorRate = m.G_REF_ALT_BASES / (double) (m.G_REF_ALT_BASES + m.G_REF_REF_BASES); m.C_REF_OXO_ERROR_RATE = Math.max(cRefErrorRate - gRefErrorRate, 1e-10); m.G_REF_OXO_ERROR_RATE = Math.max(gRefErrorRate - cRefErrorRate, 1e-10); m.C_REF_OXO_Q = -10 * log10(m.C_REF_OXO_ERROR_RATE); m.G_REF_OXO_Q = -10 * log10(m.G_REF_OXO_ERROR_RATE); return m; } /** * */ private Counts computeAlleleFraction(final SamLocusIterator.LocusInfo info, final byte refBase) { final Counts counts = new Counts(); final byte altBase = (refBase == 'C') ? (byte) 'A' : (byte) 'T'; for (final SamLocusIterator.RecordAndOffset rec : info.getRecordAndOffsets()) { final byte qual; final SAMRecord samrec = rec.getRecord(); if (USE_OQ) { final byte[] oqs = samrec.getOriginalBaseQualities(); if (oqs != null) qual = oqs[rec.getOffset()]; else qual = rec.getBaseQuality(); } else { qual = rec.getBaseQuality(); } // Skip if below qual, or if library isn't a match if (qual < MINIMUM_QUALITY_SCORE) continue; if (!this.library.equals(getOrElse(samrec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY))) continue; // Get the read base, and get it in "as read" orientation final byte base = rec.getReadBase(); final byte baseAsRead = samrec.getReadNegativeStrandFlag() ? SequenceUtil.complement(base) : base; final int read = samrec.getReadPairedFlag() && samrec.getSecondOfPairFlag() ? 2 : 1; // Figure out how to count the alternative allele. If the damage is caused by oxidation of G // during shearing (in non-rnaseq data), then we know that: // G>T observation is always in read 1 // C>A observation is always in read 2 // But if the substitution is from other causes the distribution of A/T across R1/R2 will be // random. if (base == refBase) { if (baseAsRead == 'G' && read == 1) ++counts.oxidatedC; else if (baseAsRead == 'G' && read == 2) ++counts.controlC; else if (baseAsRead == 'C' && read == 1) ++counts.controlC; else if (baseAsRead == 'C' && read == 2) ++counts.oxidatedC; } else if (base == altBase) { if (baseAsRead == 'T' && read == 1) ++counts.oxidatedA; else if (baseAsRead == 'T' && read == 2) ++counts.controlA; else if (baseAsRead == 'A' && read == 1) ++counts.controlA; else if (baseAsRead == 'A' && read == 2) ++counts.oxidatedA; } } return counts; } } }