package picard.analysis.artifacts; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.util.QualityUtil; public class SequencingArtifactMetrics { public static final String PRE_ADAPTER_SUMMARY_EXT = ".pre_adapter_summary_metrics"; public static final String PRE_ADAPTER_DETAILS_EXT = ".pre_adapter_detail_metrics"; public static final String BAIT_BIAS_SUMMARY_EXT = ".bait_bias_summary_metrics"; public static final String BAIT_BIAS_DETAILS_EXT = ".bait_bias_detail_metrics"; public static final String ERROR_SUMMARY_EXT = ".error_summary_metrics"; private static final double MIN_ERROR = 1e-10; // minimum error rate to report /** * Summary analysis of a single pre-adapter artifact. * * These artifacts occur on the original template strand, before the addition of adapters, * so they correlate with read number / orientation in a specific way. * * For example, the well-known "Oxo-G" artifact occurs when a G on the template * strand is oxidized, giving it an affinity for binding to A rather than the usual C. * Thus PCR will introduce apparent G>T substitutions in read 1 and C>A in read 2. * In the resulting alignments, a given G>T or C>A observation could either be: * * 1. a true mutation * 2. an OxoG artifact * 3. some other kind of artifact * * On average, we assume that 1 and 3 will not display this read number / orientation bias, so * their contributions will cancel out in the calculation. * */ public static class PreAdapterSummaryMetrics extends MetricBase { /** The name of the sample being assayed. */ public String SAMPLE_ALIAS; /** The name of the library being assayed. */ public String LIBRARY; /** The (upper-case) original base on the reference strand. */ public char REF_BASE; /** The (upper-case) alternative base that is called as a result of DNA damage. */ public char ALT_BASE; /** * The total Phred-scaled Q-score for this artifact. A lower Q-score * means a higher probability that a REF_BASE:ALT_BASE observation * randomly picked from the data will be due to this artifact, rather * than a true variant. */ public double TOTAL_QSCORE; /** The sequence context (reference bases surrounding the locus of interest) having the lowest Q-score among all contexts for this artifact. */ public String WORST_CXT; /** The Q-score for the worst context. */ public double WORST_CXT_QSCORE; /** The pre-context (reference bases leading up to the locus of interest) with the lowest Q-score. */ public String WORST_PRE_CXT; /** The Q-score for the worst pre-context. */ public double WORST_PRE_CXT_QSCORE; /** The post-context (reference bases trailing after the locus of interest) with the lowest Q-score. */ public String WORST_POST_CXT; /** The Q-score for the worst post-context. */ public double WORST_POST_CXT_QSCORE; /** A "nickname" of this artifact, if it is a known error mode. */ public String ARTIFACT_NAME; /** * Label the artifacts corresponding to known error modes. */ public void inferArtifactName() { if (this.REF_BASE == 'G' && this.ALT_BASE == 'T') this.ARTIFACT_NAME = "OxoG"; else if (this.REF_BASE == 'C' && this.ALT_BASE == 'T') this.ARTIFACT_NAME = "Deamination"; else this.ARTIFACT_NAME = "NA"; } } /** * Summary analysis of a single bait bias artifact, also known as a reference bias artifact. * * These artifacts occur during or after the target selection step, and correlate with substitution * rates that are "biased", or higher for sites having one base on the reference/positive strand * relative to sites having the complementary base on that strand. * * For example, a G>T artifact during the target selection step might result in a higher * G>T / C>A substitution rate at sites with a G on the positive strand (and C on the negative), * relative to sites with the flip (C positive / G negative). This is known as the "G-Ref" artifact. * */ public static class BaitBiasSummaryMetrics extends MetricBase { /** The name of the sample being assayed. */ public String SAMPLE_ALIAS; /** The name of the library being assayed. */ public String LIBRARY; /** The (upper-case) original base on the reference strand. */ public char REF_BASE; /** The (upper-case) alternative base that is called as a result of DNA damage. */ public char ALT_BASE; /** * The total Phred-scaled Q-score for this artifact. A lower Q-score * means a higher probability that a REF_BASE:ALT_BASE observation * randomly picked from the data will be due to this artifact, rather * than a true variant. */ public double TOTAL_QSCORE; /** The sequence context (reference bases surrounding the locus of interest) having the lowest Q-score among all contexts for this artifact. */ public String WORST_CXT; /** The Q-score for the worst context. */ public double WORST_CXT_QSCORE; /** The pre-context (reference bases leading up to the locus of interest) with the lowest Q-score. */ public String WORST_PRE_CXT; /** The Q-score for the worst pre-context. */ public double WORST_PRE_CXT_QSCORE; /** The post-context (reference bases trailing after the locus of interest) with the lowest Q-score. */ public String WORST_POST_CXT; /** The Q-score for the worst post-context. */ public double WORST_POST_CXT_QSCORE; /** A "nickname" of this artifact, if it is a known error mode. */ public String ARTIFACT_NAME; /** * Label the artifacts corresponding to known error modes. */ public void inferArtifactName() { if (this.REF_BASE == 'G' && this.ALT_BASE == 'T') this.ARTIFACT_NAME = "Gref"; else if (this.REF_BASE == 'C' && this.ALT_BASE == 'A') this.ARTIFACT_NAME = "Cref"; else this.ARTIFACT_NAME = "NA"; } } /** * Pre-adapter artifacts broken down by context. */ public static class PreAdapterDetailMetrics extends MetricBase { /** The name of the sample being assayed. */ public String SAMPLE_ALIAS; /** The name of the library being assayed. */ public String LIBRARY; /** The (upper-case) original base on the reference strand. */ public char REF_BASE; /** The (upper-case) alternative base that is called as a result of DNA damage. */ public char ALT_BASE; /** The sequence context to which the analysis is constrained. */ public String CONTEXT; /** The number of REF_BASE:REF_BASE alignments having a read number and orientation that supports the presence of this artifact. */ public long PRO_REF_BASES; /** The number of REF_BASE:ALT_BASE alignments having a read number and orientation that supports the presence of this artifact. */ public long PRO_ALT_BASES; /** The number of REF_BASE:REF_BASE alignments having a read number and orientation that refutes the presence of this artifact. */ public long CON_REF_BASES; /** The number of REF_BASE:ALT_BASE alignments having a read number and orientation that refutes the presence of this artifact. */ public long CON_ALT_BASES; /** * The estimated error rate due to this artifact. * Calculated as max(1e-10, (PRO_ALT_BASES - CON_ALT_BASES) / (PRO_ALT_BASES + PRO_REF_BASES + CON_ALT_BASES + CON_REF_BASES)). */ public double ERROR_RATE; /** The Phred-scaled quality score of the artifact, calculated as -10 * log10(ERROR_RATE). */ public double QSCORE; /** * Calculate the error rate given the raw counts. Negative rates are set to MIN_ERROR. */ public void calculateDerivedStatistics() { this.ERROR_RATE = MIN_ERROR; final long totalBases = this.PRO_REF_BASES + this.PRO_ALT_BASES + this.CON_REF_BASES + this.CON_ALT_BASES; if (totalBases > 0) { final double rawErrorRate = (this.PRO_ALT_BASES - this.CON_ALT_BASES) / (double) totalBases; this.ERROR_RATE = Math.max(MIN_ERROR, rawErrorRate); } this.QSCORE = QualityUtil.getPhredScoreFromErrorProbability(this.ERROR_RATE); } public int compareTo(final PreAdapterDetailMetrics o) { int retval = Double.compare(QSCORE, o.QSCORE); if (retval != 0) return retval; retval = REF_BASE - o.REF_BASE; if (retval != 0) return retval; retval = ALT_BASE - o.ALT_BASE; if (retval != 0) return retval; retval = CONTEXT.compareTo(o.CONTEXT); return retval; } } /** * Bait bias artifacts broken down by context. */ public static class BaitBiasDetailMetrics extends MetricBase { public String SAMPLE_ALIAS; /** The name of the library being assayed. */ public String LIBRARY; /** The (upper-case) original base on the reference strand. */ public char REF_BASE; /** The (upper-case) alternative base that is called as a result of DNA damage. */ public char ALT_BASE; /** The sequence context to which the analysis is constrained. */ public String CONTEXT; /** The number of REF_BASE:REF_BASE alignments at sites with the given reference context. */ public long FWD_CXT_REF_BASES; /** The number of REF_BASE:ALT_BASE alignments at sites with the given reference context. */ public long FWD_CXT_ALT_BASES; /** The number of ~REF_BASE:~REF_BASE alignments at sites complementary to the given reference context. */ public long REV_CXT_REF_BASES; /** The number of ~REF_BASE:~ALT_BASE alignments at sites complementary to the given reference context. */ public long REV_CXT_ALT_BASES; /** The substitution rate of REF_BASE:ALT_BASE, calculated as max(1e-10, FWD_CXT_ALT_BASES / (FWD_CXT_ALT_BASES + FWD_CXT_REF_BASES)). */ public double FWD_ERROR_RATE; /** The substitution rate of ~REF_BASE:~ALT_BASE, calculated as max(1e-10, REV_CXT_ALT_BASES / (REV_CXT_ALT_BASES + REV_CXT_REF_BASES)). */ public double REV_ERROR_RATE; /** * The bait bias error rate, calculated as max(1e-10, FWD_ERROR_RATE - REV_ERROR_RATE). */ public double ERROR_RATE; /** The Phred-scaled quality score of the artifact, calculated as -10 * log10(ERROR_RATE). */ public double QSCORE; /** * Calculate the error rate given the raw counts. Negative rates are set to MIN_ERROR. */ public void calculateDerivedStatistics() { this.FWD_ERROR_RATE = MIN_ERROR; final long fwdBases = this.FWD_CXT_REF_BASES + this.FWD_CXT_ALT_BASES; if (fwdBases > 0) { final double fwdErr = this.FWD_CXT_ALT_BASES / (double) fwdBases; this.FWD_ERROR_RATE = Math.max(MIN_ERROR, fwdErr); } this.REV_ERROR_RATE = MIN_ERROR; final long revBases = this.REV_CXT_REF_BASES + this.REV_CXT_ALT_BASES; if (revBases > 0) { final double revErr = this.REV_CXT_ALT_BASES / (double) revBases; this.REV_ERROR_RATE = Math.max(MIN_ERROR, revErr); } this.ERROR_RATE = Math.max(MIN_ERROR, this.FWD_ERROR_RATE - this.REV_ERROR_RATE); this.QSCORE = QualityUtil.getPhredScoreFromErrorProbability(this.ERROR_RATE); } public int compareTo(final BaitBiasDetailMetrics o) { int retval = Double.compare(QSCORE, o.QSCORE); if (retval != 0) return retval; retval = REF_BASE - o.REF_BASE; if (retval != 0) return retval; retval = ALT_BASE - o.ALT_BASE; if (retval != 0) return retval; retval = CONTEXT.compareTo(o.CONTEXT); return retval; } } /** * Little container for passing around a detail metrics pair. */ static class DetailPair { final PreAdapterDetailMetrics preAdapterMetrics; final BaitBiasDetailMetrics baitBiasMetrics; DetailPair(final PreAdapterDetailMetrics preAdapterMetrics, final BaitBiasDetailMetrics baitBiasMetrics) { this.preAdapterMetrics = preAdapterMetrics; this.baitBiasMetrics = baitBiasMetrics; } } /** * Little container for passing around a summary metrics pair. */ static class SummaryPair { final PreAdapterSummaryMetrics preAdapterMetrics; final BaitBiasSummaryMetrics baitBiasMetrics; SummaryPair(final PreAdapterSummaryMetrics preAdapterMetrics, final BaitBiasSummaryMetrics baitBiasMetrics) { this.preAdapterMetrics = preAdapterMetrics; this.baitBiasMetrics = baitBiasMetrics; } } }