package picard.analysis.artifacts; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.SequenceUtil; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.Metrics; import picard.analysis.CollectOxoGMetrics.*; import picard.analysis.artifacts.SequencingArtifactMetrics.*; import java.io.File; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; @CommandLineProgramProperties( usage = ConvertSequencingArtifactToOxoG.USAGE_SUMMARY + ConvertSequencingArtifactToOxoG.USAGE_DETAILS, usageShort = ConvertSequencingArtifactToOxoG.USAGE_SUMMARY, programGroup = Metrics.class ) public class ConvertSequencingArtifactToOxoG extends CommandLineProgram { static final String USAGE_SUMMARY = "Extract OxoG metrics from generalized artifacts metrics. "; static final String USAGE_DETAILS = "<p>This tool extracts 8-oxoguanine (OxoG) artifact metrics from the output of " + "CollectSequencingArtifactsMetrics (a tool that provides detailed information on a variety of artifacts found in sequencing " + "libraries) and converts them to the CollectOxoGMetrics tool's output format. This conveniently eliminates the need to run " + "CollectOxoGMetrics if we already ran CollectSequencingArtifactsMetrics in our pipeline. See the documentation for " + "<a href='http://broadinstitute.github.io/picard/command-line-overview.html#CollectSequencingArtifactsMetrics'>CollectSequencingArtifactsMetrics</a> "+ "and <a href='http://broadinstitute.github.io/picard/command-line-overview.html#CollectOxoGMetrics'>CollectOxoGMetrics</a> "+ "for additional information on these tools.</p>." + "<p>Note that only the base of the CollectSequencingArtifactsMetrics output file name is required for the (INPUT_BASE) "+ "parameter. For example, if the file name is artifact_metrics.txt.bait_bias_detail_metrics or "+ "artifact_metrics.txt.pre_adapter_detail_metrics, only the file name base 'artifact_metrics' is " + "required on the command line for this parameter. An output file called 'artifact_metrics.oxog_metrics' will be generated "+ "automatically. Finally, to run this tool successfully, the REFERENCE_SEQUENCE must be provided.</p>"+ "<h4>Usage example:</h4>" + "<pre>" + "java -jar picard.jar ConvertSequencingArtifactToOxoG \\<br />" + " I=artifact_metrics \\<br />" + " R=reference.fasta" + "</pre>" + "Please see the metrics definitions page at " + "<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#CollectOxoGMetrics.CpcgMetrics'>ConvertSequencingArtifactToOxoG</a> "+ "for detailed descriptions of the output metrics produced by this tool."+ "<hr />" ; @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Basename of the input artifact metrics file (output by CollectSequencingArtifactMetrics)") public File INPUT_BASE; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Basename for output OxoG metrics. Defaults to same basename as input metrics", optional = true) public File OUTPUT_BASE; // Stock main method public static void main(final String[] args) { new ConvertSequencingArtifactToOxoG().instanceMainWithExit(args); } @Override protected int doWork() { if (OUTPUT_BASE == null) { OUTPUT_BASE = INPUT_BASE; } final File PRE_ADAPTER_IN = new File(INPUT_BASE + SequencingArtifactMetrics.PRE_ADAPTER_DETAILS_EXT); final File BAIT_BIAS_IN = new File(INPUT_BASE + SequencingArtifactMetrics.BAIT_BIAS_DETAILS_EXT); final File OXOG_OUT = new File(OUTPUT_BASE + ".oxog_metrics"); IOUtil.assertFileIsReadable(PRE_ADAPTER_IN); IOUtil.assertFileIsReadable(BAIT_BIAS_IN); IOUtil.assertFileIsWritable(OXOG_OUT); final List<PreAdapterDetailMetrics> preAdapterDetailMetricsList = MetricsFile.readBeans(PRE_ADAPTER_IN); final List<BaitBiasDetailMetrics> baitBiasDetailMetricsList = MetricsFile.readBeans(BAIT_BIAS_IN); // TODO should we validate that the two inputs match up as expected? /** * Determine output fields. Just copy these from the input for now. */ final String oxogSampleAlias = preAdapterDetailMetricsList.get(0).SAMPLE_ALIAS; final Set<String> oxogLibraries = new HashSet<String>(); final Set<String> oxogContexts = new HashSet<String>(); for (final PreAdapterDetailMetrics preAdapter : preAdapterDetailMetricsList) { oxogLibraries.add(preAdapter.LIBRARY); // Remember that OxoG only reports on the 'C' contexts if (preAdapter.REF_BASE == 'C') { oxogContexts.add(preAdapter.CONTEXT); } } /** * Store the inputs in maps of {Library -> {Context, Metric}} for easy access. * Remember, we only care about two transitions - C>A and G>T! Thus, for each context we * will only store one metric. */ final Map<String, Map<String, PreAdapterDetailMetrics>> preAdapterDetailMetricsMap = new HashMap<String, Map<String, PreAdapterDetailMetrics>>(); final Map<String, Map<String, BaitBiasDetailMetrics>> baitBiasDetailMetricsMap = new HashMap<String, Map<String, BaitBiasDetailMetrics>>(); for (final String library : oxogLibraries) { final Map<String, PreAdapterDetailMetrics> contextsToPreAdapter = new HashMap<String, PreAdapterDetailMetrics>(); final Map<String, BaitBiasDetailMetrics> contextsToBaitBias = new HashMap<String, BaitBiasDetailMetrics>(); preAdapterDetailMetricsMap.put(library, contextsToPreAdapter); baitBiasDetailMetricsMap.put(library, contextsToBaitBias); } for (final PreAdapterDetailMetrics preAdapter : preAdapterDetailMetricsList) { final Transition transition = Transition.transitionOf(preAdapter.REF_BASE, preAdapter.ALT_BASE); if (isOxoG(transition)) { preAdapterDetailMetricsMap.get(preAdapter.LIBRARY).put(preAdapter.CONTEXT, preAdapter); } } for (final BaitBiasDetailMetrics baitBias : baitBiasDetailMetricsList) { final Transition transition = Transition.transitionOf(baitBias.REF_BASE, baitBias.ALT_BASE); if (isOxoG(transition)) { baitBiasDetailMetricsMap.get(baitBias.LIBRARY).put(baitBias.CONTEXT, baitBias); } } /** * Create the OxoG metrics */ final List<CpcgMetrics> oxogMetrics = new ArrayList<CpcgMetrics>(); for (final String library : oxogLibraries) { for (final String context : oxogContexts) { final CpcgMetrics m = new CpcgMetrics(); m.SAMPLE_ALIAS = oxogSampleAlias; m.LIBRARY = library; m.CONTEXT = context; m.TOTAL_SITES = 0; // not calculated in the input metrics /** * Get the relevant input metrics. This is done in a somewhat confusing way: * * 1. For pre-adapter metrics: note that OxoG only reports 'C' contexts, even though the actual pre-adapter OxoG artifact * occurs when the reference-strand base is 'G'. This is because OxoG reverse-complements all the contexts for some reason. * Thus when we add an entry for 'ACA' in the output, we actually need to get that data from 'TGT' in the input. * * 2. For bait-bias metrics: for each context, we report two opposing error rates, C_REF and G_REF, because for this metric * the bias could really go in either direction (whereas for pre-adapter artifacts we only expect one direction: G>T, but * never C>A, on the original reference strand). C_REF corresponds to the actual context printed in that row, and G_REF * corresponds to its reverse complement. So for 'ACA' in the output, we need to take data from both 'ACA' and 'TGT' in the * input. */ final PreAdapterDetailMetrics preAdapter = preAdapterDetailMetricsMap.get(library).get(SequenceUtil.reverseComplement(context)); final BaitBiasDetailMetrics baitBiasFwd = baitBiasDetailMetricsMap.get(library).get(context); final BaitBiasDetailMetrics baitBiasRev = baitBiasDetailMetricsMap.get(library).get(SequenceUtil.reverseComplement(context)); // extract fields from PreAdapterDetailMetrics m.TOTAL_BASES = preAdapter.PRO_REF_BASES + preAdapter.PRO_ALT_BASES + preAdapter.CON_REF_BASES + preAdapter.CON_ALT_BASES; m.REF_TOTAL_BASES = preAdapter.PRO_REF_BASES + preAdapter.CON_REF_BASES; m.REF_NONOXO_BASES = preAdapter.CON_REF_BASES; m.REF_OXO_BASES = preAdapter.PRO_REF_BASES; m.ALT_NONOXO_BASES = preAdapter.CON_ALT_BASES; m.ALT_OXO_BASES = preAdapter.PRO_ALT_BASES; m.OXIDATION_ERROR_RATE = preAdapter.ERROR_RATE; m.OXIDATION_Q = preAdapter.QSCORE; // extract fields from BaitBiasDetailMetrics m.C_REF_REF_BASES = baitBiasFwd.FWD_CXT_REF_BASES; m.G_REF_REF_BASES = baitBiasFwd.REV_CXT_REF_BASES; m.C_REF_ALT_BASES = baitBiasFwd.FWD_CXT_ALT_BASES; m.G_REF_ALT_BASES = baitBiasFwd.REV_CXT_ALT_BASES; m.C_REF_OXO_ERROR_RATE = baitBiasFwd.ERROR_RATE; m.C_REF_OXO_Q = baitBiasFwd.QSCORE; m.G_REF_OXO_ERROR_RATE = baitBiasRev.ERROR_RATE; m.G_REF_OXO_Q = baitBiasRev.QSCORE; // add it oxogMetrics.add(m); } } final MetricsFile<CpcgMetrics, Integer> outputFile = getMetricsFile(); for (final CpcgMetrics m : oxogMetrics) { outputFile.addMetric(m); } outputFile.write(OXOG_OUT); return 0; } private boolean isOxoG(final Transition t) { return t.equals(Transition.CtoA) || t.equals(Transition.GtoT); } }