/* * The MIT License * * Copyright (c) 2013 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.analysis; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.programgroups.Metrics; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFileWalker; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import picard.util.RExecutor; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CollectionUtil; import java.io.File; import java.lang.Comparable;import java.lang.Override;import java.lang.String;import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; /** * Calculates and reports QC metrics for RRBS data based on the methylation status at individual C/G bases as well * as CpG sites across all reads in the input BAM/SAM file. * * @author jgentry@broadinstitute.org */ @CommandLineProgramProperties( usage = CollectRrbsMetrics.USAGE, usageShort = CollectRrbsMetrics.USAGE, programGroup = Metrics.class ) public class CollectRrbsMetrics extends CommandLineProgram { final static String USAGE = "Collects metrics about bisulfite conversion for RRBS data"; // Path to R file for plotting purposes private static final String R_SCRIPT = "picard/analysis/rrbsQc.R"; @Option(doc="The BAM or SAM file containing aligned reads. Must be coordinate sorted", shortName= StandardOptionDefinitions.INPUT_SHORT_NAME) public File INPUT; @Option(doc="Base name for output files", shortName=StandardOptionDefinitions.METRICS_FILE_SHORT_NAME) public String METRICS_FILE_PREFIX; @Option(doc="The reference sequence fasta file", shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME) public File REFERENCE; @Option(doc="Minimum read length") public int MINIMUM_READ_LENGTH = 5; @Option(doc="Threshold for base quality of a C base before it is considered") public int C_QUALITY_THRESHOLD = 20; @Option(doc="Threshold for quality of a base next to a C before the C base is considered") public int NEXT_BASE_QUALITY_THRESHOLD = 10; @Option(doc="Maximum percentage of mismatches in a read for it to be considered, with a range of 0-1") public double MAX_MISMATCH_RATE = 0.1; @Option(doc="Set of sequence names to consider, if not specified all sequences will be used", optional = true) public Set<String> SEQUENCE_NAMES = new HashSet<String>(); @Option(shortName=StandardOptionDefinitions.ASSUME_SORTED_SHORT_NAME, doc="If true, assume that the input file is coordinate sorted even if the header says otherwise.") public boolean ASSUME_SORTED = false; @Option(shortName="LEVEL", doc="The level(s) at which to accumulate metrics. ") public Set<MetricAccumulationLevel> METRIC_ACCUMULATION_LEVEL = CollectionUtil.makeSet(MetricAccumulationLevel.ALL_READS); public static final String DETAIL_FILE_EXTENSION = "rrbs_detail_metrics"; public static final String SUMMARY_FILE_EXTENSION = "rrbs_summary_metrics"; public static final String PDF_FILE_EXTENSION = "rrbs_qc.pdf"; private static final Log log = Log.getInstance(CollectRrbsMetrics.class); public static void main(final String[] args) { new CollectRrbsMetrics().instanceMainWithExit(args); } @Override protected int doWork() { if (!METRICS_FILE_PREFIX.endsWith(".")) { METRICS_FILE_PREFIX = METRICS_FILE_PREFIX + "."; } final File SUMMARY_OUT = new File(METRICS_FILE_PREFIX + SUMMARY_FILE_EXTENSION); final File DETAILS_OUT = new File(METRICS_FILE_PREFIX + DETAIL_FILE_EXTENSION); final File PLOTS_OUT = new File(METRICS_FILE_PREFIX + PDF_FILE_EXTENSION); assertIoFiles(SUMMARY_OUT, DETAILS_OUT, PLOTS_OUT); final SAMFileReader samReader = new SAMFileReader(INPUT); if (!ASSUME_SORTED && samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { throw new PicardException("The input file " + INPUT.getAbsolutePath() + " does not appear to be coordinate sorted"); } final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE); final ProgressLogger progressLogger = new ProgressLogger(log); final RrbsMetricsCollector metricsCollector = new RrbsMetricsCollector(METRIC_ACCUMULATION_LEVEL, samReader.getFileHeader().getReadGroups(), C_QUALITY_THRESHOLD, NEXT_BASE_QUALITY_THRESHOLD, MINIMUM_READ_LENGTH, MAX_MISMATCH_RATE); for (final SAMRecord samRecord : samReader) { progressLogger.record(samRecord); if (!samRecord.getReadUnmappedFlag() && !isSequenceFiltered(samRecord.getReferenceName())) { final ReferenceSequence referenceSequence = refWalker.get(samRecord.getReferenceIndex()); metricsCollector.acceptRecord(samRecord, referenceSequence); } } metricsCollector.finish(); final MetricsFile<RrbsMetrics, Comparable<?>> rrbsMetrics = getMetricsFile(); metricsCollector.addAllLevelsToFile(rrbsMetrics); // Using RrbsMetrics as a way to get both of the metrics objects through the MultiLevelCollector. Once // we get it out split it apart to the two separate MetricsFiles and write them to file final MetricsFile<RrbsSummaryMetrics, ?> summaryFile = getMetricsFile(); final MetricsFile<RrbsCpgDetailMetrics, ?> detailsFile = getMetricsFile(); for (final RrbsMetrics rrbsMetric : rrbsMetrics.getMetrics()) { summaryFile.addMetric(rrbsMetric.getSummaryMetrics()); for (final RrbsCpgDetailMetrics detailMetric : rrbsMetric.getDetailMetrics()) { detailsFile.addMetric(detailMetric); } } summaryFile.write(SUMMARY_OUT); detailsFile.write(DETAILS_OUT); RExecutor.executeFromClasspath(R_SCRIPT, DETAILS_OUT.getAbsolutePath(), SUMMARY_OUT.getAbsolutePath(), PLOTS_OUT.getAbsolutePath()); return 0; } private boolean isSequenceFiltered(final String sequenceName) { return (SEQUENCE_NAMES != null) && (SEQUENCE_NAMES.size() > 0) && (!SEQUENCE_NAMES.contains(sequenceName)); } private void assertIoFiles(final File summaryFile, final File detailsFile, final File plotsFile) { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsReadable(REFERENCE); IOUtil.assertFileIsWritable(summaryFile); IOUtil.assertFileIsWritable(detailsFile); IOUtil.assertFileIsWritable(plotsFile); } @Override protected String[] customCommandLineValidation() { final List<String> errorMsgs = new ArrayList<String>(); if (MAX_MISMATCH_RATE < 0 || MAX_MISMATCH_RATE > 1) { errorMsgs.add("MAX_MISMATCH_RATE must be in the range of 0-1"); } if (C_QUALITY_THRESHOLD < 0) { errorMsgs.add("C_QUALITY_THRESHOLD must be >= 0"); } if (NEXT_BASE_QUALITY_THRESHOLD < 0) { errorMsgs.add("NEXT_BASE_QUALITY_THRESHOLD must be >= 0"); } if (MINIMUM_READ_LENGTH <= 0) { errorMsgs.add("MINIMUM_READ_LENGTH must be > 0"); } return errorMsgs.size() == 0 ? null : errorMsgs.toArray(new String[errorMsgs.size()]); } }