/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.analysis.directed;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.StringUtil;
import picard.analysis.MetricAccumulationLevel;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.programgroups.Metrics;
import java.io.File;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME;
/**
* Collects a set of HS metrics from a sam or bam file. See HsMetricsCollector and CollectTargetedMetrics for more details.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = CollectHsMetrics.USAGE_SUMMARY + CollectHsMetrics.USAGE_DETAILS,
usageShort = CollectHsMetrics.USAGE_SUMMARY,
programGroup = Metrics.class
)
public class CollectHsMetrics extends CollectTargetedMetrics<HsMetrics, HsMetricCollector> {
static final String USAGE_SUMMARY = "Collects hybrid-selection (HS) metrics for a SAM or BAM file. ";
static final String USAGE_DETAILS = "This tool takes a SAM/BAM file input and collects metrics that are specific for sequence "+
"datasets generated through hybrid-selection. Hybrid-selection (HS) is the most commonly used technique to capture "+
"exon-specific sequences for targeted sequencing experiments such as exome sequencing; for more information, please " +
"see the corresponding <a href='http://www.broadinstitute.org/gatk/guide/article?id=6331'>GATK Dictionary entry</a>. </p> "+
"<p>This tool requires an aligned SAM or BAM file as well as bait and target interval files in Picard interval_list format. " +
"You should use the bait and interval files that correspond to the capture kit that was used to generate the capture " +
"libraries for sequencing, which can generally be obtained from the kit manufacturer. If the baits and target " +
"intervals are provided in BED format, you can convert them to the Picard interval_list format using Picard's " +
"<a href='http://broadinstitute.github.io/picard/command-line-overview.html#BedToIntervalList'>BedToInterval</a> tool. </p>" +
"<p>If a reference sequence is provided, this program will calculate both AT_DROPOUT and GC_DROPOUT metrics. Dropout " +
"metrics are an attempt to measure the reduced representation of reads, in regions that deviate from 50% G/C content. " +
"This reduction in the number of aligned reads is due to the increased numbers of errors associated with sequencing " +
"regions with excessive or deficient numbers of G/C bases, ultimately leading to poor mapping efficiencies and low" +
"coverage in the affected regions. </p>" +
"<p>If you are interested in getting G/C content and mean sequence depth information for every target interval, use the " +
"PER_TARGET_COVERAGE option. </p>" +
"<p>Note: Metrics labeled as percentages are actually expressed as fractions!</p> "+
"<h4>Usage Example:</h4>"+
"<pre>" +
"java -jar picard.jar CollectHsMetrics \\<br />" +
" I=input.bam \\<br />" +
" O=hs_metrics.txt \\<br />" +
" R=reference_sequence.fasta \\<br />" +
" BAIT_INTERVALS=bait.interval_list \\<br />" +
" TARGET_INTERVALS=target.interval_list" +
"</pre> " +
"<p>Please see " +
"<a href='http://broadinstitute.github.io/picard/picard-metric-definitions.html#HsMetrics'>CollectHsMetrics</a> for " +
"detailed descriptions of the output metrics produced by this tool.</p>" +
"<hr />"
;
@Option(shortName = "BI", doc = "An interval list file that contains the locations of the baits used.", minElements=1)
public List<File> BAIT_INTERVALS;
@Option(shortName = "N", doc = "Bait set name. If not provided it is inferred from the filename of the bait intervals.", optional = true)
public String BAIT_SET_NAME;
@Option(shortName = MINIMUM_MAPPING_QUALITY_SHORT_NAME, doc = "Minimum mapping quality for a read to contribute coverage.", overridable = true)
public int MINIMUM_MAPPING_QUALITY = 20;
@Option(shortName = "Q", doc = "Minimum base quality for a base to contribute coverage.", overridable = true)
public int MINIMUM_BASE_QUALITY = 20;
@Option(doc = "True if we are to clip overlapping reads, false otherwise.", optional=true, overridable = true)
public boolean CLIP_OVERLAPPING_READS = true;
@Override
protected IntervalList getProbeIntervals() {
for (final File file : BAIT_INTERVALS) IOUtil.assertFileIsReadable(file);
return IntervalList.fromFiles(BAIT_INTERVALS);
}
@Override
protected String getProbeSetName() {
if (BAIT_SET_NAME != null) {
return BAIT_SET_NAME;
} else {
final SortedSet<String> baitSetNames = new TreeSet<String>();
for (final File file : BAIT_INTERVALS) {
baitSetNames.add(CollectTargetedMetrics.renderProbeNameFromFile(file));
}
return StringUtil.join(".", baitSetNames);
}
}
/** Stock main method. */
public static void main(final String[] argv) {
System.exit(new CollectHsMetrics().instanceMain(argv));
}
@Override
protected HsMetricCollector makeCollector(final Set<MetricAccumulationLevel> accumulationLevels,
final List<SAMReadGroupRecord> samRgRecords,
final ReferenceSequenceFile refFile,
final File perTargetCoverage,
final File perBaseCoverage,
final IntervalList targetIntervals,
final IntervalList probeIntervals,
final String probeSetName,
final int nearProbeDistance) {
return new HsMetricCollector(accumulationLevels, samRgRecords, refFile, perTargetCoverage, perBaseCoverage, targetIntervals, probeIntervals, probeSetName, nearProbeDistance,
MINIMUM_MAPPING_QUALITY, MINIMUM_BASE_QUALITY, CLIP_OVERLAPPING_READS, true, COVERAGE_CAP, SAMPLE_SIZE);
}
}