/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.DownsamplingIteratorFactory;
import htsjdk.samtools.DownsamplingIteratorFactory.Strategy;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.DownsamplingIterator;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;
import java.io.File;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Random;
/**
* Class to randomly downsample a BAM file while respecting that we should either retain or discard
* all of the reads for a template - i.e. all reads with the same name, whether first or second of
* pair, secondary or supplementary, all travel together.
*
* @author Tim Fennell
*/
@CommandLineProgramProperties(
usage = DownsampleSam.USAGE_SUMMARY + DownsampleSam.USAGE_DETAILS,
usageShort = DownsampleSam.USAGE_SUMMARY,
programGroup = SamOrBam.class
)
public class DownsampleSam extends CommandLineProgram {
static final String USAGE_SUMMARY = "Downsample a SAM or BAM file. ";
static final String USAGE_DETAILS = "This tool applies a random downsampling algorithm to a SAM or BAM file to retain " +
"only a random subset of the reads. Reads in a mate-pair are either both kept or both discarded. Reads marked as not primary " +
"alignments are all discarded. Each read is given a probability P of being retained so that runs performed with the exact " +
"same input in the same order and with the same value for RANDOM_SEED will produce the same results." +
"All reads for a template are kept or discarded as a unit, with the goal of retaining reads" +
"from PROBABILITY * input templates. While this will usually result in approximately " +
"PROBABILITY * input reads being retained also, for very small PROBABILITIES this may not " +
"be the case.\n" +
"A number of different downsampling strategies are supported using the STRATEGY option:\n\n" +
"ConstantMemory: " + DownsamplingIteratorFactory.CONSTANT_MEMORY_DESCRPTION + "\n\n" +
"HighAccuracy: " + DownsamplingIteratorFactory.HIGH_ACCURACY_DESCRIPTION + "\n\n" +
"Chained: " + DownsamplingIteratorFactory.CHAINED_DESCRIPTION + "\n\n" +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar DownsampleSam \\<br />" +
" I=input.bam \\<br />" +
" O=downsampled.bam" +
"</pre>" +
"<hr />";
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to downsample.")
public File INPUT;
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM or BAM file to write.")
public File OUTPUT;
@Option(shortName="S", doc="The downsampling strategy to use. See usage for discussion.")
public Strategy STRATEGY = Strategy.ConstantMemory;
@Option(shortName = "R", doc = "Random seed to use if deterministic behavior is desired. " +
"Setting to null will cause multiple invocations to produce different results.")
public Integer RANDOM_SEED = 1;
@Option(shortName = "P", doc = "The probability of keeping any individual read, between 0 and 1.")
public double PROBABILITY = 1;
@Option(shortName = "A", doc = "The accuracy that the downsampler should try to achieve if the selected strategy supports it. " +
"Note that accuracy is never guaranteed, but some strategies will attempt to provide accuracy within the requested bounds." +
"Higher accuracy will generally require more memory.")
public double ACCURACY = 0.0001;
private final Log log = Log.getInstance(DownsampleSam.class);
public static void main(final String[] args) {
new DownsampleSam().instanceMainWithExit(args);
}
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
// Warn the user if they are running with P=1; 0 <= P <= 1 is checked by the DownsamplingIteratorFactory
if (PROBABILITY == 1) {
log.warn("Running DownsampleSam with PROBABILITY=1! This will likely just recreate the input file.");
}
final Random r = RANDOM_SEED == null ? new Random() : new Random(RANDOM_SEED);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Wrote");
final DownsamplingIterator iterator = DownsamplingIteratorFactory.make(INPUT, STRATEGY, PROBABILITY, ACCURACY, RANDOM_SEED);
while (iterator.hasNext()) {
final SAMRecord rec = iterator.next();
out.addAlignment(rec);
progress.record(rec);
}
out.close();
CloserUtil.close(in);
final NumberFormat fmt = new DecimalFormat("0.00%");
log.info("Finished downsampling.");
log.info("Kept ", iterator.getAcceptedCount(), " out of ", iterator.getSeenCount(), " reads (", fmt.format(iterator.getAcceptedFraction()), ").");
return 0;
}
}