package picard.sam; import htsjdk.samtools.SAMFileReader; import htsjdk.samtools.SAMFileWriter; import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; import htsjdk.samtools.util.ProgressLogger; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.SamOrBam; import java.io.File; import java.util.HashMap; import java.util.Map; import java.util.Random; /** * Class to randomly downsample a BAM file while respecting that we should either get rid * of both ends of a pair or neither end of the pair! */ @CommandLineProgramProperties( usage = "Randomly down-sample a SAM or BAM file to retain " + "a random subset of the reads. Mate-pairs are either both kept or both discarded. Reads marked as not primary " + "alignments are all discarded. Each read is given a probability P of being retained - results with the exact " + "same input in the same order and with the same value for RANDOM_SEED will produce the same results.", usageShort = "Down-sample a SAM or BAM file to retain a random subset of the reads", programGroup = SamOrBam.class ) public class DownsampleSam extends CommandLineProgram { @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The input SAM or BAM file to downsample.") public File INPUT; @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The output, downsampled, SAM or BAM file to write.") public File OUTPUT; @Option(shortName="R", doc="Random seed to use if reproducibilty is desired. " + "Setting to null will cause multiple invocations to produce different results.") public Long RANDOM_SEED = 1L; @Option(shortName="P", doc="The probability of keeping any individual read, between 0 and 1.") public double PROBABILITY = 1; private final Log log = Log.getInstance(DownsampleSam.class); public static void main(final String[] args) { new DownsampleSam().instanceMainWithExit(args); } @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsWritable(OUTPUT); final Random r = RANDOM_SEED == null ? new Random() : new Random(RANDOM_SEED); final SAMFileReader in = new SAMFileReader(INPUT); final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT); final Map<String,Boolean> decisions = new HashMap<String,Boolean>(); long total = 0; long kept = 0; final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Read"); for (final SAMRecord rec : in) { if (rec.isSecondaryOrSupplementary()) continue; ++total; final String key = rec.getReadName(); final Boolean previous = decisions.remove(key); final boolean keeper; if (previous == null) { keeper = r.nextDouble() <= PROBABILITY; if (rec.getReadPairedFlag()) decisions.put(key, keeper); } else { keeper = previous; } if (keeper) { out.addAlignment(rec); ++kept; } progress.record(rec); } out.close(); log.info("Finished! Kept " + kept + " out of " + total + " reads."); return 0; } }