/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileWriter;
import htsjdk.samtools.SAMFileWriterFactory;
import htsjdk.samtools.SAMProgramRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Histogram;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.CommandLineProgramProperties;
import picard.cmdline.Option;
import picard.cmdline.StandardOptionDefinitions;
import picard.cmdline.programgroups.SamOrBam;
import picard.sam.markduplicates.util.OpticalDuplicateFinder;
import picard.sam.util.PhysicalLocationInt;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Class to downsample a BAM file while respecting that we should either get rid
* of both ends of a pair or neither end of the pair. In addition, this program uses the read-name
* and extracts the position within the tile whence the read came from. The downsampling is based on this position.
* <p/>
* Note 1: This is technology and read-name dependent. If your read-names do not have coordinate information, or if your
* BAM contains reads from multiple technologies (flowcell versions, sequencing machines) this will not work properly.
* This has been designed with Illumina MiSeq/HiSeq in mind.
* <p/>
* Note 2: The downsampling is _not_ random. It is deterministically dependent on the position of the read within its tile. Specifically,
* it draws out an ellipse that covers a FRACTION fraction of the area and each of the edges and uses this to determine whether to keep the
* record. Since reads with the same name have the same position (mates, secondary and supplemental alignments), the decision will be the same for all of them.
* <p/>
* Finally, the code has been designed to simulate sequencing less as accurately as possible, not for getting an exact downsample fraction.
* In particular, since the reads may be distributed non-evenly within the lanes/tiles, the resulting downsampling percentage will not be accurately
* determined by the input argument FRACTION. One should re-MarkDuplicates after downsampling in order to "expose" the duplicates whose representative has
* been downsampled away.
*
* @author Yossi Farjoun
*/
@CommandLineProgramProperties(
usage = "Class to downsample a BAM file while respecting that we should either get rid of both ends of a pair or neither \n" +
"end of the pair. In addition, this program uses the read-name and extracts the position within the tile whence \n" +
"the read came from. The downsampling is based on this position. Results with the exact same input will produce the \n" +
"same results.\n" +
"\n" +
"Note 1: This is technology and read-name dependent. If your read-names do not have coordinate information, or if your\n" +
"BAM contains reads from multiple technologies (flowcell versions, sequencing machines) this will not work properly. \n" +
"This has been designed with Illumina MiSeq/HiSeq in mind.\n" +
"Note 2: The downsampling is not random. It is deterministically dependent on the position of the read within its tile.\n" +
"Note 3: Downsampling twice with this program is not supported.\n" +
"Note 4: You should call MarkDuplicates after downsampling.\n" +
"\n" +
"Finally, the code has been designed to simulate sequencing less as accurately as possible, not for getting an exact downsample \n" +
"fraction. In particular, since the reads may be distributed non-evenly within the lanes/tiles, the resulting downsampling \n" +
"percentage will not be accurately determined by the input argument FRACTION.",
usageShort = "Downsample a SAM or BAM file to retain a subset of the reads based on the reads location in each tile in the flowcell.",
programGroup = SamOrBam.class
)
public class PositionBasedDownsampleSam extends CommandLineProgram {
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to downsample.")
public File INPUT;
@Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM or BAM file to write.")
public File OUTPUT;
@Option(shortName = "F", doc = "The (approximate) fraction of reads to be kept, between 0 and 1.", optional = false)
public Double FRACTION = null;
@Option(doc = "Stop after processing N reads, mainly for debugging.", optional = true)
public Long STOP_AFTER = null;
@Option(doc = "Allow Downsampling again despite this being a bad idea with possibly unexpected results.", optional = true)
public boolean ALLOW_MULTIPLE_DOWNSAMPLING_DESPITE_WARNINGS = false;
@Option(doc = "Determines whether the duplicate tag should be reset since the downsampling requires re-marking duplicates.")
public boolean REMOVE_DUPLICATE_INFORMATION = true;
private final Log log = Log.getInstance(PositionBasedDownsampleSam.class);
private OpticalDuplicateFinder opticalDuplicateFinder;
private long total = 0;
private long kept = 0;
public static String PG_PROGRAM_NAME = "PositionBasedDownsampleSam";
private final static double ACCEPTABLE_FUDGE_FACTOR = 0.2;
/* max-position in tile as a function of tile. We might need to
look per-readgroup, but at this point I'm making the assumptions that I need to downsample a
sample where all the readgroups came from the same type of flowcell. */
CollectionUtil.DefaultingMap.Factory<Coord, Short> defaultingMapFactory = new CollectionUtil.DefaultingMap.Factory<Coord, Short>() {
@Override
public Coord make(final Short aShort) {
return new Coord();
}
};
final private Map<Short, Coord> tileCoord = new CollectionUtil.DefaultingMap<Short, Coord>(defaultingMapFactory, true);
final Map<Short, Histogram<Short>> xPositions = new HashMap<Short, Histogram<Short>>();
final Map<Short, Histogram<Short>> yPositions = new HashMap<Short, Histogram<Short>>();
public static void main(final String[] args) {
new PositionBasedDownsampleSam().instanceMainWithExit(args);
}
@Override
protected String[] customCommandLineValidation() {
final List<String> errors = new ArrayList<String>();
if (FRACTION < 0 || FRACTION > 1) {
errors.add("FRACTION must be a value between 0 and 1, found: " + FRACTION);
}
if (errors.isEmpty()) {
return null;
} else {
return errors.toArray(new String[errors.size()]);
}
}
@Override
protected int doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsWritable(OUTPUT);
log.info("Checking to see if input file has been downsampled with this program before.");
checkProgramRecords();
opticalDuplicateFinder = new OpticalDuplicateFinder();
log.info("Starting first pass. Examining read distribution in tiles.");
fillTileMinMaxCoord();
log.info("First pass done.");
log.info("Starting second pass. Outputting reads.");
outputSamRecords();
log.info("Second pass done.");
final double finalP = kept / (double) total;
if (Math.abs(finalP - FRACTION) / (Math.min(finalP, FRACTION) + 1e-10) > ACCEPTABLE_FUDGE_FACTOR) {
log.warn(String.format("You've requested FRACTION=%g, the resulting downsampling resulted in a rate of %f.", FRACTION, finalP));
}
log.info(String.format("Finished! Kept %d out of %d reads (P=%g).", kept, total, finalP));
return 0;
}
private void outputSamRecords() {
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7);
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final SAMFileHeader header = in.getFileHeader().clone();
final SAMFileHeader.PgIdGenerator pgIdGenerator = new SAMFileHeader.PgIdGenerator(header);
final SAMProgramRecord programRecord = new SAMProgramRecord(pgIdGenerator.getNonCollidingId(PG_PROGRAM_NAME));
programRecord.setProgramName(PG_PROGRAM_NAME);
programRecord.setCommandLine(getCommandLine());
programRecord.setProgramVersion(getVersion());
header.addProgramRecord(programRecord);
final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, OUTPUT);
final CircleSelector selector = new CircleSelector(FRACTION);
for (final SAMRecord rec : in) {
if (STOP_AFTER != null && total >= STOP_AFTER) break;
total++;
final PhysicalLocationInt pos = getSamRecordLocation(rec);
if (!xPositions.containsKey(pos.getTile())) {
xPositions.put(pos.getTile(), new Histogram<Short>(pos.getTile() + "-xpos", "count"));
}
if (!yPositions.containsKey(pos.getTile())) {
yPositions.put(pos.getTile(), new Histogram<Short>(pos.getTile() + "-ypos", "count"));
}
final boolean keepRecord = selector.select(pos, tileCoord.get(pos.getTile()));
if (keepRecord) {
if (REMOVE_DUPLICATE_INFORMATION) rec.setDuplicateReadFlag(false);
out.addAlignment(rec);
kept++;
}
progress.record(rec);
}
out.close();
CloserUtil.close(in);
}
private void checkProgramRecords() {
final SamReader in = SamReaderFactory
.makeDefault()
.referenceSequence(REFERENCE_SEQUENCE)
.open(INPUT);
for (final SAMProgramRecord pg : in.getFileHeader().getProgramRecords()) {
if (pg.getProgramName() != null && pg.getProgramName().equals(PG_PROGRAM_NAME)) {
final String outText = "Found previous Program Record that indicates that this BAM has been downsampled already with this program. Operation not supported! Previous PG: " + pg.toString();
if (ALLOW_MULTIPLE_DOWNSAMPLING_DESPITE_WARNINGS) {
log.warn(outText);
} else {
log.error(outText);
throw new PicardException(outText);
}
}
}
CloserUtil.close(in);
}
// scan all the tiles and find the smallest and largest coordinate (x & y) in that tile.
private void fillTileMinMaxCoord() {
final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT);
final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Read");
int total = 0;
for (final SAMRecord rec : in) {
if (STOP_AFTER != null && total >= STOP_AFTER) break;
total++;
progress.record(rec);
final PhysicalLocationInt location = getSamRecordLocation(rec);
//Defaulting map will create a new Coord if it's not there.
final Coord Pos = tileCoord.get(location.getTile());
Pos.maxX = Math.max(Pos.maxX, location.getX());
Pos.minX = Math.min(Pos.minX, location.getX());
Pos.maxY = Math.max(Pos.maxY, location.getY());
Pos.minY = Math.min(Pos.minY, location.getY());
Pos.count++;
}
// now that we know what the maximal/minimal numbers were, we should increase/decrease them a little, to account for sampling error
for (final Coord coord : tileCoord.values()) {
final int diffX = coord.maxX - coord.minX;
final int diffY = coord.maxY - coord.minY;
coord.maxX += diffX / coord.count;
coord.minX -= diffX / coord.count;
coord.maxY += diffY / coord.count;
coord.minY -= diffY / coord.count;
}
CloserUtil.close(in);
}
private PhysicalLocationInt getSamRecordLocation(final SAMRecord rec) {
final PhysicalLocationInt pos = new PhysicalLocationInt();
opticalDuplicateFinder.addLocationInformation(rec.getReadName(), pos);
return pos;
}
/*
* The reads are selected depending on whether they are in a periodically repeating circle whose representative
* overlaps the boundary of the tile. The circle is chosen as to have an area of FRACTION and the also a overlap
* of FRACTION with both the bottom and left edges of the unit square ([0,1] x [0,1]), which defines it uniquely.
* Finally the repeating pattern is there to make sure that the mask on the flowcell also has minimal boundary.
*
* The position of the reads is mapped into the unit square using the min/max coordinates of the tile prior to
* masking
*
* This choice of a mask is intended to accomplish several goasl:
* - pick out a fraction FRACTION of the reads
* - pick out a fraction FRACTION of the reads that are near the boundaries
* - keep nearby reads (in a tile) together by minimizing the boundary of the mask itself
* - keep nearby reads (in neighboring tiles) together (since they might be optical duplicates) by keeping that the
* mask is the same on all tiles, and by having the same mask on the left edge as on the right (same for top/bottom).
*/
private class CircleSelector {
private final double radiusSquared;
private final double offset;
private final boolean positiveSelection;
CircleSelector(final double fraction) {
final double p;
if (fraction > 0.5) {
p = 1 - fraction;
positiveSelection = false;
} else {
p = fraction;
positiveSelection = true;
}
radiusSquared = p / Math.PI; //thus the area is \pi r^2 = p, thus a fraction p of the unit square will be captured
/* if offset is used as the center of the circle (both x and y), this makes the overlap
region with each of the boundaries of the unit square have length p (and thus a fraction
p of the boundaries of each tile will be removed) */
if (p < 0) {
// at this point a negative p will result in a square-root of a negative number in the next step.
throw new PicardException("This shouldn't happen...");
}
offset = Math.sqrt(radiusSquared - p * p / 4);
}
private double roundedPart(final double x) {return x - Math.round(x);}
// this function checks to see if the location of the read is within the masking circle
private boolean select(final PhysicalLocationInt coord, final Coord tileCoord) {
// r^2 = (x-x_0)^2 + (y-y_0)^2, where both x_0 and y_0 equal offset
final double distanceSquared =
Math.pow(roundedPart(((coord.getX() - tileCoord.minX) / (double) (tileCoord.maxX - tileCoord.minX)) - offset), 2) +
Math.pow(roundedPart(((coord.getY() - tileCoord.minY) / (double) (tileCoord.maxY - tileCoord.minY)) - offset), 2);
return (distanceSquared > radiusSquared) ^ positiveSelection;
}
}
private class Coord {
public int minX;
public int minY;
public int maxX;
public int maxY;
public int count;
public Coord() {
count = 0;
minX = 0;
minY = 0;
maxX = 0;
maxY = 0;
}
}
}