/*
* The MIT License
*
* Copyright (c) 2014-2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates.util;
import htsjdk.samtools.util.Log;
import htsjdk.samtools.util.ProgressLogger;
import picard.sam.util.PhysicalLocation;
import picard.sam.util.ReadNameParser;
import java.util.List;
/**
* Contains methods for finding optical/co-localized/sequencing duplicates.
*
* @author Tim Fennell
* @author Nils Homer
*/
public class OpticalDuplicateFinder extends ReadNameParser {
public int opticalDuplicatePixelDistance;
public static final int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100;
public static final int DEFAULT_BIG_DUPLICATE_SET_SIZE = 1000;
/**
* Sets the size of a set that is big enough to log progress about.
* Defaults to {@value OpticalDuplicateFinder#DEFAULT_BIG_DUPLICATE_SET_SIZE}
*
* @param bigDuplicateSetSize the size of a set that is big enough to log progress about
*/
public void setBigDuplicateSetSize(final int bigDuplicateSetSize) {
this.bigDuplicateSetSize = bigDuplicateSetSize;
}
private int bigDuplicateSetSize = DEFAULT_BIG_DUPLICATE_SET_SIZE;
/**
* Uses the default duplicate distance {@value OpticalDuplicateFinder#DEFAULT_OPTICAL_DUPLICATE_DISTANCE} and the default read name regex
* {@link ReadNameParser#DEFAULT_READ_NAME_REGEX}.
*/
public OpticalDuplicateFinder() {
super();
this.opticalDuplicatePixelDistance = DEFAULT_OPTICAL_DUPLICATE_DISTANCE;
}
/**
*
* @param readNameRegex see {@link ReadNameParser#DEFAULT_READ_NAME_REGEX}.
* @param opticalDuplicatePixelDistance the optical duplicate pixel distance
* @param log the log to which to write messages.
*/
public OpticalDuplicateFinder(final String readNameRegex, final int opticalDuplicatePixelDistance, final Log log) {
super(readNameRegex, log);
this.opticalDuplicatePixelDistance = opticalDuplicatePixelDistance;
}
/**
* Finds which reads within the list of duplicates that are likely to be optical/co-localized duplicates of
* one another. Within each cluster of optical duplicates that is found, one read remains un-flagged for
* optical duplication and the rest are flagged as optical duplicates. The set of reads that are considered
* optical duplicates are indicated by returning "true" at the same index in the resulting boolean[] as the
* read appeared in the input list of physical locations.
*
* @param list a list of reads that are determined to be duplicates of one another
* @param keeper a single PhysicalLocation that is the one being kept as non-duplicate, and thus should never be
* annotated as an optical duplicate. May in some cases be null, or a PhysicalLocation not
* contained within the list!
* @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates
*/
public boolean[] findOpticalDuplicates(final List<? extends PhysicalLocation> list, final PhysicalLocation keeper) {
// If there is only one or zero reads passed in, then just return an array of all false
if (list.size() < 2) return new boolean[list.size()];
final int length = list.size();
final boolean[] opticalDuplicateFlags = new boolean[length];
final int distance = this.opticalDuplicatePixelDistance;
final PhysicalLocation actualKeeper = keeperOrNull(list, keeper);
final Log log;
final ProgressLogger progressLoggerForKeeper, progressLoggerForRest;
final boolean logProgress = length > bigDuplicateSetSize;
if (logProgress) {
log = Log.getInstance(OpticalDuplicateFinder.class);
progressLoggerForKeeper = new ProgressLogger(log, 10000, "compared", "ReadEnds to keeper");
progressLoggerForRest = new ProgressLogger(log, 1000, "compared", "ReadEnds to others");
log.info("Large duplicate set. size = " + length);
log.debug("About to compare to keeper:" + actualKeeper);
} else {
log = null;
progressLoggerForKeeper = null;
progressLoggerForRest = null;
}
// First go through and compare all the reads to the keeper
if (actualKeeper != null) {
for (int i = 0; i < length; ++i) {
final PhysicalLocation other = list.get(i);
opticalDuplicateFlags[i] = closeEnough(actualKeeper, other, distance);
// The main point of adding this log and if statement (also below) is a workaround a bug in the JVM
// which causes a deep exception (https://github.com/broadinstitute/picard/issues/472).
// It seems that this is related to https://bugs.openjdk.java.net/browse/JDK-8033717 which
// was closed due to non-reproducibility. We came across a bam file that evoked this error
// every time we tried to duplicate-mark it. The problem seemed to be a duplicate-set of size 500,000,
// and this loop seemed to kill the JVM for some reason. This logging statement (and the one in the
// loop below) solved the problem.
if (logProgress) progressLoggerForKeeper.record(String.format("%d", other.getReadGroup()), other.getX());
}
}
if (logProgress) log.debug("Done with comparing to keeper, now the rest.");
// Now go through and do each pairwise comparison not involving the actualKeeper
for (int i = 0; i < length; ++i) {
final PhysicalLocation lhs = list.get(i);
if (lhs == actualKeeper) continue; // no comparisons to actualKeeper since those are all handled above
// logging here for same reason as above
if (logProgress) progressLoggerForRest.record(String.format("%d", lhs.getReadGroup()), lhs.getX());
for (int j = i + 1; j < length; ++j) {
final PhysicalLocation rhs = list.get(j);
if (rhs == actualKeeper) continue; // no comparisons to actualKeeper since those are all handled above
if (opticalDuplicateFlags[i] && opticalDuplicateFlags[j]) continue; // both already marked, no need to check
if (closeEnough(lhs, rhs, distance)) {
// At this point we want to mark either lhs or rhs as duplicate. Either could have been marked
// as a duplicate of the keeper (but not both - that's checked above), so be careful about which
// one to now mark as a duplicate.
final int index = opticalDuplicateFlags[j] ? i : j;
opticalDuplicateFlags[index] = true;
}
}
}
return opticalDuplicateFlags;
}
/** Returns the keeper if it is contained within the list and has location information, otherwise null. */
private PhysicalLocation keeperOrNull(final List<? extends PhysicalLocation> list, final PhysicalLocation keeper) {
if (keeper != null && keeper.hasLocation()) {
for (final PhysicalLocation loc : list) {
if (loc == keeper) return keeper;
}
}
return null;
}
/** Simple method to test whether two physical locations are close enough to each other to be deemed optical dupes. */
private boolean closeEnough(final PhysicalLocation lhs, final PhysicalLocation rhs, final int distance) {
return lhs != rhs && // no comparing an object to itself (checked using object identity)!
lhs.hasLocation() && rhs.hasLocation() && // no comparing objects without locations
lhs.getReadGroup() == rhs.getReadGroup() && // must be in the same RG to be optical duplicates
lhs.getTile() == rhs.getTile() && // and the same tile
Math.abs(lhs.getX() - rhs.getX()) <= distance &&
Math.abs(lhs.getY() - rhs.getY()) <= distance;
}
}