/* * The MIT License * * Copyright (c) 2014 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package picard.sam.markduplicates.util; import htsjdk.samtools.util.Log; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Contains methods for finding optical duplicates. * * @author Tim Fennell * @author Nils Homer */ public class OpticalDuplicateFinder { public static final String DEFAULT_READ_NAME_REGEX = "[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*".intern(); public static final int DEFAULT_OPTICAL_DUPLICATE_DISTANCE = 100; public String readNameRegex; public int opticalDuplicatePixelDistance; private Pattern readNamePattern; private boolean warnedAboutRegexNotMatching = false; private final Log log; public OpticalDuplicateFinder() { this(DEFAULT_READ_NAME_REGEX, DEFAULT_OPTICAL_DUPLICATE_DISTANCE); } public OpticalDuplicateFinder(final int opticalDuplicatePixelDistance) { this(DEFAULT_READ_NAME_REGEX, opticalDuplicatePixelDistance); } public OpticalDuplicateFinder(final String readNameRegex) { this(readNameRegex, DEFAULT_OPTICAL_DUPLICATE_DISTANCE); } public OpticalDuplicateFinder(final String readNameRegex, final int opticalDuplicatePixelDistance) { this(readNameRegex, opticalDuplicatePixelDistance, null); } public OpticalDuplicateFinder(final String readNameRegex, final int opticalDuplicatePixelDistance, final Log log) { this.readNameRegex = readNameRegex; this.opticalDuplicatePixelDistance = opticalDuplicatePixelDistance; this.log = log; } /** * Small interface that provides access to the physical location information about a cluster. * All values should be defaulted to -1 if unavailable. ReadGroup and Tile should only allow * non-zero positive integers, x and y coordinates may be negative. */ public static interface PhysicalLocation { public short getReadGroup(); public void setReadGroup(short rg); public short getTile(); public void setTile(short tile); public short getX(); public void setX(short x); public short getY(); public void setY(short y); public short getLibraryId(); public void setLibraryId(short libraryId); } private final int[] tmpLocationFields = new int[10]; // for optimization of addLocationInformation /** * Method used to extract tile/x/y from the read name and add it to the PhysicalLocation so that it * can be used later to determine optical duplication * * @param readName the name of the read/cluster * @param loc the object to add tile/x/y to * @return true if the read name contained the information in parsable form, false otherwise */ public boolean addLocationInformation(final String readName, final PhysicalLocation loc) { // Optimized version if using the default read name regex (== used on purpose): if (this.readNameRegex == this.DEFAULT_READ_NAME_REGEX) { final int fields = getRapidDefaultReadNameRegexSplit(readName, ':', tmpLocationFields); if (!(fields == 5 || fields == 7)) { if (null != log && !this.warnedAboutRegexNotMatching) { this.log.warn(String.format("Default READ_NAME_REGEX '%s' did not match read name '%s'. " + "You may need to specify a READ_NAME_REGEX in order to correctly identify optical duplicates. " + "Note that this message will not be emitted again even if other read names do not match the regex.", this.readNameRegex, readName)); this.warnedAboutRegexNotMatching = true; } return false; } final int offset = fields == 7 ? 2 : 0; loc.setTile((short) tmpLocationFields[offset + 2]); loc.setX((short) tmpLocationFields[offset + 3]); loc.setY((short) tmpLocationFields[offset + 4]); return true; } else if (this.readNameRegex == null) { return false; } else { // Standard version that will use the regex if (this.readNamePattern == null) this.readNamePattern = Pattern.compile(this.readNameRegex); final Matcher m = this.readNamePattern.matcher(readName); if (m.matches()) { loc.setTile((short) Integer.parseInt(m.group(1))); loc.setX((short) Integer.parseInt(m.group(2))); loc.setY((short) Integer.parseInt(m.group(3))); return true; } else { if (null != log && !this.warnedAboutRegexNotMatching) { this.log.warn(String.format("READ_NAME_REGEX '%s' did not match read name '%s'. Your regex may not be correct. " + "Note that this message will not be emitted again even if other read names do not match the regex.", this.readNameRegex, readName)); warnedAboutRegexNotMatching = true; } return false; } } } /** * Single pass method to parse the read name for the default regex. This will only insert the 2nd to the 4th * tokens (inclusive). It will also stop after the fifth token has been successfully parsed. */ protected int getRapidDefaultReadNameRegexSplit(final String readName, final char delim, final int[] tokens) { int tokensIdx = 0; int prevIdx = 0; for (int i = 0; i < readName.length(); i++) { if (readName.charAt(i) == delim) { if (1 < tokensIdx && tokensIdx < 5) tokens[tokensIdx] = rapidParseInt(readName.substring(prevIdx, i)); // only fill in 2-4 inclusive tokensIdx++; if (4 < tokensIdx) return tokensIdx; // early return, only consider the first five tokens prevIdx = i + 1; } } if (prevIdx < readName.length()) { if (1 < tokensIdx && tokensIdx < 5) tokens[tokensIdx] = rapidParseInt(readName.substring(prevIdx, readName.length())); // only fill in 2-4 inclusive tokensIdx++; } return tokensIdx; } /** * Very specialized method to rapidly parse a sequence of digits from a String up until the first * non-digit character. */ protected final int rapidParseInt(final String input) { final int len = input.length(); int val = 0; int i = 0; boolean isNegative = false; if (0 < len && '-' == input.charAt(0)) { i = 1; isNegative = true; } for (; i < len; ++i) { final char ch = input.charAt(i); if (Character.isDigit(ch)) { val = (val * 10) + (ch - 48); } else { break; } } if (isNegative) val = -val; return val; } /** * Finds which reads within the list of duplicates are likely to be optical duplicates of * one another. * <p/> * Note: this method will perform a sort() of the list; if it is imperative that the list be * unmodified a copy of the list should be passed to this method. * * @param list a list of reads that are determined to be duplicates of one another * @return a boolean[] of the same length as the incoming list marking which reads are optical duplicates */ public boolean[] findOpticalDuplicates(final List<? extends PhysicalLocation> list) { final int length = list.size(); final boolean[] opticalDuplicateFlags = new boolean[length]; Collections.sort(list, new Comparator<PhysicalLocation>() { public int compare(final PhysicalLocation lhs, final PhysicalLocation rhs) { int retval = lhs.getReadGroup() - rhs.getReadGroup(); if (retval == 0) retval = lhs.getTile() - rhs.getTile(); if (retval == 0) retval = lhs.getX() - rhs.getX(); if (retval == 0) retval = lhs.getY() - rhs.getY(); return retval; } }); outer: for (int i = 0; i < length; ++i) { final PhysicalLocation lhs = list.get(i); if (lhs.getTile() < 0) continue; for (int j = i + 1; j < length; ++j) { final PhysicalLocation rhs = list.get(j); if (opticalDuplicateFlags[j]) continue; if (lhs.getReadGroup() != rhs.getReadGroup()) continue outer; if (lhs.getTile() != rhs.getTile()) continue outer; if (rhs.getX() > lhs.getX() + this.opticalDuplicatePixelDistance) continue outer; if (Math.abs(lhs.getY() - rhs.getY()) <= this.opticalDuplicatePixelDistance) { opticalDuplicateFlags[j] = true; } } } return opticalDuplicateFlags; } }