/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.util;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.samtools.util.StringUtil;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicReference;
/**
* Store one or more AdapterPairs to use to mark adapter sequence of SAMRecords. This is a very compute-intensive process, so
* this class implements two heuristics to reduce computation:
* - Adapter sequences are truncated, and then any adapter pairs that become identical after truncation are collapsed into a single pair.
* - After a specified number of reads with adapter sequence has been seen, prune the list of adapter pairs to include only the most
* frequently seen adapters. For a flowcell, there should only be a single adapter pair found.
*
* Note that the AdapterPair object returned by all the adapterTrim* methods will not be one of the original AdapterPairs
* passed to the ctor, but rather will be one of the truncated copies.
*/
public class AdapterMarker {
public static final int DEFAULT_ADAPTER_LENGTH = 30;
public static final int DEFAULT_PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN = 100;
public static final int DEFAULT_NUM_ADAPTERS_TO_KEEP = 1;
// It is assumed that these are set once during execution, before the class is used to mark any adapters, but this is not enforced.
private int thresholdForSelectingAdaptersToKeep = DEFAULT_PRUNE_ADAPTER_LIST_AFTER_THIS_MANY_ADAPTERS_SEEN;
private int numAdaptersToKeep = DEFAULT_NUM_ADAPTERS_TO_KEEP;
private int minSingleEndMatchBases = ClippingUtility.MIN_MATCH_BASES;
private int minPairMatchBases = ClippingUtility.MIN_MATCH_PE_BASES;
private double maxSingleEndErrorRate = ClippingUtility.MAX_ERROR_RATE;
private double maxPairErrorRate = ClippingUtility.MAX_PE_ERROR_RATE;
// This is AtomicReference because one thread could be matching adapters while the threshold has been crossed in another
// thread and the array is being replaced.
private final AtomicReference<AdapterPair[]> adapters = new AtomicReference<AdapterPair[]>();
// All the members below are only accessed within a synchronized block.
private boolean thresholdReached = false;
private int numAdaptersSeen = 0;
private final CollectionUtil.DefaultingMap<AdapterPair, Integer> seenCounts = new CollectionUtil.DefaultingMap<AdapterPair, Integer>(0);
/**
* Truncates adapters to DEFAULT_ADAPTER_LENGTH
* @param originalAdapters These should be in order from longest & most likely to shortest & least likely.
*/
public AdapterMarker(final AdapterPair... originalAdapters) {
this(DEFAULT_ADAPTER_LENGTH, originalAdapters);
}
/**
* @param adapterLength Truncate adapters to this length.
* @param originalAdapters These should be in order from longest & most likely to shortest & least likely.
*/
public AdapterMarker(final int adapterLength, final AdapterPair... originalAdapters) {
// Truncate each AdapterPair to the given length, and then combine any that end up the same after truncation.
final ArrayList<TruncatedAdapterPair> truncatedAdapters = new ArrayList<TruncatedAdapterPair>();
for (final AdapterPair adapter : originalAdapters) {
final TruncatedAdapterPair truncatedAdapter = makeTruncatedAdapterPair(adapter, adapterLength);
final int matchingIndex = truncatedAdapters.indexOf(truncatedAdapter);
if (matchingIndex == -1) {
truncatedAdapters.add(truncatedAdapter);
} else {
final TruncatedAdapterPair matchingAdapter = truncatedAdapters.get(matchingIndex);
matchingAdapter.setName(matchingAdapter.getName() + "|" + adapter.getName());
}
}
adapters.set(truncatedAdapters.toArray(new AdapterPair[truncatedAdapters.size()]));
}
public int getNumAdaptersToKeep() {
return numAdaptersToKeep;
}
/**
* After seeing the thresholdForSelectingAdapters number of adapters, keep up to this many of the original adapters.
*/
public synchronized AdapterMarker setNumAdaptersToKeep(final int numAdaptersToKeep) {
if (numAdaptersToKeep <= 0) {
throw new IllegalArgumentException(String.format("numAdaptersToKeep should be positive: %d", numAdaptersToKeep));
}
this.numAdaptersToKeep = numAdaptersToKeep;
return this;
}
public int getThresholdForSelectingAdaptersToKeep() {
return thresholdForSelectingAdaptersToKeep;
}
/**
* When this number of adapters have been matched, discard the least-frequently matching ones.
* @param thresholdForSelectingAdaptersToKeep set to -1 to never discard any adapters.
*/
public synchronized AdapterMarker setThresholdForSelectingAdaptersToKeep(final int thresholdForSelectingAdaptersToKeep) {
this.thresholdForSelectingAdaptersToKeep = thresholdForSelectingAdaptersToKeep;
return this;
}
public int getMinSingleEndMatchBases() {
return minSingleEndMatchBases;
}
/**
*
* @param minSingleEndMatchBases When marking a single-end read, adapter must match at least this many bases.
*/
public synchronized AdapterMarker setMinSingleEndMatchBases(final int minSingleEndMatchBases) {
this.minSingleEndMatchBases = minSingleEndMatchBases;
return this;
}
public int getMinPairMatchBases() {
return minPairMatchBases;
}
/**
*
* @param minPairMatchBases When marking a paired-end read, adapter must match at least this many bases.
*/
public synchronized AdapterMarker setMinPairMatchBases(final int minPairMatchBases) {
this.minPairMatchBases = minPairMatchBases;
return this;
}
public double getMaxSingleEndErrorRate() {
return maxSingleEndErrorRate;
}
/**
* @param maxSingleEndErrorRate For single-end read, no more than this fraction of the bases that align with the adapter can
* mismatch the adapter and still be considered an adapter match.
*/
public synchronized AdapterMarker setMaxSingleEndErrorRate(final double maxSingleEndErrorRate) {
this.maxSingleEndErrorRate = maxSingleEndErrorRate;
return this;
}
public double getMaxPairErrorRate() {
return maxPairErrorRate;
}
/**
* @param maxPairErrorRate For paired-end read, no more than this fraction of the bases that align with the adapter can
* mismatch the adapter and still be considered an adapter match.
*/
public synchronized AdapterMarker setMaxPairErrorRate(final double maxPairErrorRate) {
this.maxPairErrorRate = maxPairErrorRate;
return this;
}
public AdapterPair adapterTrimIlluminaSingleRead(final SAMRecord read) {
return adapterTrimIlluminaSingleRead(read, minSingleEndMatchBases, maxSingleEndErrorRate);
}
public AdapterPair adapterTrimIlluminaPairedReads(final SAMRecord read1, final SAMRecord read2) {
return adapterTrimIlluminaPairedReads(read1, read2, minPairMatchBases, maxPairErrorRate);
}
/**
* Overrides defaults for minMatchBases and maxErrorRate
*/
public AdapterPair adapterTrimIlluminaSingleRead(final SAMRecord read, final int minMatchBases, final double maxErrorRate) {
final AdapterPair ret = ClippingUtility.adapterTrimIlluminaSingleRead(read, minMatchBases, maxErrorRate, adapters.get());
if (ret != null) tallyFoundAdapter(ret);
return ret;
}
/**
* Overrides defaults for minMatchBases and maxErrorRate
*/
public AdapterPair adapterTrimIlluminaPairedReads(final SAMRecord read1, final SAMRecord read2,
final int minMatchBases, final double maxErrorRate) {
final AdapterPair ret = ClippingUtility.adapterTrimIlluminaPairedReads(read1, read2, minMatchBases, maxErrorRate, adapters.get());
if (ret != null) tallyFoundAdapter(ret);
return ret;
}
/** For unit testing only */
AdapterPair[] getAdapters() {
return adapters.get();
}
private TruncatedAdapterPair makeTruncatedAdapterPair(final AdapterPair adapterPair, final int adapterLength) {
return new TruncatedAdapterPair("truncated " + adapterPair.getName(),
substringAndRemoveTrailingNs(adapterPair.get3PrimeAdapterInReadOrder(), adapterLength),
substringAndRemoveTrailingNs(adapterPair.get5PrimeAdapterInReadOrder(), adapterLength));
}
/**
* Truncate to the given length, and in addition truncate any trailing Ns.
*/
private String substringAndRemoveTrailingNs(final String s, int length) {
length = Math.min(length, s.length());
final byte[] bytes = StringUtil.stringToBytes(s);
while (length > 0 && SequenceUtil.isNoCall(bytes[length - 1])) {
length--;
}
return s.substring(0, length);
}
/**
* Keep track of every time an adapter is found, until it is time to prune the list of adapters.
*/
private void tallyFoundAdapter(final AdapterPair foundAdapter) {
// If caller does not want adapter pruning, do nothing.
if (thresholdForSelectingAdaptersToKeep < 1) return;
synchronized (this) {
// Already pruned adapter list, so nothing more to do.
if (thresholdReached) return;
// Tally this adapter
seenCounts.put(foundAdapter, seenCounts.get(foundAdapter) + 1);
// Keep track of the number of times an adapter has been seen.
numAdaptersSeen += 1;
// Reached the threshold for pruning the list.
if (numAdaptersSeen >= thresholdForSelectingAdaptersToKeep) {
// Sort adapters by number of times each has been seen.
final TreeMap<Integer, AdapterPair> sortedAdapters = new TreeMap<Integer, AdapterPair>(new Comparator<Integer>() {
@Override
public int compare(final Integer integer, final Integer integer2) {
// Reverse of natural ordering
return integer2.compareTo(integer);
}
});
for (final Map.Entry<AdapterPair, Integer> entry : seenCounts.entrySet()) {
sortedAdapters.put(entry.getValue(), entry.getKey());
}
// Keep the #numAdaptersToKeep adapters that have been seen the most, plus any ties.
final ArrayList<AdapterPair> bestAdapters = new ArrayList<AdapterPair>(numAdaptersToKeep);
int countOfLastAdapter = Integer.MAX_VALUE;
for (final Map.Entry<Integer, AdapterPair> entry : sortedAdapters.entrySet()) {
if (bestAdapters.size() >= numAdaptersToKeep) {
if (entry.getKey() == countOfLastAdapter) {
bestAdapters.add(entry.getValue());
} else {
break;
}
} else {
countOfLastAdapter = entry.getKey();
bestAdapters.add(entry.getValue());
}
}
// Replace the existing list with the pruned list.
thresholdReached = true;
adapters.set(bestAdapters.toArray(new AdapterPair[bestAdapters.size()]));
}
}
}
private static final class TruncatedAdapterPair implements AdapterPair {
String name;
final String fivePrime, threePrime, fivePrimeReadOrder;
final byte[] fivePrimeBytes, threePrimeBytes, fivePrimeReadOrderBytes;
private TruncatedAdapterPair(final String name, final String threePrimeReadOrder, final String fivePrimeReadOrder) {
this.name = name;
this.threePrime = threePrimeReadOrder;
this.threePrimeBytes = StringUtil.stringToBytes(threePrimeReadOrder);
this.fivePrimeReadOrder = fivePrimeReadOrder;
this.fivePrimeReadOrderBytes = StringUtil.stringToBytes(fivePrimeReadOrder);
this.fivePrime = SequenceUtil.reverseComplement(fivePrimeReadOrder);
this.fivePrimeBytes = StringUtil.stringToBytes(this.fivePrime);
}
public String get3PrimeAdapter(){ return threePrime; }
public String get5PrimeAdapter(){ return fivePrime; }
public String get3PrimeAdapterInReadOrder(){ return threePrime; }
public String get5PrimeAdapterInReadOrder() { return fivePrimeReadOrder; }
public byte[] get3PrimeAdapterBytes() { return threePrimeBytes; }
public byte[] get5PrimeAdapterBytes() { return fivePrimeBytes; }
public byte[] get3PrimeAdapterBytesInReadOrder() { return threePrimeBytes; }
public byte[] get5PrimeAdapterBytesInReadOrder() { return fivePrimeReadOrderBytes; }
public String getName() { return this.name; }
public void setName(final String name) {
this.name = name;
}
// WARNING: These methods ignore the name member!
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final TruncatedAdapterPair that = (TruncatedAdapterPair) o;
if (!fivePrime.equals(that.fivePrime)) return false;
if (!threePrime.equals(that.threePrime)) return false;
return true;
}
@Override
public int hashCode() {
int result = fivePrime.hashCode();
result = 31 * result + threePrime.hashCode();
return result;
}
@Override
public String toString() {
return "TruncatedAdapterPair{" +
"fivePrimeReadOrder='" + fivePrimeReadOrder + '\'' +
", threePrime='" + threePrime + '\'' +
", name='" + name + '\'' +
'}';
}
}
}