/*
* The MIT License
*
* Copyright (c) 2017 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
/**
* This acts as an iterator over duplicate sets. If a particular duplicate
* set consists of records that contain UMIs this iterator breaks up a single
* duplicate set into multiple duplicate based on the content of the UMIs.
* Since there may be sequencing errors in the UMIs, this class allows for
* simple error correction based on edit distances between the UMIs.
*
* @author fleharty
*/
package picard.sam.markduplicates;
import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.DuplicateSetIterator;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.CloseableIterator;
import picard.PicardException;
import java.util.*;
import static htsjdk.samtools.util.StringUtil.hammingDistance;
/**
* UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator
* in such a way that each duplicate set may be broken up into subsets according
* to UMIs in the records. Some tolerance for errors in the UMIs is allowed, and
* the degree of this is controlled by the maxEditDistanceToJoin parameter.
*/
class UmiAwareDuplicateSetIterator implements CloseableIterator<DuplicateSet> {
private final DuplicateSetIterator wrappedIterator;
private Iterator<DuplicateSet> nextSetsIterator;
private final int maxEditDistanceToJoin;
private final String umiTag;
private final String inferredUmiTag;
private final boolean allowMissingUmis;
private boolean isOpen = false;
private UmiMetrics metrics;
private boolean haveWeSeenFirstRead = false;
private long observedUmiBases = 0;
/**
* Creates a UMI aware duplicate set iterator
*
* @param wrappedIterator Iterator of DuplicatesSets to use and break-up by UMI.
* @param maxEditDistanceToJoin The edit distance between UMIs that will be used to union UMIs into groups
* @param umiTag The tag used in the bam file that designates the UMI
* @param assignedUmiTag The tag in the bam file that designates the assigned UMI
*/
UmiAwareDuplicateSetIterator(final DuplicateSetIterator wrappedIterator, final int maxEditDistanceToJoin,
final String umiTag, final String assignedUmiTag, final boolean allowMissingUmis,
final UmiMetrics metrics) {
this.wrappedIterator = wrappedIterator;
this.maxEditDistanceToJoin = maxEditDistanceToJoin;
this.umiTag = umiTag;
this.inferredUmiTag = assignedUmiTag;
this.allowMissingUmis = allowMissingUmis;
this.metrics = metrics;
isOpen = true;
nextSetsIterator = Collections.emptyIterator();
}
@Override
public void close() {
isOpen = false;
wrappedIterator.close();
metrics.calculateDerivedFields();
}
@Override
public boolean hasNext() {
if (!isOpen) {
return false;
} else {
if (nextSetsIterator.hasNext() || wrappedIterator.hasNext()) {
return true;
} else {
isOpen = false;
return false;
}
}
}
@Override
public DuplicateSet next() {
if (!nextSetsIterator.hasNext()) {
process(wrappedIterator.next());
}
return nextSetsIterator.next();
}
/**
* Takes a duplicate set and breaks it up into possible smaller sets according to the UMI,
* and updates nextSetsIterator to be an iterator on that set of DuplicateSets.
*
* @param set Duplicate set that may be broken up into subsets according the UMIs
*/
private void process(final DuplicateSet set) {
// Ensure that the nextSetsIterator isn't already occupied
if (nextSetsIterator.hasNext()) {
throw new PicardException("nextSetsIterator is expected to be empty, but already contains data.");
}
final UmiGraph umiGraph = new UmiGraph(set, umiTag, inferredUmiTag, allowMissingUmis);
List<DuplicateSet> duplicateSets = umiGraph.joinUmisIntoDuplicateSets(maxEditDistanceToJoin);
// Collect statistics on numbers of observed and inferred UMIs
// and total numbers of observed and inferred UMIs
for (DuplicateSet ds : duplicateSets) {
List<SAMRecord> records = ds.getRecords();
SAMRecord representativeRead = ds.getRepresentative();
String inferredUmi = representativeRead.getStringAttribute(inferredUmiTag);
for (SAMRecord rec : records) {
String currentUmi = rec.getStringAttribute(umiTag);
if (currentUmi != null) {
// All UMIs should be the same length, the code presently does not support variable length UMIs
// TODO: Add support for variable length UMIs
if (!haveWeSeenFirstRead) {
metrics.MEAN_UMI_LENGTH = currentUmi.length();
haveWeSeenFirstRead = true;
} else {
if (metrics.MEAN_UMI_LENGTH != currentUmi.length()) {
throw new PicardException("UMIs of differing lengths were found.");
}
}
// Update UMI metrics associated with each record
metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi);
observedUmiBases += currentUmi.length();
metrics.addUmiObservation(currentUmi, inferredUmi);
}
}
}
// Update UMI metrics associated with each duplicate set
metrics.DUPLICATE_SETS_WITH_UMI += duplicateSets.size();
metrics.DUPLICATE_SETS_IGNORING_UMI++;
nextSetsIterator = duplicateSets.iterator();
}
}