/*
* The MIT License
*
* Copyright (c) 2016 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package picard.sam.markduplicates;
import htsjdk.samtools.DuplicateSet;
import htsjdk.samtools.SAMRecord;
import picard.PicardException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static java.util.stream.Collectors.counting;
/**
* UmiGraph is used to identify UMIs that come from the same original source molecule. The assumption
* is that UMIs with small edit distances are likely to be read errors on the sequencer rather than
* distinct molecules.
*
* The algorithm used here is to join all pairs of UMIs that are within maxEditDistanceToJoin. It is possible
* for a set of UMIs A, B and C to all be considered as part of the same source molecule even if two of the UMIs
* have a Hamming distance larger than maxEditDistanceToJoin. Suppose A = "ATCC", B = "AACC", and C = "AACG"
* and maxEditDistanceToJoin = 1. In this case, A and B are 1 Hamming distance so they are joined, and B and C
* are 1 Hamming distance so they are joined. Because A and B are joined and because B and C are joined, this results
* in A and C being joined even though they have a distance of 2.
*
* @author fleharty
*/
public class UmiGraph {
private final List<SAMRecord> records; // SAMRecords from the original duplicate set considered to break up by UMI
private final Map<String, Long> umiCounts; // Map of UMI sequences and how many times they have been observed
private final int[] duplicateSetID; // ID of the duplicate set that the UMI belongs to, the index is the UMI ID
private final String[] umi; // Sequence of actual UMI, the index is the UMI ID
private final int numUmis; // Number of observed UMIs
private final String umiTag; // UMI tag used in the SAM/BAM/CRAM file ie. RX
private final String assignedUmiTag; // Assigned UMI tag used in the SAM/BAM/CRAM file ie. MI
private final boolean allowMissingUmis; // Allow for missing UMIs
public UmiGraph(DuplicateSet set, String umiTag, String assignedUmiTag, boolean allowMissingUmis) {
this.umiTag = umiTag;
this.assignedUmiTag = assignedUmiTag;
this.allowMissingUmis = allowMissingUmis;
records = set.getRecords();
// First ensure that all the reads have a UMI, if any reads are missing a UMI throw an exception unless allowMissingUmis is true
for (SAMRecord rec : records) {
if(rec.getStringAttribute(umiTag) == null) {
if(allowMissingUmis) {
rec.setAttribute(umiTag, "");
} else {
throw new PicardException("Read " + rec.getReadName() + " does not contain a UMI with the " + umiTag + " attribute.");
}
}
}
// Count the number of times each UMI occurs
umiCounts = records.stream().collect(Collectors.groupingBy(p -> p.getStringAttribute(umiTag), counting()));
// At first we consider every UMI as if it were its own duplicate set
numUmis = umiCounts.size();
umi = new String[numUmis];
duplicateSetID = IntStream.rangeClosed(0, numUmis-1).toArray();
int i = 0;
for (String key : umiCounts.keySet()) {
umi[i] = key;
i++;
}
}
// Part of Union-Find with Path Compression to determine the duplicate set a particular UMI belongs to.
private int findRepresentativeUmi(int umiID) {
int representativeUmi = umiID; // All UMIs of a duplicate set will have the same reprsentativeUmi.
while (representativeUmi != duplicateSetID[representativeUmi]) {
representativeUmi = duplicateSetID[representativeUmi];
}
while (umiID != representativeUmi) {
int newUmiID = duplicateSetID[umiID];
duplicateSetID[umiID] = representativeUmi;
umiID = newUmiID;
}
return representativeUmi;
}
// Part of Union-Find with Path Compression that joins to UMIs to be part of the same duplicate set.
private void joinUmisIntoDuplicateSet(final int umi1ID, final int umi2ID) {
int representativeUmi1 = findRepresentativeUmi(umi1ID);
int representativeUmi2 = findRepresentativeUmi(umi2ID);
if (representativeUmi1 == representativeUmi2) return;
duplicateSetID[representativeUmi1] = representativeUmi2;
}
List<DuplicateSet> joinUmisIntoDuplicateSets(final int maxEditDistanceToJoin) {
// Compare all UMIs to each other. If they are within maxEditDistanceToJoin
// join them to the same duplicate set using the union-find algorithm.
for (int i = 0; i < numUmis; i++) {
for (int j = i + 1; j < numUmis; j++) {
if (isWithinEditDistance(umi[i], umi[j], maxEditDistanceToJoin)) {
joinUmisIntoDuplicateSet(i, j);
}
}
}
// This ensures that all duplicate sets have unique IDs. During Union-Find a tree is constructed
// where each UMI points to parent UMI. This ensures that all UMIs that belong to the same duplicate
// set point to the same parent UMI. Note that the parent UMI is only used as a representative UMI and
// is not at all related to the assigned UMI.
for (int i = 0; i < numUmis; i++) {
duplicateSetID[i] = findRepresentativeUmi(i);
}
final Map<Integer, List<SAMRecord>> duplicateSets = new HashMap<>();
// Assign UMIs to duplicateSets
final Map<String, Integer> duplicateSetsFromUmis = getDuplicateSetsFromUmis();
for (SAMRecord rec : records) {
final String umi = rec.getStringAttribute(umiTag);
final Integer duplicateSetIndex = duplicateSetsFromUmis.get(umi);
if (duplicateSets.containsKey(duplicateSetIndex)) {
duplicateSets.get(duplicateSetIndex).add(rec);
}
else {
final List<SAMRecord> n = new ArrayList<>();
n.add(rec);
duplicateSets.put(duplicateSetIndex, n);
}
}
final List<DuplicateSet> duplicateSetList = new ArrayList<>();
for (final Map.Entry<Integer, List<SAMRecord>> entry : duplicateSets.entrySet()) {
final DuplicateSet ds = new DuplicateSet();
final List<SAMRecord> recordList = entry.getValue();
// Add records to the DuplicateSet
for (final SAMRecord rec : recordList) {
ds.add(rec);
}
// For a particular duplicate set, identify the most common UMI
// and use this as an assigned UMI.
long maxCount = 0;
String assignedUmi = null;
for (SAMRecord rec : recordList) {
final String umi = rec.getStringAttribute(umiTag);
if (umiCounts.get(umi) > maxCount) {
maxCount = umiCounts.get(umi);
assignedUmi = umi;
}
}
// Set the records to contain the assigned UMI
for (final SAMRecord rec : recordList) {
if (allowMissingUmis && rec.getStringAttribute(umiTag) == "") {
// The SAM spec doesn't support empty tags, so we set it to null if it is empty.
rec.setAttribute(umiTag, null);
} else {
rec.setAttribute(assignedUmiTag, assignedUmi);
}
}
duplicateSetList.add(ds);
}
return duplicateSetList;
}
// Determine if the two strings s1 and s2 are within edit distance of editDistance.
// TODO: use HTSJDK version when this become available
private boolean isWithinEditDistance(final String s1, final String s2, final int editDistance) {
// Comparing edit distance of strings with different lengths is not supported
if (s1.length() != s2.length()) {
throw new PicardException("Attempting to determine if two UMIs of different length were within a specified edit distance.");
}
int measuredDistance = 0;
for (int i = 0;i < s1.length();i++) {
if (s1.charAt(i) != s2.charAt(i)) {
measuredDistance++;
if (measuredDistance > editDistance) {
return false;
}
}
}
return true;
}
// Create a map that maps a umi to the duplicateSetID
private Map<String, Integer> getDuplicateSetsFromUmis() {
final Map<String, Integer> duplicateSetsFromUmis = new HashMap<>();
for (int i = 0; i < duplicateSetID.length; i++) {
duplicateSetsFromUmis.put(umi[i], duplicateSetID[i]);
}
return duplicateSetsFromUmis;
}
}