package uk.ac.rhul.cs.cl1.merging;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeMap;
import uk.ac.rhul.cs.cl1.NodeSet;
import uk.ac.rhul.cs.cl1.similarity.SimilarityFunction;
import uk.ac.rhul.cs.cl1.ValuedNodeSet;
import uk.ac.rhul.cs.cl1.ValuedNodeSetList;
import uk.ac.rhul.cs.graph.Graph;
import uk.ac.rhul.cs.collections.HashMultimap;
import uk.ac.rhul.cs.collections.Multiset;
import uk.ac.rhul.cs.utils.StringUtils;
import uk.ac.rhul.cs.collections.TreeMultiset;
import uk.ac.rhul.cs.utils.UnorderedPair;
/**
* Merges highly overlapping node sets in multiple passes, recalculating
* similarities after each pass.
*
* The algorithm first finds all the overlapping node set pairs that are
* more similar to each other than a given threshold and puts them in a
* priority queue. In each step, the pair at the front of the queue (i.e.
* the one with the highest similarity) is taken and merged, the
* similarities are re-calculated and those still above the threshold
* are put back in the queue. The process continues until the queue becomes
* empty.
*
* @author tamas
*
*/
public class MultiPassNodeSetMerger extends AbstractNodeSetMerger {
enum VerificationMode {
OFF, VERIFY, VERIFY_AND_MINIMIZE
}
/**
* Auxiliary data structure for verification mode; stores how many times
* a given node appeared in the input data.
*/
private TreeMap<Integer, Integer> counts = new TreeMap<Integer, Integer>();
/**
* Returns whether the node set merger is in debug mode.
*/
protected boolean debugging = false;
/**
* Returns whether the node set merger is in verification mode.
*
* In verification mode, the merger takes note of each node in the
* incoming node set and how many times they appear there. At the
* end of the merging process, the following should hold:
*
* - The sum of values of a given node in the resulting list of
* {@link uk.ac.rhul.cs.cl1.ValuedNodeSet}s must be equal to the sum of values of
* the same node in the incoming {@link uk.ac.rhul.cs.cl1.ValuedNodeSet}.
*
* - The similarity score between any pair of nodesets must be
* smaller than the threshold in the result.
*
* Verification mode turns on these checks. When the resulting list of
* {@link uk.ac.rhul.cs.cl1.ValuedNodeSet} instances fail these checks, this means that
* there is a bug in the merging algorithm.
*/
protected VerificationMode verificationMode = VerificationMode.VERIFY_AND_MINIMIZE;
class NodeSetPair extends UnorderedPair<ValuedNodeSet> implements Comparable<NodeSetPair> {
Double similarity;
public NodeSetPair(final ValuedNodeSet left, final ValuedNodeSet right, double similarity) {
super(left, right);
this.similarity = similarity;
}
public ValuedNodeSet getOtherThan(ValuedNodeSet item) {
if (this.getLeft() == item)
return this.getRight();
return this.getLeft();
}
public int compareTo(NodeSetPair other) {
if (this.equals(other) && this.similarity == other.similarity)
return 0;
if (this.similarity < other.similarity)
return 1;
if (this.similarity > other.similarity)
return -1;
return this.getLeft().compareTo(this.getRight());
}
public int hashCode() {
return super.hashCode() + (149 * similarity.hashCode());
}
public String toString() {
return "{" + this.getLeft().toString() + "} - {" + this.getRight().toString() + "}: " + similarity;
}
}
/**
* Returns whether the node set merger is in debug mode or not.
*/
public boolean isDebugging() {
return debugging;
}
/**
* Returns whether the node set merger is in verification mode.
*/
public boolean isVerificationMode() {
return verificationMode != VerificationMode.OFF;
}
public ValuedNodeSetList mergeOverlapping(ValuedNodeSetList nodeSets,
SimilarityFunction<NodeSet> similarityFunc, double threshold) {
int i, j, n = nodeSets.size();
double similarity;
ValuedNodeSetList result = new ValuedNodeSetList();
HashSet<ValuedNodeSet> activeNodesets = new HashSet<ValuedNodeSet>();
// The step counting is a bit tricky; instead of storing the actual number of
// node set pairs that we have to check in stepsTotal, we divide it by n and
// store that to avoid overflows in an integer when n is large.
double stepsTotal = (n-1) / 2.0, stepsTaken = 0.0;
if (n == 0)
return result;
if (isVerificationMode()) {
prepareForVerification(nodeSets);
}
Graph graph = nodeSets.get(0).getGraph();
// Stage 1: find overlapping pairs and index them
PriorityQueue<NodeSetPair> pairs = new PriorityQueue<NodeSetPair>();
HashMultimap<ValuedNodeSet, NodeSetPair> nodesetsToPairs = new HashMultimap<ValuedNodeSet, NodeSetPair>();
if (taskMonitor != null) {
taskMonitor.setStatus("Finding highly overlapping clusters...");
taskMonitor.setPercentCompleted(0);
}
for (i = 0; i < n; i++) {
ValuedNodeSet v1 = nodeSets.get(i);
for (j = i+1; j < n; j++) {
ValuedNodeSet v2 = nodeSets.get(j);
similarity = similarityFunc.getSimilarity(v1, v2);
if (similarity > 0) {
NodeSetPair pair = new NodeSetPair(v1, v2, similarity);
pairs.add(pair);
// debug(" Adding " + pair + " to pairs of " + v1);
nodesetsToPairs.put(v1, pair);
// debug(" Pairs of " + v1 + " are now " + nodesetsToPairs.get(v1));
// qdebug(" Adding " + pair + " to pairs of " + v2);
nodesetsToPairs.put(v2, pair);
// debug(" Pairs of " + v2 + " are now " + nodesetsToPairs.get(v2));
}
}
if (!nodesetsToPairs.containsKey(v1)) {
// No other node set is similar to v1, so add it to the result
result.add(v1);
}
stepsTaken += (n - i - 1) / (double)(n);
if (stepsTaken > stepsTotal) {
stepsTaken = stepsTotal;
}
if (taskMonitor != null) {
taskMonitor.setPercentCompleted((int)(100 * (((float)stepsTaken) / stepsTotal)));
}
}
if (taskMonitor != null) {
taskMonitor.setPercentCompleted(100);
}
// Store which nodesets are still active (i.e. unmerged)
activeNodesets.addAll(nodesetsToPairs.keySet());
if (debugging) {
System.err.println("Nodesets with no similar pairs:");
System.err.println(result);
System.err.println("Overlapping pairs to consider:");
System.err.println(pairs);
}
// Checkpoint
if (isVerificationMode()) {
ValuedNodeSetList tmpResult = new ValuedNodeSetList();
tmpResult.addAll(result);
tmpResult.addAll(activeNodesets);
verifyResult(tmpResult, similarityFunc, -1);
}
// Stage 2: merge overlapping pairs one by one
if (taskMonitor != null) {
taskMonitor.setStatus("Merging highly overlapping clusters...");
taskMonitor.setPercentCompleted(-1);
}
stepsTaken = 0;
while (!pairs.isEmpty()) {
NodeSetPair pair = pairs.poll();
ValuedNodeSet v1 = pair.getLeft();
ValuedNodeSet v2 = pair.getRight();
if (pair.similarity < threshold)
break;
debug("Merging pair: " + pair);
// debug(" Active nodesets: " + activeNodesets);
// If v1 was already merged into another nodeset, continue
if (!activeNodesets.contains(v1)) {
debug(" " + v1 + " already absorbed in another nodeset, skipping.");
nodesetsToPairs.remove(v2, pair);
continue;
}
// If v2 was already merged into another nodeset, continue
if (!activeNodesets.contains(v2)) {
debug(" " + v2 + " already absorbed in another nodeset, skipping.");
nodesetsToPairs.remove(v1, pair);
continue;
}
// Remove the v1-v2 pair from the nodeset to pair mappings
nodesetsToPairs.remove(v1, pair);
nodesetsToPairs.remove(v2, pair);
// Merge v1 and v2
Multiset<Integer> unionMembers = new TreeMultiset<Integer>();
unionMembers.addAll(v1.getMembers());
unionMembers.addAll(v2.getMembers());
ValuedNodeSet unionNodeset = new ValuedNodeSet(graph, unionMembers.elementSet());
for (Multiset.Entry<Integer> entry: unionMembers.entrySet()) {
Integer elt = entry.getElement();
int count = v1.getValue(elt, 0) + v2.getValue(elt, 0);
unionNodeset.setValue(elt, count);
}
// Update the NodeSetPairs related to either v1 or v2
boolean v1SubsetOfv2 = unionNodeset.equals(v2);
boolean v2SubsetOfv1 = unionNodeset.equals(v1);
if (!v1SubsetOfv2 && !v2SubsetOfv1) {
debug("v1 and v2 are not subsets of each other.");
for (NodeSetPair oldPair: nodesetsToPairs.get(v1)) {
ValuedNodeSet v3 = oldPair.getOtherThan(v1);
similarity = similarityFunc.getSimilarity(unionNodeset, v3);
nodesetsToPairs.remove(v3, oldPair);
if (similarity == 0)
continue;
NodeSetPair newPair = new NodeSetPair(unionNodeset, v3, similarity);
debug(" (1) updating pair: " + oldPair + " --> " + newPair);
pairs.add(newPair);
nodesetsToPairs.put(unionNodeset, newPair);
nodesetsToPairs.put(v3, newPair);
}
for (NodeSetPair oldPair: nodesetsToPairs.get(v2)) {
ValuedNodeSet v3 = oldPair.getOtherThan(v2);
if (unionNodeset == v3) {
// This happens when there is a triangle in the similarity
// graph, i.e. v1 -- v2 -- v3 -- v1, and v1 is being merged
// with v2. unionNodeset was then already added as a neighbor
// to v3 in the previous for loop, so we can skip it here.
continue;
}
similarity = similarityFunc.getSimilarity(unionNodeset, v3);
nodesetsToPairs.remove(v3, oldPair);
if (similarity == 0)
continue;
NodeSetPair newPair = new NodeSetPair(unionNodeset, v3, similarity);
debug(" (2) updating pair: " + oldPair + " --> " + newPair);
pairs.add(newPair);
nodesetsToPairs.put(unionNodeset, newPair);
nodesetsToPairs.put(v3, newPair);
}
// Remove v1 and v2 from the mapping from nodesets to pairs since
// they were already merged into another nodeset
nodesetsToPairs.removeAll(v1);
nodesetsToPairs.removeAll(v2);
activeNodesets.remove(v1);
activeNodesets.remove(v2);
activeNodesets.add(unionNodeset);
} else if (v1SubsetOfv2 && !v2SubsetOfv1) {
debug(" v1 is a real subset of v2.");
for (int member: v1)
v2.setValue(member, v1.getValue(member) + v2.getValue(member));
// v1 is subset of v2; unionNodeSet is then equal to v2.
// Pairs pertaining to v2 will stay as they are.
// Pairs pertaining to v1 have to be updated.
// Note that v2 will appear among the pairs related to v1.
Collection<NodeSetPair> v2Pairs = nodesetsToPairs.get(v2);
for (NodeSetPair oldPair: nodesetsToPairs.get(v1)) {
ValuedNodeSet v3 = oldPair.getOtherThan(v1);
nodesetsToPairs.remove(v3, oldPair);
if (v3 == v2)
continue;
similarity = similarityFunc.getSimilarity(v2, v3);
debug(" Similarity of {" + v2 + "} and {" + v3 + "} is " + similarity);
if (similarity == 0)
continue;
NodeSetPair newPair = new NodeSetPair(v2, v3, similarity);
if (v2Pairs.contains(newPair)) {
debug(" This pair is already among v2's pairs, skipping.");
continue;
}
debug(" (3) updating pair: " + oldPair + " --> " + newPair);
pairs.add(newPair);
nodesetsToPairs.put(v2, newPair);
nodesetsToPairs.put(v3, newPair);
}
// Remove v1 from the mapping from nodesets to pairs
nodesetsToPairs.removeAll(v1);
activeNodesets.remove(v1);
} else if (v2SubsetOfv1 && !v1SubsetOfv2) {
debug(" v2 is a real subset of v1.");
for (int member: v2)
v1.setValue(member, v2.getValue(member) + v1.getValue(member));
// v2 is subset of v1; unionNodeSet is then equal to v1.
// Pairs pertaining to v1 will stay as they are.
// Pairs pertaining to v2 have to be updated
// Note that v1 will appear among the pairs related to v2.
Collection<NodeSetPair> v1Pairs = nodesetsToPairs.get(v1);
for (NodeSetPair oldPair: nodesetsToPairs.get(v2)) {
ValuedNodeSet v3 = oldPair.getOtherThan(v2);
nodesetsToPairs.remove(v3, oldPair);
if (v3 == v1)
continue;
similarity = similarityFunc.getSimilarity(v1, v3);
if (similarity == 0)
continue;
NodeSetPair newPair = new NodeSetPair(v1, v3, similarity);
if (v1Pairs.contains(newPair))
continue;
debug(" (4) updating pair: " + oldPair + " --> " + newPair);
pairs.add(newPair);
nodesetsToPairs.put(v1, newPair);
nodesetsToPairs.put(v3, newPair);
}
// Remove v2 from the mapping from nodesets to pairs
nodesetsToPairs.removeAll(v2);
activeNodesets.remove(v2);
} else {
// v1 and v2 are equal. This can happen if they were joined via two
// independent join paths. We remove v2 and keep v1
debug(" v1 and v2 are identical.");
for (int member: v2)
v1.setValue(member, v2.getValue(member) + v1.getValue(member));
nodesetsToPairs.removeAll(v2);
activeNodesets.remove(v2);
}
// debug(" Active nodesets: " + activeNodesets);
// debug(" Queue is now: " + pairs);
// Checkpoint
if (isVerificationMode()) {
ValuedNodeSetList tmpResult = new ValuedNodeSetList();
tmpResult.addAll(result);
tmpResult.addAll(activeNodesets);
try {
verifyResult(tmpResult, similarityFunc, -1);
} catch (RuntimeException ex) {
System.err.println("Step " + stepsTaken + "\n" +
"Verification failed after merging:\n" + v1 + "\nand:\n" + v2);
if (verificationMode == VerificationMode.VERIFY_AND_MINIMIZE) {
System.err.println("Minimal subset that also fails:");
System.err.println(getMinimalSubsetThatFails(nodeSets, similarityFunc, threshold));
}
throw ex;
}
}
stepsTaken++;
}
// Add the nodesets that are still active
result.addAll(activeNodesets);
if (isVerificationMode()) {
try {
verifyResult(result, similarityFunc, threshold);
} catch (RuntimeException ex) {
if (verificationMode == VerificationMode.VERIFY_AND_MINIMIZE) {
System.err.println("\nMinimal subset that also fails:");
for (ValuedNodeSet ns: getMinimalSubsetThatFails(nodeSets, similarityFunc, threshold)) {
System.err.println(StringUtils.join(ns.getMembers(), " "));
}
}
throw ex;
}
}
if (taskMonitor != null) {
taskMonitor.setPercentCompleted(100);
}
return result;
}
/**
* Prepares the input nodeset for verification later on.
*/
private void prepareForVerification(ValuedNodeSetList input) {
counts.clear();
for (ValuedNodeSet nodeSet: input) {
for (int member: nodeSet) {
if (counts.containsKey(member))
counts.put(member, counts.get(member) + 1);
else
counts.put(member, 1);
}
}
}
/**
* Verifies the result.
*
* @see #verificationMode for more details.
* @throws RuntimeException if the verification failed
*/
private void verifyResult(ValuedNodeSetList result,
SimilarityFunction<NodeSet> similarityFunc, double threshold) {
TreeMap<Integer, Integer> newCounts = new TreeMap<Integer, Integer>();
newCounts.clear();
for (ValuedNodeSet nodeSet: result) {
for (int member: nodeSet) {
if (newCounts.containsKey(member))
newCounts.put(member, newCounts.get(member) + 1);
else
newCounts.put(member, 1);
}
}
if (!newCounts.keySet().equals(counts.keySet())) {
Graph graph = result.get(0).getGraph();
Set<Integer> ks = counts.keySet();
StringBuilder sb = new StringBuilder("Nodes only in counts:");
ks.removeAll(newCounts.keySet());
for (int k: ks) {
sb.append(" " + graph.getNodeName(k));
}
sb.append("\n");
ks = newCounts.keySet();
sb.append("Nodes only in newCounts:");
ks.removeAll(counts.keySet());
for (int k: ks) {
sb.append(" " + graph.getNodeName(k));
}
throw new RuntimeException("newCounts and counts is different!\n" + sb.toString());
}
if (threshold < 0)
return;
for (ValuedNodeSet nodeSet1: result) {
for (ValuedNodeSet nodeSet2: result) {
if (nodeSet1 == nodeSet2)
continue;
double sim = similarityFunc.getSimilarity(nodeSet1, nodeSet2);
if (sim >= threshold)
throw new RuntimeException("similarity of " + nodeSet1 +
" and " + nodeSet2 + " is " + sim +
", while the threshold is " + threshold);
}
}
}
/**
* Turns the debugging mode on or off.
*/
public void setDebugging(boolean debugging) {
this.debugging = debugging;
}
/**
* Turns the verification mode on or off.
*/
public void setVerificationMode(VerificationMode verificationMode) {
this.verificationMode = verificationMode;
}
private void debug(String message) {
if (!this.debugging)
return;
System.err.println(message);
}
/**
* Given a {@link ValuedNodeSetList} that fails the verification tests,
* finds a minimal subset subset that still fails but none of its
* subsets fail.
*
* For debugging purposes only.
*/
public static ValuedNodeSetList getMinimalSubsetThatFails(ValuedNodeSetList nodeSets,
SimilarityFunction<NodeSet> similarityFunc, double threshold) {
ValuedNodeSetList result;
boolean changed = true;
MultiPassNodeSetMerger merger = new MultiPassNodeSetMerger();
merger.setDebugging(false);
merger.setVerificationMode(VerificationMode.VERIFY);
// Check if the entire nodeset list fails the verifications or not. If
// not, return null.
try {
merger.mergeOverlapping(nodeSets, similarityFunc, threshold);
return null;
} catch (RuntimeException ignored) {
}
result = (ValuedNodeSetList)nodeSets.clone();
while (changed && !result.isEmpty()) {
Iterator<ValuedNodeSet> it;
changed = false;
it = result.iterator();
while (it.hasNext()) {
// Try removing the nodeset, see if we still fail.
ValuedNodeSet nodeSet = it.next();
boolean failing = false;
ValuedNodeSetList nodeSetsCopy = (ValuedNodeSetList)result.clone();
nodeSetsCopy.remove(nodeSet);
try {
merger.mergeOverlapping(nodeSetsCopy, similarityFunc, threshold);
} catch (RuntimeException ex) {
failing = true;
}
if (failing) {
// Still failing, great. Let's remove the nodeSet permanently.
it.remove();
changed = true;
break;
}
}
}
try {
merger.mergeOverlapping(result, similarityFunc, threshold);
System.err.println("wtf, we didn't fail!");
} catch (RuntimeException ex) {
ex.printStackTrace();
System.err.println("==============");
}
return result;
}
}