package uk.ac.rhul.cs.cl1.merging; import java.util.*; import com.sosnoski.util.array.IntArray; import com.sosnoski.util.queue.IntQueue; import uk.ac.rhul.cs.cl1.NodeSet; import uk.ac.rhul.cs.cl1.similarity.SimilarityFunction; import uk.ac.rhul.cs.cl1.ValuedNodeSet; import uk.ac.rhul.cs.cl1.ValuedNodeSetList; import uk.ac.rhul.cs.graph.BreadthFirstSearch; import uk.ac.rhul.cs.graph.Graph; import uk.ac.rhul.cs.collections.Multiset; import uk.ac.rhul.cs.collections.TreeMultiset; /** * Merges highly overlapping node sets in a node set list in a single * pass. * * The algorithm first finds all the overlapping node set pairs that are * more similar to each other than a given threshold and creates a graph out * of these pairs. The connected components of this graph will then become * the new nodesets. * * This was the only node set merging algorithm up to ClusterONE 0.91. * * @author tamas * */ public class SinglePassNodeSetMerger extends AbstractNodeSetMerger { /** * Merges highly overlapping nodesets and returns a new nodeset list. * * The algorithm progresses by creating a graph where each node * refers to one of the nodesets. Any two nodes in the graph will be * connected if the corresponding clusters overlap by at least the * given threshold. The connected components of the graph will be * used to derive the new nodesets in the result. * * @param similarityFunc specifies the similarity function to use * @param threshold the overlap threshold. Nodesets will be merged * if their overlap is at least as large as the * given threshold. * * @return a new nodeset list where no two nodesets have an overlap * larger than or equal to the given threshold, and no nodeset * has a density smaller than minDensity */ public ValuedNodeSetList mergeOverlapping( ValuedNodeSetList nodeSets, SimilarityFunction<NodeSet> similarityFunc, double threshold) { return mergeOverlappingNew(nodeSets, similarityFunc, threshold); } public ValuedNodeSetList mergeOverlappingOld( ValuedNodeSetList nodeSets, SimilarityFunction<NodeSet> similarityFunc, double threshold) { int i, n = nodeSets.size(); ValuedNodeSetList result = new ValuedNodeSetList(); // The step counting is a bit tricky; instead of storing the actual number of // node set pairs that we have to check in stepsTotal, we divide it by n and // store that to avoid overflows in an integer when n is large. double stepsTotal = (n-1) / 2.0, stepsTaken = 0.0; if (n == 0) return result; Graph graph = nodeSets.get(0).getGraph(); Graph overlapGraph = new Graph(); overlapGraph.createNodes(n); if (taskMonitor != null) { taskMonitor.setStatus("Finding highly overlapping clusters..."); taskMonitor.setPercentCompleted(0); } for (i = 0; i < n; i++) { NodeSet v1 = nodeSets.get(i); for (int j = i+1; j < n; j++) { if (similarityFunc.getSimilarity(v1, nodeSets.get(j)) >= threshold) overlapGraph.createEdge(i, j); } stepsTaken += (n - i - 1) / (double)(n); if (stepsTaken > stepsTotal) { stepsTaken = stepsTotal; } if (taskMonitor != null) { taskMonitor.setPercentCompleted((int)(100 * stepsTaken / stepsTotal)); } } if (taskMonitor != null) { taskMonitor.setPercentCompleted(100); } if (taskMonitor != null) { taskMonitor.setStatus("Merging highly overlapping clusters..."); taskMonitor.setPercentCompleted(0); } BitSet visited = new BitSet(n); for (i = visited.nextClearBit(0); i < n; i = visited.nextClearBit(i+1)) { if (overlapGraph.getDegree(i) == 0) { result.add(nodeSets.get(i)); visited.set(i); } else { BreadthFirstSearch bfs = new BreadthFirstSearch(overlapGraph, i); Multiset<Integer> members = new TreeMultiset<Integer>(); for (int j: bfs) { SortedSet<Integer> newMembers = nodeSets.get(j).getMembers(); members.addAll(newMembers); nodeSets.set(j, null); visited.set(j); } ValuedNodeSet newNodeSet = new ValuedNodeSet(graph, members.elementSet()); for (Multiset.Entry<Integer> entry: members.entrySet()) newNodeSet.setValue(entry.getElement(), entry.getCount()); result.add(newNodeSet); } if (taskMonitor != null) { taskMonitor.setPercentCompleted((int) (100.0 * i / n)); } } if (taskMonitor != null) { taskMonitor.setPercentCompleted(100); } return result; } public ValuedNodeSetList mergeOverlappingNew( ValuedNodeSetList nodeSets, SimilarityFunction<NodeSet> similarityFunc, double threshold) { int i, j, n; int numNodes; int numNodeSets = nodeSets.size(); ValuedNodeSetList result = new ValuedNodeSetList(); IntArray[] nodesToNodeSetIndexes; double stepsTotal; int stepsTaken; if (numNodeSets == 0) return result; Graph graph = nodeSets.get(0).getGraph(); numNodes = graph.getNodeCount(); // Create an index that maps nodes to all the nodesets that contain the node. // This is used to figure out easily what other nodesets could intersect the // nodeset we will be considering later on during a BFS. if (taskMonitor != null) { taskMonitor.setStatus("Indexing clusters..."); taskMonitor.setPercentCompleted(0); } nodesToNodeSetIndexes = new IntArray[numNodes]; stepsTaken = 0; stepsTotal = nodeSets.size(); for (NodeSet nodeSet: nodeSets) { for (int member: nodeSet) { if (nodesToNodeSetIndexes[member] == null) { nodesToNodeSetIndexes[member] = new IntArray(); } nodesToNodeSetIndexes[member].add(stepsTaken); } stepsTaken++; if (taskMonitor != null) { taskMonitor.setPercentCompleted((int) (100.0 * stepsTaken / stepsTotal)); } } if (taskMonitor != null) { taskMonitor.setPercentCompleted(100); } // Okay, indexing done. Now we will start a BFS on a graph where the vertices // are nodesets and two nodesets are connected if their similarity is larger than // or equal to the threshold -- but we do this without constructing the graph. // Here we assume that the similarity function is symmetric so the graph is // essentially undirected. if (taskMonitor != null) { taskMonitor.setStatus("Merging highly overlapping clusters..."); taskMonitor.setPercentCompleted(0); } BitSet visited = new BitSet(numNodeSets); IntQueue q = new IntQueue(); Multiset<Integer> members = new TreeMultiset<Integer>(); // TODO: if we used a Multiset for potentialNeighbors, we could get the intersection // sizes for "free" Set<Integer> potentialNeighbors = new HashSet<Integer>(); for (i = visited.nextClearBit(0); i < numNodeSets; i = visited.nextClearBit(i+1)) { // Okay, start a BFS from nodeSet i q.clear(); q.add(i); members.clear(); // Mark the initial nodeset as visited (we are marking nodesets as visited as soon as // they are added to the queue so they are not added twice) visited.set(i); while (!q.isEmpty()) { // Get the current nodeset from the queue int nodeSetIndex = q.remove(); NodeSet currentNodeSet = nodeSets.get(nodeSetIndex); // Merge the current nodeset into 'members' members.addAll(currentNodeSet.getMembers()); // Look at the index and find the potential neighbors of the nodeset in the // similarity graph by looking up each of its nodes. potentialNeighbors.clear(); for (int node: currentNodeSet) { IntArray array = nodesToNodeSetIndexes[node]; n = array.size(); for (j = 0; j < n; j++) { potentialNeighbors.add(array.get(j)); } } // Check each potential neighbor to see whether its similarity to the // currentNodeSet is high enough for (Integer neighborNodeSetIndex: potentialNeighbors) { if (visited.get(neighborNodeSetIndex)) continue; NodeSet neighborNodeSet = nodeSets.get(neighborNodeSetIndex); // If neighborNodeSet is null, it means that we have processed it already when // it was part of the queue earlier. It also means that we have checked the // (currentNodeSet, neighborNodeSet) pair already so there is nothing to do here. // This is a shortcut that we mark by (*) so we can refer to it in later comments. if (neighborNodeSet == null) continue; if (similarityFunc.getSimilarity(currentNodeSet, neighborNodeSet) >= threshold) { // Add neighborNodeSet to the queue and mark it as visited q.add(neighborNodeSetIndex); visited.set(neighborNodeSetIndex); } } // We can throw away the nodeset now because it has been merged into 'members'. // It also makes it possible to make a shortcut (marked by (*)) above in the for loop. nodeSets.set(nodeSetIndex, null); } // Construct a new ValuedNodeSet from 'members' and store it in the result ValuedNodeSet newNodeSet = new ValuedNodeSet(graph, members.elementSet()); for (Multiset.Entry<Integer> entry: members.entrySet()) { newNodeSet.setValue(entry.getElement(), entry.getCount()); } result.add(newNodeSet); // Update the progress bar if (taskMonitor != null) { taskMonitor.setPercentCompleted((int) (100.0 * i / numNodeSets)); } } if (taskMonitor != null) { taskMonitor.setPercentCompleted(100); } return result; } }