package edu.hawaii.jmotif.text.cluster;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Stack;
import edu.hawaii.jmotif.text.CosineDistanceMatrix;

/**
 * Hierarchical clustering factory.
 *
 * @author psenin
 *
 */
public class HC {

  /**
   * Implements agglomerative hierarchical clustering for word bags.
   *
   * <p>
   * Every key of the input map starts as its own cluster. A chain of clusters is then grown on a
   * stack: the nearest active neighbour of the cluster on top of the stack is pushed, until the
   * nearest neighbour turns out to be on the stack already. At that point the two topmost clusters
   * are popped, removed from the active set, merged, and the merged cluster is added back to the
   * active set. The process repeats until a single cluster is left.
   * </p>
   *
   * @param tfidfData The data to cluster, keyed by word bag name.
   * @param criterion The linkage criterion, handed over to the cluster distance computation.
   * @return The resulting cluster structure, i.e. the last cluster left in the active set.
   * @throws IllegalArgumentException if the data set is empty.
   * @throws IllegalStateException if no nearest neighbour can be selected for a cluster because
   *         none of the computed distances compares below {@link Double#MAX_VALUE}.
   */
  public static Cluster Hc(HashMap<String, HashMap<String, Double>> tfidfData,
      LinkageCriterion criterion) {

    // nothing to cluster - fail early with a clear message instead of an index error at the end
    //
    if (tfidfData.isEmpty()) {
      throw new IllegalArgumentException("Unable to cluster an empty data set.");
    }

    // pre-compute distances matrix
    //
    CosineDistanceMatrix distanceMatrix = new CosineDistanceMatrix(tfidfData);

    // Per the original author's note: the cosine measure is INVERSE, i.e. a lower value means a
    // GREATER angle, so the matrix is transformed (subtracted from 1) before it is used as a
    // distance.
    distanceMatrix.transformForHC();

    // first put everything into own clusters
    //
    List<Cluster> activeClusters = new ArrayList<Cluster>();
    for (String key : tfidfData.keySet()) {
      activeClusters.add(new Cluster(key));
    }

    // the chain of nearest neighbours, the most recently visited cluster is at the head
    //
    Deque<Cluster> chain = new ArrayDeque<Cluster>();

    // main loop goes on while there is more than one element in the active set
    while (activeClusters.size() > 1) {

      // if the chain is empty - start a new one from the first active cluster
      //
      if (chain.isEmpty()) {
        chain.push(activeClusters.get(0));
      }

      // find the cluster which is nearest to the one at the chain head
      //
      Cluster top = chain.peek();
      Cluster nearest = getNearest(top, activeClusters, tfidfData, distanceMatrix, criterion);
      if (null == nearest) {
        // happens when every candidate distance is NaN or not below Double.MAX_VALUE; pushing
        // null further would only surface later as an unexplained NullPointerException
        throw new IllegalStateException("Unable to find the nearest cluster: "
            + "no distance compares below Double.MAX_VALUE (NaN or infinite distances?).");
      }

      if (chain.contains(nearest)) {
        // the nearest is in the chain already - it is expected to be the very next to the head;
        // pop both, remove them from the active set and add the newly merged cluster instead
        Cluster a = chain.pop();
        Cluster b = chain.pop();
        activeClusters.remove(a);
        activeClusters.remove(b);
        Cluster merged = new Cluster();
        merged.merge(a, b, a.distanceTo(b, tfidfData, distanceMatrix, criterion));
        activeClusters.add(merged);
      }
      else {
        // the nearest is not in the chain yet - extend the chain with it
        chain.push(nearest);
      }
    }

    // the single cluster left is the result
    return activeClusters.get(0);
  }

  /**
   * Finds the active cluster with the smallest distance to the given one. Clusters that are equal
   * to the given one are skipped. On ties the candidate met first in the list wins, because only a
   * strictly smaller distance replaces the current best.
   *
   * @param stackTop The cluster whose nearest neighbour is searched for.
   * @param activeClusters The candidate clusters.
   * @param data The data being clustered, handed over to the distance computation.
   * @param distanceMatrix The pre-computed distance matrix, handed over to the distance
   *        computation.
   * @param criterion The linkage criterion, handed over to the distance computation.
   * @return The nearest cluster, or null if no candidate has a distance below
   *         {@link Double#MAX_VALUE}.
   */
  private static Cluster getNearest(Cluster stackTop, List<Cluster> activeClusters,
      HashMap<String, HashMap<String, Double>> data, CosineDistanceMatrix distanceMatrix,
      LinkageCriterion criterion) {

    Cluster res = null;
    double minDistance = Double.MAX_VALUE;

    for (Cluster cc : activeClusters) {
      if (stackTop.equals(cc)) {
        continue;
      }
      // primitive on purpose: no need to box a value which is only compared and stored
      double distance = stackTop.distanceTo(cc, data, distanceMatrix, criterion);
      if (distance < minDistance) {
        res = cc;
        minDistance = distance;
      }
    }

    return res;
  }
}