package ch.akuhn.hapax.cluster; import static ch.akuhn.foreach.For.range; import static java.lang.Math.min; import java.util.ArrayList; import java.util.List; public class ClusterEngine<T> implements Runnable { private static final int DONE = -1; private List<Dendrogram<T>> clusters; int pending, found_a, found_b; private ClusterEngineRow[] rows; private boolean similarity; private double threshold; private int[] todos; public ClusterEngine(List<T> elements, Distance<T> dist) { similarity = dist instanceof Similarity<?>; init_clusters(elements); init_rows(elements, dist); init_todos(); pending = todos.length; } public Dendrogram<T> dendrogram() { this.run(); Dendrogram<T> root = null; for (int todo: todos) { if (todo == DONE) continue; assert root == null; root = clusters.get(todo); } return root; } private void findMinimum() { double min = Distance.INFINITY; for (int todo: todos) { if (todo == DONE) continue; ClusterEngineRow row = rows[todo]; if (row.min() < min) { threshold = min = row.min(); found_a = todo; found_b = row.found(); } } } private double get(int row, int column) { if (row == column) return 0; return row > column ? rows[row].get(column) : rows[column].get(row); } private void init_clusters(List<T> elements) { this.clusters = new ArrayList<Dendrogram<T>>(); for (T each: elements) this.clusters.add(new Dendrogram.Leaf<T>(each)); } private void init_rows(List<T> elements, Distance<T> dist) { this.rows = new ClusterEngineRow[elements.size()]; for (int row: range(rows.length)) { T element = elements.get(row); double[] values = new double[row]; for (int col: range(values.length)) { values[col] = dist.dist(element, elements.get(col)); } rows[row] = new ClusterEngineRow(values); } } private void init_todos() { todos = range(rows.length).asArray(); } private double linkage(int todo) { return min(get(found_a, todo), get(found_b, todo)); } private void mergeClusters() { todos[found_b] = DONE; for (int todo: todos) { if (todo == DONE) continue; put(found_a, todo, linkage(todo)); unset(found_b, todo); } clusters.set(found_a, clusters.get(found_a) .merge(clusters.get(found_b), similarity ? 1 - threshold : threshold)); clusters.set(found_b, null); pending--; } private double put(int row, int column, double min) { if (row == column) return 0; return row > column ? rows[row].set(column, min) : rows[column].set(row, min); } public void run() { while (pending > 1) { findMinimum(); mergeClusters(); } } private void unset(int row, int column) { if (row == column) return; if (row > column) rows[row].unset(column); else rows[column].unset(row); } } class ClusterEngineRow { private static final int NULL = -1; private int found = NULL; private double min = Distance.INFINITY; private final double[] values; public ClusterEngineRow(double[] values) { this.values = values; if (values.length == 0) found = 0; } public int found() { if (found == NULL) update(); return found; } public double get(int index) { return values[index]; } public double min() { if (found == NULL) update(); return min; } public double set(int index, double value) { return values[index] = value; } public void unset(int index) { values[index] = Distance.INFINITY; if (index == found) found = NULL; } private void update() { min = Distance.INFINITY; for (int n: range(values.length)) { if (values[n] < min) { min = values[n]; found = n; } } } }