/* * Copyright 2007 Tom Gibara * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.jgrasstools.gears.utils.clustering; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * Maintains a collection of clusters which are adjusted as new elements are * added. The keyer assigned to this object should not be modified while there * are clusters. * * @author Tom Gibara * * @param <K> * the key type */ public class GvmClusters<S extends GvmSpace, K> { // statics /** * Helper method to avoid propagation of negative variances. * * @param var * a variance * @return the variance clamped at zero */ static double correct(double var) { return var >= 0.0 ? var : 0.0; } // fields /** * The greatest number of clusters that will be recorded */ final int capacity; /** * Defines the points that will be clusters */ final S space; /** * The keyer used to apply keys to clusters. */ private GvmKeyer<K> keyer = new GvmDefaultKeyer<K>(); /** * The clusters. */ private final GvmCluster<S,K>[] clusters; /** * All possible cluster pairs. */ private final GvmClusterPairs<S,K> pairs; /** * The number of points that have been added. */ private int additions = 0; /** * The current number of clusters. */ private int count = 0; /** * The number of clusters prior to reduction */ private int bound = 0; @SuppressWarnings("unchecked") public GvmClusters(S space, int capacity) { if (space == null) throw new IllegalArgumentException("null space"); if (capacity < 0) throw new IllegalArgumentException("negative capacity"); this.space = space; this.capacity = capacity; this.clusters = new GvmCluster[capacity]; pairs = new GvmClusterPairs<S,K>(capacity * (capacity-1) / 2); } // accessors /** * The keyer used to assign keys to clusters. */ public GvmKeyer<K> getKeyer() { return keyer; } /** * The keyer to be used. * * @param keyer a keyer, not null */ public void setKeyer(GvmKeyer<K> keyer) { if (keyer == null) throw new IllegalArgumentException(); this.keyer = keyer; } /** * The greatest number of clusters that may be recorded. * * @return the capacity */ public int getCapacity() { return capacity; } /** * The space in which clustering will be performed * * @return the space containing the points to be clustered */ public S getSpace() { return space; } // public methods /** * Removes all clusters and clustered points but retains the keyer. */ public void clear() { Arrays.fill(clusters, 0, bound, null); pairs.clear(); additions = 0; count = 0; bound = 0; } /** * Adds a point to be clustered. * * @param m * the mass at the point * @param xs * the coordinates of the point * @param key * a key assigned to the point, may be null */ public void add(double m, Object pt, K key) { if (m == 0.0) return; //nothing to do if (count < capacity) { //shortcut //TODO should prefer add if var comes to zero GvmCluster<S,K> cluster = new GvmCluster<S,K>(this); clusters[additions] = cluster; cluster.set(m, pt); addPairs(); cluster.key = keyer.addKey(cluster, key); count++; bound = count; } else { //identify cheapest merge GvmClusterPair<S,K> mergePair = pairs.peek(); double mergeT = mergePair == null ? Double.MAX_VALUE : mergePair.value; //find cheapest addition GvmCluster<S,K> additionC = null; double additionT = Double.MAX_VALUE; for (int i = 0; i < clusters.length; i++) { GvmCluster<S,K> cluster = clusters[i]; double t = cluster.test(m, pt); if (t < additionT) { additionC = cluster; additionT = t; } } if (additionT <= mergeT) { //chose addition additionC.add(m, pt); updatePairs(additionC); additionC.key = keyer.addKey(additionC, key); } else { //choose merge GvmCluster<S,K> c1 = mergePair.c1; GvmCluster<S,K> c2 = mergePair.c2; if (c1.m0 < c2.m0) { c1 = c2; c2 = mergePair.c1; } c1.key = keyer.mergeKeys(c1, c2); c1.add(c2); updatePairs(c1); c2.set(m, pt); updatePairs(c2); //TODO should this pass through a method on keyer? c2.key = null; c2.key = keyer.addKey(c2, key); } } additions++; } /** * Collapses the number of clusters subject to constraints on the maximum * permitted variance, and the least number of clusters. This method may be * called at any time, including between calls to add(). * * @param maxVar * an upper bound on the global variance that may not be exceeded * by merging clusters * @param minClusters * a lower bound on the the number of clusters that may not be * exceeded by merging clusters */ public void reduce(double maxVar, int minClusters) { if (minClusters < 0) throw new IllegalArgumentException("negative minClusters"); if (count <= minClusters) return; //nothing to do double totalVar = 0.0; double totalMass = 0.0; for (int i = 0; i < count; i++) { GvmCluster<S,K> cluster = clusters[i]; totalVar += cluster.var; totalMass += cluster.m0; } while (count > minClusters) { if (count == 1) { //remove the last cluster for (int i = 0; i < bound; i++) { GvmCluster<S,K> c = clusters[i]; if (!c.removed) { c.removed = true; break; } } } else { GvmClusterPair<S,K> mergePair = pairs.peek(); GvmCluster<S,K> c1 = mergePair.c1; GvmCluster<S,K> c2 = mergePair.c2; if (c1.m0 < c2.m0) { c1 = c2; c2 = mergePair.c1; } if (maxVar >= 0.0) { double diff = c1.test(c2) - c1.var - c2.var; totalVar += diff; if (totalVar/totalMass > maxVar) break; //stop here, we are going to exceed maximum } c1.key = keyer.mergeKeys(c1, c2); c1.add(c2); updatePairs(c1); removePairs(c2); c2.removed = true; } count--; } //iterate over clusters and remove dead clusters { int j = 0; for (int i = 0; i < bound;) { boolean lose = clusters[i].removed; if (lose) { i++; } else { if (i != j) clusters[j] = clusters[i]; i++; j++; } } for (; j < bound; j++) { clusters[j] = null; } } //iterate over cluster pairs and remove dead pairs for (int i = 0; i < count; i++) { GvmCluster<S,K> cluster = clusters[i]; GvmClusterPair<S,K>[] pairs = cluster.pairs; int k = 0; for (int j = 0; j < bound-1;) { GvmClusterPair<S,K> pair = pairs[j]; boolean lose = pair.c1.removed || pair.c2.removed; if (lose) { j++; } else { if (j != k) pairs[k] = pairs[j]; k++; j++; } } for (; k < bound; k++) { pairs[k] = null; } } bound = count; } /** * Obtains the clusters for the points added. This method may be called * at any time, including between calls to add(). * * @return the result of clustering the points thus far added */ public List<GvmResult<K>> results() { ArrayList<GvmResult<K>> list = new ArrayList<GvmResult<K>>(count); for (int i = 0; i < count; i++) { GvmCluster<S,K> cluster = clusters[i]; //TODO exclude massless clusters? list.add(new GvmResult<K>(cluster)); } return list; } // private utility methods //assumes that count not yet incremented //assumes last cluster is the one to add pairs for //assumes pairs are contiguous private void addPairs() { GvmCluster<S,K> cj = clusters[count]; int c = count - 1; //index at which new pairs registered for existing clusters for (int i = 0; i < count; i++) { GvmCluster<S,K> ci = clusters[i]; GvmClusterPair<S,K> pair = new GvmClusterPair<S,K>(ci, cj); ci.pairs[c] = pair; cj.pairs[i] = pair; pairs.add(pair); } } //does not assume pairs are contiguous private void updatePairs(GvmCluster<S,K> cluster) { GvmClusterPair<S,K>[] pairs = cluster.pairs; //accelerated path if (count == bound) { int limit = count - 1; for (int i = 0; i < limit; i++) { this.pairs.reprioritize(pairs[i]); } } else { int limit = bound - 1; for (int i = 0; i < limit; i++) { GvmClusterPair<S,K> pair = pairs[i]; if (pair.c1.removed || pair.c2.removed) continue; this.pairs.reprioritize(pair); } } } //does not assume pairs are contiguous //leaves pairs in cluster pair lists //these are tidied when everything is made contiguous again private void removePairs(GvmCluster<S,K> cluster) { GvmClusterPair<S,K>[] pairs = cluster.pairs; for (int i = 0; i < bound-1; i++) { GvmClusterPair<S,K> pair = pairs[i]; if (pair.c1.removed || pair.c2.removed) continue; this.pairs.remove(pair); } } }