/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.visualization.clustering;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Vector;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.math.random.JDKRandomGenerator;
import org.apache.commons.math.random.RandomDataImpl;
import org.apache.commons.math.random.RandomGenerator;
import at.tuwien.ifs.somtoolbox.layers.metrics.DistanceMetric;
import at.tuwien.ifs.somtoolbox.layers.metrics.L2Metric;
import at.tuwien.ifs.somtoolbox.layers.metrics.MetricException;
import at.tuwien.ifs.somtoolbox.util.VectorTools;
/**
* Pretty much the classic K-Means clustering. Tried to keep it simple, though.
*
* @author Robert Neumayer
* @version $Id: KMeans.java 3921 2010-11-05 12:54:53Z mayer $
*/
public class KMeans {
public enum InitType {
RANDOM, RANDOM_INSTANCE, LINEAR, LINEAR_INSTANCE, EQUAL_NUMBERS
}
protected double[][] data;
private int k, numberOfInstances, numberOfAttributes;
private double[] minValues, maxValues, differences;
// this one will be used for looking up cluster assignments
// which in turn will help to terminate the training process
// as soon as we don't experience any changes in cluster assignments
private Hashtable<Integer, Integer> instancesInClusters;
protected Cluster[] clusters;
// TODO remove this after testing
private static long RANDOM_SEED = 1234567;
/**
* Default constructor (as much defaulting as possible). Uses linear initialisation and Euclidean distance.
*
* @param k number of clusters
* @param data guess
*/
public KMeans(int k, double[][] data) {
this(k, data, InitType.RANDOM, new L2Metric());
}
/**
* Instantiate a new KMeans object with:
*
* @param k number of clusters
* @param data the data to cluster
* @param initialisation the initialisation method used (to be chosen from InitType)
*/
public KMeans(int k, double[][] data, InitType initialisation) {
this(k, data, initialisation, new L2Metric());
}
/**
* Construct a new K-Means bugger.
*
* @param k number of clusters
* @param data the data set
* @param initialisation initialisation type
* @param distanceFunction an LnMetric of your choice
*/
public KMeans(int k, double[][] data, InitType initialisation, DistanceMetric distanceFunction) {
this.k = k;
this.numberOfAttributes = data[0].length;
this.numberOfInstances = data.length;
this.instancesInClusters = new Hashtable<Integer, Integer>();
this.clusters = new Cluster[k];
this.data = data;
// initialise a couple of things
initMinAndMaxValues();
switch (initialisation) {
case LINEAR:
initClustersLinearly(distanceFunction);
break;
case LINEAR_INSTANCE:
initClustersLinearlyOnInstances(distanceFunction);
break;
case RANDOM:
initClustersRandomly(distanceFunction);
break;
case RANDOM_INSTANCE:
initClustersRandomlyOnInstances(distanceFunction);
break;
case EQUAL_NUMBERS:
initClustersEqualNumbers(distanceFunction);
break;
default:
break;
}
// printClusters();
// this one is to do a first assignment of data points to clusters
trainingStep();
}
/**
* Train for a certain number of steps. Note that we won't stop until all training steps are finished.
*
* @param numberOfSteps how many would you like?
*/
public void train(int numberOfSteps) {
for (int stepIndex = 0; stepIndex < numberOfSteps; stepIndex++) {
System.out.println("step: " + stepIndex + " / " + trainingStep());
}
}
/**
* Train for as long as instances move between clusters. "Not moving" means that there hasn't been a change in the
* last {@link #NUMBER_OF_UPDATE_RANGE} steps ({@value #NUMBER_OF_UPDATE_RANGE}).
*/
public void train() {
ArrayList<Boolean> lastUpdates = new ArrayList<Boolean>(NUMBER_OF_UPDATE_RANGE);
boolean hasUpdatedInLastKRounds = false;
do {
boolean thisUpdate = trainingStep();
hasUpdatedInLastKRounds = false;
// check if there was any update in the last steps
lastUpdates.add(0, thisUpdate);
for (int i = 0; i < NUMBER_OF_UPDATE_RANGE && i < lastUpdates.size(); i++) {
if (lastUpdates.get(i)) {
hasUpdatedInLastKRounds = true;
break;
}
}
} while (hasUpdatedInLastKRounds);
removeEmptyClusters();
}
// TODO implement a better stop criterion (e.g. less than 10 per cent of instances move between clusters per step)
/**
* Searches for clusters which have no instances assigned. These are then replaced FIXME FIXME private void
* substituteEmptyClusters() { System.out.println("Removing empty clusters:"); double[] replacementCentroid = new
* double[clusters[0].getCentroid().length]; for (int i = 0; i < clusters.length; i++) { if
* (clusters[i].getNumberOfInstances() != 0) replacementCentroid = clusters[i].getCentroid().clone(); } for (int i =
* 0; i < clusters.length; i++) { if (clusters[i].getNumberOfInstances() == 0)
* clusters[i].setCentroid(replacementCentroid); } }
*/
private void removeEmptyClusters() {
Vector<Cluster> nonEmptyClusters = new Vector<Cluster>();
for (Cluster cluster : clusters) {
if (cluster.getNumberOfInstances() != 0) {
nonEmptyClusters.add(cluster);
}
}
Cluster[] remainingClusters = new Cluster[nonEmptyClusters.size()];
for (int i = 0; i < nonEmptyClusters.size(); i++) {
remainingClusters[i] = nonEmptyClusters.elementAt(i);
}
clusters = remainingClusters;
}
private int lastNumberOfUpdates = Integer.MAX_VALUE;
private static final int NUMBER_OF_UPDATE_RANGE = 5; // we check that many steps for any updates
/**
* A classic training step in the K-Means world.
*
* @return whether this step brought any changes or not. Note, this one also says no if there were as many changes
* as in the last step.
*/
private boolean trainingStep() {
boolean didUpdate = false;
int numberOfUpdates = 0;
for (int instanceIndex = 0; instanceIndex < numberOfInstances; instanceIndex++) {
// find closest centroid
int indexOfClosestCluster = 0;
indexOfClosestCluster = getIndexOfClosestCluster(data[instanceIndex]);
// if there's no assignment stored in our lookup table, we add all
// instances to their according clusters (the one with the closest centroids that is)
// so this happens in the first run only
if (instancesInClusters.get(new Integer(instanceIndex)) == null) {
clusters[indexOfClosestCluster].addIndex(instanceIndex);
didUpdate = true;
numberOfUpdates++;
}
// now we have them in the lookup table but the closest cluster changed from the last run
// so we remove the instance from the cluster and assign it to a new one
else if (!instancesInClusters.get(new Integer(instanceIndex)).equals(new Integer(indexOfClosestCluster))) {
clusters[instancesInClusters.get(new Integer(instanceIndex))].removeInstanceIndex(instanceIndex);
clusters[indexOfClosestCluster].addIndex(instanceIndex);
didUpdate = true;
numberOfUpdates++;
}
// we always update the lookup table
instancesInClusters.put(instanceIndex, indexOfClosestCluster);
}
// calculate new centroids
System.out.println("SSE: " + this.getSSE());
System.out.println("performed: " + numberOfUpdates + " updates.");
calculateNewCentroids();
this.printCentroidsShort();
if (numberOfUpdates == lastNumberOfUpdates) {
return false;
}
lastNumberOfUpdates = numberOfUpdates;
return didUpdate;
}
/**
* Batch calculation of all cluster centroids.
*/
private void calculateNewCentroids() {
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
// go look for a new centroid if no instances are assigned to the cluster
if (clusters[clusterIndex].getNumberOfInstances() == 0) {
if (getSubstituteCentroid() != null) {
clusters[clusterIndex].setCentroid(getSubstituteCentroid());
}
}
clusters[clusterIndex].calculateCentroid(data);
}
}
/**
* Get a new centroid for empty clusters. We therefore take the instance with the largest SSE to the cluster
* centroid having the largest SSE. Get the idea? Read slowly.
*
* @return a new centroid (rather: a clone thereof :))
*/
private double[] getSubstituteCentroid() {
double maxSSE = Double.NEGATIVE_INFINITY;
int maxSSEIndex = -1;
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
clusters[clusterIndex].calculateCentroid(data);
double currentSSE = clusters[clusterIndex].SSE(data);
if (currentSSE > maxSSE) {
maxSSE = currentSSE;
maxSSEIndex = clusterIndex;
}
}
// System.out.println(maxSSEIndex);
// System.out.println(clusters.length);
// System.out.println(clusters[maxSSEIndex].getInstanceIndexWithMaxSSE(data));
// FIXME is this the right way of handling this (if the max sse exists in a cluster that has no instances
// assigned)
if (clusters[maxSSEIndex].getInstanceIndexWithMaxSSE(data) == -1) {
return null;
}
return data[clusters[maxSSEIndex].getInstanceIndexWithMaxSSE(data)].clone();
}
/**
* Get the index of the closest cluster for the given instance index. Note that in case of equally distant clusters
* we assign the first found cluster. At the end of the day this means that the clusters with lower indices will
* have a tendency to be larger. It hopefully won't have too much impact, possibly a random assignment in case of
* equal weights would make sense, however, this would require a couple of steps more in here.
*
* @param instance the data vector to be assigned
* @return index of the closest cluster centre
*/
private int getIndexOfClosestCluster(double[] instance) {
int indexOfClosestCluster = 0;
double smallestDistance = Double.POSITIVE_INFINITY;
double currentDistance = 0;
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
currentDistance = clusters[clusterIndex].getDistanceToCentroid(instance);
if (currentDistance < smallestDistance) {
smallestDistance = currentDistance;
indexOfClosestCluster = clusterIndex;
}
}
return indexOfClosestCluster;
}
/**
* Get a set of labels for the given clustering based on the occurrences of attributes within clusters, i.e. there's
* a preference for labels occurring in many instance.
*
* @return labels
*/
public int[][] getOccurrenceLabels(int numberOfLabels) {
int[][] labelIndices = new int[clusters.length][numberOfLabels];
for (int i = 0; i < getClusters().length; i++) {
Cluster c = getClusters()[i];
int[] occ = c.getNumberOfAttributeOccurrences(data);
int[] maxIndices = VectorTools.getIndicesOfMaxValues(occ, numberOfLabels);
labelIndices[i] = maxIndices;
}
return labelIndices;
}
/**
* Calculate random centroids for each cluster.
*/
private void initClustersRandomly(DistanceMetric distanceFunction) {
RandomGenerator rg = new JDKRandomGenerator();
// FIXME: this is for testing purposes only
rg.setSeed(RANDOM_SEED);
// for each cluster
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
// for each of its attributes
double[] centroid = new double[numberOfAttributes];
for (int attributeIndex = 0; attributeIndex < numberOfAttributes; attributeIndex++) {
centroid[attributeIndex] = differences[attributeIndex] * rg.nextDouble() + minValues[attributeIndex];
}
clusters[clusterIndex] = new Cluster(centroid, distanceFunction);
}
System.out.println("initial centroids: ");
// printCentroids();
}
/**
* cluster centres are initialised by equally sized random chunks of the input data when there's 150 instances, we
* assign 50 chosen randomly to each cluster and calculate its centre from these (the last cluster might be larger
* if numInstances mod k < 0)
*/
private void initClustersEqualNumbers(DistanceMetric distanceFunction) {
HashSet<Integer> usedIndices = new HashSet<Integer>();
int limit = numberOfInstances / k;
// FIXME: Test clustering with new permutation generator!
// int[] randPermIndices = RandomTools.permutation(new Random(RANDOM_SEED), this.numberOfInstances);
JDKRandomGenerator rg = new JDKRandomGenerator();
rg.setSeed(RANDOM_SEED);
int[] randPermIndices = new RandomDataImpl(rg).nextPermutation(this.numberOfInstances, this.numberOfInstances);
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
Cluster c = new Cluster(new double[data[0].length]);
// System.out.println("cluster: " + clusterIndex);
for (int randPermIndice : randPermIndices) {
int currentIndex = randPermIndice;
if ((c.getNumberOfInstances() < limit || clusterIndex == k - 1) && !usedIndices.contains(currentIndex)) {
c.addIndex(currentIndex);
usedIndices.add(currentIndex);
// System.out.print(" " + currentIndex);
}
}
// System.out.println();
c.calculateCentroid(data);
// clusters[clusterIndex] = c;
clusters[clusterIndex] = new Cluster(c.getCentroid(), distanceFunction);
// System.out.println("setting cluster: " + clusterIndex + " / " + c.getNumberOfInstances());
}
}
/** Take random points from the input data as centroids. */
private void initClustersRandomlyOnInstances(DistanceMetric distanceFunction) {
ArrayList<double[]> usedInstances = new ArrayList<double[]>();
RandomGenerator rg = new JDKRandomGenerator();
// FIXME: this is for testing purposes only
rg.setSeed(RANDOM_SEED);
// for each cluster
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
// draw a random input
double[] centroid = data[rg.nextInt(data.length - 1)].clone();
while (usedInstances.contains(centroid)) {
centroid = data[rg.nextInt(data.length - 1)].clone();
}
usedInstances.add(centroid);
clusters[clusterIndex] = new Cluster(centroid, distanceFunction);
}
}
/**
* This one does linear initialisation. In the two dimensional space it will place the cluster centres on a diagonal
* line of a square.
*/
private void initClustersLinearly(DistanceMetric distanceFunction) {
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
double[] centroid = new double[numberOfAttributes];
for (int attributeIndex = 0; attributeIndex < numberOfAttributes; attributeIndex++) {
centroid[attributeIndex] = (maxValues[attributeIndex] - minValues[attributeIndex])
/ (clusters.length + 1) * (clusterIndex + 1) + minValues[attributeIndex];
}
clusters[clusterIndex] = new Cluster(centroid, distanceFunction);
}
}
/**
* like {@link #initClustersLinearly(DistanceMetric)}, but after computing the exact linear point, rather finds &
* uses the closest instance from the data set as centroid.
*/
private void initClustersLinearlyOnInstances(DistanceMetric distanceFunction) {
ArrayList<double[]> usedInstances = new ArrayList<double[]>(); // to store which points are already taken
for (int clusterIndex = 0; clusterIndex < k; clusterIndex++) {
double[] centroid = new double[numberOfAttributes];
for (int attributeIndex = 0; attributeIndex < numberOfAttributes; attributeIndex++) {
centroid[attributeIndex] = (maxValues[attributeIndex] - minValues[attributeIndex])
/ (clusters.length + 1) * (clusterIndex + 1) + minValues[attributeIndex];
// now find the closest real instance to this point
double minDistance = Double.MAX_VALUE;
double[] minData = null;
try {
for (int i = 0; i < data.length; i++) {
if (distanceFunction.distance(centroid, data[i]) < minDistance
&& !usedInstances.contains(data[i]) || i == data.length - 1) {
minData = data[i];
minDistance = distanceFunction.distance(centroid, data[i]);
}
}
usedInstances.add(minData);
centroid = minData.clone();
} catch (MetricException e) {
e.printStackTrace();
}
}
clusters[clusterIndex] = new Cluster(centroid, distanceFunction);
}
}
/*
* TODO implement initialisation by the most frequent points (i.e. where people stand around in SL) private void initClustersFrequency() {
* clusters[0] = new Cluster(new double[] { 73, 183, 43 }); clusters[1] = new Cluster(new double[] { 144, 10, 33 }); clusters[2] = new Cluster(new
* double[] { 118, 156, 62 }); clusters[3] = new Cluster(new double[] { 188, 64, 33 }); clusters[4] = new Cluster(new double[] { 188, 63, 33 });
* clusters[5] = new Cluster(new double[] { 118, 156, 33 }); clusters[6] = new Cluster(new double[] { 141, 53, 42 }); clusters[7] = new
* Cluster(new double[] { 117, 153, 62 }); }
*/
/**
* Initialise the cluster centres with the given centres.
*
* @param centroids centroids for clusters.
* @throws MoreCentresThanKException don't dare to set more or less centres than our k value.
*/
public void setClusterCentroids(double[][] centroids) throws MoreCentresThanKException {
if (centroids.length != k) {
throw new MoreCentresThanKException("So, someone was trying to set " + centroids.length + " centres for "
+ k + " clusters. Hint: possibly transpose the matrix and try again.");
}
for (int i = 0; i < clusters.length; i++) {
clusters[i].setCentroid(centroids[i]);
}
// to do initial assignment of instances to the new centroids
trainingStep();
}
/**
* Utility method to get the min, max, and diff values of the data set. This is used for scaling the (random) values
* in the initialisation functions.
*/
private void initMinAndMaxValues() {
minValues = new double[numberOfAttributes];
maxValues = new double[numberOfAttributes];
differences = new double[numberOfAttributes];
// for each attribute
for (int j = 0; j < numberOfAttributes; j++) {
// in each instance (i.e. each single value now :-))
minValues[j] = Double.MAX_VALUE;
maxValues[j] = Double.MIN_VALUE;
for (double[] element : data) {
if (element[j] < minValues[j]) {
minValues[j] = element[j];
}
if (element[j] > maxValues[j]) {
maxValues[j] = element[j];
}
}
differences[j] = maxValues[j] - minValues[j];
}
}
/**
* Get a double[][] of all cluster centroids.
*
* @return all cluster centroids
*/
public double[][] getClusterCentroids() {
double[][] centroids = new double[k][numberOfAttributes];
for (int indexClusters = 0; indexClusters < clusters.length; indexClusters++) {
centroids[indexClusters] = clusters[indexClusters].getCentroid();
}
return centroids;
}
public double[][] getClusterVariances() {
double[][] variances = new double[clusters.length][numberOfAttributes];
for (int indexClusters = 0; indexClusters < clusters.length; indexClusters++) {
double[][] instances = clusters[indexClusters].getInstances(data);
// for all attributes in this cluster
for (int i = 0; i < numberOfAttributes; i++) {
double n = 0;
double mean = 0;
double m2 = 0;
double delta = 0;
for (double[] instance : instances) {
n++;
double value = instance[i];
delta = value - mean;
mean += delta / n;
m2 += delta * (value - mean);
}
variances[indexClusters][i] = m2 / n;
}
}
for (double[] vars : variances) {
System.out.println(ArrayUtils.toString(vars));
}
return variances;
}
/**
* Get a double[][] of all cluster centroids. Normalised in the range of the original data.
*
* @return all cluster centroids
*/
public double[][] getMinMaxNormalisedClusterCentroids() {
double[][] normalisedCentroids = new double[k][numberOfAttributes];
for (int indexClusters = 0; indexClusters < k; indexClusters++) {
double[] normalisedCentroid = clusters[indexClusters].getCentroid();
for (int i = 0; i < normalisedCentroid.length; i++) {
normalisedCentroid[i] = (normalisedCentroid[i] - minValues[i]) / (maxValues[i] - minValues[i]);
}
normalisedCentroids[indexClusters] = normalisedCentroid;
}
return normalisedCentroids;
}
/**
* Get a double[][] of all cluster centroids. Normalised in the range of the centroids.
*
* @return all cluster centroids
*/
public double[][] getMinMaxNormalisedClusterCentroidsWithin() {
double[] min = new double[data.clone()[0].length];
double[] max = new double[data.clone()[0].length];
// min[] = Double.MAX_VALUE;
// double max[] = Double.MIN_VALUE;
for (int i = 0; i < data[0].length; i++) {
for (Cluster cluster : clusters) {
if (i == 0) {
min[i] = Double.MAX_VALUE;
max[i] = Double.MIN_VALUE;
}
if (cluster.getCentroid()[i] > max[i]) {
max[i] = cluster.getCentroid()[i];
}
if (cluster.getCentroid()[i] < min[i]) {
min[i] = cluster.getCentroid()[i];
}
}
}
double[][] centroids = new double[k][numberOfAttributes];
for (int indexClusters = 0; indexClusters < k; indexClusters++) {
double[] centroid = clusters[indexClusters].getCentroid();
for (int i = 0; i < centroid.length; i++) {
centroid[i] = (centroid[i] - minValues[i]) / maxValues[i];
}
centroids[indexClusters] = centroid;
}
return centroids;
}
public double[] getMinValues() {
return minValues;
}
public double[] getMaxValues() {
return maxValues;
}
public double[] getDifferences() {
return differences;
}
public Cluster[] getClusters() {
return clusters;
}
/**
* Get the sum of the squared error for all clusters.
*
* @return SSE.
*/
public double getSSE() {
double sse = 0d;
for (Cluster cluster : clusters) {
sse += cluster.SSE(data);
}
return sse;
}
/**
* Get the sum of the squared error for single clusters.
*
* @return several SSEs.
*/
public double[] getSSEs() {
double[] sse = new double[k];
for (int i = 0; i < clusters.length; i++) {
sse[i] = clusters[i].SSE(data);
}
return sse;
}
public void printCentroids() {
for (int i = 0; i < clusters.length; i++) {
System.out.println(i + " / " + clusters[i].SSE(data) + " / " + clusters[i].getNumberOfInstances() + " / "
+ ArrayUtils.toString(clusters[i].getCentroid()));
}
}
public void printCentroidsShort() {
for (int i = 0; i < clusters.length; i++) {
System.out.println("\t" + i + " / " + clusters[i].SSE(data) + " / " + clusters[i].getNumberOfInstances());
}
}
public void printClusterIndices() {
for (int i = 0; i < clusters.length; i++) {
System.out.println("Cluster: " + i);
clusters[i].printClusterIndices(data);
}
}
/**
* @return Returns the data.
*/
public double[][] getData() {
return data;
}
}