/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.machinelearning;

import marytts.signalproc.analysis.distance.DistanceComputer;
import marytts.util.math.MathUtils;

/**
 * K-Means clustering training algorithm.
 *
 * Reference: J. MacQueen, 1967, "Some methods for classification and analysis of multivariate observations", Proc. Fifth
 * Berkeley Symp. on Math. Statist. and Prob., Vol. 1 (Univ. of Calif. Press, 1967), pp. 281-297.
 *
 * @author Oytun Türk
 */
public class KMeansClusteringTrainer {
    public Cluster[] clusters; // Parameters of each cluster
    public int[] totalObservationsInClusters; // Total number of observations in each cluster
    public int[] clusterIndices; // Assigned cluster for each observation vector
    public double[][] covMatrixGlobal; // Global covariance matrix of data
    public double[][] invCovMatrixGlobal; // Inverse of global covariance matrix of data
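
    // A minimal usage sketch (an illustration, not part of the original file; it assumes
    // KMeansClusteringTrainerParams can be default-constructed and exposes the public
    // fields referenced throughout this class, e.g. numClusters):
    //
    //   double[][] features = ...;                    // one observation vector per row
    //   KMeansClusteringTrainerParams params = new KMeansClusteringTrainerParams();
    //   params.numClusters = 8;
    //   KMeansClusteringTrainer trainer = new KMeansClusteringTrainer();
    //   trainer.train(features, params);
    //   int firstAssignment = trainer.clusterIndices[0];       // cluster of observation 0
    //   double[] firstCenter = trainer.clusters[0].meanVector; // mean of cluster 0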

    /**
     * Clusters multi-dimensional feature vectors using the K-Means clustering procedure. Each row of x, i.e. x[0], x[1], ...,
     * corresponds to an observation vector; all vectors must have the same dimension. All training parameters are given by
     * kmeansParams (see KMeansClusteringTrainerParams.java for details).
     *
     * Training consists of four steps:
     * <ol>
     * <li>Initialization (assignment of initial cluster means to data points that are far away from each other, plus slight
     * random shifts)</li>
     * <li>Hard clustering of samples according to the current cluster means</li>
     * <li>Update of cluster means using the assigned samples</li>
     * <li>Re-iteration of steps 2 and 3 until convergence, i.e. until the overall cluster occupancy no longer changes
     * much</li>
     * </ol>
     *
     * @param x observation vectors, one per row
     * @param kmeansParams training parameters
     */
    public void train(double[][] x, KMeansClusteringTrainerParams kmeansParams) {
        if (kmeansParams.globalVariances == null) {
            double[] meanVector = MathUtils.mean(x, true);
            kmeansParams.globalVariances = MathUtils.variance(x, meanVector, true);
        }

        int observations = x.length;
        int dimension = x[0].length;

        int c, k, k2, d, t, iter, i, j, totChanged;
        int ind = -1;
        boolean bCont;
        double rnd, tmpDist;
        double minDist = Double.MIN_VALUE;

        double[][] m_new = new double[kmeansParams.numClusters][];
        for (k = 0; k < kmeansParams.numClusters; k++)
            m_new[k] = new double[dimension];

        int[][] b = new int[observations][];
        for (t = 0; t < observations; t++)
            b[t] = new int[kmeansParams.numClusters];

        int[][] b_old = new int[observations][];
        for (t = 0; t < observations; t++)
            b_old[t] = new int[kmeansParams.numClusters];

        int[] prev_totals = new int[kmeansParams.numClusters];
        double changedPerc;

        double[] mAll = new double[dimension];

        clusters = new Cluster[kmeansParams.numClusters];
        for (k = 0; k < kmeansParams.numClusters; k++)
            clusters[k] = new Cluster(dimension, kmeansParams.isDiagonalOutputCovariance);

        for (k = 1; k <= kmeansParams.numClusters; k++) {
            for (d = 1; d <= dimension; d++)
                clusters[k - 1].meanVector[d - 1] = 0.0;

            for (t = 1; t <= observations; t++)
                b[t - 1][k - 1] = 0;
        }

        // Select initial cluster centers: each new center is the observation farthest
        // (on average) from the global mean and the centers chosen so far
        mAll = MathUtils.mean(x, true);

        k = 1;
        double[] dists = new double[observations];
        double[] tmp = new double[kmeansParams.numClusters + 1];
        double maxD = Double.MAX_VALUE;
        int maxInd = -1;
        while (k <= kmeansParams.numClusters) {
            for (t = 1; t <= observations; t++) {
                if (k > 1) {
                    for (i = 1; i <= k - 1; i++)
                        tmp[i - 1] = DistanceComputer.getNormalizedEuclideanDistance(clusters[i - 1].meanVector, x[t - 1],
                                kmeansParams.globalVariances);

                    tmp[k - 1] = DistanceComputer.getNormalizedEuclideanDistance(mAll, x[t - 1], kmeansParams.globalVariances);

                    dists[t - 1] = MathUtils.mean(tmp, 0, k - 1);
                } else {
                    dists[t - 1] = DistanceComputer.getNormalizedEuclideanDistance(mAll, x[t - 1], kmeansParams.globalVariances);
                }
            }

            for (t = 1; t <= observations; t++) {
                if (t == 1 || dists[t - 1] > maxD) {
                    maxD = dists[t - 1];
                    maxInd = t;
                }
            }

            for (d = 0; d < dimension; d++)
                clusters[k - 1].meanVector[d] = x[maxInd - 1][d];

            // System.out.println("Cluster center " + String.valueOf(k) + " initialized...");

            k++;
        }
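
        // The hard-clustering step below assigns each observation to its nearest mean under
        // DistanceComputer.getNormalizedEuclideanDistance, a Euclidean distance in which each
        // dimension is scaled by the corresponding global variance, so that dimensions with a
        // large dynamic range do not dominate. A minimal sketch of such a distance (an
        // illustration of the idea, not the library implementation):
        //
        //   double sum = 0.0;
        //   for (int dd = 0; dd < dimension; dd++) {
        //       double diff = x[0][dd] - clusters[0].meanVector[dd];
        //       sum += diff * diff / kmeansParams.globalVariances[dd];
        //   }
        //   double normalizedDist = Math.sqrt(sum);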

        int[] tinyClusterInds = new int[kmeansParams.numClusters];
        int numTinyClusters = 0;
        double[] tmps = new double[kmeansParams.numClusters];
        int[] inds;

        totalObservationsInClusters = new int[kmeansParams.numClusters];
        clusterIndices = new int[observations];

        iter = 0;
        bCont = true;
        while (bCont) {
            for (t = 1; t <= observations; t++) // Over all observations
            {
                for (i = 1; i <= kmeansParams.numClusters; i++) // Over all clusters
                {
                    tmpDist = DistanceComputer.getNormalizedEuclideanDistance(clusters[i - 1].meanVector, x[t - 1],
                            kmeansParams.globalVariances);

                    b[t - 1][i - 1] = 0;

                    if (i == 1 || tmpDist < minDist) {
                        minDist = tmpDist;
                        ind = i;
                    }
                }

                for (i = 1; i <= kmeansParams.numClusters; i++) // Over all clusters
                {
                    if (i == ind)
                        b[t - 1][i - 1] = 1;
                }
            }

            // Update means
            for (i = 1; i <= kmeansParams.numClusters; i++) {
                totalObservationsInClusters[i - 1] = 0;
                tinyClusterInds[i - 1] = 0;
            }

            c = 1;
            for (i = 1; i <= kmeansParams.numClusters; i++) {
                for (d = 1; d <= dimension; d++)
                    m_new[i - 1][d - 1] = 0.0f;

                for (t = 1; t <= observations; t++) {
                    if (b[t - 1][i - 1] == 1) {
                        for (d = 1; d <= dimension; d++)
                            m_new[i - 1][d - 1] = m_new[i - 1][d - 1] + x[t - 1][d - 1];

                        clusterIndices[t - 1] = i - 1; // zero-based

                        (totalObservationsInClusters[i - 1])++;
                    }
                }

                // Mark the cluster as "tiny" if it has too few observations,
                // i.e. fewer than kmeansParams.minSamplesInOneCluster
                if ((double) totalObservationsInClusters[i - 1] < kmeansParams.minSamplesInOneCluster) {
                    tinyClusterInds[c - 1] = i;
                    numTinyClusters++;
                    c++;
                }
            }

            // c = 0;
            for (i = 0; i < totalObservationsInClusters.length; i++)
                tmps[i] = totalObservationsInClusters[i];

            inds = MathUtils.quickSort(tmps, 0, kmeansParams.numClusters - 1);

            for (i = 1; i <= kmeansParams.numClusters; i++) {
                if (totalObservationsInClusters[i - 1] >= kmeansParams.minSamplesInOneCluster) {
                    for (d = 1; d <= dimension; d++)
                        clusters[i - 1].meanVector[d - 1] = m_new[i - 1][d - 1] / totalObservationsInClusters[i - 1];
                } else {
                    // Re-seed a tiny cluster near the mean of a populous cluster,
                    // shifted by a small random amount
                    for (d = 1; d <= dimension; d++) {
                        rnd = Math.random() * Math.abs(clusters[inds[kmeansParams.numClusters - c - 1]].meanVector[d - 1]) * 0.01;
                        clusters[i - 1].meanVector[d - 1] = clusters[inds[kmeansParams.numClusters - c - 1]].meanVector[d - 1]
                                + rnd;
                    }

                    c++;
                }
            }

            for (i = 1; i <= kmeansParams.numClusters; i++)
                prev_totals[i - 1] = totalObservationsInClusters[i - 1];

            iter++;
            totChanged = 0;
            if (iter > 1) {
                if (iter >= kmeansParams.maxIterations)
                    bCont = false;

                for (t = 1; t <= observations; t++) {
                    for (i = 1; i <= kmeansParams.numClusters; i++) {
                        if (b_old[t - 1][i - 1] != b[t - 1][i - 1]) {
                            totChanged++;
                            break; // Count each observation at most once
                        }
                    }
                }

                changedPerc = (double) totChanged / observations * 100.0;

                // Stop if the percentage of observations that changed cluster is below
                // kmeansParams.minClusterChangePercent
                if (changedPerc < kmeansParams.minClusterChangePercent)
                    bCont = false;

                // System.out.println("K-Means iteration: " + String.valueOf(iter) + " with " + String.valueOf(changedPerc)
                //         + " percent of cluster assignments updated");
            }
            // else
            //     System.out.println("K-Means iteration: " + String.valueOf(iter) + " K-means initialized");

            for (t = 1; t <= observations; t++) {
                for (k2 = 1; k2 <= kmeansParams.numClusters; k2++)
                    b_old[t - 1][k2 - 1] = b[t - 1][k2 - 1];
            }
        }
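
        // Worked example of the stopping rule above (an illustration): with 1000 observations
        // and kmeansParams.minClusterChangePercent = 0.1, changedPerc falls below 0.1 only when
        // totChanged == 0, i.e. training stops once no observation changes its cluster
        // (or once kmeansParams.maxIterations is reached).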

        // Finally, calculate the cluster covariances
        double[][] tmpCov = null;
        double[] diag = null;
        int d1, d2;
        for (i = 0; i < kmeansParams.numClusters; i++) {
            if (totalObservationsInClusters[i] > 0) {
                int[] indices = new int[totalObservationsInClusters[i]];
                int count = 0;
                for (t = 0; t < observations; t++) {
                    if (clusterIndices[t] == i)
                        indices[count++] = t;
                }

                if (kmeansParams.isDiagonalOutputCovariance) {
                    tmpCov = MathUtils.covariance(x, clusters[i].meanVector, true, indices);
                    diag = MathUtils.diagonal(tmpCov);
                    // Floor each variance to keep the covariance safely invertible
                    for (d1 = 0; d1 < diag.length; d1++)
                        diag[d1] = Math.max(diag[d1], kmeansParams.minCovarianceAllowed);
                    System.arraycopy(diag, 0, clusters[i].covMatrix[0], 0, diag.length);
                    clusters[i].invCovMatrix[0] = MathUtils.inverse(clusters[i].covMatrix[0]);
                } else {
                    clusters[i].covMatrix = MathUtils.covariance(x, clusters[i].meanVector, true, indices);
                    for (d1 = 0; d1 < clusters[i].covMatrix.length; d1++) {
                        for (d2 = 0; d2 < clusters[i].covMatrix[d1].length; d2++)
                            clusters[i].covMatrix[d1][d2] = Math.max(clusters[i].covMatrix[d1][d2],
                                    kmeansParams.minCovarianceAllowed);
                    }
                    clusters[i].invCovMatrix = MathUtils.inverse(clusters[i].covMatrix);
                }
            }
        }

        // Some clusters may end up with no observations, e.g. when the number of clusters is large
        // compared to the number of actual clusters in the data. In this case, assign the largest
        // cluster's mean, covariance, and inverse covariance to these empty clusters.
        for (i = 0; i < kmeansParams.numClusters; i++)
            tmps[i] = totalObservationsInClusters[i];

        inds = MathUtils.quickSort(tmps, 0, kmeansParams.numClusters - 1);
        int largestClusterInd = inds[kmeansParams.numClusters - 1];
        for (i = 0; i < kmeansParams.numClusters; i++) {
            if (totalObservationsInClusters[i] < kmeansParams.minSamplesInOneCluster) {
                System.arraycopy(clusters[largestClusterInd].meanVector, 0, clusters[i].meanVector, 0, dimension);

                if (kmeansParams.isDiagonalOutputCovariance) {
                    System.arraycopy(clusters[largestClusterInd].covMatrix[0], 0, clusters[i].covMatrix[0], 0, dimension);
                    System.arraycopy(clusters[largestClusterInd].invCovMatrix[0], 0, clusters[i].invCovMatrix[0], 0, dimension);
                } else {
                    for (j = 0; j < dimension; j++) {
                        System.arraycopy(clusters[largestClusterInd].covMatrix[j], 0, clusters[i].covMatrix[j], 0, dimension);
                        System.arraycopy(clusters[largestClusterInd].invCovMatrix[j], 0, clusters[i].invCovMatrix[j], 0,
                                dimension);
                    }
                }
            }
        }

        // Compute the global covariance (and its inverse) over all observations
        if (kmeansParams.isDiagonalOutputCovariance) {
            tmpCov = MathUtils.covariance(x, true);
            covMatrixGlobal = new double[1][tmpCov.length];
            covMatrixGlobal[0] = MathUtils.diagonal(tmpCov);

            for (d1 = 0; d1 < covMatrixGlobal[0].length; d1++)
                covMatrixGlobal[0][d1] = Math.max(covMatrixGlobal[0][d1], kmeansParams.minCovarianceAllowed);

            invCovMatrixGlobal = new double[1][tmpCov.length];
            invCovMatrixGlobal[0] = MathUtils.inverse(covMatrixGlobal[0]);
        } else {
            covMatrixGlobal = MathUtils.covariance(x);
            for (d1 = 0; d1 < covMatrixGlobal[0].length; d1++) {
                for (d2 = 0; d2 < covMatrixGlobal[d1].length; d2++)
                    covMatrixGlobal[d1][d2] = Math.max(covMatrixGlobal[d1][d2], kmeansParams.minCovarianceAllowed);
            }

            invCovMatrixGlobal = MathUtils.inverse(covMatrixGlobal);
        }

        // System.out.println("K-Means clustering completed...");
    }

    public int getFeatureDimension() {
        if (clusters != null && clusters[0].meanVector != null)
            return clusters[0].meanVector.length;
        else
            return 0;
    }

    public int getTotalClusters() {
        if (clusters != null)
            return clusters.length;
        else
            return 0;
    }

    public boolean isDiagonalCovariance() {
        if (clusters != null)
            return clusters[0].isDiagonalCovariance;
        else
            return false;
    }
}