/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.machinelearning;

import marytts.signalproc.analysis.distance.DistanceComputer;
import marytts.util.math.MathUtils;

/**
 * K-Means clustering training algorithm.
 *
 * Reference: J. MacQueen, 1967, "Some methods for classification and analysis of multivariate observations", Proc. Fifth
 * Berkeley Symp. on Math. Statist. and Prob., Vol. 1 (Univ. of Calif. Press, 1967), pp. 281-297.
 *
 * @author Oytun Türk
 */
public class KMeansClusteringTrainer {
    public Cluster[] clusters; // Parameters of each cluster
    public int[] totalObservationsInClusters; // Total number of observations in each cluster
    public int[] clusterIndices; // Assigned cluster for each observation vector
    public double[][] covMatrixGlobal; // Global covariance matrix of data
    public double[][] invCovMatrixGlobal; // Inverse of global covariance matrix of data
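
    // A minimal usage sketch (an illustration, not part of the original file; it assumes
    // KMeansClusteringTrainerParams can be default-constructed and exposes the public
    // fields referenced throughout this class, e.g. numClusters):
    //
    //   double[][] features = ...;                    // one observation vector per row
    //   KMeansClusteringTrainerParams params = new KMeansClusteringTrainerParams();
    //   params.numClusters = 8;
    //   KMeansClusteringTrainer trainer = new KMeansClusteringTrainer();
    //   trainer.train(features, params);
    //   int firstAssignment = trainer.clusterIndices[0];       // cluster of observation 0
    //   double[] firstCenter = trainer.clusters[0].meanVector; // mean of cluster 0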

    /**
     * Clusters multi-dimensional feature vectors using the K-Means clustering procedure. Each row of x, i.e. x[0], x[1], ...,
     * corresponds to an observation vector; all vectors must have the same dimension. All training parameters are given by
     * kmeansParams (see KMeansClusteringTrainerParams.java for details).
     *
     * Training consists of four steps:
     * <ol>
     * <li>Initialization (assignment of initial cluster means to data points that are far away from each other, plus slight
     * random shifts)</li>
     * <li>Hard clustering of samples according to the current cluster means</li>
     * <li>Update of cluster means using the assigned samples</li>
     * <li>Re-iteration of steps 2 and 3 until convergence, i.e. until the overall cluster occupancy no longer changes
     * much</li>
     * </ol>
     *
     * @param x observation vectors, one per row
     * @param kmeansParams training parameters
     */
    public void train(double[][] x, KMeansClusteringTrainerParams kmeansParams) {
        if (kmeansParams.globalVariances == null) {
            double[] meanVector = MathUtils.mean(x, true);
            kmeansParams.globalVariances = MathUtils.variance(x, meanVector, true);
        }

        int observations = x.length;
        int dimension = x[0].length;

        int c, k, k2, d, t, iter, i, j, totChanged;
        int ind = -1;
        boolean bCont;
        double rnd, tmpDist;
        double minDist = Double.MIN_VALUE;

        double[][] m_new = new double[kmeansParams.numClusters][];
        for (k = 0; k < kmeansParams.numClusters; k++)
            m_new[k] = new double[dimension];

        int[][] b = new int[observations][];
        for (t = 0; t < observations; t++)
            b[t] = new int[kmeansParams.numClusters];

        int[][] b_old = new int[observations][];
        for (t = 0; t < observations; t++)
            b_old[t] = new int[kmeansParams.numClusters];

        int[] prev_totals = new int[kmeansParams.numClusters];
        double changedPerc;

        double[] mAll = new double[dimension];

        clusters = new Cluster[kmeansParams.numClusters];
        for (k = 0; k < kmeansParams.numClusters; k++)
            clusters[k] = new Cluster(dimension, kmeansParams.isDiagonalOutputCovariance);

        for (k = 1; k <= kmeansParams.numClusters; k++) {
            for (d = 1; d <= dimension; d++)
                clusters[k - 1].meanVector[d - 1] = 0.0;

            for (t = 1; t <= observations; t++)
                b[t - 1][k - 1] = 0;
        }

        // Select initial cluster centers: each new center is the observation farthest
        // (on average) from the global mean and the centers chosen so far
        mAll = MathUtils.mean(x, true);

        k = 1;
        double[] dists = new double[observations];
        double[] tmp = new double[kmeansParams.numClusters + 1];
        double maxD = Double.MAX_VALUE;
        int maxInd = -1;
        while (k <= kmeansParams.numClusters) {
            for (t = 1; t <= observations; t++) {
                if (k > 1) {
                    for (i = 1; i <= k - 1; i++)
                        tmp[i - 1] = DistanceComputer.getNormalizedEuclideanDistance(clusters[i - 1].meanVector, x[t - 1],
                                kmeansParams.globalVariances);

                    tmp[k - 1] = DistanceComputer.getNormalizedEuclideanDistance(mAll, x[t - 1], kmeansParams.globalVariances);

                    dists[t - 1] = MathUtils.mean(tmp, 0, k - 1);
                } else {
                    dists[t - 1] = DistanceComputer.getNormalizedEuclideanDistance(mAll, x[t - 1], kmeansParams.globalVariances);
                }
            }

            for (t = 1; t <= observations; t++) {
                if (t == 1 || dists[t - 1] > maxD) {
                    maxD = dists[t - 1];
                    maxInd = t;
                }
            }

            for (d = 0; d < dimension; d++)
                clusters[k - 1].meanVector[d] = x[maxInd - 1][d];

            // System.out.println("Cluster center " + String.valueOf(k) + " initialized...");

            k++;
        }
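
        // The hard-clustering step below assigns each observation to its nearest mean under
        // DistanceComputer.getNormalizedEuclideanDistance, a Euclidean distance in which each
        // dimension is scaled by the corresponding global variance, so that dimensions with a
        // large dynamic range do not dominate. A minimal sketch of such a distance (an
        // illustration of the idea, not the library implementation):
        //
        //   double sum = 0.0;
        //   for (int dd = 0; dd < dimension; dd++) {
        //       double diff = x[0][dd] - clusters[0].meanVector[dd];
        //       sum += diff * diff / kmeansParams.globalVariances[dd];
        //   }
        //   double normalizedDist = Math.sqrt(sum);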

        int[] tinyClusterInds = new int[kmeansParams.numClusters];
        int numTinyClusters = 0;
        double[] tmps = new double[kmeansParams.numClusters];
        int[] inds;

        totalObservationsInClusters = new int[kmeansParams.numClusters];
        clusterIndices = new int[observations];

        iter = 0;
        bCont = true;
        while (bCont) {
            for (t = 1; t <= observations; t++) // Over all observations
            {
                for (i = 1; i <= kmeansParams.numClusters; i++) // Over all clusters
                {
                    tmpDist = DistanceComputer.getNormalizedEuclideanDistance(clusters[i - 1].meanVector, x[t - 1],
                            kmeansParams.globalVariances);

                    b[t - 1][i - 1] = 0;

                    if (i == 1 || tmpDist < minDist) {
                        minDist = tmpDist;
                        ind = i;
                    }
                }

                for (i = 1; i <= kmeansParams.numClusters; i++) // Over all clusters
                {
                    if (i == ind)
                        b[t - 1][i - 1] = 1;
                }
            }

            // Update means
            for (i = 1; i <= kmeansParams.numClusters; i++) {
                totalObservationsInClusters[i - 1] = 0;
                tinyClusterInds[i - 1] = 0;
            }

            c = 1;
            for (i = 1; i <= kmeansParams.numClusters; i++) {
                for (d = 1; d <= dimension; d++)
                    m_new[i - 1][d - 1] = 0.0f;

                for (t = 1; t <= observations; t++) {
                    if (b[t - 1][i - 1] == 1) {
                        for (d = 1; d <= dimension; d++)
                            m_new[i - 1][d - 1] = m_new[i - 1][d - 1] + x[t - 1][d - 1];

                        clusterIndices[t - 1] = i - 1; // zero-based

                        (totalObservationsInClusters[i - 1])++;
                    }
                }

                // Mark the cluster as "tiny" if it has too few observations,
                // i.e. fewer than kmeansParams.minSamplesInOneCluster
                if ((double) totalObservationsInClusters[i - 1] < kmeansParams.minSamplesInOneCluster) {
                    tinyClusterInds[c - 1] = i;
                    numTinyClusters++;
                    c++;
                }
            }

            // c = 0;
            for (i = 0; i < totalObservationsInClusters.length; i++)
                tmps[i] = totalObservationsInClusters[i];

            inds = MathUtils.quickSort(tmps, 0, kmeansParams.numClusters - 1);

            for (i = 1; i <= kmeansParams.numClusters; i++) {
                if (totalObservationsInClusters[i - 1] >= kmeansParams.minSamplesInOneCluster) {
                    for (d = 1; d <= dimension; d++)
                        clusters[i - 1].meanVector[d - 1] = m_new[i - 1][d - 1] / totalObservationsInClusters[i - 1];
                } else {
                    // Re-seed a tiny cluster near the mean of a populous cluster,
                    // shifted by a small random amount
                    for (d = 1; d <= dimension; d++) {
                        rnd = Math.random() * Math.abs(clusters[inds[kmeansParams.numClusters - c - 1]].meanVector[d - 1]) * 0.01;
                        clusters[i - 1].meanVector[d - 1] = clusters[inds[kmeansParams.numClusters - c - 1]].meanVector[d - 1]
                                + rnd;
                    }

                    c++;
                }
            }

            for (i = 1; i <= kmeansParams.numClusters; i++)
                prev_totals[i - 1] = totalObservationsInClusters[i - 1];

            iter++;
            totChanged = 0;
            if (iter > 1) {
                if (iter >= kmeansParams.maxIterations)
                    bCont = false;

                for (t = 1; t <= observations; t++) {
                    for (i = 1; i <= kmeansParams.numClusters; i++) {
                        if (b_old[t - 1][i - 1] != b[t - 1][i - 1]) {
                            totChanged++;
                            break; // Count each observation at most once
                        }
                    }
                }

                changedPerc = (double) totChanged / observations * 100.0;

                // Stop if the percentage of observations that changed cluster is below
                // kmeansParams.minClusterChangePercent
                if (changedPerc < kmeansParams.minClusterChangePercent)
                    bCont = false;

                // System.out.println("K-Means iteration: " + String.valueOf(iter) + " with " + String.valueOf(changedPerc)
                //         + " percent of cluster assignments updated");
            }
            // else
            //     System.out.println("K-Means iteration: " + String.valueOf(iter) + " K-means initialized");

            for (t = 1; t <= observations; t++) {
                for (k2 = 1; k2 <= kmeansParams.numClusters; k2++)
                    b_old[t - 1][k2 - 1] = b[t - 1][k2 - 1];
            }
        }
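
        // Worked example of the stopping rule above (an illustration): with 1000 observations
        // and kmeansParams.minClusterChangePercent = 0.1, changedPerc falls below 0.1 only when
        // totChanged == 0, i.e. training stops once no observation changes its cluster
        // (or once kmeansParams.maxIterations is reached).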

        // Finally, calculate the cluster covariances
        double[][] tmpCov = null;
        double[] diag = null;
        int d1, d2;
        for (i = 0; i < kmeansParams.numClusters; i++) {
            if (totalObservationsInClusters[i] > 0) {
                int[] indices = new int[totalObservationsInClusters[i]];
                int count = 0;
                for (t = 0; t < observations; t++) {
                    if (clusterIndices[t] == i)
                        indices[count++] = t;
                }

                if (kmeansParams.isDiagonalOutputCovariance) {
                    tmpCov = MathUtils.covariance(x, clusters[i].meanVector, true, indices);
                    diag = MathUtils.diagonal(tmpCov);
                    // Floor each variance to keep the covariance safely invertible
                    for (d1 = 0; d1 < diag.length; d1++)
                        diag[d1] = Math.max(diag[d1], kmeansParams.minCovarianceAllowed);
                    System.arraycopy(diag, 0, clusters[i].covMatrix[0], 0, diag.length);
                    clusters[i].invCovMatrix[0] = MathUtils.inverse(clusters[i].covMatrix[0]);
                } else {
                    clusters[i].covMatrix = MathUtils.covariance(x, clusters[i].meanVector, true, indices);
                    for (d1 = 0; d1 < clusters[i].covMatrix.length; d1++) {
                        for (d2 = 0; d2 < clusters[i].covMatrix[d1].length; d2++)
                            clusters[i].covMatrix[d1][d2] = Math.max(clusters[i].covMatrix[d1][d2],
                                    kmeansParams.minCovarianceAllowed);
                    }
                    clusters[i].invCovMatrix = MathUtils.inverse(clusters[i].covMatrix);
                }
            }
        }

        // Some clusters may end up with no observations, e.g. when the number of clusters is large
        // compared to the number of actual clusters in the data. In this case, assign the largest
        // cluster's mean, covariance, and inverse covariance to these empty clusters.
        for (i = 0; i < kmeansParams.numClusters; i++)
            tmps[i] = totalObservationsInClusters[i];

        inds = MathUtils.quickSort(tmps, 0, kmeansParams.numClusters - 1);
        int largestClusterInd = inds[kmeansParams.numClusters - 1];
        for (i = 0; i < kmeansParams.numClusters; i++) {
            if (totalObservationsInClusters[i] < kmeansParams.minSamplesInOneCluster) {
                System.arraycopy(clusters[largestClusterInd].meanVector, 0, clusters[i].meanVector, 0, dimension);

                if (kmeansParams.isDiagonalOutputCovariance) {
                    System.arraycopy(clusters[largestClusterInd].covMatrix[0], 0, clusters[i].covMatrix[0], 0, dimension);
                    System.arraycopy(clusters[largestClusterInd].invCovMatrix[0], 0, clusters[i].invCovMatrix[0], 0, dimension);
                } else {
                    for (j = 0; j < dimension; j++) {
                        System.arraycopy(clusters[largestClusterInd].covMatrix[j], 0, clusters[i].covMatrix[j], 0, dimension);
                        System.arraycopy(clusters[largestClusterInd].invCovMatrix[j], 0, clusters[i].invCovMatrix[j], 0,
                                dimension);
                    }
                }
            }
        }

        // Compute the global covariance (and its inverse) over all observations
        if (kmeansParams.isDiagonalOutputCovariance) {
            tmpCov = MathUtils.covariance(x, true);
            covMatrixGlobal = new double[1][tmpCov.length];
            covMatrixGlobal[0] = MathUtils.diagonal(tmpCov);

            for (d1 = 0; d1 < covMatrixGlobal[0].length; d1++)
                covMatrixGlobal[0][d1] = Math.max(covMatrixGlobal[0][d1], kmeansParams.minCovarianceAllowed);

            invCovMatrixGlobal = new double[1][tmpCov.length];
            invCovMatrixGlobal[0] = MathUtils.inverse(covMatrixGlobal[0]);
        } else {
            covMatrixGlobal = MathUtils.covariance(x);
            for (d1 = 0; d1 < covMatrixGlobal[0].length; d1++) {
                for (d2 = 0; d2 < covMatrixGlobal[d1].length; d2++)
                    covMatrixGlobal[d1][d2] = Math.max(covMatrixGlobal[d1][d2], kmeansParams.minCovarianceAllowed);
            }

            invCovMatrixGlobal = MathUtils.inverse(covMatrixGlobal);
        }

        // System.out.println("K-Means clustering completed...");
    }

    public int getFeatureDimension() {
        if (clusters != null && clusters[0].meanVector != null)
            return clusters[0].meanVector.length;
        else
            return 0;
    }

    public int getTotalClusters() {
        if (clusters != null)
            return clusters.length;
        else
            return 0;
    }

    public boolean isDiagonalCovariance() {
        if (clusters != null)
            return clusters[0].isDiagonalCovariance;
        else
            return false;
    }
}