/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved. Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.machinelearning;

import java.awt.Color;
import java.util.Arrays;

import javax.swing.JFrame;

import marytts.signalproc.display.FunctionGraph;
import marytts.util.math.MathUtils;
import marytts.util.math.Polynomial;

/**
 * K-Means clustering training algorithm.
 *
 * Reference: J. MacQueen, 1967, "Some methods for classification and analysis of multivariate observations", Proc. Fifth
 * Berkeley Symp. on Math. Statist. and Prob., Vol. 1 (Univ. of Calif. Press, 1967), pp. 281-297.
 *
 * This version is adapted to work with a distance function between polynomials.
 *
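 * <p>
 * A minimal usage sketch (the parameter value below is illustrative, not a default; see KMeansClusteringTrainerParams for
 * the remaining settings such as maxIterations, minClusterChangePercent and minSamplesInOneCluster):
 *
 * <pre>
 * KMeansClusteringTrainerParams params = new KMeansClusteringTrainerParams();
 * params.numClusters = 50;
 * PolynomialCluster[] clusters = PolynomialKMeansClusteringTrainer.train(polynomials, params);
 * </pre>
 *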
 * @author Oytun Türk, Marc Schröder
 */
public class PolynomialKMeansClusteringTrainer {

    /**
     * Clusters polynomials using the K-Means clustering procedure, with a polynomial distance function. Training consists of
     * four steps: (a) initialization (assignment of cluster means to data points that are far away from each other, plus
     * slight random shifts); (b) hard clustering of samples according to the new cluster means; (c) update of cluster means
     * using the assigned samples; (d) re-iteration of (b) and (c) until convergence, i.e. until the overall cluster
     * occupancy no longer changes much.
     *
     * @param polynomials
     *            the observations to cluster
     * @param kmeansParams
     *            all training parameters (see KMeansClusteringTrainerParams.java for details)
     * @return the clusters trained
     */
    public static PolynomialCluster[] train(Polynomial[] polynomials, KMeansClusteringTrainerParams kmeansParams) {
        int[] totalObservationsInClusters; // Total number of observations in each cluster
        int[] clusterIndices; // Assigned cluster for each observation vector
        int observations = polynomials.length;
        int polynomialOrder = polynomials[0].getOrder();

        // Intermediate representations for computing updated cluster means:
        Polynomial[] m_new = new Polynomial[kmeansParams.numClusters];
        // b[i][j] == true if observation i belongs to cluster j
        boolean[][] b = new boolean[observations][kmeansParams.numClusters];
        boolean[][] b_old = new boolean[observations][kmeansParams.numClusters];

        Polynomial[] clusterMeans = new Polynomial[kmeansParams.numClusters];
        for (int k = 0; k < kmeansParams.numClusters; k++)
            clusterMeans[k] = new Polynomial(polynomialOrder);

        for (int t = 1; t <= observations; t++) {
            Arrays.fill(b[t - 1], false);
        }

        // Select initial cluster centers
        Polynomial mAll = Polynomial.mean(polynomials);
        double[] dists = new double[observations];
        double[] tmp = new double[kmeansParams.numClusters + 1];
        for (int k = 1; k <= kmeansParams.numClusters; k++) {
            // For each cluster, initiate it with the observation that is most distant from the clusters initiated so far
            for (int t = 1; t <= observations; t++) {
                if (k > 1) {
                    for (int i = 1; i <= k - 1; i++) {
                        tmp[i - 1] = clusterMeans[i - 1].polynomialDistance(polynomials[t - 1]);
                    }
                    tmp[k - 1] = mAll.polynomialDistance(polynomials[t - 1]);
                    dists[t - 1] = MathUtils.mean(tmp, 0, k - 1);
                } else {
                    dists[t - 1] = mAll.polynomialDistance(polynomials[t - 1]);
                }
            }
            // Find the most distant observation. Use Double.NEGATIVE_INFINITY rather than Double.MIN_VALUE,
            // which is the smallest *positive* double and would leave maxInd unset if all distances were zero:
            double maxD = Double.NEGATIVE_INFINITY;
            int maxInd = -1;
            for (int t = 1; t <= observations; t++) {
                if (dists[t - 1] > maxD) {
                    maxD = dists[t - 1];
                    maxInd = t;
                }
            }
            clusterMeans[k - 1].copyCoeffs(polynomials[maxInd - 1]);
            // System.out.println("Cluster center " + String.valueOf(k) + " initialized...");
        }
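
        // Note on the seeding above: each new center is the observation with the largest mean distance to the centers
        // chosen so far (the overall mean mAll serves as reference for the first pick). This farthest-point heuristic
        // spreads the initial centers across the data; the "slight random shifts" mentioned in the Javadoc are applied
        // later, when under-populated clusters are re-seeded from large ones.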

        int[] tinyClusterInds = new int[kmeansParams.numClusters];
        int numTinyClusters = 0;
        totalObservationsInClusters = new int[kmeansParams.numClusters];
        clusterIndices = new int[observations];
        int iter = 0;
        boolean bCont = true;
        while (bCont) {
            // Associate each observation with the nearest cluster
            for (int t = 1; t <= observations; t++) { // Over all observations
                double minDist = Double.MAX_VALUE;
                int ind = -1;
                for (int i = 1; i <= kmeansParams.numClusters; i++) { // Over all clusters
                    double tmpDist = clusterMeans[i - 1].polynomialDistance(polynomials[t - 1]);
                    b[t - 1][i - 1] = false;
                    if (tmpDist < minDist) {
                        minDist = tmpDist;
                        ind = i;
                    }
                }
                // Associate the observation with the cluster to which it has minimum distance:
                b[t - 1][ind - 1] = true;
            }

            // Prepare means per cluster based on new cluster members:
            for (int i = 1; i <= kmeansParams.numClusters; i++) {
                totalObservationsInClusters[i - 1] = 0;
                tinyClusterInds[i - 1] = 0;
            }
            int c = 1; // count tiny clusters
            for (int i = 1; i <= kmeansParams.numClusters; i++) {
                m_new[i - 1] = new Polynomial(polynomialOrder);
                for (int t = 1; t <= observations; t++) {
                    if (b[t - 1][i - 1]) { // observation t is associated with cluster i
                        for (int d = 0; d <= polynomialOrder; d++)
                            m_new[i - 1].coeffs[d] += polynomials[t - 1].coeffs[d];
                        clusterIndices[t - 1] = i - 1; // zero-based
                        (totalObservationsInClusters[i - 1])++;
                    }
                }
                // Remember this cluster if it has too few observations,
                // i.e. fewer than kmeansParams.minSamplesInOneCluster:
                if (totalObservationsInClusters[i - 1] < kmeansParams.minSamplesInOneCluster) {
                    tinyClusterInds[c - 1] = i;
                    numTinyClusters++;
                    c++;
                }
            }

            // Update the means of clusters if these are big enough,
            // and replace tiny clusters with random variations of big ones:
            c = 0;
            // need doubles to use quicksort:
            double[] tmps = new double[totalObservationsInClusters.length];
            for (int a = 0; a < tmps.length; a++) {
                tmps[a] = totalObservationsInClusters[a];
            }
            int[] inds = MathUtils.quickSort(tmps, 0, kmeansParams.numClusters - 1);
            for (int i = 1; i <= kmeansParams.numClusters; i++) {
                if (totalObservationsInClusters[i - 1] >= kmeansParams.minSamplesInOneCluster) {
                    // a normal-sized cluster -- update mean
                    for (int d = 0; d <= polynomialOrder; d++) {
                        clusterMeans[i - 1].coeffs[d] = m_new[i - 1].coeffs[d] / totalObservationsInClusters[i - 1];
                    }
                } else {
                    // a tiny cluster -- reinitialise with a random variation of one of the big clusters
                    for (int d = 0; d <= polynomialOrder; d++) {
                        double rnd = 2 * (Math.random() - 0.5) /* a random number between -1 and 1 */
                                * clusterMeans[inds[kmeansParams.numClusters - c - 1]].coeffs[d] * 0.01;
                        clusterMeans[i - 1].coeffs[d] = clusterMeans[inds[kmeansParams.numClusters - c - 1]].coeffs[d] + rnd;
                    }
                    c++;
                }
            }

            iter++;
            // Count the number of observations that have changed cluster:
            int totChanged = 0;
            if (iter > 1) {
                if (iter >= kmeansParams.maxIterations) {
                    bCont = false;
                }
                for (int t = 1; t <= observations; t++) {
                    for (int i = 1; i <= kmeansParams.numClusters; i++) {
                        if (b_old[t - 1][i - 1] != b[t - 1][i - 1]) {
                            totChanged++;
                            break; // Count each difference once
                        }
                    }
                }
                double changedPerc = (double) totChanged / observations * 100.0;
                if (changedPerc < kmeansParams.minClusterChangePercent) {
                    // stop if the percentage of observations that changed cluster is less than
                    // kmeansParams.minClusterChangePercent:
                    bCont = false;
                }
                // System.out.println("K-Means iteration: " + String.valueOf(iter) + " with " + String.valueOf(changedPerc)
                // + " percent of cluster assignments updated");
            }
            // else
            // System.out.println("K-Means iteration: " + String.valueOf(iter) + " K-means initialized");

            for (int t = 1; t <= observations; t++) {
                System.arraycopy(b[t - 1], 0, b_old[t - 1], 0, b[t - 1].length);
            }
        }
        // We do not compute covariances here, because we are unidimensional only.

        // Now fill the clusters with their means and members:
        PolynomialCluster[] clusters = new PolynomialCluster[kmeansParams.numClusters];
        for (int i = 1; i <= kmeansParams.numClusters; i++) {
            Polynomial[] members = new Polynomial[totalObservationsInClusters[i - 1]];
            int m = 0;
            for (int t = 1; t <= observations; t++) {
                if (b[t - 1][i - 1]) {
                    members[m] = polynomials[t - 1];
                    m++;
                }
            }
            assert m == members.length;
            clusters[i - 1] = new PolynomialCluster(clusterMeans[i - 1], members);
        }
        return clusters;
        // System.out.println("K-Means clustering completed...");
    }
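
    /**
     * Test and demo entry point: clusters 1000 random order-3 polynomials into 50 clusters and visualises each resulting
     * cluster in a JFrame (the cluster mean as blue dots, the members as gray lines), pausing briefly between clusters.
     */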
    public static void main(String[] args) {
        // Test clustering with random polynomials, and visualise the result
        int order = 3;
        int numPolynomials = 1000;
        int numClusters = 50;

        // Initialise with random data:
        Polynomial[] ps = new Polynomial[numPolynomials];
        for (int i = 0; i < numPolynomials; i++) {
            double[] coeffs = new double[order + 1];
            for (int c = 0; c < coeffs.length; c++) {
                coeffs[c] = Math.random();
            }
            ps[i] = new Polynomial(coeffs);
        }
        KMeansClusteringTrainerParams params = new KMeansClusteringTrainerParams();
        params.numClusters = numClusters;

        // Train:
        PolynomialCluster[] clusters = PolynomialKMeansClusteringTrainer.train(ps, params);

        // Visualise:
        FunctionGraph clusterGraph = new FunctionGraph(0, 1, new double[1]);
        clusterGraph.setYMinMax(0, 5);
        clusterGraph.setPrimaryDataSeriesStyle(Color.BLUE, FunctionGraph.DRAW_DOTS, FunctionGraph.DOT_FULLCIRCLE);
        JFrame jf = clusterGraph.showInJFrame("", false, true);
        for (int i = 0; i < clusters.length; i++) {
            double[] meanValues = clusters[i].getMeanPolynomial().generatePolynomialValues(100, 0, 1);
            clusterGraph.updateData(0, 1. / meanValues.length, meanValues);
            Polynomial[] members = clusters[i].getClusterMembers();
            for (int m = 0; m < members.length; m++) {
                double[] pred = members[m].generatePolynomialValues(meanValues.length, 0, 1);
                clusterGraph.addDataSeries(pred, Color.GRAY, FunctionGraph.DRAW_LINE, -1);
                jf.repaint();
            }
            jf.setTitle("Cluster " + (i + 1) + " of " + clusters.length + ": " + members.length + " members");
            jf.repaint();
            try {
                Thread.sleep(500);
            } catch (InterruptedException ie) {
            }
        }
        System.exit(0);
    }
}