/** * Copyright 2007 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.machinelearning; import java.io.IOException; import marytts.util.MaryUtils; import marytts.util.io.FileUtils; import marytts.util.io.MaryRandomAccessFile; import marytts.util.math.DoubleMatrix; import marytts.util.math.MathUtils; import marytts.util.string.StringUtils; /** * * Expectation-Maximization (EM) based GMM training * * Reference: A. P. Dempster, N. M. Laird, and D. B. Rubin. Maximum likelihood from in- complete data via the em algorithm. * Journal of the Royal Statistical Society: Series B, 39(1):1–38, November 1977. * * @author Oytun Türk */ public class GMMTrainer { public double[] logLikelihoods; public GMMTrainer() { logLikelihoods = null; } // This function calls the Expectation-Maximization (EM) algorithm // to fit a Gaussian Mixture Model (GMM) to multi-dimensional data in x. // Each row of x, i.e. x[0], x[1], ... corresponds to an observation vector. // The dimension of each vector should be identical. // Either a java implementation or a native C implementation (in Windows OS only) can be used. // Note that native C implementation (GMMTrainer.exe) works 5 to 10 times faster. // All training parameters are given by gmmParams (See GMMTrainerParams.java for details) // Training consists of two steps: // (a) Initialization using K-Means clustering // (b) EM iterations to increase total log-likelihood of the model given the data public GMM train(double[][] x, GMMTrainerParams gmmParams) { long startTime, endTime; /* * //For testing Java and native C versions with identical data String dataFile0 = "d:/gmmTester2.dat"; DoubleData d0 = * null; if (FileUtils.exists(dataFile0)) { d0 = new DoubleData(dataFile0); x = new double[d0.numVectors][]; for (int i=0; * i<d0.numVectors; i++) { x[i] = new double[d0.dimension]; System.arraycopy(d0.vectors[i], 0, x[i], 0, d0.dimension); } } * else { d0 = new DoubleData(x); d0.write(dataFile0); } // */ startTime = System.currentTimeMillis(); GMM gmm = null; if (x != null && gmmParams.totalComponents > 0) { if (!MaryUtils.isWindows()) gmmParams.useNativeCLibTrainer = false; if (!gmmParams.useNativeCLibTrainer) // Java based training { int featureDimension = x[0].length; int i; for (i = 1; i < x.length; i++) assert x[i].length == featureDimension; // Initialize components with KMeans clustering KMeansClusteringTrainerParams kmeansParams = new KMeansClusteringTrainerParams(gmmParams); KMeansClusteringTrainer kmeansClusterer = new KMeansClusteringTrainer(); kmeansClusterer.train(x, kmeansParams); // Create initial GMM according to KMeans clustering results GMM initialGmm = new GMM(kmeansClusterer); // Update model parameters with Expectation-Maximization gmm = expectationMaximization(x, initialGmm, gmmParams.emMinIterations, gmmParams.emMaxIterations, gmmParams.isUpdateCovariances, gmmParams.tinyLogLikelihoodChangePercent, gmmParams.minCovarianceAllowed); } else // native C library based training (only available for Windows OS) { String strIsBigEndian = "1"; String dataFile = StringUtils.getRandomFileName("d:/gmmTemp_", 8, ".dat"); DoubleMatrix d = new DoubleMatrix(x); d.write(dataFile); String gmmFile = StringUtils.modifyExtension(dataFile, ".gmm"); String logFile = StringUtils.modifyExtension(dataFile, ".log"); String strCommand = "GMMTrainer.exe " + "\"" + dataFile + "\" " + "\"" + gmmFile + "\" " + String.valueOf(gmmParams.totalComponents) + " " + strIsBigEndian + " " + String.valueOf(gmmParams.isDiagonalCovariance ? 1 : 0) + " " + String.valueOf(gmmParams.kmeansMaxIterations) + " " + String.valueOf(gmmParams.kmeansMinClusterChangePercent) + " " + String.valueOf(gmmParams.kmeansMinSamplesInOneCluster) + " " + String.valueOf(gmmParams.emMinIterations) + " " + String.valueOf(gmmParams.emMaxIterations) + " " + String.valueOf(gmmParams.isUpdateCovariances ? 1 : 0) + " " + String.valueOf(gmmParams.tinyLogLikelihoodChangePercent) + " " + String.valueOf(gmmParams.minCovarianceAllowed) + " " + "\"" + logFile + "\""; int exitVal = MaryUtils.shellExecute(strCommand, true); if (exitVal == 0) { System.out.println("GMM training with native C library done..."); gmm = new GMM(gmmFile); FileUtils.delete(gmmFile); } else System.out.println("Error executing native C library with exit code " + exitVal); FileUtils.delete(dataFile); } } endTime = System.currentTimeMillis(); System.out.println("GMM training took " + String.valueOf((endTime - startTime) / 1000.0) + " seconds..."); return gmm; } /* * EM algorithm to fit a GMM to multi-dimensional data x: Data matrix (Each row is another observation vector) initialGMM: * Initial GMM model (can be initialized using K-Means clustering (See function train) emMinimumIterations: Minimum number of * EM iterations for which the algorithm will not quit even when the total likelihood does not change much with additional * iterations) emMaximumIterations: Maximum number of EM iterations for which the algorithm will quit even when total * likelihood has not settled yet isUpdateCovariances: Update covariance matrices in EM iterations? * tinyLogLikelihoodChangePercent: Threshold to compare percent decrease in total log-likelihood to stop iterations * automatically minimumCovarianceAllowed: Minimum covariance value allowed - should be a small positive number to avoid * ill-conditioned training * * Reference: A. P. Dempster, N. M. Laird, and D. B. Rubin. Maximum likelihood from incomplete data via the em algorithm. * Journal of the Royal Statistical Society: Series B, 39(1):1–38, November 1977. * * Many practical tutorials for EM training of GMMs exist on the web, i.e.: * http://bengio.abracadoudou.com/lectures/old/tex_gmm.pdf */ public GMM expectationMaximization(double[][] x, GMM initialGmm, int emMinimumIterations, int emMaximumIterations, boolean isUpdateCovariances, double tinyLogLikelihoodChangePercent, double minimumCovarianceAllowed) { int i, j, k; int totalObservations = x.length; GMM gmm = new GMM(initialGmm); for (i = 0; i < totalObservations; i++) assert x[i].length == gmm.featureDimension; int numIterations = 1; double error = 0.0; double prevErr; for (k = 0; k < gmm.totalComponents; k++) gmm.weights[k] = 1.0f / gmm.totalComponents; boolean bContinue = true; double[] zDenum = new double[totalObservations]; double P_xj_tetak; double[][] zNum = new double[totalObservations][gmm.totalComponents]; double[][] z = new double[totalObservations][gmm.totalComponents]; double[] num1 = new double[gmm.featureDimension]; double[] tmpMean = new double[gmm.featureDimension]; double[][] num2 = new double[gmm.featureDimension][gmm.featureDimension]; double tmpSum; double mean_diff; double denum; double diffk; double tmpZeroMean; int d1, d2; logLikelihoods = new double[emMaximumIterations]; long start, end; start = end = 0; // Main EM iteartions loop while (bContinue) { start = System.currentTimeMillis(); // Expectation step // Find zjk's at time (s+1) using alphak's at time (s) for (j = 0; j < totalObservations; j++) { zDenum[j] = 0.0f; for (k = 0; k < gmm.totalComponents; k++) { // P(xj|teta_k) if (gmm.isDiagonalCovariance) P_xj_tetak = MathUtils.getGaussianPdfValue(x[j], gmm.components[k].meanVector, gmm.components[k].getCovMatrixDiagonal(), gmm.components[k].getConstantTerm()); else P_xj_tetak = MathUtils.getGaussianPdfValue(x[j], gmm.components[k].meanVector, gmm.components[k].getInvCovMatrix(), gmm.components[k].getConstantTerm()); /* * if (P_xj_tetak<MathUtils.TINY_PROBABILITY) P_xj_tetak=MathUtils.TINY_PROBABILITY; */ zNum[j][k] = gmm.weights[k] * P_xj_tetak; zDenum[j] = zDenum[j] + zNum[j][k]; } } // Find zjk's at time (s+1) for (j = 0; j < totalObservations; j++) { for (k = 0; k < gmm.totalComponents; k++) z[j][k] = zNum[j][k] / zDenum[j]; } // Now update alphak's to find their values at time (s+1) for (k = 0; k < gmm.totalComponents; k++) { tmpSum = 0.0; for (j = 0; j < totalObservations; j++) tmpSum += z[j][k]; gmm.weights[k] = tmpSum / totalObservations; } // Maximization step // Find the model parameters at time (s+1) using zjk's at time (s+1) mean_diff = 0.0; for (k = 0; k < gmm.totalComponents; k++) { for (d1 = 0; d1 < gmm.featureDimension; d1++) { num1[d1] = 0.0f; for (d2 = 0; d2 < gmm.featureDimension; d2++) num2[d1][d2] = 0.0f; } denum = 0.0; for (j = 0; j < totalObservations; j++) { denum += z[j][k]; for (d1 = 0; d1 < gmm.featureDimension; d1++) { num1[d1] += x[j][d1] * z[j][k]; tmpZeroMean = x[j][d1] - gmm.components[k].meanVector[d1]; for (d2 = 0; d2 < gmm.featureDimension; d2++) num2[d1][d2] += z[j][k] * tmpZeroMean * (x[j][d2] - gmm.components[k].meanVector[d2]); } } for (d1 = 0; d1 < gmm.featureDimension; d1++) tmpMean[d1] = num1[d1] / denum; diffk = 0.0f; for (d1 = 0; d1 < gmm.featureDimension; d1++) { tmpZeroMean = tmpMean[d1] - gmm.components[k].meanVector[d1]; diffk += tmpZeroMean * tmpZeroMean; } diffk = Math.sqrt(diffk); mean_diff += diffk; for (d1 = 0; d1 < gmm.featureDimension; d1++) gmm.components[k].meanVector[d1] = tmpMean[d1]; if (isUpdateCovariances) { if (gmm.isDiagonalCovariance) { for (d1 = 0; d1 < gmm.featureDimension; d1++) gmm.components[k].covMatrix[0][d1] = Math.max(num2[d1][d1] / denum, minimumCovarianceAllowed); } else { for (d1 = 0; d1 < gmm.featureDimension; d1++) { for (d2 = 0; d2 < gmm.featureDimension; d2++) gmm.components[k].covMatrix[d1][d2] = Math.max(num2[d1][d2] / denum, minimumCovarianceAllowed); } } gmm.components[k].setDerivedValues(); } } if (numIterations == 1) error = mean_diff; else { prevErr = error; error = mean_diff; } logLikelihoods[numIterations - 1] = 0.0; if (gmm.isDiagonalCovariance) { for (j = 0; j < totalObservations; j++) { double tmp = 0.0; for (k = 0; k < gmm.totalComponents; k++) { P_xj_tetak = MathUtils.getGaussianPdfValue(x[j], gmm.components[k].meanVector, gmm.components[k].getCovMatrixDiagonal(), gmm.components[k].getConstantTerm()); /* * if (P_xj_tetak<MathUtils.TINY_PROBABILITY) P_xj_tetak=MathUtils.TINY_PROBABILITY; */ tmp += gmm.weights[k] * P_xj_tetak; } logLikelihoods[numIterations - 1] += Math.log(tmp); } } else { for (j = 0; j < totalObservations; j++) { double tmp = 0.0; for (k = 0; k < gmm.totalComponents; k++) { P_xj_tetak = MathUtils.getGaussianPdfValue(x[j], gmm.components[k].meanVector, gmm.components[k].getInvCovMatrix(), gmm.components[k].getConstantTerm()); /* * if (P_xj_tetak<MathUtils.TINY_PROBABILITY) P_xj_tetak=MathUtils.TINY_PROBABILITY; */ tmp += gmm.weights[k] * P_xj_tetak; } logLikelihoods[numIterations - 1] += Math.log(tmp); } } end = System.currentTimeMillis(); System.out.println("For " + String.valueOf(gmm.totalComponents) + " mixes - EM iteration no: " + String.valueOf(numIterations) + " with avg. difference in means " + String.valueOf(error) + " log-likelihood=" + String.valueOf(logLikelihoods[numIterations - 1]) + " in " + String.valueOf((end - start) / 1000.0) + " sec"); // Force iterations to stop if maximum number of iterations has been reached if (numIterations + 1 > emMaximumIterations) break; // Force iterations to stop if minimum number of iterations has been reached AND total log likelihood does not change // much if (numIterations > emMinimumIterations && logLikelihoods[numIterations - 1] - logLikelihoods[numIterations - 2] < Math .abs(logLikelihoods[numIterations - 1] / 100 * tinyLogLikelihoodChangePercent)) break; numIterations++; } double[] tmpLogLikelihoods = new double[numIterations - 1]; System.arraycopy(logLikelihoods, 0, tmpLogLikelihoods, 0, numIterations - 1); logLikelihoods = new double[numIterations - 1]; System.arraycopy(tmpLogLikelihoods, 0, logLikelihoods, 0, numIterations - 1); System.out.println("GMM training completed..."); return gmm; } public static void testEndianFileIO() throws IOException { boolean b1 = true; char c1 = 'c'; short s1 = 111; int i1 = 222; double d1 = 33.3; float f1 = 44.4f; long l1 = 555; String javaFile = "d:/endianJava.tmp"; MaryRandomAccessFile fp = new MaryRandomAccessFile(javaFile, "rw"); if (fp != null) { fp.writeBooleanEndian(b1); fp.writeCharEndian(c1); fp.writeShortEndian(s1); fp.writeIntEndian(i1); fp.writeDoubleEndian(d1); fp.writeFloatEndian(f1); fp.writeLongEndian(l1); fp.close(); } boolean b2; char c2; short s2; int i2; double d2; float f2; long l2; String cFile = "d:/endianC.tmp"; if (FileUtils.exists(cFile)) { MaryRandomAccessFile fp2 = new MaryRandomAccessFile(cFile, "r"); if (fp2 != null) { b2 = fp2.readBooleanEndian(); c2 = fp2.readCharEndian(); s2 = fp2.readShortEndian(); i2 = fp2.readIntEndian(); d2 = fp2.readDoubleEndian(); f2 = fp2.readFloatEndian(); l2 = fp2.readLongEndian(); fp2.close(); if (b1 != b2) System.out.println("Error in bool!\n"); if (c1 != c2) System.out.println("Error in char!\n"); if (s1 != s2) System.out.println("Error in short!\n"); if (i1 != i2) System.out.println("Error in int!\n"); if (d1 != d2) System.out.println("Error in double!\n"); if (f1 != f2) System.out.println("Error in float!\n"); if (l1 != l2) System.out.println("Error in long!\n"); } else System.out.println("C generated file cannot be opened...\n"); } else System.out.println("C generated file not found...\n"); } public static void main(String[] args) { int numClusters = 20; int numSamplesInClusters = 2000; double[] variances = { 0.01 }; int vectorDim = 10; ClusteredDataGenerator[] c = new ClusteredDataGenerator[vectorDim]; int i, j, n; int totalVectors = 0; for (i = 0; i < vectorDim; i++) { if (i < variances.length) c[i] = new ClusteredDataGenerator(numClusters, numSamplesInClusters, 10.0 * (i + 1), variances[i]); else c[i] = new ClusteredDataGenerator(numClusters, numSamplesInClusters, 10.0 * (i + 1), variances[0]); } totalVectors = c[0].data.length; double[][] x = new double[totalVectors][vectorDim]; int counter = 0; for (n = 0; n < c.length; n++) { for (i = 0; i < c[n].data.length; i++) x[i][n] = c[n].data[i]; } x = MathUtils.randomSort(x); double[] m = MathUtils.mean(x); double[] v = MathUtils.variance(x, m); System.out.println(String.valueOf(m[0]) + " " + String.valueOf(v[0])); GMMTrainerParams gmmParams = new GMMTrainerParams(); gmmParams.totalComponents = numClusters; gmmParams.isDiagonalCovariance = true; gmmParams.kmeansMaxIterations = 100; gmmParams.kmeansMinClusterChangePercent = 0.01; gmmParams.kmeansMinSamplesInOneCluster = 10; gmmParams.emMinIterations = 100; gmmParams.emMaxIterations = 2000; gmmParams.isUpdateCovariances = true; gmmParams.tinyLogLikelihoodChangePercent = 0.001; gmmParams.minCovarianceAllowed = 1e-5; gmmParams.useNativeCLibTrainer = true; GMMTrainer g = new GMMTrainer(); GMM gmm = g.train(x, gmmParams); if (gmm != null) { for (i = 0; i < gmm.totalComponents; i++) System.out.println("Gaussian #" + String.valueOf(i + 1) + " mean=" + String.valueOf(gmm.components[i].meanVector[0]) + " variance=" + String.valueOf(gmm.components[i].covMatrix[0][0]) + " prior=" + gmm.weights[i]); } /* * try { testEndianFileIO(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } */ } }