/** * Copyright 2010 DFKI GmbH. * All Rights Reserved. Use is subject to license terms. * * This file is part of MARY TTS. * * MARY TTS is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation, version 3 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * */ package marytts.util.math; import java.io.BufferedReader; import java.io.FileReader; import java.util.Vector; import Jama.EigenvalueDecomposition; import Jama.Matrix; import Jama.SingularValueDecomposition; /*** * Principal component analysis solve PCA using eigenvectors decomposion and singular value decomposition (SVD). 
*
* Reference: Jonathon Shlens, "A tutorial on principal component analysis", (Dated: April 22, 2009; Version 3.01)
* http://www.snl.salk.edu/~shlens/pca.pdf
*
* @author marcela
*/
public class PCA {

    /** Covariance matrix of the centered (and optionally z-scored) data; set only by {@link #eigenPCA}. */
    private Matrix covariance;

    /** Eigenvalues (eigen solution, decreasing order) or singular values (SVD solution). */
    private double[] V;

    /** Principal components: one component per column, one loading per input dimension per row. */
    private Matrix PC;

    /** Proportion of the total variance explained by each principal component. */
    private double[] varianceProportion;

    public double[][] getCovariance() {
        return covariance.getArray();
    }

    public double[] getV() {
        return V;
    }

    public double[][] getPC() {
        return PC.getArray();
    }

    /***
     * Projects a data set onto the previously computed principal components.
     *
     * @param data
     *            matrix with M rows (dimensions/factors) and N columns (trials/examples); M must match the PC rows
     * @param debug
     *            if true, print the projected data
     * @return the projected data, PC' * data
     */
    public double[][] getDataProjected(Matrix data, boolean debug) {
        Matrix dataProjected = PC.transpose().times(data);
        if (debug) {
            System.out.println("Data projected:");
            dataProjected.print(dataProjected.getRowDimension(), 3);
        }
        return dataProjected.getArray();
    }

    // NOTE(review): the misspelling "Pricipal" in the three methods below is kept
    // for backward compatibility with existing callers.

    /*** Prints the full matrix of principal components. */
    public void printPricipalComponents() {
        System.out.println("PC");
        PC.print(PC.getRowDimension(), 4);
    }

    /***
     * Prints every principal component with one labelled loading per factor.
     *
     * @param factors
     *            linguistic factors, one name per input dimension (row of PC)
     */
    public void printPricipalComponents(Vector<String> factors) {
        System.out.println("PCs:");
        for (int j = 0; j < PC.getColumnDimension(); j++) {
            System.out.println("PC(" + j + ")");
            for (int i = 0; i < PC.getRowDimension(); i++)
                System.out.format(" %s %.5f\n", factors.elementAt(i), PC.get(i, j));
        }
    }

    /***
     * Prints the loadings of one principal component ordered by decreasing absolute value,
     * first as "factor loading" pairs, then the factor names alone.
     *
     * @param factors
     *            linguistic factors, one name per input dimension (row of PC)
     * @param numPCA
     *            number of PC, between 1 and Max num PCs
     */
    public void printPricipalComponents(String[] factors, int numPCA) {
        if (numPCA >= 1 && numPCA <= PC.getColumnDimension()) {
            System.out.println("Ordered PC(" + numPCA + ")");
            numPCA = numPCA - 1; // convert to 0-based column index
            // sort a copy of the absolute loadings; quickSort orders lowest to highest
            double loadings[] = new double[PC.getRowDimension()];
            for (int i = 0; i < PC.getRowDimension(); i++)
                loadings[i] = Math.abs(PC.get(i, numPCA));
            int indices[] = MathUtils.quickSort(loadings);
            // print from highest to lowest
            int index;
            for (int i = PC.getRowDimension() - 1; i >= 0; i--) {
                index = indices[i];
                System.out.format(" %s %.5f\n", factors[index], PC.get(index, numPCA));
            }
            for (int i = PC.getRowDimension() - 1; i >= 0; i--) {
                index = indices[i];
                System.out.format("%s\n", factors[index]);
            }
        } else {
            System.out.println("PC number should be >= 1 and <= " + PC.getColumnDimension());
        }
    }

    /*** Prints the proportion of variance explained by each component, 1-based. */
    public void printImportanceOfComponents() {
        System.out.println("Importance of components:");
        for (int j = 0; j < varianceProportion.length; j++)
            System.out.format("PC(%s)=%.4f ", j + 1, varianceProportion[j]);
        System.out.println();
    }

    public double[] getImportanceOfComponents() {
        return varianceProportion;
    }

    /***
     * @param numPC
     *            0-based component index; must be within [0, number of components)
     * @return proportion of variance explained by that component
     */
    public double getImportanceOfComponents(int numPC) {
        return varianceProportion[numPC];
    }

    /***
     * perform principal component analysis
     *
     * @param data
     *            a vector of doubles, rows*cols values in row-major order
     * @param rows
     *            number of rows, trials or examples
     * @param cols
     *            number of cols, dimensions or factors
     * @param eigen
     *            if true use eigenvalues, if false use SVD (singular value decomposition)
     * @param scale
     *            if true use znormalisation, if false just remove the mean from each dimension
     */
    public void principalComponentAnalysis(Vector<Double> data, int rows, int cols, boolean eigen, boolean scale) {
        if (data == null)
            throw new NullPointerException("Null data");
        // BUGFIX: was (rows < 0 || cols < 0), which let 0 through even though the
        // message (and the algorithm) require strictly positive dimensions.
        if (rows <= 0 || cols <= 0)
            throw new IllegalArgumentException("Number of rows and cols must be greater than 0");

        // Fill the data in the matrix X (independent variables), row-major
        Matrix dataX = new Matrix(rows, cols);
        int n = 0; // running index into the flat data vector
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++)
                dataX.set(i, j, data.elementAt(n++));
        }

        boolean debug = false;
        if (eigen)
            eigenPCA(dataX, scale, debug);
        else
            svdPCA(dataX, scale, debug);
    }

    /***
     * Solving PCA using eigenvector decomposition
     *
     * @param data
     *            Matrix with M rows corresponding to dimensions or factors and N columns corresponding to trials or
     *            examples; modified in place (centered/scaled)
     * @param scale
     *            if true : applying zscore normalisation if false: just removing the mean
     * @param debug
     *            if true, print intermediate matrices
     */
    public void eigenPCA(Matrix data, boolean scale, boolean debug) {
        int N = data.getColumnDimension(); // N trials

        centerData(data, scale, "eigenPCA");
        if (debug) {
            System.out.println("Data:");
            data.print(data.getRowDimension(), 3);
        }

        // covariance = 1/(N-1) * data * data'
        covariance = data.times(data.transpose()).times(1.0 / (N - 1));
        if (debug) {
            System.out.println("Covariance");
            covariance.print(covariance.getRowDimension(), 3);
        }

        // find the eigenvectors and eigenvalues; eig() returns the values not ordered
        EigenvalueDecomposition pc = covariance.eig();
        if (debug) {
            System.out.println("EigenValues (on the diagonal)");
            pc.getD().print(pc.getD().getRowDimension(), 3);
            System.out.println("EigenVectors");
            pc.getV().print(pc.getV().getRowDimension(), 3);
        }

        // get the diagonal values and sort them; quickSort orders lowest to highest
        double values[] = new double[pc.getD().getRowDimension()];
        for (int i = 0; i < pc.getD().getRowDimension(); i++)
            values[i] = pc.getD().get(i, i);
        int indices[] = MathUtils.quickSort(values);

        // reorder the eigenvalues, and the matching eigenvector columns, in decreasing order
        V = new double[values.length];
        double d[][] = new double[pc.getV().getRowDimension()][pc.getV().getColumnDimension()];
        for (int j = 0; j < values.length; j++) {
            int k = indices[values.length - 1 - j];
            V[j] = values[k];
            for (int i = 0; i < pc.getV().getRowDimension(); i++)
                d[i][j] = pc.getV().get(i, k);
        }
        PC = new Matrix(d);
        if (debug) {
            System.out.println("PC:");
            PC.print(PC.getRowDimension(), 3);
        }

        // project the original data: signals = PC' * data
        Matrix projectedData = PC.transpose().times(data);
        varianceProportion = varianceProportions(projectedData);
    }

    /***
     * Solving PCA using singular value decomposition (SVD) (more general solution)
     *
     * @param data
     *            Matrix with M rows corresponding to dimensions or factors and N columns corresponding to trials or
     *            examples; modified in place (centered/scaled)
     * @param scale
     *            if true : applying zscore normalisation if false: just removing the mean
     * @param debug
     *            if true, print intermediate matrices
     */
    public void svdPCA(Matrix data, boolean scale, boolean debug) {
        int N = data.getColumnDimension(); // N trials

        centerData(data, scale, "svdPCA");
        if (debug) {
            System.out.println("Data:");
            data.print(data.getRowDimension(), 3);
        }

        // Y = data' / sqrt(N-1); SVD of Y gives the principal components directly
        Matrix Y = data.transpose().times(1.0 / Math.sqrt(N - 1));
        SingularValueDecomposition svd = Y.svd();

        if (debug)
            System.out.println("Values:");
        // the singular values are on the diagonal of S
        V = new double[svd.getS().getRowDimension()];
        for (int i = 0; i < svd.getS().getRowDimension(); i++) {
            V[i] = svd.getS().get(i, i);
            if (debug)
                System.out.println(V[i]);
        }

        PC = svd.getV();
        if (debug) {
            System.out.println("PC:");
            PC.print(PC.getRowDimension(), 3);
        }

        // project the original data: signals = PC' * data
        Matrix projectedData = PC.transpose().times(data);
        varianceProportion = varianceProportions(projectedData);
    }

    /**
     * Subtracts the per-dimension mean in place; when scale is true, additionally divides each dimension by its
     * standard deviation (z-score). Shared by {@link #eigenPCA} and {@link #svdPCA}.
     *
     * @param data
     *            M rows (dimensions) by N columns (trials); modified in place
     * @param scale
     *            if true apply z-score normalisation, if false just remove the mean
     * @param caller
     *            name used as the error-message prefix, preserving the original messages
     */
    private static void centerData(Matrix data, boolean scale, String caller) {
        int M = data.getRowDimension();
        int N = data.getColumnDimension();
        double mn;
        double sd;
        for (int i = 0; i < M; i++) {
            mn = MathUtils.mean(data.getArray()[i]);
            if (mn == 0.0)
                throw new Error(caller + ": mean of dimension " + (i + 1) + " is 0.0");
            if (scale) {
                sd = MathUtils.standardDeviation(data.getArray()[i]);
                if (sd == 0.0)
                    throw new Error(caller + ": variance of dimension " + (i + 1) + " is 0.0");
                // divide by the standard deviation
                for (int j = 0; j < N; j++)
                    data.set(i, j, ((data.get(i, j) - mn) / sd));
            } else {
                // remove the mean
                for (int j = 0; j < N; j++)
                    data.set(i, j, (data.get(i, j) - mn));
            }
        }
    }

    /**
     * The variance of each principal component can be read off the diagonal of the covariance matrix of the projected
     * data; returns each diagonal entry normalised by the total variance.
     */
    private static double[] varianceProportions(Matrix projectedData) {
        Matrix covProjectedData = projectedData.times(projectedData.transpose());
        int n = covProjectedData.getColumnDimension();
        double[] proportions = new double[n];
        double sumPropVar = 0.0; // sum of the proportion of variance
        for (int j = 0; j < n; j++) {
            proportions[j] = covProjectedData.get(j, j);
            sumPropVar += proportions[j];
        }
        for (int j = 0; j < n; j++)
            proportions[j] = proportions[j] / sumPropVar;
        return proportions;
    }

    /***
     * PCA
     *
     * @param fileName
     *            data one column per dimension or linguistic factor; column 0 is skipped (independent variable)
     * @param eigen
     *            if true use eigenvalues, if false use svd (recomended)
     * @param scale
     *            if true use z-normalisation (recomended), if false substract off the mean for ecah dimension
     */
    public void principalComponentAnalysis(String fileName, boolean eigen, boolean scale) {
        // BUGFIX: try-with-resources so the reader is closed even when Matrix.read throws
        try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) {
            Matrix data = Matrix.read(reader);
            int rows = data.getRowDimension() - 1;
            int cols = data.getColumnDimension() - 1;
            data = data.getMatrix(0, rows, 1, cols); // dataVowels(:,1:cols) -> dependent variables
            if (eigen)
                eigenPCA(data.transpose(), scale, false);
            else
                svdPCA(data.transpose(), scale, false);
        } catch (Exception e) {
            throw new RuntimeException("Problem reading file " + fileName, e);
        }
    }

    /** Example: PCA plus linear regression on a vowel-duration data file. */
    public static void main(String[] args) throws Exception {
        String dataFile = "/project/mary/marcela/UnitSel-voices/slt-arctic/temp/dur-vowels.data";
        Matrix dataVowels;
        // BUGFIX: close the reader once the matrix is loaded
        try (BufferedReader reader = new BufferedReader(new FileReader(dataFile))) {
            dataVowels = Matrix.read(reader);
        }
        int rows = dataVowels.getRowDimension() - 1;
        int cols = dataVowels.getColumnDimension() - 1;
        Matrix indVar = dataVowels.getMatrix(0, rows, 0, 0); // dataVowels(:,0) -> col 0 is the independent variable
        dataVowels = dataVowels.getMatrix(0, rows, 1, cols); // dataVowels(:,1:cols) -> dependent variables

        PCA pca = new PCA();
        pca.svdPCA(dataVowels.transpose(), true, false);
        pca.printPricipalComponents();

        Regression regVowel = new Regression();
        regVowel.multipleLinearRegression(indVar, dataVowels, true);
        regVowel.printCoefficients();
        System.out.println("Correlation vowels original duration / predicted duration = " + regVowel.getCorrelation());
    }
}