/**
* Copyright 2010 DFKI GmbH.
* All Rights Reserved. Use is subject to license terms.
*
* This file is part of MARY TTS.
*
* MARY TTS is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package marytts.util.math;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Vector;
import Jama.Matrix;
/***
* Multiple linear regression For the case of k independent variables x_1, x_2, ... x_k the sample regression equation is given by
* y_i = b_0 + b_1*x_1i + b_2*x_2i + ... + b_k*x_ki + e_i y = X*b
*
* Ref: Walpole, Myers, Myers and Ye, "Probability and statistics for engineers and scientists", Prentice Hall, chapter 12, pg.
* 400, 2002.
*
* @author marcela
* */
public class Regression {
private double[] coeffs; // keep coefficient in a Matrix
private double[] residuals; // duration(i) - predicted_duration(i)
private double[] predictedValues; // predicted_duration(i) --> y(i)
private double correlation; // correlation between predicted_duration and duration
private double rmse; // root mean square error (RMSE)
private Matrix b;
private boolean b0Term;
public double[] getCoeffs() {
return coeffs;
}
public Regression() {
predictedValues = null;
b = null;
}
public void setCoeffs(double coeffsVal[]) {
coeffs = new double[coeffsVal.length];
for (int i = 0; i < coeffs.length; i++) {
coeffs[i] = coeffsVal[i];
}
b = new Matrix(coeffs, coeffs.length);
}
public double[] getResiduals() {
return residuals;
}
public double[] getPredictedValues() {
return predictedValues;
}
/* Correlation between original values and predicted ones. */
public double getCorrelation() {
return correlation;
}
/* RMSE: root mean square error */
public double getRMSE() {
return rmse;
}
/***
*
* @param data
* dependent and independent variables data={{y1, x11, x12, ... x1k}, {y2, x21, x22, ... x2k}, ... {yn, xn1, xn2,
* ... xnk}}
* @param interceptTerm
* number of the column that will be used as dependent variable → vector y by default the first column is y
* @param rows
* number of rows
* @param cols
* number of cols including the dependent variable
* @return coefficients or null if problems found
*/
public double[] multipleLinearRegression(double[] data, int rows, int cols, boolean interceptTerm) {
if (data == null)
throw new NullPointerException("Null data");
if (rows < 0 || cols < 0)
throw new IllegalArgumentException("Number of rows and cols must be greater than 0");
b0Term = interceptTerm;
Matrix dataX;
if (interceptTerm) { // first column of X is filled with 1s if b_0 != 0
dataX = new Matrix(rows, cols);
coeffs = new double[cols];
} else {
dataX = new Matrix(rows, cols - 1);
coeffs = new double[cols - 1];
}
double[] datay = new double[rows];
// Fill the data in the matrix X (independent variables) and vector y (dependet variable)
int n = 0; // number of data points
for (int i = 0; i < rows; i++) {
if (interceptTerm) {
dataX.set(i, 0, 1.0);
datay[i] = data[n++]; // first column is the dependent variable
for (int j = 1; j < cols; j++)
dataX.set(i, j, data[n++]);
} else { // No interceptTerm so no need to fill the first column with 1s
datay[i] = data[n++]; // first column is the dependent variable
for (int j = 0; j < cols - 1; j++)
dataX.set(i, j, data[n++]);
}
}
multipleLinearRegression(datay, dataX);
return coeffs;
}
public double[] multipleLinearRegression(double[] datay, double[][] datax, boolean interceptTerm) {
if (datay == null || datax == null)
throw new NullPointerException("Null data");
b0Term = interceptTerm;
int rows = datay.length;
int cols = datax[0].length;
Matrix dataX;
if (interceptTerm) { // first column of X is filled with 1s if b_0 != 0
dataX = new Matrix(rows, cols + 1);
coeffs = new double[cols + 1];
} else {
dataX = new Matrix(datax);
coeffs = new double[cols];
}
// If intercept, we need to add a ones column to dataX
if (interceptTerm) {
for (int i = 0; i < rows; i++) {
dataX.set(i, 0, 1.0);
for (int j = 1; j < cols + 1; j++)
dataX.set(i, j, datax[i][j - 1]);
}
}
multipleLinearRegression(datay, dataX);
return coeffs;
}
public double[] multipleLinearRegression(Vector<Double> vectory, Vector<Double> vectorx, int rows, int cols,
boolean interceptTerm) {
if (vectory == null || vectorx == null)
throw new NullPointerException("Null data");
b0Term = interceptTerm;
Matrix dataX;
if (interceptTerm) { // first column of X is filled with 1s if b_0 != 0
dataX = new Matrix(rows, cols + 1);
coeffs = new double[cols + 1];
} else {
dataX = new Matrix(rows, cols);
coeffs = new double[cols];
}
double[] datay = new double[rows];
// Fill the data in the matrix X (independent variables) and vector y (dependet variable)
int n = 0; // number of data points
for (int i = 0; i < rows; i++) {
if (interceptTerm) {
datay[i] = vectory.elementAt(i); // first column is the dependent variable
dataX.set(i, 0, 1.0);
for (int j = 1; j < cols + 1; j++) {
dataX.set(i, j, vectorx.elementAt(n++));
}
} else { // No interceptTerm so no need to fill the first column with 1s
datay[i] = vectory.elementAt(i); // first column is the dependent variable
for (int j = 0; j < cols; j++) {
dataX.set(i, j, vectorx.elementAt(n++));
}
}
}
multipleLinearRegression(datay, dataX);
return coeffs;
}
/***
*
* @param data
* Vector contains dependent variable first and then independent variables
* @param rows
* rows
* @param cols
* cols
* @param interceptTerm
* intercept term
* @return coeffs
*/
public double[] multipleLinearRegression(Vector<Double> data, int rows, int cols, boolean interceptTerm) {
if (data == null)
throw new NullPointerException("Null data");
if (rows < 0 || cols < 0)
throw new IllegalArgumentException("Number of rows and cols must be greater than 0");
b0Term = interceptTerm;
Matrix dataX;
if (interceptTerm) { // first column of X is filled with 1s if b_0 != 0
dataX = new Matrix(rows, cols);
coeffs = new double[cols];
} else {
dataX = new Matrix(rows, cols - 1);
coeffs = new double[cols - 1];
}
double[] datay = new double[rows];
// Fill the data in the matrix X (independent variables) and vector y (dependet variable)
int n = 0; // number of data points
for (int i = 0; i < rows; i++) {
if (interceptTerm) {
dataX.set(i, 0, 1.0);
datay[i] = data.elementAt(n++); // first column is the dependent variable
for (int j = 1; j < cols; j++)
dataX.set(i, j, data.elementAt(n++));
} else { // No interceptTerm so no need to fill the first column with 1s
datay[i] = data.elementAt(n++); // first column is the dependent variable
for (int j = 0; j < cols - 1; j++)
dataX.set(i, j, data.elementAt(n++));
}
}
multipleLinearRegression(datay, dataX);
return coeffs;
}
public void multipleLinearRegression(Matrix datay, Matrix dataX, boolean interceptTerm) {
b0Term = interceptTerm;
if (interceptTerm) { // first column of X is filled with 1s if b_0 != 0
int row = dataX.getRowDimension();
int col = dataX.getColumnDimension();
Matrix B = new Matrix(row, col + 1);
Matrix ones = new Matrix(row, 1);
for (int i = 0; i < row; i++)
ones.set(i, 0, 1.0);
B.setMatrix(0, row - 1, 0, 0, ones);
B.setMatrix(0, row - 1, 1, col, dataX);
multipleLinearRegression(datay, B);
} else {
multipleLinearRegression(datay, dataX);
}
}
private void multipleLinearRegression(double[] datay, Matrix dataX) {
Matrix y = new Matrix(datay, datay.length);
multipleLinearRegression(y, dataX);
}
/***
* Least-square solution y = X * b where: y_i = b_0 + b_1*x_1i + b_2*x_2i + ... + b_k*x_ki including intercep term y_i =
* b_1*x_1i + b_2*x_2i + ... + b_k*x_ki without intercep term
*
* @param datay
* datay
* @param dataX
* dataX
*/
private void multipleLinearRegression(Matrix datay, Matrix dataX) {
Matrix X, y;
try {
X = dataX;
y = datay;
b = X.solve(y);
coeffs = new double[b.getRowDimension()];
for (int j = 0; j < b.getRowDimension(); j++) {
coeffs[j] = b.get(j, 0);
// System.out.println("coeff[" + j + "]=" + coeffs[j]);
}
// Residuals:
Matrix r = X.times(b).minus(y);
residuals = r.getColumnPackedCopy();
// root mean square error (RMSE)
rmse = Math.sqrt(MathUtils.sumSquared(residuals) / residuals.length);
// Predicted values
Matrix p = X.times(b);
predictedValues = p.getColumnPackedCopy();
// Correlation between original values and predicted ones
correlation = MathUtils.correlation(predictedValues, y.getColumnPackedCopy());
} catch (RuntimeException re) {
throw new Error("Error solving Least-square solution: y = X * b");
}
}
public void printCoefficients(int[] indices, String[] factors) {
if (coeffs != null) {
System.out.println("Linear regression:");
if (b0Term) {
System.out.format(" %.5f\n", coeffs[0]);
for (int j = 1; j < coeffs.length; j++)
System.out.format(" %.5f (%s)\n", coeffs[j], factors[indices[j - 1]]);
} else {
for (int j = 0; j < coeffs.length; j++)
System.out.format(" %.5f (%s)\n", coeffs[j], factors[indices[j]]);
}
} else
System.out.println("There is no coefficients to print.");
}
public void printCoefficients() {
if (coeffs != null) {
for (int j = 0; j < coeffs.length; j++)
System.out.format("coeff[%d]=%.5f\n", j, coeffs[j]);
} else
System.out.println("There is no coefficients to print.");
}
public void multipleLinearRegression(String fileName, boolean interceptTerm) {
try {
BufferedReader reader = new BufferedReader(new FileReader(fileName));
Matrix data = Matrix.read(reader);
reader.close();
int rows = data.getRowDimension() - 1;
int cols = data.getColumnDimension() - 1;
Matrix indVar = data.getMatrix(0, rows, 0, 0); // dataVowels(:,0) -> col 0 is the independent variable
data = data.getMatrix(0, rows, 1, cols); // dataVowels(:,1:cols) -> dependent variables
multipleLinearRegression(indVar, data, interceptTerm);
} catch (Exception e) {
throw new RuntimeException("Problem reading file " + fileName, e);
}
}
/***
* multipleLinearRegression providing index numbers for the columns in fileName, index 0 correspond to column 1
*
* @param fileName
* file name
* @param indVariable
* column number (index) of the independent variable
* @param c
* int[] column numbers array (indices) of dependent variables
* @param factors
* factors
* @param rowIni
* and rowEnd should be given from 0 - maxData-1
* @param rowEnd
* row end
* @param interceptTerm
* intercept term
*/
public void multipleLinearRegression(String fileName, int indVariable, int[] c, String[] factors, boolean interceptTerm,
int rowIni, int rowEnd) {
try {
BufferedReader reader = new BufferedReader(new FileReader(fileName));
Matrix data = Matrix.read(reader);
reader.close();
int rows = data.getRowDimension() - 1;
int cols = data.getColumnDimension() - 1;
if (rowIni < 0 || rowIni > rows)
throw new RuntimeException("Problem reading file, rowIni=" + rowIni + " and number of rows in file=" + rows);
if (rowEnd < 0 || rowEnd > rows)
throw new RuntimeException("Problem reading file, rowIni=" + rowIni + " and number of rows in file=" + rows);
if (rowIni > rowEnd)
throw new RuntimeException("Problem reading file, rowIni < rowend" + rowIni + " < " + rowEnd);
Matrix indVar = data.getMatrix(rowIni, rowEnd, indVariable, indVariable); // dataVowels(:,0) -> last col is the
// independent variable
data = data.getMatrix(rowIni, rowEnd, c); // the dependent variables correspond to the column indices in c
multipleLinearRegression(indVar, data, interceptTerm);
} catch (Exception e) {
throw new RuntimeException("Problem reading file " + fileName, e);
}
}
// Given a set of coefficients and data predict values applying linear equation
// This function can be used to test with data that was not used in training
// c[] is the number of the columns in the file not the indexFeatures
public void predictValues(String fileName, int indVariable, int[] c, boolean interceptTerm, int rowIni, int rowEnd) {
try {
BufferedReader reader = new BufferedReader(new FileReader(fileName));
Matrix data = Matrix.read(reader);
reader.close();
int rows = data.getRowDimension() - 1;
int cols = data.getColumnDimension() - 1;
if (rowIni < 0 || rowIni > rows)
throw new RuntimeException("Problem reading file, rowIni=" + rowIni + " and number of rows in file=" + rows);
if (rowEnd < 0 || rowEnd > rows)
throw new RuntimeException("Problem reading file, rowIni=" + rowIni + " and number of rows in file=" + rows);
if (rowIni > rowEnd)
throw new RuntimeException("Problem reading file, rowIni < rowend" + rowIni + " < " + rowEnd);
Matrix indVar = data.getMatrix(rowIni, rowEnd, indVariable, indVariable); // dataVowels(:,0) -> last col is the
// independent variable
data = data.getMatrix(rowIni, rowEnd, c); // the dependent variables correspond to the column indices in c
int numCoeff;
if (interceptTerm)
numCoeff = c.length + 1;
else
numCoeff = c.length;
if (b != null) {
if (b.getRowDimension() == numCoeff) {
if (interceptTerm) { // first column of X is filled with 1s if b_0 != 0
int row = data.getRowDimension();
int col = data.getColumnDimension();
Matrix B = new Matrix(row, col + 1);
Matrix ones = new Matrix(row, 1);
for (int i = 0; i < row; i++)
ones.set(i, 0, 1.0);
B.setMatrix(0, row - 1, 0, 0, ones);
B.setMatrix(0, row - 1, 1, col, data);
data = B;
}
// Residuals:
Matrix r = data.times(b).minus(indVar);
residuals = r.getColumnPackedCopy();
// root mean square error (RMSE)
rmse = Math.sqrt(MathUtils.sumSquared(residuals) / residuals.length);
// Predicted values
Matrix p = data.times(b);
predictedValues = p.getColumnPackedCopy();
for (int i = 0; i < predictedValues.length; i++)
if (predictedValues[i] < 0.0)
System.out.println("*** WARNING predictedValue < 0.0 : predictedValues[" + i + "]="
+ predictedValues[i]);
// Correlation between original values and predicted ones
correlation = MathUtils.correlation(predictedValues, indVar.getColumnPackedCopy());
System.out.println("Correlation predicted values and real: " + correlation);
System.out.println("RMSE (root mean square error): " + rmse);
} else {
throw new RuntimeException("Number of columns of data is not the same as number of coeficients");
}
} else {
throw new RuntimeException("Regression coefficients are not loaded");
}
} catch (Exception e) {
throw new RuntimeException("Problem reading file " + fileName, e);
}
}
public static void main(String[] args) throws Exception {
Regression reg = new Regression();
double[] yvals = { 25.5, 31.2, 25.9, 38.4, 18.4, 26.7, 26.4, 25.9, 32.0, 25.2, 39.7, 35.7, 26.5 };
double[][] xvals = { { 1.74, 5.30, 10.8 }, { 6.32, 5.42, 9.4 }, { 6.22, 8.41, 7.2 }, { 10.52, 4.63, 8.5 },
{ 1.19, 11.60, 9.4 }, { 1.22, 5.85, 9.9 }, { 4.10, 6.62, 8.0 }, { 6.32, 8.72, 9.1 }, { 4.08, 4.42, 8.7 },
{ 4.15, 7.60, 9.2 }, { 10.15, 4.83, 9.4 }, { 1.72, 3.12, 7.6 }, { 1.70, 5.30, 8.2 } };
Matrix A = new Matrix(xvals);
int row = A.getRowDimension();
int col = A.getColumnDimension();
A.print(row, 3);
Matrix B = new Matrix(row, col + 1);
Matrix ones = new Matrix(row, 1);
for (int i = 0; i < row; i++)
ones.set(i, 0, 1.0);
B.setMatrix(0, row - 1, 0, 0, ones);
B.setMatrix(0, row - 1, 1, col, A);
B.print(row, 3);
boolean interceptTerm = true;
double coeffs[] = reg.multipleLinearRegression(yvals, xvals, interceptTerm);
reg.printCoefficients();
Vector<Double> y = new Vector<Double>();
Vector<Double> x = new Vector<Double>();
Vector<Double> data = new Vector<Double>();
double array[] = new double[13 * 4];
int cols = 3;
int rows = yvals.length;
int n = 0;
for (int i = 0; i < rows; i++) {
y.add(yvals[i]);
data.add(yvals[i]);
array[n++] = yvals[i];
for (int j = 0; j < cols; j++) {
x.add(xvals[i][j]);
data.add(xvals[i][j]);
array[n++] = xvals[i][j];
}
}
System.out.println("Vectors y and x:");
coeffs = reg.multipleLinearRegression(y, x, rows, cols, interceptTerm);
reg.printCoefficients();
// All the data in only one Vector<Double>
cols = 4; // because includes the dependent variable
rows = yvals.length;
System.out.println("Vector<> data:");
coeffs = reg.multipleLinearRegression(data, rows, cols, interceptTerm);
reg.printCoefficients();
// array
System.out.println("Vector array [] data:");
coeffs = reg.multipleLinearRegression(array, rows, cols, interceptTerm);
reg.printCoefficients();
}
}