package edu.princeton.cs.algs4.ch14;
import edu.princeton.cs.introcs.*;
/*************************************************************************
* Compilation: javac LinearRegression.java
* Execution: java LinearRegression
*
* Compute least squares solution to y = beta * x + alpha.
* Simple linear regression.
*
* // TODO: rename beta and alpha to slope and intercept.
*
*************************************************************************/
/**
* The <tt>LinearRegression</tt> class performs a simple linear regression
* on a set of <em>N</em> data points (<em>y<sub>i</sub></em>, <em>x<sub>i</sub></em>).
* That is, it fits a straight line <em>y</em> = α + β <em>x</em>,
* (where <em>y</em> is the response variable, <em>x</em> is the predictor variable,
* α is the <em>y-intercept</em>, and β is the <em>slope</em>)
* that minimizes the sum of squared residuals of the linear regression model.
* It also computes associated statistics, including the coefficient of
* determination <em>R</em><sup>2</sup> and the standard deviation of the
* estimates for the slope and <em>y</em>-intercept.
*
* @author Robert Sedgewick
* @author Kevin Wayne
*/
public class LinearRegression {
    private final int N;                      // number of data points
    private final double alpha, beta;        // y-intercept and slope of best-fit line
    private final double R2;                  // coefficient of determination
    private final double svar, svar0, svar1;  // variance estimates: residual, intercept, slope

    /**
     * Performs a linear regression on the data points <tt>(y[i], x[i])</tt>.
     * @param x the values of the predictor variable
     * @param y the corresponding values of the response variable
     * @throws java.lang.IllegalArgumentException if the lengths of the two arrays are not equal
     */
    public LinearRegression(double[] x, double[] y) {
        if (x.length != y.length) {
            throw new IllegalArgumentException("array lengths are not equal");
        }
        N = x.length;

        // first pass: sample means of x and y (single loop; the former
        // sum-of-squares accumulator sumx2 was never used and is removed)
        double sumx = 0.0, sumy = 0.0;
        for (int i = 0; i < N; i++) {
            sumx += x[i];
            sumy += y[i];
        }
        double xbar = sumx / N;
        double ybar = sumy / N;

        // second pass: centered sums of squares and cross-products
        double xxbar = 0.0, yybar = 0.0, xybar = 0.0;
        for (int i = 0; i < N; i++) {
            xxbar += (x[i] - xbar) * (x[i] - xbar);
            yybar += (y[i] - ybar) * (y[i] - ybar);
            xybar += (x[i] - xbar) * (y[i] - ybar);
        }

        // least-squares estimates: slope = Sxy/Sxx, intercept from the means
        beta  = xybar / xxbar;
        alpha = ybar - beta * xbar;

        // more statistical analysis
        double rss = 0.0;      // residual sum of squares
        double ssr = 0.0;      // regression sum of squares
        for (int i = 0; i < N; i++) {
            double fit = beta*x[i] + alpha;
            rss += (fit - y[i]) * (fit - y[i]);
            ssr += (fit - ybar) * (fit - ybar);
        }

        // NOTE(review): with fewer than 3 points degreesOfFreedom <= 0, so the
        // standard-error estimates come out Infinity/NaN — same as the original.
        int degreesOfFreedom = N - 2;
        R2    = ssr / yybar;
        svar  = rss / degreesOfFreedom;   // unbiased estimate of error variance
        svar1 = svar / xxbar;             // variance of the slope estimate
        svar0 = svar/N + xbar*xbar*svar1; // variance of the intercept estimate
    }

    /**
     * Returns the <em>y</em>-intercept &alpha; of the best-fit line <em>y</em> = &alpha; + &beta; <em>x</em>.
     * @return the <em>y</em>-intercept &alpha; of the best-fit line <em>y = &alpha; + &beta; x</em>
     */
    public double intercept() {
        return alpha;
    }

    /**
     * Returns the slope &beta; of the best-fit line <em>y</em> = &alpha; + &beta; <em>x</em>.
     * @return the slope &beta; of the best-fit line <em>y</em> = &alpha; + &beta; <em>x</em>
     */
    public double slope() {
        return beta;
    }

    /**
     * Returns the coefficient of determination <em>R</em><sup>2</sup>.
     * @return the coefficient of determination <em>R</em><sup>2</sup>, which is a real number between 0 and 1
     */
    public double R2() {
        return R2;
    }

    /**
     * Returns the standard error of the estimate for the intercept.
     * @return the standard error of the estimate for the intercept
     */
    public double interceptStdErr() {
        return Math.sqrt(svar0);
    }

    /**
     * Returns the standard error of the estimate for the slope.
     * @return the standard error of the estimate for the slope
     */
    public double slopeStdErr() {
        return Math.sqrt(svar1);
    }

    /**
     * Returns the expected response <tt>y</tt> given the value of the predictor
     * variable <tt>x</tt>.
     * @param x the value of the predictor variable
     * @return the expected response <tt>y</tt> given the value of the predictor
     *         variable <tt>x</tt>
     */
    public double predict(double x) {
        return beta*x + alpha;
    }

    /**
     * Returns a string representation of the simple linear regression model.
     * @return a string representation of the simple linear regression model,
     *         including the best-fit line and the coefficient of determination <em>R</em><sup>2</sup>
     */
    @Override
    public String toString() {
        // FIX: the model is stated in terms of the predictor variable x
        // (the old format string printed "N", the sample-size field).
        return String.format("%.2f x + %.2f", slope(), intercept())
             + " (R^2 = " + String.format("%.3f", R2()) + ")";
    }
}
/*************************************************************************
* Copyright 2002-2012, Robert Sedgewick and Kevin Wayne.
*
* This file is part of algs4-package.jar, which accompanies the textbook
*
* Algorithms, 4th edition by Robert Sedgewick and Kevin Wayne,
* Addison-Wesley Professional, 2011, ISBN 0-321-57351-X.
* http://algs4.cs.princeton.edu
*
*
* algs4-package.jar is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* algs4-package.jar is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with algs4-package.jar. If not, see http://www.gnu.org/licenses.
*************************************************************************/