/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.learner.functions;
import java.util.Iterator;
import java.util.List;
import Jama.Matrix;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Statistics;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.learner.AbstractLearner;
import com.rapidminer.operator.learner.LearnerCapability;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.Ontology;
/**
 * <p>This operator calculates a linear regression model. It uses the Akaike criterion
 * for model selection.</p>
 *
 * <p>Only numerical attributes whose standard deviation is not zero take part in the
 * regression. A nominal label with exactly two values is replaced by a temporary
 * real-valued label (0 for the negative class, 1 for the positive class) while the
 * model is learned. Depending on the parameters, attributes with a large standardized
 * coefficient are removed first, followed by an optional feature selection
 * ("M5 prime" or "greedy") which is driven by the Akaike criterion.</p>
 *
 * @author Ingo Mierswa
 * @version $Id: LinearRegression.java,v 1.10 2008/09/04 17:54:09 ingomierswa Exp $
 */
public class LinearRegression extends AbstractLearner {

    /** The parameter name for "The feature selection method used during regression." */
    public static final String PARAMETER_FEATURE_SELECTION = "feature_selection";

    /** The parameter name for "Indicates if the algorithm should try to delete colinear features during the regression." */
    public static final String PARAMETER_ELIMINATE_COLINEAR_FEATURES = "eliminate_colinear_features";

    /** The parameter name for "The minimum standardized coefficient for the removal of colinear feature elimination." */
    public static final String PARAMETER_MIN_STANDARDIZED_COEFFICIENT = "min_standardized_coefficient";

    /** The parameter name for "The ridge parameter used during ridge regression." */
    public static final String PARAMETER_RIDGE = "ridge";

    /** Attribute selection methods. The array index of each name is the matching selection constant below. */
    public static final String[] FEATURE_SELECTION_METHODS = {
        "none",
        "M5 prime",
        "greedy"
    };

    /** Attribute selection method: No attribute selection */
    public static final int NO_SELECTION = 0;

    /** Attribute selection method: M5 method */
    public static final int M5_PRIME = 1;

    /** Attribute selection method: Greedy method */
    public static final int GREEDY = 2;

    /** Creates the operator from its description. */
    public LinearRegression(OperatorDescription description) {
        super(description);
    }

    /**
     * Learns a linear regression model on the given example set.
     *
     * <p>If the label is nominal with exactly two values, a temporary real-valued label
     * is attached to the example set for the duration of this call. The temporary label
     * is removed and the original label restored before this method returns, even if
     * fitting the model fails with an exception.</p>
     *
     * @param exampleSet the training data; its attribute statistics are recalculated
     * @return a {@link LinearRegressionModel} built from the selected attributes and coefficients
     * @throws OperatorException if a parameter cannot be read or the learning fails
     */
    public Model learn(ExampleSet exampleSet) throws OperatorException {
        Attribute label = exampleSet.getAttributes().getLabel();
        Attribute workingLabel = label;
        boolean cleanUpLabel = false;
        String firstClassName = null;
        String secondClassName = null;

        if (label.isNominal() && (label.getMapping().size() == 2)) {
            firstClassName = label.getMapping().getNegativeString();
            secondClassName = label.getMapping().getPositiveString();
            workingLabel = installBinaryWorkingLabel(exampleSet, label);
            cleanUpLabel = true;
        }

        // one flag per regular attribute, in iteration order of exampleSet.getAttributes()
        boolean[] attributeSelection = new boolean[exampleSet.getAttributes().size()];
        double[] coefficients;
        try {
            coefficients = fitCoefficients(exampleSet, workingLabel, attributeSelection);
        } finally {
            // always restore the caller's example set, also if the regression failed
            if (cleanUpLabel) {
                exampleSet.getAttributes().remove(workingLabel);
                exampleSet.getExampleTable().removeAttribute(workingLabel);
                exampleSet.getAttributes().setLabel(label);
            }
        }
        return new LinearRegressionModel(exampleSet, attributeSelection, coefficients, firstClassName, secondClassName);
    }

    /**
     * Adds a real-valued attribute named "regression_label" to the example table, fills it
     * with 0 for examples of the negative class and 1 for all other examples, and makes it
     * the label of the example set.
     *
     * @return the newly created label attribute
     */
    private Attribute installBinaryWorkingLabel(ExampleSet exampleSet, Attribute label) throws OperatorException {
        int negativeIndex = label.getMapping().getNegativeIndex();
        Attribute binaryLabel = AttributeFactory.createAttribute("regression_label", Ontology.REAL);
        exampleSet.getExampleTable().addAttribute(binaryLabel);
        for (Example example : exampleSet) {
            double index = example.getValue(label);
            example.setValue(binaryLabel, (index == negativeIndex) ? 0.0d : 1.0d);
        }
        exampleSet.getAttributes().setLabel(binaryLabel);
        return binaryLabel;
    }

    /**
     * Performs the actual learning: initial attribute filtering, the optional removal of
     * attributes with high standardized coefficients, and the configured feature selection.
     *
     * @param exampleSet the training data whose label is {@code workingLabel}
     * @param workingLabel the numerical label used for the regression
     * @param attributeSelection output array with one entry per regular attribute; on return
     *        it marks the attributes used by the returned coefficients
     * @return the coefficients of the selected attributes followed by the intercept
     */
    private double[] fitCoefficients(ExampleSet exampleSet, Attribute workingLabel, boolean[] attributeSelection) throws OperatorException {
        int numberOfAttributes = attributeSelection.length;

        // compute and store statistics; start with all numerical attributes and
        // turn off those with std. dev. = 0
        exampleSet.recalculateAllAttributeStatistics();
        double[] means = new double[numberOfAttributes];
        double[] standardDeviations = new double[numberOfAttributes];
        int counter = 0;
        for (Attribute attribute : exampleSet.getAttributes()) {
            attributeSelection[counter] = attribute.isNumerical();
            if (attributeSelection[counter]) {
                means[counter] = exampleSet.getStatistics(attribute, Statistics.AVERAGE);
                standardDeviations[counter] = Math.sqrt(exampleSet.getStatistics(attribute, Statistics.VARIANCE));
                if (standardDeviations[counter] == 0) {
                    attributeSelection[counter] = false;
                }
            }
            counter++;
        }

        double labelMean = exampleSet.getStatistics(workingLabel, Statistics.AVERAGE);
        double classStandardDeviation = Math.sqrt(exampleSet.getStatistics(workingLabel, Statistics.VARIANCE));
        int numberOfExamples = exampleSet.size();

        // perform a regression and remove colinear attributes
        boolean eliminateColinearFeatures = getParameterAsBoolean(PARAMETER_ELIMINATE_COLINEAR_FEATURES);
        double[] coefficients;
        do {
            coefficients = performRegression(exampleSet, attributeSelection, means, labelMean);
        } while (eliminateColinearFeatures
                && deselectAttributeWithHighestCoefficient(attributeSelection, coefficients, standardDeviations, classStandardDeviation));

        // number of currently selected attributes + 1 (for the intercept)
        int fullModelSize = countSelected(attributeSelection) + 1;
        double fullError = getSquaredError(exampleSet, attributeSelection, coefficients);

        switch (getParameterAsInt(PARAMETER_FEATURE_SELECTION)) {
            case GREEDY:
                coefficients = selectGreedy(exampleSet, attributeSelection, coefficients, means, labelMean,
                        fullError, numberOfExamples, fullModelSize);
                break;
            case M5_PRIME:
                coefficients = selectM5Prime(exampleSet, attributeSelection, coefficients, means, standardDeviations,
                        labelMean, classStandardDeviation, fullError, numberOfExamples, fullModelSize);
                break;
            case NO_SELECTION:
            default:
                break;
        }
        return coefficients;
    }

    /**
     * Greedy backward elimination. In every round each selected attribute is removed on
     * trial; the removal with the best Akaike value is kept if it improves on the best
     * value seen so far. Stops after the first round without improvement.
     *
     * @param attributeSelection the current selection; updated in place
     * @param coefficients the coefficients belonging to the current selection
     * @param fullError the squared error of the model before the selection started
     * @param fullModelSize number of attributes (plus one) before the selection started
     * @return the coefficients belonging to the final selection
     */
    private double[] selectGreedy(ExampleSet exampleSet, boolean[] attributeSelection, double[] coefficients,
            double[] means, double labelMean, double fullError, int numberOfExamples, int fullModelSize) throws OperatorException {
        double akaike = (numberOfExamples - fullModelSize) + 2 * fullModelSize;
        int candidateModelSize = fullModelSize;
        boolean improved;
        do {
            // the trial removals of one round all start from the selection at the round's beginning
            boolean[] candidate = attributeSelection.clone();
            improved = false;
            candidateModelSize--;
            for (int i = 0; i < attributeSelection.length; i++) {
                if (!candidate[i]) {
                    continue;
                }
                // calculate the akaike value without this attribute
                candidate[i] = false;
                double[] candidateCoefficients = performRegression(exampleSet, candidate, means, labelMean);
                double candidateError = getSquaredError(exampleSet, candidate, candidateCoefficients);
                double candidateAkaike = candidateError / fullError * (numberOfExamples - fullModelSize) + 2 * candidateModelSize;
                // if the value is improved compared to the current best
                if (candidateAkaike < akaike) {
                    improved = true;
                    akaike = candidateAkaike;
                    System.arraycopy(candidate, 0, attributeSelection, 0, attributeSelection.length);
                    coefficients = candidateCoefficients;
                }
                candidate[i] = true;
            }
        } while (improved);
        return coefficients;
    }

    /**
     * Attribute removal as in M5 prime. Repeatedly removes the selected attribute with the
     * smallest standardized coefficient as long as this improves the Akaike value.
     *
     * @param attributeSelection the current selection; updated in place
     * @param coefficients the coefficients belonging to the current selection
     * @param fullError the squared error of the model before the selection started
     * @param fullModelSize number of attributes (plus one) before the selection started
     * @return the coefficients belonging to the final selection
     */
    private double[] selectM5Prime(ExampleSet exampleSet, boolean[] attributeSelection, double[] coefficients,
            double[] means, double[] standardDeviations, double labelMean, double classStandardDeviation,
            double fullError, int numberOfExamples, int fullModelSize) throws OperatorException {
        double akaike = (numberOfExamples - fullModelSize) + 2 * fullModelSize;
        int candidateModelSize = fullModelSize;
        boolean improved;
        do {
            improved = false;
            candidateModelSize--;

            // find the attribute with the smallest standardized coefficient
            double minStandardizedCoefficient = 0;
            int attribute2Deselect = -1;
            int coefficientIndex = 0;
            for (int i = 0; i < attributeSelection.length; i++) {
                if (!attributeSelection[i]) {
                    continue;
                }
                double current = standardizedCoefficient(coefficients[coefficientIndex], standardDeviations[i], classStandardDeviation);
                if ((coefficientIndex == 0) || (current < minStandardizedCoefficient)) {
                    minStandardizedCoefficient = current;
                    attribute2Deselect = i;
                }
                coefficientIndex++;
            }

            // check if removing this attribute improves Akaike
            if (attribute2Deselect >= 0) {
                attributeSelection[attribute2Deselect] = false;
                double[] candidateCoefficients = performRegression(exampleSet, attributeSelection, means, labelMean);
                double candidateError = getSquaredError(exampleSet, attributeSelection, candidateCoefficients);
                double candidateAkaike = candidateError / fullError * (numberOfExamples - fullModelSize) + 2 * candidateModelSize;
                if (candidateAkaike < akaike) {
                    improved = true;
                    akaike = candidateAkaike;
                    coefficients = candidateCoefficients;
                } else {
                    // no improvement: undo the removal
                    attributeSelection[attribute2Deselect] = true;
                }
            }
        } while (improved);
        return coefficients;
    }

    /** Returns the number of <code>true</code> entries of the given selection. */
    private static int countSelected(boolean[] selectedAttributes) {
        int selected = 0;
        for (int i = 0; i < selectedAttributes.length; i++) {
            if (selectedAttributes[i]) {
                selected++;
            }
        }
        return selected;
    }

    /** Returns the absolute value of the coefficient scaled by the ratio of attribute and label standard deviation. */
    private static double standardizedCoefficient(double coefficient, double standardDeviation, double classStandardDeviation) {
        return Math.abs(coefficient * standardDeviation / classStandardDeviation);
    }

    /** This method removes the attribute with the highest standardized coefficient
     *  greater than the minimum coefficient parameter. Checks only those attributes
     *  which are currently selected. Returns true if an attribute was actually
     *  deselected and false otherwise. */
    private boolean deselectAttributeWithHighestCoefficient(boolean[] selectedAttributes, double[] coefficients, double[] standardDeviations, double classStandardDeviation) throws UndefinedParameterError {
        // starts at the parameter value, so only coefficients above the threshold qualify
        double highestCoefficient = getParameterAsDouble(PARAMETER_MIN_STANDARDIZED_COEFFICIENT);
        int attribute2Deselect = -1;
        int coefficientIndex = 0;
        for (int i = 0; i < selectedAttributes.length; i++) {
            if (!selectedAttributes[i]) {
                continue;
            }
            double current = standardizedCoefficient(coefficients[coefficientIndex], standardDeviations[i], classStandardDeviation);
            if (current > highestCoefficient) {
                highestCoefficient = current;
                attribute2Deselect = i;
            }
            coefficientIndex++;
        }
        if (attribute2Deselect < 0) {
            return false;
        }
        selectedAttributes[attribute2Deselect] = false;
        return true;
    }

    /** Calculates the squared error of a regression model on the training data,
     *  i.e. the sum of the squared differences between prediction and label. */
    private double getSquaredError(ExampleSet exampleSet, boolean[] selectedAttributes, double[] coefficients) {
        double error = 0;
        for (Example example : exampleSet) {
            double diff = regressionPrediction(example, selectedAttributes, coefficients) - example.getLabel();
            error += diff * diff;
        }
        return error;
    }

    /** Calculates the prediction for the given example: the weighted sum of the values of
     *  the selected attributes plus the intercept, which is stored directly after the
     *  attribute coefficients. */
    private double regressionPrediction(Example example, boolean[] selectedAttributes, double[] coefficients) {
        double prediction = 0;
        int coefficientIndex = 0;
        int attributeIndex = 0;
        for (Attribute attribute : example.getAttributes()) {
            if (selectedAttributes[attributeIndex++]) {
                prediction += coefficients[coefficientIndex] * example.getValue(attribute);
                coefficientIndex++;
            }
        }
        // coefficientIndex now points to the intercept
        prediction += coefficients[coefficientIndex];
        return prediction;
    }

    /** Calculate a linear regression only from the selected attributes. The attribute values
     *  are centered with the given means before the regression; the example weights are read
     *  from the weight attribute if there is one and are 1 otherwise. The method returns the
     *  calculated coefficients: one per selected attribute, followed by the intercept. */
    private double[] performRegression(ExampleSet exampleSet, boolean[] selectedAttributes, double[] means, double labelMean) throws UndefinedParameterError {
        int numberOfSelected = countSelected(selectedAttributes);
        if (numberOfSelected == 0) {
            // without any attribute the model is just the label mean
            return new double[] { labelMean };
        }

        int numberOfExamples = exampleSet.size();
        Matrix independent = new Matrix(numberOfExamples, numberOfSelected);
        Matrix dependent = new Matrix(numberOfExamples, 1);
        double[] weights = new double[numberOfExamples];
        Attribute weightAttribute = exampleSet.getAttributes().getWeight();

        int exampleIndex = 0;
        for (Example example : exampleSet) {
            dependent.set(exampleIndex, 0, example.getLabel());
            int column = 0;
            int attributeIndex = 0;
            for (Attribute attribute : exampleSet.getAttributes()) {
                if (selectedAttributes[attributeIndex]) {
                    independent.set(exampleIndex, column, example.getValue(attribute) - means[attributeIndex]);
                    column++;
                }
                attributeIndex++;
            }
            weights[exampleIndex] = (weightAttribute != null) ? example.getValue(weightAttribute) : 1.0d;
            exampleIndex++;
        }

        double[] coefficientsWithoutIntercept =
            (new com.rapidminer.tools.math.LinearRegression(independent,
                                                            dependent,
                                                            weights,
                                                            getParameterAsDouble(PARAMETER_RIDGE))).getCoefficients();
        double[] coefficients = new double[numberOfSelected + 1];
        System.arraycopy(coefficientsWithoutIntercept, 0, coefficients, 0, numberOfSelected);

        // the attributes were centered, so the intercept is the label mean corrected by the attribute means
        coefficients[numberOfSelected] = labelMean;
        int coefficientIndex = 0;
        for (int i = 0; i < selectedAttributes.length; i++) {
            if (selectedAttributes[i]) {
                coefficients[numberOfSelected] -= coefficients[coefficientIndex] * means[i];
                coefficientIndex++;
            }
        }
        return coefficients;
    }

    /** Returns true for numerical attributes, numerical and binominal labels, and weighted
     *  examples; false for every other capability. */
    public boolean supportsCapability(LearnerCapability lc) {
        return lc.equals(LearnerCapability.NUMERICAL_ATTRIBUTES)
            || lc.equals(LearnerCapability.NUMERICAL_CLASS)
            || lc.equals(LearnerCapability.BINOMINAL_CLASS)
            || lc.equals(LearnerCapability.WEIGHTED_EXAMPLES);
    }

    /** Returns the parameter types of the super class extended by the feature selection method,
     *  the colinear feature elimination switch and its threshold, and the ridge parameter. */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeCategory(PARAMETER_FEATURE_SELECTION, "The feature selection method used during regression.", FEATURE_SELECTION_METHODS, M5_PRIME));
        types.add(new ParameterTypeBoolean(PARAMETER_ELIMINATE_COLINEAR_FEATURES, "Indicates if the algorithm should try to delete colinear features during the regression.", true));
        types.add(new ParameterTypeDouble(PARAMETER_MIN_STANDARDIZED_COEFFICIENT, "The minimum standardized coefficient for the removal of colinear feature elimination.", 0.0d, Double.POSITIVE_INFINITY, 1.5d));
        types.add(new ParameterTypeDouble(PARAMETER_RIDGE, "The ridge parameter used during ridge regression.", 0.0d, Double.POSITIVE_INFINITY, 1.0E-8));
        return types;
    }
}