/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.learner.functions.linear;
import java.util.LinkedList;
import java.util.List;
import org.apache.commons.math3.distribution.FDistribution;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.UndefinedParameterError;
/**
 * This implements an iterative t-test based selection. First a forward selection is run and all
 * attributes for which the null hypothesis is rejected at the given significance level are
 * selected. On this set a backward selection is performed, and all attributes for which the null
 * hypothesis cannot be rejected in combination are dropped. The next round then starts with the
 * remaining attributes, until there is no further change or the maximal number of rounds is
 * exceeded.
 *
 * @author Sebastian Land
 */
public class IterativeTTestLinearRegressionMethod extends TTestLinearRegressionMethod {
/** Parameter key: the maximal number of forward/backward selection rounds. */
public static final String PARAMETER_MAX_ITERATIONS = "max_iterations";
/** Parameter key: alpha level of the t-test used for selecting attributes in the forward pass. */
public static final String PARAMETER_FORWARD_SELECTION_THRESHOLD = "forward_alpha";
/** Parameter key: alpha level of the t-test used for deselecting attributes in the backward pass. */
public static final String PARAMETER_BACKWARD_SELECTION_THRESHOLD = "backward_alpha";
/**
 * Performs the iterative t-test based attribute selection. Each round consists of a forward pass
 * that tentatively switches on every allowed-but-unused attribute on its own and remembers those
 * whose coefficient is significant, followed by a backward pass (the superclass'
 * {@code filterByPValue}) that drops attributes which are not significant in combination. Rounds
 * repeat until the selection stops changing or the maximal number of iterations is reached; the
 * final model is then fitted on the surviving attributes.
 *
 * @param isUsedAttribute the initial selection mask; it also defines the pool of attributes that
 *        may ever be selected (attributes outside this mask are never added)
 * @return the regression result (selection mask, coefficients, squared error) on the final
 *         attribute set
 * @throws UndefinedParameterError if one of this method's three parameters is undefined
 * @throws ProcessStoppedException if the process is stopped during a regression run
 */
@Override
public LinearRegressionResult applyMethod(LinearRegression regression, boolean useBias, double ridge,
ExampleSet exampleSet, boolean[] isUsedAttribute, int numberOfExamples, int numberOfUsedAttributes,
double[] means, double labelMean, double[] standardDeviations, double labelStandardDeviation,
double[] coefficientsOnFullData, double errorOnFullData) throws UndefinedParameterError, ProcessStoppedException {
int maxIterations = regression.getParameterAsInt(PARAMETER_MAX_ITERATIONS);
double alphaForward = regression.getParameterAsDouble(PARAMETER_FORWARD_SELECTION_THRESHOLD);
double alphaBackward = regression.getParameterAsDouble(PARAMETER_BACKWARD_SELECTION_THRESHOLD);
FDistribution fdistribution;
// check if the F-distribution can be calculated: it needs a positive second degree of freedom,
// i.e. more examples than coefficients on the full data. Otherwise fdistribution stays null,
// no p-values can be computed, and the forward pass below never marks an attribute for selection.
int secondDegreeOfFreedom = exampleSet.size() - coefficientsOnFullData.length;
if (secondDegreeOfFreedom > 0) {
fdistribution = new FDistribution(1, secondDegreeOfFreedom);
} else {
fdistribution = null;
}
// squared correlation (R^2) of the regression on the full attribute set; passed unchanged to
// every getPValue call below
double generalCorrelation = regression.getCorrelation(exampleSet, isUsedAttribute, coefficientsOnFullData, useBias);
generalCorrelation *= generalCorrelation;
// building data structures
// isAllowedToUse aliases the caller's mask and defines the candidate pool; isUsedAttribute is
// then replaced by a fresh all-false working copy, so the selection is built up from scratch
boolean[] isAllowedToUse = isUsedAttribute;
// initialize array for checking for change
boolean[] isLastRoundUsed = new boolean[isUsedAttribute.length];
boolean[] isToUseNextRound = new boolean[isUsedAttribute.length];
isUsedAttribute = new boolean[isUsedAttribute.length];
// do until nothing changes or max rounds exceeded
int iteration = 0;
// the body always runs at least once (iteration == 0); afterwards it continues only while below
// maxIterations AND the previous round actually changed the selection
while (iteration == 0 || iteration < maxIterations && isSelectionDiffering(isUsedAttribute, isLastRoundUsed)) {
System.arraycopy(isUsedAttribute, 0, isLastRoundUsed, 0, isUsedAttribute.length);
// first do forward selection for all single non-selected and
// allowed attributes
// coefficientIndex tracks the position attribute i's coefficient would take in the array
// returned by performRegression (assumes coefficients are packed in attribute order over the
// selected attributes -- consistent with the bookkeeping in the else-if branch below)
int coefficientIndex = 0;
for (int i = 0; i < isAllowedToUse.length; i++) {
if (isAllowedToUse[i] && !isUsedAttribute[i]) {
// check if this not selected one will receive significant coefficient
// temporarily switch attribute i on, refit, and test only its coefficient
isUsedAttribute[i] = true;
double[] coefficients = regression.performRegression(exampleSet, isUsedAttribute, means, labelMean,
ridge, useBias);
// only if it is possible to calculate the probabilities, the p-value for this
// attribute is checked
if (fdistribution != null) {
double pValue = getPValue(coefficients[coefficientIndex], i, regression, useBias, ridge, exampleSet,
isUsedAttribute, standardDeviations, labelStandardDeviation, fdistribution,
generalCorrelation);
// NOTE(review): getPValue (superclass, not visible here) appears to return a
// cumulative probability, hence 1 - value is compared against alpha -- confirm
if (1.0d - pValue <= alphaForward) {
isToUseNextRound[i] = true;
}
}
// restore the mask so every remaining candidate is also tested in isolation
isUsedAttribute[i] = false;
} else if (isUsedAttribute[i]) {
coefficientIndex++;
}
}
// now add all that we have remembered to use
for (int i = 0; i < isUsedAttribute.length; i++) {
isUsedAttribute[i] |= isToUseNextRound[i];
isToUseNextRound[i] = false; // reset the buffer for the next round
}
// now we have to deselect all that do not fulfill t-test in combination
{
double[] coefficients = regression.performRegression(exampleSet, isUsedAttribute, means, labelMean, ridge,
useBias);
isUsedAttribute = filterByPValue(regression, useBias, ridge, exampleSet, isUsedAttribute, means, labelMean,
standardDeviations, labelStandardDeviation, coefficients, alphaBackward).isUsedAttribute;
}
iteration++;
}
// calculate result
// fit the final model on the surviving attributes and report its squared error
LinearRegressionResult result = new LinearRegressionResult();
result.isUsedAttribute = isUsedAttribute;
result.coefficients = regression.performRegression(exampleSet, isUsedAttribute, means, labelMean, ridge, useBias);
result.error = regression.getSquaredError(exampleSet, isUsedAttribute, result.coefficients, useBias);
return result;
}
/**
 * Checks whether two selection masks differ in at least one position.
 *
 * @param isUsedAttribute the current selection mask
 * @param isLastRoundUsed the selection mask of the previous round (same length)
 * @return {@code true} if any entry differs, {@code false} otherwise
 */
private boolean isSelectionDiffering(boolean[] isUsedAttribute, boolean[] isLastRoundUsed) {
for (int i = 0; i < isUsedAttribute.length; i++) {
if (isUsedAttribute[i] != isLastRoundUsed[i]) {
return true;
}
}
return false;
}
/**
 * Returns the parameter types of this method: the maximal number of selection rounds and the two
 * alpha levels for the forward and backward pass.
 * NOTE(review): super.getParameterTypes() is deliberately not included here -- presumably the
 * superclass' single threshold is replaced by the two thresholds of this method; confirm against
 * {@code TTestLinearRegressionMethod}.
 */
@Override
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = new LinkedList<ParameterType>();
types.add(new ParameterTypeInt(PARAMETER_MAX_ITERATIONS, "The maximal number of rounds for iterative selection.", 1,
Integer.MAX_VALUE, 10));
types.add(new ParameterTypeDouble(PARAMETER_FORWARD_SELECTION_THRESHOLD,
"This is the alpha level for the used t-test for selecting attributes.", 0, 1, 0.05));
types.add(new ParameterTypeDouble(PARAMETER_BACKWARD_SELECTION_THRESHOLD,
"This is the alpha level for the used t-test for deselecting attributes.", 0, 1, 0.05));
return types;
}
}