/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.functions.linear;

import java.util.LinkedList;
import java.util.List;

import org.apache.commons.math3.distribution.FDistribution;

import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.UndefinedParameterError;


/**
 * Attribute selection method for linear regression that is based on a T-Test. All attributes whose
 * coefficient is not significantly different from 0 are filtered out.
 *
 * @author Sebastian Land, Ingo Mierswa
 *
 */
public class TTestLinearRegressionMethod implements LinearRegressionMethod {

    /** Key of the parameter holding the significance level used by the test. */
    public static final String PARAMETER_SIGNIFICANCE_LEVEL = "alpha";

    /**
     * Reads the significance level from the given regression operator and delegates the actual
     * selection to
     * {@link #filterByPValue(LinearRegression, boolean, double, ExampleSet, boolean[], double[], double, double[], double, double[], double)}.
     * The arguments {@code numberOfExamples}, {@code numberOfUsedAttributes} and
     * {@code errorOnFullData} are not used by this method.
     *
     * @return the regression result computed on the attributes that passed the test
     * @throws UndefinedParameterError
     * @throws ProcessStoppedException
     */
    @Override
    public LinearRegressionResult applyMethod(LinearRegression regression, boolean useBias, double ridge,
            ExampleSet exampleSet, boolean[] isUsedAttribute, int numberOfExamples, int numberOfUsedAttributes,
            double[] means, double labelMean, double[] standardDeviations, double labelStandardDeviation,
            double[] coefficientsOnFullData, double errorOnFullData)
            throws UndefinedParameterError, ProcessStoppedException {
        final double significanceLevel = regression.getParameterAsDouble(PARAMETER_SIGNIFICANCE_LEVEL);
        return filterByPValue(regression, useBias, ridge, exampleSet, isUsedAttribute, means, labelMean,
                standardDeviations, labelStandardDeviation, coefficientsOnFullData, significanceLevel);
    }

    /**
     * Filters the selected attributes depending on their p-value with respect to the significance
     * level alpha. The passed {@code isUsedAttribute} array is modified in place and is also the
     * array stored in the returned result. Afterwards the regression is performed again on the
     * remaining attributes.
     *
     * @return the result holding the updated selection, the new coefficients and the squared error
     * @throws UndefinedParameterError
     * @throws ProcessStoppedException
     */
    protected LinearRegressionResult filterByPValue(LinearRegression regression, boolean useBias, double ridge,
            ExampleSet exampleSet, boolean[] isUsedAttribute, double[] means, double labelMean,
            double[] standardDeviations, double labelStandardDeviation, double[] coefficientsOnFullData,
            double alpha) throws UndefinedParameterError, ProcessStoppedException {
        // the F-distribution can only be set up with a positive second degree of freedom
        final int denominatorDegreesOfFreedom = exampleSet.size() - coefficientsOnFullData.length;
        final FDistribution distribution = denominatorDegreesOfFreedom > 0
                ? new FDistribution(1, denominatorDegreesOfFreedom)
                : null;

        final double correlation = regression.getCorrelation(exampleSet, isUsedAttribute, coefficientsOnFullData,
                useBias);
        final double squaredCorrelation = correlation * correlation;

        // position inside coefficientsOnFullData, which is indexed separately from the attributes
        int coefficientPosition = 0;
        for (int attribute = 0; attribute < isUsedAttribute.length; attribute++) {
            if (!isUsedAttribute[attribute]) {
                continue;
            }
            final double coefficient = coefficientsOnFullData[coefficientPosition];

            if (distribution == null) {
                // no probabilities can be calculated, hence the attribute is deselected
                isUsedAttribute[attribute] = false;
                continue;
            }

            final double cumulativeProbability = getPValue(coefficient, attribute, regression, useBias, ridge,
                    exampleSet, isUsedAttribute, standardDeviations, labelStandardDeviation, distribution,
                    squaredCorrelation);
            if (1.0d - cumulativeProbability > alpha) {
                isUsedAttribute[attribute] = false;
            }
            coefficientPosition++;
        }

        final LinearRegressionResult filtered = new LinearRegressionResult();
        filtered.isUsedAttribute = isUsedAttribute;
        filtered.coefficients = regression.performRegression(exampleSet, isUsedAttribute, means, labelMean, ridge,
                useBias);
        filtered.error = regression.getSquaredError(exampleSet, isUsedAttribute, filtered.coefficients, useBias);
        return filtered;
    }

    /**
     * Evaluates the given F-distribution at the squared t statistic of the attributeIndex-th
     * attribute and returns the resulting cumulative probability. Note that
     * {@code filterByPValue} compares one minus this value against the significance level.
     *
     * @param generalCorrelation
     *            the value passed by {@code filterByPValue} is the squared correlation
     * @return the cumulative probability of the squared t statistic
     * @throws UndefinedParameterError
     * @throws ProcessStoppedException
     */
    protected double getPValue(double coefficient, int attributeIndex, LinearRegression regression,
            boolean useBias, double ridge, ExampleSet exampleSet, boolean[] isUsedAttribute,
            double[] standardDeviations, double labelStandardDeviation, FDistribution fdistribution,
            double generalCorrelation) throws UndefinedParameterError, ProcessStoppedException {
        final double tolerance = regression.getTolerance(exampleSet, isUsedAttribute, attributeIndex, ridge,
                useBias);

        final int exampleCount = exampleSet.size();
        final int attributeCount = exampleSet.getAttributes().size();
        final double residualDegreesOfFreedom = exampleCount - attributeCount - 1.0d;

        final double unexplainedShare = 1.0d - generalCorrelation;
        final double root = Math.sqrt(unexplainedShare / (tolerance * residualDegreesOfFreedom));
        final double standardError = root * labelStandardDeviation / standardDeviations[attributeIndex];

        // the squared t statistic is looked up in the supplied F-distribution
        final double tStatistic = coefficient / standardError;
        return fdistribution.cumulativeProbability(tStatistic * tStatistic);
    }

    /**
     * Provides the single parameter of this method: the significance level, a double between 0 and
     * 1 with a default of 0.05.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        final ParameterType significanceLevel = new ParameterTypeDouble(PARAMETER_SIGNIFICANCE_LEVEL,
                "This is the significance level of the t-test.", 0, 1, 0.05);

        final LinkedList<ParameterType> parameterTypes = new LinkedList<ParameterType>();
        parameterTypes.add(significanceLevel);
        return parameterTypes;
    }
}