/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.features.selection; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeWeights; import com.rapidminer.example.Attributes; import com.rapidminer.example.ExampleSet; import com.rapidminer.operator.MissingIOObjectException; import com.rapidminer.operator.OperatorChain; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.ValueDouble; import com.rapidminer.operator.ValueString; import com.rapidminer.operator.performance.PerformanceCriterion; import com.rapidminer.operator.performance.PerformanceVector; import com.rapidminer.operator.ports.InputPort; import com.rapidminer.operator.ports.OutputPort; import com.rapidminer.operator.ports.metadata.SubprocessTransformRule; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.conditions.BooleanParameterCondition; import com.rapidminer.parameter.conditions.EqualTypeCondition; import com.rapidminer.tools.math.AnovaCalculator; import com.rapidminer.tools.math.SignificanceCalculationException; import com.rapidminer.tools.math.SignificanceTestResult; /** * This operator starts with the full set of attributes and, in each round, it removes each remaining attribute of the * given set of examples. For each removed attribute, the performance is estimated using inner operators, e.g. a * cross-validation. Only the attribute giving the least decrease of performance is finally removed from the selection. Then a new * round is started with the modified selection. This implementation will avoid any additional memory consumption beside * the memory used originally for storing the data and the memory which might be needed for applying the inner * operators. * A parameter specifies when the iteration will be aborted. There are three different behaviors possible: * <ul> * <li><b>with decrease</b>runs as long as there is any increase in performance</li> * <li><b>with decrease of more than</b>runs as long as the decrease is less than the specified threshold, either relative or absolute.</li> * <li><b>with significant decrease</b> stops as soon as the decrease is significant to the specified level.</li> * </ul> * * The parameter speculative_rounds defines how many rounds will be performed in a row, after a first time the stopping * criterion was fulfilled. If the performance increases again during the speculative rounds, the elimination will be continued. * Otherwise all additionally eliminated attributes will be restored, as if no speculative rounds would have been executed. * This might help to avoid getting stuck in local optima. * * The operator provides a value for logging the performance in each round using a ProcessLog. * * @author Sebastian Land * */ public class BackwardAttributeEliminationOperator extends OperatorChain { public static final String PARAMETER_STOPPING_BEHAVIOR = "stopping_behavior"; public static final String PARAMETER_MAX_ATTRIBUTES = "maximal_number_of_eliminations"; public static final String PARAMETER_MAX_RELATIVE_DECREASE = "maximal_relative_decrease"; public static final String PARAMETER_MAX_ABSOLUT_DECREASE = "maximal_absolute_decrease"; public static final String PARAMETER_USE_RELATIVE_DECREASE = "use_relative_decrease"; public static final String PARAMETER_ALPHA = "alpha"; public static final String PARAMETER_ALLOWED_CONSECUTIVE_FAILS = "speculative_rounds"; public static final String[] STOPPING_BEHAVIORS = new String[] { "with decrease", "with decrease of more than", "with significant decrease" }; public static final int WITH_DECREASE = 0; public static final int WITH_DECREASE_EXCEEDS = 1; public static final int WITH_DECREASE_SIGNIFICANT = 2; private double currentNumberOfFeatures = 0; private Attributes currentAttributes; private InputPort exampleSetInput = getInputPorts().createPort("example set", ExampleSet.class); private OutputPort innerExampleSetSource = getSubprocess(0).getInnerSources().createPort("example set"); private InputPort innerPerformanceSink = getSubprocess(0).getInnerSinks().createPort("performance", PerformanceVector.class); private OutputPort exampleSetOutput = getOutputPorts().createPort("example set"); private OutputPort weightsOutput = getOutputPorts().createPort("attribute weights"); private OutputPort performanceOutput = getOutputPorts().createPort("performance"); public BackwardAttributeEliminationOperator(OperatorDescription description) { super(description, "Learning Process"); getTransformer().addPassThroughRule(exampleSetInput, innerExampleSetSource); getTransformer().addRule(new SubprocessTransformRule(getSubprocess(0))); getTransformer().addPassThroughRule(exampleSetInput, exampleSetOutput); getTransformer().addGenerationRule(performanceOutput, PerformanceVector.class); getTransformer().addGenerationRule(weightsOutput, AttributeWeights.class); addValue(new ValueDouble("number of attributes", "The current number of attributes.") { @Override public double getDoubleValue() { return currentNumberOfFeatures; } }); addValue(new ValueString("feature_names", "A comma separated list of all features of this round.") { @Override public String getStringValue() { if (currentAttributes == null) return "This logging value is only available during execution of this operator's inner subprocess."; StringBuffer buffer = new StringBuffer(); for (Attribute attribute: currentAttributes) { if (buffer.length() > 0) buffer.append(", "); buffer.append(attribute.getName()); } return buffer.toString(); } }); } @Override public void doWork() throws OperatorException { ExampleSet exampleSetOriginal = exampleSetInput.getData(); ExampleSet exampleSet = (ExampleSet) exampleSetOriginal.clone(); int numberOfAttributes = exampleSet.getAttributes().size(); Attributes attributes = exampleSet.getAttributes(); int maxNumberOfAttributes = Math.min(getParameterAsInt(PARAMETER_MAX_ATTRIBUTES), numberOfAttributes - 1); int maxNumberOfFails = getParameterAsInt(PARAMETER_ALLOWED_CONSECUTIVE_FAILS); int behavior = getParameterAsInt(PARAMETER_STOPPING_BEHAVIOR); boolean useRelativeIncrease = (behavior == WITH_DECREASE_EXCEEDS) ? getParameterAsBoolean(PARAMETER_USE_RELATIVE_DECREASE) : false; double maximalDecrease = 0; if (useRelativeIncrease) maximalDecrease = useRelativeIncrease ? getParameterAsDouble(PARAMETER_MAX_RELATIVE_DECREASE) : getParameterAsDouble(PARAMETER_MAX_ABSOLUT_DECREASE); double alpha = (behavior == WITH_DECREASE_SIGNIFICANT)? getParameterAsDouble(PARAMETER_ALPHA) : 0d; // remembering attributes and removing all from example set Attribute[] attributeArray = new Attribute[numberOfAttributes]; int i = 0; Iterator<Attribute> iterator = attributes.iterator(); while (iterator.hasNext()) { Attribute attribute = iterator.next(); attributeArray[i] = attribute; i++; } boolean[] selected = new boolean[numberOfAttributes]; Arrays.fill(selected, true); boolean earlyAbort = false; List<Integer> speculativeList = new ArrayList<Integer>(maxNumberOfFails); int numberOfFails = maxNumberOfFails; currentNumberOfFeatures = numberOfAttributes; currentAttributes = attributes; PerformanceVector lastPerformance = getPerformance(exampleSet); PerformanceVector bestPerformanceEver = lastPerformance; for (i = 0; i < maxNumberOfAttributes && !earlyAbort; i++) { // setting values for logging currentNumberOfFeatures = numberOfAttributes - i - 1; // performing a round int bestIndex = 0; PerformanceVector currentBestPerformance = null; for (int current = 0; current < numberOfAttributes; current++) { if (selected[current]) { // switching off attributes.remove(attributeArray[current]); currentAttributes = attributes; // evaluate performance PerformanceVector performance = getPerformance(exampleSet); if (currentBestPerformance == null || performance.compareTo(currentBestPerformance) > 0) { bestIndex = current; currentBestPerformance = performance; } // switching on attributes.addRegular(attributeArray[current]); currentAttributes = null; // removing reference } } double currentFitness = currentBestPerformance.getMainCriterion().getFitness(); if (i != 0) { double lastFitness = lastPerformance.getMainCriterion().getFitness(); // switch stopping behavior switch (behavior) { case WITH_DECREASE: if (lastFitness >= currentFitness) earlyAbort = true; break; case WITH_DECREASE_EXCEEDS: if (useRelativeIncrease) { // relative increase testing if (currentFitness < lastFitness - Math.abs(lastFitness * maximalDecrease)) earlyAbort = true; } else { // absolute increase testing if (currentFitness < lastFitness - maximalDecrease) earlyAbort = true; } break; case WITH_DECREASE_SIGNIFICANT: AnovaCalculator calculator = new AnovaCalculator(); calculator.setAlpha(alpha); PerformanceCriterion pc = currentBestPerformance.getMainCriterion(); calculator.addGroup(pc.getAverageCount(), pc.getAverage(), pc.getVariance()); pc = lastPerformance.getMainCriterion(); calculator.addGroup(pc.getAverageCount(), pc.getAverage(), pc.getVariance()); SignificanceTestResult result; try { result = calculator.performSignificanceTest(); } catch (SignificanceCalculationException e) { throw new UserError(this, 920, e.getMessage()); } if (lastFitness > currentFitness && result.getProbability() < alpha) earlyAbort = true; } } if (earlyAbort) { // check if there are some free tries left if (numberOfFails == 0) { break; } numberOfFails--; speculativeList.add(bestIndex); earlyAbort = false; // needs performance increase compared to better performance of current and last! if (currentBestPerformance.compareTo(lastPerformance) > 0) lastPerformance = currentBestPerformance; } else { // resetting maximal number of fails. numberOfFails = maxNumberOfFails; speculativeList.clear(); lastPerformance = currentBestPerformance; bestPerformanceEver = currentBestPerformance; } // switching best index off attributes.remove(attributeArray[bestIndex]); selected[bestIndex] = false; } // add predictively removed attributes: speculative execution did not yield good result for (Integer removeIndex: speculativeList) { selected[removeIndex] = true; attributes.addRegular(attributeArray[removeIndex]); } AttributeWeights weights = new AttributeWeights(); i = 0; for (Attribute attribute : attributeArray) { if (selected[i]) weights.setWeight(attribute.getName(), 1d); else weights.setWeight(attribute.getName(), 0d); i++; } exampleSetOutput.deliver(exampleSet); performanceOutput.deliver(bestPerformanceEver); weightsOutput.deliver(weights); } private PerformanceVector getPerformance(ExampleSet exampleSet) throws OperatorException, MissingIOObjectException { innerExampleSetSource.deliver(exampleSet); getSubprocess(0).execute(); return innerPerformanceSink.getData(); } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeInt(PARAMETER_MAX_ATTRIBUTES, "The maximal number of backward eliminations. Hence the resulting number of attributes is maximal reduced by this number.", 1, Integer.MAX_VALUE, 10); type.setExpert(false); types.add(type); type = new ParameterTypeInt(PARAMETER_ALLOWED_CONSECUTIVE_FAILS, "Defines the number of times, the stopping criterion might be consecutivly ignored before the elimination is actually stopped. A number higher than one might help not to stack in the local optima.", 0, Integer.MAX_VALUE, 0); type.setExpert(false); types.add(type); type = new ParameterTypeCategory(PARAMETER_STOPPING_BEHAVIOR, "Defines on what criterias the elimination is stopped.", STOPPING_BEHAVIORS, 0); type.setExpert(false); types.add(type); type = new ParameterTypeBoolean(PARAMETER_USE_RELATIVE_DECREASE, "If checked, the relative performance decrease will be used as stopping criterion.", true); type.setExpert(false); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_STOPPING_BEHAVIOR, STOPPING_BEHAVIORS,false, WITH_DECREASE_EXCEEDS)); types.add(type); type = new ParameterTypeDouble(PARAMETER_MAX_ABSOLUT_DECREASE, "If the absolut performance decrease to the last step exceeds this threshold, the selection will be stopped.", Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, true); type.setExpert(false); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_RELATIVE_DECREASE, true, false)); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_STOPPING_BEHAVIOR, STOPPING_BEHAVIORS, false, WITH_DECREASE_EXCEEDS)); types.add(type); type = new ParameterTypeDouble(PARAMETER_MAX_RELATIVE_DECREASE, "If the relative performance decrease to the last step exceeds this threshold, the selection will be stopped.", -1d, 1d, true); type.setExpert(false); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_RELATIVE_DECREASE, true, true)); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_STOPPING_BEHAVIOR, STOPPING_BEHAVIORS, false, WITH_DECREASE_EXCEEDS)); types.add(type); type = new ParameterTypeDouble(PARAMETER_ALPHA, "The probability threshold which determines if differences are considered as significant.", 0.0d, 1.0d, 0.05d); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_STOPPING_BEHAVIOR, STOPPING_BEHAVIORS, true, WITH_DECREASE_SIGNIFICANT)); types.add(type); return types; } }