/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.features.selection;

import java.util.LinkedList;
import java.util.List;

import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.features.FeatureOperator;
import com.rapidminer.operator.features.Individual;
import com.rapidminer.operator.features.KeepBest;
import com.rapidminer.operator.features.Population;
import com.rapidminer.operator.features.PopulationOperator;
import com.rapidminer.operator.features.RedundanceRemoval;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;


/**
 * <p>
 * This operator realizes the two deterministic greedy feature selection algorithms forward
 * selection and backward elimination. However, we added some enhancements to the standard
 * algorithms which are described below:
 * </p>
 *
 * <h4>Forward Selection</h4>
 * <ol>
 * <li>Create an initial population with {@rapidminer.math n} individuals where {@rapidminer.math n}
 * is the input example set's number of attributes. Each individual will use exactly one of the
 * features.</li>
 * <li>Evaluate the attribute sets and select only the best {@rapidminer.math k}.</li>
 * <li>For each of the {@rapidminer.math k} attribute sets do: If there are {@rapidminer.math j}
 * unused attributes, make {@rapidminer.math j} copies of the attribute set and add exactly one of
 * the previously unused attributes to the attribute set.</li>
 * <li>As long as the performance improved in the last {@rapidminer.math p} iterations go to 2</li>
 * </ol>
 *
 * <h4>Backward Elimination</h4>
 * <ol>
 * <li>Start with an attribute set which uses all features.</li>
 * <li>Evaluate all attribute sets and select the best {@rapidminer.math k}.</li>
 * <li>For each of the {@rapidminer.math k} attribute sets do: If there are {@rapidminer.math j}
 * attributes used, make {@rapidminer.math j} copies of the attribute set and remove exactly one of
 * the previously used attributes from the attribute set.</li>
 * <li>As long as the performance improved in the last {@rapidminer.math p} iterations go to 2</li>
 * </ol>
 *
 * <p>
 * The parameter {@rapidminer.math k} can be specified by the parameter <code>keep_best</code>, the
 * parameter {@rapidminer.math p} can be specified by the parameter
 * <code>generations_without_improval</code>. These parameters have default values 1 which means
 * that the standard selection algorithms are used. Using other values increases the runtime but
 * might help to avoid local extrema in the search for the global optimum.
 * </p>
 *
 * <p>
 * Another unusual parameter is <code>maximum_number_of_generations</code>. This parameter bounds
 * the number of iterations to this maximum of feature selections / deselections. In combination
 * with <code>generations_without_improval</code> this allows several different selection schemes
 * (which are described for forward selection, backward elimination works analogously):
 *
 * <ul>
 * <li><code>maximum_number_of_generations</code> = {@rapidminer.math m} and
 * <code>generations_without_improval</code> = {@rapidminer.math p}: Selects at most
 * {@rapidminer.math m} features. The selection stops if no performance improvement was measured in
 * the last {@rapidminer.math p} generations.</li>
 * <li><code>maximum_number_of_generations</code> = {@rapidminer.math -1} and
 * <code>generations_without_improval</code> = {@rapidminer.math p}: Tries to select new features
 * until no performance improvement was measured in the last {@rapidminer.math p} generations.</li>
 * <li><code>maximum_number_of_generations</code> = {@rapidminer.math m} and
 * <code>generations_without_improval</code> = {@rapidminer.math -1}: Selects at most
 * {@rapidminer.math m} features. The selection is not stopped until all combinations with at most
 * {@rapidminer.math m} features were tried. However, the result might contain fewer features than
 * that.</li>
 * <li><code>maximum_number_of_generations</code> = {@rapidminer.math -1} and
 * <code>generations_without_improval</code> = {@rapidminer.math -1}: Tests all combinations of
 * attributes (brute force, this might take a very long time and should only be applied to small
 * attribute sets).</li>
 * </ul>
 * </p>
 *
 * @author Simon Fischer, Ingo Mierswa
 */
public class FeatureSelectionOperator extends FeatureOperator {

	/** The parameter name for "Forward selection or backward elimination." */
	public static final String PARAMETER_SELECTION_DIRECTION = "selection_direction";

	/** The parameter name for "Keep the best n individuals in each generation." */
	public static final String PARAMETER_KEEP_BEST = "keep_best";

	/** The parameter name for "Stop after n generations without improval of the performance." */
	public static final String PARAMETER_GENERATIONS_WITHOUT_IMPROVAL = "generations_without_improval";

	/**
	 * The parameter name for the boolean switch that activates
	 * {@link #PARAMETER_GENERATIONS_WITHOUT_IMPROVAL}. If it is switched off, this operator works
	 * with the value -1 instead of the configured number.
	 */
	public static final String PARAMETER_LIMIT_GENERATIONS_WITHOUT_IMPROVAL = "limit_generations_without_improval";

	/**
	 * The parameter name for "Defines if the number of generations should be limited on a specific
	 * number." If it is switched off, this operator works with the value -1 instead of the
	 * configured maximum.
	 */
	public static final String PARAMETER_LIMIT_NUMBER_OF_GENERATIONS = "limit_number_of_generations";

	/** The parameter name for "Defines the maximum amount of generations." */
	public static final String PARAMETER_MAXIMUM_NUMBER_OF_GENERATIONS = "maximum_number_of_generations";

	/** Index of the forward selection entry of the direction parameter. */
	public static final int FORWARD_SELECTION = 0;

	/** Index of the backward elimination entry of the direction parameter. */
	public static final int BACKWARD_ELIMINATION = 1;

	/** Category values of the direction parameter, indexed by the two constants above. */
	private static final String[] DIRECTIONS = { "forward", "backward" };

	/** Allowed number of generations without improvement; only applied when greater than 0. */
	private int generationsWOImp;

	/** Generation bound used by {@link #solutionGoodEnough(Population)}. */
	private int maxGenerations;

	public FeatureSelectionOperator(OperatorDescription description) {
		super(description);
	}

	/** Flags the attributes of the inner output meta data as a subset and returns the same object. */
	@Override
	protected ExampleSetMetaData modifyInnerOutputExampleSet(ExampleSetMetaData metaData) {
		return markAttributesAsSubset(metaData);
	}

	/** Flags the attributes of the output meta data as a subset and returns the same object. */
	@Override
	protected ExampleSetMetaData modifyOutputExampleSet(ExampleSetMetaData metaData) {
		return markAttributesAsSubset(metaData);
	}

	/** Calls {@code attributesAreSubset()} on the given meta data and hands it back. */
	private ExampleSetMetaData markAttributesAsSubset(ExampleSetMetaData metaData) {
		metaData.attributesAreSubset();
		return metaData;
	}

	/**
	 * Reads both stopping parameters into the fields of this operator (-1 whenever the
	 * corresponding limit switch is off) and then delegates to the super implementation.
	 */
	@Override
	public void doWork() throws OperatorException {
		if (getParameterAsBoolean(PARAMETER_LIMIT_NUMBER_OF_GENERATIONS)) {
			this.maxGenerations = getParameterAsInt(PARAMETER_MAXIMUM_NUMBER_OF_GENERATIONS);
		} else {
			this.maxGenerations = -1;
		}

		if (getParameterAsBoolean(PARAMETER_LIMIT_GENERATIONS_WITHOUT_IMPROVAL)) {
			this.generationsWOImp = getParameterAsInt(PARAMETER_GENERATIONS_WITHOUT_IMPROVAL);
		} else {
			this.generationsWOImp = -1;
		}

		super.doWork();
	}

	/** Returns the default index of the direction parameter, {@link #FORWARD_SELECTION}. */
	int getDefaultDirection() {
		return FORWARD_SELECTION;
	}

	/**
	 * Let <tt>es</tt> have <i>n</i> features. The initial population contains (depending on whether
	 * forward selection or backward elimination is used) either
	 * <ul>
	 * <li><i>n</i> elements with exactly 1 feature switched on or
	 * <li>1 element with all <i>n</i> features switched on.
	 * </ul>
	 */
	@Override
	public Population createInitialPopulation(ExampleSet es) throws UndefinedParameterError {
		boolean forward = getParameterAsInt(PARAMETER_SELECTION_DIRECTION) == FORWARD_SELECTION;
		Population population = new Population();
		int featureCount = es.getAttributes().size();

		if (forward) {
			for (int selected = 0; selected < featureCount; selected++) {
				population.add(singleFeatureIndividual(featureCount, selected));
			}
		} else {
			population.add(allFeaturesIndividual(featureCount));
		}
		return population;
	}

	/** Builds an individual whose weight is 1 at position <tt>selected</tt> and 0 elsewhere. */
	private Individual singleFeatureIndividual(int featureCount, int selected) {
		double[] weights = new double[featureCount];
		weights[selected] = 1.0d;
		return new Individual(weights);
	}

	/** Builds an individual whose weights are all 1. */
	private Individual allFeaturesIndividual(int featureCount) {
		double[] weights = new double[featureCount];
		for (int index = 0; index < weights.length; index++) {
			weights[index] = 1.0d;
		}
		return new Individual(weights);
	}

	/**
	 * Delivers three population operators in this order:
	 * <ol>
	 * <li>kick out all but the <tt>keep_best</tt> individuals
	 * <li>forward selection/backward elimination
	 * <li>remove redundant individuals
	 * </ol>
	 * As a side effect the generation bound of this operator is adjusted: a bound of 0 or less is
	 * replaced by a value derived from the number of attributes of <tt>input</tt>, and for forward
	 * selection a positive bound is reduced by one.
	 */
	@Override
	public List<PopulationOperator> getPreEvaluationPopulationOperators(ExampleSet input) throws OperatorException {
		boolean forward = getParameterAsInt(PARAMETER_SELECTION_DIRECTION) == FORWARD_SELECTION;
		int keepBest = getParameterAsInt(PARAMETER_KEEP_BEST);

		List<PopulationOperator> operators = new LinkedList<>();
		operators.add(new KeepBest(keepBest));

		boolean unbounded = this.maxGenerations <= 0;
		if (forward) {
			operators.add(new ForwardSelection());
			if (unbounded) {
				this.maxGenerations = input.getAttributes().size() - 1;
			} else {
				// ensures the correct number of features (presumably because the initial
				// population already uses one feature per individual)
				this.maxGenerations = this.maxGenerations - 1;
			}
		} else {
			operators.add(new BackwardElimination());
			if (unbounded) {
				this.maxGenerations = input.getAttributes().size();
			}
		}

		operators.add(new RedundanceRemoval());
		return operators;
	}

	/** Returns a new, empty list: nothing is applied after the evaluation. */
	@Override
	public List<PopulationOperator> getPostEvaluationPopulationOperators(ExampleSet input) throws OperatorException {
		return new LinkedList<>();
	}

	/**
	 * Returns true if one of the following holds:
	 * <ul>
	 * <li>the population is empty,
	 * <li>the limit of generations without improvement is greater than 0 and the population has
	 * reached it,
	 * <li>the generation counter of the population has reached the generation bound.
	 * </ul>
	 */
	@Override
	public boolean solutionGoodEnough(Population pop) throws OperatorException {
		if (pop.empty()) {
			return true;
		}
		if (generationsWOImp > 0 && pop.getGenerationsWithoutImproval() >= generationsWOImp) {
			return true;
		}
		return pop.getGeneration() >= maxGenerations;
	}

	/**
	 * Declares the parameters of this operator followed by those of the super class. Note that
	 * <tt>keep_best</tt> is registered before <tt>maximum_number_of_generations</tt>.
	 */
	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = new LinkedList<>();

		ParameterType direction = new ParameterTypeCategory(PARAMETER_SELECTION_DIRECTION,
				"Forward selection or backward elimination.", DIRECTIONS, getDefaultDirection());
		direction.setExpert(false);
		types.add(direction);

		ParameterType limitWithoutImproval = new ParameterTypeBoolean(PARAMETER_LIMIT_GENERATIONS_WITHOUT_IMPROVAL,
				"Indicates if the optimization should be aborted if this number of generations showed no improvement. If unchecked, always the maximal number of generations will be used.",
				true);
		limitWithoutImproval.setExpert(false);
		types.add(limitWithoutImproval);

		ParameterType withoutImproval = new ParameterTypeInt(PARAMETER_GENERATIONS_WITHOUT_IMPROVAL,
				"Stop after n generations without improval of the performance.", 1, Integer.MAX_VALUE, 1);
		withoutImproval.registerDependencyCondition(
				new BooleanParameterCondition(this, PARAMETER_LIMIT_GENERATIONS_WITHOUT_IMPROVAL, false, true));
		types.add(withoutImproval);

		types.add(new ParameterTypeBoolean(PARAMETER_LIMIT_NUMBER_OF_GENERATIONS,
				"Defines if the number of generations should be limited on a specific number.", false, false));

		ParameterType maximumGenerations = new ParameterTypeInt(PARAMETER_MAXIMUM_NUMBER_OF_GENERATIONS,
				"Defines the maximum amount of generations.", 1, Integer.MAX_VALUE, 10);
		maximumGenerations.registerDependencyCondition(
				new BooleanParameterCondition(this, PARAMETER_LIMIT_NUMBER_OF_GENERATIONS, true, true));
		maximumGenerations.setExpert(false);

		types.add(new ParameterTypeInt(PARAMETER_KEEP_BEST, "Keep the best n individuals in each generation.", 1,
				Integer.MAX_VALUE, 1));
		types.add(maximumGenerations);

		types.addAll(super.getParameterTypes());
		return types;
	}

	/**
	 * Returns the configured maximum number of generations, or -1 if the number of generations is
	 * not limited.
	 */
	@Override
	protected int getMaximumGenerations() throws UndefinedParameterError {
		if (getParameterAsBoolean(PARAMETER_LIMIT_NUMBER_OF_GENERATIONS)) {
			return getParameterAsInt(PARAMETER_MAXIMUM_NUMBER_OF_GENERATIONS);
		}
		return -1;
	}
}