/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.features.selection;

import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Statistics;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;

/**
 * Removes useless attributes from the example set. An attribute is treated as
 * useless if
 * <ul>
 * <li>it is nominal or numerical and the fraction of examples with an unknown
 * value is greater than or equal to the upper threshold
 * ({@link #PARAMETER_NOMINAL_SINGLE_VALUE_UPPER}),</li>
 * <li>it is nominal and a single value covers a fraction of all examples that
 * is greater than or equal to the upper threshold,</li>
 * <li>it is nominal and its most frequent value covers a fraction of all
 * examples that is less than or equal to the lower threshold
 * ({@link #PARAMETER_NOMINAL_SINGLE_VALUE_LOWER}), or</li>
 * <li>it is numerical and its standard deviation is less than or equal to the
 * deviation threshold ({@link #PARAMETER_NUMERICAL_MIN_DEVIATION}).</li>
 * </ul>
 * Attributes that are neither nominal nor numerical are left untouched.
 *
 * @author Ingo Mierswa
 */
public class RemoveUselessFeatures extends AbstractFeatureSelection {

    /** The parameter name for "Removes all numerical attributes with standard deviation less or equal to this threshold." */
    public static final String PARAMETER_NUMERICAL_MIN_DEVIATION = "numerical_min_deviation";

    /** The parameter name for the upper fraction threshold: attributes dominated by one value (or by unknown values) at or above this fraction are removed. */
    public static final String PARAMETER_NOMINAL_SINGLE_VALUE_UPPER = "nominal_useless_above";

    /**
     * The parameter name for the lower fraction threshold: nominal attributes
     * whose most frequent value covers at most this fraction of all examples
     * are removed. The value is replaced by <code>1 / size</code> when the
     * id-like removal option is switched on.
     */
    public static final String PARAMETER_NOMINAL_SINGLE_VALUE_LOWER = "nominal_useless_below";

    /** The parameter name for the switch that derives the lower threshold from the example set size. */
    private static final String PARAMETER_REMOVE_ID_LIKE = "nominal_remove_id_like";

    /**
     * Creates the operator.
     *
     * @param description the operator description handed to the super class
     */
    public RemoveUselessFeatures(OperatorDescription description) {
        super(description);
    }

    /**
     * Marks the resulting attribute set as a subset of the incoming one.
     *
     * @param metaData the meta data to adapt; it is modified and returned
     * @return the same meta data instance that was passed in
     */
    @Override
    protected MetaData modifyMetaData(ExampleSetMetaData metaData) throws UndefinedParameterError {
        metaData.attributesAreSubset();
        return metaData;
    }

    /**
     * Recalculates all attribute statistics and removes every attribute that
     * is classified as useless (see the class description). The passed example
     * set is modified in place and returned.
     *
     * @param exampleSet the example set to prune
     * @return the same example set instance after removal of useless attributes
     * @throws OperatorException if a parameter cannot be read or
     *         {@code checkForStop()} signals a stop
     */
    @Override
    public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
        exampleSet.recalculateAllAttributeStatistics();

        double numericalMinDeviation = getParameterAsDouble(PARAMETER_NUMERICAL_MIN_DEVIATION);
        double nominalSingleValueUpper = getParameterAsDouble(PARAMETER_NOMINAL_SINGLE_VALUE_UPPER);
        double nominalSingleValueLower = getParameterAsDouble(PARAMETER_NOMINAL_SINGLE_VALUE_LOWER);
        if (getParameterAsBoolean(PARAMETER_REMOVE_ID_LIKE)) {
            // a value that occurs exactly once covers 1 / size of the examples
            nominalSingleValueLower = 1.0d / exampleSet.size();
        }

        Iterator<Attribute> i = exampleSet.getAttributes().iterator();
        while (i.hasNext()) {
            Attribute attribute = i.next();

            boolean useless;
            if (attribute.isNominal()) {
                useless = isUselessNominal(exampleSet, attribute, nominalSingleValueUpper, nominalSingleValueLower);
            } else if (attribute.isNumerical()) {
                useless = isUselessNumerical(exampleSet, attribute, nominalSingleValueUpper, numericalMinDeviation);
            } else {
                // do nothing for other attribute types
                log("Attribute '" + attribute.getName() + "' is not numerical and not nominal, do nothing...");
                useless = false;
            }

            // Remove at most once per next(): Iterator.remove() throws an
            // IllegalStateException when it is called twice for the same
            // element, which could happen when both the upper and the lower
            // bound check matched.
            if (useless) {
                i.remove();
            }

            checkForStop();
        }

        if (exampleSet.getAttributes().size() <= 0) {
            logWarning("Example set does not have any attribute after removing the useless attributes!");
        }

        return exampleSet;
    }

    /**
     * Decides whether a nominal attribute is useless: too many unknown
     * values, one value dominating, or values spread too widely.
     *
     * @param exampleSet the example set providing the (already calculated) statistics
     * @param attribute the nominal attribute to check
     * @param upper fraction at or above which unknowns or a single value make the attribute useless
     * @param lower fraction at or below which the most frequent value makes the attribute useless
     * @return <code>true</code> if the attribute should be removed
     */
    private static boolean isUselessNominal(ExampleSet exampleSet, Attribute attribute, double upper, double lower) {
        double size = exampleSet.size();

        if (exampleSet.getStatistics(attribute, Statistics.UNKNOWN) / size >= upper) {
            return true;
        }

        // check for single values which dominate other values and
        // calculate the maximum fraction
        double maximumFraction = Double.NEGATIVE_INFINITY;
        Collection<?> values = attribute.getMapping().getValues();
        for (Object value : values) {
            double fraction = exampleSet.getStatistics(attribute, Statistics.COUNT, (String) value) / size;
            if (fraction >= upper) {
                return true;
            }
            maximumFraction = Math.max(maximumFraction, fraction);
        }

        // check if the maximum is below the lower bound to remove widely
        // spread attributes (an attribute without any value is removed, too,
        // since the maximum stays at negative infinity)
        return maximumFraction <= lower;
    }

    /**
     * Decides whether a numerical attribute is useless: too many unknown
     * values or a standard deviation that does not exceed the threshold.
     *
     * @param exampleSet the example set providing the (already calculated) statistics
     * @param attribute the numerical attribute to check
     * @param unknownUpper fraction of unknown values at or above which the attribute is useless
     * @param minDeviation standard deviation at or below which the attribute is useless
     * @return <code>true</code> if the attribute should be removed
     */
    private static boolean isUselessNumerical(ExampleSet exampleSet, Attribute attribute, double unknownUpper, double minDeviation) {
        if (exampleSet.getStatistics(attribute, Statistics.UNKNOWN) / exampleSet.size() >= unknownUpper) {
            return true;
        }

        // remove numerical attribute with low deviation
        return Math.sqrt(exampleSet.getStatistics(attribute, Statistics.VARIANCE)) <= minDeviation;
    }

    /**
     * Returns the parameter types of the super class followed by the
     * deviation threshold, the upper fraction threshold, the id-like switch
     * and the lower fraction threshold. The lower threshold carries a
     * dependency condition on the id-like switch.
     *
     * @return the list of parameter types of this operator
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterType type = new ParameterTypeDouble(PARAMETER_NUMERICAL_MIN_DEVIATION, "Removes all numerical attributes with standard deviation less or equal to this threshold.", 0.0d, Double.POSITIVE_INFINITY, 0.0d);
        type.setExpert(false);
        types.add(type);

        // registered exactly once (it used to be added twice)
        type = new ParameterTypeDouble(PARAMETER_NOMINAL_SINGLE_VALUE_UPPER, "Removes all nominal attributes which most frequent value is contained in more than this fraction of all examples.", 0.0d, 1.0d, 1.0d);
        type.setExpert(false);
        types.add(type);

        types.add(new ParameterTypeBoolean(PARAMETER_REMOVE_ID_LIKE, "If checked, nominal attributes which values appear only once in the complete exampleset are removed.", false, false));

        type = new ParameterTypeDouble(PARAMETER_NOMINAL_SINGLE_VALUE_LOWER, "Removes all nominal attributes which most frequent value is contained in less than this fraction of all examples.", 0d, 1.0d, 0d);
        type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_REMOVE_ID_LIKE, false, false));
        type.setExpert(false);
        types.add(type);

        return types;
    }
}