/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.features.selection; import java.util.Collection; import java.util.Iterator; import java.util.List; import com.rapidminer.example.Attribute; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Statistics; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeDouble; /** * Removes useless attribute from the example set. Useless attributes are * <ul> * <li>nominal attributes which has the same value for more than <code>p</code> * percent of all examples.</li> * <li>numerical attributes which standard deviation is less or equal to a * given deviation threshold <code>t</code>.</li> * </ul> * * @author Ingo Mierswa * @version $Id: RemoveUselessFeatures.java,v 1.12 2006/04/05 08:57:27 * ingomierswa Exp $ */ public class RemoveUselessFeatures extends Operator { /** The parameter name for "Removes all numerical attributes with standard deviation less or equal to this threshold." */ public static final String PARAMETER_NUMERICAL_MIN_DEVIATION = "numerical_min_deviation"; /** The parameter name for "Removes all nominal attributes which provides more than the given amount of only one value." */ public static final String PARAMETER_NOMINAL_SINGLE_VALUE_UPPER = "nominal_single_value_upper"; /** The parameter name for "Removes all nominal attributes which provides less than the given amount of at least one value (-1: remove attributes with values occuring only once)." */ public static final String PARAMETER_NOMINAL_SINGLE_VALUE_LOWER = "nominal_single_value_lower"; private static final Class[] INPUT_CLASSES = { ExampleSet.class }; private static final Class[] OUTPUT_CLASSES = { ExampleSet.class }; public RemoveUselessFeatures(OperatorDescription description) { super(description); } public IOObject[] apply() throws OperatorException { ExampleSet exampleSet = getInput(ExampleSet.class); ExampleSet clone = (ExampleSet) exampleSet.clone(); clone.recalculateAllAttributeStatistics(); double numericalMinDeviation = getParameterAsDouble(PARAMETER_NUMERICAL_MIN_DEVIATION); double nominalSingleValueUpper = getParameterAsDouble(PARAMETER_NOMINAL_SINGLE_VALUE_UPPER); double nominalSingleValueLower = getParameterAsDouble(PARAMETER_NOMINAL_SINGLE_VALUE_LOWER); if (nominalSingleValueLower < 0.0d) { nominalSingleValueLower = 1.0d / clone.size(); } Iterator<Attribute> i = clone.getAttributes().iterator(); while (i.hasNext()) { Attribute attribute = i.next(); if (attribute.isNominal()) { Collection values = attribute.getMapping().getValues(); double[] valueCounts = new double[values.size()]; Iterator v = values.iterator(); int n = 0; while (v.hasNext()) { String value = (String) v.next(); valueCounts[n] = clone.getStatistics(attribute, Statistics.COUNT, value); n++; } if (clone.getStatistics(attribute, Statistics.UNKNOWN) / clone.size() >= nominalSingleValueUpper) { i.remove(); continue; } // check for single values which dominates other values and // calculate maximum double maximumValueCount = Double.NEGATIVE_INFINITY; for (n = 0; n < valueCounts.length; n++) { double percent = valueCounts[n] / clone.size(); maximumValueCount = Math.max(maximumValueCount, percent); if (percent >= nominalSingleValueUpper) { i.remove(); break; } } // check if the maximum is below lower bound to remove widely // spreaded attributes if (maximumValueCount <= nominalSingleValueLower) { i.remove(); continue; } } else if (attribute.isNumerical()) { if (clone.getStatistics(attribute, Statistics.UNKNOWN) / clone.size() >= nominalSingleValueUpper) { i.remove(); continue; } // remove numerical attribute with low deviation if (Math.sqrt(clone.getStatistics(attribute, Statistics.VARIANCE)) <= numericalMinDeviation) i.remove(); } else { // do nothing for data attributes log("Attribute '" + attribute.getName() + "' is not numerical and not nominal, do nothing..."); } checkForStop(); } if (clone.getAttributes().size() <= 0) { logWarning("Example set does not not have any attribute after removing the useless attributes!"); } return new IOObject[] { clone }; } public Class<?>[] getInputClasses() { return INPUT_CLASSES; } public Class<?>[] getOutputClasses() { return OUTPUT_CLASSES; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeDouble(PARAMETER_NUMERICAL_MIN_DEVIATION, "Removes all numerical attributes with standard deviation less or equal to this threshold.", 0.0d, Double.POSITIVE_INFINITY, 0.0d); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_NOMINAL_SINGLE_VALUE_UPPER, "Removes all nominal attributes which provides more than the given amount of only one value.", 0.0d, 1.0d, 1.0d); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_NOMINAL_SINGLE_VALUE_LOWER, "Removes all nominal attributes which provides less than the given amount of at least one value (-1: remove attributes with values occuring only once).", -1.0d, 1.0d, -1.0d); type.setExpert(false); types.add(type); return types; } }