/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.preprocessing.filter;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Locale;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Statistics;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MDInteger;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.ParameterCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;

/**
 * Replaces missing values in examples. If a value is missing, it is replaced by the result of one of the
 * functions "none", "minimum", "maximum", "average", "zero", and "value", which is applied to the non
 * missing attribute values of the example set. "none" means that the value is not replaced. The function
 * can be selected using the parameter list <code>columns</code>. If an attribute's name appears in this list
 * as a key, the value is used as the function name. If the attribute's name is not in the list, the function
 * specified by the <code>default</code> parameter is used.
 *
 * <p>
 * For nominal attributes the mode is used for the average, i.e. the nominal value which occurs most often in
 * the data. For date/time attributes "average" yields <code>NaN</code>. For nominal attributes and replacement
 * type "zero" the first nominal value defined for this attribute is used.
 * </p>
 *
 * <p>
 * The replenishment "value" indicates that the user defined parameter
 * <code>replenishment_value</code> should be used for the replacement. For date/time attributes this
 * parameter is parsed with the US locale using the pattern <code>MM/dd/yyyy</code> (date),
 * <code>hh.mm a</code> (time) or <code>MM/dd/yyyy hh.mm a</code> (date and time); for nominal attributes it is
 * mapped through the attribute's nominal mapping; otherwise it is parsed as a floating point number.
 * </p>
 *
 * @author Ingo Mierswa, Simon Fischer
 */
public class MissingValueReplenishment extends ValueReplenishment {

    /** The parameter name for "This value is used for some of the replenishment types." */
    public static final String PARAMETER_REPLENISHMENT_VALUE = "replenishment_value";

    // Indices into REPLENISHMENT_NAMES; the order of both must stay in sync.
    private static final int NONE = 0;

    private static final int MINIMUM = 1;

    private static final int MAXIMUM = 2;

    private static final int AVERAGE = 3;

    private static final int ZERO = 4;

    private static final int VALUE = 5;

    private static final String[] REPLENISHMENT_NAMES = { "none", "minimum", "maximum", "average", "zero", "value" };

    /** Pattern used to parse the replenishment value for date attributes. */
    private static final String DATE_PATTERN = "MM/dd/yyyy";

    /** Pattern used to parse the replenishment value for time attributes. */
    private static final String TIME_PATTERN = "hh.mm a";

    /** Pattern used to parse the replenishment value for date_time attributes. */
    private static final String DATE_TIME_PATTERN = "MM/dd/yyyy hh.mm a";

    public MissingValueReplenishment(OperatorDescription description) {
        super(description);
    }

    /**
     * Marks the given attribute meta data as having zero missing values and returns it as a single-element
     * collection.
     */
    @Override
    protected Collection<AttributeMetaData> modifyAttributeMetaData(ExampleSetMetaData emd, AttributeMetaData amd) throws UndefinedParameterError {
        amd.setNumberOfMissingValues(new MDInteger(0));
        return Collections.singletonList(amd);
    }

    /** Returns the single value type {@link Ontology#VALUE_TYPE}. */
    @Override
    protected int[] getFilterValueTypes() {
        return new int[] { Ontology.VALUE_TYPE };
    }

    /** Returns the names of the available replenishment functions, indexed by function index. */
    @Override
    public String[] getFunctionNames() {
        return REPLENISHMENT_NAMES;
    }

    /** Returns the index of the "average" function. */
    @Override
    public int getDefaultFunction() {
        return AVERAGE;
    }

    /** Returns the index of the "average" function. */
    @Override
    public int getDefaultColumnFunction() {
        return AVERAGE;
    }

    /**
     * Returns <code>Double.NaN</code>.
     * NOTE(review): presumably NaN is the internal encoding of a missing value; confirm against
     * {@link ValueReplenishment}.
     */
    @Override
    public double getReplacedValue() {
        return Double.NaN;
    }

    /**
     * Computes the replacement value for the given attribute.
     *
     * @param functionIndex
     *            index of the replenishment function (position in {@link #getFunctionNames()})
     * @param exampleSet
     *            the example set whose statistics are queried for "minimum", "maximum" and "average"
     * @param attribute
     *            the attribute for which the replacement is computed
     * @return the replacement value; <code>Double.NaN</code> for "none" and for "average" on date/time
     *         attributes
     * @throws UserError
     *             if the function is "value" and the parameter {@link #PARAMETER_REPLENISHMENT_VALUE} is
     *             missing or cannot be parsed for a date/time or numerical attribute
     * @throws RuntimeException
     *             if <code>functionIndex</code> is not one of the known functions
     */
    @Override
    public double getReplenishmentValue(int functionIndex, ExampleSet exampleSet, Attribute attribute) throws UserError {
        switch (functionIndex) {
            case NONE:
                return Double.NaN;
            case MINIMUM:
                return exampleSet.getStatistics(attribute, Statistics.MINIMUM);
            case MAXIMUM:
                return exampleSet.getStatistics(attribute, Statistics.MAXIMUM);
            case AVERAGE:
                if (isOfType(attribute, Ontology.DATE_TIME)) {
                    return Double.NaN;
                } else if (attribute.isNominal()) {
                    // the "average" of a nominal attribute is its most frequent value
                    return exampleSet.getStatistics(attribute, Statistics.MODE);
                } else {
                    return exampleSet.getStatistics(attribute, Statistics.AVERAGE);
                }
            case ZERO:
                return 0.0d;
            case VALUE:
                return parseReplenishmentValue(attribute);
            default:
                throw new RuntimeException("Illegal value functionIndex: " + functionIndex);
        }
    }

    /**
     * Converts the user supplied parameter {@link #PARAMETER_REPLENISHMENT_VALUE} into the double
     * representation used for the given attribute.
     *
     * @throws UserError
     *             (218) if the attribute is a date/time attribute and the value is missing or does not match
     *             the expected pattern; (211) if the attribute is neither date/time nor nominal and the value
     *             is missing or not a valid number
     */
    private double parseReplenishmentValue(Attribute attribute) throws UserError {
        String valueString = getParameterAsString(PARAMETER_REPLENISHMENT_VALUE);

        if (isOfType(attribute, Ontology.DATE_TIME)) {
            // The parameter is optional, so it may be unset. DateFormat.parse(null) would throw a
            // NullPointerException, so report the same user error as for an unparseable value instead.
            if (valueString == null) {
                throw new UserError(this, 218, PARAMETER_REPLENISHMENT_VALUE, valueString);
            }
            SimpleDateFormat dateFormat = new SimpleDateFormat(datePatternFor(attribute), Locale.US);
            try {
                Date date = dateFormat.parse(valueString);
                return date.getTime();
            } catch (ParseException e) {
                throw new UserError(this, 218, PARAMETER_REPLENISHMENT_VALUE, valueString);
            }
        }

        if (attribute.isNominal()) {
            return attribute.getMapping().mapString(valueString);
        }

        // Double.parseDouble(null) throws NullPointerException rather than NumberFormatException,
        // so an unset parameter has to be rejected explicitly.
        if (valueString == null) {
            throw new UserError(this, 211, PARAMETER_REPLENISHMENT_VALUE, valueString);
        }
        try {
            return Double.parseDouble(valueString);
        } catch (NumberFormatException e) {
            throw new UserError(this, 211, PARAMETER_REPLENISHMENT_VALUE, valueString);
        }
    }

    /**
     * Selects the date pattern for a date/time attribute: the more specific DATE and TIME types are checked
     * first, everything else falls back to the combined date_time pattern. Must only be called for attributes
     * whose value type is a {@link Ontology#DATE_TIME}.
     */
    private String datePatternFor(Attribute attribute) {
        if (isOfType(attribute, Ontology.DATE)) {
            return DATE_PATTERN;
        }
        if (isOfType(attribute, Ontology.TIME)) {
            return TIME_PATTERN;
        }
        return DATE_TIME_PATTERN;
    }

    /** Returns true if the value type of the attribute is the given type or a subtype of it. */
    private static boolean isOfType(Attribute attribute, int valueType) {
        return Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), valueType);
    }

    /**
     * Returns the parameter types of the super class plus the optional string parameter
     * {@link #PARAMETER_REPLENISHMENT_VALUE}. That parameter carries a dependency condition which is
     * fulfilled when the default function or any entry of the columns list selects "value".
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterTypeString type = new ParameterTypeString(PARAMETER_REPLENISHMENT_VALUE, "This value is used for some of the replenishment types.", true, false);
        type.registerDependencyCondition(new ParameterCondition(this, PARAMETER_DEFAULT, true) {

            @Override
            public boolean isConditionFullfilled() {
                // check if any of the options is set to value
                try {
                    if (getParameterAsInt(PARAMETER_DEFAULT) == VALUE) {
                        return true;
                    }
                    List<String[]> pairs = getParameterList(PARAMETER_COLUMNS);
                    if (pairs != null) {
                        String valueName = REPLENISHMENT_NAMES[VALUE];
                        String valueIndex = String.valueOf(VALUE);
                        for (String[] pair : pairs) {
                            // constant-first equals: the function entry may be stored by name or by
                            // index, and must not fail on a null entry
                            if (valueName.equals(pair[1]) || valueIndex.equals(pair[1])) {
                                return true;
                            }
                        }
                    }
                } catch (UndefinedParameterError ignored) {
                    // Best effort: if the parameters cannot be read, the condition counts as not fulfilled.
                }
                return false;
            }
        });
        types.add(type);
        return types;
    }

    /** Looks up the resource consumption estimator registered for this operator class. */
    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), MissingValueReplenishment.class, attributeSelector);
    }
}