/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.filter; import java.text.NumberFormat; import java.text.ParseException; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.OperatorVersion; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.SetRelation; import com.rapidminer.operator.preprocessing.GuessValueTypes; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.parameter.conditions.AboveOperatorVersionCondition; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.StrictDecimalFormat; import com.rapidminer.tools.math.container.Range; /** * <p> * This operator transforms nominal attributes into numerical ones. In contrast to the * NominalToNumeric operator, this operator directly parses numbers from the wrongly as nominal * values encoded values. Please note that this operator will first check the stored nominal * mappings for all attributes. If (old) mappings are still stored which actually are nominal * (without the corresponding data being part of the example set), the attribute will not be * converted. Please use the operator {@link GuessValueTypes} in these cases. * </p> * * @author Regina Fritsch, Ingo Mierswa */ public class NominalNumbers2Numerical extends AbstractFilteredDataProcessing { /** Last version where unparsable values were being ignored. */ public static final OperatorVersion CHANGE_6_0_3_UNPARSABLE_VALUES_ACTION = new OperatorVersion(6, 0, 3); /** * The parameter name for "Character that is used as decimal point." */ public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character"; /** Used for separation of digits (1,000,000.0 or 1.000.000,0) . */ public static final String PARAMETER_GROUP_SEPARATOR = "group_separator"; /** Allow unparsable values, use missing values instead. */ public static final String PARAMETER_UNPARSABLE_VALUE_HANDLING = "unparsable_value_handling"; public static final String[] UNPARSABLE_VALUES_HANDLING_METHOD = new String[] { "skip attribute", "fail", "replace with missing values" }; public static final int IGNORE = 0; public static final int COMPLAIN = 1; public static final int REPLACE_WITH_MISSING_VALUES = 2; public NominalNumbers2Numerical(OperatorDescription description) { super(description); } @Override public ExampleSetMetaData applyOnFilteredMetaData(ExampleSetMetaData emd) throws UndefinedParameterError { NumberFormat format = makeFormat(); Iterator<AttributeMetaData> iterator = emd.getAllAttributes().iterator(); List<AttributeMetaData> affectedList = new LinkedList<>(); while (iterator.hasNext()) { AttributeMetaData amd = iterator.next(); if (amd.isNominal()) { Set<String> values = amd.getValueSet(); // check if values are transformed boolean isTransformed = true; double min = Double.POSITIVE_INFINITY; double max = Double.NEGATIVE_INFINITY; try { for (String value : values) { double numValue = format.parse(value).doubleValue(); min = Math.min(min, numValue); max = Math.max(max, numValue); } } catch (ParseException e) { isTransformed = false; } if (isTransformed) { // removing and inserting in order to reflect correct order iterator.remove(); affectedList.add(amd); // transform attribute amd.setType(Ontology.NUMERICAL); if (min == Double.POSITIVE_INFINITY) { min = Double.NEGATIVE_INFINITY; } if (max == Double.NEGATIVE_INFINITY) { max = Double.POSITIVE_INFINITY; } amd.setValueRange(new Range(min, max), SetRelation.EQUAL); } } } emd.addAllAttributes(affectedList); return emd; } @Override public ExampleSet applyOnFiltered(ExampleSet exampleSet) throws OperatorException { int unparsableValueHandling = getParameterAsInt(PARAMETER_UNPARSABLE_VALUE_HANDLING); NumberFormat format = makeFormat(); List<Attribute> newAttributes = new LinkedList<>(); // using iterator for avoiding "concurrent modification" Iterator<Attribute> a = exampleSet.getAttributes().iterator(); while (a.hasNext()) { Attribute attribute = a.next(); if (attribute.isNominal()) { if (getCompatibilityLevel().isAtMost(CHANGE_6_0_3_UNPARSABLE_VALUES_ACTION) || unparsableValueHandling == IGNORE) { try { for (String value : attribute.getMapping().getValues()) { format.parse(value); } } catch (ParseException e) { // only if unparsable values should be ignored: // run next iteration if the value can not be parsed to a number continue; } } // new attribute Attribute newAttribute = AttributeFactory.createAttribute(Ontology.NUMERICAL); exampleSet.getExampleTable().addAttribute(newAttribute); newAttributes.add(newAttribute); // copy data for (Example e : exampleSet) { double oldValue = e.getValue(attribute); if (!Double.isNaN(oldValue)) { String value = e.getValueAsString(attribute); try { e.setValue(newAttribute, format.parse(value).doubleValue()); } catch (ParseException ex) { switch (unparsableValueHandling) { case IGNORE: continue; default: case COMPLAIN: throw new UserError(this, ex, 946, value); case REPLACE_WITH_MISSING_VALUES: e.setValue(newAttribute, Double.NaN); break; } } } else { e.setValue(newAttribute, Double.NaN); } } // delete attribute and rename the new attribute a.remove(); newAttribute.setName(attribute.getName()); } } for (Attribute attribute : newAttributes) { exampleSet.getAttributes().addRegular(attribute); } return exampleSet; } private NumberFormat makeFormat() throws UndefinedParameterError { StrictDecimalFormat format = StrictDecimalFormat.getInstance(this); return format; } @Override protected int[] getFilterValueTypes() { return new int[] { Ontology.NOMINAL }; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.addAll(StrictDecimalFormat.getParameterTypes(this)); ParameterType type; type = new ParameterTypeCategory(PARAMETER_UNPARSABLE_VALUE_HANDLING, "This selects the method for handling occurrences of values which are not parsable to numbers.", UNPARSABLE_VALUES_HANDLING_METHOD, COMPLAIN, false); type.registerDependencyCondition(new AboveOperatorVersionCondition(this, CHANGE_6_0_3_UNPARSABLE_VALUES_ACTION)); types.add(type); return types; } @Override public boolean writesIntoExistingData() { return false; } @Override public OperatorVersion[] getIncompatibleVersionChanges() { return new OperatorVersion[] { CHANGE_6_0_3_UNPARSABLE_VALUES_ACTION }; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), NominalNumbers2Numerical.class, null); } }