/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.filter; import java.text.NumberFormat; import java.text.ParseException; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.SetRelation; import com.rapidminer.operator.preprocessing.GuessValueTypes; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.StrictDecimalFormat; import com.rapidminer.tools.math.container.Range; /** * <p>This operator transforms nominal attributes into numerical ones. In contrast to * the NominalToNumeric operator, this operator directly parses numbers from * the wrongly as nominal values encoded values. Please note that this operator * will first check the stored nominal mappings for all attributes. If (old) mappings * are still stored which actually are nominal (without the corresponding data being part of * the example set), the attribute will not be converted. Please use the operator * {@link GuessValueTypes} in these cases.</p> * * @author Regina Fritsch, Ingo Mierswa */ public class NominalNumbers2Numerical extends AbstractFilteredDataProcessing { /** The parameter name for "Character that is used as decimal point." */ public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character"; /** Used for separation of digits (1,000,000.0 or 1.000.000,0) . */ public static final String PARAMETER_GROUP_SEPARATOR = "group_separator"; public NominalNumbers2Numerical(OperatorDescription description) { super(description); } @Override public ExampleSetMetaData applyOnFilteredMetaData(ExampleSetMetaData emd) throws UndefinedParameterError { NumberFormat format = makeFormat(); Iterator<AttributeMetaData> iterator = emd.getAllAttributes().iterator(); List<AttributeMetaData> affectedList = new LinkedList<AttributeMetaData>(); while (iterator.hasNext()) { AttributeMetaData amd = iterator.next(); if (amd.isNominal()) { Set<String> values = amd.getValueSet(); // check if values are transformed boolean isTransformed = true; double min = Double.POSITIVE_INFINITY; double max = Double.NEGATIVE_INFINITY; try { for (String value : values) { double numValue = format.parse(value).doubleValue(); min = Math.min(min, numValue); max = Math.max(max, numValue); } } catch (ParseException e) { isTransformed = false; } if (isTransformed) { // removing and inserting in order to reflect correct order iterator.remove(); affectedList.add(amd); // transform attribute amd.setType(Ontology.NUMERICAL); if (min == Double.POSITIVE_INFINITY) { min = Double.NEGATIVE_INFINITY; } if (max == Double.NEGATIVE_INFINITY) { max = Double.POSITIVE_INFINITY; } amd.setValueRange(new Range(min, max), SetRelation.EQUAL); } } } emd.addAllAttributes(affectedList); return emd; } @Override public ExampleSet applyOnFiltered(ExampleSet exampleSet) throws OperatorException { NumberFormat format = makeFormat(); List<Attribute> newAttributes = new LinkedList<Attribute>(); // using iterator for avoiding "concurrent modification" Iterator<Attribute> a = exampleSet.getAttributes().iterator(); while (a.hasNext()) { Attribute attribute = a.next(); if (attribute.isNominal()) { boolean isNumericalNominal = true; try { for(String value : attribute.getMapping().getValues()) { format.parse(value); } } catch (Exception e){ isNumericalNominal = false; } if (isNumericalNominal) { // new attribute Attribute newAttribute = AttributeFactory.createAttribute(Ontology.NUMERICAL); exampleSet.getExampleTable().addAttribute(newAttribute); newAttributes.add(newAttribute); // copy data for (Example e : exampleSet) { double oldValue = e.getValue(attribute); if (!Double.isNaN(oldValue)) { String value = e.getValueAsString(attribute); try { e.setValue(newAttribute, format.parse(value).doubleValue()); } catch (ParseException ex) { throw new UserError(this, ex, 946, value); } } else { e.setValue(newAttribute, Double.NaN); } } // delete attribute and rename the new attribute a.remove(); newAttribute.setName(attribute.getName()); } } } for (Attribute attribute : newAttributes) { exampleSet.getAttributes().addRegular(attribute); } return exampleSet; } private NumberFormat makeFormat() throws UndefinedParameterError { StrictDecimalFormat format = StrictDecimalFormat.getInstance(this); return format; } @Override protected int[] getFilterValueTypes() { return new int[] { Ontology.NOMINAL }; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.addAll(StrictDecimalFormat.getParameterTypes(this)); // TODO: Replace old parameters by new ones // types.add(new ParameterTypeChar(PARAMETER_DECIMAL_POINT_CHARACTER, "Character that is used as decimal point.", '.', false)); // types.add(new ParameterTypeChar(PARAMETER_GROUP_SEPARATOR, "Character that is used to separate groups (e.g. in 1.000.000 or 1,000,000).", false)); return types; } @Override public boolean writesIntoExistingData() { return false; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), NominalNumbers2Numerical.class, null); } }