/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.apache.commons.lang.ArrayUtils; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.OperatorVersion; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.preprocessing.filter.NominalNumbers2Numerical; import com.rapidminer.operator.tools.AttributeSubsetSelector; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.Tools; /** * This operator can be used to (re-)guess the value types of all attributes. This might be useful * after some preprocessing transformations and "purifying" some of the columns, * especially if columns which were nominal before can be handled as numerical columns. With this * operator, the value types of all attributes do not have to be transformed manually with operators * like {@link NominalNumbers2Numerical}. * * @author Ingo Mierswa */ public class GuessValueTypes extends AbstractDataProcessing { /** The parameter name for "Character that is used as decimal point." */ public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character"; /** The parameter name for "Character that is used as decimal point." */ public static final String PARAMETER_NUMBER_GROUPING_CHARACTER = "number_grouping_character"; private AttributeSubsetSelector attributeSelector = new AttributeSubsetSelector(this, getExampleSetInputPort()); /** * Incompatible version, old version writes into the exampleset, if original output port is not * connected. */ private static final OperatorVersion VERSION_MAY_WRITE_INTO_DATA = new OperatorVersion(7, 1, 1); public GuessValueTypes(OperatorDescription description) { super(description); } @Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { // init char decimalPointCharacter = getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0); Character groupingCharacter = null; if (isParameterSet(PARAMETER_NUMBER_GROUPING_CHARACTER)) { groupingCharacter = getParameterAsString(PARAMETER_NUMBER_GROUPING_CHARACTER).charAt(0); } Set<Attribute> attributeSet = attributeSelector.getAttributeSubset(exampleSet, false); int size = attributeSet.size(); int[] inputValueTypes = new int[size]; int index = 0; for (Attribute attribute : attributeSet) { inputValueTypes[index++] = attribute.getValueType(); } // init progress getProgress().setTotal(100); double totalProgress = exampleSet.size() * attributeSet.size(); long progressCounter = 0; // guessed value types int[] guessedValueTypes = new int[size]; index = 0; for (Attribute attribute : attributeSet) { progressCounter = exampleSet.size() * index; getProgress().setCompleted((int) (50 * ((progressCounter - 1) / totalProgress))); if (!attribute.isNominal() && !attribute.isNumerical()) { index++; continue; } for (Example example : exampleSet) { // trigger progress if (progressCounter++ % 500_000 == 0) { getProgress().setCompleted((int) (50 * ((progressCounter - 1) / totalProgress))); } double originalValue = example.getValue(attribute); if (!Double.isNaN(originalValue)) { if (guessedValueTypes[index] != Ontology.NOMINAL) { try { String valueString = example.getValueAsString(attribute); if (!Attribute.MISSING_NOMINAL_VALUE.equals(valueString)) { if (groupingCharacter != null) { valueString = valueString.replace(groupingCharacter.toString(), ""); } valueString = valueString.replace(decimalPointCharacter, '.'); double value = Double.parseDouble(valueString); if (guessedValueTypes[index] != Ontology.REAL) { if (Tools.isEqual(Math.round(value), value)) { guessedValueTypes[index] = Ontology.INTEGER; } else { guessedValueTypes[index] = Ontology.REAL; } } } } catch (NumberFormatException e) { guessedValueTypes[index] = Ontology.NOMINAL; break; } } } } index++; } // if we could not guess any type, use the default one for (int i = 0; i < size; i++) { if (guessedValueTypes[i] == 0) { guessedValueTypes[i] = inputValueTypes[i]; } } progressCounter = 0; getProgress().setCompleted(50); // the example set contains at least one example and the guessing was performed if (exampleSet.size() > 0) { // new attributes List<AttributeRole> newAttributes = new LinkedList<AttributeRole>(); index = 0; for (Attribute attribute : attributeSet) { if (!attribute.isNominal() && !attribute.isNumerical()) { index++; continue; } AttributeRole role = exampleSet.getAttributes().getRole(attribute); Attribute newAttribute = AttributeFactory.createAttribute(guessedValueTypes[index]); exampleSet.getExampleTable().addAttribute(newAttribute); AttributeRole newRole = new AttributeRole(newAttribute); newRole.setSpecial(role.getSpecialName()); newAttributes.add(newRole); // copy data for (Example e : exampleSet) { double oldValue = e.getValue(attribute); if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(guessedValueTypes[index], Ontology.NUMERICAL)) { if (!Double.isNaN(oldValue)) { String valueString = e.getValueAsString(attribute); if (Attribute.MISSING_NOMINAL_VALUE.equals(valueString)) { e.setValue(newAttribute, Double.NaN); } else { if (groupingCharacter != null) { valueString = valueString.replace(groupingCharacter.toString(), ""); } valueString = valueString.replace(decimalPointCharacter, '.'); e.setValue(newAttribute, Double.parseDouble(valueString)); } } else { e.setValue(newAttribute, Double.NaN); } } else { if (!Double.isNaN(oldValue)) { String value = e.getValueAsString(attribute); e.setValue(newAttribute, newAttribute.getMapping().mapString(value)); } else { e.setValue(newAttribute, Double.NaN); } } // trigger progress if (++progressCounter % 500_000 == 0) { getProgress().setCompleted((int) (50 * (progressCounter / totalProgress) + 50)); } } exampleSet.getAttributes().remove(role); newAttribute.setName(attribute.getName()); index++; } for (AttributeRole role : newAttributes) { if (role.isSpecial()) { exampleSet.getAttributes().setSpecialAttribute(role.getAttribute(), role.getSpecialName()); } else { exampleSet.getAttributes().addRegular(role.getAttribute()); } } // trigger progress getProgress().complete(); } return exampleSet; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.addAll(attributeSelector.getParameterTypes()); types.add(new ParameterTypeString(PARAMETER_DECIMAL_POINT_CHARACTER, "Character that is used as decimal point.", ".", false)); types.add(new ParameterTypeString(PARAMETER_NUMBER_GROUPING_CHARACTER, "Character that is used as the number grouping character, i.e. for groups of thousands.", true)); return types; } @Override public boolean writesIntoExistingData() { if (getCompatibilityLevel().isAbove(VERSION_MAY_WRITE_INTO_DATA)) { return false; } else { // old version: true only if original output port is connected return isOriginalOutputConnected(); } } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), GuessValueTypes.class, attributeSelector); } @Override public OperatorVersion[] getIncompatibleVersionChanges() { return (OperatorVersion[]) ArrayUtils.addAll(super.getIncompatibleVersionChanges(), new OperatorVersion[] { VERSION_MAY_WRITE_INTO_DATA }); } }