/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.filter; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map.Entry; import java.util.Set; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ProcessSetupError.Severity; import com.rapidminer.operator.SimpleProcessSetupError; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.SetRelation; import com.rapidminer.operator.preprocessing.AbstractValueProcessing; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeList; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.parameter.conditions.BooleanParameterCondition; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.math.container.Range; /** * <p>This operator takes an <code>ExampleSet</code> as input and maps the * values of certain attributes to other values. The operator can replace * nominal values (e.g. replace the value "green" by the value * "green_color") as well as numerical values (e.g. replace the * all values "3" by "-1"). A single mapping can be * specified using the parameters <code>replace_what</code> and * <code>replace_by</code>. Multiple mappings can be specified in the parameter * list <code>value_mappings</code>.</p> * * <p>Additionally, the operator allows to define (and consider) a default * mapping. If <code>add_default_mapping</code> is set to true and <code>default_value</code> * is properly set, all values that occur in the example set but are not listed * in the value mappings list are replaced by the default value. This may be * helpful in cases where only some values should be mapped explicitly and * many unimportant values should be mapped to a default value (e.g. "other").</p> * * <p>If the parameter <code>consider_regular_expressions</code> is enabled, the * values are replaced by the new values if the original values match the given * regular expressions. The value corresponding to the first matching regular * expression in the mappings list is taken as replacement.</p> * * <p>This operator supports regular expressions for the attribute names, * i.e. the value mapping is applied on all attributes for which the name * fulfills the pattern defined by the name expression.</p> * * @author Tobias Malbrecht */ public class AttributeValueMapper extends AbstractValueProcessing { public static final String PARAMETER_NEW_VALUES = "new_value"; /** The parameter name for "The first value which should be merged." */ public static final String PARAMETER_VALUE_MAPPINGS = "value_mappings"; /** The parameter name for "The second value which should be merged." */ public static final String PARAMETER_OLD_VALUES = "old_values"; /** The parameter name for "All occurrences of this value will be replaced." */ public static final String PARAMETER_REPLACE_WHAT = "replace_what"; /** The parameter name for "The new attribute value to use." */ public static final String PARAMETER_REPLACE_BY = "replace_by"; /** The parameter name for "Enables matching based on regular expressions; original values may be specified as regular expressions." */ public static final String PARAMETER_CONSIDER_REGULAR_EXPRESSIONS = "consider_regular_expressions"; /** The parameter name for "If set to true, all original values which are not listed in the value mappings list are mapped to the default value." */ public static final String PARAMETER_ADD_DEFAULT_MAPPING = "add_default_mapping"; /** The parameter name for "The default value all original values are mapped to, if add_default_mapping is set to true." */ public static final String PARAMETER_DEFAULT_VALUE = "default_value"; public AttributeValueMapper(OperatorDescription description) { super(description); } @Override public ExampleSetMetaData applyOnFilteredMetaData(ExampleSetMetaData emd) { try { if (emd.getAllAttributes().isEmpty()) { return emd; } boolean first = true; boolean nominal = false; for (AttributeMetaData amd : emd.getAllAttributes()) { if (first) { nominal = amd.isNominal(); first = false; } else { if (nominal != amd.isNominal()) { this.addError(new SimpleProcessSetupError(Severity.ERROR, getPortOwner(), "attributes_must_have_same_type")); return emd; } } } boolean useValueRegex = getParameterAsBoolean(PARAMETER_CONSIDER_REGULAR_EXPRESSIONS); List<String[]> mappingParameterList = getParameterList(PARAMETER_VALUE_MAPPINGS); String replaceWhat = getParameterAsString(PARAMETER_REPLACE_WHAT); String replaceBy = getParameterAsString(PARAMETER_REPLACE_BY); HashMap<String, String> mappings = new LinkedHashMap<String, String>(); HashMap<Pattern, String> patternMappings = new LinkedHashMap<Pattern, String>(); if (replaceWhat != null && replaceBy != null && !replaceWhat.equals("") && !replaceBy.equals("")) { mappings.put(replaceWhat, replaceBy); if (useValueRegex) { try { Pattern valuePattern = Pattern.compile(replaceWhat); patternMappings.put(valuePattern, replaceBy); } catch (PatternSyntaxException e) { } } } Iterator<String[]> listIterator = mappingParameterList.iterator(); int j = 0; while (listIterator.hasNext()) { String[] pair = listIterator.next(); replaceWhat = pair[0]; replaceBy = pair[1]; mappings.put(replaceWhat,replaceBy); if (useValueRegex) { try { Pattern valuePattern = Pattern.compile(replaceWhat); patternMappings.put(valuePattern, replaceBy); } catch (PatternSyntaxException e) { } } j++; } boolean defaultMappingAdded = getParameterAsBoolean(PARAMETER_ADD_DEFAULT_MAPPING); String defaultValue = getParameterAsString(PARAMETER_DEFAULT_VALUE); if (nominal) { for (AttributeMetaData amd : emd.getAllAttributes()) { Set<String> valueSet = new TreeSet<String>(); for (String value : amd.getValueSet()) { String mappedValue = mappings.get(value); if (useValueRegex) { for (Entry<Pattern, String> patternEntry : patternMappings.entrySet()) { Matcher matcher = patternEntry.getKey().matcher(value); if (matcher.matches()) { mappedValue = patternEntry.getValue(); break; } } } if (mappedValue == null) { if (defaultMappingAdded) { if (defaultValue.equals("?")) { } else { valueSet.add(defaultValue); } } else { valueSet.add(value); } } else { valueSet.add(mappedValue); } } amd.setValueSet(valueSet, SetRelation.SUBSET); } } else { HashMap<Double, Double> numericalValueMapping = new HashMap<Double, Double>(); for (Entry<String, String> entry : mappings.entrySet()) { double oldValue = Double.NaN; double newValue = Double.NaN; if (!entry.getKey().equals("?")) { try { oldValue = Double.valueOf(entry.getKey()); } catch (NumberFormatException e) { this.addError(new SimpleProcessSetupError(Severity.ERROR, AttributeValueMapper.this.getPortOwner(), "mapping_must_be_number", entry.getKey())); continue; } } if (!entry.getValue().equals("?")) { try { newValue = Double.valueOf(entry.getValue()); } catch (NumberFormatException e) { this.addError(new SimpleProcessSetupError(Severity.ERROR, AttributeValueMapper.this.getPortOwner(), "mapping_must_be_number", entry.getValue())); continue; } } numericalValueMapping.put(oldValue, newValue); } double numericalDefaultValue = Double.NaN; if (defaultMappingAdded && !defaultValue.equals("?")) { numericalDefaultValue = Double.valueOf(defaultValue); } for (AttributeMetaData amd : emd.getAllAttributes()) { double lower = amd.getValueRange().getLower(); double upper = amd.getValueRange().getUpper(); double mappedLower = Double.POSITIVE_INFINITY; double mappedUpper = Double.NEGATIVE_INFINITY; for (Double value : numericalValueMapping.values()) { if (value < mappedLower) { mappedLower = value; } if (value > mappedUpper) { mappedUpper = value; } } if (!Double.isNaN(numericalDefaultValue) && numericalDefaultValue < mappedLower) { mappedLower = numericalDefaultValue; } if (!Double.isNaN(numericalDefaultValue) && numericalDefaultValue > mappedUpper) { mappedUpper = numericalDefaultValue; } amd.setValueRange(new Range(Math.min(lower, mappedLower), Math.max(upper, mappedUpper)), SetRelation.SUBSET); } } } catch (UndefinedParameterError e) { } return emd; } @Override public ExampleSet applyOnFiltered(ExampleSet exampleSet) throws OperatorException { boolean first = true; boolean nominal = false; LinkedHashMap<Attribute, Attribute> attributeMap = new LinkedHashMap<Attribute, Attribute>(); for (Attribute oldAttribute : exampleSet.getAttributes()) { if (first) { nominal = oldAttribute.isNominal(); first = false; } else { if (nominal != oldAttribute.isNominal()) { throw new UserError(this, 126); } } Attribute newAttribute = AttributeFactory.createAttribute(oldAttribute.getValueType()); attributeMap.put(oldAttribute, newAttribute); } boolean useValueRegex = getParameterAsBoolean(PARAMETER_CONSIDER_REGULAR_EXPRESSIONS); List<String[]> mappingParameterList = getParameterList(PARAMETER_VALUE_MAPPINGS); String replaceWhat = getParameterAsString(PARAMETER_REPLACE_WHAT); String replaceBy = getParameterAsString(PARAMETER_REPLACE_BY); HashMap<String, String> mappings = new LinkedHashMap<String, String>(); HashMap<Pattern, String> patternMappings = new LinkedHashMap<Pattern, String>(); if (replaceWhat != null && replaceBy != null && !replaceWhat.equals("") && !replaceBy.equals("")) { mappings.put(replaceWhat, replaceBy); if (useValueRegex) { try { Pattern valuePattern = Pattern.compile(replaceWhat); patternMappings.put(valuePattern, replaceBy); } catch (PatternSyntaxException e) { throw new UserError(this, 206, replaceWhat, e.getMessage()); } } } Iterator<String[]> listIterator = mappingParameterList.iterator(); int j = 0; while (listIterator.hasNext()) { String[] pair = listIterator.next(); replaceWhat = pair[0]; replaceBy = pair[1]; mappings.put(replaceWhat,replaceBy); if (useValueRegex) { try { Pattern valuePattern = Pattern.compile(replaceWhat); patternMappings.put(valuePattern, replaceBy); } catch (PatternSyntaxException e) { throw new UserError(this, 206, replaceWhat, e.getMessage()); } } j++; } boolean defaultMappingAdded = getParameterAsBoolean(PARAMETER_ADD_DEFAULT_MAPPING); String defaultValue = getParameterAsString(PARAMETER_DEFAULT_VALUE); if (defaultMappingAdded) { if (defaultValue == null || defaultValue.equals("")) { throw new UserError(this, 201, new Object[] { PARAMETER_ADD_DEFAULT_MAPPING, "true", PARAMETER_DEFAULT_VALUE }); } } if (attributeMap.size() > 0) { if (nominal) { for (Entry<Attribute, Attribute> entry : attributeMap.entrySet()) { Attribute oldAttribute = entry.getKey(); Attribute newAttribute = entry.getValue(); exampleSet.getExampleTable().addAttribute(newAttribute); exampleSet.getAttributes().addRegular(newAttribute); for (Example example : exampleSet) { double value = example.getValue(oldAttribute); String stringValue = null; if (Double.isNaN(value)) { stringValue = "?"; } else { stringValue = oldAttribute.getMapping().mapIndex((int) value); } String mappedValue = mappings.get(stringValue); if (useValueRegex) { for (Entry<Pattern, String> patternEntry : patternMappings.entrySet()) { Matcher matcher = patternEntry.getKey().matcher(stringValue); if (matcher.matches()) { mappedValue = patternEntry.getValue(); break; } } } if (mappedValue == null) { if (stringValue.equals("?")) { example.setValue(newAttribute, Double.NaN); } else { if (defaultMappingAdded) { if (defaultValue.equals("?")) { example.setValue(newAttribute, Double.NaN); } else { example.setValue(newAttribute, defaultValue); } } else { example.setValue(newAttribute, newAttribute.getMapping().mapString(stringValue)); } } } else { if (mappedValue.equals("?")) { example.setValue(newAttribute, Double.NaN); } else { example.setValue(newAttribute, newAttribute.getMapping().mapString(mappedValue)); } } checkForStop(); } exampleSet.getAttributes().remove(oldAttribute); newAttribute.setName(oldAttribute.getName()); } } else { HashMap<Double, Double> numericalValueMapping = new HashMap<Double, Double>(); for (Entry<String, String> entry : mappings.entrySet()) { double oldValue = Double.NaN; double newValue = Double.NaN; if (!entry.getKey().equals("?")) { oldValue = Double.valueOf(entry.getKey()); } if (!entry.getValue().equals("?")) { newValue = Double.valueOf(entry.getValue()); } numericalValueMapping.put(oldValue, newValue); } double numericalDefaultValue = Double.NaN; if (defaultMappingAdded && !defaultValue.equals("?")) { numericalDefaultValue = Double.valueOf(defaultValue); } for (Entry<Attribute, Attribute> entry : attributeMap.entrySet()) { Attribute oldAttribute = entry.getKey(); Attribute newAttribute = entry.getValue(); exampleSet.getExampleTable().addAttribute(newAttribute); exampleSet.getAttributes().addRegular(newAttribute); for (Example example : exampleSet) { double value = example.getValue(oldAttribute); Double mappedValue = numericalValueMapping.get(Double.valueOf(value)); if (mappedValue == null) { if (defaultMappingAdded) { example.setValue(newAttribute, numericalDefaultValue); } else { example.setValue(newAttribute, value); } } else { example.setValue(newAttribute, mappedValue); } checkForStop(); } exampleSet.getAttributes().remove(oldAttribute); newAttribute.setName(oldAttribute.getName()); } } } return exampleSet; } @Override protected int[] getFilterValueTypes() { return new int[] { Ontology.ATTRIBUTE_VALUE }; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeList(PARAMETER_VALUE_MAPPINGS, "The value mappings.", new ParameterTypeString(PARAMETER_OLD_VALUES, "The original values which should be replaced.", false), new ParameterTypeString(PARAMETER_NEW_VALUES, "Specifies the new value", false)); type.setExpert(false); types.add(type); type = new ParameterTypeString(PARAMETER_REPLACE_WHAT, "All occurrences of this value will be replaced.", true); type.setExpert(false); types.add(type); type = new ParameterTypeString(PARAMETER_REPLACE_BY, "The new attribute value to use.", true); type.setExpert(false); types.add(type); types.add(new ParameterTypeBoolean(PARAMETER_CONSIDER_REGULAR_EXPRESSIONS, "Enables matching based on regular expressions; original values may be specified as regular expressions.", false)); type = new ParameterTypeBoolean(PARAMETER_ADD_DEFAULT_MAPPING, "If set to true, all original values which are not listed in the value mappings list are mapped to the default value.", false); type.setExpert(false); types.add(type); type = new ParameterTypeString(PARAMETER_DEFAULT_VALUE, "The default value all original values are mapped to, if add_default_mapping is set to true.", true); type.setExpert(false); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_ADD_DEFAULT_MAPPING, true, true)); types.add(type); return types; } @Override public boolean writesIntoExistingData() { return false; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), AttributeValueMapper.class, null); } }