/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.filter; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.MetaData; import com.rapidminer.operator.ports.metadata.SetRelation; import com.rapidminer.operator.preprocessing.AbstractDataProcessing; import com.rapidminer.operator.tools.AttributeSubsetSelector; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeRegexp; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; /** * <p>This operator creates new attributes from a nominal attribute by dividing * the nominal values into parts according to a split criterion (regular expression). * This operator provides two different modes, depending on the setting of the * parameter "splitting_mode".</p> * * <h3>Ordered Splits</h3> * <p>In the first split mode, called ordered_split, the resulting attributes get * the name of the original attribute together with a number indicating the order. * For example, if the original data contained the values<br/><br/> * * attribute-name <br/> * -------------- <br/> * value1 <br/> * value2, value3 <br/> * value3 <br/> * <br/> * * and should be divided by the separating commas, the resulting attributes would be * attribute-name1, attribute-name2, attribute-name3 with the tuples * (value1, ?, ?), (value2, value3, ?), and (value3, ?, ?), respectively. * This mode is useful if the original values indicated some order like, for example, * a preference. * </p> * * <h3>Unordered Splits</h3> * <p>In the second split mode, called unordered_split, the resulting attributes get * the name of the original attribute together with the value for each of the occurring * values. For example, if the original data contained the values<br/><br/> * * attribute-name <br/> * -------------- <br/> * value1 <br/> * value2, value3 <br/> * value3 <br/> * <br/> * * and again should be divided by the separating commas, the resulting attributes would * be attribute-name-value1, attribute-name-value2, and * attribute-name-value3 with the tuples (true, false, false), (false, true, true), and * (false, false, true), respectively. * This mode is useful if the order is not important but the goal is a basket like * data set containing all occurring values. * </p> * * @author Ingo Mierswa */ public class AttributeValueSplit extends AbstractDataProcessing { public static final String PARAMETER_SPLIT_PATTERN = "split_pattern"; public static final String PARAMETER_SPLIT_MODE = "split_mode"; public final static String[] SPLIT_MODES = new String[] { "ordered_split", "unordered_split" }; public final static int SPLIT_MODE_ORDERED = 0; public final static int SPLIT_MODE_UNORDERED = 1; private AttributeSubsetSelector attributeSubsetSelector = new AttributeSubsetSelector(this, getExampleSetInputPort(), Ontology.NOMINAL); public AttributeValueSplit(OperatorDescription description) { super(description); } @Override protected MetaData modifyMetaData(ExampleSetMetaData metaData) throws UndefinedParameterError { String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN); try { Pattern splittingPattern = Pattern.compile(splittingRegex); ExampleSetMetaData subset = attributeSubsetSelector.getMetaDataSubset(metaData, false); SetRelation attributeSetRelation = SetRelation.EQUAL; for (AttributeMetaData amd: subset.getAllAttributes()) { if (!amd.isSpecial() && amd.isNominal()) { attributeSetRelation = attributeSetRelation.merge(amd.getValueSetRelation()); int maxNumber = 0; if (amd.getValueSetRelation() == SetRelation.SUBSET || amd.getValueSetRelation() == SetRelation.UNKNOWN) maxNumber = 3; String[][] valueParts = new String[amd.getValueSet().size()][]; int i = 0; for (String value: amd.getValueSet()) { valueParts[i] = splittingPattern.split(value); maxNumber = Math.max(maxNumber, valueParts[i].length); i++; } // removing old attribute metaData.removeAttribute(metaData.getAttributeByName(amd.getName())); // creating new attributes for (i = 0; i < maxNumber; i++) { AttributeMetaData newAmd = new AttributeMetaData(amd.getName() + "_" + (i + 1), Ontology.NOMINAL); Set<String> valueSet = new HashSet<String>(); for (int value = 0; value < valueParts.length; value++) { if (valueParts[value].length > i) valueSet.add(valueParts[value][i]); } newAmd.setValueSet(valueSet, amd.getValueSetRelation()); if (i > 0) newAmd.getNumberOfMissingValues().increaseByUnknownAmount(); metaData.addAttribute(newAmd); } } } metaData.mergeSetRelation(attributeSetRelation); } catch (PatternSyntaxException e) {} return metaData; } @Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN); Pattern splittingPattern = null; try { splittingPattern = Pattern.compile(splittingRegex); } catch (PatternSyntaxException e) { throw new UserError(this, 206, splittingRegex, e.getMessage()); } int type = getParameterAsInt(PARAMETER_SPLIT_MODE); for (Attribute attribute : attributeSubsetSelector.getAttributeSubset(exampleSet, false)) { if (attribute.isNominal()) { switch (type) { case SPLIT_MODE_ORDERED: orderedSplit(exampleSet, attribute, splittingPattern); break; case SPLIT_MODE_UNORDERED: default: unorderedSplit(exampleSet, attribute, splittingPattern); break; } } } return exampleSet; } private void orderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) { // check for maximum number int maxNumber = 0; for (Example example : exampleSet) { String value = example.getNominalValue(attribute); String[] parts = splittingPattern.split(value); maxNumber = Math.max(maxNumber, parts.length); } if (maxNumber >= 2) { // create new attributes Attribute[] newAttributes = new Attribute[maxNumber]; for (int a = 0; a < maxNumber; a++) { newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + (a+1), Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(newAttributes[a]); exampleSet.getAttributes().addRegular(newAttributes[a]); } // fill new attributes with values for (Example example : exampleSet) { String value = example.getNominalValue(attribute); String[] parts = splittingPattern.split(value); int p = 0; for (String part : parts) { example.setValue(newAttributes[p], newAttributes[p].getMapping().mapString(part)); p++; } while (p < maxNumber) { example.setValue(newAttributes[p], Double.NaN); p++; } } exampleSet.getAttributes().remove(attribute); } } private void unorderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) { // check for maximum number SortedSet<String> allValues = new TreeSet<String>(); boolean splitFound = false; for (Example example : exampleSet) { String value = example.getNominalValue(attribute); String[] parts = splittingPattern.split(value); for (String part : parts) { allValues.add(part); } if (parts.length > 1) { splitFound = true; } } if (splitFound) { // create new attributes Attribute[] newAttributes = new Attribute[allValues.size()]; Map<String, Integer> indexMap = new HashMap<String, Integer>(); int a = 0; Iterator<String> v = allValues.iterator(); while (v.hasNext()) { String value = v.next(); newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + value, Ontology.BINOMINAL); newAttributes[a].getMapping().mapString("false"); newAttributes[a].getMapping().mapString("true"); exampleSet.getExampleTable().addAttribute(newAttributes[a]); exampleSet.getAttributes().addRegular(newAttributes[a]); indexMap.put(value, a); a++; } // fill new attributes with values for (Example example : exampleSet) { // set all new attributes to false for (Attribute newAttribute : newAttributes) { example.setValue(newAttribute, newAttribute.getMapping().mapString("false")); } String value = example.getNominalValue(attribute); String[] parts = splittingPattern.split(value); int p = 0; for (String part : parts) { Attribute newAttribute = newAttributes[indexMap.get(part)]; example.setValue(newAttribute, newAttribute.getMapping().mapString("true")); p++; } } exampleSet.getAttributes().remove(attribute); } } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.addAll(attributeSubsetSelector.getParameterTypes()); ParameterType type = new ParameterTypeRegexp(PARAMETER_SPLIT_PATTERN, "The pattern which is used for dividing the nominal values into different parts.", ","); type.setExpert(false); types.add(type); type = new ParameterTypeCategory(PARAMETER_SPLIT_MODE, "The split mode of this operator, either ordered splits (keeping the original order) or unordered (keeping basket-like information).", SPLIT_MODES, SPLIT_MODE_ORDERED); type.setExpert(false); types.add(type); return types; } @Override public boolean writesIntoExistingData() { return false; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), AttributeValueSplit.class, attributeSubsetSelector); } }