/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.preprocessing.filter;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorVersion;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SetRelation;
import com.rapidminer.operator.preprocessing.AbstractDataProcessing;
import com.rapidminer.operator.tools.AttributeSubsetSelector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeRegexp;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;


/**
 * <p>
 * This operator creates new attributes from a nominal attribute by dividing the nominal values into
 * parts according to a split criterion (regular expression). This operator provides two different
 * modes, depending on the setting of the parameter "split_mode".
 * </p>
 *
 * <h3>Ordered Splits</h3>
 * <p>
 * In the first split mode, called ordered_split, the resulting attributes get the name of the
 * original attribute together with a number indicating the order. For example, if the original data
 * contained the values<br/>
 * <br/>
 *
 * attribute-name <br/>
 * -------------- <br/>
 * value1 <br/>
 * value2, value3 <br/>
 * value3 <br/>
 * <br/>
 *
 * and should be divided by the separating commas, the resulting attributes would be
 * attribute-name_1 and attribute-name_2 with the tuples (value1, ?), (value2, value3), and
 * (value3, ?), respectively: one attribute is created per position, up to the largest number of
 * parts found in any value. This mode is useful if the original values indicated some order like,
 * for example, a preference.
 * </p>
 *
 * <h3>Unordered Splits</h3>
 * <p>
 * In the second split mode, called unordered_split, the resulting attributes get the name of the
 * original attribute together with the value for each of the occurring values. For example, if the
 * original data contained the values<br/>
 * <br/>
 *
 * attribute-name <br/>
 * -------------- <br/>
 * value1 <br/>
 * value2, value3 <br/>
 * value3 <br/>
 * <br/>
 *
 * and again should be divided by the separating commas, the resulting attributes would be
 * attribute-name_value1, attribute-name_value2, and attribute-name_value3 with the tuples (true,
 * false, false), (false, true, true), and (false, false, true), respectively. This mode is useful
 * if the order is not important but the goal is a basket like data set containing all occurring
 * values.
 * </p>
 *
 * <p>
 * In both modes the original attribute is only replaced if at least one of its values is actually
 * split into two or more parts; otherwise it is left untouched.
 * </p>
 *
 * @author Ingo Mierswa, Nils Woehler
 */
public class AttributeValueSplit extends AbstractDataProcessing {

    /** Key of the parameter holding the regular expression used to split the values. */
    public static final String PARAMETER_SPLIT_PATTERN = "split_pattern";

    /** Key of the parameter selecting ordered or unordered splitting. */
    public static final String PARAMETER_SPLIT_MODE = "split_mode";

    /** Display names of the split modes, indexed by the {@code SPLIT_MODE_*} constants. */
    public static final String[] SPLIT_MODES = new String[] { "ordered_split", "unordered_split" };

    public static final int SPLIT_MODE_ORDERED = 0;

    public static final int SPLIT_MODE_UNORDERED = 1;

    /** last version where selected but missing attributes were silently ignored */
    private static final OperatorVersion OPERATOR_VERSION_6_0_3 = new OperatorVersion(6, 0, 3);

    /**
     * Minimum number of ordered split attributes announced in the meta data when the known value
     * set is marked as {@link SetRelation#SUBSET} or {@link SetRelation#UNKNOWN}.
     */
    private static final int ORDERED_SPLIT_MIN_ATTRIBUTES_IF_INCOMPLETE = 3;

    private final AttributeSubsetSelector attributeSubsetSelector = new AttributeSubsetSelector(this,
            getExampleSetInputPort(), Ontology.NOMINAL);

    public AttributeValueSplit(OperatorDescription description) {
        super(description);
    }

    /**
     * Predicts the meta data after splitting: every selected, non-special nominal attribute is
     * removed and replaced by the attributes derived from its known value set.
     *
     * <p>
     * NOTE(review): the meta data always replaces the attribute, whereas {@link #apply(ExampleSet)}
     * keeps the original attribute when none of its values is split into two or more parts.
     * </p>
     *
     * @param metaData
     *            the incoming meta data, modified in place
     * @return the modified {@code metaData}; unchanged if the split pattern is not a valid regular
     *         expression
     * @throws UndefinedParameterError
     *             if a required parameter is not set
     */
    @Override
    protected MetaData modifyMetaData(ExampleSetMetaData metaData) throws UndefinedParameterError {
        String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN);
        try {
            Pattern splittingPattern = Pattern.compile(splittingRegex);
            ExampleSetMetaData subset = attributeSubsetSelector.getMetaDataSubset(metaData, false, true);
            SetRelation attributeSetRelation = SetRelation.EQUAL;
            int type = getParameterAsInt(PARAMETER_SPLIT_MODE);
            for (AttributeMetaData amd : subset.getAllAttributes()) {
                if (amd.isSpecial() || !amd.isNominal()) {
                    continue;
                }
                attributeSetRelation = attributeSetRelation.merge(amd.getValueSetRelation());

                // removing old attribute
                metaData.removeAttribute(metaData.getAttributeByName(amd.getName()));

                switch (type) {
                    case SPLIT_MODE_ORDERED:
                        addOrderedSplitMetaData(metaData, amd, splittingPattern);
                        break;
                    case SPLIT_MODE_UNORDERED:
                        addUnorderedSplitMetaData(metaData, amd, splittingPattern);
                        break;
                    default:
                        break;
                }
            }
            metaData.mergeSetRelation(attributeSetRelation);
        } catch (PatternSyntaxException ignored) {
            // Intentionally ignored: meta data propagation is best effort. The invalid pattern is
            // reported to the user as a UserError when the operator is executed, see apply().
        }
        return metaData;
    }

    /**
     * Adds the meta data of the attributes {@code <name>_1 .. <name>_n} created by an ordered
     * split, where {@code n} is the largest number of parts of any known value.
     */
    private void addOrderedSplitMetaData(ExampleSetMetaData metaData, AttributeMetaData amd,
            Pattern splittingPattern) {
        int maxNumber = 0;
        if (amd.getValueSetRelation() == SetRelation.SUBSET || amd.getValueSetRelation() == SetRelation.UNKNOWN) {
            // the known value set is incomplete, so announce at least a fixed number of attributes
            maxNumber = ORDERED_SPLIT_MIN_ATTRIBUTES_IF_INCOMPLETE;
        }

        String[][] valueParts = new String[amd.getValueSet().size()][];
        int valueIndex = 0;
        for (String value : amd.getValueSet()) {
            valueParts[valueIndex] = splittingPattern.split(value);
            maxNumber = Math.max(maxNumber, valueParts[valueIndex].length);
            valueIndex++;
        }

        // creating new attributes
        for (int position = 0; position < maxNumber; position++) {
            AttributeMetaData newAmd = new AttributeMetaData(amd.getName() + "_" + (position + 1), Ontology.NOMINAL);
            Set<String> valueSet = new HashSet<>();
            for (String[] parts : valueParts) {
                if (parts.length > position) {
                    valueSet.add(parts[position]);
                }
            }
            newAmd.setValueSet(valueSet, amd.getValueSetRelation());
            if (position > 0) {
                // values with fewer parts leave the later positions missing
                newAmd.getNumberOfMissingValues().increaseByUnknownAmount();
            }
            metaData.addAttribute(newAmd);
        }
    }

    /**
     * Adds the meta data of the attributes {@code <name>_<part>} created by an unordered split, one
     * per distinct part of the known values, in sorted order (the same order
     * {@link #unorderedSplit(ExampleSet, Attribute, Pattern)} uses).
     *
     * <p>
     * NOTE(review): the attributes are announced as {@link Ontology#NOMINAL} while the unordered
     * split creates them as {@link Ontology#BINOMINAL}.
     * </p>
     */
    private void addUnorderedSplitMetaData(ExampleSetMetaData metaData, AttributeMetaData amd,
            Pattern splittingPattern) {
        SortedSet<String> splitValues = new TreeSet<>();
        for (String value : amd.getValueSet()) {
            splitValues.addAll(Arrays.asList(splittingPattern.split(value)));
        }

        // creating new attributes
        for (String splitValue : splitValues) {
            AttributeMetaData newAmd = new AttributeMetaData(amd.getName() + "_" + splitValue, Ontology.NOMINAL);
            Set<String> valueSet = new HashSet<>();
            valueSet.add("true");
            valueSet.add("false");
            newAmd.setValueSet(valueSet, amd.getValueSetRelation());
            metaData.addAttribute(newAmd);
        }
    }

    /**
     * Splits every selected nominal attribute of the given example set according to the configured
     * pattern and mode.
     *
     * @param exampleSet
     *            the example set to modify
     * @return the same example set instance
     * @throws OperatorException
     *             a {@link UserError} (code 206) if the split pattern is not a valid regular
     *             expression, or any error raised while reading parameters or selecting attributes
     */
    @Override
    public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
        String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN);
        Pattern splittingPattern;
        try {
            splittingPattern = Pattern.compile(splittingRegex);
        } catch (PatternSyntaxException e) {
            // keep the original exception as cause so the stack trace is not lost
            throw new UserError(this, e, 206, splittingRegex, e.getMessage());
        }

        int type = getParameterAsInt(PARAMETER_SPLIT_MODE);
        // Until version 6.0.3 there was thrown no UserError when attributes were missing.
        // Compatibility check to avoid older processes to fail.
        boolean errorOnMissing = !getCompatibilityLevel().isAtMost(OPERATOR_VERSION_6_0_3);
        for (Attribute attribute : attributeSubsetSelector.getAttributeSubset(exampleSet, false, errorOnMissing)) {
            if (attribute.isNominal()) {
                switch (type) {
                    case SPLIT_MODE_ORDERED:
                        orderedSplit(exampleSet, attribute, splittingPattern);
                        break;
                    case SPLIT_MODE_UNORDERED:
                    default:
                        unorderedSplit(exampleSet, attribute, splittingPattern);
                        break;
                }
            }
        }

        return exampleSet;
    }

    /**
     * Replaces {@code attribute} by the nominal attributes {@code <name>_1 .. <name>_n} holding the
     * parts of each value in their original order. Positions a value does not reach, and all
     * positions of a missing value, are set to missing. Does nothing if no value has at least two
     * parts.
     */
    private void orderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) {
        // check for maximum number
        int maxNumber = 0;
        for (Example example : exampleSet) {
            String value = example.getNominalValue(attribute);
            String[] parts = splittingPattern.split(value);
            maxNumber = Math.max(maxNumber, parts.length);
        }

        if (maxNumber < 2) {
            // nothing to split, keep the original attribute
            return;
        }

        // create new attributes
        Attribute[] newAttributes = new Attribute[maxNumber];
        for (int a = 0; a < maxNumber; a++) {
            newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + (a + 1),
                    Ontology.NOMINAL);
            exampleSet.getExampleTable().addAttribute(newAttributes[a]);
            exampleSet.getAttributes().addRegular(newAttributes[a]);
        }

        // fill new attributes with values
        for (Example example : exampleSet) {
            int p = 0;
            // check if value is missing, otherwise a "?" string could be filled in.
            if (!Double.isNaN(example.getValue(attribute))) {
                String value = example.getNominalValue(attribute);
                String[] parts = splittingPattern.split(value);
                for (String part : parts) {
                    example.setValue(newAttributes[p], newAttributes[p].getMapping().mapString(part));
                    p++;
                }
            }
            // remaining positions are missing
            while (p < maxNumber) {
                example.setValue(newAttributes[p], Double.NaN);
                p++;
            }
        }

        exampleSet.getAttributes().remove(attribute);
    }

    /**
     * Replaces {@code attribute} by one binominal attribute {@code <name>_<part>} per distinct part
     * occurring in its values (in sorted order), set to "true" for the examples whose value
     * contains that part and to "false" otherwise. Does nothing if no value has at least two parts.
     *
     * <p>
     * Examples with a missing value are skipped when collecting the parts and get "false" in all
     * new attributes. Without this check the "?" placeholder of the missing value would be treated
     * like a regular value and produce a bogus {@code <name>_?} attribute (the same problem
     * {@link #orderedSplit(ExampleSet, Attribute, Pattern)} guards against).
     * </p>
     */
    private void unorderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) {
        // collect all occurring parts
        SortedSet<String> allValues = new TreeSet<>();
        boolean splitFound = false;
        for (Example example : exampleSet) {
            if (Double.isNaN(example.getValue(attribute))) {
                // missing value: nothing to split
                continue;
            }
            String[] parts = splittingPattern.split(example.getNominalValue(attribute));
            allValues.addAll(Arrays.asList(parts));
            if (parts.length > 1) {
                splitFound = true;
            }
        }

        if (!splitFound) {
            // nothing to split, keep the original attribute
            return;
        }

        // create new attributes
        Attribute[] newAttributes = new Attribute[allValues.size()];
        Map<String, Integer> indexMap = new HashMap<>();
        int a = 0;
        for (String value : allValues) {
            newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + value,
                    Ontology.BINOMINAL);
            // map "false" first, then "true", to fix the order of the two values in the mapping
            newAttributes[a].getMapping().mapString("false");
            newAttributes[a].getMapping().mapString("true");
            exampleSet.getExampleTable().addAttribute(newAttributes[a]);
            exampleSet.getAttributes().addRegular(newAttributes[a]);
            indexMap.put(value, a);
            a++;
        }

        // fill new attributes with values
        for (Example example : exampleSet) {
            // set all new attributes to false
            for (Attribute newAttribute : newAttributes) {
                example.setValue(newAttribute, newAttribute.getMapping().mapString("false"));
            }

            if (Double.isNaN(example.getValue(attribute))) {
                // missing value: contains none of the parts
                continue;
            }

            String[] parts = splittingPattern.split(example.getNominalValue(attribute));
            for (String part : parts) {
                Attribute newAttribute = newAttributes[indexMap.get(part)];
                example.setValue(newAttribute, newAttribute.getMapping().mapString("true"));
            }
        }

        exampleSet.getAttributes().remove(attribute);
    }

    /**
     * Returns the parameters of the super class and of the attribute subset selector, followed by
     * the split pattern (default ",") and the split mode (default ordered).
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.addAll(attributeSubsetSelector.getParameterTypes());

        ParameterType type = new ParameterTypeRegexp(PARAMETER_SPLIT_PATTERN,
                "The pattern which is used for dividing the nominal values into different parts.", ",");
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeCategory(PARAMETER_SPLIT_MODE,
                "The split mode of this operator, either ordered splits (keeping the original order) or unordered (keeping basket-like information).",
                SPLIT_MODES, SPLIT_MODE_ORDERED);
        type.setExpert(false);
        types.add(type);
        return types;
    }

    @Override
    public boolean writesIntoExistingData() {
        return false;
    }

    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(),
                AttributeValueSplit.class, attributeSubsetSelector);
    }

    /**
     * Appends version 6.0.3 to the incompatible version changes of the super class, see the
     * compatibility check in {@link #apply(ExampleSet)}.
     */
    @Override
    public OperatorVersion[] getIncompatibleVersionChanges() {
        OperatorVersion[] changes = super.getIncompatibleVersionChanges();
        changes = Arrays.copyOf(changes, changes.length + 1);
        changes[changes.length - 1] = OPERATOR_VERSION_6_0_3;
        return changes;
    }
}