/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SetRelation;
import com.rapidminer.operator.preprocessing.AbstractDataProcessing;
import com.rapidminer.operator.tools.AttributeSubsetSelector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeRegexp;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
/**
* <p>This operator creates new attributes from a nominal attribute by dividing
* the nominal values into parts according to a split criterion (regular expression).
* This operator provides two different modes, depending on the setting of the
* parameter "split_mode".</p>
*
* <h3>Ordered Splits</h3>
* <p>In the first split mode, called ordered_split, the resulting attributes get
* the name of the original attribute together with a number indicating the order.
* For example, if the original data contained the values<br/><br/>
*
* attribute-name <br/>
* -------------- <br/>
* value1 <br/>
* value2, value3 <br/>
* value3 <br/>
* <br/>
*
* and should be divided by the separating commas, the resulting attributes would be
* attribute-name_1 and attribute-name_2 (one attribute per position, up to the
* largest number of parts found in any value) with the tuples
* (value1, ?), (value2, value3), and (value3, ?), respectively.
* This mode is useful if the original values indicated some order like, for example,
* a preference.
* </p>
*
* <h3>Unordered Splits</h3>
* <p>In the second split mode, called unordered_split, the resulting attributes get
* the name of the original attribute together with the value for each of the occurring
* values. For example, if the original data contained the values<br/><br/>
*
* attribute-name <br/>
* -------------- <br/>
* value1 <br/>
* value2, value3 <br/>
* value3 <br/>
* <br/>
*
* and again should be divided by the separating commas, the resulting attributes would
* be attribute-name_value1, attribute-name_value2, and
* attribute-name_value3 with the tuples (true, false, false), (false, true, true), and
* (false, false, true), respectively.
* This mode is useful if the order is not important but the goal is a basket like
* data set containing all occurring values.
* </p>
*
* @author Ingo Mierswa
*/
public class AttributeValueSplit extends AbstractDataProcessing {

    /** Key of the parameter holding the regular expression used to split the nominal values. */
    public static final String PARAMETER_SPLIT_PATTERN = "split_pattern";

    /** Key of the parameter selecting the split mode (an index into {@link #SPLIT_MODES}). */
    public static final String PARAMETER_SPLIT_MODE = "split_mode";

    /** Names of the split modes, indexed by {@link #SPLIT_MODE_ORDERED} and {@link #SPLIT_MODE_UNORDERED}. */
    public static final String[] SPLIT_MODES = new String[] {
        "ordered_split",
        "unordered_split"
    };

    /** Mode index: one new attribute per part position ({@code name_1}, {@code name_2}, ...). */
    public static final int SPLIT_MODE_ORDERED = 0;

    /** Mode index: one new true/false attribute per distinct part ({@code name_part}). */
    public static final int SPLIT_MODE_UNORDERED = 1;

    /** Nominal value written to an unordered-split attribute whose part does not occur in the example. */
    private static final String VALUE_FALSE = "false";

    /** Nominal value written to an unordered-split attribute whose part occurs in the example. */
    private static final String VALUE_TRUE = "true";

    /** Selects the attributes this operator works on. */
    private final AttributeSubsetSelector attributeSubsetSelector = new AttributeSubsetSelector(this, getExampleSetInputPort(), Ontology.NOMINAL);

    public AttributeValueSplit(OperatorDescription description) {
        super(description);
    }

    /**
     * Predicts the attributes produced by {@link #apply(ExampleSet)} from the known value sets
     * of the selected nominal, non-special attributes. The prediction follows the configured
     * split mode using the same dispatch as {@code apply}: ordered mode yields nominal
     * attributes named {@code name_<position>}, every other mode yields binominal attributes
     * named {@code name_<part>}.
     *
     * @param metaData the incoming meta data; it is modified in place and returned
     * @return the modified {@code metaData}; it is returned untouched if the split pattern is
     *         not a valid regular expression
     * @throws UndefinedParameterError if a required parameter is not set
     */
    @Override
    protected MetaData modifyMetaData(ExampleSetMetaData metaData) throws UndefinedParameterError {
        String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN);
        Pattern splittingPattern;
        try {
            splittingPattern = Pattern.compile(splittingRegex);
        } catch (PatternSyntaxException ignored) {
            // Deliberately best-effort: the meta data is left as it is here, and the
            // invalid pattern is reported as a user error when apply() is executed.
            return metaData;
        }
        boolean ordered = getParameterAsInt(PARAMETER_SPLIT_MODE) == SPLIT_MODE_ORDERED;

        ExampleSetMetaData subset = attributeSubsetSelector.getMetaDataSubset(metaData, false);
        SetRelation attributeSetRelation = SetRelation.EQUAL;
        for (AttributeMetaData amd : subset.getAllAttributes()) {
            if (amd.isSpecial() || !amd.isNominal()) {
                continue;
            }
            // The certainty about the resulting attribute set is limited by the
            // certainty about the value set the new attributes are derived from.
            attributeSetRelation = attributeSetRelation.merge(amd.getValueSetRelation());
            if (ordered) {
                addOrderedSplitMetaData(metaData, amd, splittingPattern);
            } else {
                addUnorderedSplitMetaData(metaData, amd, splittingPattern);
            }
        }
        metaData.mergeSetRelation(attributeSetRelation);
        return metaData;
    }

    /** Replaces {@code amd} in {@code metaData} by one nominal attribute per part position. */
    private void addOrderedSplitMetaData(ExampleSetMetaData metaData, AttributeMetaData amd, Pattern splittingPattern) {
        SetRelation valueSetRelation = amd.getValueSetRelation();
        int maxNumber = 0;
        if (valueSetRelation == SetRelation.SUBSET || valueSetRelation == SetRelation.UNKNOWN) {
            // The value set is not known exactly: assume at least three parts.
            maxNumber = 3;
        }
        String[][] valueParts = new String[amd.getValueSet().size()][];
        int valueIndex = 0;
        for (String value : amd.getValueSet()) {
            valueParts[valueIndex] = splittingPattern.split(value);
            maxNumber = Math.max(maxNumber, valueParts[valueIndex].length);
            valueIndex++;
        }

        // removing old attribute
        metaData.removeAttribute(metaData.getAttributeByName(amd.getName()));

        // creating new attributes, one per position
        for (int position = 0; position < maxNumber; position++) {
            AttributeMetaData newAmd = new AttributeMetaData(amd.getName() + "_" + (position + 1), Ontology.NOMINAL);
            Set<String> valueSet = new HashSet<String>();
            for (String[] parts : valueParts) {
                if (parts.length > position) {
                    valueSet.add(parts[position]);
                }
            }
            newAmd.setValueSet(valueSet, valueSetRelation);
            if (position > 0) {
                // Values with fewer parts leave the later positions missing.
                newAmd.getNumberOfMissingValues().increaseByUnknownAmount();
            }
            metaData.addAttribute(newAmd);
        }
    }

    /** Replaces {@code amd} in {@code metaData} by one binominal attribute per distinct known part. */
    private void addUnorderedSplitMetaData(ExampleSetMetaData metaData, AttributeMetaData amd, Pattern splittingPattern) {
        // Sorted, as in unorderedSplit(), so the predicted attribute order matches the applied one.
        SortedSet<String> allParts = new TreeSet<String>();
        for (String value : amd.getValueSet()) {
            for (String part : splittingPattern.split(value)) {
                allParts.add(part);
            }
        }

        // removing old attribute
        metaData.removeAttribute(metaData.getAttributeByName(amd.getName()));

        // creating new attributes, one per distinct part
        for (String part : allParts) {
            AttributeMetaData newAmd = new AttributeMetaData(amd.getName() + "_" + part, Ontology.BINOMINAL);
            Set<String> valueSet = new HashSet<String>();
            valueSet.add(VALUE_FALSE);
            valueSet.add(VALUE_TRUE);
            // unorderedSplit() registers both values in the mapping of every new attribute.
            newAmd.setValueSet(valueSet, SetRelation.EQUAL);
            metaData.addAttribute(newAmd);
        }
    }

    /**
     * Splits the values of every selected nominal attribute with the configured pattern.
     * Ordered mode is used for {@link #SPLIT_MODE_ORDERED}; any other mode value is treated
     * as unordered.
     *
     * @param exampleSet the example set to process; it is modified and returned
     * @return the given {@code exampleSet}
     * @throws OperatorException a {@link UserError} (code 206) if the split pattern is not a
     *         valid regular expression
     */
    @Override
    public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
        String splittingRegex = getParameterAsString(PARAMETER_SPLIT_PATTERN);
        Pattern splittingPattern = null;
        try {
            splittingPattern = Pattern.compile(splittingRegex);
        } catch (PatternSyntaxException e) {
            throw new UserError(this, 206, splittingRegex, e.getMessage());
        }

        int type = getParameterAsInt(PARAMETER_SPLIT_MODE);
        for (Attribute attribute : attributeSubsetSelector.getAttributeSubset(exampleSet, false)) {
            if (attribute.isNominal()) {
                switch (type) {
                    case SPLIT_MODE_ORDERED:
                        orderedSplit(exampleSet, attribute, splittingPattern);
                        break;
                    case SPLIT_MODE_UNORDERED:
                    default:
                        unorderedSplit(exampleSet, attribute, splittingPattern);
                        break;
                }
            }
        }
        return exampleSet;
    }

    /**
     * Replaces {@code attribute} by nominal attributes {@code name_1 ... name_n}, where n is
     * the largest number of parts any value splits into. Positions an example has no part
     * for are set to missing. Nothing is changed if no value splits into at least two parts.
     */
    private void orderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) {
        // NOTE(review): how getNominalValue renders a missing value is not visible here;
        // confirm that missing values are not split like ordinary strings.
        // check for maximum number
        int maxNumber = 0;
        for (Example example : exampleSet) {
            String[] parts = splittingPattern.split(example.getNominalValue(attribute));
            maxNumber = Math.max(maxNumber, parts.length);
        }
        if (maxNumber < 2) {
            return;
        }

        // create new attributes
        Attribute[] newAttributes = new Attribute[maxNumber];
        for (int a = 0; a < maxNumber; a++) {
            newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + (a + 1), Ontology.NOMINAL);
            exampleSet.getExampleTable().addAttribute(newAttributes[a]);
            exampleSet.getAttributes().addRegular(newAttributes[a]);
        }

        // fill new attributes with values
        for (Example example : exampleSet) {
            String[] parts = splittingPattern.split(example.getNominalValue(attribute));
            int p = 0;
            for (String part : parts) {
                example.setValue(newAttributes[p], newAttributes[p].getMapping().mapString(part));
                p++;
            }
            // pad the remaining positions with missing values
            while (p < maxNumber) {
                example.setValue(newAttributes[p], Double.NaN);
                p++;
            }
        }
        exampleSet.getAttributes().remove(attribute);
    }

    /**
     * Replaces {@code attribute} by one binominal attribute {@code name_part} per distinct
     * part (in sorted order), set to "true" for examples whose value contains the part and
     * to "false" otherwise. Nothing is changed if no value splits into at least two parts.
     */
    private void unorderedSplit(ExampleSet exampleSet, Attribute attribute, Pattern splittingPattern) {
        // NOTE(review): how getNominalValue renders a missing value is not visible here;
        // confirm that missing values are not split like ordinary strings.
        // collect all distinct parts and check whether anything is split at all
        SortedSet<String> allValues = new TreeSet<String>();
        boolean splitFound = false;
        for (Example example : exampleSet) {
            String[] parts = splittingPattern.split(example.getNominalValue(attribute));
            for (String part : parts) {
                allValues.add(part);
            }
            if (parts.length > 1) {
                splitFound = true;
            }
        }
        if (!splitFound) {
            return;
        }

        // create new attributes
        Attribute[] newAttributes = new Attribute[allValues.size()];
        Map<String, Integer> indexMap = new HashMap<String, Integer>();
        int a = 0;
        for (String value : allValues) {
            newAttributes[a] = AttributeFactory.createAttribute(attribute.getName() + "_" + value, Ontology.BINOMINAL);
            // register both values up front, "false" first
            newAttributes[a].getMapping().mapString(VALUE_FALSE);
            newAttributes[a].getMapping().mapString(VALUE_TRUE);
            exampleSet.getExampleTable().addAttribute(newAttributes[a]);
            exampleSet.getAttributes().addRegular(newAttributes[a]);
            indexMap.put(value, a);
            a++;
        }

        // fill new attributes with values
        for (Example example : exampleSet) {
            // set all new attributes to false
            for (Attribute newAttribute : newAttributes) {
                example.setValue(newAttribute, newAttribute.getMapping().mapString(VALUE_FALSE));
            }
            // then switch on the attributes of the parts that actually occur
            String[] parts = splittingPattern.split(example.getNominalValue(attribute));
            for (String part : parts) {
                Attribute newAttribute = newAttributes[indexMap.get(part)];
                example.setValue(newAttribute, newAttribute.getMapping().mapString(VALUE_TRUE));
            }
        }
        exampleSet.getAttributes().remove(attribute);
    }

    /**
     * Returns the inherited parameters followed by the attribute subset selector's
     * parameters, the split pattern (default ",") and the split mode (default ordered).
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.addAll(attributeSubsetSelector.getParameterTypes());

        ParameterType type = new ParameterTypeRegexp(PARAMETER_SPLIT_PATTERN, "The pattern which is used for dividing the nominal values into different parts.", ",");
        type.setExpert(false);
        types.add(type);

        type = new ParameterTypeCategory(PARAMETER_SPLIT_MODE, "The split mode of this operator, either ordered splits (keeping the original order) or unordered (keeping basket-like information).", SPLIT_MODES, SPLIT_MODE_ORDERED);
        type.setExpert(false);
        types.add(type);
        return types;
    }

    /** Always returns {@code false}. */
    @Override
    public boolean writesIntoExistingData() {
        return false;
    }

    /** Delegates to the {@link OperatorResourceConsumptionHandler} lookup for this operator class. */
    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), AttributeValueSplit.class, attributeSubsetSelector);
    }
}