/*
* RapidMiner
*
* Copyright (C) 2001-2014 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeRole;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.NonSpecialAttributesExampleSet;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessSetupError.Severity;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.PortPairExtender;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.AttributeSubsetPassThroughRule;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetPassThroughRule;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.MetaDataError;
import com.rapidminer.operator.ports.metadata.MetaDataInfo;
import com.rapidminer.operator.ports.metadata.SetRelation;
import com.rapidminer.operator.ports.metadata.SimpleMetaDataError;
import com.rapidminer.operator.ports.metadata.SubprocessTransformRule;
import com.rapidminer.operator.tools.AttributeSubsetSelector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
/**
* <p>This operator can be used to select one attribute (or a subset) by defining a
* regular expression for the attribute name and applies its inner operators to
* the resulting subset. Please note that this operator will also use special
* attributes, which makes it suitable for all preprocessing steps that should
* be performed on special attributes (and are normally not performed on special
* attributes).</p>
*
* <p>This operator is also able to deliver the additional results of the inner
* operator if desired.</p>
*
* <p>Afterwards, the remaining original attributes are added
* to the resulting example set if the parameter "keep_subset_only" is set to
* false (default).</p>
*
* <p>Please note that this operator is very powerful and can be used to create
* new preprocessing schemes by combining it with other preprocessing operators.
* However, there are two major restrictions (among some others): first, since the inner result
* will be combined with the rest of the input example set, the number of
* examples (data points) is not allowed to be changed inside of the subset preprocessing.
* Second, attribute role changes will not be delivered to the outside since internally all special
* attributes will be changed to regular for the inner operators and role changes can afterwards
* not be delivered.</p>
*
* @author Ingo Mierswa, Shevek
*/
public class AttributeSubsetPreprocessing extends OperatorChain {

    /** The parameter name for "Indicates if the additional results (other than example set) of the inner operator should also be returned." */
    public static final String PARAMETER_DELIVER_INNER_RESULTS = "deliver_inner_results";

    /** The parameter name for "Indicates if the attributes which did not match the regular expression should be removed by this operator." */
    public static final String PARAMETER_KEEP_SUBSET_ONLY = "keep_subset_only";

    /** The parameter name for "Decides how to deal with duplicate attribute roles". */
    public static final String PARAMETER_ROLE_CONFLICT_HANDLING = "role_conflict_handling";

    /** Display names of the role conflict modes; each index corresponds to the matching HANDLE_ROLE_CONFLICT_* constant. */
    public static final String[] HANDLE_ROLE_CONFLICT_MODES = {"error","keep new","keep original"};

    /** Role conflict mode: report an error. */
    public static final int HANDLE_ROLE_CONFLICT_ERROR = 0;

    /** Role conflict mode: keep the attribute delivered by the subprocess. */
    public static final int HANDLE_ROLE_CONFLICT_KEEP_NEW = 1;

    /** Role conflict mode: keep the unprocessed attribute of the input example set. */
    public static final int HANDLE_ROLE_CONFLICT_KEEP_ORIGINAL = 2;

    /** The parameter name for "Decides how to deal with duplicate attribute names". */
    public static final String PARAMETER_NAME_CONFLICT_HANDLING = "name_conflict_handling";

    /** Display names of the name conflict modes; each index corresponds to the matching HANDLE_NAME_CONFLICT_* constant. */
    public static final String[] HANDLE_NAME_CONFLICT_MODES = {"error","keep new","keep original"};

    /** Name conflict mode: report an error. */
    public static final int HANDLE_NAME_CONFLICT_ERROR = 0;

    /** Name conflict mode: keep the attribute delivered by the subprocess. */
    public static final int HANDLE_NAME_CONFLICT_KEEP_NEW = 1;

    /** Name conflict mode: keep the unprocessed attribute of the input example set. */
    public static final int HANDLE_NAME_CONFLICT_KEEP_ORIGINAL = 2;

    // NOTE: the declaration order of the ports below is the order in which they are created.
    private final InputPort exampleSetInput = getInputPorts().createPort("example set", ExampleSet.class);
    private final OutputPort innerExampleSetSource = getSubprocess(0).getInnerSources().createPort("exampleSet");
    private final InputPort innerExampleSetSink = getSubprocess(0).getInnerSinks().createPort("example set", ExampleSet.class);
    private final OutputPort exampleSetOutput = getOutputPorts().createPort("example set");
    private final PortPairExtender innerResultPorts = new PortPairExtender("through", getSubprocess(0).getInnerSinks(), getOutputPorts());
    private final AttributeSubsetSelector attributeSelector = new AttributeSubsetSelector(this, exampleSetInput);

    /**
     * Creates the operator and registers its meta data transformation rules: the selected
     * attribute subset is passed into the subprocess, the subprocess is transformed, and the
     * meta data arriving at the inner sink is merged with the unprocessed attributes before
     * it is delivered at the output port.
     *
     * @param description the operator description handed on to {@link OperatorChain}
     */
    public AttributeSubsetPreprocessing(OperatorDescription description) {
        super(description, "Subset Process");
        getTransformer().addRule(new AttributeSubsetPassThroughRule(exampleSetInput, innerExampleSetSource, this, false));
        getTransformer().addRule(new SubprocessTransformRule(getSubprocess(0)));
        getTransformer().addRule(new ExampleSetPassThroughRule(innerExampleSetSink, exampleSetOutput, SetRelation.UNKNOWN) {
            @Override
            public ExampleSetMetaData modifyExampleSet(ExampleSetMetaData inputMetaData) {
                return computeOutputMetaData(inputMetaData);
            }
        });
        getTransformer().addRule(innerResultPorts.makePassThroughRule());
        innerResultPorts.start();
    }

    /**
     * Computes the meta data delivered at the output port.
     *
     * @param innerMetaData the meta data handed to the pass-through rule
     * @return {@code innerMetaData} unchanged if only the subset is kept or if the operator
     *         input carries no example set meta data; otherwise a clone of the inner sink's
     *         meta data extended by the unprocessed attributes of the operator input
     */
    private ExampleSetMetaData computeOutputMetaData(ExampleSetMetaData innerMetaData) {
        if (getParameterAsBoolean(PARAMETER_KEEP_SUBSET_ONLY)) {
            return innerMetaData;
        }
        MetaData outerMetaData = exampleSetInput.getMetaData();
        if (!(outerMetaData instanceof ExampleSetMetaData)) {
            return innerMetaData;
        }
        ExampleSetMetaData inputMetaData = (ExampleSetMetaData) outerMetaData;
        ExampleSetMetaData subsetMetaData = attributeSelector.getMetaDataSubset(inputMetaData, false);

        // collect the attributes which were not handed to the subprocess
        List<AttributeMetaData> unusedAttributes = new LinkedList<AttributeMetaData>();
        for (AttributeMetaData amd : inputMetaData.getAllAttributes()) {
            if (subsetMetaData.containsAttributeName(amd.getName()) != MetaDataInfo.YES) {
                unusedAttributes.add(amd);
            }
        }

        if (!(innerExampleSetSink.getMetaData() instanceof ExampleSetMetaData)) {
            return inputMetaData;
        }
        ExampleSetMetaData resultMetaData = (ExampleSetMetaData) innerExampleSetSink.getMetaData().clone();
        int nameConflict = getConflictModeOrDefault(PARAMETER_NAME_CONFLICT_HANDLING, HANDLE_NAME_CONFLICT_ERROR);
        int roleConflict = getConflictModeOrDefault(PARAMETER_ROLE_CONFLICT_HANDLING, HANDLE_ROLE_CONFLICT_ERROR);
        for (AttributeMetaData unused : unusedAttributes) {
            mergeUnusedAttributeMetaData(inputMetaData, resultMetaData, unused, roleConflict, nameConflict);
        }
        return resultMetaData;
    }

    /**
     * Reads a conflict handling parameter for the meta data transformation.
     *
     * @param parameterKey the key of the category parameter
     * @param fallback the mode used when the parameter is undefined
     * @return the selected mode index, or {@code fallback} if the parameter is undefined
     */
    private int getConflictModeOrDefault(String parameterKey, int fallback) {
        try {
            return getParameterAsInt(parameterKey);
        } catch (UndefinedParameterError ignored) {
            // deliberate: an undefined parameter must not abort the meta data transformation
            return fallback;
        }
    }

    /**
     * Merges the meta data of one unprocessed attribute into the result meta data, reporting
     * role and name conflicts at the inner sink according to the given modes.
     *
     * @param inputMetaData the meta data of the operator input
     * @param resultMetaData the meta data being assembled for the output (modified in place)
     * @param unused the meta data of the unprocessed attribute
     * @param roleConflict one of the HANDLE_ROLE_CONFLICT_* constants
     * @param nameConflict one of the HANDLE_NAME_CONFLICT_* constants
     */
    private void mergeUnusedAttributeMetaData(ExampleSetMetaData inputMetaData, ExampleSetMetaData resultMetaData, AttributeMetaData unused, int roleConflict, int nameConflict) {
        String role = unused.getRole();
        String name = unused.getName();
        if (role != null && resultMetaData.getSpecial(role) != null) {
            resolveRoleConflictMetaData(inputMetaData, resultMetaData, unused, roleConflict);
        } else if (resultMetaData.getAttributeByName(name) != null) {
            if (role != null) {
                // a special original attribute collides by name with a result attribute
                reportMetaDataError("work_on_subset.role_and_name_conflict", name);
            } else {
                resolveNameConflictMetaData(resultMetaData, unused, nameConflict);
            }
        } else {
            // there is no conflict for the attribute
            resultMetaData.addAttribute(unused);
        }
    }

    /** Handles an unprocessed special attribute whose role is already taken in the result meta data. */
    private void resolveRoleConflictMetaData(ExampleSetMetaData inputMetaData, ExampleSetMetaData resultMetaData, AttributeMetaData unused, int roleConflict) {
        String role = unused.getRole();
        String name = unused.getName();
        AttributeMetaData roleHolder = resultMetaData.getSpecial(role);
        switch (roleConflict) {
            case HANDLE_ROLE_CONFLICT_ERROR:
                reportMetaDataError("work_on_subset.new_special_role_exist", role, roleHolder.getName());
                break;
            case HANDLE_ROLE_CONFLICT_KEEP_ORIGINAL:
                // drop the attribute currently holding the role ...
                resultMetaData.removeAttribute(roleHolder);
                // ... and insert the original one unless its name is taken by another result attribute
                if (resultMetaData.getAttributeByName(name) != null) {
                    reportMetaDataError("work_on_subset.role_and_name_conflict", name);
                } else {
                    resultMetaData.addAttribute(unused);
                }
                break;
            case HANDLE_ROLE_CONFLICT_KEEP_NEW:
                // keeping the new attribute is undefined if its name also belongs to a different input attribute
                String specialResultName = roleHolder.getName();
                if (!name.equals(specialResultName) && inputMetaData.getAttributeByName(specialResultName) != null) {
                    reportMetaDataError("work_on_subset.role_and_name_conflict", specialResultName);
                }
                break;
            default:
                // don't do anything, we keep the new one
                break;
        }
    }

    /** Handles an unprocessed regular attribute whose name is already taken in the result meta data. */
    private void resolveNameConflictMetaData(ExampleSetMetaData resultMetaData, AttributeMetaData unused, int nameConflict) {
        String name = unused.getName();
        AttributeMetaData nameHolder = resultMetaData.getAttributeByName(name);
        switch (nameConflict) {
            case HANDLE_NAME_CONFLICT_ERROR:
                reportMetaDataError("work_on_subset.new_attribute_exist", name);
                break;
            case HANDLE_NAME_CONFLICT_KEEP_ORIGINAL:
                if (nameHolder.isSpecial()) {
                    // replacing a special attribute by a regular one is not defined
                    reportMetaDataError("work_on_subset.role_and_name_conflict", name);
                } else {
                    resultMetaData.removeAttribute(nameHolder);
                    resultMetaData.addAttribute(unused);
                }
                break;
            case HANDLE_NAME_CONFLICT_KEEP_NEW:
                // error if the kept attribute is special, since it could still be removed by a later role conflict
                if (nameHolder.isSpecial()) {
                    reportMetaDataError("work_on_subset.role_and_name_conflict", name);
                }
                break;
            default:
                // don't do anything, we keep the new one
                break;
        }
    }

    /** Adds a meta data error with severity ERROR to the inner example set sink. */
    private void reportMetaDataError(String i18nKey, Object... arguments) {
        innerExampleSetSink.addError(new SimpleMetaDataError(Severity.ERROR, innerExampleSetSink, i18nKey, arguments));
    }

    /**
     * Applies the subprocess to the selected attribute subset of the input example set and,
     * unless only the subset should be kept, merges the unprocessed attributes back into the
     * result before delivering it.
     *
     * @throws OperatorException if the subprocess fails, if the number of examples was changed
     *         although the unprocessed attributes should be kept, or if a role/name conflict
     *         cannot be resolved with the configured conflict handling
     */
    @Override
    public void doWork() throws OperatorException {
        ExampleSet inputSet = exampleSetInput.getData(ExampleSet.class);
        ExampleSet workingExampleSet = (ExampleSet) inputSet.clone();
        Set<Attribute> selectedAttributes = attributeSelector.getAttributeSubset(workingExampleSet, false);

        // remove (and remember) everything that was not selected
        List<Attribute> unusedAttributes = new LinkedList<Attribute>();
        Iterator<Attribute> iterator = workingExampleSet.getAttributes().allAttributes();
        while (iterator.hasNext()) {
            Attribute attribute = iterator.next();
            if (!selectedAttributes.contains(attribute)) {
                unusedAttributes.add(attribute);
                iterator.remove();
            }
        }

        // converting special to normal
        workingExampleSet = new NonSpecialAttributesExampleSet(workingExampleSet);

        // perform inner operators
        innerExampleSetSource.deliver(workingExampleSet);
        getSubprocess(0).execute();

        // retrieve transformed example set
        ExampleSet resultSet = innerExampleSetSink.getData(ExampleSet.class);

        // add old attributes if desired
        if (!getParameterAsBoolean(PARAMETER_KEEP_SUBSET_ONLY)) {
            if (resultSet.size() != inputSet.size()) {
                throw new UserError(this, 127, "changing the size of the example set is not allowed if the non-processed attributes should be kept.");
            }
            boolean identicalExampleTables = resultSet.getExampleTable().equals(inputSet.getExampleTable());
            int roleConflictHandling = getParameterAsInt(PARAMETER_ROLE_CONFLICT_HANDLING);
            int nameConflictHandling = getParameterAsInt(PARAMETER_NAME_CONFLICT_HANDLING);
            mergeSets(resultSet, inputSet, unusedAttributes, identicalExampleTables, roleConflictHandling, nameConflictHandling);
        }

        // add all other results if desired
        innerResultPorts.passDataThrough();

        // deliver example set
        exampleSetOutput.deliver(resultSet);
    }

    /**
     * Adds the unprocessed attributes of the input example set to the result example set and
     * resolves role and name conflicts according to the given modes.
     *
     * @param resultSet the example set which will be returned (modified in place)
     * @param inputSet the original example set
     * @param unusedAttributes the attributes of the original example set which were not processed
     * @param identicalExampleTables whether both example sets are backed by the same example table;
     *        if not, the values of every added attribute are copied into the result's table
     * @param roleConflictHandling one of the HANDLE_ROLE_CONFLICT_* constants
     * @param nameConflictHandling one of the HANDLE_NAME_CONFLICT_* constants
     * @throws UserError if a conflict occurs which the selected mode does not resolve
     */
    private void mergeSets(ExampleSet resultSet, ExampleSet inputSet, List<Attribute> unusedAttributes, boolean identicalExampleTables, int roleConflictHandling, int nameConflictHandling) throws UserError {
        if (!identicalExampleTables) {
            logWarning("Underlying example table has changed: data copy into new table is necessary in order to keep non-processed attributes.");
        }
        for (Attribute attribute : unusedAttributes) {
            AttributeRole role = inputSet.getAttributes().getRole(attribute);
            String attributeName = attribute.getName();
            // only a special attribute can collide with a role that is already taken in the result
            Attribute roleHolder = role.isSpecial() ? resultSet.getAttributes().getSpecial(role.getSpecialName()) : null;

            if (roleHolder != null) {
                switch (roleConflictHandling) {
                    case HANDLE_ROLE_CONFLICT_ERROR:
                        throw new UserError(this, "attribute_subset_preprocessing.role_conflict", role.getSpecialName(), roleHolder.getName());
                    case HANDLE_ROLE_CONFLICT_KEEP_ORIGINAL:
                        // drop the attribute currently holding the contested role so the original can take it over
                        resultSet.getAttributes().remove(roleHolder);
                        // the original cannot be inserted if its name is taken by another result attribute
                        if (resultSet.getAttributes().get(attributeName) != null) {
                            throw roleNameConflict(attributeName);
                        }
                        addOriginalAttribute(resultSet, inputSet, attribute, role, identicalExampleTables);
                        break;
                    case HANDLE_ROLE_CONFLICT_KEEP_NEW:
                        // keeping the new attribute is undefined if its name also belongs to a different input attribute
                        String specialResultName = roleHolder.getName();
                        if (inputSet.getAttributes().get(specialResultName) != null && !attributeName.equals(specialResultName)) {
                            throw roleNameConflict(attributeName);
                        }
                        break;
                    default:
                        // don't do anything, we keep the new one
                        break;
                }
            } else if (resultSet.getAttributes().get(attributeName) != null) {
                // name clash with an attribute delivered by the subprocess
                if (role.isSpecial()) {
                    throw roleNameConflict(attributeName);
                }
                // we have a regular attribute
                AttributeRole nameHolder = resultSet.getAttributes().getRole(attributeName);
                switch (nameConflictHandling) {
                    case HANDLE_NAME_CONFLICT_ERROR:
                        throw new UserError(this, "attribute_subset_preprocessing.name_conflict", attributeName);
                    case HANDLE_NAME_CONFLICT_KEEP_ORIGINAL:
                        // replacing a special attribute by a regular one is not defined
                        if (nameHolder.isSpecial()) {
                            throw roleNameConflict(attributeName);
                        }
                        resultSet.getAttributes().remove(nameHolder.getAttribute());
                        addOriginalAttribute(resultSet, inputSet, attribute, role, identicalExampleTables);
                        break;
                    case HANDLE_NAME_CONFLICT_KEEP_NEW:
                        if (nameHolder.isSpecial()) {
                            throw roleNameConflict(attributeName);
                        }
                        break;
                    default:
                        // don't do anything, we keep the new one
                        break;
                }
            } else {
                // there is no conflict of name or role
                addOriginalAttribute(resultSet, inputSet, attribute, role, identicalExampleTables);
            }
        }
    }

    /** Creates the user error raised for combined role and name conflicts. */
    private UserError roleNameConflict(String attributeName) {
        return new UserError(this, "attribute_subset_preprocessing.role_name_conflict", attributeName);
    }

    /**
     * Adds one unprocessed attribute of the input example set to the result example set.
     * With identical example tables the given role is added directly; otherwise the attribute
     * is cloned, registered in the result's example table, and its values are copied example
     * by example.
     *
     * @param resultSet the example set receiving the attribute
     * @param inputSet the original example set the values are read from
     * @param attribute the unprocessed attribute
     * @param role the role of the attribute in the input example set
     * @param identicalExampleTables whether both example sets share the same example table
     */
    private void addOriginalAttribute(ExampleSet resultSet, ExampleSet inputSet, Attribute attribute, AttributeRole role, boolean identicalExampleTables) {
        if (identicalExampleTables) {
            resultSet.getAttributes().add(role);
            return;
        }
        // create and add copy of the unused attribute from the input set
        Attribute copiedAttribute = (Attribute) attribute.clone();
        resultSet.getExampleTable().addAttribute(copiedAttribute);
        AttributeRole copiedRole = new AttributeRole(copiedAttribute);
        if (role.isSpecial()) {
            copiedRole.setSpecial(role.getSpecialName());
        }
        resultSet.getAttributes().add(copiedRole);

        // copy data for the new attribute; the caller has verified that both sets have the same size
        Iterator<Example> sourceExamples = inputSet.iterator();
        Iterator<Example> targetExamples = resultSet.iterator();
        while (sourceExamples.hasNext()) {
            Example sourceExample = sourceExamples.next();
            Example targetExample = targetExamples.next();
            targetExample.setValue(copiedAttribute, sourceExample.getValue(attribute));
        }
    }

    /**
     * Returns the parameters of the super class and of the attribute subset selector, followed
     * by the two conflict handling categories and the two boolean switches of this operator.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.addAll(attributeSelector.getParameterTypes());

        ParameterType nameConflictType = new ParameterTypeCategory(PARAMETER_NAME_CONFLICT_HANDLING, "Decides how to deal with duplicate attribute names", HANDLE_NAME_CONFLICT_MODES, 0);
        nameConflictType.setExpert(false);
        types.add(nameConflictType);

        ParameterType roleConflictType = new ParameterTypeCategory(PARAMETER_ROLE_CONFLICT_HANDLING, "Decides how to deal with duplicate attribute roles", HANDLE_ROLE_CONFLICT_MODES, 0);
        roleConflictType.setExpert(false);
        types.add(roleConflictType);

        types.add(new ParameterTypeBoolean(PARAMETER_KEEP_SUBSET_ONLY, "Indicates if the attributes which did not match the regular expression should be removed by this operator.", false));
        types.add(new ParameterTypeBoolean(PARAMETER_DELIVER_INNER_RESULTS, "Indicates if the additional results (other than example set) of the inner operator should also be returned.", false));
        return types;
    }

    /** Delegates to {@link OperatorResourceConsumptionHandler} using the first input port and the attribute selector. */
    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPorts().getPortByIndex(0), AttributeSubsetPreprocessing.class, attributeSelector);
    }
}