/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.set.NonSpecialAttributesExampleSet; import com.rapidminer.operator.OperatorChain; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.InputPort; import com.rapidminer.operator.ports.OutputPort; import com.rapidminer.operator.ports.PortPairExtender; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.AttributeSubsetPassThroughRule; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetPassThroughRule; import com.rapidminer.operator.ports.metadata.MetaData; import com.rapidminer.operator.ports.metadata.MetaDataInfo; import com.rapidminer.operator.ports.metadata.SetRelation; import com.rapidminer.operator.ports.metadata.SubprocessTransformRule; import com.rapidminer.operator.tools.AttributeSubsetSelector; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.tools.OperatorResourceConsumptionHandler; /** * <p>This operator can be used to select one attribute (or a subset) by defining a * regular expression for the attribute name and applies its inner operators to * the resulting subset. Please note that this operator will also use special * attributes which makes it necessary for all preprocessing steps which should * be performed on special attributes (and are normally not performed on special * attributes).</p> * * <p>This operator is also able to deliver the additional results of the inner * operator if desired.</p> * * <p>Afterwards, the remaining original attributes are added * to the resulting example set if the parameter "keep_subset_only" is set to * false (default).</p> * * <p>Please note that this operator is very powerful and can be used to create * new preprocessing schemes by combining it with other preprocessing operators. * However, there are two major restrictions (among some others): first, since the inner result * will be combined with the rest of the input example set, the number of * examples (data points) is not allowed to be changed inside of the subset preprocessing. * Second, attribute role changes will not be delivered to the outside since internally all special * attributes will be changed to regular for the inner operators and role changes can afterwards * not be delivered.</p> * * @author Ingo Mierswa, Shevek */ public class AttributeSubsetPreprocessing extends OperatorChain { /** The parameter name for "Indicates if the additional results (other than example set) of the inner operator should also be returned." */ public static final String PARAMETER_DELIVER_INNER_RESULTS = "deliver_inner_results"; /** The parameter name for "Indicates if the attributes which did not match the regular expression should be removed by this operator." */ public static final String PARAMETER_KEEP_SUBSET_ONLY = "keep_subset_only"; private final InputPort exampleSetInput = getInputPorts().createPort("example set", ExampleSet.class); private final OutputPort innerExampleSetSource = getSubprocess(0).getInnerSources().createPort("exampleSet"); private final InputPort innerExampleSetSink = getSubprocess(0).getInnerSinks().createPort("example set", ExampleSet.class); private final OutputPort exampleSetOutput = getOutputPorts().createPort("example set"); private final PortPairExtender innerResultPorts = new PortPairExtender("through", getSubprocess(0).getInnerSinks(), getOutputPorts()); private final AttributeSubsetSelector attributeSelector = new AttributeSubsetSelector(this, exampleSetInput); public AttributeSubsetPreprocessing(OperatorDescription description) { super(description, "Subset Process"); getTransformer().addRule(new AttributeSubsetPassThroughRule(exampleSetInput, innerExampleSetSource, this, false)); getTransformer().addRule(new SubprocessTransformRule(getSubprocess(0))); getTransformer().addRule(new ExampleSetPassThroughRule(innerExampleSetSink, exampleSetOutput, SetRelation.UNKNOWN) { @Override public ExampleSetMetaData modifyExampleSet(ExampleSetMetaData inputMetaData) { if (getParameterAsBoolean(PARAMETER_KEEP_SUBSET_ONLY)) { return inputMetaData; } else { MetaData metaData = exampleSetInput.getMetaData(); if (metaData instanceof ExampleSetMetaData) { inputMetaData = (ExampleSetMetaData) metaData; ExampleSetMetaData subsetAmd = attributeSelector.getMetaDataSubset(inputMetaData, false); // storing unused attributes List<AttributeMetaData> unusedAttributes = new LinkedList<AttributeMetaData>(); Iterator<AttributeMetaData> iterator = inputMetaData.getAllAttributes().iterator(); while (iterator.hasNext()) { AttributeMetaData amd = iterator.next(); if (!(subsetAmd.containsAttributeName(amd.getName()) == MetaDataInfo.YES)) { unusedAttributes.add(amd); } } // retrieving result if (innerExampleSetSink.getMetaData() instanceof ExampleSetMetaData) { ExampleSetMetaData resultMetaData = (ExampleSetMetaData) innerExampleSetSink.getMetaData().clone(); // merge result with unusedAttributes: restore special types Iterator<AttributeMetaData> r = resultMetaData.getAllAttributes().iterator(); while (r.hasNext()) { AttributeMetaData newMetaData = r.next(); AttributeMetaData oldMetaData = inputMetaData.getAttributeByName(newMetaData.getName()); if (oldMetaData != null) { if (oldMetaData.isSpecial()) { String specialName = oldMetaData.getRole(); newMetaData.setRole(specialName); } } } // add unused attributes again resultMetaData.addAllAttributes(unusedAttributes); return resultMetaData; } } return inputMetaData; } } }); getTransformer().addRule(innerResultPorts.makePassThroughRule()); innerResultPorts.start(); } @Override public void doWork() throws OperatorException { ExampleSet inputSet = exampleSetInput.getData(); ExampleSet workingExampleSet = (ExampleSet)inputSet.clone(); Set<Attribute> selectedAttributes = attributeSelector.getAttributeSubset(workingExampleSet, false); List<Attribute> unusedAttributes = new LinkedList<Attribute>(); Iterator<Attribute> iterator = workingExampleSet.getAttributes().allAttributes(); while (iterator.hasNext()) { Attribute attribute = iterator.next(); if (!selectedAttributes.contains(attribute)) { unusedAttributes.add(attribute); iterator.remove(); } } // converting special to normal workingExampleSet = new NonSpecialAttributesExampleSet(workingExampleSet); // perform inner operators innerExampleSetSource.deliver(workingExampleSet); getSubprocess(0).execute(); // retrieve transformed example set ExampleSet resultSet = innerExampleSetSink.getData(); // transform special attributes back Iterator<AttributeRole> r = resultSet.getAttributes().allAttributeRoles(); while (r.hasNext()) { AttributeRole newRole = r.next(); AttributeRole oldRole = inputSet.getAttributes().getRole(newRole.getAttribute().getName()); if (oldRole != null) { if (oldRole.isSpecial()) { String specialName = oldRole.getSpecialName(); newRole.setSpecial(specialName); } } } // add old attributes if desired if (!getParameterAsBoolean(PARAMETER_KEEP_SUBSET_ONLY)) { if (resultSet.size() != inputSet.size()) { throw new UserError(this, 127, "changing the size of the example set is not allowed if the non-processed attributes should be kept."); } if (resultSet.getExampleTable().equals(inputSet.getExampleTable())) { for (Attribute attribute : unusedAttributes) { AttributeRole role = inputSet.getAttributes().getRole(attribute); resultSet.getAttributes().add(role); } } else { logWarning("Underlying example table has changed: data copy into new table is necessary in order to keep non-processed attributes."); for (Attribute oldAttribute : unusedAttributes) { AttributeRole oldRole = inputSet.getAttributes().getRole(oldAttribute); // create and add copy of attribute Attribute newAttribute = (Attribute)oldAttribute.clone(); resultSet.getExampleTable().addAttribute(newAttribute); AttributeRole newRole = new AttributeRole(newAttribute); if (oldRole.isSpecial()) newRole.setSpecial(oldRole.getSpecialName()); resultSet.getAttributes().add(newRole); // copy data for the new attribute Iterator<Example> oldIterator = inputSet.iterator(); Iterator<Example> newIterator = resultSet.iterator(); while (oldIterator.hasNext()) { Example oldExample = oldIterator.next(); Example newExample = newIterator.next(); newExample.setValue(newAttribute, oldExample.getValue(oldAttribute)); } } } } // add all other results if desired innerResultPorts.passDataThrough(); // deliver example set exampleSetOutput.deliver(resultSet); } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.addAll(attributeSelector.getParameterTypes()); types.add(new ParameterTypeBoolean(PARAMETER_KEEP_SUBSET_ONLY, "Indicates if the attributes which did not match the regular expression should be removed by this operator.", false)); types.add(new ParameterTypeBoolean(PARAMETER_DELIVER_INNER_RESULTS, "Indicates if the additional results (other than example set) of the inner operator should also be returned.", false)); return types; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPorts().getPortByIndex(0), AttributeSubsetPreprocessing.class, attributeSelector); } }