/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.filter; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.tools.Ontology; /** * This operator creates new attributes from nominal attributes where the new attributes contain only of * substrings of the original values. Please note that the counting starts with 1 and that the first * and the last character will be included in the resulting substring. For example, the value is * "RapidMiner" and the first index is set to 6 and the last index is set to 9 the result will be * "Mine". If the last index is larger than the length of the word, the resulting substrings * will end with the last character. * * @author Ingo Mierswa * @version $Id: AttributeValueSubstring.java,v 1.2 2008/07/07 07:06:40 ingomierswa Exp $ */ public class AttributeValueSubstring extends Operator { public static final String PARAMETER_ATTRIBUTES = "attributes"; public static final String PARAMETER_APPLY_TO_SPECIAL_FEATURES = "apply_to_special_features"; public static final String PARAMETER_FIRST = "first"; public static final String PARAMETER_LAST = "last"; public AttributeValueSubstring(OperatorDescription description) { super(description); } public IOObject[] apply() throws OperatorException { ExampleSet exampleSet = getInput(ExampleSet.class); String regex = getParameterAsString(PARAMETER_ATTRIBUTES); int firstIndex = getParameterAsInt(PARAMETER_FIRST); int lastIndex = getParameterAsInt(PARAMETER_LAST); Pattern pattern = null; try { pattern = Pattern.compile(regex); } catch (PatternSyntaxException e) { throw new UserError(this, 206, regex, e.getMessage()); } Iterator<Attribute> i = exampleSet.getAttributes().iterator(); if (getParameterAsBoolean(PARAMETER_APPLY_TO_SPECIAL_FEATURES)) { i = exampleSet.getAttributes().allAttributes(); } List<Attribute> matchingAttributes = new LinkedList<Attribute>(); while (i.hasNext()) { Attribute attribute = i.next(); Matcher matcher = pattern.matcher(attribute.getName()); if (matcher.matches()) { if (attribute.isNominal()) { matchingAttributes.add(attribute); } else { logWarning("Cannot create substring for non-nominal attribute '" + attribute.getName() + "', skipping..."); } } checkForStop(); } for (Attribute attribute : matchingAttributes) { Attribute newAttribute = createSubstringAttribute(exampleSet, attribute, firstIndex, lastIndex); AttributeRole role = exampleSet.getAttributes().getRole(attribute); exampleSet.getAttributes().remove(attribute); if (role.isSpecial()) { String specialName = role.getSpecialName(); exampleSet.getAttributes().setSpecialAttribute(newAttribute, specialName); } } return new IOObject[] { exampleSet }; } private Attribute createSubstringAttribute(ExampleSet exampleSet, Attribute originalAttribute, int firstIndex, int lastIndex) { Attribute newAttribute = AttributeFactory.createAttribute(originalAttribute.getName(), Ontology.NOMINAL); exampleSet.getExampleTable().addAttribute(newAttribute); exampleSet.getAttributes().addRegular(newAttribute); for (Example example : exampleSet) { String value = example.getNominalValue(originalAttribute); int actualFirst = firstIndex - 1; int actualLast = lastIndex; if (lastIndex > value.length()) { actualLast = value.length(); } if (lastIndex <= firstIndex) { example.setValue(newAttribute, Double.NaN); } else { String newValue = value.substring(actualFirst, actualLast); if (newValue.length() == 0) { example.setValue(newAttribute, Double.NaN); } else { example.setValue(newAttribute, newValue); } } } return newAttribute; } public Class<?>[] getInputClasses() { return new Class[] { ExampleSet.class }; } public Class<?>[] getOutputClasses() { return new Class[] { ExampleSet.class }; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeString(PARAMETER_ATTRIBUTES, "Substring creation of values will be applied to the attributes that match the given regular expression.", false); types.add(type); types.add(new ParameterTypeBoolean(PARAMETER_APPLY_TO_SPECIAL_FEATURES, "Filter also special attributes (label, id...)", false)); type = new ParameterTypeInt(PARAMETER_FIRST, "The index of the first character of the substring which should be kept (counting starts with 1, 0: start with beginning of value).", 1, Integer.MAX_VALUE, 1); type.setExpert(false); types.add(type); type = new ParameterTypeInt(PARAMETER_LAST, "The index of the last character of the substring which should be kept (counting starts with 1, 0: end with end of value).", 1, Integer.MAX_VALUE, Integer.MAX_VALUE); type.setExpert(false); types.add(type); return types; } }