/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.filter; import java.util.List; import java.util.Set; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.NominalMapping; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.preprocessing.PreprocessingModel; /** * Replaces strings by interpreting the second example set as a dictionary. The * * @author Simon Fischer */ public class Dictionary extends PreprocessingModel { private static final long serialVersionUID = 1441613108993813785L; private List<String[]> replacements; private String[] affectedAttributeNames; private boolean regexp = false; private boolean toLowerCase = false; private boolean stopAfterFirstMatch = false; public Dictionary(ExampleSet exampleSet, Set<Attribute> attributesAffected, List<String[]> replacements, boolean regexp, boolean toLowerCase, boolean stopAfterFistMatch) { super(exampleSet); this.stopAfterFirstMatch = stopAfterFistMatch; this.regexp = regexp; this.replacements = replacements; this.toLowerCase = toLowerCase; affectedAttributeNames = new String[attributesAffected.size()]; int i = 0; for (Attribute attribute: attributesAffected) { affectedAttributeNames[i] = attribute.getName(); i++; } } private void remap(Attributes attributes) { for (String attributeName: affectedAttributeNames) { Attribute attr = attributes.get(attributeName); if (attr.isNominal()) { NominalMapping mapping = attr.getMapping(); for (String string : mapping.getValues()) { int index = mapping.getIndex(string); mapping.setMapping(replace(string), index); } } } } private String replace(String string) { if (toLowerCase) { string = string.toLowerCase(); } for (String[] replacement : replacements) { if (regexp) { if (stopAfterFirstMatch) { if (string.matches(replacement[0])) { String newString = string.replaceAll(replacement[0], replacement[1]); return newString; } } else { string = string.replaceAll(replacement[0], replacement[1]); } } else { boolean foundMatch = false; StringBuilder soFar = new StringBuilder(""); String remainder = string; while (true) { int pos = remainder.indexOf(replacement[0]); if (pos == -1) { break; } foundMatch = true; soFar.append(remainder.substring(0, pos)); soFar.append(replacement[1]); remainder = remainder.substring(pos + replacement[0].length()); } soFar.append(remainder); string = soFar.toString(); if (foundMatch && stopAfterFirstMatch) { return string; } } } return string; } @Override public ExampleSet applyOnData(ExampleSet exampleSet) throws OperatorException { remap(exampleSet.getAttributes()); return exampleSet; } public Attributes getTargetAttributes(ExampleSet viewParent) { Attributes attributes = (Attributes) viewParent.getAttributes().clone(); remap(attributes); return attributes; } public double getValue(Attribute targetAttribute, double value) { return value; } @Override public String toString() { StringBuilder b = new StringBuilder(); for (String[] replacement : replacements) { b.append(replacement[0]); b.append(" -> "); b.append(replacement[1]); b.append("\n"); } return b.toString(); } }