/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.NominalMapping;
import com.rapidminer.example.table.ViewAttribute;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorProgress;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.preprocessing.PreprocessingModel;
import com.rapidminer.studio.internal.ProcessStoppedRuntimeException;
import com.rapidminer.tools.Tools;
/**
* Replaces strings by interpreting the second example set as a dictionary. Each dictionary entry
* pairs a search string (or regular expression) with the text that replaces it.
*
* @author Simon Fischer
*/
public class Dictionary extends PreprocessingModel {

    private static final long serialVersionUID = 1441613108993813785L;

    /** Progress is only pushed to the operator after every this many processed attributes. */
    private static final int OPERATOR_PROGRESS_STEPS = 100;

    /** Replacement pairs: index 0 is the search string (or regular expression), index 1 the replacement. */
    private List<String[]> replacements;

    /** Names of the attributes whose nominal values are rewritten. */
    private String[] affectedAttributeNames;

    /** The example set this model was created with. */
    private ExampleSet exampleSet;

    /**
     * Per-mapping lookup from old value index to new value index, used by
     * {@link #getValue(Attribute, double)} when a replacement collides with a value that already
     * exists in the mapping. Filled lazily by {@link #remap(Attributes)}.
     */
    private transient Map<NominalMapping, Map<Double, Double>> viewReplacements;

    /** If set, the search strings are interpreted as regular expressions. */
    private boolean regexp = false;

    /** If set, values are converted to lower case before any replacement is applied. */
    private boolean toLowerCase = false;

    /** If set, processing of a value stops after the first dictionary entry that matched. */
    private boolean stopAfterFirstMatch = false;

    /**
     * Creates a new dictionary model.
     *
     * @param exampleSet
     *            the example set passed to the super constructor and returned by
     *            {@link #getExampleSet()}
     * @param attributesAffected
     *            the attributes to rewrite; only their names are stored
     * @param replacements
     *            the replacement pairs (search string at index 0, replacement at index 1)
     * @param regexp
     *            whether search strings are regular expressions
     * @param toLowerCase
     *            whether values are lower-cased before replacing
     * @param stopAfterFirstMatch
     *            whether to stop after the first matching dictionary entry
     */
    public Dictionary(ExampleSet exampleSet, Set<Attribute> attributesAffected, List<String[]> replacements,
            boolean regexp, boolean toLowerCase, boolean stopAfterFirstMatch) {
        super(exampleSet);
        this.exampleSet = exampleSet;
        this.stopAfterFirstMatch = stopAfterFirstMatch;
        this.regexp = regexp;
        this.replacements = replacements;
        this.toLowerCase = toLowerCase;
        affectedAttributeNames = new String[attributesAffected.size()];
        int i = 0;
        for (Attribute attribute : attributesAffected) {
            affectedAttributeNames[i] = attribute.getName();
            i++;
        }
    }

    /**
     * Fetches the operator progress (if progress display is enabled and an operator with a progress
     * object is available) and sets its total to the number of affected attributes.
     *
     * @return the progress to report to, or {@code null} if no progress should be reported
     */
    private OperatorProgress prepareAttributeProgress() throws ProcessStoppedException {
        if (getShowProgress() && getOperator() != null && getOperator().getProgress() != null) {
            OperatorProgress progress = getOperator().getProgress();
            progress.setTotal(affectedAttributeNames.length);
            return progress;
        }
        return null;
    }

    /**
     * Remaps the attributes in the exampleSet or writes into the exampleSet if a remapping is not
     * possible.
     * <p>
     * Affected attributes that are missing from the example set or are not nominal are skipped.
     */
    private void remap(ExampleSet exampleSet) throws ProcessStoppedException {
        Attributes attributes = exampleSet.getAttributes();
        OperatorProgress progress = prepareAttributeProgress();
        int progressCounter = 0;
        for (String attributeName : affectedAttributeNames) {
            Attribute attr = attributes.get(attributeName);
            // the attribute may be absent from the data this model is applied to
            if (attr != null && attr.isNominal()) {
                NominalMapping mapping = attr.getMapping();
                List<String> mappingValues = mapping.getValues();
                for (String string : mappingValues) {
                    String replacement = replace(string);
                    // Nothing to replace
                    if (replacement.equals(string)) {
                        continue;
                    }
                    int oldRepr = mapping.getIndex(string);
                    if (mappingValues.contains(replacement)) {
                        // Replacement already present in mapping -> Replace in example set
                        int newRepr = mapping.getIndex(replacement);
                        for (Example example : exampleSet) {
                            double oldValue = example.getValue(attr);
                            if (Tools.isEqual(oldRepr, oldValue)) {
                                example.setValue(attr, newRepr);
                            }
                        }
                    } else {
                        // Replacement not present in mapping -> Replace in mapping
                        mapping.setMapping(replacement, oldRepr);
                    }
                }
            }
            if (progress != null && ++progressCounter % OPERATOR_PROGRESS_STEPS == 0) {
                progress.setCompleted(progressCounter);
            }
        }
    }

    /**
     * Gives the attributes a new mapping if a remapping is possible or stores the value change in
     * {@link #viewReplacements} and changes to {@link ViewAttribute}.
     * <p>
     * Affected attributes that are missing from {@code attributes} or are not nominal are skipped.
     */
    private void remap(Attributes attributes) throws ProcessStoppedException {
        OperatorProgress progress = prepareAttributeProgress();
        int progressCounter = 0;
        for (String attributeName : affectedAttributeNames) {
            Attribute attr = attributes.get(attributeName);
            // the attribute may be absent from the data this model is applied to
            if (attr != null && attr.isNominal()) {
                NominalMapping mapping = attr.getMapping();
                // NOTE(review): this list is taken from the mapping before it is cloned below;
                // whether it reflects later changes to the clone depends on NominalMapping#clone.
                List<String> mappingValues = mapping.getValues();
                boolean cloned = false;
                boolean putReplacement = false;
                for (String string : mappingValues) {
                    String replacement = replace(string);
                    // Nothing to replace
                    if (replacement.equals(string)) {
                        continue;
                    }
                    if (!cloned) {
                        // mapping is not cloned on attribute clone so we have to clone it before we
                        // change anything
                        attr.setMapping((NominalMapping) mapping.clone());
                        // get mapping from attributes again because it might be cloned when setting
                        mapping = attr.getMapping();
                        cloned = true;
                    }
                    int oldRepr = mapping.getIndex(string);
                    if (mappingValues.contains(replacement)) {
                        // Replacement already present in mapping -> Replace value on the fly
                        if (viewReplacements == null) {
                            viewReplacements = new HashMap<>();
                        }
                        Map<Double, Double> viewReplacer = viewReplacements.get(mapping);
                        if (viewReplacer == null) {
                            viewReplacer = new HashMap<>();
                            viewReplacements.put(mapping, viewReplacer);
                            putReplacement = true;
                        }
                        int newRepr = mapping.getIndex(replacement);
                        viewReplacer.put((double) oldRepr, (double) newRepr);
                    } else {
                        // Replacement not present in mapping -> Replace in mapping
                        mapping.setMapping(replacement, oldRepr);
                    }
                }
                if (putReplacement) {
                    // add view attribute to allow replacing on the fly via viewReplacements
                    attributes.replace(attr,
                            new ViewAttribute(this, attr, attr.getName(), attr.getValueType(), mapping));
                }
            }
            if (progress != null && ++progressCounter % OPERATOR_PROGRESS_STEPS == 0) {
                progress.setCompleted(progressCounter);
            }
        }
    }

    /**
     * Applies the dictionary to a single value.
     * <p>
     * The value is lower-cased first if {@link #isToLowerCase()} is set. The dictionary entries are
     * then applied in list order, each one working on the result of the previous one. If
     * {@link #shouldStopAfterFirstMatch()} is set, the result is returned as soon as one entry
     * matched.
     *
     * @param string
     *            the value to rewrite
     * @return the rewritten value; equal to the (possibly lower-cased) input if nothing matched
     */
    private String replace(String string) {
        if (toLowerCase) {
            // NOTE(review): uses the default locale, as before; results may differ between locales
            string = string.toLowerCase();
        }
        for (String[] replacement : replacements) {
            String search = replacement[0];
            String substitute = replacement[1];
            if (regexp) {
                if (stopAfterFirstMatch) {
                    // NOTE(review): String#matches requires the whole value to match the
                    // expression, whereas the branch below also replaces partial matches.
                    if (string.matches(search)) {
                        return string.replaceAll(search, substitute);
                    }
                } else {
                    string = string.replaceAll(search, substitute);
                }
            } else {
                // An empty search string is found at index 0 of every string and never consumes
                // any input, so searching for it repeatedly would never terminate. Skip it.
                if (search.isEmpty()) {
                    continue;
                }
                if (string.contains(search)) {
                    // literal, left-to-right, non-overlapping replacement of all occurrences
                    string = string.replace(search, substitute);
                    if (stopAfterFirstMatch) {
                        return string;
                    }
                }
            }
        }
        return string;
    }

    /**
     * Applies the dictionary directly to the given example set and returns that same example set.
     */
    @Override
    public ExampleSet applyOnData(ExampleSet exampleSet) throws OperatorException {
        remap(exampleSet);
        return exampleSet;
    }

    /**
     * Returns a clone of the parent's attributes with the dictionary applied to the mappings of
     * the affected attributes.
     *
     * @throws ProcessStoppedRuntimeException
     *             if remapping was interrupted by a {@link ProcessStoppedException}
     */
    @Override
    public Attributes getTargetAttributes(ExampleSet viewParent) {
        Attributes attributes = (Attributes) viewParent.getAttributes().clone();
        try {
            remap(attributes);
        } catch (ProcessStoppedException e) {
            throw new ProcessStoppedRuntimeException();
        }
        return attributes;
    }

    /**
     * Translates a value on the fly: if {@link #viewReplacements} holds a replacement for the
     * given value under the target attribute's mapping, that replacement is returned; otherwise
     * the value is returned unchanged.
     */
    @Override
    public double getValue(Attribute targetAttribute, double value) {
        if (viewReplacements == null) {
            return value;
        }
        Map<Double, Double> replacers = viewReplacements.get(targetAttribute.getMapping());
        if (replacers == null) {
            return value;
        }
        Double replacement = replacers.get(value);
        return replacement != null ? replacement : value;
    }

    /**
     * Lists all dictionary entries, one per line, in the form {@code search -> replacement}.
     */
    @Override
    public String toString() {
        StringBuilder b = new StringBuilder();
        for (String[] replacement : replacements) {
            b.append(replacement[0]);
            b.append(" -> ");
            b.append(replacement[1]);
            b.append("\n");
        }
        return b.toString();
    }

    /** Returns the replacement pairs (the internal list, not a copy). */
    public List<String[]> getReplacements() {
        return replacements;
    }

    /** Returns the names of the affected attributes (the internal array, not a copy). */
    public String[] getAffectedAttributeNames() {
        return affectedAttributeNames;
    }

    /** Returns whether search strings are interpreted as regular expressions. */
    public boolean isRegexp() {
        return regexp;
    }

    /** Returns whether values are lower-cased before replacing. */
    public boolean isToLowerCase() {
        return toLowerCase;
    }

    /** Returns whether processing of a value stops after the first matching dictionary entry. */
    public boolean shouldStopAfterFirstMatch() {
        return stopAfterFirstMatch;
    }

    /** Returns the example set this model was created with. */
    public ExampleSet getExampleSet() {
        return exampleSet;
    }

    /** Always returns {@code false}. */
    @Override
    protected boolean needsRemapping() {
        return false;
    }

    /** Always returns {@code true}; {@link #applyOnData(ExampleSet)} modifies the given example set. */
    @Override
    protected boolean writesIntoExistingData() {
        return true;
    }
}