/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter;
import java.util.Arrays;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeTypeException;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.AbstractExampleSetProcessing;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorVersion;
import com.rapidminer.operator.ProcessSetupError.Severity;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MDInteger;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SimpleMetaDataError;
import com.rapidminer.operator.tools.AttributeSubsetSelector;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeExpression;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.EqualTypeCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
import com.rapidminer.tools.expression.ExampleResolver;
import com.rapidminer.tools.expression.Expression;
import com.rapidminer.tools.expression.ExpressionException;
import com.rapidminer.tools.expression.ExpressionParser;
import com.rapidminer.tools.expression.ExpressionType;
import com.rapidminer.tools.expression.internal.ExpressionParserUtils;
/**
 * Allows the declaration of a missing value (nominal or numeric) on a selected subset. The given
 * value will be converted to Double.NaN, so subsequent operators will treat it as a missing value.
 * <p>
 * The operator works in one of three modes selected via {@link #PARAMETER_MODE}:
 * <ul>
 * <li><em>numeric</em>: every value equal to {@link #PARAMETER_MISSING_VALUE_NUMERIC} is replaced,</li>
 * <li><em>nominal</em>: every value whose nominal representation equals
 * {@link #PARAMETER_MISSING_VALUE_NOMINAL} is replaced,</li>
 * <li><em>expression</em>: for every example on which the boolean expression
 * {@link #PARAMETER_MISSING_VALUE_EXPRESSION} evaluates to true, all selected attributes are
 * replaced.</li>
 * </ul>
 *
 * @author Marco Boeck, Marius Helf
 */
public class DeclareMissingValueOperator extends AbstractExampleSetProcessing {

	/** parameter to set the missing value for numeric type */
	public static final String PARAMETER_MISSING_VALUE_NUMERIC = "numeric_value";

	/** parameter to set the missing value for nominal type */
	public static final String PARAMETER_MISSING_VALUE_NOMINAL = "nominal_value";

	/** parameter to set the expression */
	public static final String PARAMETER_MISSING_VALUE_EXPRESSION = "expression_value";

	/** parameter to set the missing value type (numeric, nominal or expression) */
	public static final String PARAMETER_MODE = "mode";

	/**
	 * Up to and including this compatibility level the operator does not check the attribute type
	 * before comparing values; above it, attributes that do not fit the selected mode are skipped.
	 */
	public static final OperatorVersion VERSION_IGNORE_ATTRIBUTES_OF_WRONG_TYPE = new OperatorVersion(5, 2, 8);

	/** Subset Selector for parameter use */
	private AttributeSubsetSelector subsetSelector = new AttributeSubsetSelector(this, getExampleSetInputPort());

	/** value of {@link #PARAMETER_MODE} selecting the numeric mode */
	private static final String NUMERIC = "numeric";

	/** value of {@link #PARAMETER_MODE} selecting the nominal mode */
	private static final String NOMINAL = "nominal";

	/** value of {@link #PARAMETER_MODE} selecting the expression mode */
	private static final String EXPRESSION = "expression";

	/**
	 * value types to choose from in {@link #PARAMETER_MODE}; the positions in this array are
	 * referenced by index in the dependency conditions built in {@link #getParameterTypes()}
	 */
	private static final String[] VALUE_TYPES = new String[] { NUMERIC, NOMINAL, EXPRESSION };

	/**
	 * Incompatible version, old version writes into the exampleset, if original output port is not
	 * connected.
	 */
	private static final OperatorVersion VERSION_MAY_WRITE_INTO_DATA = new OperatorVersion(7, 1, 1);

	public DeclareMissingValueOperator(OperatorDescription description) {
		super(description);
	}

	/**
	 * Marks the number of missing values of every selected attribute as increased by an unknown
	 * amount and reports a meta data error on the input port if the selection is empty or contains
	 * no attribute that fits the selected mode.
	 *
	 * @param metaData
	 *            the incoming meta data; it is modified in place and returned
	 * @return the given {@code metaData} instance
	 * @throws UndefinedParameterError
	 *             if the mode parameter cannot be read
	 */
	@Override
	protected MetaData modifyMetaData(ExampleSetMetaData metaData) throws UndefinedParameterError {
		// nothing can be declared missing as long as none of the value parameters is set; the
		// expression parameter is part of this check so that expression mode updates the meta data too
		boolean anyValueParameterSet = isParameterSet(PARAMETER_MISSING_VALUE_NOMINAL)
				|| isParameterSet(PARAMETER_MISSING_VALUE_NUMERIC)
				|| isParameterSet(PARAMETER_MISSING_VALUE_EXPRESSION);
		if (!anyValueParameterSet) {
			return metaData;
		}

		ExampleSetMetaData subset = subsetSelector.getMetaDataSubset(metaData, false);
		if (subset == null) {
			return metaData;
		}

		String mode = getParameterAsString(PARAMETER_MODE);
		boolean compatibleAttributeFound = false;
		for (AttributeMetaData amd : subset.getAllAttributes()) {
			// the change is applied to the attribute of the incoming meta data, looked up by name
			AttributeMetaData originalAMD = metaData.getAttributeByName(amd.getName());
			MDInteger missingValueNumber = originalAMD.getNumberOfMissingValues();
			missingValueNumber.increaseByUnknownAmount();

			if (isValueTypeCompatibleWithMode(mode, amd.getValueType())) {
				compatibleAttributeFound = true;
			}
		}

		if (!compatibleAttributeFound) {
			if (subset.getAllAttributes().isEmpty()) {
				getInputPort().addError(
						new SimpleMetaDataError(Severity.ERROR, getInputPort(), "attribute_selection_empty"));
			} else if (mode.equals(NUMERIC)) {
				getInputPort().addError(new SimpleMetaDataError(Severity.ERROR, getInputPort(),
						"exampleset.must_contain_numerical_attribute"));
			} else if (mode.equals(NOMINAL)) {
				getInputPort().addError(new SimpleMetaDataError(Severity.ERROR, getInputPort(),
						"exampleset.must_contain_nominal_attribute"));
			}
		}
		return metaData;
	}

	/**
	 * Checks whether an attribute of the given value type is handled in the given mode.
	 *
	 * @param mode
	 *            one of the entries of {@link #VALUE_TYPES}
	 * @param valueType
	 *            the {@link Ontology} value type constant of the attribute
	 * @return {@code true} if the value type fits the mode; always {@code true} in expression
	 *         mode, {@code false} for an unknown mode
	 */
	private static boolean isValueTypeCompatibleWithMode(String mode, int valueType) {
		if (mode.equals(NUMERIC)) {
			switch (valueType) {
				case Ontology.NUMERICAL:
				case Ontology.INTEGER:
				case Ontology.REAL:
					return true;
				default:
					return false;
			}
		}
		if (mode.equals(NOMINAL)) {
			switch (valueType) {
				case Ontology.NOMINAL:
				case Ontology.STRING:
				case Ontology.BINOMINAL:
				case Ontology.POLYNOMINAL:
				case Ontology.FILE_PATH:
					return true;
				default:
					// same date/time test as used in apply(), so that both methods agree on
					// which attributes are accepted in nominal mode
					return Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.DATE_TIME);
			}
		}
		// expression can be on all types so always true
		return mode.equals(EXPRESSION);
	}

	/**
	 * Replaces the declared missing value by {@link Double#NaN} in the selected attributes of the
	 * given example set.
	 *
	 * @param exampleSet
	 *            the example set to work on; its values are changed in place
	 * @return the given {@code exampleSet} instance
	 * @throws OperatorException
	 *             if the expression cannot be parsed or evaluated, if a nominal value cannot be
	 *             read from a selected attribute, or if a parameter or stop check fails
	 */
	@Override
	public ExampleSet apply(ExampleSet exampleSet) throws OperatorException {
		ExampleSet subset = subsetSelector.getSubset(exampleSet, false);
		Attributes attributes = subset.getAttributes();
		String mode = getParameterAsString(PARAMETER_MODE);

		if (mode.equals(EXPRESSION)) {
			declareMissingByExpression(exampleSet, attributes);
			// the value based replacement below has nothing to do in expression mode
			return exampleSet;
		}

		// Up to VERSION_IGNORE_ATTRIBUTES_OF_WRONG_TYPE the attribute type is not checked, i.e.
		// every selected attribute is processed; newer versions skip attributes of the wrong type.
		boolean legacyProcessAllAttributes = getCompatibilityLevel()
				.isAtMost(VERSION_IGNORE_ATTRIBUTES_OF_WRONG_TYPE);

		if (mode.equals(NUMERIC)) {
			double missingValueNumeric = getParameterAsDouble(PARAMETER_MISSING_VALUE_NUMERIC);
			for (Attribute attribute : attributes) {
				if (!legacyProcessAllAttributes && !attribute.isNumerical()) {
					continue;
				}
				for (Example example : subset) {
					checkForStop();
					if (example.getValue(attribute) == missingValueNumeric) {
						example.setValue(attribute, Double.NaN);
					}
				}
			}
		} else if (mode.equals(NOMINAL)) {
			String nominalString = getParameterAsString(PARAMETER_MISSING_VALUE_NOMINAL);
			if (nominalString == null) {
				// an unset parameter declares the empty string as missing
				nominalString = "";
			}
			for (Attribute attribute : attributes) {
				boolean compatible = attribute.isNominal() || attribute.getValueType() == Ontology.FILE_PATH
						|| Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME);
				if (!legacyProcessAllAttributes && !compatible) {
					continue;
				}
				for (Example example : subset) {
					checkForStop();
					try {
						if (example.getNominalValue(attribute).equals(nominalString)) {
							example.setValue(attribute, Double.NaN);
						}
					} catch (AttributeTypeException e) {
						throw new UserError(this, 119, attribute.getName(), this.getName());
					}
				}
			}
		}
		return exampleSet;
	}

	/**
	 * Handles the expression mode: evaluates the expression parameter on every example and sets
	 * all given attributes to missing for each example on which it yields true.
	 *
	 * @param exampleSet
	 *            the example set whose examples are evaluated and changed
	 * @param attributes
	 *            the selected attributes which are set to missing
	 * @throws OperatorException
	 *             if the expression cannot be parsed or evaluated, or if the stop check fails
	 */
	private void declareMissingByExpression(ExampleSet exampleSet, Attributes attributes) throws OperatorException {
		ExampleResolver resolver = new ExampleResolver(exampleSet);
		ExpressionParser expParser = ExpressionParserUtils.createAllModulesParser(this, resolver);
		String expression = getParameterAsString(PARAMETER_MISSING_VALUE_EXPRESSION);

		Expression parsedExpression;
		try {
			parsedExpression = expParser.parse(expression);
		} catch (ExpressionException e) {
			throw ExpressionParserUtils.convertToUserError(this, expression, e);
		}

		// NOTE(review): an expression that is not of boolean type is silently ignored here, as in
		// the previous implementation - confirm whether this should rather be reported to the user
		if (parsedExpression.getExpressionType() != ExpressionType.BOOLEAN) {
			return;
		}

		int exampleCounter = 0;
		for (Example example : exampleSet) {
			// assign values to the variables
			resolver.bind(example);
			try {
				if (++exampleCounter % 1000 == 0) {
					checkForStop();
				}
				Boolean matches;
				try {
					matches = parsedExpression.evaluateBoolean();
				} catch (ExpressionException e) {
					throw ExpressionParserUtils.convertToUserError(this, expression, e);
				}
				// null-safe check: a null result must not fail on unboxing and is treated like
				// false, i.e. the example is left untouched
				if (Boolean.TRUE.equals(matches)) {
					for (Attribute attribute : attributes) {
						example.setValue(attribute, Double.NaN);
					}
				}
			} finally {
				// avoid memory leak
				resolver.unbind();
			}
		}
	}

	/**
	 * Builds the parameter list: the parameters of the super class, the attribute subset
	 * selection, the mode and one value parameter per mode. Each value parameter is bound to its
	 * mode by a dependency condition on the index of the mode in {@link #VALUE_TYPES}.
	 *
	 * @return the parameter types of this operator
	 */
	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> parameters = super.getParameterTypes();
		parameters.addAll(subsetSelector.getParameterTypes());

		ParameterType modeType = new ParameterTypeCategory(PARAMETER_MODE, "Select the value type of the missing value",
				VALUE_TYPES, 0);
		modeType.setExpert(false);
		parameters.add(modeType);

		// index 0 of VALUE_TYPES: numeric
		ParameterType numericType = new ParameterTypeDouble(PARAMETER_MISSING_VALUE_NUMERIC,
				"This parameter defines the missing numerical value", Double.NEGATIVE_INFINITY,
				Double.POSITIVE_INFINITY, true);
		numericType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_MODE, VALUE_TYPES, true, 0));
		numericType.setExpert(false);
		parameters.add(numericType);

		// index 1 of VALUE_TYPES: nominal
		ParameterType nominalType = new ParameterTypeString(PARAMETER_MISSING_VALUE_NOMINAL,
				"This parameter defines the missing nominal value", true, false);
		nominalType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_MODE, VALUE_TYPES, false, 1));
		nominalType.setExpert(false);
		parameters.add(nominalType);

		// index 2 of VALUE_TYPES: expression
		ParameterType expressionType = new ParameterTypeExpression(PARAMETER_MISSING_VALUE_EXPRESSION,
				"This parameter defines the expression which if true equals the missing value", getInputPort(), true,
				false);
		expressionType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_MODE, VALUE_TYPES, true, 2));
		expressionType.setExpert(false);
		parameters.add(expressionType);

		return parameters;
	}

	/**
	 * @return {@code true} for compatibility levels above {@link #VERSION_MAY_WRITE_INTO_DATA};
	 *         for older levels {@code true} only if the original output port is connected
	 */
	@Override
	public boolean writesIntoExistingData() {
		return getCompatibilityLevel().isAbove(VERSION_MAY_WRITE_INTO_DATA) || isOriginalOutputConnected();
	}

	@Override
	public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
		return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(),
				DeclareMissingValueOperator.class, null);
	}

	/**
	 * Appends the two version changes of this operator to those of the super class and lets
	 * {@link ExpressionParserUtils} add the expression parser change.
	 *
	 * @return the incompatible version changes of this operator
	 */
	@Override
	public OperatorVersion[] getIncompatibleVersionChanges() {
		OperatorVersion[] inherited = super.getIncompatibleVersionChanges();
		OperatorVersion[] extended = Arrays.copyOf(inherited, inherited.length + 2);
		extended[inherited.length] = VERSION_IGNORE_ATTRIBUTES_OF_WRONG_TYPE;
		extended[inherited.length + 1] = VERSION_MAY_WRITE_INTO_DATA;
		return ExpressionParserUtils.addIncompatibleExpressionParserChange(extended);
	}
}