/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Statistics;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorVersion;
import com.rapidminer.operator.ProcessSetupError.Severity;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.MDInteger;
import com.rapidminer.operator.ports.metadata.SimpleMetaDataError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.ParameterCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
/**
* Replaces missing values in examples. If a value is missing, it is replaced by the result of one of
* the functions "minimum", "maximum", "average", "zero", "value", and "none", where "minimum",
* "maximum", and "average" are applied to the non-missing attribute values of the example set.
* "none" means that the value is not replaced. The function can be selected using the parameter list
* <code>columns</code>. If an attribute's name appears in this list as a key, the value is used as
* the function name. If the attribute's name is not in the list, the function specified by the
* <code>default</code> parameter is used. For nominal attributes the mode is used for the average,
* i.e. the nominal value which occurs most often in the data. The functions "minimum", "maximum",
* and "zero" are not supported for nominal attributes, and "average" is not supported for date/time
* attributes; in these cases a warning is logged and the missing values of the attribute are left
* untouched. The replenishment "value" indicates that the user-defined parameter
* <code>replenishment_value</code> should be used for the replacement.
*
* @author Ingo Mierswa, Simon Fischer, Marius Helf
*/
public class MissingValueReplenishment extends ValueReplenishment {

    /**
     * The parameter name for "This value is used for some of the replenishment types."
     */
    public static final String PARAMETER_REPLENISHMENT_VALUE = "replenishment_value";

    // Function indices; each one is the position of its name in REPLENISHMENT_NAMES.
    private static final int NONE = 0;
    private static final int MINIMUM = 1;
    private static final int MAXIMUM = 2;
    private static final int AVERAGE = 3;
    private static final int ZERO = 4;
    private static final int VALUE = 5;

    private static final String[] REPLENISHMENT_NAMES = { "none", "minimum", "maximum", "average", "zero", "value" };

    // Patterns used to parse the user-defined replenishment value for date/time attributes.
    private static final String DATE_FORMAT = "MM/dd/yyyy";
    private static final String TIME_FORMAT = "hh.mm a";
    private static final String DATE_TIME_FORMAT = "MM/dd/yyyy hh.mm a";

    /**
     * Up to and including this compatibility level, averages and user-defined values for integer
     * attributes are neither rounded nor checked for being integral.
     */
    public static final OperatorVersion VERSION_BEFORE_ROUND_ON_INTEGER_ATTRIBUTES = new OperatorVersion(5, 2, 0);

    public MissingValueReplenishment(OperatorDescription description) {
        super(description);
    }

    /**
     * Appends {@link #VERSION_BEFORE_ROUND_ON_INTEGER_ATTRIBUTES} to the incompatible version changes
     * reported by the super class.
     *
     * @see com.rapidminer.operator.Operator#getIncompatibleVersionChanges()
     */
    @Override
    public OperatorVersion[] getIncompatibleVersionChanges() {
        OperatorVersion[] inherited = super.getIncompatibleVersionChanges();
        OperatorVersion[] extended = new OperatorVersion[inherited.length + 1];
        System.arraycopy(inherited, 0, extended, 0, inherited.length);
        extended[inherited.length] = VERSION_BEFORE_ROUND_ON_INTEGER_ATTRIBUTES;
        return extended;
    }

    /**
     * Tells whether the given replenishment function can be applied to attributes of the given value
     * type.
     *
     * @param replenishment
     *            the function index (one of the constants {@code NONE} ... {@code VALUE})
     * @param valueType
     *            the value type of the attribute as defined in {@link Ontology}
     * @return {@code false} for minimum, maximum and zero on nominal attributes and for average on
     *         date/time attributes, {@code true} otherwise
     */
    private static boolean doesReplenishmentSupportValueType(int replenishment, int valueType) {
        if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.NOMINAL)) {
            // don't support MINIMUM, MAXIMUM, ZERO for NOMINAL attributes
            switch (replenishment) {
                case MINIMUM:
                case MAXIMUM:
                case ZERO:
                    return false;
                default:
                    return true;
            }
        }
        if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.DATE_TIME)) {
            // don't support AVERAGE for DATE_TIME attributes
            return replenishment != AVERAGE;
        }
        return true;
    }

    /**
     * Adds a warning to the example set input port listing all selected attributes whose value type
     * is not supported by the default replenishment function.
     *
     * @param subsetMetaData
     *            the meta data of the selected attribute subset
     */
    @Override
    protected void checkSelectedSubsetMetaData(ExampleSetMetaData subsetMetaData) {
        super.checkSelectedSubsetMetaData(subsetMetaData);
        int replenishment;
        try {
            replenishment = getParameterAsInt(PARAMETER_DEFAULT);
        } catch (UndefinedParameterError e) {
            // should never happen
            return;
        }

        Set<AttributeMetaData> unsupportedAttributes = new HashSet<AttributeMetaData>();
        for (AttributeMetaData amd : subsetMetaData.getAllAttributes()) {
            if (!doesReplenishmentSupportValueType(replenishment, amd.getValueType())) {
                unsupportedAttributes.add(amd);
            }
        }
        if (unsupportedAttributes.isEmpty()) {
            return;
        }

        // build a comma separated list of the quoted attribute names
        StringBuilder names = new StringBuilder();
        for (AttributeMetaData amd : unsupportedAttributes) {
            if (names.length() > 0) {
                names.append(", ");
            }
            names.append("\"").append(amd.getName()).append("\"");
        }
        getExampleSetInputPort().addError(
                new SimpleMetaDataError(Severity.WARNING, getExampleSetInputPort(),
                        "missing_value_replenishment.value_type_not_supported_by_replenishment",
                        REPLENISHMENT_NAMES[replenishment], names.toString()));
    }

    /**
     * Sets the number of missing values of the attribute to zero if the default replenishment
     * function supports the attribute's value type.
     * <p>
     * NOTE(review): only the default function is consulted here; per-column functions and the "none"
     * function are not taken into account - confirm whether this is intended.
     */
    @Override
    protected Collection<AttributeMetaData> modifyAttributeMetaData(ExampleSetMetaData emd, AttributeMetaData amd)
            throws UndefinedParameterError {
        if (doesReplenishmentSupportValueType(getParameterAsInt(PARAMETER_DEFAULT), amd.getValueType())) {
            amd.setNumberOfMissingValues(new MDInteger(0));
        }
        return Collections.singletonList(amd);
    }

    @Override
    protected int[] getFilterValueTypes() {
        return new int[] { Ontology.VALUE_TYPE };
    }

    /** Returns the names of the available replenishment functions, indexed by function index. */
    @Override
    public String[] getFunctionNames() {
        return REPLENISHMENT_NAMES;
    }

    @Override
    public int getDefaultFunction() {
        return AVERAGE;
    }

    @Override
    public int getDefaultColumnFunction() {
        return AVERAGE;
    }

    /** Returns {@link Double#NaN}; presumably the value the super class treats as "to be replaced". */
    @Override
    public double getReplacedValue() {
        return Double.NaN;
    }

    /**
     * Computes the value that is used to replace missing values of the given attribute.
     *
     * @param functionIndex
     *            the index of the replenishment function
     * @param exampleSet
     *            the example set whose statistics are used for minimum, maximum and average
     * @param attribute
     *            the attribute whose missing values should be replaced
     * @return the replacement value, or {@link Double#NaN} if the function is "none" or does not
     *         support the value type of the attribute
     * @throws UserError
     *             if the function is "value" and the parameter
     *             {@link #PARAMETER_REPLENISHMENT_VALUE} is not set or cannot be converted to the
     *             value type of the attribute
     */
    @Override
    public double getReplenishmentValue(int functionIndex, ExampleSet exampleSet, Attribute attribute) throws UserError {
        if (!doesReplenishmentSupportValueType(functionIndex, attribute.getValueType())) {
            logWarning("function \"" + REPLENISHMENT_NAMES[functionIndex] + "\" does not support attribute \""
                    + attribute.getName() + "\" of type \""
                    + Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(attribute.getValueType())
                    + "\". Ignoring missing values of this attribute.");
            return Double.NaN;
        }

        // no need to check for incompatible valueTypes/functions, since we already did that above
        switch (functionIndex) {
            case NONE:
                return Double.NaN;
            case MINIMUM:
                return exampleSet.getStatistics(attribute, Statistics.MINIMUM);
            case MAXIMUM:
                return exampleSet.getStatistics(attribute, Statistics.MAXIMUM);
            case AVERAGE:
                if (attribute.isNominal()) {
                    // the "average" of a nominal attribute is its mode
                    return exampleSet.getStatistics(attribute, Statistics.MODE);
                }
                return getProperlyRoundedValue(attribute, exampleSet.getStatistics(attribute, Statistics.AVERAGE));
            case ZERO:
                return 0.0d;
            case VALUE:
                String valueString = getParameterAsString(PARAMETER_REPLENISHMENT_VALUE);
                if (valueString == null) {
                    // the parameter is optional; fail with a proper user error instead of a
                    // NullPointerException from the parsers below
                    throw new UndefinedParameterError(PARAMETER_REPLENISHMENT_VALUE, this);
                }
                return parseReplenishmentValue(attribute, valueString);
            default:
                throw new RuntimeException("Illegal value functionIndex: " + functionIndex);
        }
    }

    /**
     * Converts the user-defined replenishment value into the internal double representation for the
     * given attribute: milliseconds for date/time attributes, the mapping index for nominal
     * attributes and the parsed number for numerical attributes.
     *
     * @param attribute
     *            the attribute the value is intended for
     * @param valueString
     *            the non-null user-defined value
     * @return the internal representation of the value
     * @throws UserError
     *             if the value cannot be parsed as date/time or number, or if it is not integral for
     *             an integer attribute (only above compatibility level
     *             {@link #VERSION_BEFORE_ROUND_ON_INTEGER_ATTRIBUTES})
     */
    private double parseReplenishmentValue(Attribute attribute, String valueString) throws UserError {
        int valueType = attribute.getValueType();

        if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.DATE_TIME)) {
            String formatString;
            if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.DATE)) {
                formatString = DATE_FORMAT;
            } else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.TIME)) {
                formatString = TIME_FORMAT;
            } else {
                formatString = DATE_TIME_FORMAT;
            }
            // SimpleDateFormat is not thread-safe, hence a fresh instance per call
            SimpleDateFormat dateFormat = new SimpleDateFormat(formatString, Locale.US);
            try {
                Date date = dateFormat.parse(valueString);
                return date.getTime();
            } catch (ParseException e) {
                throw new UserError(this, e, 218, PARAMETER_REPLENISHMENT_VALUE, valueString);
            }
        }

        if (attribute.isNominal()) {
            return attribute.getMapping().mapString(valueString);
        }

        // any numerical type
        double value;
        try {
            value = Double.parseDouble(valueString);
        } catch (NumberFormatException e) {
            throw new UserError(this, e, 211, PARAMETER_REPLENISHMENT_VALUE, valueString);
        }
        boolean mustBeIntegral = Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType, Ontology.INTEGER)
                && !getCompatibilityLevel().isAtMost(VERSION_BEFORE_ROUND_ON_INTEGER_ATTRIBUTES);
        if (mustBeIntegral && value != Math.round(value)) {
            throw new UserError(this, 225, PARAMETER_REPLENISHMENT_VALUE, valueString);
        }
        return value;
    }

    /**
     * Rounds the given value to the nearest integer if the attribute is an integer attribute and the
     * compatibility level is above {@link #VERSION_BEFORE_ROUND_ON_INTEGER_ATTRIBUTES}.
     *
     * @param attribute
     *            the attribute the value is intended for
     * @param value
     *            the value to round
     * @return the rounded value for integer attributes, the unchanged value otherwise
     */
    private double getProperlyRoundedValue(Attribute attribute, double value) {
        if (getCompatibilityLevel().isAtMost(VERSION_BEFORE_ROUND_ON_INTEGER_ATTRIBUTES)) {
            return value;
        }
        if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.INTEGER)) {
            return Math.round(value);
        }
        return value;
    }

    /**
     * Extends the parameter types of the super class by {@link #PARAMETER_REPLENISHMENT_VALUE}, which
     * depends on the function "value" being selected either as default or for any column.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        ParameterTypeString type = new ParameterTypeString(PARAMETER_REPLENISHMENT_VALUE,
                "This value is used for some of the replenishment types.", true, false);
        type.registerDependencyCondition(new ParameterCondition(this, PARAMETER_DEFAULT, true) {

            @Override
            public boolean isConditionFullfilled() {
                // check if any of the options is set to value
                try {
                    if (getParameterAsInt(PARAMETER_DEFAULT) == VALUE) {
                        return true;
                    }
                    List<String[]> pairs = getParameterList(PARAMETER_COLUMNS);
                    if (pairs != null) {
                        String valueName = REPLENISHMENT_NAMES[VALUE];
                        String valueIndex = String.valueOf(VALUE);
                        for (String[] pair : pairs) {
                            if (pair == null || pair.length < 2) {
                                // malformed entry, cannot select the "value" function
                                continue;
                            }
                            // the function may be stored by name or by index
                            if (valueName.equals(pair[1]) || valueIndex.equals(pair[1])) {
                                return true;
                            }
                        }
                    }
                } catch (UndefinedParameterError ignored) {
                    // an undefined parameter cannot select "value"; treat the condition as not fulfilled
                }
                return false;
            }
        });
        types.add(type);
        return types;
    }

    @Override
    public boolean writesIntoExistingData() {
        // the model takes care of materialization
        return false;
    }

    @Override
    public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
        return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(),
                MissingValueReplenishment.class, attributeSelector);
    }
}