/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeWeights;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Statistics;
import com.rapidminer.example.set.Condition;
import com.rapidminer.example.set.ConditionCreationException;
import com.rapidminer.example.set.ConditionedExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.condition.CombinedInnerOperatorCondition;
import com.rapidminer.operator.condition.InnerOperatorCondition;
import com.rapidminer.operator.condition.SpecificInnerOperatorCondition;
import com.rapidminer.operator.features.weighting.InfoGainWeighting;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.OperatorService;
import com.rapidminer.tools.RandomGenerator;
/**
* The operator MissingValueImpution imputes missing values by learning models
* for each attribute (except the label) and applying those models to the data
* set. The learner which is to be applied has to be given as inner operator.
* In order to specify a subset of the example set in which the missing values
* should be imputed (e.g. to limit the imputation to only numerical attributes) an
* arbitrary filter can be used as the first inner operator. In the case that such
* a filter is used, the learner has to be the second inner operator.
*
* Please be aware that depending on the ability of the inner operator to handle
* missing values this operator might not be able to impute all missing values in
* some cases. This behaviour leads to a warning. It might hence be useful to combine
* this operator with a subsequent MissingValueReplenishment.
*
* ATTENTION: This operator is currently under development and does not properly
* work in all cases. We do not recommend the usage of this operator in production
* systems.
*
* @author Tobias Malbrecht
* @version $Id: MissingValueImputation.java,v 1.12 2008/07/07 07:06:40 ingomierswa Exp $
*/
public class MissingValueImputation extends OperatorChain {
/** The parameter name for "Order of attributes in which missing values are estimated." */
public static final String PARAMETER_ORDER = "order";
/** The parameter name for "Sort direction which is used in order strategy." */
public static final String PARAMETER_SORT = "sort";
/** The parameter name for "Impute missing values immediately after having learned the corresponding concept and iterate." */
public static final String PARAMETER_ITERATE = "iterate";
/** The parameter name for "Apply filter to learning set in addition to determination which missing values should be substituted." */
public static final String PARAMETER_FILTER_LEARNING_SET = "filter_learning_set";
/** The parameter name for "Learn concepts to impute missing values only on the basis of complete cases (should be used in case learning approach can not handle missing values)." */
public static final String PARAMETER_LEARN_ON_COMPLETE_CASES = "learn_on_complete_cases";
/** The parameter name for "Use the given random seed instead of global random numbers (-1: use global)." */
public static final String PARAMETER_LOCAL_RANDOM_SEED = "local_random_seed";
/** Chronological imputation order. */
private static final int CHRONOLOGICAL = 0;
/** Random imputation order. */
private static final int RANDOM = 1;
/** Imputation order based on the number of missing values. */
private static final int NUMBER_OF_MISSING_VALUES = 2;
/** Imputation order based on the information gain of the attributes. */
private static final int INFORMATION_GAIN = 3;
/** Order strategies names. */
private static final String[] orderStrategies = { "chronological", "random", "number of missing values", "information gain" };
/** Ascending sort order. */
private static final int ASCENDING = 0;
/** Sort strategies names. */
private static final String[] sortStrategies = { "ascending", "descending" };
public MissingValueImputation(OperatorDescription description) {
super(description);
}
/** Returns the minimum number of inner operators. */
public int getMinNumberOfInnerOperators() {
return 1;
}
/** Returns the maximum number of inner operators. */
public int getMaxNumberOfInnerOperators() {
return 2;
}
public InnerOperatorCondition getInnerOperatorCondition() {
if (getNumberOfOperators() == 1) {
return new SpecificInnerOperatorCondition("Learner", 0, new Class[] { ExampleSet.class }, new Class[] { Model.class });
} else {
CombinedInnerOperatorCondition condition = new CombinedInnerOperatorCondition();
condition.addCondition(new SpecificInnerOperatorCondition("Filter", 0, new Class[] { ExampleSet.class }, new Class[] { ExampleSet.class }));
condition.addCondition(new SpecificInnerOperatorCondition("Learner", 1, new Class[] { ExampleSet.class }, new Class[] { Model.class }));
return condition;
}
}
public Attribute[] getOrderedAttributes(ExampleSet exampleSet, int order, boolean ascending) throws OperatorException {
Attribute[] sortedAttributes = new Attribute[exampleSet.getAttributes().size()];
AttributeWeights weights = new AttributeWeights(exampleSet);
switch (order) {
case CHRONOLOGICAL:
int index = 0;
for (Attribute attribute : exampleSet.getAttributes()) {
weights.setWeight(attribute.getName(), index);
index++;
}
break;
case RANDOM:
RandomGenerator randomGenerator = RandomGenerator.getRandomGenerator(getParameterAsInt(PARAMETER_LOCAL_RANDOM_SEED));
for (Attribute attribute : exampleSet.getAttributes()) {
weights.setWeight(attribute.getName(), randomGenerator.nextDouble());
}
break;
case NUMBER_OF_MISSING_VALUES:
exampleSet.recalculateAllAttributeStatistics();
for (Attribute attribute : exampleSet.getAttributes()) {
weights.setWeight(attribute.getName(), exampleSet.getStatistics(attribute, Statistics.UNKNOWN));
}
break;
case INFORMATION_GAIN:
if (exampleSet.getAttributes().getLabel() == null) {
throw new UserError(this, 105);
}
Operator infoGainWeightingOperator;
try {
infoGainWeightingOperator = OperatorService.createOperator(InfoGainWeighting.class);
} catch (OperatorCreationException e) {
throw new OperatorException("Cannot create info gain weighting operator which is necessary for ordering the attributes.");
}
weights = infoGainWeightingOperator.apply(new IOContainer(new IOObject[] { exampleSet })).get(AttributeWeights.class);
break;
}
String[] attributeNames = new String[weights.size()];
weights.getAttributeNames().toArray(attributeNames);
int sortingOrder = (ascending ? AttributeWeights.DECREASING : AttributeWeights.INCREASING);
weights.sortByWeight(attributeNames, sortingOrder, AttributeWeights.ABSOLUTE_WEIGHTS);
for (int i = 0; i < attributeNames.length; i++) {
sortedAttributes[i] = exampleSet.getAttributes().get(attributeNames[i]);
}
return sortedAttributes;
}
public IOObject[] apply() throws OperatorException {
boolean iterate = getParameterAsBoolean(PARAMETER_ITERATE);
int order = getParameterAsInt(PARAMETER_ORDER);
boolean ascending = (getParameterAsInt(PARAMETER_SORT) == ASCENDING);
boolean filterLearningSet = getParameterAsBoolean(PARAMETER_FILTER_LEARNING_SET);
boolean learnOnCompleteCases = getParameterAsBoolean(PARAMETER_LEARN_ON_COMPLETE_CASES);
// retrieve inner operators
Operator learner = null;
Operator filter = null;
if (getNumberOfOperators() == 1) {
learner = getOperator(0);
} else {
filter = getOperator(0);
learner = getOperator(1);
}
ExampleSet exampleSet = getInput(ExampleSet.class);
// delete original label which should not be learned from
Attribute label = exampleSet.getAttributes().getLabel();
if (label != null) {
exampleSet.getAttributes().setLabel(null);
exampleSet.getAttributes().remove(label);
}
ExampleSet imputationSet = (ExampleSet) exampleSet.clone();
// filter example set in which missing values should be substituted
if (filter != null) {
imputationSet = filter.apply(new IOContainer(new IOObject[] { (ExampleSet) exampleSet.clone() })).get(ExampleSet.class);
}
int numberOfAttributes = imputationSet.getAttributes().size();
Attribute[][] attributePairs = new Attribute[2][numberOfAttributes];
imputationSet.getAttributes().setLabel(label);
attributePairs[0] = getOrderedAttributes(imputationSet, order, ascending);
imputationSet.getAttributes().setLabel(null);
int imputationFailure = 0;
ExampleSet workingSet = null;
for (int i = 0; i < numberOfAttributes; i++) {
// use either filtered set or original (full) set
if (filterLearningSet) {
workingSet = (ExampleSet) imputationSet.clone();
} else {
workingSet = (ExampleSet) exampleSet.clone();
}
Attribute attribute = attributePairs[0][i];
workingSet.getAttributes().setLabel(attribute);
// sort out examples with missing labels
Condition condition = null;
try {
condition = ConditionedExampleSet.createCondition("no_missing_labels", workingSet, "");
} catch (ConditionCreationException e) {
throw new UserError(this, 904, "no_missing_lables", e.getMessage());
}
ExampleSet learningSet = new ConditionedExampleSet(workingSet, condition);
// if desired sort out cases with missing attribute values
if (learnOnCompleteCases) {
try {
condition = ConditionedExampleSet.createCondition("no_missing_attributes", learningSet, "");
} catch (ConditionCreationException e) {
throw new UserError(this, 904, "no_missing_attributes", e.getMessage());
}
learningSet = new ConditionedExampleSet(learningSet, condition);
}
log("Learning imputation model for attribute " + attribute.getName() + " on " + learningSet.size() + " examples.");
// learn
Model model = learner.apply(new IOContainer(new IOObject[] { learningSet })).get(Model.class);
// re-add current attribute
workingSet = model.apply(workingSet);
workingSet.getAttributes().setLabel(null);
workingSet.getAttributes().addRegular(attribute);
attributePairs[1][i] = workingSet.getAttributes().getPredictedLabel();
// if strategy is iterative immediately impute missing values
// after learning step
if (iterate) {
log("Imputating missing values in attribute " + attribute.getName() + ".");
for (Example example : workingSet) {
double value = example.getValue(attribute);
if (Double.isNaN(value)) {
example.setValue(attribute, example.getPredictedLabel());
if (Double.isNaN(example.getPredictedLabel())) {
imputationFailure++;
}
}
}
}
if (imputationFailure > 0) {
logWarning("Unable to impute " + imputationFailure + " missing values in attribute " + attribute.getName() + ".");
imputationFailure = 0;
}
workingSet.getAttributes().setPredictedLabel(null);
}
// if strategy is not iterative impute missing values not before having
// learned all concepts
if (!iterate) {
for (int i = 0; i < numberOfAttributes; i++) {
imputationFailure = 0;
Attribute attribute = attributePairs[0][i];
log("Imputating missing values in attribute " + attribute.getName() + ".");
for (Example example : workingSet) {
double value = example.getValue(attribute);
if (Double.isNaN(value)) {
example.setValue(attribute, example.getValue(attributePairs[1][i]));
if (Double.isNaN(example.getValue(attributePairs[1][i]))) {
imputationFailure++;
}
}
}
if (imputationFailure > 0) {
logWarning("Unable to impute " + imputationFailure + " missing values in attribute " + attribute.getName() + ".");
imputationFailure = 0;
}
}
}
exampleSet.getAttributes().addRegular(label);
exampleSet.getAttributes().setLabel(label);
return new IOObject[] { exampleSet };
}
public Class<?>[] getOutputClasses() {
return new Class[] { ExampleSet.class };
}
public Class<?>[] getInputClasses() {
return new Class[] { ExampleSet.class };
}
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
types.add(new ParameterTypeCategory(PARAMETER_ORDER, "Order of attributes in which missing values are estimated.", orderStrategies, CHRONOLOGICAL));
types.add(new ParameterTypeCategory(PARAMETER_SORT, "Sort direction which is used in order strategy.", sortStrategies, ASCENDING));
types.add(new ParameterTypeBoolean(PARAMETER_ITERATE, "Impute missing values immediately after having learned the corresponding concept and iterate.", true));
types.add(new ParameterTypeBoolean(PARAMETER_FILTER_LEARNING_SET, "Apply filter to learning set in addition to determination which missing values should be substituted.", false));
types.add(new ParameterTypeBoolean(PARAMETER_LEARN_ON_COMPLETE_CASES, "Learn concepts to impute missing values only on the basis of complete cases (should be used in case learning approach can not handle missing values).", true));
types.add(new ParameterTypeInt(PARAMETER_LOCAL_RANDOM_SEED, "Use the given random seed instead of global random numbers (-1: use global).", -1, Integer.MAX_VALUE, -1));
return types;
}
}