/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.filter.attributes;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.set.ConditionCreationException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.MetaDataInfo;
import com.rapidminer.parameter.ParameterHandler;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeDouble;
/**
 * This condition checks whether an attribute contains at most a specified fraction of missing values.
 * If the maximal fraction is exceeded, the attribute is removed.
 *
 * This condition needs a full data scan per attribute and hence might slow down calculations.
 *
 * The condition is stateful: it keeps running counters for the attribute currently being scanned
 * in plain instance fields, without any synchronization.
 *
 * @author Sebastian Land
 */
public class MissingValuesAttributeFilter extends AbstractAttributeFilterCondition {

    /** Key of the parameter holding the maximal tolerated fraction of missing values. */
    public static final String PARAMETER_MAX_FRACTION_MISSING = "max_fraction_of_missings";

    /** Maximal tolerated fraction of missing values, read from the parameter handler in {@link #init(ParameterHandler)}. */
    private double maxFraction;

    /** Number of examples counted so far for {@link #lastAttribute}. */
    private int numberOfExamples = 0;

    /** Number of examples counted so far whose value for {@link #lastAttribute} was NaN (missing). */
    private int numberOfMissings = 0;

    /** Attribute the counters currently belong to; {@code null} when no scan is in progress. */
    private Attribute lastAttribute = null;

    /**
     * Meta data based filtering is not implemented yet, so the outcome is always reported as unknown.
     *
     * @return always {@link MetaDataInfo#UNKNOWN}
     */
    @Override
    public MetaDataInfo isFilteredOutMetaData(AttributeMetaData attribute, ParameterHandler parameterHandler) throws ConditionCreationException {
        //TODO: Implement meta data dependent handling
        return MetaDataInfo.UNKNOWN;
    }

    /**
     * @return always {@code true}: the verdict is only available from {@link #checkAfterFullScan()}
     */
    @Override
    public boolean isNeedingFullScan() {
        return true;
    }

    /**
     * @return always {@code true}: the values of the examples have to be inspected
     */
    @Override
    public boolean isNeedingScan() {
        return true;
    }

    /**
     * No decision is made before the data has been seen.
     *
     * @return always {@link ScanResult#UNCHECKED}
     */
    @Override
    public ScanResult beforeScanCheck(Attribute attribute) throws UserError {
        return ScanResult.UNCHECKED;
    }

    /**
     * Counts the given example and, if its value for the attribute is NaN, counts it as missing.
     * The counters are restarted whenever a different attribute instance than in the previous call is passed.
     *
     * @param attribute the attribute currently being scanned
     * @param example   the example whose value is inspected
     * @return always {@link ScanResult#UNCHECKED}, since the counting is not completed
     */
    @Override
    public ScanResult check(Attribute attribute, Example example) {
        // identity comparison: a new attribute (or the first call after a completed scan) starts a fresh count
        if (attribute != lastAttribute) {
            numberOfExamples = 0;
            numberOfMissings = 0;
            lastAttribute = attribute;
        }
        numberOfExamples++;
        if (Double.isNaN(example.getValue(attribute))) {
            numberOfMissings++;
        }
        // returning unchecked, since counting not completed
        return ScanResult.UNCHECKED;
    }

    /**
     * Derives the verdict from the counters collected by {@link #check(Attribute, Example)}.
     *
     * @return {@link ScanResult#REMOVE} if the fraction of missing values is strictly greater than the
     *         configured maximum, {@link ScanResult#KEEP} otherwise (including the case that no example
     *         was counted at all)
     */
    @Override
    public ScanResult checkAfterFullScan() {
        ScanResult verdict = ScanResult.KEEP;
        // without any counted example there is no fraction to compare; the attribute is kept
        if (numberOfExamples > 0) {
            double missingFraction = (double) numberOfMissings / numberOfExamples;
            if (missingFraction > maxFraction) {
                verdict = ScanResult.REMOVE;
            }
        }
        // Mark the scan as finished: the next check() call restarts the counters even if it is handed
        // the very same attribute instance again, and the attribute is no longer referenced from here.
        // The counters themselves are left untouched so that a repeated call yields the same verdict.
        lastAttribute = null;
        return verdict;
    }

    /**
     * Reads the maximal tolerated fraction of missing values from the given parameter handler.
     *
     * @param operator the handler providing the value of {@link #PARAMETER_MAX_FRACTION_MISSING}
     */
    @Override
    public void init(ParameterHandler operator) throws UserError, ConditionCreationException {
        maxFraction = operator.getParameterAsDouble(PARAMETER_MAX_FRACTION_MISSING);
    }

    /**
     * Extends the parameter types of the super class by the maximal fraction parameter.
     *
     * @return the list returned by the super class with the additional parameter type appended
     */
    @Override
    public List<ParameterType> getParameterTypes(ParameterHandler operator, InputPort inPort, int...valueTypes) {
        // NOTE(review): valueTypes is not forwarded to the super call -- confirm that this is intended
        List<ParameterType> types = super.getParameterTypes(operator, inPort);
        types.add(new ParameterTypeDouble(PARAMETER_MAX_FRACTION_MISSING, "If the attribute contains missing values in more than this fraction of the total number of examples, it is removed.", 0d, 1d, true));
        return types;
    }
}