/*
* RapidMiner
*
* Copyright (C) 2001-2007 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA.
*/
package com.rapidminer.operator.preprocessing.discretization;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;
/**
* An example filter that discretizes all numeric attributes in the dataset into
* nominal attributes. This discretization is performed by equal frequency
* binning. The number of bins is either given directly by a parameter or, if
* enabled via a second parameter, determined by the square root of the number
* of examples with non-missing values (calculated separately for every
* attribute). Skips all special attributes including the label.
*
* @author Sebastian Land, Ingo Mierswa
* @version $Id: FrequencyDiscretizer.java,v 1.12 2006/04/14 11:42:27
* ingomierswa Exp $
*/
public class FrequencyDiscretizer extends Operator {

    /** The parameter name for "If true, the number of bins is instead determined by the square root of the number of non-missing values." */
    public static final String PARAMETER_USE_SQRT_OF_EXAMPLES = "use_sqrt_of_examples";

    /** The parameter name for "Defines the number of bins which should be used for each attribute." */
    public static final String PARAMETER_NUMBER_OF_BINS = "number_of_bins";

    public FrequencyDiscretizer(OperatorDescription description) {
        super(description);
    }

    /** This operator consumes a single example set. */
    public Class[] getInputClasses() {
        return new Class[] { ExampleSet.class };
    }

    /** This operator delivers a single (discretized) example set. */
    public Class[] getOutputClasses() {
        return new Class[] { ExampleSet.class };
    }

    /**
     * Clones the input example set and replaces every non-nominal attribute of
     * the clone by a nominal attribute. The examples are sorted by their value
     * for the attribute and then receive the nominal values <code>range1</code>,
     * <code>range2</code>, ... so that every bin holds roughly the same number
     * of examples.
     *
     * @return an array containing only the discretized clone of the input
     * @throws UserError
     *             if a fixed number of bins is used and it is not smaller than
     *             the number of examples minus one
     * @throws OperatorException
     *             if the input cannot be delivered or the process is stopped
     */
    public IOObject[] apply() throws OperatorException {
        ExampleSet exampleSet = (ExampleSet) getInput(ExampleSet.class).clone();

        // Get and check parameter values
        boolean useSqrt = getParameterAsBoolean(PARAMETER_USE_SQRT_OF_EXAMPLES);
        int numberOfBins = 0;
        if (!useSqrt) {
            // no automatic sizing of bins: use the parameter value
            numberOfBins = getParameterAsInt(PARAMETER_NUMBER_OF_BINS);
            // NOTE(review): this also rejects number_of_bins == size - 1 although
            // the message only demands fewer bins than examples - confirm the bound.
            if (numberOfBins >= (exampleSet.size() - 1)) {
                throw new UserError(this, 116, PARAMETER_NUMBER_OF_BINS, "number of bins must be smaller than number of examples (here: " + exampleSet.size() + ")");
            }
        }

        // the pair buffer is allocated once and refilled for every attribute
        FrequencyDiscretizerExample[] exampleAttributePairs = new FrequencyDiscretizerExample[exampleSet.size()];
        // NOTE(review): the attribute set is modified (replace) while it is being
        // iterated; presumably the Attributes implementation supports this - confirm.
        for (Attribute currentAttribute : exampleSet.getAttributes()) {
            if (currentAttribute.isNominal()) {
                continue;
            }

            int numberOfNotMissing = collectSortedPairs(exampleSet, currentAttribute, exampleAttributePairs);
            if (useSqrt) {
                numberOfBins = (int) Math.round(Math.sqrt(numberOfNotMissing));
            }

            // change the value type of the current attribute to nominal
            Attribute nominalAttribute = exampleSet.getAttributes().replace(currentAttribute, AttributeFactory.changeValueType(currentAttribute, Ontology.NOMINAL));

            assignBins(nominalAttribute, exampleAttributePairs, numberOfBins);
        }
        return new IOObject[] { exampleSet };
    }

    /**
     * Fills the given buffer with one (value, example) pair per example for the
     * given attribute, sorts the buffer and returns how many of the values were
     * not missing (i.e. not NaN).
     */
    private int collectSortedPairs(ExampleSet exampleSet, Attribute attribute, FrequencyDiscretizerExample[] pairs) throws OperatorException {
        int numberOfNotMissing = 0;
        int index = 0;
        Iterator<Example> iterator = exampleSet.iterator();
        while (iterator.hasNext()) {
            Example currentExample = iterator.next();
            double value = currentExample.getValue(attribute);
            pairs[index++] = new FrequencyDiscretizerExample(value, currentExample);
            if (!Double.isNaN(value)) {
                numberOfNotMissing++;
            }
            checkForStop();
        }
        Arrays.sort(pairs);
        return numberOfNotMissing;
    }

    /**
     * Walks through the sorted pairs and writes the nominal value
     * <code>"range" + binNumber</code> into each example for the given attribute.
     * Every newly opened bin is logged together with its start value: the first
     * value for the first bin, otherwise the midpoint between the last value of
     * the previous bin and the first value of the new one.
     */
    private void assignBins(Attribute attribute, FrequencyDiscretizerExample[] sortedPairs, int numberOfBins) throws OperatorException {
        // NOTE(review): the bin size is derived from all examples and examples with
        // missing values receive a bin as well - confirm whether this is intended.
        double examplesPerBin = sortedPairs.length / (double) numberOfBins;
        // remaining (fractional) capacity of the current bin; every example consumes 1
        double currentBinSpace = 0;
        int currentBin = 0;
        for (int k = 0; k < sortedPairs.length; k++) {
            // open the next bin if the current one is full and it was not the last
            if (currentBinSpace < 1 && currentBin < numberOfBins) {
                currentBin++;
                currentBinSpace += examplesPerBin;
                double rangeStart;
                if (k == 0) {
                    rangeStart = sortedPairs[0].getValue();
                } else {
                    rangeStart = (sortedPairs[k - 1].getValue() + sortedPairs[k].getValue()) / 2.0d;
                }
                // logged after the increment so that the name matches the value set below
                log(attribute.getName() + ": start new range" + currentBin + " at " + Tools.formatNumber(rangeStart));
            }
            // set the number of the bin as nominal value
            Example example = sortedPairs[k].getExample();
            example.setValue(attribute, "range" + currentBin);
            currentBinSpace--;
            checkForStop();
        }
    }

    /**
     * Adds the two non-expert parameters of this operator (the fixed number of
     * bins and the switch for the square root heuristic) to the parameters
     * delivered by the super class.
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        ParameterType type = new ParameterTypeInt(PARAMETER_NUMBER_OF_BINS, "Defines the number of bins which should be used for each attribute.", 2, Integer.MAX_VALUE, 2);
        type.setExpert(false);
        types.add(type);
        type = new ParameterTypeBoolean(PARAMETER_USE_SQRT_OF_EXAMPLES, "If true, the number of bins is instead determined by the square root of the number of non-missing values.", false);
        type.setExpert(false);
        types.add(type);
        return types;
    }
}