/* * RapidMiner * * Copyright (C) 2001-2007 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. */ package com.rapidminer.operator.preprocessing.discretization; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Statistics; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.tools.Ontology; /** * A filter that discretizes all numeric attributes in the dataset into nominal * attributes. The discretization is performed by selecting a bin boundary * minimizing the entropy in the induced partitions. The method is then applied * recursively for both new partitions until the stopping criterion is reached. * For Detail see a)Multi-interval discretization of continued-values attributes * for classification learning(Fayyad,Irani) b)Supervised and Unsupervized * Discretization(Dougherty,Kohavi,Sahami) Skips all special attributes * including the label. * * @author Dirk Dach * @version $Id: MinimalEntropyPartitioning.java,v 1.5 2006/04/14 11:42:27 * ingomierswa Exp $ */ public class MinimalEntropyPartitioning extends Discretization { public MinimalEntropyPartitioning(OperatorDescription description) { super(description); } private Double getMinEntropySplitpoint(LinkedList<double[]> truncatedExamples, Attribute label) { HashSet<Double> candidateSplitpoints = new HashSet<Double>(); Iterator<double[]> it = truncatedExamples.iterator(); int[] totalLabelDistribution = new int[label.getMapping().size()]; // Label // distribution // for // all // examples. while (it.hasNext()) { // Get splitpoint candidates and total label // distribution. double[] attributeLabelPair = it.next(); candidateSplitpoints.add(attributeLabelPair[0]); int labelIndex = (int) attributeLabelPair[1]; totalLabelDistribution[labelIndex]++; } double[] totalFrequencies = new double[label.getMapping().size()]; for (int i = 0; i < label.getMapping().size(); i++) { totalFrequencies[i] = (double) totalLabelDistribution[i] / (double) truncatedExamples.size(); } double totalEntropy = 0.0d; for (int i = 0; i < label.getMapping().size(); i++) { totalEntropy -= totalFrequencies[i] * log2(totalFrequencies[i]); } double minClassInformationEntropy = totalEntropy; double bestSplitpoint = Double.NaN; double bestSplitpointEntropy1 = Double.POSITIVE_INFINITY; double bestSplitpointEntropy2 = Double.POSITIVE_INFINITY; int k1 = 0; // Number of different class labels in class 1. int k2 = 0; // Number of different class labels in class 2. Iterator it1 = candidateSplitpoints.iterator(); while (it1.hasNext()) { // Test every value as splitpoint double currentSplitpoint = ((Double) it1.next()).doubleValue(); // Initialize. int s1 = 0; // Instances in partition 1. int s2 = 0; // Instances in partition 2. k1 = 0; k2 = 0; int[] labelDistribution1 = new int[label.getMapping().size()]; // Label // distribution // in // class // 1. int[] labelDistribution2 = new int[label.getMapping().size()]; // Label // distribution // in // class // 2. // Determine the class of each instance and the corresponding label // distribution. Iterator it2 = truncatedExamples.iterator(); while (it2.hasNext()) { double[] attributeLabelPair = (double[]) it2.next(); double valueToCompare = attributeLabelPair[0]; int labelIndex = (int) attributeLabelPair[1]; if (valueToCompare <= currentSplitpoint) { // Partition 1 gets // all instances // with values less // or equal to the // current // splitpoint. s1++; labelDistribution1[labelIndex]++; } else { // Partition 2 gets all instances with values // greater than the current split point. s2++; labelDistribution2[labelIndex]++; } } // Calculate frequencies and number of different labels for this // splitpoint each class. double[] frequencies1 = new double[label.getMapping().size()]; double[] frequencies2 = new double[label.getMapping().size()]; for (int i = 0; i < label.getMapping().size(); i++) { frequencies1[i] = (double) labelDistribution1[i] / (double) s1; frequencies2[i] = (double) labelDistribution2[i] / (double) s2; if (labelDistribution1[i] > 0) { // Label value i exists in // class 1. k1++; } if (labelDistribution2[i] > 0) { // Label value i exists in // class 2. k2++; } } // Calculate entropies. double entropy1 = 0.0d; for (int i = 0; i < label.getMapping().size(); i++) { entropy1 -= frequencies1[i] * log2(frequencies1[i]); } double entropy2 = 0.0d; for (int i = 0; i < label.getMapping().size(); i++) { entropy2 -= frequencies2[i] * log2(frequencies2[i]); } double classInformationEntropy = ((double) s1 / (double) truncatedExamples.size()) * entropy1 + ((double) s2 / (double) truncatedExamples.size()) * entropy2; if (classInformationEntropy < minClassInformationEntropy) { minClassInformationEntropy = classInformationEntropy; bestSplitpoint = currentSplitpoint; bestSplitpointEntropy1 = entropy1; bestSplitpointEntropy2 = entropy2; } } // Calculate the termination criterion. Return null if termination // criterion is met. double gain = totalEntropy - minClassInformationEntropy; double delta = log2(Math.pow(3.0, label.getMapping().size()) - 2) - (label.getMapping().size() * totalEntropy - k1 * bestSplitpointEntropy1 - k2 * bestSplitpointEntropy2); if (gain >= log2(truncatedExamples.size() - 1) / truncatedExamples.size() + delta / truncatedExamples.size()) { return new Double(bestSplitpoint); } else { return null; } } /* * LinkedList partition consist of double arrays of size 2. array[0]=value * of the current attribute, array[1]=corresponding label value. */ private ArrayList getSplitpoints(LinkedList<double[]> startPartition, Attribute label) { LinkedList<LinkedList<double[]>> border = new LinkedList<LinkedList<double[]>>(); ArrayList<Double> result = new ArrayList<Double>(); border.addLast(startPartition); while (!border.isEmpty()) { LinkedList<double[]> currentPartition = border.removeFirst(); Double splitpoint = this.getMinEntropySplitpoint(currentPartition, label); if (splitpoint != null) { result.add(splitpoint); double splitValue = splitpoint.doubleValue(); LinkedList<double[]> newPartition1 = new LinkedList<double[]>(); LinkedList<double[]> newPartition2 = new LinkedList<double[]>(); Iterator<double[]> it = currentPartition.iterator(); while (it.hasNext()) { // Create new partitions. double[] attributeLabelPair = it.next(); if (attributeLabelPair[0] <= splitValue) { newPartition1.addLast(attributeLabelPair); } else { newPartition2.addLast(attributeLabelPair); } } border.addLast(newPartition1); border.addLast(newPartition2); } } return result; // Empty ArrayList if no Splitpoint could be found. } /** * Delivers the maximum range thresholds for all attributes, i.e. the value * getRanges()[a][b] is the b-th threshold for the a-th attribute. */ public double[][] getRanges(ExampleSet exampleSet) { double[][] ranges = new double[exampleSet.getAttributes().size()][]; Attribute label = exampleSet.getAttributes().getLabel(); int a = 0; for (Attribute attribute : exampleSet.getAttributes()) { if (!attribute.isNominal()) { // skip nominal attributes Iterator<Example> reader = exampleSet.iterator(); LinkedList<double[]> startPartition = new LinkedList<double[]>(); while (reader.hasNext()) { // Create start partition. Example example = reader.next(); double[] attributeLabelPair = new double[2]; attributeLabelPair[0] = example.getValue(attribute); attributeLabelPair[1] = example.getValue(label); startPartition.addLast(attributeLabelPair); } ArrayList splitpointsOfAttribute = getSplitpoints(startPartition, label); Iterator it = splitpointsOfAttribute.iterator(); ranges[a] = new double[splitpointsOfAttribute.size() + 1]; for (int i = 0; it.hasNext(); i++) { ranges[a][i] = ((Double) it.next()).doubleValue(); } ranges[a][ranges[a].length - 1] = exampleSet.getStatistics(attribute, Statistics.MAXIMUM); Arrays.sort(ranges[a]); } a++; } return ranges; } public IOObject[] apply() throws OperatorException { ExampleSet exampleSet = getInput(ExampleSet.class); Attribute label = exampleSet.getAttributes().getLabel(); if ((label == null) || (!label.isNominal())) throw new UserError(this, 101, getName(), (label == null ? "no label" : label.getName())); exampleSet.recalculateAllAttributeStatistics(); checkForStop(); double[][] ranges = getRanges(exampleSet); boolean[] numerical = new boolean[ranges.length]; // needed since // value type is // changed! // change attribute type int a = 0; for (Attribute attribute : exampleSet.getAttributes()) { if (!attribute.isNominal()) { // skip nominal attributes numerical[a] = true; attribute = exampleSet.getAttributes().replace(attribute, AttributeFactory.changeValueType(attribute, Ontology.NOMINAL)); for (int b = 0; b < ranges[a].length; b++) { attribute.getMapping().mapString("range" + (b + 1)); } } else { numerical[a] = false; } a++; } // change data Iterator<Example> reader = exampleSet.iterator(); while (reader.hasNext()) { Example example = reader.next(); a = 0; for (Attribute attribute : exampleSet.getAttributes()) { if (numerical[a] && ranges[a] != null) { double value = example.getValue(attribute); for (int b = 0; b < ranges[a].length; b++) { if (value <= ranges[a][b]) { example.setValue(attribute, attribute.getMapping().mapString("range" + (b + 1))); break; } } } a++; } checkForStop(); } // remove useless attributes with no splitpoint a = 0; Iterator<Attribute> i = exampleSet.getAttributes().iterator(); while (i.hasNext()) { i.next(); if (numerical[a] && ranges[a].length == 1) { i.remove(); } a++; } return new IOObject[] { exampleSet }; } public double log2(double arg) { return Math.log(arg) / Math.log(2); } public Class[] getOutputClasses() { return new Class[] { ExampleSet.class }; } public Class[] getInputClasses() { return new Class[] { ExampleSet.class }; } }