/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.learner.rules; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.set.SortedExampleSet; /** * Find the best split point for numerical attributes according to accuracy. * * @author Ingo Mierswa * @version $Id: NumericalSplitter.java,v 1.9 2008/05/09 19:23:13 ingomierswa Exp $ */ public class NumericalSplitter { private Criterion criterion; private double minValue = 0.5d; public NumericalSplitter() { this.criterion = new AccuracyCriterion(); } public NumericalSplitter(Criterion criterion) { this.criterion = criterion; } public Split getBestSplit(ExampleSet inputSet, Attribute attribute, String labelName) { SortedExampleSet exampleSet = new SortedExampleSet((ExampleSet)inputSet.clone(), attribute, SortedExampleSet.INCREASING); Attribute labelAttribute = exampleSet.getAttributes().getLabel(); int labelIndex = labelAttribute.getMapping().mapString(labelName); double oldLabel = Double.NaN; double bestSplit = Double.NaN; double lastValue = Double.NaN; double bestBenefit = Double.NEGATIVE_INFINITY; double bestTotalWeight = 0; int bestSplitType = Split.LESS_SPLIT; // initiating online counting of benefit: only 2 Datascans needed then criterion.reinitOnlineCounting(exampleSet); for (Example e : exampleSet) { double currentValue = e.getValue(attribute); double label = e.getValue(labelAttribute); if ((Double.isNaN(oldLabel)) || (oldLabel != label) && (lastValue != currentValue)) { double splitValue = (lastValue + currentValue) / 2.0d; double[] benefits; if (labelName == null) { benefits = criterion.getOnlineBenefit(e); } else { benefits = criterion.getOnlineBenefit(e, labelIndex); } // online method returns both possible relations in one array(greater / smaller) in one array if ((benefits[0] > minValue) && (benefits[0] > 0) && (benefits[1] > 0) && ((benefits[0] > bestBenefit) || ((benefits[0] == bestBenefit) && (benefits[1] > bestTotalWeight)))) { bestBenefit = benefits[0]; bestSplit = splitValue; bestTotalWeight = benefits[1]; bestSplitType = Split.LESS_SPLIT; } if ((benefits[2] > minValue) && (benefits[2] > 0) && (benefits[3] > 0) && ((benefits[2] > bestBenefit) || ((benefits[2] == bestBenefit) && (benefits[3] > bestTotalWeight)))) { bestBenefit = benefits[2]; bestSplit = splitValue; bestTotalWeight = benefits[3]; bestSplitType = Split.GREATER_SPLIT; } oldLabel = label; } lastValue = currentValue; criterion.update(e); } return new Split(bestSplit, new double[] { bestBenefit, bestTotalWeight }, bestSplitType); } }