/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.learner.tree; import com.rapidminer.operator.learner.tree.criterions.ColumnCriterion; import com.rapidminer.operator.learner.tree.criterions.WeightDistribution; import com.rapidminer.tools.Tools; /** * Calculates the best split point for numerical attributes according to a given criterion. * * @author Ingo Mierswa, Gisa Schaefer */ public class ColumnNumericalSplitter { private ColumnCriterion criterion; private ColumnExampleTable columnTable; public ColumnNumericalSplitter(ColumnExampleTable columnTable, ColumnCriterion criterion) { this.criterion = criterion; this.columnTable = columnTable; } /** * Calculates where to best split a numerical attribute by considering all possibilities and the * associated benefits according to the given criterion. If there are missing values, they are * considered as extra class. * * @param selectedExamples * which of the starting examples are considered sorted such the associated attribute * values are in ascending order * @param attributeNumber * indicates which attribute is considered * @return the benefit of the best split */ public ParallelBenefit getBestSplitBenefit(int[] selectedExamples, int attributeNumber) { final double[] attributeColumn = columnTable.getNumericalAttributeColumn(attributeNumber); double bestSplit = Double.NaN; double lastValue = Double.NaN; double bestSplitBenefit = Double.NEGATIVE_INFINITY; int lastRow = -1; WeightDistribution distribution = null; if (this.criterion.supportsIncrementalCalculation()) { distribution = this.criterion.startIncrementalCalculation(columnTable, selectedExamples, attributeNumber); } for (int j : selectedExamples) { double currentValue = attributeColumn[j]; if (this.criterion.supportsIncrementalCalculation()) { if (lastRow > -1) { this.criterion.updateWeightDistribution(columnTable, lastRow, distribution); } lastRow = j; if (!Tools.isEqual(currentValue, lastValue)) { double benefit = this.criterion.getIncrementalBenefit(distribution); if (benefit > bestSplitBenefit) { bestSplitBenefit = benefit; bestSplit = (lastValue + currentValue) / 2.0d; } } } else { if (!Tools.isEqual(currentValue, lastValue)) { double splitValue = (lastValue + currentValue) / 2.0d; double benefit = this.criterion.getNumericalBenefit(columnTable, selectedExamples, attributeNumber, splitValue); if (benefit > bestSplitBenefit) { bestSplitBenefit = benefit; bestSplit = splitValue; } } } lastValue = currentValue; } if (Double.isNaN(bestSplit)) { return null; } else { return new ParallelBenefit(bestSplitBenefit, attributeNumber, bestSplit); } } }