/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. Sánchez (luciano@uniovi.es) J. Alcalá-Fdez (jalcala@decsai.ugr.es) S. García (sglopez@ujaen.es) A. Fernández (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ /** * <p> * @author Written by Cristobal Romero (Universidad de C�rdoba) 10/10/2007 * @version 0.1 * @since JDK 1.5 *</p> */ package keel.Algorithms.Semi_Supervised_Learning.Basic.C45; import java.util.*; import keel.Algorithms.Decision_Trees.C45.Attribute; /** * Class to select a cut point in a dataset. */ public class SelectCut { /** Minimum number of objects in interval. */ private int minItemsets; /** The training data. */ private Dataset dataset; /** Creates a new cut model. * * @param nObj Minimum number of objects. * @param allData The dataset. */ public SelectCut(int nObj, Dataset allData) { minItemsets = nObj; dataset = allData; } /** Function to select the cut point. * * @param data The dataset used to compute the cut point. * * @return The cut point computed. */ public final Cut selectModel(Dataset data) { double minResult, averageInfoGain = 0, sumOfWeights; Cut[] current; Cut best = null, noCut = null; int models = 0, i; boolean multiVal = true; Classification checkClassification; Attribute attribute; try { // Check if all Dataset belong to one class or if not // enough Dataset to Split. checkClassification = new Classification(data); noCut = new Cut(checkClassification); if (checkClassification.getTotal() < 2 * minItemsets || checkClassification.getTotal() == checkClassification.perClass(checkClassification.maxClass())) { return noCut; } // Check if all attributes are nominal and have a // lot of values. if (dataset != null) { Enumeration enum2 = data.enumerateAttributes(); while (enum2.hasMoreElements()) { attribute = (Attribute) enum2.nextElement(); if ((attribute.isContinuous()) || ((double) attribute.numValues() < (0.3 * (double) dataset.numItemsets()))) { multiVal = false; break; } } } current = new Cut[data.numAttributes()]; sumOfWeights = data.sumOfWeights(); // For each attribute. for (i = 0; i < data.numAttributes(); i++) { // Apart from class attribute. if (i != (data).getClassIndex()) { // Get models for current attribute. current[i] = new Cut(i, minItemsets, sumOfWeights); current[i].classify(data); // Check if useful Split for current attribute // exists and check for enumerated attributes with // a lot of values. if (current[i].checkModel()) { if (dataset != null) { if ((data.getAttribute(i).isContinuous()) || (multiVal || (double) data.getAttribute(i).numValues() < (0.3 * (double) dataset.numItemsets()))) { averageInfoGain = averageInfoGain + current[i].getInfoGain(); models++; } } else { averageInfoGain = averageInfoGain + current[i].getInfoGain(); models++; } } } else { current[i] = null; } } // Check if any useful Split was found. if (models == 0) { return noCut; } averageInfoGain = averageInfoGain / (double) models; // Find "best" attribute to Split on. minResult = 0; for (i = 0; i < data.numAttributes(); i++) { if ((i != (data).getClassIndex()) && (current[i].checkModel())) { // Use 1E-3 here to get a closer approximation to the original // implementation. if ((current[i].getInfoGain() >= (averageInfoGain - 1E-3)) && current[i].getGainRatio() > minResult) { best = current[i]; minResult = current[i].getGainRatio(); } } } // Check if useful Split was found. if (minResult == 0) { return noCut; } // Add all Dataset with unknown values for the corresponding // attribute to the classification for the model, so that // the complete classification is stored with the model. best.classification().addWithUnknownValue(data, best.attributeIndex()); // Set the Split point analogue to C45 if attribute numeric. if (dataset != null) { best.setCutPoint(dataset); } return best; } catch (Exception e) { e.printStackTrace(); } return null; } }