/* * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ /* * NBTreeModelSelection.java * Copyright (C) 2004-2012 University of Waikato, Hamilton, New Zealand * */ package weka.classifiers.trees.j48; import java.util.Enumeration; import weka.core.Attribute; import weka.core.Instances; import weka.core.RevisionUtils; import weka.core.Utils; /** * Class for selecting a NB tree split. * * @author Mark Hall (mhall@cs.waikato.ac.nz) * @version $Revision: 8034 $ */ public class NBTreeModelSelection extends ModelSelection { /** for serialization */ private static final long serialVersionUID = 990097748931976704L; /** Minimum number of objects in interval. */ private int m_minNoObj; /** All the training data */ private Instances m_allData; // /** * Initializes the split selection method with the given parameters. * * @param minNoObj minimum number of instances that have to occur in at least two * subsets induced by split * @param allData FULL training dataset (necessary for * selection of split points). */ public NBTreeModelSelection(int minNoObj, Instances allData) { m_minNoObj = minNoObj; m_allData = allData; } /** * Sets reference to training data to null. */ public void cleanup() { m_allData = null; } /** * Selects NBTree-type split for the given dataset. */ public final ClassifierSplitModel selectModel(Instances data){ double globalErrors = 0; double minResult; double currentResult; NBTreeSplit [] currentModel; NBTreeSplit bestModel = null; NBTreeNoSplit noSplitModel = null; int validModels = 0; boolean multiVal = true; Distribution checkDistribution; Attribute attribute; double sumOfWeights; int i; try{ // build the global model at this node noSplitModel = new NBTreeNoSplit(); noSplitModel.buildClassifier(data); if (data.numInstances() < 5) { return noSplitModel; } // evaluate it globalErrors = noSplitModel.getErrors(); if (globalErrors == 0) { return noSplitModel; } // Check if all Instances belong to one class or if not // enough Instances to split. checkDistribution = new Distribution(data); if (Utils.sm(checkDistribution.total(), m_minNoObj) || Utils.eq(checkDistribution.total(), checkDistribution.perClass(checkDistribution.maxClass()))) { return noSplitModel; } // Check if all attributes are nominal and have a // lot of values. if (m_allData != null) { Enumeration enu = data.enumerateAttributes(); while (enu.hasMoreElements()) { attribute = (Attribute) enu.nextElement(); if ((attribute.isNumeric()) || (Utils.sm((double)attribute.numValues(), (0.3*(double)m_allData.numInstances())))){ multiVal = false; break; } } } currentModel = new NBTreeSplit[data.numAttributes()]; sumOfWeights = data.sumOfWeights(); // For each attribute. for (i = 0; i < data.numAttributes(); i++){ // Apart from class attribute. if (i != (data).classIndex()){ // Get models for current attribute. currentModel[i] = new NBTreeSplit(i,m_minNoObj,sumOfWeights); currentModel[i].setGlobalModel(noSplitModel); currentModel[i].buildClassifier(data); // Check if useful split for current attribute // exists and check for enumerated attributes with // a lot of values. if (currentModel[i].checkModel()){ validModels++; } } else { currentModel[i] = null; } } // Check if any useful split was found. if (validModels == 0) { return noSplitModel; } // Find "best" attribute to split on. minResult = globalErrors; for (i=0;i<data.numAttributes();i++){ if ((i != (data).classIndex()) && (currentModel[i].checkModel())) { /* System.err.println("Errors for "+data.attribute(i).name()+" "+ currentModel[i].getErrors()); */ if (currentModel[i].getErrors() < minResult) { bestModel = currentModel[i]; minResult = currentModel[i].getErrors(); } } } // System.exit(1); // Check if useful split was found. if (((globalErrors - minResult) / globalErrors) < 0.05) { return noSplitModel; } /* if (bestModel == null) { System.err.println("This shouldn't happen! glob : "+globalErrors+ " minRes : "+minResult); System.exit(1); } */ // Set the global model for the best split // bestModel.setGlobalModel(noSplitModel); return bestModel; }catch(Exception e){ e.printStackTrace(); } return null; } /** * Selects NBTree-type split for the given dataset. */ public final ClassifierSplitModel selectModel(Instances train, Instances test) { return selectModel(train); } /** * Returns the revision string. * * @return the revision */ public String getRevision() { return RevisionUtils.extract("$Revision: 8034 $"); } }