/***********************************************************************

  This file is part of KEEL-software, the Data Mining tool for regression,
  classification, clustering, pattern mining and so on.

  Copyright (C) 2004-2010

  F. Herrera (herrera@decsai.ugr.es)
  L. Sánchez (luciano@uniovi.es)
  J. Alcalá-Fdez (jalcala@decsai.ugr.es)
  S. García (sglopez@ujaen.es)
  A. Fernández (alberto.fernandez@ujaen.es)
  J. Luengo (julianlm@decsai.ugr.es)

  This program is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Genetic_Rule_Learning.PART;

/**
 * <p>
 * Class to handle the classifier tree.
 * </p>
 *
 * <p>
 * @author Written by Antonio Alejandro Tortosa (University of Granada) 15/10/2008
 * @author Modified by Xavi Solé (La Salle, Ramón Llull University - Barcelona) 03/12/2008
 * @version 1.1
 * @since JDK1.2
 * </p>
 */
public class Tree {

  /** Total number of nodes in the tree. */
  public static int NumberOfNodes;

  /** Number of leaves in the tree. */
  public static int NumberOfLeafs;

  /** The selected model. */
  protected SelectCut model;

  /** The cut model of this node. */
  protected Cut nodeModel;

  /** Sons of this node. */
  protected Tree [] sons;

  /** Whether this node is a leaf. */
  protected boolean isLeaf;

  /** Whether this node is empty. */
  protected boolean isEmpty;

  /** Whether this node is still unexplored. */
  protected boolean isUnexplored;

  /** The dataset. */
  protected MyDataset train;

  /** Whether the tree must be pruned. */
  protected boolean prune = false;

  /** The confidence factor for pruning. */
  protected float confidence = 0.25f;

  /** Constructor.
   */
  public Tree() {
    isUnexplored = true;
    NumberOfNodes = 0;
    NumberOfLeafs = 0;
  }

  /** Constructor.
   *
   * @param selectNodeModel The cut model.
   * @param pruneTree Whether the tree must be pruned.
   * @param cf The confidence factor for pruning.
   */
  public Tree( SelectCut selectNodeModel, boolean pruneTree, float cf ) {
    model = selectNodeModel;
    prune = pruneTree;
    confidence = cf;
    isUnexplored = true;
    NumberOfNodes = 0;
    NumberOfLeafs = 0;
  }

  /** Builds one new node and, recursively, the sons that must be explored.
   *
   * @param data The dataset.
   *
   * @throws Exception If the node cannot be built.
   */
  public void buildNode( MyDataset data ) throws Exception {

    MyDataset [] localItemsets;

    train = data;
    isLeaf = false;
    isEmpty = false;
    sons = null;
    nodeModel = model.selectModel( data );

    if ( nodeModel.numSubsets() > 1 ) {
      localItemsets = nodeModel.cutDataset( data );
      data = null;
      sons = new Tree [nodeModel.numSubsets()];

      // Computes the entropy of every subset and decides the exploration order.
      Pair [] unexplored_sons = new Pair[nodeModel.numSubsets()];
      for ( int i = 0; i < nodeModel.numSubsets(); i++ ) {
        unexplored_sons[i] = new Pair();
        unexplored_sons[i].key = i;
        unexplored_sons[i].value = localItemsets[i].getEntropy();
      }
      Utilities.mergeSort( unexplored_sons, nodeModel.numSubsets() );

      boolean noLeaf = false; // set to true as soon as an expanded son is not a leaf
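      // Only a partial tree is built here: the sons are expanded in order of
      // increasing entropy, and the expansion stops as soon as one expanded
      // son is not a leaf. The remaining sons stay marked as unexplored.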
      for ( int i = 0; i < sons.length && !noLeaf; i++ ) {
        int next = unexplored_sons[i].key;
        sons[next] = getNewTree( localItemsets[next] );
        sons[next].isUnexplored = false;
        localItemsets[next] = null;
        if ( !sons[next].isLeaf )
          noLeaf = true;
      }

      // If all the sons are leaves, try to collapse this node.
      if ( !noLeaf ) {
        collapse();
        System.out.println( "Collapsing...." );
        if ( isLeaf )
          System.out.println( "Yes!!" );
      }
    }
    else {
      isLeaf = true;
      if ( data.sumOfWeights() == 0 )
        isEmpty = true;
      data = null;
    }
  }

  /** Function to build the classifier tree.
   *
   * @param data The dataset.
   *
   * @throws Exception If the tree cannot be built.
   */
  public void buildTree( MyDataset data ) throws Exception {
    data = new MyDataset( data );
    data.deleteWithMissing( data.getClassIndex() );
    buildNode( data );
    //collapse();
    //if ( prune )
    //  prune();
  }

  /** Function to collapse a tree to a node if the training error does not increase.
   */
  public final void collapse() {

    double errorsOfSubtree, errorsOfTree;
    int i;

    if ( !isLeaf ) {
      errorsOfSubtree = getErrors();
      errorsOfTree = nodeModel.classification().numIncorrect();

      if ( errorsOfSubtree >= errorsOfTree - 1E-3 ) {
        // Free adjacent trees.
        sons = null;
        isLeaf = true;

        // Get NoCut model for the tree.
        nodeModel = new Cut( nodeModel.classification() );
      }
      else
        for ( i = 0; i < sons.length; i++ )
          if ( !sons[i].isUnexplored )
            son( i ).collapse();
    }
  }

  /** Function to prune a tree.
   *
   * @throws Exception If the pruning cannot be made.
   */
  public void prune() throws Exception {

    double errorsLargestBranch, errorsLeaf, errorsTree;
    int indexOfLargestBranch, i;
    Tree largestBranch;

    if ( !isLeaf ) {
      // Prune all subtrees.
      for ( i = 0; i < sons.length; i++ )
        son( i ).prune();

      // Compute the error for the largest branch.
      indexOfLargestBranch = nodeModel.classification().maxValue();
      errorsLargestBranch = son( indexOfLargestBranch ).getEstimatedErrorsForBranch( (MyDataset)train );

      // Compute the error if this tree were a leaf.
      errorsLeaf = getEstimatedErrorsForLeaf( nodeModel.classification() );

      // Compute the error for the whole subtree.
      errorsTree = getEstimatedErrors();

      // Decide if turning this node into a leaf is the best choice.
      if ( errorsLeaf <= errorsTree + 0.1 && errorsLeaf <= errorsLargestBranch + 0.1 ) {
        // Free the son trees.
        sons = null;
        isLeaf = true;

        // Get NoCut model for the node.
        nodeModel = new Cut( nodeModel.classification() );
        return;
      }

      // Decide if the largest branch is a better choice than the whole subtree.
      if ( errorsLargestBranch <= errorsTree + 0.1 ) {
        largestBranch = son( indexOfLargestBranch );
        sons = largestBranch.sons;
        nodeModel = largestBranch.nodeModel;
        isLeaf = largestBranch.isLeaf;
        newClassification( train );
        prune();
      }
    }
  }

  /** Function to get the classification of classes.
   *
   * @param itemset The itemset to classify.
   *
   * @return The class distribution estimated for the itemset.
   *
   * @throws Exception If the probabilities cannot be computed.
   */
  public final double [] classificationForItemset( Itemset itemset ) throws Exception {

    double [] doubles = new double[itemset.numClasses()];

    for ( int i = 0; i < doubles.length; i++ )
      doubles[i] = getProbabilities( i, itemset, 1 );

    return doubles;
  }

  /** Function to compute the class probabilities of a given itemset.
   *
   * @param classIndex The index of the class attribute.
   * @param itemset The itemset.
   * @param weight The weight.
   *
   * @return The probability of the class.
   *
   * @throws Exception If the probabilities cannot be computed.
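   *
   * <p>
   * If the cut attribute of this node is missing in the itemset
   * (<code>whichSubset</code> returns -1), the itemset is sent down every
   * explored, non-empty son with its weight scaled by the weights of the node
   * model, and the partial probabilities are summed; otherwise it follows the
   * single matching branch. Unexplored branches contribute a probability of 0.
   * </p>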
   */
  private double getProbabilities( int classIndex, Itemset itemset, double weight ) throws Exception {

    double [] weights;
    double prob = 0;
    int treeIndex, i;

    if ( isLeaf )
      return weight * nodeModel.classProbability( classIndex, itemset, -1 );
    else {
      treeIndex = nodeModel.whichSubset( itemset );

      if ( treeIndex == -1 ) {
        // Missing cut attribute: distribute the itemset among the explored sons.
        weights = nodeModel.weights( itemset );

        for ( i = 0; i < sons.length; i++ ) {
          if ( !sons[i].isUnexplored && !son( i ).isEmpty ) {
            prob += son( i ).getProbabilities( classIndex, itemset, weights[i] * weight );
          }
        }

        return prob;
      }
      else {
        if ( !sons[treeIndex].isUnexplored && son( treeIndex ).isEmpty )
          return weight * nodeModel.classProbability( classIndex, itemset, treeIndex );
        else if ( !sons[treeIndex].isUnexplored )
          return son( treeIndex ).getProbabilities( classIndex, itemset, weight );
        else
          return 0;
      }
    }
  }

  /** Function to print the tree.
   *
   * @return the string representation of this tree.
   */
  public String toString() {

    try {
      StringBuffer text = new StringBuffer();

      if ( !isUnexplored && !isLeaf ) {
        NumberOfNodes++;
        printTree( 0, text );
      }

      return text.toString();
    }
    catch ( Exception e ) {
      return "Cannot print the tree. " + e.getMessage();
    }
  }

  /** Function to print the tree.
   *
   * @param depth Depth of the node in the tree.
   * @param text The buffer where the tree is printed.
   *
   * @throws Exception If the tree cannot be printed.
   */
  private void printTree( int depth, StringBuffer text ) throws Exception {

    int i;
    String aux = "";

    for ( int k = 0; k < depth; k++ )
      aux += "\t";

    for ( i = 0; i < sons.length; i++ ) {
      if ( sons[i] != null && !sons[i].isUnexplored ) {
        text.append( aux );

        if ( i == 0 )
          text.append( "if ( " + nodeModel.leftSide( train ) + nodeModel.rightSide( i, train ) + " ) then\n" + aux + "{\n" );
        else
          text.append( "elseif ( " + nodeModel.leftSide( train ) + nodeModel.rightSide( i, train ) + " ) then\n" + aux + "{\n" );

        if ( sons[i].isLeaf ) {
          NumberOfLeafs++;
          text.append( aux + "\t" + train.getClassAttribute().name() + " = \"" + nodeModel.label( i, train ) + "\"\n" );
        }
        else {
          NumberOfNodes++;
          sons[i].printTree( depth + 1, text );
        }

        text.append( aux + "}\n" );
      }
    }
  }

  /** Returns the son with the given index.
   *
   * @param index The index of the son.
   * @return the subtree that hangs from that son.
   */
  private Tree son( int index ) {
    return (Tree)sons[index];
  }

  /** Function to create a new tree.
   *
   * @param data The dataset.
   *
   * @return The new tree.
   *
   * @throws Exception If the new tree cannot be created.
   */
  protected Tree getNewTree( MyDataset data ) throws Exception {

    Tree newNode = new Tree( model, prune, confidence );
    newNode.buildNode( (MyDataset)data );

    return newNode;
  }

  /** Function to compute the estimated errors.
   *
   * @return The estimated errors.
   */
  private double getEstimatedErrors() {

    double errors = 0;
    int i;

    if ( isLeaf )
      return getEstimatedErrorsForLeaf( nodeModel.classification() );
    else {
      for ( i = 0; i < sons.length; i++ )
        if ( !sons[i].isUnexplored )
          errors = errors + son( i ).getEstimatedErrors();

      return errors;
    }
  }

  /** Function to compute the estimated errors for one branch.
   *
   * @param data The dataset over which the errors have to be computed.
   *
   * @return The error computed.
   *
   * @throws Exception If the errors cannot be computed.
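   *
   * <p>
   * The dataset is redistributed through the cut model of this node (its
   * stored classification is saved and restored around the cut), and the
   * estimated errors of the explored sons over their subsets are summed.
   * </p>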
   */
  private double getEstimatedErrorsForBranch( MyDataset data ) throws Exception {

    MyDataset [] localItemsets;
    double errors = 0;
    int i;

    if ( isLeaf )
      return getEstimatedErrorsForLeaf( new Classification( data ) );
    else {
      Classification savedDist = nodeModel.classification;
      nodeModel.resetClassification( data );
      localItemsets = (MyDataset[])nodeModel.cutDataset( data );
      nodeModel.classification = savedDist;

      for ( i = 0; i < sons.length; i++ )
        if ( !sons[i].isUnexplored )
          errors += son( i ).getEstimatedErrorsForBranch( localItemsets[i] );

      return errors;
    }
  }

  /** Function to compute the estimated errors for a leaf.
   *
   * @param theClassification The classification of the classes.
   *
   * @return The estimated errors for the leaf.
   */
  private double getEstimatedErrorsForLeaf( Classification theClassification ) {

    if ( theClassification.getTotal() == 0 )
      return 0;
    else
      return theClassification.numIncorrect() +
          errors( theClassification.getTotal(), theClassification.numIncorrect(), confidence );
  }

  /** Function to compute the errors on the training data.
   *
   * @return The errors.
   */
  private double getErrors() {

    double errors = 0;
    int i;

    if ( isLeaf )
      return nodeModel.classification().numIncorrect();
    else {
      for ( i = 0; i < sons.length; i++ )
        if ( !sons[i].isUnexplored )
          errors += son( i ).getErrors();

      return errors;
    }
  }

  /** Function to create a new classification.
   *
   * @param data The dataset.
   *
   * @throws Exception If the classification cannot be built.
   */
  private void newClassification( MyDataset data ) throws Exception {

    MyDataset [] localItemsets;

    nodeModel.resetClassification( data );
    train = data;

    if ( !isLeaf ) {
      localItemsets = (MyDataset [])nodeModel.cutDataset( data );

      for ( int i = 0; i < sons.length; i++ )
        if ( !sons[i].isUnexplored )
          son( i ).newClassification( localItemsets[i] );
    }
  }

  /** Function to compute the estimated extra error for a given total number of
   * itemsets and errors, following the C4.5 upper confidence bound on the
   * binomial error rate.
   *
   * @param N The weight of all the itemsets.
   * @param e The weight of the itemsets incorrectly classified.
   * @param CF The confidence factor.
   *
   * @return The estimated extra errors.
   */
  private static double errors( double N, double e, float CF ) {

    // Some constants for the interpolation of the normal deviate.
    double Val[] = { 0, 0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001,
        0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00 };

    double Dev[] = { 100, 6.0, 5.61, 5.2, 4.75, 4.26, 3.89, 3.72, 3.29, 3.09,
        2.58, 2.33, 1.65, 1.28, 0.84, 0.25, 0.00 };

    double Val0, Pr, Coeff = 0;
    int i = 0;

    while ( CF > Val[i] )
      i++;

    Coeff = Dev[i - 1] + ( Dev[i] - Dev[i - 1] ) * ( CF - Val[i - 1] ) / ( Val[i] - Val[i - 1] );
    Coeff = Coeff * Coeff;

    if ( e == 0 )
      return N * ( 1 - Math.exp( Math.log( CF ) / N ) );
    else {
      if ( e < 0.9999 ) {
        Val0 = N * ( 1 - Math.exp( Math.log( CF ) / N ) );
        return Val0 + e * ( errors( N, 1.0, CF ) - Val0 );
      }
      else {
        if ( e + 0.5 >= N )
          return 0.67 * ( N - e );
        else {
          Pr = ( e + 0.5 + Coeff / 2 + Math.sqrt( Coeff * ( ( e + 0.5 ) * ( 1 - ( e + 0.5 ) / N ) + Coeff / 4 ) ) ) / ( N + Coeff );
          return ( N * Pr - e );
        }
      }
    }
  }

  /**
   * Returns the number of children of the root.
   * @return the number of children of the root.
   */
  public int getNChildren() {
    return sons.length;
  }

  /**
   * Returns whether the node is unexplored or not.
   * @return whether the node is unexplored or not.
   */
  public boolean isUnexplored() {
    return isUnexplored;
  }

  /** Returns the son with the given index.
   *
   * @param i The index of the son.
   * @return the subtree that hangs from that son.
   */
  public Tree getChild( int i ) {
    return sons[i];
  }
}
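
/* Usage sketch (a minimal, hypothetical driver; it assumes that a SelectCut
 * implementation and a loaded MyDataset are obtained from the rest of the
 * PART package, which this file does not show):
 *
 *   SelectCut cutModel = ... ;                      // cut-selection strategy (assumed)
 *   MyDataset trainData = ... ;                     // training data (assumed)
 *   Tree tree = new Tree( cutModel, true, 0.25f );  // prune with confidence factor 0.25
 *   tree.buildTree( trainData );                    // copies the data and drops itemsets with a missing class
 *   double [] dist = tree.classificationForItemset( itemset );
 *   System.out.println( tree );                     // prints the explored part as if/elseif rules
 */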