/***********************************************************************

    This file is part of KEEL-software, the Data Mining tool for regression,
    classification, clustering, pattern mining and so on.

    Copyright (C) 2004-2010

    F. Herrera (herrera@decsai.ugr.es)
    L. Sánchez (luciano@uniovi.es)
    J. Alcalá-Fdez (jalcala@decsai.ugr.es)
    S. García (sglopez@ujaen.es)
    A. Fernández (alberto.fernandez@ujaen.es)
    J. Luengo (julianlm@decsai.ugr.es)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Decision_Trees.FunctionalTrees;

import java.util.StringTokenizer;
import java.util.ArrayList;
import java.io.*;

import org.core.Fichero;

/**
 * The Functional Trees algorithm builds a decision tree model that integrates
 * a decision tree and another classifier into a single model. The current
 * version is called FT-Leaves because the classifier is only present at the
 * leaf nodes.
 *
 * The parameters that can be used with this algorithm are:
 * - minNumInstancesToSplit, the minimum number of instances that a node must
 *   have to be considered for splitting
 * - splitCriteria, the criterion used to decide which is the best split for a
 *   node. The supported criteria are entropy, information gain, Gini index
 *   and gain ratio
 * - pruneCriteria, the criterion used to prune the tree once it has been
 *   built: either an error-based prune or a prune of all the leaves
 * - classifierOnLeaves, the classifier placed at the leaves of the tree.
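 *
 * A minimal usage sketch (the script file name is hypothetical; its contents
 * must follow the configuration format parsed by readConfiguration and
 * readParameters below):
 * <pre>
 *   FunctionalTrees ft = new FunctionalTrees("config.txt"); // reads data, builds and prunes the tree
 *   ft.execute();  // classifies the train and test sets and writes the result files
 * </pre>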
 *   The current version only supports Naive Bayes, KNN, Nearest Means, KSNN
 *   and KNN Adaptive
 * - K, the parameter for some of the classifiers on the leaves
 *
 * @author Written by Victoria Lopez Morales (University of Granada) 24/05/2009
 * @version 0.1
 * @since JDK1.5
 */
public class FunctionalTrees {
    /**
     * Root node of the tree built with the Functional Trees algorithm
     */
    private TreeNode root;

    // Files

    /**
     * Array with the names of the output files for train, test and additional results
     */
    private String outFile[];

    /**
     * Name of the file that contains the test instances
     */
    private String testFile;

    /**
     * Name of the file that contains the original train instances
     */
    private String trainFile;

    /**
     * Name of the file that contains the reference instances (current train instances)
     */
    private String referenceFile;

    // Datasets

    /**
     * Dataset containing all the test instances
     */
    private myDataset testDataset;

    /**
     * Dataset containing all the original train instances
     */
    private myDataset trainDataset;

    /**
     * Dataset containing all the reference instances (current train instances)
     */
    private myDataset referenceDataset;

    // Timing

    /**
     * Time at the beginning of the algorithm, used to compute the total elapsed time
     */
    private long initialTime;

    /**
     * Seconds used to classify all the training instances
     */
    private double classificationTrainTime;

    /**
     * Seconds used to classify all the test instances
     */
    private double classificationTestTime;

    /**
     * Seconds used to build the tree
     */
    private double buildingTime;

    // Classified

    /**
     * Number of correctly classified train instances
     */
    private int correctTrain;

    /**
     * Number of incorrectly classified train instances
     */
    private int failTrain;

    /**
     * Number of correctly classified test instances
     */
    private int correctTest;

    /**
     * Number of incorrectly classified test instances
     */
    private int failTest;

    // Other parameters

    /**
     * User parameter: the minimum number of instances that a node must have to be
     * considered for splitting
     */
    int minNumInstances;

    /**
     * User parameter: the criterion used to decide which is the best split for a node.
     * The supported criteria are entropy, information gain, Gini index and gain ratio
     */
    int splitCriteria;

    /**
     * User parameter: the criterion used to prune the tree once it has been built, based
     * either on the general error of the tree or on a prune of all the leaves
     */
    int pruneCriteria;

    /**
     * User parameter: the classifier placed at the leaves of the tree.
     * The current version only supports Naive Bayes, KNN, Nearest Means, KSNN
     * and KNN Adaptive
     */
    int leavesClassifier;

    /**
     * User parameter: the parameter for some of the classifiers on the leaves
     */
    int K;
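    /*
     * Integer encodings used for the parameters above, as assigned in
     * readParameters below:
     *   splitCriteria:    0 = Entropy, 1 = InformationGain, 2 = GiniIndex, 3 = GainRatio
     *   pruneCriteria:    0 = pruneAllLeaves, 1 = pruneErrorLeaves
     *   leavesClassifier: 0 = NaiveBayes, 1 = KNN, 2 = NM, 3 = KSNN, 4 = KNNAdaptive
     */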
    /**
     * Number of nodes of the tree during the building stage
     */
    int numnodes;

    /**
     * Creates a FunctionalTrees instance by reading the script file that contains all the
     * information needed for running the algorithm
     *
     * @param script The configuration script which contains the parameters of the algorithm
     */
    public FunctionalTrees (String script) {
        // We start time-counting
        initialTime = System.currentTimeMillis();

        // Read the script file
        readConfiguration(script);  // Names of the input and output files
        readParameters(script);     // Parameters for the Functional Trees algorithm

        // Reading datasets
        try {
            trainDataset = new myDataset(trainFile, 1);
            testDataset = new myDataset(testFile, 3);
            referenceDataset = new myDataset(referenceFile, 2);
        } catch (Exception e) {
            System.err.println(e);
            System.exit(1);
        }

        // Start building the tree
        buildTree();
    } // end-method

    /**
     * Builds the tree with all the data stored in the class, in a process that first grows
     * the tree and then prunes it, finally obtaining a tree with classifiers at the leaves
     * that can be used to classify different instances
     */
    private void buildTree () {
        long buildTime;

        // Initialize root node using training data set
        System.out.println("\nInitializing root node");
        buildTime = System.currentTimeMillis();
        root = new TreeNode (0, null, null, false, -1, null, trainDataset, leavesClassifier, K);
        System.out.println("Root node initialized");
        numnodes = 1;

        // Build the tree
        growTree(root);

        // After the tree is built, we prune the tree
        System.out.println("\nBeginning prune...");
        pruneTree();
        System.out.println("Prune finished!");

        // Check the time spent during the tree building
        buildingTime = (double)(System.currentTimeMillis()-buildTime)/1000.0;
        System.out.println("\nBuilding of the tree finished!!");
        System.out.println(numnodes + " nodes generated");
    }

    /**
     * Builds the tree from a tree node that functions as a root node, with all the data
     * stored in the class
     *
     * @param node Tree node that is considered as a root node from which we are generating
     * descendant nodes
     */
    public void growTree (TreeNode node) {
        Split best_split;
        ArrayList <TreeNode> nodes;

        System.out.println("\nBeginning node processing...");

        // Check if the node is partitionable
        if (node.isPartitionable(minNumInstances)) {
            // For each attribute A, evaluate the splits on attribute A and keep the best one
            best_split = node.evaluateAllSplits (splitCriteria);

            // Use the best split to split node N into N1 and N2
            nodes = node.split(best_split, numnodes);

            if (nodes == null) {
                // The split cannot be done, so this node will be a leaf
                node.setAsLeaf();
            } else {
                // The split is done, and two new nodes are created
                numnodes += 2;

                // We grow the tree from those new nodes
                growTree (nodes.get(0));
                growTree (nodes.get(1));
            }
        } else {
            // The node is not partitionable, set it as a leaf node
            node.setAsLeaf();
        }
    }
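    /**
     * Illustrative sketch only, not used by the algorithm: the entropy criterion that
     * evaluateAllSplits can be configured with scores a class distribution as
     * H = - sum_i p_i * log2(p_i). The actual split evaluation is implemented in the
     * TreeNode and Split classes, which may differ in details.
     *
     * @param classCounts Number of instances of each class in a node
     * @return the entropy of the class distribution, in bits
     */
    private static double entropySketch (double [] classCounts) {
        double total = 0.0;
        for (int i=0; i<classCounts.length; i++) {
            total += classCounts[i];
        }
        if (total == 0) {
            // Empty node: no uncertainty to measure
            return 0.0;
        }
        double h = 0.0;
        for (int i=0; i<classCounts.length; i++) {
            if (classCounts[i] > 0) {
                double p = classCounts[i] / total;
                h -= p * (Math.log(p) / Math.log(2));
            }
        }
        return h;
    }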
    /**
     * Prunes the tree according to the prune criteria, that is, turns some of the non-leaf
     * nodes into leaves and deletes their descendants
     */
    public void pruneTree() {
        if (pruneCriteria == 0) {
            // The prune we have to do is a prune of all leaves
            root.pruneAllLeaves();
        } else if (pruneCriteria == 1) {
            // The prune we have to do is an error-based prune
            root.pruneWithError();
        }
    }

    /**
     * This method performs the classification for all the instances: the train and the test sets
     */
    public void execute () {
        System.out.println();
        System.out.println("Beginning classification...");
        System.out.println();

        // Classify the train set
        print(referenceDataset, outFile[0], 0);
        // Classify the test set
        print(testDataset, outFile[1], 1);
        // Print other results like the performance of the algorithm and the tree
        printResults(trainDataset, outFile[2]);

        System.out.println("Classification FINISHED!!");
        System.out.println();
        System.out.println(getStatistical());
    } // end-method

    /**
     * Reads the configuration script and extracts its contents.
     *
     * @param script Name of the configuration script
     */
    protected void readConfiguration (String script) {
        String fichero, linea, token;
        StringTokenizer lineasFichero, tokens;
        byte line[];
        int i, j;

        outFile = new String[3];

        fichero = Fichero.leeFichero (script);
        lineasFichero = new StringTokenizer (fichero,"\n\r");

        lineasFichero.nextToken();
        linea = lineasFichero.nextToken();

        tokens = new StringTokenizer (linea, "=");
        tokens.nextToken();
        token = tokens.nextToken();

        // Getting the names of training and test files
        // The reference file will be used as comparison
        line = token.getBytes();
        for (i=0; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        trainFile = new String (line,i,j-i);
        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        referenceFile = new String (line,i,j-i);
        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        testFile = new String (line,i,j-i);

        // Getting the path and base name of the results files
        linea = lineasFichero.nextToken();

        tokens = new StringTokenizer (linea, "=");
        tokens.nextToken();
        token = tokens.nextToken();

        // Getting the names of output files
        line = token.getBytes();
        for (i=0; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        outFile[0] = new String (line,i,j-i);
        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        outFile[1] = new String (line,i,j-i);
        for (i=j+1; line[i]!='\"'; i++);
        i++;
        for (j=i; line[j]!='\"'; j++);
        outFile[2] = new String (line,i,j-i);
    } //end-method
    /**
     * Reads the configuration script to extract the parameters' values.
     *
     * @param script Name of the configuration script
     */
    protected void readParameters (String script) {
        String file;
        String line;
        String str_splitCriteria, str_pruneCriteria, str_leavesClassifier;
        StringTokenizer fileLines, tokens;

        file = Fichero.leeFichero (script);
        fileLines = new StringTokenizer (file,"\n\r");

        // Discard the in/out files definition
        fileLines.nextToken();
        fileLines.nextToken();
        fileLines.nextToken();

        // Getting the minimum number of instances needed to perform a split
        line = fileLines.nextToken();
        tokens = new StringTokenizer (line, "=");
        tokens.nextToken();
        minNumInstances = Integer.parseInt(tokens.nextToken().substring(1));
        System.out.println ("The minimum number of instances to let split a node is " + minNumInstances);
        if (minNumInstances < 1) {
            System.err.println("Error: The minimum number of instances in a node to be partitioned is at least 1");
            System.exit(-1);
        }

        // Getting the split criteria used in the algorithm
        line = fileLines.nextToken();
        tokens = new StringTokenizer (line, "=");
        tokens.nextToken();
        str_splitCriteria = tokens.nextToken().trim();
        if (str_splitCriteria.equals("Entropy")) {
            splitCriteria = 0;
        } else if (str_splitCriteria.equals("InformationGain")) {
            splitCriteria = 1;
        } else if (str_splitCriteria.equals("GiniIndex")) {
            splitCriteria = 2;
        } else if (str_splitCriteria.equals("GainRatio")) {
            splitCriteria = 3;
        } else {
            System.err.println("Error: The different ways to calculate a split in Functional Trees are \"Entropy\", \"InformationGain\", \"GiniIndex\" or \"GainRatio\"");
            System.exit(-1);
        }
        System.out.println("The split criteria is " + str_splitCriteria);

        // Getting the prune criteria used in the algorithm
        line = fileLines.nextToken();
        tokens = new StringTokenizer (line, "=");
        tokens.nextToken();
        str_pruneCriteria = tokens.nextToken().trim();
        if (str_pruneCriteria.equals("pruneAllLeaves")) {
            pruneCriteria = 0;
        } else if (str_pruneCriteria.equals("pruneErrorLeaves")) {
            pruneCriteria = 1;
        } else {
            System.err.println("Error: The different ways to prune in Functional Trees are \"pruneAllLeaves\" or \"pruneErrorLeaves\"");
            System.exit(-1);
        }
        System.out.println("The prune criteria is " + str_pruneCriteria);

        // Getting the classifier used in the leaves of the tree
        line = fileLines.nextToken();
        tokens = new StringTokenizer (line, "=");
        tokens.nextToken();
        str_leavesClassifier = tokens.nextToken().trim();
        if (str_leavesClassifier.equals("NaiveBayes")) {
            leavesClassifier = 0;
        } else if (str_leavesClassifier.equals("KNN")) {
            leavesClassifier = 1;
        } else if (str_leavesClassifier.equals("NM")) {
            leavesClassifier = 2;
        } else if (str_leavesClassifier.equals("KSNN")) {
            leavesClassifier = 3;
        } else if (str_leavesClassifier.equals("KNNAdaptive")) {
            leavesClassifier = 4;
        } else {
            System.err.println("Error: The different classifiers that can be used in Functional Trees are \"NaiveBayes\", \"KNN\", \"NM\", \"KSNN\" or \"KNNAdaptive\"");
            System.exit(-1);
        }
        System.out.println("The classifier used at leaves is " + str_leavesClassifier);

        // Getting the parameters for the selected classifier, if necessary
        if ((leavesClassifier == 1) || (leavesClassifier == 3) || (leavesClassifier == 4)) {
            line = fileLines.nextToken();
            tokens = new StringTokenizer (line, "=");
            tokens.nextToken();
            K = Integer.parseInt(tokens.nextToken().substring(1));
            System.out.println ("The number of neighbors in the selected classifier is " + K);
            if (K < 1) {
                System.err.println("Error: The minimum number of neighbors in the selected classifier is at least 1");
                System.exit(-1);
            }
        }

        System.out.println();
    } //end-method
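    /*
     * Example of the configuration script parsed by readConfiguration and
     * readParameters above (file names and values are illustrative; the key
     * names follow the class documentation, but the parser keys on line
     * position, not on the names, so the lines must appear in this order):
     *
     *   algorithm = Functional Trees
     *   inputData = "train.dat" "reference.dat" "test.dat"
     *   outputData = "result.tra" "result.tst" "result.txt"
     *
     *   minNumInstancesToSplit = 2
     *   splitCriteria = GainRatio
     *   pruneCriteria = pruneAllLeaves
     *   classifierOnLeaves = KNN
     *   K = 3
     */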
    /**
     * Gets the general information about the dataset in a string form
     *
     * @param dat Dataset from which we are obtaining the general information
     * @return a string with the general information about the dataset
     */
    private String getHeader (myDataset dat) {
        String header;
        ArrayList <myAttribute> attributes;
        myAttribute output;

        attributes = dat.getAttributes();
        output = dat.getOutputAttribute();

        // Get information about the dataset and the attributes
        header = "@relation " + dat.getName() + "\n";
        for (int i=0; i<attributes.size(); i++) {
            switch (attributes.get(i).getAttributeType()) {
                case 1:
                    header += "@attribute " + attributes.get(i).getName() + " integer[" + (int)attributes.get(i).getMin() + "," + (int)attributes.get(i).getMax() + "]\n";
                    break;
                case 2:
                    header += "@attribute " + attributes.get(i).getName() + " real[" + attributes.get(i).getMin() + "," + attributes.get(i).getMax() + "]\n";
                    break;
                case 3:
                    header += "@attribute " + attributes.get(i).getName() + " {";
                    for (int j=0; j<attributes.get(i).getValues().size()-1; j++) {
                        header += attributes.get(i).getValue(j) + ",";
                    }
                    header += attributes.get(i).getValue(attributes.get(i).getValues().size()-1) + "}\n";
                    break;
            }
        }

        // Get information about the output attribute
        switch (output.getAttributeType()) {
            case 1:
                header += "@attribute " + output.getName() + " integer[" + (int)output.getMin() + "," + (int)output.getMax() + "]\n";
                break;
            case 2:
                header += "@attribute " + output.getName() + " real[" + output.getMin() + "," + output.getMax() + "]\n";
                break;
            case 3:
                header += "@attribute " + output.getName() + " {";
                for (int j=0; j<output.getValues().size()-1; j++) {
                    header += output.getValue(j) + ",";
                }
                header += output.getValue(output.getValues().size()-1) + "}\n";
                break;
        }

        return header;
    }

    /**
     * Classifies a given item with the information stored in the tree
     *
     * @param item Data attribute values for the item we are classifying
     * @return the class assigned to the given item
     */
    public int evaluateItem (double [] item) {
        return root.evaluate(item);
    }
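    /*
     * For example (the variable name and attribute values are hypothetical;
     * the array must match the number and order of the input attributes of
     * the training data):
     *
     *   double [] item = {5.1, 3.5, 1.4, 0.2};
     *   int predictedClass = tree.evaluateItem(item);
     */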
    /**
     * Prints to a file the result of the classification made with the tree generated by the
     * Functional Trees algorithm. This can be done over the train set or the test set.
     *
     * @param data Dataset that we are classifying
     * @param filename Name of the file that is going to store the results
     * @param type 0 if we are working with a train set, 1 if we are working with a test set
     */
    public void print (myDataset data, String filename, int type) {
        String text = getHeader(data);
        double item[];
        int correct, fail;
        long time;

        text += "@data\n";
        item = new double[data.getNumAtr()];
        correct = 0;
        fail = 0;

        // Check the time spent
        time = System.currentTimeMillis();

        // Evaluate all the instances
        for (int i = 0; i < data.getNumIns(); i++) {
            try {
                item = data.getDataItem (i);
                int cl = (int) evaluateItem(item);

                if (cl == (int) data.getOutputI(i)) {
                    correct++;
                } else {
                    fail++;
                }

                text += data.getOutputAttribute().getValue((int)data.getOutputI(i)) + " " + data.getOutputAttribute().getValue(cl) + "\n";
            } catch (Exception e) {
                System.err.println(e.getMessage());
            }
        }

        // Store the corresponding results
        if (type == 0) {
            classificationTrainTime = (double)(System.currentTimeMillis()-time)/1000.0;
            correctTrain = correct;
            failTrain = fail;
        } else if (type == 1) {
            classificationTestTime = (double)(System.currentTimeMillis()-time)/1000.0;
            correctTest = correct;
            failTest = fail;
        } else {
            System.err.println("Wrong dataset for printing results");
            System.exit(-1);
        }

        try {
            PrintWriter print = new PrintWriter(new FileWriter(filename));
            print.print(text);
            print.close();
        } catch (IOException e) {
            System.err.println("Cannot open the output file " + filename + ": " + e.getMessage());
        }
    }
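    /*
     * Note on the file written by print above: after the dataset header and
     * the "@data" tag, each classified instance produces one line of the
     * form "realClass predictedClass", the results format used across KEEL
     * methods.
     */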
    /**
     * Gets the general information about the performance of the algorithm. This information
     * includes the number of nodes and leaves of the tree, the performance in training and
     * test, and the time spent in the operations.
     *
     * @return a string with all the important information about the performance of the algorithm
     */
    private String getStatistical () {
        String text = "";

        text = text + "@TotalNumberOfNodes " + root.getNumNodes() + "\n";
        text = text + "@NumberOfLeafs " + root.getLeafs() + "\n\n";

        text = text + "@NumberOfItemsetsTraining " + referenceDataset.getNumIns() + "\n";
        text = text + "@NumberOfCorrectlyClassifiedTraining " + correctTrain + "\n";
        text = text + "@PercentageOfCorrectlyClassifiedTraining " + ((double)correctTrain*100.0/(double)referenceDataset.getNumIns()) + "%\n";
        text = text + "@NumberOfIncorrectlyClassifiedTraining " + failTrain + "\n";
        text = text + "@PercentageOfIncorrectlyClassifiedTraining " + ((double)failTrain*100.0/(double)referenceDataset.getNumIns()) + "%\n\n";

        text = text + "@NumberOfItemsetsTest " + testDataset.getNumIns() + "\n";
        text = text + "@NumberOfCorrectlyClassifiedTest " + correctTest + "\n";
        text = text + "@PercentageOfCorrectlyClassifiedTest " + ((double)correctTest*100.0/(double)testDataset.getNumIns()) + "%\n";
        text = text + "@NumberOfIncorrectlyClassifiedTest " + failTest + "\n";
        text = text + "@PercentageOfIncorrectlyClassifiedTest " + ((double)failTest*100.0/(double)testDataset.getNumIns()) + "%\n\n";

        text = text + "@TotalElapsedTime " + (double)(System.currentTimeMillis()-initialTime)/1000.0 + "s\n";
        text = text + "@BuildingElapsedTime " + buildingTime + "s\n";
        text = text + "@ClassificationTrainElapsedTime " + classificationTrainTime + "s\n";
        text = text + "@ClassificationTestElapsedTime " + classificationTestTime + "s\n";

        return text;
    }

    /**
     * Prints to a file the result of the classification made with the tree generated by the
     * Functional Trees algorithm, that is, the tree itself and the general information about it
     *
     * @param data Dataset that we are working with
     * @param filename Name of the file that is going to store the results
     */
    public void printResults (myDataset data, String filename) {
        String text = getHeader(data);

        text += "@inputs\n";
        for (int i=0; i<data.getAttributes().size(); i++) {
            text = text + data.getAttributes().get(i).getName() + " ";
        }
        text = text + "\n@outputs " + data.getOutputAttribute().getName() + "\n@data\n\n@decisiontree\n\n" + root.printTree() + "\n";
        text += getStatistical ();

        try {
            PrintWriter print = new PrintWriter(new FileWriter(filename));
            print.print(text);
            print.close();
        } catch (IOException e) {
            System.err.println("Cannot open the output file " + filename + ": " + e.getMessage());
        }
    }
}