/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Decision_Trees.DT_GA; /** * <p>Title: Algorithm</p> * * <p>Description: It contains the implementation of the algorithm</p> * * * <p>Company: KEEL </p> * * @author Alberto Fern�ndez * @version 1.0 */ import java.io.IOException; import org.core.*; import java.util.Arrays; import keel.Algorithms.Decision_Trees.DT_GA.C45.C45; import java.util.StringTokenizer; import java.util.Vector; public class DT_GA { myDataset train, val, test; String outputTr, outputTst, ficheroBR, claseMayoritaria; int nClasses, nGenerations, popSize, instancesPerLeaf, type, S; double crossProb, mutProb; float confidence; boolean pruned; String fichTrain; BaseR baseReglasTree, baseReglasGA; Clasificador clasif; public static int GA_SMALL = 0; public static int GA_LARGE_SN = 1; private boolean somethingWrong = false; //to check if everything is correct. /** * Default constructor */ public DT_GA() { } /** * It reads the data from the input files (training, validation and test) and parse all the parameters * from the parameters array. * @param parameters parseParameters It contains the input files, output files and parameters */ public DT_GA(parseParameters parameters) { train = new myDataset(); val = new myDataset(); test = new myDataset(); fichTrain = parameters.getTrainingInputFile(); try { System.out.println("\nReading the training set: " + parameters.getTrainingInputFile()); train.readClassificationSet(parameters.getTrainingInputFile(), true); System.out.println("\nReading the validation set: " + parameters.getValidationInputFile()); val.readClassificationSet(parameters.getValidationInputFile(), false); System.out.println("\nReading the test set: " + parameters.getTestInputFile()); test.readClassificationSet(parameters.getTestInputFile(), false); } catch (IOException e) { System.err.println( "There was a problem while reading the input data-sets: " + e); somethingWrong = true; } //We may check if there are some numerical attributes, because our algorithm may not handle them: //somethingWrong = somethingWrong || train.hasRealAttributes(); somethingWrong = somethingWrong || train.hasMissingAttributes(); outputTr = parameters.getTrainingOutputFile(); outputTst = parameters.getTestOutputFile(); ficheroBR = parameters.getOutputFile(0); //Now we parse the parameters long semilla = Long.parseLong(parameters.getParameter(0)); String aux = parameters.getParameter(1); pruned = true; /*if (aux.compareToIgnoreCase("FALSE") == 0) { pruned = false; }*/ confidence = Float.parseFloat(parameters.getParameter(1)); instancesPerLeaf = Integer.parseInt(parameters.getParameter(2)); aux = parameters.getParameter(3); type = this.GA_SMALL; if (aux.compareToIgnoreCase("GA-LARGE-SN") == 0) { type = this.GA_LARGE_SN; } S = Integer.parseInt(parameters.getParameter(4)); nGenerations = Integer.parseInt(parameters.getParameter(5)); popSize = Integer.parseInt(parameters.getParameter(6)); while (popSize % 2 != 0) { popSize++; } crossProb = Double.parseDouble(parameters.getParameter(7)); mutProb = Double.parseDouble(parameters.getParameter(8)); Randomize.setSeed(semilla); } /** * It launches the algorithm */ public void execute() { if (somethingWrong) { //We do not execute the program System.err.println("An error was found, the data-set has missing values."); System.err.println("Aborting the program"); //We should not use the statement: System.exit(-1); } else { //We do here the algorithm's operations nClasses = train.getnClasses(); C45 arbol = new C45(fichTrain, pruned, confidence, instancesPerLeaf); try { arbol.generateTree(); } catch (Exception e) { System.err.println(e.getMessage()); System.exit( -1); } //System.err.println("Mira -> \n"+arbol.printString()); Fichero.escribeFichero("arbol.txt", arbol.printString()); String cadenaArbol = arbol.printString(); obtenerReglas(cadenaArbol); baseReglasTree.cubrirEjemplos(); System.out.println(baseReglasTree.printString()); baseReglasGA = baseReglasTree.genetico(type, S, nGenerations, popSize, crossProb, mutProb); claseMayoritaria = train.claseMasFrecuente(); clasif = new Clasificador(baseReglasTree,baseReglasGA,type,S,claseMayoritaria); //Finally we should fill the training and test output files double accTr = doOutput(this.val, this.outputTr); double accTst = doOutput(this.test, this.outputTst); escribeSalidas(accTr,accTst); } } /** * It generates the output file from a given dataset and stores it in a file * @param dataset myDataset input dataset * @param filename String the name of the file * @return the Accuracy of the classifier */ private double doOutput(myDataset dataset, String filename) { String output = new String(""); output = dataset.copyHeader(); //we insert the header in the output file int aciertos = 0; //We write the output for each example for (int i = 0; i < dataset.getnData(); i++) { //for classification: String claseReal = dataset.getOutputAsString(i); String prediccion = this.classificationOutput(dataset.getExample(i)); output += claseReal + " " + prediccion + "\n"; if (claseReal.equalsIgnoreCase(prediccion)) { aciertos++; } } Fichero.escribeFichero(filename, output); return (1.0 * aciertos / dataset.size()); } /** * It returns the algorithm classification output given an input example * @param example double[] The input example * @return String the output generated by the algorithm */ private String classificationOutput(double[] example) { /*StringBuffer clase = new StringBuffer(""); boolean smallDisjunct = baseReglasTree.clasifica(true, example, clase); //en clase guardo la clase y devuelvo si es SD if (smallDisjunct) { StringBuffer claseGA = new StringBuffer(""); baseReglasGA.clasifica(false, example, claseGA); if (!(claseGA.toString().equalsIgnoreCase("<unclassified>"))) { clase = claseGA; } //System.err.println("Mira -> "+claseGA); } return clase.toString();*/ return clasif.clasifica(example); } private void obtenerReglas(String cadenaArbol) { String reglas = new String(""); StringTokenizer lineas = new StringTokenizer(cadenaArbol, "\n"); //este lee lineas String linea = lineas.nextToken(); //Primera linea @TotalNumberOfNodes X linea = lineas.nextToken(); //Segunda linea @NumberOfLeafs Y //Empieza el arbol Vector variables = new Vector(); Vector valores = new Vector(); Vector operadores = new Vector(); int contador = 0; while (lineas.hasMoreTokens()) { linea = lineas.nextToken(); StringTokenizer campo = new StringTokenizer(linea, " \t"); String cosa = campo.nextToken(); //Posibilidades: "if", "elseif", "class" if (cosa.compareToIgnoreCase("if") == 0) { campo.nextToken(); //( variables.add(campo.nextToken()); //nombre de la variable (AttX, X == posicion) operadores.add(campo.nextToken()); //Una de tres: "=", "<=", ">" valores.add(campo.nextToken()); //Valor } else if (cosa.compareToIgnoreCase("elseif") == 0) { int dejar = Integer.parseInt(campo.nextToken()); for (int i = variables.size() - 1; i >= dejar; i--) { variables.remove(variables.size() - 1); operadores.remove(operadores.size() - 1); valores.remove(valores.size() - 1); } campo.nextToken(); //( variables.add(campo.nextToken()); //nombre de la variable (AttX, X == posicion) operadores.add(campo.nextToken()); //Una de tres: "=", "<=", ">" valores.add(campo.nextToken()); //Valor } else { //Clase --> genero la regla campo.nextToken(); // = contador++; //tengo una nueva regla reglas += "\nRULE-" + contador + ": IF "; int i; for (i = 0; i < variables.size() - 1; i++) { reglas += (String) variables.get(i) + " " + (String) operadores.get(i) + " " + (String) valores.get(i) + " AND "; } reglas += (String) variables.get(i) + " " + (String) operadores.get(i) + " " + (String) valores.get(i); reglas += " THEN class = " + campo.nextToken(); variables.remove(variables.size() - 1); operadores.remove(operadores.size() - 1); valores.remove(valores.size() - 1); } //System.err.println(reglas); } //Fichero.escribeFichero("reglas.txt", reglas); baseReglasTree = new BaseR(train, reglas); } public void escribeSalidas(double accTr, double accTst){ System.out.println("Number of Rules (Tree): " + baseReglasTree.size()); System.out.println(""+baseReglasTree.printString()); System.out.println("Number of Rules (GA): " + baseReglasGA.size()); System.out.println(""+baseReglasGA.printString()); System.out.println("Accuracy in training: " + accTr); System.out.println("Accuracy in test: " + accTst); System.out.println("Algorithm Finished"); Fichero.escribeFichero(ficheroBR,baseReglasTree.printString()+ "\n"); Fichero.AnadirtoFichero(ficheroBR,baseReglasGA.printString()+ "\n"); Fichero.AnadirtoFichero(ficheroBR,"Accuracy in training: " + accTr+ "\n"); Fichero.AnadirtoFichero(ficheroBR,"Accuracy in test: " + accTst+ "\n"); } }