/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Decision_Trees.C45_Binarization;

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.Vector;

import org.core.*;
import keel.Algorithms.Decision_Trees.C45.*;

/**
 * <p>Title: Multiclassifier</p>
 * <p>Description: Main execution class for the binarization methodology (OVO and OVA)</p>
 * <p>Company: KEEL</p>
 * @author Mikel Galar (University of Navarra) 21/10/2010
 * @author Alberto Fernandez (University of Jaen) 15/05/2014
 * @version 1.2
 * @since JDK1.6
 */
public class Multiclassifier {

  myDataset train, val, test;
  String outputTr, outputTst, ficheroBR, fichTrain, claseMayoritaria[],
      cabecera, sOvo, binarization, method[];
  int nClasses, n_classifiers, neighbours, preprocessing, distance;
  boolean pruned, valid[];
  C45 classifiers[];
  myDataset[] train_sets;
  OVO ovo;
  float threshold, confidence;
  int instancesPerLeaf;
  boolean nested, dynamic;
  int[] empates;
  parseParameters parameters;
  String input_validation_name, input_test_name;
  double[] aprioriClassDistribution;
  RuleBase[] treeRuleSet;

  private boolean somethingWrong = false; // to check if everything is correct

  /**
   * Default constructor
   */
  public Multiclassifier() {
  }
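  /* Expected layout of the KEEL configuration parameters, as read by the
   * constructor below. The indices are inferred from its getParameter
   * calls, so treat this summary as an assumption about the associated
   * configuration file rather than a specification:
   *   parameter 1 -> pruning confidence for C4.5 (float, e.g. 0.25)
   *   parameter 2 -> minimum number of instances per leaf (int, e.g. 2)
   *   parameter 3 -> binarization scheme: "OVO" or "OVA"
   *   parameter 4 -> OVO aggregation, e.g. "WEIGHTED", "BTS" or "DynOVO"
   *   parameter 5 -> BTS threshold (float), read only when parameter 4 is "BTS"
   */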
  /**
   * It reads the data from the input files (training, validation and test)
   * and parses all the parameters from the parameters array.
   * @param parameters parseParameters It contains the input files, output files and parameters
   */
  public Multiclassifier(parseParameters parameters) {
    this.parameters = parameters;
    train = new myDataset();
    val = new myDataset();
    test = new myDataset();
    fichTrain = parameters.getTrainingInputFile();
    try {
      System.out.println("\nReading the training set: " + parameters.getTrainingInputFile());
      train.readClassificationSet(parameters.getTrainingInputFile(), true);
      System.out.println("\nReading the validation set: " + parameters.getValidationInputFile());
      val.readClassificationSet(parameters.getValidationInputFile(), false);
      System.out.println("\nReading the test set: " + parameters.getTestInputFile());
      test.readClassificationSet(parameters.getTestInputFile(), false);
    }
    catch (IOException e) {
      System.err.println("There was a problem while reading the input data-sets: " + e);
      somethingWrong = true;
    }

    // We may check if there are some numerical attributes, because our algorithm may not handle them:
    // somethingWrong = somethingWrong || train.hasRealAttributes();
    // somethingWrong = somethingWrong || train.hasMissingAttributes();

    outputTr = parameters.getTrainingOutputFile();
    outputTst = parameters.getTestOutputFile();
    ficheroBR = parameters.getOutputFile(0);

    // Now we parse the parameters
    pruned = true;
    confidence = Float.parseFloat(parameters.getParameter(1));
    instancesPerLeaf = Integer.parseInt(parameters.getParameter(2));
    nClasses = train.getnClasses();
    aprioriClassDistribution = new double[nClasses];
    for (int i = 0; i < nClasses; i++) {
      aprioriClassDistribution[i] = 1.0 * train.numberInstances(i) / train.size();
    }
    binarization = parameters.getParameter(3);
    sOvo = parameters.getParameter(4);
    if (sOvo.equals("BTS")) {
      threshold = Float.parseFloat(parameters.getParameter(5));
    }
    else if (sOvo.equals("DynOVO")) {
      sOvo = "WEIGHTED"; // DynOVO: weighted voting with the dynamic behaviour enabled
      dynamic = true;
    }
    nested = false;
    String prep = "NONE"; // parameters.getParameter(6);

    // Derive the header name from the test file name, stripping path and extension
    cabecera = parameters.getTestInputFile();
    String[] aux = null;
    aux = cabecera.split("\\.");
    cabecera = aux[aux.length - 2]; // aux[aux.length - 1] is the file extension
    aux = cabecera.split("/");
    cabecera = aux[aux.length - 1];
  }

  /**
   * It constructs a new set of OVO classifiers for the NESTING aggregation
   * @param nested if the classifier is nested
   * @param padre the reference to the parent OVO classifier
   */
  public Multiclassifier(boolean nested, Multiclassifier padre) {
    train = padre.train;
    val = padre.val;
    test = padre.test;
    this.parameters = padre.parameters;
    fichTrain = padre.fichTrain;
    outputTr = parameters.getTrainingOutputFile();
    outputTst = parameters.getTestOutputFile();
    distance = padre.distance;
    sOvo = parameters.getParameter(4); // aggregation strategy, same parameter as in the main constructor
    if (sOvo.equals("BTS")) {
      threshold = Float.parseFloat(parameters.getParameter(5));
    }
    this.nested = nested;
  }

  /**
   * It launches the algorithm
   */
  public void execute() {
    if (somethingWrong) { // We do not execute the program
      System.err.println("An error was found, the data-set has missing values.");
      System.err.println("Aborting the program");
      // We should not use the statement: System.exit(-1);
    }
    else {
      // We do here the algorithm's operations
      nClasses = train.getnClasses();
      input_validation_name = parameters.getValidationInputFile();
      input_test_name = parameters.getTestInputFile();
      ovo = new OVO(this, sOvo, dynamic);
      n_classifiers = nClasses * (nClasses - 1) / 2;
      if (binarization.equals("OVA")) {
        n_classifiers = nClasses;
      }
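      /* Number of binary sub-problems (illustrative): with nClasses = 4,
       * OVO builds 4 * 3 / 2 = 6 pairwise classifiers, enumerated in the
       * order (0,1), (0,2), (0,3), (1,2), (1,3), (2,3), whereas OVA builds
       * 4 one-vs-rest classifiers, one per class. */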
      valid = new boolean[n_classifiers];
      claseMayoritaria = new String[n_classifiers];
      classifiers = new C45[n_classifiers];
      train_sets = new myDataset[n_classifiers];
      treeRuleSet = new RuleBase[n_classifiers];

      /* Compute the a priori class distributions */
      aprioriClassDistribution = new double[nClasses];
      for (int i = 0; i < nClasses; i++) {
        aprioriClassDistribution[i] = 1.0 * train.numberInstances(i) / train.size();
      }

      /* Build the binary training set for each classifier */
      if (binarization.equals("OVO")) {
        for (int i = 0, x = 0; i < nClasses - 1; i++) {
          for (int j = i + 1; j < nClasses; j++) {
            if (i != j) {
              train_sets[x] = new myDataset(train, i, j); // examples of classes i and j only
              x++;
            }
          }
        }
      }
      else {
        for (int i = 0; i < nClasses; i++) {
          train_sets[i] = new myDataset(train, i); // class i vs. the rest
        }
      }

      /* Train a C4.5 tree on each binary problem; (x, y) tracks the class
         pair associated with classifier i in the OVO enumeration */
      int x, y;
      x = 0;
      y = 1;
      boolean is_ovo = binarization.equals("OVO");
      for (int i = 0; i < n_classifiers; i++) {
        String text;
        if (is_ovo) {
          text = train.className(x) + " vs. " + train.className(y);
        }
        else {
          text = train.className(i) + " vs. REST";
        }
        System.out.println("Classifier -> " + i + "; " + text);
        if (!train_sets[i].empty()) {
          Files.writeFile(cabecera + ".tra", train_sets[i].printDataSet(!is_ovo));
          valid[i] = true;
          C45 tree = new C45(cabecera + ".tra", pruned, confidence, instancesPerLeaf, !is_ovo);
          try {
            tree.generateTree();
          }
          catch (Exception e) {
            System.err.println("Error!!");
            System.err.println(e.getMessage());
            System.exit(-1);
          }
          String treeString = tree.printStringOVO();
          obtainRules(treeString, i);
          treeRuleSet[i].coverExamples();
        }
        else {
          valid[i] = false;
        }
        y++;
        if (y % nClasses == 0) {
          x++;
          y = x + 1;
        }
      }
      if (binarization.equals("OVO")) {
        ovo.classifierTrainFinished();
      }

      // Finally we fill the training and test output files
      ovo.clearTables(false);
      double accTr = doOutput(this.val, this.outputTr);
      ovo.clearTables(true);
      double accTst = doOutput(this.test, this.outputTst);

      System.out.println("Accuracy in training: " + accTr);
      System.out.println("Accuracy in test: " + accTst);
    }
  }
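  /* NESTING aggregation sketch (hypothetical driver code; the actual tie
   * detection is handled by the OVO class): a first ensemble is trained
   * normally, the training instances whose OVO vote ends in a tie are
   * flagged, and a second, nested ensemble is then trained on those
   * instances only via execute_nesting:
   *
   *   Multiclassifier nestedModel = new Multiclassifier(true, parentModel);
   *   nestedModel.execute_nesting(tieFlags); // tieFlags marks the tied instances
   *                                          // (assumption about the encoding)
   */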
  /**
   * It executes the algorithm, but only for those instances which were ties
   * in the previous OVO
   * @param empate An array containing whether each instance was a tie or not
   */
  public void execute_nesting(int[] empate) {
    if (somethingWrong) { // We do not execute the program
      System.err.println("An error was found, the data-set has missing values.");
      System.err.println("Aborting the program");
      // We should not use the statement: System.exit(-1);
    }
    else {
      // We do here the algorithm's operations
      nClasses = train.getnClasses();

      // Construct the OVO (it manages the aggregation)
      ovo = new OVO(this, sOvo, false);
      n_classifiers = nClasses * (nClasses - 1) / 2;
      valid = new boolean[n_classifiers];
      treeRuleSet = new RuleBase[n_classifiers];
      claseMayoritaria = new String[n_classifiers];
      train_sets = new myDataset[n_classifiers];

      /* Compute the a priori class distributions */
      aprioriClassDistribution = new double[nClasses];
      for (int i = 0; i < nClasses; i++) {
        aprioriClassDistribution[i] = 1.0 * train.numberInstances(i) / train.size();
      }

      /* Construct the data-set for each classifier considering only the ties */
      for (int i = 0, x = 0; i < nClasses - 1; i++) {
        for (int j = i + 1; j < nClasses; j++) {
          if (i != j) {
            train_sets[x] = new myDataset(train, i, j, empate);
            x++;
          }
        }
      }

      /* Construct the classifiers */
      int x, y;
      x = 0;
      y = 1;
      for (int i = 0; i < n_classifiers; i++) {
        if (!train_sets[i].empty()) {
          Fichero.escribeFichero("training.txt", train_sets[i].printDataSet(false));
          valid[i] = true;
          System.out.println("Training classifier[" + i + "] for classes " + x + " and " + y);
          C45 tree = new C45("training.txt", pruned, confidence, instancesPerLeaf, false);
          try {
            tree.generateTree();
          }
          catch (Exception e) {
            System.err.println("Error!!");
            System.err.println(e.getMessage());
            System.exit(-1);
          }
          String treeString = tree.printStringOVO();
          obtainRules(treeString, i);
          treeRuleSet[i].coverExamples();
          claseMayoritaria[i] = train_sets[i].mostFrequentClass();
        }
        else {
          valid[i] = false;
        }
        y++;
        if (y % nClasses == 0) {
          x++;
          y = x + 1;
        }
      }
      ovo.classifierTrainFinished();

      // Finally we fill the training output file
      this.empates = empate;
      doOutput(this.val, this.outputTr);
    }
  }

  /**
   * It extracts the rule set from a given file exported by the C4.5 classifier
   * @param treeString the content of the file (rule set)
   * @param classifier classifier id within the ensemble
   */
  private void obtainRules(String treeString, int classifier) {
    String rules = new String("");
    StringTokenizer lines = new StringTokenizer(treeString, "\n"); // read lines
    String line = lines.nextToken(); // first line: @TotalNumberOfNodes X
    line = lines.nextToken(); // second line: @NumberOfLeafs Y
    // The tree starts
    Vector<String> variables = new Vector<String>();
    Vector<String> values = new Vector<String>();
    Vector<String> operators = new Vector<String>();
    int contador = 0;
    while (lines.hasMoreTokens()) {
      line = lines.nextToken();
      StringTokenizer field = new StringTokenizer(line, " \t");
      String cosa = field.nextToken(); // possibilities: "if", "elseif", "class"
      if (cosa.compareToIgnoreCase("if") == 0) {
        field.nextToken(); // (
        variables.add(field.nextToken()); // variable name (AttX, X == position)
        operators.add(field.nextToken()); // one of three: "=", "<=", ">"
        values.add(field.nextToken()); // value
      }
      else if (cosa.compareToIgnoreCase("elseif") == 0) {
        int dejar = Integer.parseInt(field.nextToken()); // depth to keep: drop the deeper conditions
        for (int i = variables.size() - 1; i >= dejar; i--) {
          variables.remove(variables.size() - 1);
          operators.remove(operators.size() - 1);
          values.remove(values.size() - 1);
        }
        field.nextToken(); // (
        variables.add(field.nextToken()); // variable name (AttX, X == position)
        operators.add(field.nextToken()); // one of three: "=", "<=", ">"
        values.add(field.nextToken()); // value
      }
      else { // class --> rule generation
        field.nextToken(); // =
        contador++; // we have a new rule
        rules += "\nRULE-" + contador + ": IF ";
        int i;
        for (i = 0; i < variables.size() - 1; i++) {
          rules += (String) variables.get(i) + " " + (String) operators.get(i)
              + " " + (String) values.get(i) + " AND ";
        }
        rules += (String) variables.get(i) + " " + (String) operators.get(i)
            + " " + (String) values.get(i);
        rules += " THEN class = " + field.nextToken();
        variables.remove(variables.size() - 1);
        operators.remove(operators.size() - 1);
        values.remove(values.size() - 1);
      }
    }
    treeRuleSet[classifier] = new RuleBase(train_sets[classifier], rules);
  }
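  /* Example of the intermediate rule format produced by obtainRules() above
   * (attribute names and values are illustrative, taken from an imaginary
   * two-class problem):
   *
   *   RULE-1: IF Att3 <= 0.6 THEN class = positive
   *   RULE-2: IF Att3 > 0.6 AND Att2 <= 4.9 THEN class = negative
   *
   * Each rule corresponds to one root-to-leaf path of the C4.5 tree; the
   * RuleBase constructor parses this text back into rule objects. */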
  /**
   * It generates the output file from a given dataset and stores it in a file
   * @param dataset myDataset input dataset
   * @param filename String the name of the file
   * @return the accuracy of the classifier, averaged over the classes
   */
  private double doOutput(myDataset dataset, String filename) {
    String output = new String("");
    /* if (!nested) dataset.normalize(); */
    output = dataset.copyHeader(); // we insert the header in the output file
    int[] hits = new int[nClasses];
    // We write the output for each example
    for (int i = 0; i < dataset.getnData(); i++) {
      int clase = dataset.getOutputAsInteger(i);
      String actualClass = dataset.getOutputAsString(i);
      String prediccion = this.classificationOutput(dataset.getExample(i));
      output += actualClass + " " + prediccion + "\n";
      if (actualClass.equalsIgnoreCase(prediccion)) {
        hits[clase]++;
      }
    }
    Files.writeFile(filename, output);
    double accAvg = 0;
    int numClases = 0;
    for (int i = 0; i < nClasses; i++) {
      try {
        int datos = dataset.numberInstances(i);
        if (datos > 0) {
          numClases++;
          double acc = (1.0 * hits[i]) / datos;
          System.out.print("Cl[" + i + "]: " + hits[i] + "/" + datos + "(" + acc + ")\t");
          accAvg += acc;
        }
      }
      catch (Exception e) {
        System.err.println("No examples for class " + i);
      }
    }
    System.out.println("");
    accAvg /= numClases;
    return (100.0 * accAvg);
  }

  /**
   * It computes the output class for a given example
   * @param example the input example
   * @return the output class from the decision matrix (OVO) or vector (OVA)
   */
  private String classificationOutput(double[] example) {
    /* Here we include the algorithm directives to generate the
       classification output from the input example */
    if (binarization.equals("OVO")) {
      return ovo.computeClassScores(example);
    }
    else {
      return ovo.computeClassScoresOVA(example);
    }
  }

  /**
   * It computes the output class of the binary classifier associated with
   * the pair (x, y) according to the learned system
   * @param x the first class of the pair
   * @param y the second class of the pair
   * @param example the input example
   * @return the predicted class index, or -1 if the classifier is not valid
   */
  protected int obtainClass(int x, int y, double[] example) {
    /* Map the pair (x, y) to its position in the OVO enumeration
       (0,1), (0,2), ..., (0,K-1), (1,2), ...; e.g. with nClasses = 4 the
       pair (1,3) maps to index 3 + (3 - 1 - 1) = 4 */
    int i = 0;
    for (int i2 = 0; i2 < x; i2++) {
      i += nClasses - (i2 + 1);
    }
    i += y - x - 1;
    if (valid[i]) {
      // Fire the first rule that covers the example
      String clase = "?";
      for (int j = 0; (j < treeRuleSet[i].size()) && (clase.equals("?")); j++) {
        if (treeRuleSet[i].ruleBase.get(j).covers(example)) {
          clase = treeRuleSet[i].ruleBase.get(j).clase;
        }
      }
      return train_sets[i].numericClass(clase);
    }
    else {
      return -1;
    }
  }

  /**
   * It computes the confidence of the binary classifier associated with the
   * pair (x, y) in each of its two classes
   * @param x the first class of the pair
   * @param y the second class of the pair
   * @param example the input example
   * @return a pair [confidence for class x, confidence for class y]
   */
  protected double[] obtainConfidence(int x, int y, double[] example) {
    int i = 0;
    double[] salida = new double[2];
    for (int i2 = 0; i2 < x; i2++) {
      i += nClasses - (i2 + 1);
    }
    i += y - x - 1;
    double confidence = 0;
    if (valid[i]) {
      String clase = "?";
      for (int j = 0; (j < treeRuleSet[i].size()) && (clase.equals("?")); j++) {
        if (treeRuleSet[i].ruleBase.get(j).covers(example)) {
          clase = treeRuleSet[i].ruleBase.get(j).clase;
          confidence = treeRuleSet[i].ruleBase.get(j).confidence();
        }
      }
      int clase_num = train_sets[i].numericClass(clase);
      if (clase_num == x) {
        salida[0] = confidence;
      }
      else {
        salida[0] = 1 - confidence;
      }
      salida[1] = 1 - salida[0];
      return salida;
    }
    else {
      salida[0] = 0.0;
      salida[1] = 0.0;
      return salida;
    }
  }

  /**
   * It builds the OVO score matrix for a given example: entry [x][y] holds
   * the confidence of the (x, y) classifier for class x
   * @param example the input example
   * @return the score matrix
   */
  protected double[][] ovo_table(double[] example) {
    double[][] tabla = new double[nClasses][nClasses];
    int x, y;
    x = 0;
    y = 1;
    for (int i = 0; i < n_classifiers; i++) {
      if (valid[i]) {
        String clase = "?";
        double confidence = 0;
        for (int j = 0; (j < treeRuleSet[i].size()) && (clase.equals("?")); j++) {
          if (treeRuleSet[i].ruleBase.get(j).covers(example)) {
            clase = treeRuleSet[i].ruleBase.get(j).clase;
            confidence = treeRuleSet[i].ruleBase.get(j).confidence();
          }
        }
        int clase_num = train_sets[i].numericClass(clase);
        if (x == clase_num) {
          tabla[x][y] = confidence;
          tabla[y][x] = 1 - confidence;
        }
        else { // if (y == clase_num)
          tabla[y][x] = confidence;
          tabla[x][y] = 1 - confidence;
        }
      }
      else {
        tabla[x][y] = tabla[y][x] = 0;
      }
      y++;
      if (y % nClasses == 0) {
        x++;
        y = x + 1;
      }
    }
    return tabla;
  }

  /**
   * It builds the OVA score vector for a given example: entry [i] holds the
   * confidence of the i-th one-vs-rest classifier in its positive class
   * @param example the input example
   * @return the score vector
   */
  protected double[] ova_table(double[] example) {
    double[] grado_asoc = new double[this.n_classifiers];
    for (int i = 0; i < this.n_classifiers; i++) {
      if (valid[i]) {
        String clase = "?";
        double confidence = 0;
        for (int j = 0; (j < treeRuleSet[i].size()) && (clase.equals("?")); j++) {
          if (treeRuleSet[i].ruleBase.get(j).covers(example)) {
            clase = treeRuleSet[i].ruleBase.get(j).clase;
            confidence = treeRuleSet[i].ruleBase.get(j).confidence();
          }
        }
        int clase_num = train_sets[i].numericClass(clase);
        grado_asoc[i] = 0.0;
        if (clase_num == 0) { // index 0 of the binary data-set is the "one" class
          grado_asoc[i] = confidence;
        }
      }
    }
    return grado_asoc;
  }
}
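/* Usage sketch (hypothetical; in the KEEL distribution this wiring is done
 * by the Main class of the package, driven by a KEEL configuration file
 * whose name is passed on the command line):
 *
 *   parseParameters parameters = new parseParameters();
 *   parameters.parseConfigurationFile("config.txt"); // hypothetical file name
 *   Multiclassifier classifier = new Multiclassifier(parameters);
 *   classifier.execute();
 */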