/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.ImbalancedClassification.Ensembles;
import java.io.IOException;
import org.core.*;
import keel.Algorithms.ImbalancedClassification.Ensembles.C45.C45;
import java.util.StringTokenizer;
import java.util.Vector;
import keel.Algorithms.ImbalancedClassification.Auxiliar.AUC.CalculateAUC;
import keel.Algorithms.ImbalancedClassification.Auxiliar.AUC.PosProb;
import keel.Algorithms.ImbalancedClassification.Auxiliar.AUC.AccAUC;
import keel.Algorithms.ImbalancedClassification.Auxiliar.AUC.PredPair;
/**
 * <p>Title: multi_C45</p>
 * <p>Description: Main class to compute the algorithm procedure. It trains an
 * ensemble of C4.5 decision trees (each one stored as a rule set) and writes
 * the classification outputs and the rule bases to the configured files.</p>
 * <p>Company: KEEL </p>
 *
 * @author Mikel Galar Idoate (UPNA)
 * @author Modified by Alberto Fernandez (University of Jaen) 15/10/2012
 * @author Modified by Sarah Vluymans (University of Ghent) 29/01/2014
 * @author Modified by Alberto Fernandez (University of Jaen) 08/05/2014
 * @version 1.2
 * @since JDK1.6
 */
public class multi_C45 {

  /**
   * Maximum number of instances accepted in a generated training set. When the
   * set handed out by the ensemble is larger, the learning loop stops
   * (Databoost-IM has problems generating instances in highly imbalanced
   * data-sets).
   */
  private static final int MAX_TRAINING_INSTANCES = 53000;

  /** Parsed configuration: input files, output files and algorithm parameters. */
  parseParameters parameters;
  /** Training, validation and test data-sets. */
  myDataset train, val, test;
  /** Name of the training output file. */
  public static String outputTr;
  /** Names of the test output file and of the file where the rule bases are written. */
  String outputTst, ruleBaseFile;
  /** C4.5 instances per leaf, number of classifiers of the ensemble and lambda (AdaBoost.NC). */
  int instancesPerLeaf, n_classifiers, lambda;
  /** C4.5 confidence parameter. */
  float confidence;
  /** Whether the C4.5 trees are pruned. */
  boolean pruned;
  /** valid[i] tells whether the i-th classifier of the ensemble was actually built. */
  boolean[] valid;
  /** Training input file name and its base name (no directory, no extension). */
  String trainFile, cabecera;
  /** Trees of the ensemble, each one stored as a rule base. */
  RuleBase[] treeRuleSet;
  /** Training data-set for the classifier currently being built. */
  myDataset actua_train_set;
  /** The ensemble that drives the learning process. */
  Ensemble ensemble;
  /** Name of the ensemble method. */
  String ensembleType;
  /** Not used inside this class. */
  String evMeas;
  /** Set when the input data-sets could not be read; execute() aborts in that case. */
  private boolean somethingWrong = false;

  /**
   * Default constructor
   */
  public multi_C45() {
  }

  /**
   * It reads the data from the input files (training, validation and test) and parses all
   * the parameters from the parameters array.
   * <p>
   * Parameter 1 is the pruning flag, 2 the confidence, 3 the instances per leaf, 4 the
   * number of classifiers and 5 the ensemble type. Parameter 7 (lambda) is only read for
   * the "ADABOOST.NC" ensemble; otherwise lambda is 1.
   * <p>
   * If any data-set cannot be read, the error is reported, the ensemble is not created
   * and a later call to {@link #execute()} aborts.
   *
   * @param parameters parseParameters It contains the input files, output files and parameters
   */
  public multi_C45(parseParameters parameters) {
    this.parameters = parameters;
    train = new myDataset();
    val = new myDataset();
    test = new myDataset();
    trainFile = parameters.getTrainingInputFile();
    try {
      System.out.println("\nReading the training set: " +
                         parameters.getTrainingInputFile());
      train.readClassificationSet(parameters.getTrainingInputFile(), true);
      System.out.println("\nReading the validation set: " +
                         parameters.getValidationInputFile());
      val.readClassificationSet(parameters.getValidationInputFile(), false);
      System.out.println("\nReading the test set: " +
                         parameters.getTestInputFile());
      test.readClassificationSet(parameters.getTestInputFile(), false);
    }
    catch (IOException e) {
      System.err.println(
          "There was a problem while reading the input data-sets: " +
          e);
      somethingWrong = true;
    }

    outputTr = parameters.getTrainingOutputFile();
    outputTst = parameters.getTestOutputFile();
    ruleBaseFile = parameters.getOutputFile(0);

    // Now we parse the parameters
    pruned = parameters.getParameter(1).equalsIgnoreCase("TRUE");
    confidence = Float.parseFloat(parameters.getParameter(2));
    instancesPerLeaf = Integer.parseInt(parameters.getParameter(3));
    n_classifiers = Integer.parseInt(parameters.getParameter(4));
    ensembleType = parameters.getParameter(5);
    if (ensembleType.equalsIgnoreCase("ADABOOST.NC")) {
      lambda = Integer.parseInt(parameters.getParameter(7));
    }
    else {
      lambda = 1;
    }

    // Base name of the training file; it is used to name the temporary file read by C4.5
    cabecera = baseName(parameters.getTrainingInputFile());

    /* Create the ensemble! (only when the data-sets were read correctly) */
    if (!somethingWrong) {
      ensemble = new Ensemble(ensembleType, train, n_classifiers, lambda, this);
    }
  }

  /**
   * It returns the name of a file without its directory and without its last extension.
   * Both '/' and '\' are accepted as directory separators. A name without extension
   * (or whose only dot is its first character) is returned unchanged.
   *
   * @param path the path of the file
   * @return the base name of the file
   */
  private static String baseName(String path) {
    int lastSeparator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
    String name = path.substring(lastSeparator + 1);
    int lastDot = name.lastIndexOf('.');
    return (lastDot > 0) ? name.substring(0, lastDot) : name;
  }

  /**
   * It launches the algorithm: it builds the classifiers of the ensemble one by one and,
   * at the end, writes the training and test output files, the rule bases and the
   * AUC/error information of the ensemble.
   */
  public void execute() {
    if (somethingWrong) { // We do not execute the program
      System.err.println("An error was found while reading the input data-sets.");
      System.err.println("Aborting the program");
      // We should not use the statement: System.exit(-1);
      return;
    }

    n_classifiers = ensemble.nClassifier;
    valid = new boolean[n_classifiers];
    treeRuleSet = new RuleBase[n_classifiers];

    /* While the algorithm has not ended and the number of classifiers to build has not
     * been reached, we construct a new classifier for the ensemble
     */
    boolean fin = false;
    for (int i = 0; i < n_classifiers && !fin; i++) {
      // we get the current training data-set
      actua_train_set = ensemble.getDS();

      /* Databoost-IM has problems generating instances in highly imbalanced data-sets */
      if (actua_train_set.getnData() > MAX_TRAINING_INSTANCES) {
        System.out.println("Databoost overflow!, nData = " + actua_train_set.getnData());
        break;
      }

      boolean mal = false; // true when the tree produced no rules
      if (!actua_train_set.vacio()) {
        // write the data-set which will be read by the C4.5 decision tree learning algorithm
        String treeInputFile = ensembleType + cabecera + ".txt";
        Fichero.escribeFichero(treeInputFile, actua_train_set.printDataSet());
        valid[i] = true;
        System.out.println("Training classifier[" + i + "]");

        // Construct the tree using the weights (they can be uniformly distributed)
        C45 tree = new C45(treeInputFile, pruned, confidence, instancesPerLeaf,
                           ensemble.getWeights().clone());
        try {
          tree.generateTree();
        }
        catch (Exception e) {
          System.err.println("Error!!");
          System.err.println(e.getMessage());
          System.exit( -1);
        }

        /* The tree is stored in a set of rules */
        obtainRules(tree.printString(), i);
        if (treeRuleSet[i].size() == 0) {
          mal = true;
          int clase = tree.getPriorProbabilities()[0] > tree.getPriorProbabilities()[1] ? 0 : 1;
          // The a priori rule is introduced, which predicts the class with the greatest prior probability
          treeRuleSet[i].ruleBase.add(new Rule(train.getOutputValue(clase), actua_train_set));
        }
        treeRuleSet[i].coverExamples();
        treeRuleSet[i].coverExamples(ensemble.getWeights().clone()); // Step 2
      }
      else {
        valid[i] = false;
      }

      // Go to the next iteration of the ensemble! When the tree had no rules and a backup
      // of the weights exists (except for EUNDERBAGGING), the weights are restored instead.
      if (mal && !ensembleType.contains("EUNDERBAGGING") && ensemble.weightsBackup != null) {
        ensemble.weights = ensemble.weightsBackup.clone();
      }
      else {
        fin = ensemble.nextIteration();
      }

      // These ensembles keep their own iteration counter; the loop index follows it
      if (ensembleType.equalsIgnoreCase("EASYENSEMBLE")
          || ensembleType.equalsIgnoreCase("BALANCECASCADE")) {
        i = ensemble.t - 1;
      }
    }

    // Finally we should fill the training and test output files
    AccAUC pairTra = doOutput(this.val, this.outputTr);
    AccAUC pairTst = doOutput(this.test, this.outputTst);
    writeOutput(pairTra, pairTst, this.ruleBaseFile);
    ensemble.writeAUCError(this.outputTst);
  }

  /**
   * It generates the output file from a given dataset and stores it in a file. It also
   * prints the G-mean, F-mean, TP rate, TN rate and a single-point AUC estimation, where
   * the most frequent class of the training set is taken as the negative class.
   *
   * @param dataset myDataset input dataset
   * @param filename String the name of the file
   * @return the accuracy and the AUC of the classifier
   */
  private AccAUC doOutput(myDataset dataset, String filename) {
    double TP = 0, FP = 0, FN = 0, TN = 0;
    int aciertos = 0;
    int nData = dataset.getnData();

    // The output starts with the header of the data-set
    StringBuilder outputTotal = new StringBuilder(dataset.copyHeader());

    // The most frequent (negative) class is needed for every instance: ask for it only once
    String majorityClass = train.claseMasFrecuente();

    /*
     * For AUC: when the weighted sum is positive, the instance is classified
     * as originalDS.getOutputValue(0). When it is negative, as
     * originalDS.getOutputValue(1).
     * To be able to compute the AUC, we need, for each instance, the probability
     * that the classifier will classify it as belonging to the positive
     * (minority) class. When this class is the same as
     * originalDS.getOutputValue(0), we can just use the value of the weighted sum
     * as 'probability'. In the other case, we will use (-1) * sum.
     * Note that these 'probabilities' do not necessarily belong to [0,1].
     * This is no problem, only their relative differences matter.
     */
    boolean takeOpposite =
        ensemble.originalDS.getOutputValue(0).equals(majorityClass);
    PosProb[] valsForAUC = new PosProb[nData];

    // We write the output for each example
    for (int i = 0; i < nData; i++) {
      String claseReal = dataset.getOutputAsString(i);
      PredPair predAndVoteValue = this.classificationOutput(dataset.getExample(i));
      String prediccion = predAndVoteValue.getPrediction();
      outputTotal.append(claseReal).append(' ').append(prediccion).append('\n');

      // Calculations for accuracy and for the confusion matrix
      boolean correct = claseReal.equalsIgnoreCase(prediccion);
      boolean negative = claseReal.equalsIgnoreCase(majorityClass);
      if (correct) {
        aciertos++;
        if (negative) {
          TN++;
        }
        else {
          TP++;
        }
      }
      else if (negative) {
        FP++;
      }
      else {
        FN++;
      }

      // Calculations for AUC
      boolean isPositive = !claseReal.equals(majorityClass);
      double prob = predAndVoteValue.getVotingValue();
      if (takeOpposite) {
        prob *= -1.0;
      }
      valsForAUC[i] = new PosProb(isPositive, prob);
    }

    double TPrate = TP / (TP + FN);
    double TNrate = TN / (TN + FP);
    double gmean = Math.sqrt(TPrate * TNrate);
    double precision = TP / (TP + FP);
    double recall = TP / (TP + FN);
    double fmean = 2 * recall * precision / (1 * recall + precision);
    System.out.println("G-mean: " + gmean);
    System.out.println("F-mean: " + fmean);
    System.out.println("TPrate: " + TPrate);
    System.out.println("TNrate: " + TNrate);
    double FPrate = FP / (FP + TN);
    System.out.println("AUC: " + (1 + TPrate - FPrate) / 2);

    Files.writeFile(filename, outputTotal.toString());

    double acc = 1.0 * aciertos / dataset.size();
    double auc = getAUC(valsForAUC);
    return new AccAUC(acc, auc);
  }

  /**
   * It carries out the classification of a given dataset throughout the learning stage of the ensemble
   * @param dataset the instance set
   * @return accuracy for the current ensemble
   */
  public double classify (myDataset dataset) {
    int aciertos = 0;
    for (int i = 0; i < dataset.getnData(); i++) {
      String claseReal = dataset.getOutputAsString(i);
      String prediccion = this.classificationOutput(dataset.getExample(i)).getPrediction();
      if (claseReal.equalsIgnoreCase(prediccion)) {
        aciertos++;
      }
    }
    return (1.0 * aciertos / dataset.size());
  }

  /**
   * It returns the algorithm classification output given an input example
   * @param example double[] The input example
   * @return the prediction of the ensemble together with its voting value
   */
  private PredPair classificationOutput(double[] example) {
    return ensemble.computeClassScores(example);
  }

  /** It returns the class index of the prediction of an example in the i^{th} classifier.
   * The class is the one of the first rule that covers the example; when no rule gives a
   * known class, the most frequent class of the training set is used.
   *
   * @param i the classifier to be used
   * @param example the example to be classified
   * @return the predicted class index, or -1 when the classifier is not valid
   */
  protected int obtainClass(int i, double[] example)
  {
    if (!valid[i]) {
      System.err.println("This should not be accessed: "+i+"/"+valid[i]);
      return -1;
    }
    String clase = "?";
    for (int j = 0; (j < treeRuleSet[i].size()) && (clase.equals("?")); j++) {
      if (treeRuleSet[i].ruleBase.get(j).covers(example)) {
        clase = treeRuleSet[i].ruleBase.get(j).clase;
      }
    }
    int clase_num = train.claseNumerica(clase);
    if (clase_num == -1) {
      clase_num = train.claseNumerica(train.claseMasFrecuente());
    }
    return clase_num;
  }

  /** It obtains the confidence on the prediction of the example in the i^{th} classifier.
   * The confidence is computed from the covering counters of the first rule that covers
   * the example. It is 0.5 when the classifier is not valid or when no rule gives a
   * known class.
   *
   * @param i the classifier to be used
   * @param example the example to be classified
   * @return the confidence on the prediction
   */
  protected double obtainConfidence(int i, double[] example)
  {
    if (!valid[i]) {
      return 0.5;
    }
    double confianza = 0;
    String clase = "?";
    for (int j = 0; (j < treeRuleSet[i].size()) && (clase.equals("?")); j++) {
      if (treeRuleSet[i].ruleBase.get(j).covers(example)) {
        clase = treeRuleSet[i].ruleBase.get(j).clase;
        double nCubiertosOK = treeRuleSet[i].ruleBase.get(j).fCubiertosOK;
        double nCubiertos = treeRuleSet[i].ruleBase.get(j).fCubiertos;
        if (nCubiertos == 0) {
          confianza = 0;
        }
        else {
          confianza = (ensemble.nData * nCubiertosOK + 1) / (ensemble.nData * nCubiertos + 2);
        }
      }
    }
    if (train.claseNumerica(clase) == -1) {
      confianza = 0.5;
    }
    return confianza;
  }

  /**
   * It extracts the rule set from the text exported by the C4.5 classifier and stores it
   * in treeRuleSet[classifier].
   * <p>
   * The first two lines of the text are headers. Every other line starts with "if",
   * "elseif" (followed by the number of conditions to keep) or "class". A "class" line
   * produces one rule with all the conditions currently open. A "class" line found when
   * no condition is open cannot be written as a rule and is skipped; if no rule is
   * produced at all, the rule base is built from an empty text.
   *
   * @param treeString the content of the file (rule set)
   * @param classifier classifier id of the ensemble
   */
  private void obtainRules(String treeString, int classifier) {
    StringBuilder rules = new StringBuilder();
    StringTokenizer lines = new StringTokenizer(treeString, "\n"); // read lines

    // Skip the headers: "@TotalNumberOfNodes X" and "@NumberOfLeafs Y"
    for (int skipped = 0; skipped < 2 && lines.hasMoreTokens(); skipped++) {
      lines.nextToken();
    }

    // The tree starts: these three stacks hold the conditions of the current branch
    Vector<String> variables = new Vector<String>();
    Vector<String> values = new Vector<String>();
    Vector<String> operators = new Vector<String>();
    int contador = 0;

    while (lines.hasMoreTokens()) {
      StringTokenizer field = new StringTokenizer(lines.nextToken(), " \t");
      if (!field.hasMoreTokens()) {
        continue; // line made only of blanks
      }
      String cosa = field.nextToken(); // Possibilities: "if", "elseif", "class"

      if (cosa.equalsIgnoreCase("if") || cosa.equalsIgnoreCase("elseif")) {
        if (cosa.equalsIgnoreCase("elseif")) {
          // Go back to the given depth before opening the new condition
          int dejar = Integer.parseInt(field.nextToken());
          while (variables.size() > dejar) {
            variables.remove(variables.size() - 1);
            operators.remove(operators.size() - 1);
            values.remove(values.size() - 1);
          }
        }
        field.nextToken(); // (
        variables.add(field.nextToken()); // variable name (AttX, X == position)
        operators.add(field.nextToken()); // One of three: "=", "<=", ">"
        values.add(field.nextToken()); // Value
      }
      else { // Class --> rule generation
        field.nextToken(); // =
        if (variables.isEmpty()) {
          continue; // no condition is open: this leaf cannot be written as a rule
        }
        contador++; // I have a new rule
        rules.append("\nRULE-").append(contador).append(": IF ");
        for (int i = 0; i < variables.size(); i++) {
          if (i > 0) {
            rules.append(" AND ");
          }
          rules.append(variables.get(i)).append(' ')
               .append(operators.get(i)).append(' ')
               .append(values.get(i));
        }
        rules.append(" THEN class = ").append(field.nextToken());

        // The last condition is closed by this leaf
        variables.remove(variables.size() - 1);
        operators.remove(operators.size() - 1);
        values.remove(values.size() - 1);
      }
    }

    treeRuleSet[classifier] = new RuleBase(actua_train_set, rules.toString());
  }

  /**
   * It writes on a file the full ensemble (C4.5 rule sets of the valid classifiers),
   * followed by the accuracy and AUC values, which are also printed on the standard output.
   * @param pairTra accuracy and AUC reported as "training"
   * @param pairTst accuracy and AUC reported as "test"
   * @param ruleBaseFile name of the file where the ensemble is written
   */
  public void writeOutput (AccAUC pairTra, AccAUC pairTst, String ruleBaseFile) {
    Files.writeFile(ruleBaseFile,"");
    for (int i = 0; i < ensemble.nClassifier; i++) {
      // Classifiers that were not built are not written
      if (valid[i]) {
        Files.addToFile(ruleBaseFile, "@Classifier number " + i + ": \n");
        Files.addToFile(ruleBaseFile,treeRuleSet[i].printStringF() + "\n");
      }
    }
    Files.addToFile(ruleBaseFile, "Accuracy in training: " + pairTra.getAcc() + "\n");
    Files.addToFile(ruleBaseFile, "Accuracy in test: " + pairTst.getAcc() + "\n");
    Files.addToFile(ruleBaseFile, "AUC in training: " + pairTra.getAUC() + "\n");
    Files.addToFile(ruleBaseFile, "AUC in test: " + pairTst.getAUC() + "\n");
    System.out.println("Accuracy in training: " + pairTra.getAcc());
    System.out.println("Accuracy in test: " + pairTst.getAcc());
    System.out.println("AUC in training: " + pairTra.getAUC());
    System.out.println("AUC in test: " + pairTst.getAUC());
    System.out.println("Algorithm Finished");
  }

  /**
   * Calculates the AUC for the associated set of values
   *
   * @param valsForAUC Array containing, for each instance, whether it is positive and the
   * value obtained by the ensemble for it
   *
   * @return The AUC value associated to the given set of values
   */
  private double getAUC(PosProb[] valsForAUC){
    return CalculateAUC.calculate(valsForAUC);
  }
}