/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/**
* <p>
* @author Written by Cristóbal Romero Morales (University of Oviedo) 01/07/2008
* @author Modified by Xavi Solé (La Salle, Ramón Llull University - Barcelona) 03/12/2008
* @version 1.1
* @since JDK1.2
* </p>
*/
package keel.Algorithms.Rule_Learning.PART;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.io.StreamTokenizer;
import java.io.IOException;
/** for commons.configuration
import org.apache.commons.configuration.*;
*/
/**
 * <p>
 * Class to implement the C4.5 algorithm.
 * </p>
 */
public class C45 extends Algorithm {

    /** Decision tree. */
    private Tree root;

    /** Is the tree pruned or not. */
    private boolean prune = true;

    /** Confidence level. */
    private float confidence = 0.25f;

    /** Minimum number of itemsets per leaf. */
    private int minItemsets = 2;

    /** The prior probabilities of the classes. */
    private double[] priorsProbabilities;

    /** Resolution of the margin histogram. */
    private static int marginResolution = 500;

    /** Cumulative margin classification. */
    private double marginCounts[];

    /** The sum of counts for priors. */
    private double classPriorsSum;

    /** Constructor. Reads file names and options from the parameter file,
     * loads the datasets and builds the decision tree.
     *
     * @param paramFile The parameters file.
     *
     * @throws Exception If the algorithm cannot be executed.
     */
    public C45(parseParameters paramFile) throws Exception {
        try {
            // Start the timer. FIX: the original declared a local
            // "long startTime" here, shadowing the inherited field that
            // printResult() reads, so elapsed time was never measured
            // from construction. Assign the field instead.
            startTime = System.currentTimeMillis();

            // File names.
            modelFileName = paramFile.getTrainingInputFile();
            trainFileName = paramFile.getValidationInputFile();
            testFileName = paramFile.getTestInputFile();

            // Options.
            prune = Boolean.valueOf(paramFile.getParameter(1)).booleanValue(); // whether the tree must be pruned or not
            confidence = Float.parseFloat(paramFile.getParameter(2)); // confidence level for the uniform distribution
            minItemsets = Integer.parseInt(paramFile.getParameter(3)); // itemsets per leaf

            // Fall back to defaults on out-of-range options.
            if (confidence < 0 || confidence > 1) {
                confidence = 0.25F;
                System.err.println("Error: Confidence must be in the interval [0,1]");
                System.err.println("Using default value: 0.25");
            }

            if (minItemsets <= 0) {
                minItemsets = 2;
                System.err.println("Error: itemsetPerLeaf must be greater than 0");
                System.err.println("Using default value: 2");
            }

            /* Initializes the datasets; the model dataset defines the header. */
            modelDataset = new MyDataset(modelFileName, true);
            trainDataset = new MyDataset(trainFileName, false);
            testDataset = new MyDataset(testFileName, false);

            priorsProbabilities = new double[modelDataset.numClasses()];
            priorsProbabilities();
            marginCounts = new double[marginResolution + 1];

            // Generate the tree.
            generateTree(modelDataset);
        } catch (Exception e) {
            System.err.println(e.getMessage());
            System.exit(-1);
        }
    }

    /** Constructor that builds the tree directly from an in-memory dataset.
     *
     * @param data        The dataset used to build the tree.
     * @param pruned      Whether the tree must be pruned or not.
     * @param cf          Confidence level for the uniform distribution.
     * @param minItemsets Minimum number of itemsets per leaf.
     *
     * @throws Exception If the algorithm cannot be executed.
     */
    public C45(MyDataset data, boolean pruned, float cf, int minItemsets) throws Exception {
        try {
            // Start the timer (see note in the other constructor: assign the
            // inherited field, do not shadow it with a local).
            startTime = System.currentTimeMillis();

            // Options.
            prune = pruned;                  // whether the tree must be pruned or not
            confidence = cf;                 // confidence level for the uniform distribution
            this.minItemsets = minItemsets;  // itemsets per leaf

            /* Initializes the dataset. */
            modelDataset = data;

            priorsProbabilities = new double[modelDataset.numClasses()];
            priorsProbabilities();
            marginCounts = new double[marginResolution + 1];

            // Generate the tree.
            generateTree(modelDataset);
        } catch (Exception e) {
            System.err.println(e.getMessage());
            System.exit(-1);
        }
    }

    /** Function to read the options from the execution file and assign the
     * values to the parameters (file names, prune, confidence, itemsets per leaf).
     *
     * @param options The StreamTokenizer that reads the parameters file.
     *
     * @throws Exception If the format of the file is not correct.
     */
    protected void setOptions(StreamTokenizer options) throws Exception {
        options.nextToken();

        /* Checks that the file starts with the token algorithm */
        if (options.sval.equalsIgnoreCase("algorithm")) {
            options.nextToken();
            options.nextToken();

            options.nextToken();
            System.out.println(options.sval + "\n");
            options.nextToken();
            System.out.println(options.sval + "\n");

            /* Reads the names of the input files */
            if (options.sval.equalsIgnoreCase("inputData")) {
                options.nextToken();
                options.nextToken();
                modelFileName = options.sval;
                System.out.println("Hay inputs\n");

                if (options.nextToken() != StreamTokenizer.TT_EOL) {
                    trainFileName = options.sval;
                    options.nextToken();
                    testFileName = options.sval;

                    // If more tokens remain on the line, the validation file is
                    // taken to be the model (training) file itself.
                    if (options.nextToken() != StreamTokenizer.TT_EOL) {
                        trainFileName = modelFileName;
                        options.nextToken();
                    }
                    System.out.println(trainFileName + "\n");
                    System.out.println(testFileName + "\n");
                }
            } else {
                throw new Exception("No file test provided.");
            }

            /* Reads the names of the output files */
            while (true) {
                if (options.nextToken() == StreamTokenizer.TT_EOF) {
                    throw new Exception("No output file provided.");
                }

                if (options.sval == null) {
                    continue;
                } else if (options.sval.equalsIgnoreCase("outputData")) {
                    break;
                }
            }

            options.nextToken();
            options.nextToken();
            trainOutputFileName = options.sval;
            options.nextToken();
            testOutputFileName = options.sval;
            options.nextToken();
            resultFileName = options.sval;
            System.out.println(trainOutputFileName + "\n");
            System.out.println(testOutputFileName + "\n");
            System.out.println(resultFileName + "\n");

            if (!getNextToken(options)) {
                return;
            }

            while (options.ttype != StreamTokenizer.TT_EOF) {
                /* Reads the prune parameter */
                if (options.sval.equalsIgnoreCase("pruned")) {
                    options.nextToken();
                    options.nextToken();
                    prune = options.sval.equalsIgnoreCase("TRUE");
                }

                /* Reads the confidence parameter */
                if (options.sval.equalsIgnoreCase("confidence")) {
                    if (!prune) {
                        throw new Exception("Doesn't make sense to change confidence for prune tree!");
                    }

                    options.nextToken();
                    options.nextToken();

                    /* Checks that the confidence threshold is between 0 and 1.
                     * FIX: the original used "cf <= 1 || cf >= 0", which is
                     * always true; the intended range check needs "&&". */
                    float cf = Float.parseFloat(options.sval);
                    if (cf <= 1 && cf >= 0) {
                        confidence = cf;
                    }
                }

                /* Reads the itemsets per leaf parameter */
                if (options.sval.equalsIgnoreCase("itemsetsPerLeaf")) {
                    options.nextToken();
                    options.nextToken();

                    int perLeaf = Integer.parseInt(options.sval);
                    if (perLeaf > 0) {
                        minItemsets = perLeaf;
                    }
                }
                getNextToken(options);
            }
        }
    }

    /** Generates the tree.
     *
     * @param itemsets The dataset used to build the tree.
     *
     * @throws Exception If the tree cannot be built.
     */
    public void generateTree(MyDataset itemsets) throws Exception {
        SelectCut selectCut = new SelectCut(minItemsets, itemsets);
        root = new Tree(selectCut, prune, confidence);
        root.buildTree(itemsets);
        root.isUnexplored = false;
    }

    /** Function to evaluate the class which the itemset must have according to
     * the classification of the tree.
     *
     * @param itemset The itemset to evaluate.
     * @throws Exception If cannot compute the classification.
     * @return The index of the class predicted.
     */
    public double evaluateItemset(Itemset itemset) throws Exception {
        // Classify a copy with the class value blanked out, so the tree cannot
        // peek at the true label.
        Itemset classMissing = (Itemset) itemset.copy();
        classMissing.setDataset(itemset.getDataset());
        classMissing.setClassMissing();

        double[] classification = classificationForItemset(classMissing);
        double prediction = maxIndex(classification);
        updateStats(classification, itemset, itemset.numClasses());

        return prediction;
    }

    /** Updates all the statistics for the current itemset.
     *
     * @param predictedClassification Distribution of class values predicted for the itemset.
     * @param itemset The itemset.
     * @param nClasses The number of classes.
     */
    private void updateStats(double[] predictedClassification, Itemset itemset, int nClasses) {
        int actualClass = (int) itemset.getClassValue();

        if (!itemset.classIsMissing()) {
            updateMargins(predictedClassification, actualClass, nClasses);

            // Determine the predicted class (doesn't detect multiple classifications).
            int predictedClass = -1;
            double bestProb = 0.0;

            for (int i = 0; i < nClasses; i++) {
                if (predictedClassification[i] > bestProb) {
                    predictedClass = i;
                    bestProb = predictedClassification[i];
                }
            }

            // Nothing further to record when no class was predicted.
            // (The original also computed predictedProb/priorProb here, but
            // never used them; the dead locals have been removed.)
            if (predictedClass < 0) {
                return;
            }
        }
    }

    /** Returns class probabilities for an itemset.
     *
     * @param itemset The itemset.
     *
     * @throws Exception If cannot compute the classification.
     * @return class probabilities for an itemset.
     */
    public final double[] classificationForItemset(Itemset itemset) throws Exception {
        return root.classificationForItemset(itemset);
    }

    /** Update the cumulative record of classification margins.
     *
     * @param predictedClassification Distribution of class values predicted for the itemset.
     * @param actualClass The class value.
     * @param nClasses Number of classes.
     */
    private void updateMargins(double[] predictedClassification, int actualClass, int nClasses) {
        double probActual = predictedClassification[actualClass];
        double probNext = 0;

        // Highest predicted probability among the other classes.
        for (int i = 0; i < nClasses; i++) {
            if ((i != actualClass) && (predictedClassification[i] > probNext)) {
                probNext = predictedClassification[i];
            }
        }

        // Margin is in [-1, 1]; map it onto [0, marginResolution].
        double margin = probActual - probNext;
        int bin = (int) ((margin + 1.0) / 2.0 * marginResolution);
        marginCounts[bin]++;
    }

    /** Evaluates if a string is a boolean value.
     *
     * @param value The string to evaluate.
     *
     * @return True if value is a boolean value. False otherwise.
     */
    private boolean isBoolean(String value) {
        return value.equalsIgnoreCase("TRUE") || value.equalsIgnoreCase("FALSE");
    }

    /** Returns index of maximum element in a given array of doubles. First maximum is returned.
     *
     * @param doubles The array of elements.
     *
     * @return index of maximum element in a given array of doubles. First maximum is returned.
     */
    public static int maxIndex(double[] doubles) {
        double maximum = 0;
        int maxIndex = 0;

        for (int i = 0; i < doubles.length; i++) {
            // The first element always initializes the running maximum, so
            // negative-only arrays are handled correctly.
            if ((i == 0) || doubles[i] > maximum) {
                maxIndex = i;
                maximum = doubles[i];
            }
        }

        return maxIndex;
    }

    /** Sets the class prior probabilities (Laplace-smoothed weighted class counts).
     *
     * @throws Exception If cannot compute the probabilities.
     */
    public void priorsProbabilities() throws Exception {
        // Laplace smoothing: every class starts with a count of 1.
        for (int i = 0; i < modelDataset.numClasses(); i++) {
            priorsProbabilities[i] = 1;
        }
        classPriorsSum = modelDataset.numClasses();

        for (int i = 0; i < modelDataset.numItemsets(); i++) {
            if (!modelDataset.itemset(i).classIsMissing()) {
                try {
                    priorsProbabilities[(int) modelDataset.itemset(i).getClassValue()] +=
                            modelDataset.itemset(i).getWeight();
                    classPriorsSum += modelDataset.itemset(i).getWeight();
                } catch (Exception e) {
                    System.err.println(e.getMessage());
                }
            }
        }
    }

    /** Writes the tree and the results of the training and the test in the file.
     *
     * @throws IOException If the file cannot be written.
     */
    public void printResult() throws IOException {
        long totalTime = (System.currentTimeMillis() - startTime) / 1000;
        // FIX: the original printed "minutes / 60" (always 0 for < 1h of
        // minutes) in the elapsed-time stamp; decompose totalTime properly.
        long hours = totalTime / 3600;
        long minutes = (totalTime % 3600) / 60;
        long seconds = totalTime % 60;

        StringBuilder tree = new StringBuilder(toString());
        tree.append("\n@TotalNumberOfNodes ").append(root.NumberOfNodes);
        tree.append("\n@NumberOfLeafs ").append(root.NumberOfLeafs);

        tree.append("\n\n@NumberOfItemsetsTraining ").append(trainDataset.numItemsets());
        tree.append("\n@NumberOfCorrectlyClassifiedTraining ").append(correct);
        tree.append("\n@PercentageOfCorrectlyClassifiedTraining ")
            .append((float) (correct * 100.0) / (float) trainDataset.numItemsets()).append("%");
        tree.append("\n@NumberOfInCorrectlyClassifiedTraining ")
            .append(trainDataset.numItemsets() - correct);
        tree.append("\n@PercentageOfInCorrectlyClassifiedTraining ")
            .append((float) ((trainDataset.numItemsets() - correct) * 100.0)
                    / (float) trainDataset.numItemsets()).append("%");

        tree.append("\n\n@NumberOfItemsetsTest ").append(testDataset.numItemsets());
        tree.append("\n@NumberOfCorrectlyClassifiedTest ").append(testCorrect);
        tree.append("\n@PercentageOfCorrectlyClassifiedTest ")
            .append((float) (testCorrect * 100.0) / (float) testDataset.numItemsets()).append("%");
        tree.append("\n@NumberOfInCorrectlyClassifiedTest ")
            .append(testDataset.numItemsets() - testCorrect);
        tree.append("\n@PercentageOfInCorrectlyClassifiedTest ")
            .append((float) ((testDataset.numItemsets() - testCorrect) * 100.0)
                    / (float) testDataset.numItemsets()).append("%");

        tree.append("\n\n@ElapsedTime ").append(hours).append(":").append(minutes)
            .append(":").append(seconds);

        PrintWriter resultPrint = new PrintWriter(new FileWriter(resultFileName));
        resultPrint.print(getHeader() + "\n@decisiontree\n\n" + tree);
        resultPrint.close();
    }

    /** Evaluates the training dataset and writes the results in the file.
     * I/O errors are reported to stderr rather than thrown.
     */
    public void printTrain() {
        StringBuilder text = new StringBuilder(getHeader());

        for (int i = 0; i < trainDataset.numItemsets(); i++) {
            try {
                Itemset itemset = trainDataset.itemset(i);
                int cl = (int) evaluateItemset(itemset);

                if (cl == (int) itemset.getValue(trainDataset.getClassIndex())) {
                    correct++;
                }

                // One line per itemset: "<predicted> <actual>".
                text.append(trainDataset.getClassAttribute().value(cl)).append(" ")
                    .append(trainDataset.getClassAttribute().value((int) itemset.getClassValue()))
                    .append("\n");
            } catch (Exception e) {
                System.err.println(e.getMessage());
            }
        }

        try {
            PrintWriter print = new PrintWriter(new FileWriter(trainOutputFileName));
            print.print(text.toString());
            print.close();
        } catch (IOException e) {
            System.err.println("Can not open the training output file: " + e.getMessage());
        }
    }

    /** Evaluates the test dataset and writes the results in the file.
     * I/O errors are reported to stderr rather than thrown.
     */
    public void printTest() {
        StringBuilder text = new StringBuilder(getHeader());

        for (int i = 0; i < testDataset.numItemsets(); i++) {
            try {
                Itemset itemset = testDataset.itemset(i);
                int cl = (int) evaluateItemset(itemset);

                if (cl == (int) itemset.getValue(testDataset.getClassIndex())) {
                    testCorrect++;
                }

                // One line per itemset: "<predicted> <actual>".
                text.append(testDataset.getClassAttribute().value(cl)).append(" ")
                    .append(testDataset.getClassAttribute().value((int) itemset.getClassValue()))
                    .append("\n");
            } catch (Exception e) {
                System.err.println(e.getMessage());
            }
        }

        try {
            PrintWriter print = new PrintWriter(new FileWriter(testOutputFileName));
            print.print(text.toString());
            print.close();
        } catch (IOException e) {
            // FIX: the original message said "training output file" here.
            System.err.println("Can not open the test output file.");
        }
    }

    /** Function to print the tree.
     *
     * @return a string representation of the C4.5 tree
     */
    public String toString() {
        return root.toString();
    }

    /**
     * Returns the C4.5 tree
     * @return the C4.5 tree
     */
    public Tree getTree() {
        return root;
    }
}