/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/**
* <p>
* @author Written by Cristobal Romero (Universidad de Córdoba) 27/02/2007
* @author Modified by Cristobal Romero (Universidad de Córdoba) 19/04/2007
* @version 0.1
* @since JDK 1.5
*</p>
*/
package keel.Algorithms.Decision_Trees.ID3;
import java.io.*;
import java.util.*;
import keel.Dataset.Attributes;
/**
A Java implementation of the ID3 algorithm. This is a binary version in which
nodes are split by attribute and value pairs, instead of the generic ID3, which
only considers attributes for splitting.
@author Cristóbal Romero Morales (UCO)
@version 1.0 (30-03-06)
*/
public class ID3 extends Algorithm
{
/** Root of the decomposition tree; created with the no-argument Node constructor and filled by generateTree(). */
Node root = new Node();
/** Number of split nodes (nodes with children); incremented by writeTree() each time it prints a rule. */
int NumberOfNodes;
/** Number of leaves in the tree; incremented by writeTree() each time it prints a leaf. */
int NumberOfLeafs;
/**
 * Builds an ID3 tree from the datasets named in a parameter file and writes the outputs.
 *
 * <p>The options are read first and the model dataset is loaded. If the dataset
 * declares real or integer attributes, a message is printed on standard error and
 * the constructor returns without building a tree. Otherwise the training and test
 * datasets are loaded, the tree is grown and the training, test and result files
 * are written. Any exception raised on the way is reported on standard error and
 * the JVM is terminated with status -1.</p>
 *
 * @param paramFile The parameters file.
 */
public ID3( String paramFile )
{
    try
    {
        // NOTE: the original declared an unused local "startTime" here that merely
        // shadowed the startTime read by printResult(); it has been removed.

        // Sets the options of the execution. The reader is closed as soon as the
        // options have been parsed so the file handle is not leaked.
        BufferedReader reader = new BufferedReader( new FileReader( paramFile ) );
        try
        {
            StreamTokenizer tokenizer = new StreamTokenizer( reader );
            initTokenizer( tokenizer );
            setOptions( tokenizer );
        }
        finally
        {
            reader.close();
        }

        // Initializes the dataset.
        modelDataset = new Dataset( modelFileName, true );

        // ID3 splits on nominal values only, so continuous attributes are rejected.
        if ( Attributes.hasRealAttributes() || Attributes.hasIntegerAttributes() )
        {
            System.err.println("ID3 can only handle nominal attributes." );
            return;
        }

        trainDataset = new Dataset( trainFileName, false );
        testDataset = new Dataset( testFileName, false );
        NumberOfNodes = 0;
        NumberOfLeafs = 0;

        // Executes the algorithm.
        generateTree();

        // Prints the results generated by the algorithm.
        printTrain();
        printTest();
        printResult();
    }
    catch ( Exception e )
    {
        System.err.println( e.getMessage() );
        System.exit(-1);
    }
}
/**
 * Reads the options from the execution file and assigns the values to the parameters.
 *
 * <p>The file must begin with the word {@code algorithm}; after skipping the rest of
 * that entry the word {@code inputData} is expected, followed by the input file
 * names. Everything up to the word {@code outputData} is then skipped and the three
 * output file names are read. The exact token boundaries depend on how the tokenizer
 * was configured by the caller.</p>
 *
 * <p>Keywords are compared null-safely, so a non-word token in a keyword position
 * yields the descriptive format exception instead of a NullPointerException.</p>
 *
 * @param options The StreamTokenizer that reads the parameters file.
 *
 * @throws Exception If the format of the file is not correct.
 */
protected void setOptions( StreamTokenizer options ) throws Exception
{
    options.nextToken();

    // Checks that the file starts with the token algorithm.
    if ( !"algorithm".equalsIgnoreCase( options.sval ) )
        throw new Exception( "The file must start with the word algorithm followed of the name of the algorithm." );

    // Six tokens are consumed here without being inspected (the algorithm name is
    // deliberately not validated); the sixth one is expected to be "inputData".
    for ( int skipped = 0; skipped < 6; skipped++ )
        options.nextToken();

    // Reads the names of the input files.
    if ( !"inputData".equalsIgnoreCase( options.sval ) )
        throw new Exception( "The file must start with the word inputData." );

    options.nextToken();
    options.nextToken();
    modelFileName = options.sval;

    if ( options.nextToken() != StreamTokenizer.TT_EOL )
    {
        trainFileName = options.sval;
        options.nextToken();
        testFileName = options.sval;

        // If the line has not ended yet, the model file is also used for training.
        if ( options.nextToken() != StreamTokenizer.TT_EOL )
        {
            trainFileName = modelFileName;
            options.nextToken();
        }
    }

    // Skips every token until the word outputData is found.
    do
    {
        if ( options.nextToken() == StreamTokenizer.TT_EOF )
            throw new Exception( "No output file provided." );
    }
    while ( !"outputData".equalsIgnoreCase( options.sval ) );

    // Reads the names of the output files.
    options.nextToken();
    options.nextToken();
    trainOutputFileName = options.sval;
    options.nextToken();
    testOutputFileName = options.sval;
    options.nextToken();
    resultFileName = options.sval;
}
/**
 * Grows the decision tree: the root receives every itemset of the model
 * dataset and is then decomposed recursively.
 */
public void generateTree()
{
    Vector allItemsets = getItemsets();
    root.setData( allItemsets );
    decomposeNode( root );
}
/**
 * Renders the subtree rooted at a node as nested if/else rules.
 *
 * <p>As a side effect the leaf counter is incremented for every leaf printed and
 * the node counter for every split node printed. If anything fails while a node is
 * being rendered, "Error writing tree" is printed on standard output and whatever
 * text had been produced for that node so far is returned.</p>
 *
 * @param node The current node.
 * @param tab The indentation of the current rule.
 *
 * @return The tree in form of rules.
 */
public String writeTree( Node node, String tab )
{
    int classIndex = modelDataset.getClassIndex();
    Attribute classAttribute = modelDataset.getClassAttribute();
    String className = classAttribute.name();
    StringBuilder rules = new StringBuilder();

    try
    {
        // A node without children is a leaf: print the class assigned to it.
        if ( node.getChildren() == null )
        {
            String label = getCommonClass( node.getData(), classIndex );
            String leafText = tab + className + " = \"" + label + "\"\n";
            NumberOfLeafs++;
            return leafText;
        }

        // Split node: the condition is built completely before it is appended, so
        // a failure while building it leaves the accumulated text untouched.
        String condition = tab + "if( " + modelDataset.getAttribute( node.getDecompositionAttribute() ).name() +
            " == \"" + modelDataset.getAttribute( node.getDecompositionAttribute() ).
            value(node.getDecompositionValue()) + "\" ) then \n";
        String deeper = tab + "\t";

        rules.append( condition );
        rules.append( tab + "{\n" );
        rules.append( writeTree( node.getChildren()[0], deeper ) );
        rules.append( tab + "}\n" );
        rules.append( tab + "else\n" );
        rules.append( tab + "{\n" );
        rules.append( writeTree( node.getChildren()[1], deeper ) );
        rules.append( tab + "}\n" );
        NumberOfNodes++;
    }
    catch( Exception e )
    {
        System.out.println( "Error writing tree" );
    }

    return rules.toString();
}
/**
 * Predicts the class of an itemset by walking the tree from the given node.
 *
 * <p>At every split node the itemset follows the first child when its value for the
 * decomposition attribute equals the decomposition value, and the second child
 * otherwise. At a leaf the prediction is the most common class among the itemsets
 * stored in that leaf, which is the same label writeTree() prints for it. (The
 * original returned the itemset's own class for leaves holding several classes,
 * which made such cases always count as correctly classified.)</p>
 *
 * <p>If a leaf holds no itemsets there is nothing to derive a prediction from; in
 * that case the class value of the itemset itself is returned, as before.</p>
 *
 * @param itemset The itemset to evaluate.
 * @param node The node that is evaluated at this time.
 *
 * @return The index of the predicted class.
 */
public int evaluateItemset( Itemset itemset, Node node )
{
    // Descend until a leaf (a node without children) is reached.
    Node current = node;
    while ( current.getChildren() != null )
    {
        if ( itemset.getValue( current.getDecompositionAttribute() ) == current.getDecompositionValue() )
            current = current.getChildren()[0];
        else
            current = current.getChildren()[1];
    }

    Vector leafData = current.getData();

    // Degenerate leaf without any itemsets: keep the original fallback.
    if ( leafData == null || leafData.isEmpty() )
        return (int)itemset.getClassValue();

    // Majority class of the leaf, translated back to its index in the class attribute.
    String predicted = getCommonClass( leafData, modelDataset.getClassIndex() );
    return modelDataset.getClassAttribute().valueIndex( predicted );
}
/**
 * Collects the distinct values that an attribute takes in a set of itemsets.
 *
 * <p>The values are returned as indexes into the attribute's value list, in the
 * order in which they first appear in the data.</p>
 *
 * @param data All the itemsets.
 * @param attribute Index of the attribute to inspect.
 *
 * @return The indexes of the distinct values found for the attribute.
 */
public int []getAllValues( Vector data, int attribute )
{
    // First pass: gather the distinct value names in order of appearance.
    Vector distinct = new Vector();
    for ( Enumeration items = data.elements(); items.hasMoreElements(); )
    {
        Itemset current = (Itemset) items.nextElement();
        String symbol = modelDataset.getAttribute( attribute ).value( (int)current.getValue( attribute ) );
        if ( !distinct.contains( symbol ) )
            distinct.addElement( symbol );
    }

    // Second pass: translate every name back into its value index.
    int []indexes = new int[distinct.size()];
    int position = 0;
    for ( Enumeration names = distinct.elements(); names.hasMoreElements(); position++ )
    {
        String symbol = (String)names.nextElement();
        indexes[position] = modelDataset.getAttribute( attribute ).valueIndex( symbol );
    }

    return indexes;
}
/**
 * Returns the most common value of an attribute among a set of itemsets.
 *
 * <p>When several values are equally frequent, the one that appears first in the
 * data wins. The counter is sized from the data, so any number of distinct values
 * is supported (the original used a fixed array of 20 entries).</p>
 *
 * @param data All the itemsets; must not be empty.
 * @param attribute Index of attribute.
 *
 * @return The most common class.
 *
 * @throws ArrayIndexOutOfBoundsException If data is empty.
 */
public String getCommonClass( Vector data, int attribute )
{
    int num = data.size();
    Vector values = new Vector();
    // There can never be more distinct values than itemsets.
    int []counter = new int[num];

    for ( int i = 0; i < num; i++ )
    {
        Itemset current = (Itemset)data.elementAt( i );
        String symbol = modelDataset.getAttribute( attribute ).value( (int)current.getValue( attribute ) );
        int index = values.indexOf( symbol );

        if ( index < 0 )
        {
            index = values.size();
            values.addElement( symbol );
        }

        counter[index]++;
    }

    // Strict comparison keeps the earliest value on ties.
    int bestIndex = 0;
    for ( int i = 1; i < values.size(); i++ )
        if ( counter[i] > counter[bestIndex] )
            bestIndex = i;

    return (String)values.elementAt( bestIndex );
}
/**
 * Selects the itemsets whose value for an attribute equals a given value.
 *
 * @param data The itemsets from which the subset is extracted.
 * @param attribute The attribute to make the division.
 * @param value The value of the attribute.
 *
 * @return All the itemsets in data that have the given value for the given attribute,
 *         in their original order.
 */
public Vector getSubset( Vector data, int attribute, int value )
{
    Vector matching = new Vector();

    for ( Enumeration items = data.elements(); items.hasMoreElements(); )
    {
        Itemset candidate = (Itemset)items.nextElement();
        if ( candidate.getValue( attribute ) != value )
            continue;
        matching.addElement( candidate );
    }

    return matching;
}
/**
 * Returns the itemsets of data that do not appear in another set.
 *
 * @param data The itemsets from which the subset is extracted.
 * @param oldset The set whose members are to be left out.
 *
 * @return All the itemsets that are contained in data but are not in oldset,
 *         in their original order.
 */
public Vector getComplement( Vector data, Vector oldset )
{
    Vector remaining = new Vector();

    for ( Enumeration items = data.elements(); items.hasMoreElements(); )
    {
        Itemset candidate = (Itemset)items.nextElement();
        if ( !oldset.contains( candidate ) )
            remaining.addElement( candidate );
    }

    return remaining;
}
/**
 * Computes the entropy of the class attribute over a set of itemsets.
 *
 * <p>The natural logarithm is used. An empty set has entropy 0.</p>
 *
 * @param data The set of itemsets whose entropy is wanted.
 *
 * @return The entropy of data.
 */
public double computeEntropy( Vector data )
{
    int total = data.size();
    if ( total == 0 )
        return 0;

    int classIndex = modelDataset.getClassIndex();
    int classCount = modelDataset.getClassAttribute().numValues();
    double entropy = 0;

    for ( int cls = 0; cls < classCount; cls++ )
    {
        // Number of itemsets that belong to the class being examined.
        int matches = 0;
        for ( Enumeration items = data.elements(); items.hasMoreElements(); )
        {
            Itemset current = (Itemset)items.nextElement();
            if ( current.getValue( classIndex ) == cls )
                matches++;
        }

        // Classes that do not occur contribute nothing.
        if ( matches <= 0 )
            continue;

        double probability = 1. * matches / total;
        entropy += -probability * Math.log( probability );
    }

    return entropy;
}
/**
 * Checks whether an attribute/value pair was already used for a split by the given
 * node or by any of its ancestors.
 *
 * <p>Only nodes that have children are considered to carry a split.</p>
 *
 * @param node The node where the search starts.
 * @param attribute The attribute to check.
 * @param value The value to check.
 *
 * @return True if the attribute and the value are already used to decompose,
 *         or false otherwise.
 */
public boolean alreadyUsedToDecompose( Node node, int attribute, int value )
{
    // Walk up the parent chain instead of recursing.
    Node current = node;
    while ( true )
    {
        boolean hasSplit = current.getChildren() != null;
        if ( hasSplit
             && current.getDecompositionAttribute() == attribute
             && current.getDecompositionValue() == value )
            return true;

        Node parent = current.getParent();
        if ( parent == null )
            return false;

        current = parent;
    }
}
/**
 * Splits a node recursively.
 *
 * <p>The entropy of the node is stored in it first; a node with entropy 0 is left
 * as a leaf. Otherwise every unused attribute/value pair is tried as a binary test
 * (itemsets with that value versus the rest) and the pair giving the lowest
 * weighted entropy, strictly below the node's own, is chosen. If no pair improves
 * the entropy the node stays a leaf. After splitting, both children are decomposed
 * and the node's own data is released.</p>
 *
 * @param node The node to decompose.
 */
public void decomposeNode( Node node )
{
    int size = node.getData().size();
    // NOTE(review): the last attribute index is never examined; this looks like it
    // assumes the class attribute is the last one - confirm against Dataset.
    int attributeLimit = modelDataset.numAttributes() - 1;

    node.setEntropy( computeEntropy( node.getData() ) );
    double lowestEntropy = node.getEntropy();

    // A pure node needs no further splitting.
    if ( node.getEntropy() == 0 )
        return;

    boolean found = false;
    int splitAttribute = 0;
    int splitValue = 0;

    // Search for the attribute/value pair that causes the maximum decrease in entropy.
    for ( int att = 0; att < attributeLimit; att++ )
    {
        if ( att == modelDataset.getClassIndex() )
            continue;

        int valueCount = modelDataset.getAttribute(att).numValues();

        for ( int val = 0; val < valueCount; val++ )
        {
            if ( alreadyUsedToDecompose( node, att, val ) )
                continue;

            Vector matching = getSubset( node.getData(), att, val );
            if ( matching.size() == 0 )
                continue;

            Vector rest = getComplement( node.getData(), matching );
            double matchingEntropy = computeEntropy( matching );
            double restEntropy = computeEntropy( rest );
            double weighted = ( matchingEntropy * matching.size() + restEntropy * rest.size() ) / size;

            if ( weighted < lowestEntropy )
            {
                found = true;
                lowestEntropy = weighted;
                splitAttribute = att;
                splitValue = val;
            }
        }
    }

    if ( !found )
        return;

    // Record the chosen test and create the two children.
    node.setDecompositionAttribute( splitAttribute );
    node.setDecompositionValue( splitValue );
    node.setChildren( new Node [2] );

    node.addChildren( new Node() );
    Node matchChild = node.getChildren( 0 );
    matchChild.setParent( node );
    matchChild.setData( getSubset( node.getData(), splitAttribute, splitValue ) );

    node.getChildren()[1] = new Node();
    Node restChild = node.getChildren( 1 );
    restChild.setParent( node );

    // Every itemset that did not go to the first child is added to the second one.
    for ( int j = 0; j < size; j++ )
    {
        Itemset current = (Itemset)node.getData().elementAt( j );
        if ( !matchChild.getData().contains( current ) )
            restChild.getData().addElement( current );
    }

    decomposeNode( node.getChildren()[0] );
    decomposeNode( node.getChildren()[1] );

    // The original vector is no longer needed once the children own the data.
    node.setData( null );
}
/**
 * Copies every itemset of the model dataset into a new vector.
 *
 * @return The itemsets, in dataset order.
 */
private Vector getItemsets()
{
    Vector all = new Vector( modelDataset.numItemsets());
    int position = 0;

    while ( position < modelDataset.numItemsets() )
    {
        all.addElement( modelDataset.itemset( position ) );
        position++;
    }

    return all;
}
/**
 * Writes the tree and the results of the training and the test in the result file.
 *
 * <p>The file contains the header, the tree in rule form, the node and leaf
 * counters gathered while printing the tree, the classification statistics of the
 * training and test sets, and the elapsed time as hours:minutes:seconds. (The
 * original divided the minutes by 60 once more, so the minutes field was always 0.)</p>
 *
 * @throws IOException If the file cannot be written.
 */
public void printResult() throws IOException
{
    // Elapsed time in whole seconds, broken down into its components.
    long totalTime = ( System.currentTimeMillis() - startTime ) / 1000;
    long hours = totalTime / 3600;
    long minutes = ( totalTime % 3600 ) / 60;
    long seconds = totalTime % 60;

    StringBuilder tree = new StringBuilder();

    // writeTree() must run before the counters are read: it is what updates them.
    tree.append( writeTree( root, "" ) );
    tree.append( "\n@TotalNumberOfNodes " ).append( NumberOfNodes );
    tree.append( "\n@NumberOfLeafs " ).append( NumberOfLeafs );

    tree.append( "\n\n@NumberOfItemsetsTraining " ).append( trainDataset.numItemsets() );
    tree.append( "\n@NumberOfCorrectlyClassifiedTraining " ).append( correct );
    tree.append( "\n@PercentageOfCorrectlyClassifiedTraining " )
        .append( (float)(correct*100.0)/(float)trainDataset.numItemsets() ).append( "%" );
    tree.append( "\n@NumberOfInCorrectlyClassifiedTraining " ).append( trainDataset.numItemsets()-correct );
    tree.append( "\n@PercentageOfInCorrectlyClassifiedTraining " )
        .append( (float)((trainDataset.numItemsets()-correct)*100.0)/(float)trainDataset.numItemsets() ).append( "%" );

    tree.append( "\n\n@NumberOfItemsetsTest " ).append( testDataset.numItemsets() );
    tree.append( "\n@NumberOfCorrectlyClassifiedTest " ).append( testCorrect );
    tree.append( "\n@PercentageOfCorrectlyClassifiedTest " )
        .append( (float)(testCorrect*100.0)/(float)testDataset.numItemsets() ).append( "%" );
    tree.append( "\n@NumberOfInCorrectlyClassifiedTest " ).append( testDataset.numItemsets()-testCorrect );
    tree.append( "\n@PercentageOfInCorrectlyClassifiedTest " )
        .append( (float)((testDataset.numItemsets()-testCorrect)*100.0)/(float)testDataset.numItemsets() ).append( "%" );

    tree.append( "\n\n@ElapsedTime " ).append( hours ).append( ":" ).append( minutes ).append( ":" ).append( seconds );

    // The writer is closed even if printing fails.
    PrintWriter resultPrint = new PrintWriter( new FileWriter ( resultFileName ) );
    try
    {
        resultPrint.print( getHeader() + "\n@decisiontree\n\n" + tree );
    }
    finally
    {
        resultPrint.close();
    }
}
/**
 * Evaluates the training dataset and writes the results in the training output file.
 *
 * <p>Each itemset produces one line with its actual class followed by the predicted
 * class, the same column order printTest() uses (the original wrote the two columns
 * the other way round). The counter of correctly classified training itemsets is
 * updated along the way. An itemset that cannot be evaluated is reported on
 * standard error and skipped; a failure to write the file is reported on standard
 * error as well.</p>
 */
public void printTrain()
{
    // A builder avoids re-copying the whole text for every itemset.
    StringBuilder text = new StringBuilder().append( getHeader() );

    for ( int i = 0; i < trainDataset.numItemsets(); i++ )
    {
        try
        {
            Itemset itemset = trainDataset.itemset( i );
            int cl = evaluateItemset( itemset, root );

            if ( cl == (int) itemset.getValue( trainDataset.getClassIndex() ) )
                correct++;

            // The line is built completely before it is appended, so a failure
            // never leaves half a line in the output.
            String line = trainDataset.getClassAttribute().value( ( (int) itemset.getClassValue()) ) + " " +
                trainDataset.getClassAttribute().value( cl ) + "\n";
            text.append( line );
        }
        catch ( Exception e )
        {
            System.err.println( e.getMessage() );
        }
    }

    try
    {
        PrintWriter print = new PrintWriter( new FileWriter ( trainOutputFileName ) );
        try
        {
            print.print( text.toString() );
        }
        finally
        {
            print.close();
        }
    }
    catch ( IOException e )
    {
        System.err.println( "Can not open the training output file: " + e.getMessage() );
    }
}
/**
 * Evaluates the test dataset and writes the results in the test output file.
 *
 * <p>Each itemset produces one line with its actual class followed by the predicted
 * class, and the counter of correctly classified test itemsets is updated along the
 * way. An itemset that cannot be evaluated is reported on standard error and
 * skipped; a failure to write the file is reported on standard error as well.</p>
 */
public void printTest()
{
    // A builder avoids re-copying the whole text for every itemset.
    StringBuilder text = new StringBuilder().append( getHeader() );

    for ( int i = 0; i < testDataset.numItemsets(); i++)
    {
        try
        {
            Itemset itemset = testDataset.itemset( i );
            int cl = evaluateItemset( itemset, root );

            if ( cl == (int) itemset.getValue( testDataset.getClassIndex() ) )
                testCorrect++;

            // The line is built completely before it is appended, so a failure
            // never leaves half a line in the output.
            String line = testDataset.getClassAttribute().value( ( (int) itemset.getClassValue()) ) + " " +
                testDataset.getClassAttribute().value( cl )+ "\n";
            text.append( line );
        }
        catch ( Exception e )
        {
            System.err.println( e.getMessage());
        }
    }

    try
    {
        PrintWriter print = new PrintWriter( new FileWriter ( testOutputFileName ) );
        try
        {
            print.print( text.toString() );
        }
        finally
        {
            print.close();
        }
    }
    catch ( IOException e )
    {
        // The original message wrongly referred to the training file.
        System.err.println( "Can not open the test output file: " + e.getMessage() );
    }
}
/**
 * Command-line entry point: runs ID3 with the given parameters file.
 *
 * <p>Exactly one argument is expected; otherwise a usage message is printed on
 * standard error and the JVM exits with status -1.</p>
 *
 * @param args The parameters file.
 */
public static void main(String[] args) {
    if ( args.length == 1 ) {
        // The constructor performs the whole run.
        new ID3( args[0] );
        return;
    }

    System.err.println("\nError: you have to specify the parameters file\n\tusage: java -jar ID3.jar parameterfile.txt" );
    System.exit(-1);
}
}//id3