/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Rule_Learning.DataSqueezer;
import java.io.*;
import java.lang.reflect.Array;
import java.util.*;
import keel.Dataset.Attributes;
/**
A Java implementation of the DataSqueezer algorithm
@author Francisco Charte Ojeda
@version 1.0 (20-01-10)
*/
public class DataSqueezer extends Algorithm {
/** Inner class representing a single classification rule: a conjunction of
 * (attribute = value) selectors that predicts one class.
 *
 * Francisco Charte - 17-ene-2010
 */
private class Rule {
    int classId;               // Class predicted by this rule
    Vector<Integer> attribute; // Attribute indexes used by the rule's conditions
    Vector<Integer> value;     // Nominal value required for each attribute
    int weight;                // Accumulated weight (itemsets described by the rule)

    /** Constructor: creates an empty rule for the given class.
     * Francisco Charte - 18-ene-2010
     *
     * @param C Class index predicted by the rule
     */
    public Rule(int C) {
        classId = C;
        attribute = new Vector<Integer>();
        value = new Vector<Integer>();
        weight = 0;
    }

    /** Adds a new (attribute = value) condition to this rule.
     * Francisco Charte - 18-ene-2010
     *
     * @param att Attribute index
     * @param val Value index for that attribute
     * @param w   Raw weight of the selector (scaled down by the attribute arity)
     */
    public void addCondition(int att, int val, int w) {
        attribute.add(att);
        value.add(val);
        // Accumulate the number of itemsets described by the rule;
        // integer division undoes the scaling applied by the caller
        weight += w / modelDataset.getAttribute(att).numValues();
    }

    /** Checks whether an itemset satisfies every condition of this rule.
     * A missing value fails the condition.
     * Francisco Charte - 18-ene-2010
     *
     * @param i Itemset to check
     * @return true if the itemset meets all the conditions
     */
    public boolean check(Itemset i) {
        int conditions = attribute.size();
        for (int c = 0; c < conditions; c++) {
            int att = attribute.get(c);
            if (i.isMissing(att) || (int) i.getValue(att) != value.get(c)) {
                return false;
            }
        }
        return true;
    }

    /** Variant of check used against the generalized table: a missing value
     * is treated as a wildcard and therefore matches any condition.
     *
     * @param i Itemset to check
     * @return true if no non-missing value contradicts a condition
     */
    public boolean check2(Itemset i) {
        for (int c = attribute.size() - 1; c >= 0; c--) {
            int att = attribute.get(c);
            if (!i.isMissing(att) && (int) i.getValue(att) != value.get(c)) {
                return false;
            }
        }
        return true;
    }

    /** Builds the human-readable IF ... THEN ... representation of the rule.
     * Francisco Charte - 18-ene-2010
     *
     * @return String representation, newline-terminated
     */
    public String generateRule() {
        StringBuilder text = new StringBuilder("IF ");
        for (int c = 0; c < attribute.size(); c++) {
            if (c > 0) {
                text.append("AND ");
            }
            text.append(modelDataset.getAttribute(attribute.get(c)).name());
            text.append("=")
                .append(modelDataset.getAttribute(attribute.get(c)).value(value.get(c)))
                .append(" ");
        }
        text.append(" THEN ").append(modelDataset.getClassAttribute().name())
            .append("=").append(modelDataset.getClassAttribute().value(classId))
            .append("\n");
        return text.toString();
    }

    /** Returns true if the rule has no selectors.
     *
     * @return true when the condition list is empty
     */
    public boolean isEmpty() {
        return attribute.isEmpty();
    }

    /** Returns the class index predicted by this rule.
     *
     * @return Class index
     */
    public int getClassId() {
        return classId;
    }

    /** Returns the accumulated weight of this rule.
     *
     * @return weight value
     */
    public int getWeight() {
        return weight;
    }

    /** Returns the number of selectors (conditions) in the rule.
     *
     * @return selector count
     */
    public int numberOfSelectors() {
        return attribute.size();
    }
}
// Francisco Charte - 16-ene-2010
Vector<Integer> [] classList; // One list per class holding the model-dataset row indexes of that class
Vector<Itemset> POS, NEG; // Tables of positive and negative examples for the class being processed
Vector<Itemset> Gpos, Gneg; // POS and NEG after the dataReduce generalization step
/** Array containing all the rules generated for every class */
Vector<Rule> allRules;
/** Temporal array holding the rules of the class currently being processed */
Vector<Rule> rules;
/** Number of training items not classified by the algorithm */
int notClassified;
/** Number of test items not classified by the algorithm */
int testNotClassified;
/** Pruning threshold read from the parameters file */
double pruneT;
/** Generalization threshold read from the parameters file */
double generalizationT;
/** Constructor. Reads the parameters file, loads the datasets, runs the
 * DataSqueezer algorithm and writes the training/test/result output files.
 *
 * @param paramFile The parameters file.
 *
 */
public DataSqueezer(String paramFile) {
    // Raised when the dataset cannot be handled (non-nominal attributes present)
    boolean salir = false;
    try {
        // starts the time
        startTime = System.currentTimeMillis();
        // Sets the options of the execution.
        StreamTokenizer tokenizer = new StreamTokenizer( new BufferedReader( new FileReader( paramFile ) ) );
        initTokenizer( tokenizer) ;
        setOptions( tokenizer );
        // Initializes the dataset used to build the model.
        modelDataset = new Dataset( modelFileName, true );
        /* Check if there are continuous attributes:
           DataSqueezer only works with nominal ones */
        if(Attributes.hasRealAttributes() || Attributes.hasIntegerAttributes())
        {
            System.err.println("DataSqueezer can only handle nominal attributes." );
            //System.exit(-1);
            salir = true;
        }
        if (!salir){
            trainDataset = new Dataset( trainFileName, false );
            testDataset = new Dataset( testFileName, false );
            notClassified = 0;
            testNotClassified = 0;
            // Executes the algorithm.
            // Francisco Charte - 16-ene-2010
            generateRules();
            // Prints the results generated by the algorithm.
            printTrain();
            printTest();
            printResult();
        }
    }
    catch ( Exception e )
    {
        // Any failure (I/O, parameter parsing, dataset errors) aborts the program
        e.printStackTrace();
        System.err.println( e.getMessage() );
        System.exit(-1);
    }
}
/** Function to read the options from the execution file and assign the values to the parameters.
 *
 * The parsing is strictly positional: runs of bare nextToken() calls skip
 * over fixed keyword/separator tokens of the KEEL parameter-file layout.
 *
 * @param options The StreamTokenizer that reads the parameters file.
 *
 * @throws Exception If the format of the file is not correct.
 */
protected void setOptions( StreamTokenizer options ) throws Exception {
    options.nextToken();
    // Checks that the file starts with the token algorithm.
    if ( options.sval.equalsIgnoreCase( "algorithm" ) )
    {
        // Skip the algorithm name and separators up to the next keyword
        options.nextToken();
        options.nextToken();
        options.nextToken();
        options.nextToken();
        // Reads the names of the input files.
        if ( options.sval.equalsIgnoreCase( "inputData" ) )
        {
            options.nextToken();
            options.nextToken();
            modelFileName = options.sval;
            // Optional train/test file names on the same line
            if ( options.nextToken() != StreamTokenizer.TT_EOL )
            {
                trainFileName = options.sval;
                options.nextToken();
                testFileName = options.sval;
                // NOTE(review): when extra tokens remain, the train file is
                // reset to the model file — presumably the KEEL convention for
                // the 4-file input format; confirm against other KEEL methods.
                if( options.nextToken() != StreamTokenizer.TT_EOL )
                {
                    trainFileName = modelFileName;
                    options.nextToken();
                }
            }
        }
        else
            throw new Exception( "The file must start with the word inputData." );
        // Scan forward until the outputData section (or fail at EOF)
        while ( true )
        {
            if( options.nextToken() == StreamTokenizer.TT_EOF )
                throw new Exception( "No output file provided." );
            if ( options.sval == null )
                continue;
            else if ( options.sval.equalsIgnoreCase( "outputData" ) )
                break;
        }
        /* Reads the names of the output files*/
        options.nextToken();
        options.nextToken();
        trainOutputFileName = options.sval;
        options.nextToken();
        testOutputFileName = options.sval;
        options.nextToken();
        resultFileName = options.sval;
        // Skip fixed tokens preceding the pruning threshold value
        options.nextToken();
        options.nextToken();
        options.nextToken();
        options.nextToken();
        options.nextToken();
        options.nextToken();
        pruneT = Double.parseDouble(options.sval);
        // Skip fixed tokens preceding the generalization threshold value
        options.nextToken();
        options.nextToken();
        options.nextToken();
        options.nextToken();
        options.nextToken();
        generalizationT = Double.parseDouble(options.sval);
    }
    else
        throw new Exception( "The file must start with the word algorithm followed of the name of the algorithm." );
}
/** Runs the DataSqueezer algorithm: builds, for every class, the rule set
 * that separates it (as positive) from the remaining classes.
 *
 * Francisco Charte - 16-ene-2010
 */
public void generateRules() {
    // Group the dataset row indexes by class
    generateClassLists();
    // Start with an empty global rule list
    allRules = new Vector<Rule>();
    // One-vs-rest pass over every class of the dataset
    for (int classIdx = 0; classIdx < modelDataset.numClasses(); classIdx++) {
        // Build POS/NEG treating classIdx as the positive class
        obtainPosNegTables(classIdx);
        if (!POS.isEmpty()) {
            // Induce the rules for this class and keep them
            dataSqueezer(classIdx);
            saveRules();
        }
    }
}
/** Groups the rows of the model dataset by class.
 *
 * Fills classList so that classList[c] holds the indexes of every itemset
 * whose class value is c.
 *
 * Francisco Charte - 16-ene-2010
 */
@SuppressWarnings("unchecked")
private void generateClassLists() {
    // A plain array-creation expression replaces the reflective
    // Array.newInstance call of the original: reflection is unnecessary here.
    // The unchecked cast is unavoidable because Java cannot create generic
    // arrays directly.
    classList = (Vector<Integer>[]) new Vector[modelDataset.numClasses()];
    // Create the empty list for every class
    for (int index = 0; index < modelDataset.numClasses(); index++) {
        classList[index] = new Vector<Integer>();
    }
    // Iterate over the training dataset rows
    for (int index = 0; index < modelDataset.numItemsets(); index++) {
        Itemset sample = modelDataset.itemset(index);
        // Add the row index to the list of its class
        classList[(int) sample.getClassValue()].add(index);
    }
}
/** Generates the POS and NEG tables for the given positive class.
 * Francisco Charte - 16-ene-2010
 *
 * @param classIndex Index of the positive class
 */
private void obtainPosNegTables(int classIndex) {
    // Fresh tables for the new partition
    POS = new Vector<Itemset>();
    NEG = new Vector<Itemset>();
    // Copies of the positive-class samples go into POS
    for (Integer row : classList[classIndex]) {
        POS.add((Itemset) modelDataset.itemset(row).copy());
    }
    // Copies of every other class's samples go into NEG
    for (int c = 0; c < classList.length; c++) {
        if (c != classIndex) {
            for (Integer row : classList[c]) {
                NEG.add((Itemset) modelDataset.itemset(row).copy());
            }
        }
    }
}
/** Processes the data stored in the POS and NEG tables, generating the
 * rules that describe the positive class. The result is left in 'rules'.
 *
 * Francisco Charte - 17/18-ene-2010
 *
 * @param C Index of the class treated as positive
 */
private void dataSqueezer(int C) {
    // k doubles as the index of the extra "count" column added by dataReduce
    int k = modelDataset.numAttributes();
    // Generalize the data tables
    Gpos = dataReduce(POS, k);
    Gneg = dataReduce(NEG, k);
    // Initialize the rule list
    rules = new Vector<Rule>();
    int i = 0; // Rule index
    boolean notChange;
    // Selectors covering fewer positives than this are pruned away
    long pruneNumber = Math.round(pruneT*POS.size());
    do {
        // Generate the list of candidate columns (all attributes but the class)
        Vector<Integer> LIST = new Vector<Integer>();
        for(int col = 0; col < modelDataset.numAttributes(); col++) {
            if(col != modelDataset.getClassIndex())
                LIST.add(col);
        }
        // Add new empty rule
        rules.add(new Rule(C));
        notChange = true; // Gpos not changed
        boolean prune = false;
        do {
            int maxWeight = 0, maxJ = -1, maxA = -1;
            int Saj;
            // Within every column of Gpos that is on LIST
            for(int j = 0; j < LIST.size(); j++) {
                // For every non missing value from this column
                for(int a = 0; a < Gpos.size(); a++) {
                    Saj = 0;
                    if(Gpos.get(a).isMissing(LIST.get(j)))
                        continue;
                    // Reference value
                    double value = Gpos.get(a).getValue(LIST.get(j));
                    // Sum the multiplicity (column k) of every row with this value
                    for(int index = 0; index < Gpos.size(); index++)
                        if(Gpos.get(index).getValue(LIST.get(j)) == value)
                            Saj += (int ) Gpos.get(index).getValue(k);
                    // Scale by the number of valid values for this attribute
                    Saj *= Gpos.get(a).getAttribute(LIST.get(j)).numValues();
                    // Keep the indexes of max weight
                    if(Saj > maxWeight) {
                        maxA = a;
                        maxJ = j;
                        maxWeight = Saj;
                    }
                } // for a
                // All the values in Gpos for attribute j are * (missing):
                // drop the column (remove by INDEX, not by value) and restart
                if(maxA == -1) {
                    LIST.remove(j);
                    break;
                }
            } // for j
            // Add the best "attribute = value" selector to rules[i]
            if(maxA != -1 && maxJ != -1) {
                // Undo the arity scaling before comparing with the prune threshold
                if ((maxWeight / Gpos.get(maxA).getAttribute(LIST.get(maxJ)).numValues()) > pruneNumber) {
                    rules.get(i).addCondition(
                        LIST.get(maxJ),
                        (int )Gpos.get(maxA).getValue(LIST.get(maxJ)), maxWeight);
                    // Remove column maxJ from LIST (again by index)
                    LIST.remove(maxJ);
                }
                else {
                    prune = true;
                }
            }
            // Keep adding selectors while the rule still covers too much of Gneg
        } while(rulesDescribeNeg(i,k) && !LIST.isEmpty() && !prune);
        Rule l = rules.get(i); // Current rule
        if(!l.isEmpty()) {
            // Remove all rows described by rules[i] from Gpos
            Enumeration e = Gpos.elements();
            while(e.hasMoreElements()) {
                Itemset r = (Itemset )e.nextElement();
                if(l.check2(r)) {
                    Gpos.remove(r);
                    // Force re-exploration from the start: the Enumeration is
                    // invalidated by the removal (this was a real headache)
                    e = Gpos.elements();
                    notChange = false; // Gpos has changed
                }
            }
            i++; // Index of new rule
        } else {
            rules.remove(i); // Remove empty rule
        }
        // Stop when every positive row is covered or no progress was made
    } while(!Gpos.isEmpty() && !notChange);
    // rules contain the rules to classify class C
    return;
} // dataSqueezer
/** Checks whether rule i still describes too many rows of Gneg.
 * Francisco Charte - 18-ene-2010
 *
 * @param i Index of current rule in the rules vector
 * @param k Index of the multiplicity column added by dataReduce
 * @return true while the covered negative weight exceeds the
 *         generalization threshold
 */
private boolean rulesDescribeNeg(int i, int k) {
    int covered = 0;
    Rule current = rules.get(i);
    // Accumulate the multiplicity of every negative row matched by the rule
    for (Itemset negRow : Gneg) {
        if (current.check(negRow)) {
            covered += negRow.getValue(k);
        }
    }
    return covered > Math.round(generalizationT * POS.size());
}
/** Applies generalization to a data table as described by Kurgan:
 * consecutive rows are merged while they still share at least two
 * non-missing attribute values; merged rows keep a multiplicity count
 * in the extra column k.
 *
 * Francisco Charte - 17-ene-2010
 *
 * @param D D=POS or D=NEG
 * @param k Number of attributes (also the index of the count column)
 * @return Gpos or Gneg, the generalized table
 */
private Vector<Itemset> dataReduce(Vector<Itemset> D, int k) {
    Vector<Itemset> G = new Vector<Itemset>(); // G = []
    // Robustness fix: an empty table reduces to an empty result.
    // The original code threw ArrayIndexOutOfBoundsException on D.get(0)
    // when NEG was empty (e.g. a single-class dataset).
    if (D.isEmpty()) {
        return G;
    }
    // Init parameters
    int i = 0;
    Itemset tmp = (Itemset) D.get(0).copy();
    G.add((Itemset) D.get(0).copy());
    G.get(0).setValue(k, 1); // multiplicity of the first prototype row
    for (int j = 1; j < D.size(); j++) {
        for (int kk = 0; kk < k; kk++) {
            if (kk == modelDataset.getClassIndex()) continue;
            // Mark as missing ('do not care') every attribute where row j
            // disagrees with the running prototype
            if (D.get(j).getValue(kk) != tmp.getValue(kk) || D.get(j).isMissing(kk)) {
                tmp.setMissing(kk);
            }
        } // for kk
        if (numberOfNonMissingValues(tmp, k) >= 2) {
            // Still enough agreement: fold row j into the current prototype
            // and bump its multiplicity
            for (int index = 0; index < k; index++)
                G.get(i).setValue(index, tmp.getValue(index));
            G.get(i).setValue(k, G.get(i).getValue(k) + 1);
        } else {
            // Too generalized: start a new prototype from row j
            i++;
            tmp = (Itemset) D.get(j).copy();
            G.add((Itemset) D.get(j).copy());
            G.get(i).setValue(k, 1);
        }
    } // for j
    return G;
}
/** Accumulates into allRules the rules generated for the current class.
 * Francisco Charte - 17-ene-2010
 */
private void saveRules() {
    // addAll replaces the original element-by-element copy loop
    allRules.addAll(rules);
}
/** Counts the non-missing attribute values of an Itemset, excluding the
 * class attribute.
 * Francisco Charte - 17-ene-2010
 *
 * @param t Itemset to inspect
 * @param k Number of attributes
 * @return Number of non missing values
 */
private int numberOfNonMissingValues(Itemset t, int k) {
    int nonMissing = 0;
    for (int att = 0; att < k; att++) {
        // The class attribute never counts
        if (att != modelDataset.getClassIndex() && !t.isMissing(att)) {
            nonMissing++;
        }
    }
    return nonMissing;
}
/** Builds the textual list of all the generated rules.
 * Francisco Charte - 19-ene-2010
 *
 * @return String with the list of rules, one per line
 */
public String writeRules() {
    // StringBuilder avoids the O(n^2) cost of repeated String concatenation
    StringBuilder ruleList = new StringBuilder();
    for (int index = 0; index < allRules.size(); index++) {
        ruleList.append(allRules.get(index).generateRule());
    }
    return ruleList.toString();
}
/** Evaluates the class an itemset is assigned according to the rule list:
 * among the rules the itemset satisfies, the one with the highest weight
 * wins (ties keep the earliest rule).
 * Francisco Charte - 19-ene-2010
 *
 * @param i The itemset to evaluate.
 *
 * @return The predicted class index, or -1 if no rule describes the itemset.
 */
public int evaluateItemset(Itemset i) {
    int predicted = -1;
    int bestWeight = 0;
    for (Rule candidate : allRules) {
        // A strictly greater weight is required to replace the current best
        if (candidate.check(i) && candidate.getWeight() > bestWeight) {
            predicted = candidate.getClassId();
            bestWeight = candidate.getWeight();
        }
    }
    return predicted;
}
/** Writes the rules and the statistics of the training and the test runs
 * into the result file (rule list, counts/percentages, elapsed time and a
 * final CSV summary line).
 * Francisco Charte - 19-ene-2010
 *
 * @throws IOException If the file cannot be written.
 */
public void printResult() throws IOException
{
    long totalTime = ( System.currentTimeMillis() - startTime ) / 1000;
    long seconds = totalTime % 60;
    long minutes = ( ( totalTime - seconds ) % 3600 ) / 60;
    String result = "";
    PrintWriter resultPrint;
    double numberOfRules = allRules.size(), numberOfSelectors = 0;
    // Total selector count over every generated rule
    for(int index = 0; index < allRules.size(); index++)
        numberOfSelectors += allRules.get(index).numberOfSelectors();
    result += writeRules();
    result += "\n\n@NumberOfRules " + numberOfRules;
    result += "\n@TotalNumberOfSelectors " + numberOfSelectors;
    result += "\n@MeanNumberOfSelectorsPerRule " + numberOfSelectors / numberOfRules;
    result += "\n\n@NumberOfItemsetsTraining " + trainDataset.numItemsets();
    result += "\n@NumberOfCorrectlyClassifiedTraining " + correct;
    result += "\n@PercentageOfCorrectlyClassifiedTraining " + (float)(correct*100.0)/(float)trainDataset.numItemsets() + "%" ;
    result += "\n@NumberOfItemsNotClassifiedTraining " + notClassified;
    result += "\n@PercentageOfItemsNotClassifiedTraining " + (float)(notClassified*100.0)/(float)trainDataset.numItemsets() + "%" ;
    result += "\n@NumberOfInCorrectlyClassifiedTraining " + (trainDataset.numItemsets()-correct-notClassified);
    result += "\n@PercentageOfInCorrectlyClassifiedTraining " + (float)((trainDataset.numItemsets()-correct-notClassified)*100.0)/(float)trainDataset.numItemsets() + "%" ;
    result += "\n\n@NumberOfItemsetsTest " + testDataset.numItemsets();
    result += "\n@NumberOfCorrectlyClassifiedTest " + testCorrect;
    result += "\n@PercentageOfCorrectlyClassifiedTest " + (float)(testCorrect*100.0)/(float)testDataset.numItemsets() + "%" ;
    result += "\n@ItemsNotClassifiedTest " + testNotClassified;
    result += "\n@PercentageOfItemsNotClassifiedTest " + (float)(testNotClassified*100.0)/(float)testDataset.numItemsets() + "%" ;
    result += "\n@NumberOfInCorrectlyClassifiedTest " + (testDataset.numItemsets()-testCorrect-testNotClassified);
    result += "\n@PercentageOfInCorrectlyClassifiedTest " + (float)((testDataset.numItemsets()-testCorrect-testNotClassified)*100.0)/(float)testDataset.numItemsets() + "%" ;
    // Bug fix: the original printed "minutes / 60" (always 0, since
    // minutes < 60) in the middle field; the format is hours:minutes:seconds
    result += "\n\n@ElapsedTime " + ( totalTime - minutes * 60 - seconds ) / 3600 + ":" + minutes + ":" + seconds;
    result += "\n\n" +
        numberOfRules + "," +
        numberOfSelectors + "," +
        numberOfSelectors / numberOfRules + "," +
        trainDataset.numItemsets() + "," +
        correct + "," +
        (float)(correct*100.0)/(float)trainDataset.numItemsets() + "," +
        notClassified + "," +
        (float)(notClassified*100.0)/(float)trainDataset.numItemsets() + "," +
        (trainDataset.numItemsets()-correct-notClassified) + "," +
        (float)((trainDataset.numItemsets()-correct-notClassified)*100.0)/(float)trainDataset.numItemsets() + "," +
        testDataset.numItemsets() + "," +
        testCorrect + "," +
        (float)(testCorrect*100.0)/(float)testDataset.numItemsets() + "," +
        testNotClassified + "," +
        (float)(testNotClassified*100.0)/(float)testDataset.numItemsets() + "," +
        (testDataset.numItemsets()-testCorrect-testNotClassified) + "," +
        (float)((testDataset.numItemsets()-testCorrect-testNotClassified)*100.0)/(float)testDataset.numItemsets() + "\n";
    resultPrint = new PrintWriter( new FileWriter ( resultFileName ) );
    resultPrint.print( getHeader() + "\n@rule list\n\n" + result );
    resultPrint.close();
}
/** Evaluates the training dataset, updates the correct/not-classified
 * counters and writes the (real class, predicted class) pairs into the
 * training output file.
 * Francisco Charte - 19-ene-2010
 */
public void printTrain()
{
    // StringBuilder replaces O(n^2) String concatenation in the loop;
    // the output bytes are identical to the original
    StringBuilder text = new StringBuilder( getHeader() );
    for ( int i = 0; i < trainDataset.numItemsets(); i++ )
    {
        try
        {
            Itemset itemset = trainDataset.itemset( i );
            int cl = evaluateItemset(itemset);
            if(cl == -1)
                notClassified++;
            if ( cl == (int) itemset.getValue( trainDataset.getClassIndex() ) )
                correct++;
            // One line per itemset: "<real class> <predicted class>"
            text.append( trainDataset.getClassAttribute().value( (int) itemset.getClassValue() ) )
                .append( " " )
                .append( cl == -1 ? "not classified" : trainDataset.getClassAttribute().value( cl ) )
                .append( "\n" );
        }
        catch ( Exception e )
        {
            e.printStackTrace();
            System.err.println( e.getMessage() );
        }
    }
    try
    {
        PrintWriter print = new PrintWriter( new FileWriter ( trainOutputFileName ) );
        print.print( text.toString() );
        print.close();
    }
    catch ( IOException e )
    {
        System.err.println( "Can not open the training output file: " + e.getMessage() );
    }
}
/** Evaluates the test dataset, updates the testCorrect/testNotClassified
 * counters and writes the (real class, predicted class) pairs into the
 * test output file.
 * Francisco Charte - 19-ene-2010
 */
public void printTest()
{
    // StringBuilder replaces O(n^2) String concatenation in the loop
    StringBuilder text = new StringBuilder( getHeader() );
    for ( int i = 0; i < testDataset.numItemsets(); i++)
    {
        try
        {
            Itemset itemset = testDataset.itemset( i );
            // Redundant (int) cast of the original removed: evaluateItemset
            // already returns int
            int cl = evaluateItemset( itemset );
            if(cl == -1)
                testNotClassified++;
            if ( cl == (int) itemset.getValue( testDataset.getClassIndex() ) )
                testCorrect++;
            // One line per itemset: "<real class> <predicted class>"
            text.append( testDataset.getClassAttribute().value( (int) itemset.getClassValue() ) )
                .append( " " )
                .append( cl == -1 ? "not classified" : testDataset.getClassAttribute().value( cl ) )
                .append( "\n" );
        }
        catch ( Exception e )
        {
            e.printStackTrace();
            System.err.println( e.getMessage());
        }
    }
    try
    {
        PrintWriter print = new PrintWriter( new FileWriter ( testOutputFileName ) );
        print.print( text.toString() );
        print.close();
    }
    catch ( IOException e )
    {
        // Bug fix: the original message wrongly said "training output file";
        // e.getMessage() added for consistency with printTrain
        System.err.println( "Can not open the test output file: " + e.getMessage() );
    }
}
/** Main function: validates the command line and launches the algorithm.
 * Francisco Charte - 16-ene-2010
 *
 * @param args The parameters file.
 */
public static void main(String[] args) {
    if (args.length == 1) {
        // The single argument is the KEEL parameters file
        new DataSqueezer(args[0]);
    } else {
        System.err.println("\nError: you have to specify the parameters file\n\tusage: java -jar DataSqueezer.jar parameterfile.txt" );
        System.exit(-1);
    }
}
}