/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Rule_Learning.CN2;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import org.core.Files;
/**
* <p>Title: Main class of the algorithm</p>
* <p>Description: It contains the essential methods for the CN2 algorithm</p>
* <p>Created: November 26th 2004</p>
* <p>Copyright: Copyright (c) 2004</p>
* <p>Company: KEEL</p>
* @author Alberto Fernández (University of Granada) 26/11/2004
* @since JDK1.5
* @version 1.6
*/
public class CN2 {
// Names of the I/O files
private String outputFile; // general results/information file
private String outputFileTr; // classification output over the training set
private String outputFileTst; // classification output over the test set
// Number of examples of each class in every partition
private int instancesClassTrain[];
private int instancesClassEval[];
private int instancesClassTest[];
private int nClasses; // Maximum number of classes (max of train and test)
private Complex selectors; // It stores all possible selectors
private int starSize; // Maximum size of the Star (beam search)
private ruleSet finalRuleSet; // Final RuleSet
private evaluateRuleQuality evReg; // To evaluate the performance of the rules
// Data-sets
private myDataset trainData;
private myDataset evalData;
private myDataset testData;
private int trainClasses[]; // class label of each training example
private int testClasses[]; // class label of each test example
private long tiempo; // holds the start timestamp, then the elapsed run time (ms)
private String miOutput; // To store the data-set header
private double covered; // target fraction of examples to cover per class
private int nAttributes; // number of input attributes
private double div; // 1 / #training examples
private double threshold = 0.0; // significance threshold (see esSignificativa)
private int accuracy = 0; // 1 -> also build disjunctive '=' selectors
private String[] attributesName;
private String[] className;
private boolean anyContinous = false; // true if any attribute is continuous
private boolean problem = false; // true if reading the data files failed
/**
 * It checks whether the preconditions hold: no continuous attribute was
 * found and the data files were read without errors.
 * @return boolean true if the algorithm can run normally, false otherwise
 */
public boolean everythingOK() {
    return !(anyContinous || problem);
}
/**
 * Default builder. Creates an empty, unconfigured CN2 instance; the
 * parameterized builder must be used to actually load data and run.
 */
public CN2() {
}
/**
 * CN2 class builder.<br/>
 * It does a local copy of the filenames for their posterior use.<br/>
 * Then, it obtains all data from file and stores it in a format recognizable for the program.<br/>
 * Finally, it creates all possible selectors for the dataset and stores them.
 * @param ftrain Name of the input training file
 * @param feval Name of the input validation file
 * @param ftest Name of the input test file
 * @param foutputTr Name of the output training file
 * @param foutputTst Name of the output test file
 * @param fsal Name of the output information file
 * @param starSize It is the maximum size for the star in the search process (beam search)
 * @param _covered The percentage of maximum examples to cover
 * @param _accuracy It states whether the complete selectors will be employed (disjunctions).
 */
public CN2(String ftrain, String feval, String ftest,
String foutputTr,
String foutputTst, String fsal, int starSize,
double _covered, int _accuracy) {
int i;
// Keep local copies of the output file names and the run parameters.
outputFile = fsal;
covered = _covered;
accuracy = _accuracy;
outputFileTr = foutputTr;
outputFileTst = foutputTst;
Dataset train = new Dataset();
Dataset eval = new Dataset();
Dataset test = new Dataset();
try {
// Reading the training file ("true" marks the reference partition).
train.readSet(ftrain, true);
if (train.hayAtributosContinuos()) {
System.err.println("CN2 may not work properly with real attributes.\nPlease discretize the data-set");
anyContinous = true; // flagged so everythingOK() aborts the run
}
// Reading the validation file.
eval.readSet(feval, false);
if (eval.hayAtributosContinuos()) {
System.err.println("CN2 may not work properly with real attributes.\nPlease discretize the data-set");
anyContinous = true;
}
// Reading the test file.
test.readSet(ftest, false);
} catch (IOException e) {
System.err.println(
"There was a problem while trying to read the dataset files:");
System.err.println("-> " + e);
problem = true; // everythingOK() will report the failure
}
if (this.everythingOK()) {
// Keep the test-set header so the output files reproduce it verbatim.
miOutput = new String("");
miOutput = test.copiaCabeceraTest();
System.out.println("\nGenerating datasets...");
trainData = new myDataset();
evalData = new myDataset();
testData = new myDataset();
// Pre-compute the most common values (presumably used for
// missing-value handling — see Dataset.calculaMasComunes).
train.calculaMasComunes();
eval.calculaMasComunes();
test.calculaMasComunes();
// Convert every partition into the internal instance-list format.
trainData = creaConjunto(train);
evalData = creaConjunto(eval);
testData = creaConjunto(test);
trainClasses = train.getC();
// The number of classes is the maximum seen in train and test.
nClasses = train.getnClasses();
int aux = test.getnClasses();
if (aux > nClasses) {
nClasses = aux;
}
nAttributes = train.getnentradas();
// Per-class example counts for the training partition.
instancesClassTrain = new int[nClasses];
for (int j = 0; j < nClasses; j++) {
instancesClassTrain[j] = 0;
for (i = 0; i < trainData.size(); i++) {
if (j == trainClasses[i]) {
instancesClassTrain[j]++;
}
}
}
// Per-class example counts for the validation partition.
int[] clasesEval;
clasesEval = eval.getC();
instancesClassEval = new int[nClasses];
for (int j = 0; j < nClasses; j++) {
instancesClassEval[j] = 0;
for (i = 0; i < evalData.size(); i++) {
if (j == clasesEval[i]) {
instancesClassEval[j]++;
}
}
}
// Per-class example counts for the test partition.
testClasses = test.getC();
instancesClassTest = new int[nClasses];
for (int j = 0; j < nClasses; j++) {
instancesClassTest[j] = 0;
for (i = 0; i < testData.size(); i++) {
if (j == testClasses[i]) {
instancesClassTest[j]++;
}
}
}
this.starSize = starSize;
tiempo = System.currentTimeMillis(); // start of the timing window
div = (double) 1.0 / trainData.size();
attributesName = train.dameNombres();
className = train.dameClases();
// If the test set declares more class names than train, use its names.
String[] classNameAux = test.dameClases();
if (className.length < classNameAux.length) {
className = new String[nClasses];
for (i = 0; i < nClasses; i++) {
className[i] = classNameAux[i];
}
}
}
}
/**
 * It creates a dataset (attributes/class) according to those obtained from a data-file.
 * Instances containing at least one missing value are discarded.
 * @param myData It must be a dataset read from file
 * @return The new dataset created, that is, a linked-list of objects "Instances"
 */
private myDataset creaConjunto(Dataset myData) {
myDataset datos = new myDataset();
int tam = myData.getnentradas(); // number of input attributes
// NOTE(review): "vars" is a single buffer reused for every row; this is
// only safe if the Instance constructor copies the array — TODO confirm.
double[] vars = new double[tam];
double[][] X;
int[] C;
int clase = 0;
X = myData.getX();
C = myData.getC();
for (int i = 0; i < myData.getndatos(); i++) {
boolean salir = false; // set when a missing value is found in row i
for (int j = 0; (j < tam) && (!salir); j++) {
if (myData.isMissing(i, j)) {
salir = true;
} else {
vars[j] = X[i][j];
}
}
if (!salir) { // keep only complete instances
clase = C[i];
Instance m = new Instance(vars, clase, tam);
m.setPosFile(i); // remember the original row index for later output
datos.addData(m);
}
}
return datos;
}
/**
 * We execute here the CN2 algorithm and we create the necessary output data.
 */
public void execute() {
    makeSelectors(); // build every candidate selector once, up front
    unorderedCN2(trainData);
    tiempo = System.currentTimeMillis() - tiempo; // elapsed run time in ms

    // Drop duplicated rules: a rule is removed when an identical one
    // appears later in the set, so the last copy survives.
    int i = 0;
    while (i < finalRuleSet.size() - 1) {
        boolean duplicated = false;
        for (int j = i + 1; (j < finalRuleSet.size()) && (!duplicated); j++) {
            duplicated = finalRuleSet.getRule(i).same(finalRuleSet.getRule(j));
        }
        if (duplicated) {
            finalRuleSet.deleteRule(i); // remove the earlier copy, re-check this slot
        } else {
            i++;
        }
    }

    // Score the surviving rules on the validation and test partitions.
    evReg = new evaluateRuleQuality(finalRuleSet, evalData, testData,
                                    instancesClassEval, instancesClassTest,
                                    className);
    generateOutput(); // write the three output files
}
/**
 * It builds the total set of selectors to obtain all possible rules.
 * First the distinct values of every attribute are collected, then one
 * selector is created per (attribute, operator, value) combination and,
 * when accuracy == 1, also per disjunction of values (operator '=').
 */
private void makeSelectors() {
int totalAtributos = trainData.getData(0).getNattributes();
int examples = trainData.size();
// lista[.][i] holds the distinct values of attribute i, terminated by a
// POSITIVE_INFINITY sentinel (hence the examples + 1 rows).
double[][] lista = new double[examples + 1][totalAtributos];
for (int i = 0; i < totalAtributos; i++) {
lista[0][i] = trainData.getData(0).getMuest()[i];
lista[1][i] = Double.POSITIVE_INFINITY; // sentinel marking the end
}
for (int i = 0; i < totalAtributos; i++) {
for (int j = 1; j < examples; j++) {
double valor = trainData.getData(j).getMuest()[i];
// Linear scan up to the sentinel looking for a duplicate.
int k = 0;
while (!(Double.isInfinite(lista[k][i]))) {
if (lista[k][i] == valor) {
break;
}
k++;
}
if (Double.isInfinite(lista[k][i])) {
// New distinct value: store it and move the sentinel forward.
lista[k][i] = valor;
lista[k + 1][i] = Double.POSITIVE_INFINITY;
}
}
}
selectors = new Complex(nClasses);
for (int i = 0; i < totalAtributos; i++) {
for (int h = 0; h < examples; h++) {
if (Double.isInfinite(lista[h][i])) {
break; // sentinel: no more distinct values for this attribute
}
// Four operator codes 0..3 (0 is '='; the others presumably
// <>, <= and > — TODO confirm against the Selector class).
for (int j = 0; j < 4; j++) {
Selector s = new Selector(i, j, lista[h][i]);
selectors.addSelector(s);
}
}
}
// Operator '=' over disjunct value sets (only when accuracy == 1).
if (accuracy == 1) {
for (int i = 0; i < totalAtributos; i++) {
// Count the distinct values of attribute i.
int total;
for (total = 0; !(Double.isInfinite(lista[total][i])); total++) {
;
}
ArrayList list = new ArrayList();
ArrayList listaAux = new ArrayList();
// Seed with every unordered pair of distinct values.
for (int j = 0; j < total - 1; j++) {
for (int k = j + 1; k < total; k++) {
double[] valores = new double[2];
valores[0] = lista[j][i];
valores[1] = lista[k][i];
listaAux.add(valores);
Selector s = new Selector(i, 0, valores);
selectors.addSelector(s);
}
}
// Grow the disjunctions one value at a time: two tuples sharing
// the same prefix are merged into a tuple one element longer.
for (int l = 3; l < total - 2; l++) {
double[] auxi = new double[l - 1];
double[] auxi2 = new double[l - 1];
list.addAll(listaAux);
listaAux.clear();
while (!list.isEmpty()) {
boolean salir = false;
auxi = (double[]) list.remove(0);
for (int j = 0; (j < list.size()) && (!salir); j++) {
auxi2 = (double[]) list.get(j);
// Compare the shared prefix; stop at the first mismatch.
for (int k = 0; (k < auxi.length - 1) && (!salir);
k++) {
salir = !(auxi[k] == auxi2[k]);
}
if (!salir) {
double[] valores = new double[l];
for (int k = 0; k < l - 1; k++) {
valores[k] = auxi[k];
}
valores[l - 1] = auxi2[l - 2];
listaAux.add(valores);
Selector s = new Selector(i, 0, valores);
selectors.addSelector(s);
}
}
}
}
}
}
}
/**
 * It generates unordered rules for each class of the training set.
 * @param trainData myDataset Training set
 */
private void unorderedCN2(myDataset trainData) {
    finalRuleSet = new ruleSet();
    finalRuleSet.addClassNames(className);
    finalRuleSet.addClassName(attributesName[nAttributes]);
    System.out.println("\n Extracting rules for each class:");
    // One independent CN2 run per class label, each on a fresh copy of
    // the training data (the search deletes covered examples).
    for (int clase = 0; clase < nClasses; clase++) {
        CN2forOneClass(trainData.copyDataSet(), clase);
    }
}
/**
 * It obtains the rules for a single class.
 * Rules are extracted iteratively; after each rule, the true positives it
 * covers are removed from the training data, until either no complex is
 * found or the requested fraction of class examples has been covered.
 * @param train myDataset training data-set (consumed: covered examples are deleted)
 * @param clase int Class for which we want to generate the rules
 */
private void CN2forOneClass(myDataset train, int clase) {
boolean continuar = false;
int quedan = instancesClassTrain[clase]; // class examples still uncovered
System.out.println("\n Searching for the best complex for class " +
clase + " [" + quedan + " examples remaining]");
continuar = quedan > 0;
while (continuar) {
continuar = false;
Complex bestComplex = findBestComplex(train, clase);
if (bestComplex != null) {
bestComplex.addAttributeNames(attributesName);
System.out.println("\n\nComplex found:");
bestComplex.print();
finalRuleSet.addRule(bestComplex);
// Remove every true positive covered by the new rule.
for (int i = 0; i < train.size(); i++) {
Instance m = train.getData(i);
if ((bestComplex.covered(m)) &&
(bestComplex.getClas() == m.getClas())) { // covered and true positive
train.deleteData(i);
i--; // compensate for the shift caused by the deletion
quedan--;
}
}
continuar = true;
}
// Stop once the target coverage is reached or no complex was found.
double porc = 1.0 - ((double) quedan / instancesClassTrain[clase]);
if ((porc >= covered) || (bestComplex == null)) {
continuar = false;
}
System.out.println("\nPercentage of covered examples -> " +
porc * 100 + "% <" + quedan + "> remaining");
}
}
/**
 * It discovers the best complex for the given instances (CN2 beam search).
 * The star starts with every single-selector complex and is repeatedly
 * specialised by appending selectors, keeping only the starSize best
 * candidates, until no valid specialisation remains or the complexes
 * reach nAttributes selectors.
 * @param train myDataset training set
 * @param clas int class the complex must predict
 * @return Complex the best complex found, or null if no complex covers the class
 */
private Complex findBestComplex(myDataset train, int clas) {
Complex bestComplex = new Complex(nClasses);
ruleSet star = new ruleSet();
boolean continuar = true;
// Initial star: one complex per selector, evaluated on the data.
for (int i = 0; i < selectors.size(); i++) {
Complex aux = new Complex(nClasses);
aux.setClass(clas);
aux.addSelector(selectors.getSelector(i));
evaluateComplex(aux, train);
star.addRule(aux);
}
// Order by quality (Complex implements the comparison).
Collections.sort(star.getruleSet());
// Remove semantically equivalent complexes, then trim to the beam size.
star.deleteSubsumed(starSize);
for (int j = star.size() - 1; star.size() > starSize;
star.deleteRule(j), j--) {
; // beam search: drop the worst complexes from the tail
}
// Discard complexes that cover no example of the target class.
for (int k = star.size() - 1; k >= 0; k--) {
if (star.getRule(k).getClassDistribution(clas) == 0) {
star.deleteRule(k);
}
}
if (star.size() == 0) {
continuar = false;
bestComplex = null; // nothing covers this class at all
} else {
bestComplex = star.getRule(0); // best-ranked candidate so far
}
int tam = 1; // current number of selectors per complex in the star
while (continuar) {
// a) Specialize every complex in STAR with every selector.
ruleSet newStar = new ruleSet();
for (int i = 0; i < selectors.size(); i++) {
Selector s = selectors.getSelector(i);
for (int j = 0; j < star.size(); j++) {
Complex aux2 = star.getRule(j);
Complex aux = new Complex(nClasses);
boolean sigue = true;
for (int h = 0; (h < aux2.size()) && (sigue); h++) {
Selector s2 = aux2.getSelector(h);
aux.addSelector(s2);
if (s2.compareTo(s) < 2) { // same attribute
sigue = false; // do not specialise with it
}
}
if (sigue) { // selector's attribute not already in the complex
aux.addSelector(s);
aux.setClass(clas);
evaluateComplex(aux, train);
newStar.addRule(aux);
}
}
}
Collections.sort(newStar.getruleSet());
removeInvalid(newStar); // remove repeated and subsumed complexes
// Discard complexes that cover no example of the target class.
for (int k = newStar.size() - 1; k >= 0; k--) {
if (newStar.getRule(k).getClassDistribution(clas) == 0) {
newStar.deleteRule(k);
}
}
if (newStar.size() > 0) {
if (bestComplex.compareTo(newStar.getRule(0)) == 1) { // current best is worse
bestComplex = newStar.getRule(0);
}
for (int j = newStar.size() - 1;
newStar.size() > starSize;
newStar.deleteRule(j), j--) {
; // beam search: trim the new star to starSize complexes
}
star.deleteAll();
star.addRules(newStar);
tam++;
continuar = (tam < nAttributes); // cannot add more selectors than attributes
} else {
continuar = false; // no valid specialisation remained
}
System.out.print("New star created of size " + tam + ", ");
if (((tam + 1) % 6) == 0) {
System.out.println(""); // line break every few progress messages
}
}
return bestComplex;
}
/**
 * It removes those complexes which are subsumed by others in newStar.
 * @param newStar ruleSet The new set of complexes we are building
 */
private void removeInvalid(ruleSet newStar) {
    // Null complexes (repeated attributes) are already avoided while the
    // star is built, so only semantic duplicates are pruned here, e.g.
    // At = 1, At <> 0 and At = [0,1] describe the same condition.
    newStar.deleteSubsumed(starSize);
}
/**
 * Test of Statistical Significance. Complex c is significant if its value is higher than a given threshold.
 * <br/>The computation is carried out as 2*SUM[fi·log(fi/ei)] where:
 * <br/>fi is the distribution of examples covered by c
 * <br/>ei is the distribution of examples randomly covered -> #examples of class i / #examples
 * @param c Complex The complex to analyse
 * @return boolean True if it is significant (higher than the threshold) false in other case
 */
private boolean esSignificativa(Complex c) {
    // Fraction of the training set covered by the complex.
    double pCond = 0;
    for (int j = 0; j < nClasses; j++) {
        pCond += c.getClassDistribution(j);
    }
    pCond *= 1.0 / trainData.size();
    // Likelihood-ratio statistic: 2 * sum_j f_j * log(f_j / e_j).
    double significancia = 0;
    for (int j = 0; j < nClasses; j++) {
        double ratio = (double) c.getClassDistribution(j) /
                       (this.instancesClassTrain[j] * pCond);
        if (ratio != 0) {
            significancia += c.getClassDistribution(j) * Math.log(ratio);
        }
    }
    return (2.0 * significancia >= threshold);
}
/**
 * Evaluation of the complexes over the examples set, in order to see which
 * ones are covered in each class.
 * @param c Complex to evaluate
 * @param e Data-set
 */
private void evaluateComplex(Complex c, myDataset e) {
    c.deleteDistribution(); // start counting from an empty distribution
    int total = e.size();
    for (int idx = 0; idx < total; idx++) {
        Instance example = e.getData(idx);
        if (c.covered(example)) {
            // Tally the covered example under its true class.
            c.addClassDistribution(example.getClas());
        }
    }
    c.computeLaplacian(); // refresh the Laplace accuracy estimate
}
/**
 * It computes the statistical data and creates the output files.
 */
private void generateOutput() {
    Files f = new Files();
    // Rule-set listing followed by the quality statistics and the runtime.
    StringBuilder report = new StringBuilder(finalRuleSet.printString());
    report.append("\n\n").append(evReg.printString())
          .append("\n\n Time (seconds); ").append(tiempo / 1000);
    f.writeFile(outputFile, report.toString());
    // Classification outputs keep the original data-set header.
    f.writeFile(outputFileTr, miOutput + evReg.salida(trainData));
    f.writeFile(outputFileTst, miOutput + evReg.salida(testData));
}
}