/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Genetic_Rule_Learning.GIL;

import java.io.IOException;
import java.util.Arrays;

import org.core.*;

/**
 * <p>Title: Algorithm</p>
 *
 * <p>Description: It contains the implementation of the GIL algorithm</p>
 *
 * <p>Company: KEEL</p>
 *
 * @author Alberto Fernández
 * @version 1.0
 */
public class GIL {

    myDataset train, val, test;
    String outputTr, outputTst, outputRule;
    int nClasses;

    // Parameters
    long seed;
    int popSize;
    double w1; // fitness weights
    double w2;
    double w3;
    double p1a; // application probabilities of the genetic operators
    double p1b;
    double p2;
    double p3;
    double p4;
    double p5;
    double p6;
    double p7a;
    double p7b;
    double p7c;
    double p8;
    double p9;
    double p10;
    double p11;
    double p12;
    double p13;
    double p14;
    int numGenerations;
    double pConditionLevel;
    double lowerThreshold;
    double upperThreshold;

    private boolean somethingWrong = false; // to check if everything is correct

    /**
     * Default constructor
     */
    public GIL() {
    }

    /**
     * It reads the data from the input files (training, validation and test) and parses all the
     * parameters from the parameters array.
     * @param parameters parseParameters It contains the input files, output files and parameters
     */
    public GIL(parseParameters parameters) {

        train = new myDataset();
        val = new myDataset();
        test = new myDataset();
        try {
            System.out.println("\nReading the training set: " + parameters.getTrainingInputFile());
            train.readClassificationSet(parameters.getTrainingInputFile(), true);
            System.out.println("\nReading the validation set: " + parameters.getValidationInputFile());
            val.readClassificationSet(parameters.getValidationInputFile(), false);
            System.out.println("\nReading the test set: " + parameters.getTestInputFile());
            test.readClassificationSet(parameters.getTestInputFile(), false);
        } catch (IOException e) {
            System.err.println("There was a problem while reading the input data-sets: " + e);
            somethingWrong = true;
        }

        // We check whether there are real-valued or missing attributes, since this algorithm cannot handle them
        somethingWrong = somethingWrong || train.hasRealAttributes();
        somethingWrong = somethingWrong || train.hasMissingAttributes();

        outputTr = parameters.getTrainingOutputFile();
        outputTst = parameters.getTestOutputFile();
        outputRule = parameters.getOutputFile(0);

        // Now we parse the parameters
        seed = Long.parseLong(parameters.getParameter(0));
        popSize = Integer.parseInt(parameters.getParameter(1));
        w1 = Double.parseDouble(parameters.getParameter(2));
        w2 = Double.parseDouble(parameters.getParameter(3));
        w3 = Double.parseDouble(parameters.getParameter(4));
        p1a = Double.parseDouble(parameters.getParameter(5));
        p1b = Double.parseDouble(parameters.getParameter(6));
        p2 = Double.parseDouble(parameters.getParameter(7));
        p3 = Double.parseDouble(parameters.getParameter(8));
        p4 = Double.parseDouble(parameters.getParameter(9));
        p5 = Double.parseDouble(parameters.getParameter(10));
        p6 = Double.parseDouble(parameters.getParameter(11));
        p7a = Double.parseDouble(parameters.getParameter(12));
        p7b = Double.parseDouble(parameters.getParameter(13));
        p7c = Double.parseDouble(parameters.getParameter(14));
        p8 = Double.parseDouble(parameters.getParameter(15));
        p9 = Double.parseDouble(parameters.getParameter(16));
        p10 = Double.parseDouble(parameters.getParameter(17));
        p11 = Double.parseDouble(parameters.getParameter(18));
        p12 = Double.parseDouble(parameters.getParameter(19));
        p13 = Double.parseDouble(parameters.getParameter(20));
        p14 = Double.parseDouble(parameters.getParameter(21));
        numGenerations = Integer.parseInt(parameters.getParameter(22));
        pConditionLevel = Double.parseDouble(parameters.getParameter(23));
        lowerThreshold = Double.parseDouble(parameters.getParameter(24));
        upperThreshold = Double.parseDouble(parameters.getParameter(25));
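        /*
         * Note: p1a..p14 are only the *initial* application probabilities of
         * GIL's genetic operators (rule-set, rule and condition level). They
         * are adapted each generation in execute(), depending on how many
         * chromosomes were actually modified (see lowerThreshold/upperThreshold).
         */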
    }

    /**
     * It launches the algorithm
     */
    public void execute() {
        int i, j, k, l;
        int nperClass[];
        boolean flag[];
        int pos, min, max;
        int classAct;
        RuleSet population[];
        double costs[];
        double minCost, maxCost;
        double f = 0.0;
        double prob[];
        double aux;
        double NUmax = 1.5; // used for linear ranking
        double NUmin = 0.5; // used for linear ranking
        RuleSet newPopulation[];
        double pos1, pos2;
        int sel1, sel2;
        double comp, cons;
        boolean act1, act2;
        int contAct;
        RuleSet solution[];
        int classSolution[];
        int maxClass;

        if (somethingWrong) { // We do not execute the program
            System.err.println("An error was found: the data-set has either real-valued attributes or missing values.");
            System.err.println("Aborting the program");
            // We should not use the statement: System.exit(-1);
        } else {
            Randomize.setSeed(seed);

            nClasses = train.getnClasses();
            nperClass = new int[nClasses];
            flag = new boolean[nClasses];
            Arrays.fill(flag, true);
            for (i = 0; i < train.getnData(); i++) {
                nperClass[train.getOutputAsInteger(i)]++;
            }
            solution = new RuleSet[nClasses - 1];
            classSolution = new int[nClasses - 1];

            // Search for the class with the highest number of instances; it will act as the default class
            for (j = 0; j < nClasses && !flag[j]; j++);
            pos = j;
            max = nperClass[j];
            for (j = pos + 1; j < nClasses; j++) {
                if (flag[j] && nperClass[j] >= max) {
                    pos = j;
                    max = nperClass[j];
                }
            }
            maxClass = pos;

            // For each concept to be learned
            for (i = 0; i < nClasses - 1; i++) {
                // Search for the class, not chosen yet, with the lowest number of instances
                for (j = 0; j < nClasses && !flag[j]; j++);
                pos = j;
                min = nperClass[j];
                for (j = pos + 1; j < nClasses; j++) {
                    if (flag[j] && nperClass[j] < min) {
                        pos = j;
                        min = nperClass[j];
                    }
                }
                flag[pos] = false;
                classAct = pos;
                classSolution[i] = classAct;
                System.out.println("Learning concept '" + train.getOutputValue(classAct) + "'");

                if (nperClass[classAct] == 0) { // there are no examples of this class
                    System.out.println("There is no example of this concept.");
                    solution[i] = new RuleSet();
                } else {
                    // Initialize the population
                    population = new RuleSet[popSize];
                    for (j = 0; j < population.length; j++) {
                        population[j] = new RuleSet(train, classAct);
                    }

                    // Compute the cost of every chromosome
                    costs = new double[popSize];
                    for (j = 0; j < population.length; j++) {
                        costs[j] = population[j].computeCost();
                    }

                    // Obtain the maximum and minimum costs in the population
                    minCost = maxCost = costs[0];
                    for (j = 1; j < costs.length; j++) {
                        if (costs[j] < minCost) minCost = costs[j];
                        else if (costs[j] > maxCost) maxCost = costs[j];
                    }

                    // Evaluate the population
                    for (j = 0; j < population.length; j++) {
                        population[j].computeFitness(train, classAct, minCost, maxCost, f, w1, w2, w3);
                    }

                    // Construct the roulette for Baker's selection using linear ranking:
                    // rank j (0 = best, after sorting) gets
                    //   p(j) = (1/popSize) * (NUmax - (NUmax - NUmin) * j / (popSize - 1)),
                    // so the best chromosome receives NUmax/popSize and the worst NUmin/popSize;
                    // prob[] is then accumulated into the cumulative distribution.
                    prob = new double[popSize];
                    for (j = 0; j < popSize; j++) {
                        aux = (double) (NUmax - NUmin) * ((double) j / (popSize - 1));
                        prob[j] = (double) (1.0 / (popSize)) * (NUmax - aux);
                    }
                    for (j = 1; j < popSize; j++) {
                        prob[j] = prob[j] + prob[j - 1];
                    }

                    f = 0.0;
                    // Evolutionary cycle
                    for (j = 0; j < numGenerations; j++) {
                        System.out.println("Start Generation: " + j);
                        // Sort the population by fitness
                        Arrays.sort(population);
                        System.out.println("Fitness of the best chromosome: " + population[0].fitness);
                        System.out.println("Completeness of the best chromosome: " + population[0].completeness);
                        System.out.println("Consistency of the best chromosome: " + population[0].consistency);
                        System.out.println("Cost of the best chromosome: " + population[0].cost);
                        System.out.println("Application Probabilities: " + p1a + " - " + p2 + " - " + p3
                                + " - " + p4 + " - " + p5 + " - " + p6 + " - " + p7a + " - " + p8
                                + " - " + p9 + " - " + p10 + " - " + p11 + " - " + p12 + " - " + p13
                                + " - " + p14);
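                        /*
                         * Baker's selection sketch: for each pair of offspring, two
                         * uniform numbers in [0,1) are drawn and mapped through the
                         * cumulative linear-ranking distribution prob[]; the first
                         * index l with prob[l] >= u is the selected parent, so
                         * better-ranked chromosomes are chosen more often.
                         */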
                        newPopulation = new RuleSet[popSize];
                        contAct = 0;
                        // Baker's selection
                        for (k = 0; k < popSize / 2; k++) {
                            act1 = act2 = false; // reset per pair, so contAct counts the offspring modified in this generation
                            pos1 = Randomize.Rand();
                            pos2 = Randomize.Rand();
                            for (l = 0; l < popSize && prob[l] < pos1; l++);
                            sel1 = l;
                            for (l = 0; l < popSize && prob[l] < pos2; l++);
                            sel2 = l;
                            newPopulation[k * 2] = new RuleSet(population[sel1]);
                            newPopulation[k * 2 + 1] = new RuleSet(population[sel2]);

                            // Application of the operators
                            if (Randomize.Rand() < p1a) {
                                newPopulation[k * 2].rulesExchange(newPopulation[k * 2 + 1], p1b);
                                act1 = act2 = true;
                            }

                            // Each operator probability is modulated by the chromosome's completeness
                            // and consistency: generalizing operators (rules copy, new event,
                            // generalization) fire more often on incomplete but consistent rule sets,
                            // while specializing operators (rules drop, specialization) fire more often
                            // on complete but inconsistent ones.
                            comp = newPopulation[k * 2].computeCompleteness(train, classAct);
                            cons = newPopulation[k * 2].computeConsistency(train, classAct);
                            if (Randomize.Rand() < (p2 * (1.5 - comp) * (0.5 + cons))) {
                                newPopulation[k * 2].rulesCopy(newPopulation[k * 2 + 1]);
                                act1 = true;
                            }
                            if (Randomize.Rand() < (p3 * (1.5 - comp) * (0.5 + cons))) {
                                newPopulation[k * 2].newEvent(train, classAct);
                                act1 = true;
                            }
                            if (Randomize.Rand() < (p4 * (1.5 - comp) * (0.5 + cons))) {
                                newPopulation[k * 2].rulesGeneralization(train);
                                act1 = true;
                            }
                            if (Randomize.Rand() < (p5 * (0.5 + comp) * (1.5 - cons))) {
                                newPopulation[k * 2].rulesDrop();
                                act1 = true;
                            }
                            if (Randomize.Rand() < (p6 * (0.5 + comp) * (1.5 - cons))) {
                                newPopulation[k * 2].rulesSpecialization(train);
                                act1 = true;
                            }
                            act1 |= newPopulation[k * 2].applyOperators(p7a, p7b, p7c, p8, p9, p10, p11,
                                    p12, p13, p14, pConditionLevel, train, classAct);

                            comp = newPopulation[k * 2 + 1].computeCompleteness(train, classAct);
                            cons = newPopulation[k * 2 + 1].computeConsistency(train, classAct);
                            if (Randomize.Rand() < (p2 * (1.5 - comp) * (0.5 + cons))) {
                                // copy rules from the sibling offspring
                                newPopulation[k * 2 + 1].rulesCopy(newPopulation[k * 2]);
                                act2 = true;
                            }
                            if (Randomize.Rand() < (p3 * (1.5 - comp) * (0.5 + cons))) {
                                newPopulation[k * 2 + 1].newEvent(train, classAct);
                                act2 = true;
                            }
                            if (Randomize.Rand() < (p4 * (1.5 - comp) * (0.5 + cons))) {
                                newPopulation[k * 2 + 1].rulesGeneralization(train);
                                act2 = true;
                            }
                            if (Randomize.Rand() < (p5 * (0.5 + comp) * (1.5 - cons))) {
                                newPopulation[k * 2 + 1].rulesDrop();
                                act2 = true;
                            }
                            if (Randomize.Rand() < (p6 * (0.5 + comp) * (1.5 - cons))) {
                                newPopulation[k * 2 + 1].rulesSpecialization(train);
                                act2 = true;
                            }
                            act2 |= newPopulation[k * 2 + 1].applyOperators(p7a, p7b, p7c, p8, p9, p10,
                                    p11, p12, p13, p14, pConditionLevel, train, classAct);

                            if (act1) contAct++;
                            if (act2) contAct++;
                        }

                        // Compute the cost of every chromosome
                        for (k = 0; k < newPopulation.length; k++) {
                            costs[k] = newPopulation[k].computeCost();
                        }

                        // Obtain the maximum and minimum costs in the population
                        minCost = maxCost = costs[0];
                        for (k = 1; k < costs.length; k++) {
                            if (costs[k] < minCost) minCost = costs[k];
                            else if (costs[k] > maxCost) maxCost = costs[k];
                        }

                        // Evaluate the population
                        for (k = 0; k < newPopulation.length; k++) {
                            newPopulation[k].computeFitness(train, classAct, minCost, maxCost, f, w1, w2, w3);
                        }

                        // Adapt the operator probabilities: if too many chromosomes were modified,
                        // cool all probabilities down by 1%; if too few, heat them up by 1%
                        if (((double) contAct / (double) popSize) >= upperThreshold) {
                            p1a *= 0.99; p2 *= 0.99; p3 *= 0.99; p4 *= 0.99; p5 *= 0.99;
                            p6 *= 0.99; p7a *= 0.99; p8 *= 0.99; p9 *= 0.99; p10 *= 0.99;
                            p11 *= 0.99; p12 *= 0.99; p13 *= 0.99; p14 *= 0.99;
                        } else if (((double) contAct / (double) popSize) <= lowerThreshold) {
                            p1a *= 1.01; p2 *= 1.01; p3 *= 1.01; p4 *= 1.01; p5 *= 1.01;
                            p6 *= 1.01; p7a *= 1.01; p8 *= 1.01; p9 *= 1.01; p10 *= 1.01;
                            p11 *= 1.01; p12 *= 1.01; p13 *= 1.01; p14 *= 1.01;
                        }

                        f += (double) 1 / (double) numGenerations; // f ramps linearly from 0 to 1 along the run
                        population = newPopulation;
                    }

                    // Sort the population by fitness and keep the best rule set for this concept
                    Arrays.sort(population);
                    solution[i] = new RuleSet(population[0]);
                }
            }

            // Finally, we fill the training and test output files
            doOutput(this.val, this.outputTr, solution, classSolution, maxClass);
            doOutput(this.test, this.outputTst, solution, classSolution, maxClass);
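            /*
             * The learned model behaves as a decision list: concepts are checked
             * in the order they were learned (from least to most frequent class)
             * and any uncovered example falls through to the majority class,
             * which acts as the default.
             */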
            // Print the rules obtained
            for (i = 0; i < nClasses - 1; i++) {
                Fichero.AnadirtoFichero(outputRule, "Concept '" + train.getOutputValue(classSolution[i]) + "': \n");
                Fichero.AnadirtoFichero(outputRule, solution[i].toString(train));
            }
            Fichero.AnadirtoFichero(outputRule, "Concept '" + train.getOutputValue(maxClass) + "': Default.\n");

            System.out.println("Algorithm Finished");
        }
    }

    /**
     * It generates the output for a given dataset and stores it in a file
     * @param dataset myDataset input dataset
     * @param filename String the name of the output file
     * @param solution RuleSet[] the rule sets learned for each concept
     * @param sol int[] the class index associated to each rule set
     * @param max int the majority class, used as the default prediction
     */
    private void doOutput(myDataset dataset, String filename, RuleSet solution[], int[] sol, int max) {
        String output = dataset.copyHeader(); // we insert the header in the output file
        // We write the expected and the predicted output for each example
        for (int i = 0; i < dataset.getnData(); i++) {
            output += dataset.getOutputAsString(i) + " " +
                    this.classificationOutput(i, solution, dataset, sol, max) + "\n";
        }
        Fichero.escribeFichero(filename, output);
    }

    /**
     * It classifies an example as the concept of the first rule set containing a matching rule,
     * or as the default (majority) class if no rule matches
     * @param ej int index of the example
     * @param solution RuleSet[] the rule sets learned for each concept
     * @param dataset myDataset the dataset the example belongs to
     * @param sol int[] the class index associated to each rule set
     * @param max int the majority class, used as the default prediction
     * @return String the predicted class label
     */
    private String classificationOutput(int ej, RuleSet solution[], myDataset dataset, int[] sol, int max) {
        int i, j;
        boolean example[] = Rule.toBitString(dataset, ej); // the example is encoded once
        boolean rule[];
        for (i = 0; i < nClasses - 1; i++) {
            for (j = 0; j < solution[i].getRuleSet().size(); j++) {
                rule = solution[i].getRule(j).toBitString();
                if (Rule.match(rule, example)) {
                    return dataset.getOutputValue(sol[i]);
                }
            }
        }
        return dataset.getOutputValue(max);
    }
}
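/*
 * Illustrative usage, assuming KEEL's usual Main wrapper and parseParameters
 * API (the configuration file lists the input/output files and the 26
 * parameters parsed in the constructor; the path below is only an example):
 *
 *   parseParameters parameters = new parseParameters();
 *   parameters.parseConfigurationFile("config.txt");
 *   GIL method = new GIL(parameters);
 *   method.execute();
 */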