/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Genetic_Rule_Learning.COGIN; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import org.core.Files; import org.core.Randomize; import keel.Dataset.*; /** * <p> * This class implements the COGIN algorithm from: * </p> * <p> * David Perry Greene and Stephen F. Smith. Competition-Based Induction of Decision Models from Examples. Machine Learning, 13: 229-257, 1993. * </p> * <p> * @author Written by Juli�n Luengo Mart�n 08/02/2007 * @version 0.2 * @since JDK 1.5 * </p> */ public class Cogin { ArrayList<Chromosome> poblation; ArrayList<Chromosome> offspring; ArrayList<Chromosome> bestPob; // Chromosome coverInstance[]; long seed; InstanceSet IS; InstanceSet ISval; InstanceSet IStest; String input_train_name = new String(); String input_validation_name; String input_test_name = new String(); String output_train_name = new String(); String output_test_name = new String(); String method_output; int numClasses; double datasetEntropy = -1; double crossoverRate = 0.9; boolean useNegationBit = true; int missclassificationErrorLevel = 2; int generationLimit = 1000; /** * <p> * Default constructor * </p> */ public Cogin(){ poblation = offspring = null; } /** * <p> * Builds up the COGIN with the provided parameters in KEEL format * </p> * @param paramfile The path to the configuration file with all the parameters in KEEL format */ public Cogin(String paramfile){ config_read(paramfile); Randomize.setSeed(seed); IS = new InstanceSet(); ISval = new InstanceSet(); IStest = new InstanceSet(); try { IS.readSet(input_train_name, true); ISval.readSet(input_validation_name, false); IStest.readSet(input_test_name, false); } catch (Exception e) { System.out.println("Dataset exception = " + e); e.printStackTrace(); System.exit(-1); } // coverInstance = new Chromosome[IS.getNumInstances()]; Attribute a =Attributes.getOutputAttribute(0); if(a.getType() == Attribute.NOMINAL) numClasses = a.getNumNominalValues(); else numClasses =(int)( a.getMaxAttribute() - a.getMinAttribute()); poblation = new ArrayList<Chromosome>(); } /** * Read the pattern file, and parse data into strings * @param fileParam the file with the parameters */ private void config_read(String fileParam) { File inputFile = new File(fileParam); if (inputFile == null || !inputFile.exists()) { System.out.println("parameter " + fileParam + " file doesn't exists!"); System.exit(-1); } // begin the configuration read from file try { FileReader file_reader = new FileReader(inputFile); BufferedReader buf_reader = new BufferedReader(file_reader); // FileWriter file_write = new FileWriter(outputFile); String line; do { line = buf_reader.readLine(); } while (line.length() == 0); // avoid empty lines for processing // -> // produce exec failure String out[] = line.split("algorithm = "); // alg_name = new String(out[1]); //catch the algorithm name // input & output filenames do { line = buf_reader.readLine(); } while (line.length() == 0); out = line.split("inputData = "); out = out[1].split("\\s\""); input_train_name = new String(out[0].substring(1,out[0].length() - 1)); input_validation_name = new String(out[1].substring(0,out[2].length() - 1)); input_test_name = new String(out[2].substring(0,out[2].length() - 1)); if (input_validation_name.charAt(input_validation_name.length() - 1) == '"') input_validation_name = input_validation_name.substring(0, input_validation_name .length() - 1); if (input_test_name.charAt(input_test_name.length() - 1) == '"') input_test_name = input_test_name.substring(0, input_test_name .length() - 1); do { line = buf_reader.readLine(); } while (line.length() == 0); out = line.split("outputData = "); out = out[1].split("\\s\""); output_train_name = new String(out[0].substring(1, out[0].length() - 1)); output_test_name = new String(out[1].substring(0, out[1].length() - 1)); method_output = new String(out[2].substring(0,out[2].length() - 1)); method_output = method_output.trim(); if (method_output.charAt(method_output.length() - 1) == '"') method_output = method_output.substring(0, method_output.length() - 1); if (output_test_name.charAt(output_test_name.length() - 1) == '"') output_test_name = output_test_name.substring(0, output_test_name.length() - 1); // parameters do { line = buf_reader.readLine(); } while (line.length() == 0); out = line.split("seed = "); seed = (new Long(out[1])).longValue(); do { line = buf_reader.readLine(); } while (line.length() == 0); out = line.split("missclassificationErrorLevel = "); missclassificationErrorLevel = (new Integer(out[1])).intValue(); // parse the string into do { line = buf_reader.readLine(); } while (line.length() == 0); out = line.split("generationLimit = "); generationLimit = (new Integer(out[1])).intValue(); // parse the string into do { line = buf_reader.readLine(); } while (line.length() == 0); out = line.split("CrossoverRate = "); crossoverRate = (new Double(out[1])).doubleValue(); // parse the string into do { line = buf_reader.readLine(); } while (line.length() == 0); out = line.split("useNegationBit = "); useNegationBit = (out[1].compareTo("Yes")==0); // parse the string into file_reader.close(); } catch (IOException e) { System.out.println("IO exception = " + e); e.printStackTrace(); System.exit(-1); } } protected void initialize(){ int coveredExamples = 0; double rnd; int previousCovered; Chromosome rule = new Chromosome(Attributes.getInputNumAttributes()); Gene gen; Instance inst; ArrayList<Instance> uncovered = new ArrayList<Instance>(); boolean allCovered = false; for(int i=0;i<IS.getNumInstances();i++){ inst = IS.getInstance(i); uncovered.add(i, inst); } System.out.println("Covered examples at initialization:"); //first, we try to generate a completely random rule while(coveredExamples<IS.getNumInstances() || poblation.size() < 10){ previousCovered = coveredExamples; for(int i=0;i<rule.getNumGenes();i++){ gen = new Gene(Attributes.getInputAttribute(i)); for(int j=0;j<gen.getNumBits();j++){ rnd = Randomize.Rand(); if(rnd<0.5) gen.setBit(j, '#'); else if(rnd<0.75) gen.setBit(j, '0'); else gen.setBit(j, '1'); } if(Randomize.Rand()<0.1 && useNegationBit) gen.setNegation(1); rule.setGene(i, gen); } rule.setClass(Randomize.Randint(0, numClasses)); //now, we search the uncovered examples that this new rule covers for(int i=0;i<uncovered.size();i++){ inst = uncovered.get(i); if(rule.covers(inst)){ coveredExamples++; uncovered.remove(i); i--; } } if(coveredExamples > previousCovered || allCovered){ poblation.add(rule); System.out.print(coveredExamples+"/"+uncovered.size()+" -> "); } else{ inst = uncovered.get(0); rule.makeCover(inst); for(int i=0;i<uncovered.size();i++){ inst = uncovered.get(i); if(rule.covers(inst)){ coveredExamples++; uncovered.remove(i); i--; } } if(coveredExamples > previousCovered || allCovered){ poblation.add(rule); System.out.print(coveredExamples+"/"+uncovered.size()+" -> "); } } rule = new Chromosome(Attributes.getInputNumAttributes()); if(coveredExamples == IS.getNumInstances()) allCovered = true; } System.out.println("end"); } protected void randomSelection(){ int p1,p2,pos,tmp; int baraje[] = new int[poblation.size()]; int baraje2[] = new int[poblation.size()]; for (int i = 0; i < poblation.size(); i++){ baraje[i] = i; baraje2[i] = i; } for (int i = 0; i < poblation.size(); i++) { pos = Randomize.Randint(i, poblation.size()); tmp = baraje[i]; baraje[i] = baraje[pos]; baraje[pos] = tmp; do{ pos = Randomize.Randint(i, poblation.size()); if(pos>0 && baraje2[pos] == baraje[i] && baraje2[pos-1] != baraje[i]) pos--; if(pos==0 && baraje2[pos] == baraje[i] && baraje2[pos+1] != baraje[i]) pos++; }while(baraje2[pos] == baraje[i]); tmp = baraje2[i]; baraje2[i] = baraje2[pos]; baraje2[pos] = tmp; } offspring = new ArrayList<Chromosome>(); for(int i=0;i<(int)(poblation.size()*crossoverRate);i+=2){ p1 = baraje[i]; p2 = baraje2[i]; while(p1==p2){ pos = Randomize.Randint(i, poblation.size()); tmp = baraje2[i]; baraje2[i] = baraje2[pos]; baraje2[pos] = tmp; p2 = baraje2[i]; } offspring.add(new Chromosome(poblation.get(p1))); offspring.add(new Chromosome(poblation.get(p2))); } } protected void onePointCrossover(){ Chromosome parent1,parent2; Instance inst; int max1,max2; int voted1[] = new int[numClasses]; int voted2[] = new int[numClasses]; for(int i=0;i<offspring.size();i+=2){ parent1 = offspring.get(i); parent2 = offspring.get(i+1); parent1.swapOnePoint(parent2); //now apply the ex-post assignment for(int j=0;j<numClasses;j++){ voted1[j] = 0; voted2[j] = 0; } for(int j=0;j<IS.getNumInstances();j++){ inst = IS.getInstance(j); if(parent1.covers(inst)) voted1[(int)inst.getAllOutputValues()[0]]++; if(parent2.covers(inst)) voted2[(int)inst.getAllOutputValues()[0]]++; } max1 = max2 = 0; for(int j=1;j<numClasses;j++){ if(voted1[j] > voted1[max1]) max1 = j; if(voted1[j] == voted1[max1] && Randomize.Rand()<0.5) max1 = j; if(voted2[j] > voted2[max2]) max2 = j; if(voted2[j] == voted2[max2] && Randomize.Rand()<0.5) max2 = j; } parent1.setClass(max1); parent1.setCoveredInstances(max1); parent2.setClass(max2); parent2.setCoveredInstances(max2); } } protected void evaluate(){ Chromosome rule; int missclassified,matched,output; int matchedClasses[]; int unmatchedClasses[]; double info,lex,Hm,Hunm; Instance inst; boolean match; //first, compute the dataset's entropy if needed -it won't change...- if(datasetEntropy==-1){ int nInstancesPerClass[] = new int[numClasses]; for(int i=0;i<IS.getNumInstances();i++){ nInstancesPerClass[(int)IS.getInstance(i).getAllOutputValues()[0]]++; } datasetEntropy = 0; for(int i=0;i<numClasses;i++){ if(nInstancesPerClass[i]!=0) datasetEntropy -= ((double)nInstancesPerClass[i]/IS.getNumInstances())*Math.log((double)nInstancesPerClass[i]/IS.getNumInstances()); } } //compute the fitness for the new candidate rules for(int i=0;i<offspring.size();i++){ rule = offspring.get(i); missclassified = 0; matched = 0; matchedClasses = new int[numClasses]; unmatchedClasses = new int[numClasses]; for(int j=0;j<IS.getNumInstances();j++){ inst = IS.getInstance(j); output = (int)inst.getAllOutputValues()[0]; match = rule.covers(inst); if(match){ matched++; matchedClasses[output]++; if(output != rule.classify(inst)) missclassified++; } else{ unmatchedClasses[output]++; } } //the Info(R) part... Hm = Hunm =0; for(int j=0;j<numClasses;j++){ if(matchedClasses[j]!=0) Hm += -(((double)matchedClasses[j]/IS.getNumInstances())*Math.log((double)matchedClasses[j]/IS.getNumInstances())); } info = Hm * matched; for(int j=0;j<numClasses;j++){ if(unmatchedClasses[j]!=0) Hunm += -(((double)unmatchedClasses[j]/IS.getNumInstances())*Math.log((double)unmatchedClasses[j]/IS.getNumInstances())); } info += Hunm * (IS.getNumInstances()-matched); info = info / IS.getNumInstances(); info = datasetEntropy - info; //the Lex(R) part lex =(double) IS.getNumInstances()/missclassificationErrorLevel; lex = lex - (double)missclassified/missclassificationErrorLevel; //set the fitness rule.setFitness(info+lex); } } protected void competitiveReplacement(){ ArrayList<Chromosome> coverage = new ArrayList<Chromosome>(); ArrayList<Instance> filter = new ArrayList<Instance>(); Instance inst; Chromosome rule; boolean itCovers[],match; coverage.addAll(poblation); coverage.addAll(offspring); Collections.sort(coverage,Collections.reverseOrder()); itCovers = new boolean[coverage.size()]; for(int j=0;j<coverage.size();j++){ itCovers[j] = false; } for(int i=0;i<IS.getNumInstances();i++){ filter.add(IS.getInstance(i)); } for(int i=0;i<IS.getNumInstances();i++){ inst = IS.getInstance(i); match = false; for(int j=0;j<coverage.size() && !match;j++){ rule = coverage.get(j); match = rule.covers(inst); if(match){ filter.remove(inst); itCovers[j] = true; } } } for(int j=0,i=0;j<coverage.size();j++,i++){ if(!itCovers[i]){ coverage.remove(j); j--; } } poblation = coverage; Collections.sort(poblation,Collections.reverseOrder()); } /** * <p> * Process the training and test files provided in the parameters file to the constructor. * </p> */ public void run(){ int gen = 0; Attribute a; double bestCR,CR; String instanciasIN[] = new String[IS.getNumInstances()]; String instanciasOUT[] = new String[IS.getNumInstances()]; for(int i=0;i<Attributes.getInputNumAttributes();i++){ a = Attributes.getInputAttribute(i); if(a.getType() == Attribute.REAL){ System.err.println("COGIN works with discrete values. Please discretize first."); System.exit(-1); } } initialize(); offspring = poblation; //trick to allow the evaluate() function compute the fitness //of the actual poblation instead of the nonexistent offspring evaluate(); offspring = null; bestCR = classify(IS, instanciasIN, instanciasOUT); while(gen < generationLimit){ randomSelection(); onePointCrossover(); evaluate(); competitiveReplacement(); CR = classify(IS, instanciasIN, instanciasOUT); if(bestPob == null || bestCR < CR || (bestCR == CR && poblation.size()<bestPob.size())){ bestCR = CR; bestPob = new ArrayList<Chromosome>(); bestPob.addAll(poblation); System.out.print("\ngeneration "+gen+ " BestCR: "+bestCR); if((gen+1)%10==0) System.out.println(); } gen++; if(gen%10==0) System.out.print("."); } System.out.println("\nbestCR: "+bestCR); instanciasIN = new String[ISval.getNumInstances()]; instanciasOUT = new String[ISval.getNumInstances()]; classify(ISval, instanciasIN, instanciasOUT); writeOutput(output_train_name, instanciasIN, instanciasOUT, Attributes.getInputAttributes(), Attributes.getOutputAttributes()[0], Attributes.getInputNumAttributes(), Attributes.getRelationName()); instanciasIN = new String[IStest.getNumInstances()]; instanciasOUT = new String[IStest.getNumInstances()]; poblation = bestPob; System.out.println("Test CR: "+classify(IStest,instanciasIN,instanciasOUT)); writeOutput(output_test_name, instanciasIN, instanciasOUT, Attributes.getInputAttributes(), Attributes.getOutputAttributes()[0], Attributes.getInputNumAttributes(), Attributes.getRelationName()); // write the obtained rules to disk printRules(); } /** * Writes the output in KEEL format * @param fileName output file * @param instancesIN output from instances of the input data set * @param instancesOUT class of classified instances * @param inputs the input attributes * @param output the output attribute * @param nInputs number of input attributes * @param relation data set name */ public static void writeOutput(String fileName, String instancesIN[], String instancesOUT[], Attribute inputs[], Attribute output, int nInputs, String relation) { String cadena = ""; int i, j, k; int aux; /* Printing input attributes */ cadena += "@relation " + relation + "\n"; for (i = 0; i < nInputs; i++) { cadena += "@attribute " + inputs[i].getName() + " "; if (inputs[i].getType() == Attribute.NOMINAL) { cadena += "{"; for (j = 0; j < inputs[i].getNominalValuesList().size(); j++) { cadena += (String) inputs[i].getNominalValuesList() .elementAt(j); if (j < inputs[i].getNominalValuesList().size() - 1) { cadena += ", "; } } cadena += "}\n"; } else { if (inputs[i].getType() == Attribute.INTEGER) { cadena += "integer"; cadena += " [" + String.valueOf((int) inputs[i] .getMinAttribute()) + ", " + String.valueOf((int) inputs[i] .getMaxAttribute()) + "]\n"; } else { cadena += "real"; cadena += " [" + String.valueOf(inputs[i].getMinAttribute()) + ", " + String.valueOf(inputs[i].getMaxAttribute()) + "]\n"; } } } /* Printing output attribute */ cadena += "@attribute " + output.getName() + " "; if (output.getType() == Attribute.NOMINAL) { cadena += "{"; for (j = 0; j < output.getNominalValuesList().size(); j++) { cadena += (String) output.getNominalValuesList().elementAt(j); if (j < output.getNominalValuesList().size() - 1) { cadena += ", "; } } cadena += "}\n"; } else { cadena += "integer [" + String.valueOf((int) output.getMinAttribute()) + ", " + String.valueOf((int) output.getMaxAttribute()) + "]\n"; } /* Printing the data */ cadena += "@data\n"; Files.writeFile(fileName, cadena); cadena = ""; for (i = 0; i < instancesIN.length; i++) { cadena += instancesIN[i] + " " + instancesOUT[i]; cadena += "\n"; } Files.addToFile(fileName, cadena); } protected double classify(InstanceSet ISet, String instanciasIN[], String instanciasOUT[]){ Instance inst; Chromosome rule; double input[]; int output,predict; int tp; boolean match; //ordered list -IF_THEN_ELSE scheme- tp = 0; Attribute a = Attributes.getOutputAttribute(0); int tipo = a.getType(); for(int i=0;i<ISet.getNumInstances();i++){ inst = ISet.getInstance(i); input = inst.getAllInputValues(); output = (int)inst.getAllOutputValues()[0]; if(tipo!=Attribute.NOMINAL){ instanciasIN[i] = String.valueOf(output); } else{ instanciasIN[i] = new String(inst.getOutputNominalValues(0)); } match = false; for(int j=0;j<poblation.size() && !match;j++){ rule = poblation.get(j); match = rule.covers(inst); if(match){ predict = rule.classify(inst); if(tipo!=Attribute.NOMINAL){ instanciasOUT[i] = String.valueOf(predict); }else{ instanciasOUT[i] = new String(a.getNominalValue(predict)); } if(predict==output) tp++; } } if(instanciasOUT[i]==null) instanciasOUT[i] = "?"; } return (double)tp/ISet.getNumInstances(); } protected void printRules(){ String cad = new String(); ArrayList<Integer> conds; Chromosome rule; int value; Attribute a; for(int i=0;i<poblation.size();i++){ cad = cad + "IF "; rule = poblation.get(i); for(int j=0;j<rule.getNumGenes();j++){ cad += "( "; conds = rule.getGene(j).bin2nominal(); Collections.sort(conds); a = Attributes.getAttribute(j); cad += a.getName()+ " is "; for(int k=0;k<conds.size();k++){ value = conds.get(k).intValue(); if(a.getType() == Attribute.NOMINAL && value<a.getNumNominalValues()){ if(k!=0) cad += "OR "; cad += a.getNominalValue(value)+ " "; } else if (value <= a.getMaxAttribute() && value>=a.getMinAttribute() ){ if(k!=0) cad += "OR "; cad += value + " "; } } cad += ")"; if(j<rule.getNumGenes()-1) cad += " AND "; } if(Attributes.getOutputAttribute(0).getType() == Attribute.NOMINAL) cad += " THEN " + Attributes.getOutputAttribute(0).getNominalValue(rule.getClas()); else cad += " THEN " + rule.getClas(); cad += "\n"; if(i<poblation.size()-1) cad += "ELSE "; } Files.writeFile(method_output, cad); } }