/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

/**
 * <p>
 * @author Written by Alejandro Tortosa (University of Granada) 15/10/2008
 * @author Modified by Xavi Solé (La Salle, Ramón Llull University - Barcelona) 03/12/2008
 * @version 1.4
 * @since JDK1.2
 * </p>
 */

package keel.Algorithms.Rule_Learning.Ripper;

import keel.Dataset.*;
import java.io.IOException;
import org.core.*;

/**
 * Implementation of the classification algorithm Ripper, according to the
 * paper [Cohen95] and Weka's implementation.
 */
public class Ripper {

    public static int W = 1; //'Worth' metric
    public static int A = 2; //'Accuracy' metric

    MyDataset train, val, test; //the datasets for training, validation and test
    String outputTr, outputTst, outputRules; //the names for the output files
    Randomize rand; //random number generator
    double pct; //ratio of growing/pruning instances
    int K; //number of optimization iterations

    private boolean somethingWrong = false; //to check if everything is correct

    /**
     * It reads the data from the input files (training, validation and test)
     * and parses all the parameters from the parameters array.
     * @param parameters parseParameters It contains the input files, output files and parameters
     */
    public Ripper(parseParameters parameters) {

        train = new MyDataset();
        val = new MyDataset();
        test = new MyDataset();
        try {
            System.out.println("\nReading the training set: " + parameters.getTrainingInputFile());
            train.readClassificationSet(parameters.getTrainingInputFile(), true);
            System.out.println("\nReading the validation set: " + parameters.getValidationInputFile());
            val.readClassificationSet(parameters.getValidationInputFile(), false);
            System.out.println("\nReading the test set: " + parameters.getTestInputFile());
            test.readClassificationSet(parameters.getTestInputFile(), false);
        }
        catch (IOException e) {
            System.err.println("There was a problem while reading the input data-sets: " + e);
            somethingWrong = true;
        }

        //We may check if there are some numerical attributes, because our algorithm may not handle them:
        //somethingWrong = somethingWrong || train.hasNumericalAttributes();
        //somethingWrong = somethingWrong || train.hasMissingAttributes();

        outputTr = parameters.getTrainingOutputFile();
        outputTst = parameters.getTestOutputFile();
        if (parameters.getNOutputFiles() == 0) {
            System.err.println("No output file was specified for the rules.");
            System.err.println("Using default name: rules-out.txt");
            outputRules = "rules-out.txt";
        }
        else
            outputRules = parameters.getOutputFile(0);

        long seed = Long.parseLong(parameters.getParameter(0));
        pct = Double.parseDouble(parameters.getParameter(1));
        K = Integer.parseInt(parameters.getParameter(2));

        rand = new Randomize();
        rand.setSeed(seed);
    }

    /**
     * It launches the algorithm.
     */
    public void execute() {
        if (somethingWrong) { //We do not execute the program
            System.err.println("An error was found, the data-set has numerical values.");
            System.err.println("Aborting the program");
            //We should not use the statement: System.exit(-1);
        }
        else {
            //We do here the algorithm's operations
            Ruleset[] rulesets = this.ripperMulticlass(train);

            //Classifies the datasets' entries according to the generated rulesets
            String[] classification_train = train.classify(rulesets, rulesets.length);
            String[] classification_val = val.classify(rulesets, rulesets.length);
            String[] classification_test = test.classify(rulesets, rulesets.length);

            //Finally we should fill the training and test output files
            doOutput(this.val, this.outputTr, classification_val);
            doOutput(this.test, this.outputTst, classification_test);
            doRulesOutput2(this.outputRules, rulesets);

            System.out.println("Algorithm Finished");
        }
    }
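    /*
     * Usage sketch (illustrative, not part of the original sources): KEEL
     * algorithms are normally driven from a separate Main class that reads a
     * configuration file and hands it to the algorithm. Assuming KEEL's
     * parseParameters reader and a config file name in configFileName, the
     * wiring looks roughly like this:
     *
     *   parseParameters parameters = new parseParameters();
     *   parameters.parseConfigurationFile(configFileName);
     *   Ripper ripper = new Ripper(parameters);
     *   ripper.execute();
     */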
    /**
     * It generates the output file from a given dataset and stores it in a file.
     * @param dataset MyDataset input dataset
     * @param filename String the name of the file
     * @param classification String[] generated classification of the dataset
     */
    private void doOutput(MyDataset dataset, String filename, String[] classification) {
        String output = new String("");
        output = dataset.copyHeader(); //we insert the header in the output file
        //We write the output for each example
        for (int i = 0; i < dataset.getnData(); i++) {
            output += dataset.getOutputAsString(i) + " " + classification[i] + "\n";
        }
        Fichero.escribeFichero(filename, output);
    }

    /**
     * It generates the output rules file from a given ruleset and stores it in a file.
     * @param filename String the name of the file
     * @param rulesets Ruleset[] the rulesets (one for each class)
     */
    private void doRulesOutput(String filename, Ruleset[] rulesets) {
        String output = new String("");

        for (int i = 0; i < rulesets.length - 1; i++) {
            output += "if(";
            for (int j = 0; j < rulesets[i].size(); j++) {
                Rule current = rulesets[i].getRule(j);
                output += "(";
                for (int k = 0; k < current.size(); k++) {
                    output += current.getSimpleRule(k);
                    if (k != current.size() - 1)
                        output += " && ";
                }
                output += ")";
                if (j != rulesets[i].size() - 1)
                    output += " || ";
            }
            output += ")\n\t";
            output += "output=" + rulesets[i].getType() + "\nelse ";
        }
        output += "\n\toutput=" + rulesets[rulesets.length - 1].getType();

        Fichero.escribeFichero(filename, output);
    }

    /**
     * It generates the output rules file from a given ruleset and stores it in a file,
     * annotating each rule with its coverage statistics over the training set.
     * @param filename String the name of the file
     * @param rulesets Ruleset[] the rulesets (one for each class)
     */
    private void doRulesOutput2(String filename, Ruleset[] rulesets) {
        String output = new String("");
        int rules = 0;

        for (int i = 0; i < rulesets.length; i++) {
            rules += rulesets[i].size();
        }
        output += "@Number of Rules: " + rules + "\n";
        for (int i = 0; i < rulesets.length - 1; i++) {
            Mask class_filter = new Mask(train.size());
            train.filterByClass(class_filter, rulesets[i].getType());
            for (int j = 0; j < rulesets[i].size(); j++) {
                output += "if(";
                Rule current = rulesets[i].getRule(j);
                for (int k = 0; k < current.size(); k++) {
                    output += current.getSimpleRule(k);
                    if (k != current.size() - 1)
                        output += " && ";
                }
                int covered = current.apply(train);
                int accuracy = current.apply(train, class_filter);
                output += ") (" + accuracy + "/" + covered + ")\n\t";
                output += "output=" + rulesets[i].getType() + "\nelse ";
            }
        }
        output += "\n\toutput=" + rulesets[rulesets.length - 1].getType();

        Fichero.escribeFichero(filename, output);
    }
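    /*
     * Sample of the file emitted by doRulesOutput2 above (illustrative, with
     * hypothetical conditions and counts for a two-class problem): each rule
     * carries its (correctly covered / covered) counts over the training set,
     * and the last, most frequent class acts as the default:
     *
     *   @Number of Rules: 3
     *   if(a1<=2.5 && a3=red) (14/15)
     *       output=positive
     *   else if(a2>7.1) (9/12)
     *       output=positive
     *   else
     *       output=negative
     */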
    /**
     * It grows a rule maximizing the following heuristic:
     * h = p*(log(p/t)-log(P/T))
     * p/t: number of positive/total instances covered by the current rule
     * P/T: number of positive/total instances
     * @param data MyDataset the dataset
     * @param positives Mask active positive entries
     * @param negatives Mask active negative entries
     * @return the grown rule
     */
    public Rule grow(MyDataset data, Mask positives, Mask negatives) {
        return grow(new Rule(), data, positives, negatives);
    }

    /**
     * It expands a rule, greedily adding simple rules, maximizing the following heuristic:
     * h = p*(log(p/t)-log(P/T))
     * p/t: number of positive/total instances covered by the current rule
     * P/T: number of positive/total instances
     * @param rule Rule the base rule
     * @param data MyDataset the dataset
     * @param grow_pos Mask active positive entries
     * @param grow_neg Mask active negative entries
     * @return the grown rule
     */
    public Rule grow(Rule rule, MyDataset data, Mask grow_pos, Mask grow_neg) {

        double best_v = 0, best_h = -Double.MAX_VALUE;

        if (grow_pos.getnActive() <= 0)
            return new Rule(); //degenerate case: no positive instances to cover

        Mask positives = grow_pos.copy();
        Mask negatives = grow_neg.copy();

        int[] attributes = new int[data.getnInputs()];
        int nattributes = attributes.length;
        for (int i = 0; i < attributes.length; i++) {
            attributes[i] = i;
        }

        if (rule.size() > 0) {
            //Elimination of the attributes already used by the rule
            int[] aux = new int[data.getnInputs()];
            for (int i = 0; i < rule.size(); i++) {
                attributes[rule.getSimpleRule(i).getAttribute()] = -1;
            }
            int j = 0;
            for (int i = 0; i < nattributes; i++) {
                if (attributes[i] != -1) {
                    aux[j] = attributes[i];
                    j++;
                }
            }
            attributes = aux;
            nattributes = j;
            data.filter(positives, rule);
            data.filter(negatives, rule);
        }

        while (negatives.getnActive() > 0 && nattributes > 0 && positives.getnActive() > 0) {

            int A = -1, P = -1; //A->best attribute, P->relative position inside attributes
            double V = 0, best_global = -Double.MAX_VALUE;
            int Op = -1;
            double C = Utilities.log2(positives.getnActive()
                / ((double) (positives.getnActive() + negatives.getnActive())));

            for (int i = 0; i < nattributes; i++) {
                int ai = attributes[i];

                //1. Gather per-value positive/negative counts for this attribute
                Score score = new Score();
                positives.resetIndex();
                while (positives.next()) {
                    if (!data.isMissing(positives, ai)) {
                        double[] exemple = data.getExample(positives);
                        int pos = score.findKey(exemple[ai]);
                        if (pos != -1)
                            score.addPositive(pos);
                        else
                            score.addKey(exemple[ai], Score.POSITIVE);
                    }
                }
                negatives.resetIndex();
                while (negatives.next()) {
                    if (!data.isMissing(negatives, ai)) {
                        double[] exemple = data.getExample(negatives);
                        int pos = score.findKey(exemple[ai]);
                        if (pos != -1)
                            score.addNegative(pos);
                        else
                            score.addKey(exemple[ai], Score.NEGATIVE);
                    }
                }

                //First, find the best value for the current attribute
                best_v = 0;
                best_h = -Double.MAX_VALUE;
                int best_operator = -1;
                if (Attributes.getInputAttribute(ai).getType() == Attribute.NOMINAL) {
                    for (int j = 0; j < score.size(); j++) {
                        double h = score.getPositive(j)
                            * (Utilities.log2(score.getPositive(j) / ((double) score.getTotal(j))) - C);
                        if (h > best_h) {
                            best_h = h;
                            best_v = score.getKey(j);
                            best_operator = Rule.EQUAL;
                        }
                    }
                }
                else {
                    score.sort();
                    int total_pos = positives.getnActive(), total_neg = negatives.getnActive();
                    //Evaluating the first element as cutting point with operator <=
                    int count_pos = 0;
                    int count_neg = 0;
                    if (score.size() == 1 && score.getPositive(0) != 0) {
                        best_h = count_pos * (Utilities.log2(score.getPositive(0)
                            / ((double) score.getNegative(0) + score.getPositive(0))) - C);
                        best_v = score.getKey(0);
                        best_operator = Rule.EQUAL;
                    }
                    else if (score.size() == 1) {
                        best_h = -Double.MAX_VALUE;
                        best_v = score.getKey(0);
                        best_operator = Rule.EQUAL;
                    }
                    else
                        best_h = -Double.MAX_VALUE;

                    for (int j = 0; j < score.size() - 1; j++) {
                        //Evaluating the j-th element as cutting point with <=
                        count_pos += score.getPositive(j);
                        count_neg += score.getNegative(j);
                        double h_lower;
                        if (count_pos != 0)
                            h_lower = count_pos
                                * (Utilities.log2(count_pos / ((double) count_neg + count_pos)) - C);
                        else
                            h_lower = -Double.MAX_VALUE;

                        //Evaluating the j-th element as cutting point with >
                        int count_pos_g = total_pos - count_pos;
                        int count_neg_g = total_neg - count_neg;
                        double h_greater;
                        if (count_pos_g != 0)
                            h_greater = count_pos_g
                                * (Utilities.log2(count_pos_g / ((double) count_neg_g + count_pos_g)) - C);
                        else
                            h_greater = -Double.MAX_VALUE;

                        //Comparing with the best so far
                        if (h_lower > h_greater && h_lower > best_h) {
                            best_h = h_lower;
                            best_v = score.getKey(j);
                            best_operator = Rule.LOWER;
                        }
                        else if (h_greater > best_h) {
                            best_h = h_greater;
                            best_v = score.getKey(j);
                            best_operator = Rule.GREATER;
                        }
                    }
                }

                //Later, test whether it is the best (attribute, value) couple so far
                if (best_h > best_global) {
                    P = i;
                    A = ai;
                    V = best_v;
                    Op = best_operator;
                    best_global = best_h;
                }
            }

            //2. Add to the rule the couple (A,V)
            //Julian - If no attribute could be found, do not add the couple.
            //I really don't know if this assumption is correct, but it allows
            //the program to finish, so...
            if (A != -1) {
                rule.grow(A, V, Op);
                data.filter(positives, A, V, Op);
                data.filter(negatives, A, V, Op);
                attributes[P] = attributes[nattributes - 1];
            }
            nattributes--;
        }

        return rule;
    }
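    /**
     * Illustrative helper, not part of the original algorithm flow: it makes
     * the growing heuristic above explicit. h is FOIL-style information gain,
     * h = p * (log2(p/t) - log2(P/T)). For example, with P = 50 positives
     * among T = 100 instances, a condition covering p = 20 positives out of
     * t = 25 instances scores 20 * (log2(0.8) - log2(0.5)) ≈ 13.56 bits.
     */
    private static double growHeuristic(double p, double t, double P, double T) {
        if (p <= 0)
            return -Double.MAX_VALUE; //a condition covering no positives is worthless
        //Math.log is the natural log; divide by log(2) to obtain base-2 logarithms
        return p * (Math.log(p / t) / Math.log(2) - Math.log(P / T) / Math.log(2));
    }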
    /**
     * It prunes a rule, according to one of two heuristics:
     * W = (p+1)/(t+2)
     * A = (p+n')/T
     * p/t: number of positive/total instances covered by the current rule
     * n': number of negative instances not covered by the current rule (true negatives)
     * T: number of total instances
     * @param rule Rule the rule to prune
     * @param data MyDataset the dataset
     * @param positives Mask active positive entries
     * @param negatives Mask active negative entries
     * @param metric int heuristic's selector (A or W)
     * @return the pruned rule
     */
    public Rule prune(Rule rule, MyDataset data, Mask positives, Mask negatives, int metric) {

        double p, t, T, n, n_prime;
        double h, next_h = 0.0;

        p = rule.apply(data, positives);
        T = positives.getnActive() + negatives.getnActive();
        n = rule.apply(data, negatives);
        n_prime = negatives.getnActive() - n;
        if (metric == A) {
            next_h = (p + n_prime) / T;
        }
        if (metric == W) {
            t = p + n;
            next_h = (p + 1) / (t + 2);
        }
        do {
            h = next_h;
            //Evaluate the rule without its last antecedent
            p = rule.apply(data, positives, rule.size() - 1);
            T = positives.getnActive() + negatives.getnActive();
            n = rule.apply(data, negatives, rule.size() - 1);
            n_prime = negatives.getnActive() - n;
            if (metric == A) {
                next_h = (p + n_prime) / T;
            }
            if (metric == W) {
                t = p + n;
                next_h = (p + 1) / (t + 2);
            }
            //Drop the last antecedent while doing so improves the metric
            if (h < next_h && rule.size() > 1) {
                rule.prune(rule.size() - 1);
            }
        } while (h < next_h && rule.size() > 1);

        return rule;
    }
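    /**
     * Illustrative helper, not part of the original algorithm flow: the two
     * pruning metrics above in one place. Example: with p = 18 covered
     * positives, n = 4 covered negatives and n' = 30 uncovered negatives out
     * of T = 60 instances, W = (18+1)/(22+2) ≈ 0.792 and A = (18+30)/60 = 0.8.
     */
    private static double pruneMetric(double p, double n, double nPrime, double T, int metric) {
        if (metric == A)
            return (p + nPrime) / T; //accuracy over the whole pruning set
        return (p + 1) / (p + n + 2); //W: Laplace-corrected precision of the rule
    }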
    /**
     * It implements the algorithm Ripperk itself:
     * 1. In each iteration, it takes the class with the fewest instances in the dataset and
     * splits the dataset into positive (those of the taken class) and negative (the rest) instances.
     * 2. Then it invokes Ripper2 to generate a Ruleset for the taken class.
     * 3. Finally, it removes the instances of the taken class from the working set and
     * carries on with the next iteration.
     * @param data MyDataset the dataset
     * @return a vector with a Ruleset for each class
     */
    public Ruleset[] ripperMulticlass(MyDataset data) {

        Ruleset[] rules = new Ruleset[data.getnClasses()];

        //Sort the classes by increasing number of instances
        Pair[] ordered_classes = new Pair[data.getnClasses()];
        for (int i = 0; i < data.getnClasses(); i++) {
            ordered_classes[i] = new Pair();
            ordered_classes[i].key = i;
            ordered_classes[i].value = data.numberInstances(i);
        }
        Utilities.mergeSort(ordered_classes, data.getnClasses());

        Mask positives, negatives;
        Mask base = new Mask(data.size());
        for (int i = 0; i < data.getnClasses() - 1; i++) {
            String target_class = Attributes.getOutputAttribute(0).getNominalValue(ordered_classes[i].key);
            positives = base.copy();
            data.filterByClass(positives, target_class);
            negatives = base.and(positives.complement());
            rules[i] = ripperK(data, positives, negatives);
            rules[i].setType(target_class);
            base = negatives.copy();
        }

        //The most frequent class becomes the default rule
        rules[rules.length - 1] = new Ruleset();
        rules[rules.length - 1].addRule(new Rule());
        rules[rules.length - 1].setType(Attributes.getOutputAttribute(0)
            .getNominalValue(ordered_classes[data.getnClasses() - 1].key));

        return rules;
    }
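    /*
     * Worked example of the decomposition above (illustrative, with
     * hypothetical class counts): given classes {A:10, B:30, C:60}, a ruleset
     * is learned first for A (positives = class A, negatives = B and C), then
     * for B (positives = B, negatives = C, since all class-A instances have
     * already been excluded from the working mask), and C, the most frequent
     * class, is left as the default with a single empty rule.
     */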
    /**
     * It implements the Ripper2's Build Phase:
     * Iteratively, it grows and prunes rules until the description length (DL) of the ruleset
     * and examples is 64 bits greater than the smallest DL met so far, or there are
     * no positive examples, or the error rate >= 50%.
     * The prune metric used here is W.
     * @param rules Ruleset the rules generated so far
     * @param data MyDataset the dataset
     * @param pos Mask active positive entries of data
     * @param neg Mask active negative entries of data
     * @return rules with the new grown & pruned rules
     */
    public Ruleset IREPstar(Ruleset rules, MyDataset data, Mask pos, Mask neg) {

        double error_ratio, smallest_mdl, new_mdl;
        smallest_mdl = Double.MAX_VALUE - 64.0;
        new_mdl = Double.MAX_VALUE;
        Mask positives = pos.copy(), negatives = neg.copy();

        do {
            //Splitting of the two datasets into two prune datasets and two grow datasets
            Mask[] gp_pos = positives.split(pct, rand);
            Mask grow_pos = gp_pos[0], prune_pos = gp_pos[1];
            Mask[] gp_neg = negatives.split(pct, rand);
            Mask grow_neg = gp_neg[0], prune_neg = gp_neg[1];

            //Grow & Prune
            Rule new_rule = grow(data, grow_pos, grow_neg);
            System.out.println("Grown rule\n" + new_rule);
            prune(new_rule, data, prune_pos, prune_neg, Ripper.W);
            System.out.println("Pruned rule\n" + new_rule);

            //Estimation of the error ratio
            rules.addRule(new_rule);
            //double errors = new_rule.apply(data, prune_neg);
            //error_ratio = errors / prune_neg.getnActive();
            new_mdl = rules.getMDL(data, positives, negatives);
            if (new_mdl <= smallest_mdl + 64) {
                System.out.println("Rule added\n" + new_rule);
                data.substract(positives, new_rule);
                data.substract(negatives, new_rule);
                if (new_mdl < smallest_mdl)
                    smallest_mdl = new_mdl;
            }
            else
                rules.removeRule(rules.size() - 1);
        } while (positives.getnActive() > 0 && new_mdl <= smallest_mdl + 64);

        return rules;
    }

    /**
     * It implements the Ripper2's Optimization Phase:
     * After generating the initial ruleset {Ri}, it generates and prunes two variants of each
     * rule Ri from randomized data using the grow and prune method. One variant is generated
     * from an empty rule, while the other is generated by greedily adding antecedents to the
     * original rule. Moreover, the pruning metric used here is A. Then the smallest possible
     * DL for each variant and the original rule is computed. The variant with the minimal DL
     * is selected as the final representative of Ri in the ruleset. [WEKA]
     * @param rules Ruleset the rules from the build phase
     * @param data MyDataset the dataset
     * @param positives Mask active positive entries
     * @param negatives Mask active negative entries
     * @return the optimized rules
     */
    public Ruleset optimize(Ruleset rules, MyDataset data, Mask positives, Mask negatives) {

        for (int i = 0; i < rules.size(); i++) {
            //Splitting of the two datasets into two prune datasets and two grow datasets
            Mask[] gp_pos = positives.split(pct, rand);
            Mask grow_pos = gp_pos[0], prune_pos = gp_pos[1];
            Mask[] gp_neg = negatives.split(pct, rand);
            Mask grow_neg = gp_neg[0], prune_neg = gp_neg[1];

            //Removing from the pruning set all instances that are covered by the other rules
            data.substract(prune_pos, rules, i);
            data.substract(prune_neg, rules, i);

            //Creation of the competing rules
            Rule revision = grow(data, grow_pos, grow_neg); //from scratch
            Rule replacement = rules.getRule(i).getCopy();
            grow(replacement, data, grow_pos, grow_neg); //from the current rule
            prune(revision, data, prune_pos, prune_neg, Ripper.A);
            prune(replacement, data, prune_pos, prune_neg, Ripper.A);

            //Select the representative with the smallest MDL
            Rule current = rules.getRule(i);
            double current_mdl = rules.getMDL(data, positives, negatives);
            rules.removeRule(i);
            rules.insertRule(revision, i);
            double revision_mdl = rules.getMDL(data, positives, negatives);
            rules.removeRule(i);
            rules.insertRule(replacement, i);
            double replacement_mdl = rules.getMDL(data, positives, negatives);
            rules.removeRule(i);

            if (current_mdl <= revision_mdl && current_mdl <= replacement_mdl)
                rules.insertRule(current, i);
            else if (revision_mdl <= replacement_mdl)
                rules.insertRule(revision, i);
            else
                rules.insertRule(replacement, i);
        }

        return rules;
    }
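    /*
     * Note on the MDL stopping criterion used in IREPstar above (illustrative
     * numbers): getMDL returns the description length, in bits, of the ruleset
     * together with the examples it fails to explain. If the smallest DL
     * observed so far is, say, 1000 bits, a candidate rule is kept only while
     * the resulting ruleset's DL stays at or below 1064 bits; the first rule
     * that pushes the DL beyond that 64-bit margin is removed again and the
     * build phase stops.
     */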
    /**
     * It implements the algorithm Ripper2:
     * 1. Build Phase
     * 2. Optimization Phase
     * 3. MOP UP: If there are uncovered positive entries left, repeat the Build Phase for them.
     * 4. CLEAN UP: Remove those rules that increase the description length (DL).
     * @param data MyDataset the dataset
     * @param positives Mask active positive entries
     * @param negatives Mask active negative entries
     * @return the generated ruleset
     */
    public Ruleset ripperK(MyDataset data, Mask positives, Mask negatives) {

        Ruleset rules = new Ruleset();

        /**********************Growing & Pruning**************************************/
        IREPstar(rules, data, positives, negatives);

        for (int i = 0; i < K; i++) {
            /**********************Optimization***************************************/
            optimize(rules, data, positives, negatives);

            /*************************************MOP UP******************************/
            Mask p = positives.copy();
            data.substract(p, rules);
            if (p.getnActive() > 0) {
                IREPstar(rules, data, p, negatives);
            }
        }

        /*************************************CLEAN UP********************************/
        rules.removeDuplicates();
        rules.pulish(data, positives, negatives);

        return rules;
    }

    public MyDataset getData() {
        return train;
    }
}