/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * JRip.java * Copyright (C) 2001 Xin Xu, Eibe Frank */ package weka.classifiers.rules; import java.util.Enumeration; import java.util.Random; import java.util.Vector; import weka.core.FastVector; import weka.core.Instances; import weka.core.Instance; import weka.core.Attribute; import weka.core.AttributeStats; import weka.core.Utils; import weka.core.OptionHandler; import weka.core.Option; import weka.core.Copyable; import weka.core.WeightedInstancesHandler; import weka.core.AdditionalMeasureProducer; import weka.core.UnsupportedAttributeTypeException; import weka.core.UnsupportedClassTypeException; import weka.filters.supervised.attribute.ClassOrder; import weka.filters.Filter; import weka.classifiers.DistributionClassifier; import weka.classifiers.Evaluation; /** * This class implements a propositional rule learner, Repeated Incremental * Pruning to Produce Error Reduction (RIPPER), which is proposed by William * W. Cohen as an optimzed version of IREP. <p> * * The algorithm is briefly described as follows: <p> * Initialize RS = {}, and for each class from the less prevalent one to * the more frequent one, DO: <p> * * 1. Building stage: repeat 1.1 and 1.2 until the descrition length (DL) * of the ruleset and examples is 64 bits greater than the smallest DL * met so far, or there are no positive examples, or the error rate >= 50%. * <p> * 1.1. Grow phase:<br> * Grow one rule by greedily adding antecedents (or conditions) to * the rule until the rule is perfect (i.e. 100% accurate). The * procedure tries every possible value of each attribute and selects * the condition with highest information gain: p(log(p/t)-log(P/T)). * <p> * 1.2. Prune phase:<br> * Incrementally prune each rule and allow the pruning of any * final sequences of the antecedents;<br> * The pruning metric is (p-n)/(p+n) -- but it's actually * 2p/(p+n) -1, so in this implementation we simply use p/(p+n) * (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).<p> * * 2. Optimization stage: after generating the initial ruleset {Ri}, * generate and prune two variants of each rule Ri from randomized data * using procedure 1.1 and 1.2. But one variant is generated from an * empty rule while the other is generated by greedily adding antecedents * to the original rule. Moreover, the pruning metric used here is * (TP+TN)/(P+N).<br> * Then the smallest possible DL for each variant and the original rule * is computed. The variant with the minimal DL is selected as the final * representative of Ri in the ruleset. <br> * After all the rules in {Ri} have been examined and if there are still * residual positives, more rules are generated based on the residual * positives using Building Stage again. <p> * * 3. Delete the rules from the ruleset that would increase the DL of the * whole ruleset if it were in it. and add resultant ruleset to RS. <p> * * ENDDO<p> * * Note that there seem to be 2 bugs in the ripper program that would * affect the ruleset size and accuracy slightly. This implementation avoids * these bugs and thus is a little bit different from Cohen's original * implementation. Even after fixing the bugs, since the order of classes with * the same frequency is not defined in ripper, there still seems to be * some trivial difference between this implementation and the original ripper, * especially for audiology data in UCI repository, where there are lots of * classes of few instances.<p> * * If wrapped by other classes, typical usage of this class is:<br> * * <code>JRip rip = new JRip(); * Instances data = ... // Data from somewhere * double[] orderedClasses = ... // Get the ordered class counts for the data * double expFPRate = ... // Calculate the expected FP/(FP+FN) rate * double classIndex = ... // The class index for which ruleset is built * // DL of default rule, no theory DL, only data DL * double defDL = RuleStats.dataDL(expFPRate, 0.0, data.sumOfWeights(), * 0.0, orderedClasses[(int)classIndex]); * * rip.rulesetForOneClass(expFPRate, data, classIndex, defDL); * RuleStats rulesetStats = rip.getRuleStats(0); * * // Can get heaps of information from RuleStats, e.g. combined DL, * // simpleStats, etc. * double comDL = rulesetStats.combinedDL(expFPRate, classIndex); * int whichRule = ... // Want simple stats of which rule? * double[] simpleStats = rulesetStats.getSimpleStats(whichRule); * ... * </code> * * Details please see "Fast Effective Rule Induction", William W. Cohen, * 'Machine Learning: Proceedings of the Twelfth International Conference' * (ML95). <p> * * PS. We have compared this implementation with the original ripper * implementation in aspects of accuracy, ruleset size and running time * on both artificial data "ab+bcd+defg" and UCI datasets. In all these * aspects it seems to be quite comparable to the original ripper * implementation. However, we didn't consider memory consumption * optimization in this implementation.<p> * * @author Xin Xu (xx5@cs.waikato.ac.nz) * @author Eibe Frank (eibe@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ */ public class JRip extends DistributionClassifier implements OptionHandler, AdditionalMeasureProducer, WeightedInstancesHandler{ /** The limit of description length surplus in ruleset generation */ private static double MAX_DL_SURPLUS = 64.0; /** The class attribute of the data*/ private Attribute m_Class; /** The ruleset */ private FastVector m_Ruleset; /** The predicted class distribution */ private FastVector m_Distributions; /** Runs of optimizations */ private int m_Optimizations = 2; /** Random object used in this class */ private Random m_Random = null; /** # of all the possible conditions in a rule */ private double m_Total = 0; /** The seed to perform randomization */ private long m_Seed = 1; /** The number of folds to split data into Grow and Prune for IREP */ private int m_Folds = 3; /** The minimal number of instance weights within a split*/ private double m_MinNo = 2.0; /** Whether in a debug mode */ private boolean m_Debug = false; /** Whether check the error rate >= 0.5 in stopping criteria */ private boolean m_CheckErr = true; /** Whether use pruning, i.e. the data is clean or not */ private boolean m_UsePruning = true; /** The filter used to randomize the class order */ private Filter m_Filter = null; /** The RuleStats for the ruleset of each class value */ private FastVector m_RulesetStats; /** * Returns an enumeration describing the available options * Valid options are: <p> * * -F number <br> * The number of folds for reduced error pruning. One fold is * used as the pruning set. (Default: 3) <p> * * -N number <br> * The minimal weights of instances within a split. * (Default: 2) <p> * * -O number <br> * Set the number of runs of optimizations. (Default: 2)<p> * * -D <br> * Whether turn on the debug mode * * -S number <br> * The seed of randomization used in Ripper.(Default: 1)<p> * * -E <br> * Whether NOT check the error rate >= 0.5 in stopping criteria. * (default: check)<p> * * -P <br> * Whether NOT use pruning. (default: use pruning)<p> * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector newVector = new Vector(3); newVector.addElement(new Option("\tSet number of folds for REP\n" + "\tOne fold is used as pruning set.\n" + "\t(default 3)","F", 1, "-F <number of folds>")); newVector.addElement(new Option("\tSet the minimal weights of instances\n" + "\twithin a split.\n" + "\t(default 2.0)","N", 1, "-N <min. weights>")); newVector.addElement(new Option("\tSet the number of runs of\n"+ "\toptimizations. (Default: 2)", "O", 1,"-O <number of runs>")); newVector.addElement(new Option("\tSet whether turn on the\n"+ "\tdebug mode (Default: false)", "D", 0,"-D")); newVector.addElement(new Option("\tThe seed of randomization\n"+ "\t(Default: 1)", "S", 1,"-S <seed>")); newVector.addElement(new Option("Whether NOT check the error rate>=0.5\n" +"\tin stopping criteria " +"\t(default: check)", "E", 0, "-E")); newVector.addElement(new Option("Whether NOT use pruning\n" +"\t(default: use pruning)", "P", 0, "-P")); return newVector.elements(); } /** * Parses a given list of options. * * @param options the list of options as an array of strings * @exception Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String numFoldsString = Utils.getOption('F', options); if (numFoldsString.length() != 0) m_Folds = Integer.parseInt(numFoldsString); else m_Folds = 3; String minNoString = Utils.getOption('N', options); if (minNoString.length() != 0) m_MinNo = Double.parseDouble(minNoString); else m_MinNo = 2.0; String seedString = Utils.getOption('S', options); if (seedString.length() != 0) m_Seed = Long.parseLong(seedString); else m_Seed = 1; String runString = Utils.getOption('O', options); if (runString.length() != 0) m_Optimizations = Integer.parseInt(runString); else m_Optimizations = 2; m_Debug = Utils.getFlag('D', options); m_CheckErr = !Utils.getFlag('E', options); m_UsePruning = !Utils.getFlag('P', options); } /** * Gets the current settings of the Classifier. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] options = new String [11]; int current = 0; options[current++] = "-F"; options[current++] = "" + m_Folds; options[current++] = "-N"; options[current++] = "" + m_MinNo; options[current++] = "-O"; options[current++] = "" + m_Optimizations; options[current++] = "-S"; options[current++] = "" + m_Seed; if(m_Debug) options[current++] = "-D"; if(!m_CheckErr) options[current++] = "-E"; if(!m_UsePruning) options[current++] = "-P"; while(current < options.length) options[current++] = ""; return options; } /** * Returns an enumeration of the additional measure names * @return an enumeration of the measure names */ public Enumeration enumerateMeasures() { Vector newVector = new Vector(1); newVector.addElement("measureNumRules"); return newVector.elements(); } /** * Returns the value of the named measure * @param measureName the name of the measure to query for its value * @return the value of the named measure * @exception IllegalArgumentException if the named measure is not supported */ public double getMeasure(String additionalMeasureName) { if (additionalMeasureName.compareTo("measureNumRules") == 0) return m_Ruleset.size(); else throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)"); } public void setFolds(int fold){ m_Folds = fold; } public int getFolds(){ return m_Folds; } public void setMinNo(double m){ m_MinNo = m; } public double getMinNo(){ return m_MinNo; } public void setSeed(long s){ m_Seed = s; } public long getSeed(){ return m_Seed; } public void setOptimizations(int run){ m_Optimizations = run; } public int getOptimizations(){ return m_Optimizations; } public void setDebug(boolean d){m_Debug = d;} public boolean getDebug(){ return m_Debug; } public void setCheckErrorRate(boolean d){ m_CheckErr = d;} public boolean getCheckErrorRate(){ return m_CheckErr; } public void setUsePruning(boolean d){ m_UsePruning = d;} public boolean getUsePruning(){ return m_UsePruning; } /** * Get the ruleset generated by Ripper * * @return the ruleset */ public FastVector getRuleset(){ return m_Ruleset; } /** * Get the statistics of the ruleset in the given position * * @param pos the position of the stats, assuming correct */ public RuleStats getRuleStats(int pos) { return (RuleStats)m_RulesetStats.elementAt(pos); } /** * The single antecedent in the rule, which is composed of an attribute and * the corresponding value. There are two inherited classes, namely NumericAntd * and NominalAntd in which the attributes are numeric and nominal respectively. */ private abstract class Antd implements WeightedInstancesHandler, Copyable{ /* The attribute of the antecedent */ protected Attribute att; /* The attribute value of the antecedent. For numeric attribute, value is either 0(1st bag) or 1(2nd bag) */ protected double value; /* The maximum infoGain achieved by this antecedent test * in the growing data */ protected double maxInfoGain; /* The accurate rate of this antecedent test on the growing data */ protected double accuRate; /* The coverage of this antecedent in the growing data */ protected double cover; /* The accurate data for this antecedent in the growing data */ protected double accu; /* Constructor*/ public Antd(Attribute a){ att=a; value=Double.NaN; maxInfoGain = 0; accuRate = Double.NaN; cover = Double.NaN; accu = Double.NaN; } /* The abstract members for inheritance */ public abstract Instances[] splitData(Instances data, double defAcRt, double cla); public abstract boolean covers(Instance inst); public abstract String toString(); /** Implements Copyable */ public abstract Object copy(); /* Get functions of this antecedent */ public Attribute getAttr(){ return att; } public double getAttrValue(){ return value; } public double getMaxInfoGain(){ return maxInfoGain; } public double getAccuRate(){ return accuRate; } public double getAccu(){ return accu; } public double getCover(){ return cover; } } /** * The antecedent with numeric attribute */ private class NumericAntd extends Antd{ /* The split point for this numeric antecedent */ private double splitPoint; /* Constructor*/ public NumericAntd(Attribute a){ super(a); splitPoint = Double.NaN; } /* Get split point of this numeric antecedent */ public double getSplitPoint(){ return splitPoint; } /** Implements Copyable */ public Object copy(){ NumericAntd na = new NumericAntd(getAttr()); na.value = this.value; na.splitPoint = this.splitPoint; return na; } /** * Implements the splitData function. * This procedure is to split the data into two bags according * to the information gain of the numeric attribute value * The maximum infoGain is also calculated. * * @param insts the data to be split * @param defAcRt the default accuracy rate for data * @param cl the class label to be predicted * @return the array of data after split */ public Instances[] splitData(Instances insts, double defAcRt, double cl){ Instances data = insts; int total=data.numInstances();// Total number of instances without // missing value for att int split=1; // Current split position int prev=0; // Previous split position int finalSplit=split; // Final split position maxInfoGain = 0; value = 0; double fstCover=0, sndCover=0, fstAccu=0, sndAccu=0; data.sort(att); // Find the las instance without missing value for(int x=0; x<data.numInstances(); x++){ Instance inst = data.instance(x); if(inst.isMissing(att)){ total = x; break; } sndCover += inst.weight(); if(Utils.eq(inst.classValue(), cl)) sndAccu += inst.weight(); } if(total == 0) return null; // Data all missing for the attribute splitPoint = data.instance(total-1).value(att); for(; split <= total; split++){ if((split == total) || (data.instance(split).value(att) > // Can't split within data.instance(prev).value(att))){ // same value for(int y=prev; y<split; y++){ Instance inst = data.instance(y); fstCover += inst.weight(); if(Utils.eq(data.instance(y).classValue(), cl)){ fstAccu += inst.weight(); // First bag positive# ++ } } double fstAccuRate = (fstAccu+1.0)/(fstCover+1.0), sndAccuRate = (sndAccu+1.0)/(sndCover+1.0); /* Which bag has higher information gain? */ boolean isFirst; double fstInfoGain, sndInfoGain; double accRate, infoGain, coverage, accurate; fstInfoGain = //Utils.eq(defAcRt, 1.0) ? //fstAccu/(double)numConds : fstAccu*(Utils.log2(fstAccuRate)-Utils.log2(defAcRt)); sndInfoGain = //Utils.eq(defAcRt, 1.0) ? //sndAccu/(double)numConds : sndAccu*(Utils.log2(sndAccuRate)-Utils.log2(defAcRt)); if(fstInfoGain > sndInfoGain){ isFirst = true; infoGain = fstInfoGain; accRate = fstAccuRate; accurate = fstAccu; coverage = fstCover; } else{ isFirst = false; infoGain = sndInfoGain; accRate = sndAccuRate; accurate = sndAccu; coverage = sndCover; } /* Check whether so far the max infoGain */ if(infoGain > maxInfoGain){ splitPoint = data.instance(prev).value(att); value = (isFirst) ? 0 : 1; accuRate = accRate; accu = accurate; cover = coverage; maxInfoGain = infoGain; finalSplit = (isFirst) ? split : prev; } for(int y=prev; y<split; y++){ Instance inst = data.instance(y); sndCover -= inst.weight(); if(Utils.eq(data.instance(y).classValue(), cl)){ sndAccu -= inst.weight(); // Second bag positive# -- } } prev=split; } } /* Split the data */ Instances[] splitData = new Instances[2]; splitData[0] = new Instances(data, 0, finalSplit); splitData[1] = new Instances(data, finalSplit, total-finalSplit); return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered * by this antecedent */ public boolean covers(Instance inst){ boolean isCover=true; if(!inst.isMissing(att)){ if((int)value == 0){ // First bag if(inst.value(att) > splitPoint) isCover=false; } else if(inst.value(att) < splitPoint) // Second bag isCover=false; } else isCover = false; return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { String symbol = ((int)value == 0) ? " <= " : " >= "; return (att.name() + symbol + Utils.doubleToString(splitPoint, 6)); } } /** * The antecedent with nominal attribute */ private class NominalAntd extends Antd{ /* The parameters of infoGain calculated for each attribute value * in the growing data */ private double[] accurate; private double[] coverage; /* Constructor*/ public NominalAntd(Attribute a){ super(a); int bag = att.numValues(); accurate = new double[bag]; coverage = new double[bag]; } /** Implements Copyable */ public Object copy(){ Antd antec = new NominalAntd(getAttr()); antec.value = this.value; return antec; } /** * Implements the splitData function. * This procedure is to split the data into bags according * to the nominal attribute value * The infoGain for each bag is also calculated. * * @param data the data to be split * @param defAcRt the default accuracy rate for data * @param cl the class label to be predicted * @return the array of data after split */ public Instances[] splitData(Instances data, double defAcRt, double cl){ int bag = att.numValues(); Instances[] splitData = new Instances[bag]; for(int x=0; x<bag; x++){ splitData[x] = new Instances(data, data.numInstances()); accurate[x] = 0; coverage[x] = 0; } for(int x=0; x<data.numInstances(); x++){ Instance inst=data.instance(x); if(!inst.isMissing(att)){ int v = (int)inst.value(att); splitData[v].add(inst); coverage[v] += inst.weight(); if((int)inst.classValue() == (int)cl) accurate[v] += inst.weight(); } } for(int x=0; x<bag; x++){ double t = coverage[x]+1.0; double p = accurate[x] + 1.0; double infoGain = //Utils.eq(defAcRt, 1.0) ? //accurate[x]/(double)numConds : accurate[x]*(Utils.log2(p/t)-Utils.log2(defAcRt)); if(infoGain > maxInfoGain){ maxInfoGain = infoGain; cover = coverage[x]; accu = accurate[x]; accuRate = p/t; value = (double)x; } } return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is * covered by this antecedent */ public boolean covers(Instance inst){ boolean isCover=false; if(!inst.isMissing(att)){ if((int)inst.value(att) == (int)value) isCover=true; } return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { return (att.name() + " = " +att.value((int)value)); } } /** * This class implements a single rule that predicts specified class. * * A rule consists of antecedents "AND"ed together and the consequent * (class value) for the classification. * In this class, the Information Gain (p*[log(p/t) - log(P/T)]) is used to * select an antecedent and Reduced Error Prunning (REP) with the metric * of accuracy rate p/(p+n) or (TP+TN)/(P+N) is used to prune the rule. */ protected class RipperRule extends Rule{ /** The internal representation of the class label to be predicted*/ private double m_Consequent = -1; /** The vector of antecedents of this rule*/ protected FastVector m_Antds = null; public void setConsequent(double cl){ m_Consequent = cl; } public double getConsequent(){ return m_Consequent; } /** Constructor */ public RipperRule(){ m_Antds = new FastVector(); } /** * Get a shallow copy of this rule * * @return the copy */ public Object copy(){ RipperRule copy = new RipperRule(); copy.setConsequent(getConsequent()); copy.m_Antds = (FastVector)this.m_Antds.copyElements(); return copy; } /** * Whether the instance covered by this rule * * @param inst the instance in question * @return the boolean value indicating whether the instance * is covered by this rule */ public boolean covers(Instance datum){ boolean isCover=true; for(int i=0; i<m_Antds.size(); i++){ Antd antd = (Antd)m_Antds.elementAt(i); if(!antd.covers(datum)){ isCover = false; break; } } return isCover; } /** * Whether this rule has antecedents, i.e. whether it is a default rule * * @return the boolean value indicating whether the rule has antecedents */ public boolean hasAntds(){ if (m_Antds == null) return false; else return (m_Antds.size() > 0); } /** * the number of antecedents of the rule * * @return the size of this rule */ public double size(){ return (double)m_Antds.size(); } /** * Private function to compute default number of accurate instances * in the specified data for the consequent of the rule * * @param data the data in question * @return the default accuracy number */ private double computeDefAccu(Instances data){ double defAccu=0; for(int i=0; i<data.numInstances(); i++){ Instance inst = data.instance(i); if((int)inst.classValue() == (int)m_Consequent) defAccu += inst.weight(); } return defAccu; } /** * Build one rule using the growing data * * @param data the growing data used to build the rule * @exception if the consequent is not set yet */ public void grow(Instances data) throws Exception { if(m_Consequent == -1) throw new Exception(" Consequent not set yet."); Instances growData = data; double sumOfWeights = growData.sumOfWeights(); if(!Utils.gr(sumOfWeights, 0.0)) return; /* Compute the default accurate rate of the growing data */ double defAccu = computeDefAccu(growData); double defAcRt = (defAccu+1.0)/(sumOfWeights+1.0); /* Keep the record of which attributes have already been used*/ boolean[] used=new boolean [growData.numAttributes()]; for (int k=0; k<used.length; k++) used[k]=false; int numUnused=used.length; // If there are already antecedents existing for(int j=0; j < m_Antds.size(); j++){ Antd antdj = (Antd)m_Antds.elementAt(j); if(!antdj.getAttr().isNumeric()){ used[antdj.getAttr().index()]=true; numUnused--; } } double maxInfoGain; while (Utils.gr(growData.numInstances(), 0.0) && (numUnused > 0) && Utils.sm(defAcRt, 1.0) ){ // We require that infoGain be positive /*if(numAntds == originalSize) maxInfoGain = 0.0; // At least one condition allowed else maxInfoGain = Utils.eq(defAcRt, 1.0) ? defAccu/(double)numAntds : 0.0; */ maxInfoGain = 0.0; /* Build a list of antecedents */ Antd oneAntd=null; Instances coverData = null; Enumeration enumAttr=growData.enumerateAttributes(); int index=-1; /* Build one condition based on all attributes not used yet*/ while (enumAttr.hasMoreElements()){ Attribute att= (Attribute)(enumAttr.nextElement()); index++; if(m_Debug) System.err.println("\nOne condition: size = " + growData.sumOfWeights()); Antd antd =null; if(att.isNumeric()) antd = new NumericAntd(att); else antd = new NominalAntd(att); if(!used[index]){ /* Compute the best information gain for each attribute, it's stored in the antecedent formed by this attribute. This procedure returns the data covered by the antecedent*/ Instances coveredData = computeInfoGain(growData, defAcRt, antd); if(coveredData != null){ double infoGain = antd.getMaxInfoGain(); if(m_Debug) System.err.println("Test of \'"+antd.toString()+ "\': infoGain = "+ infoGain + " | Accuracy = " + antd.getAccuRate()+ "="+antd.getAccu() +"/"+antd.getCover()+ " def. accuracy: "+defAcRt); if(infoGain > maxInfoGain){ oneAntd=antd; coverData = coveredData; maxInfoGain = infoGain; } } } } if(oneAntd == null) break; // Cannot find antds if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break;// Too low coverage //Numeric attributes can be used more than once if(!oneAntd.getAttr().isNumeric()){ used[oneAntd.getAttr().index()]=true; numUnused--; } m_Antds.addElement(oneAntd); growData = coverData;// Grow data size is shrinking defAcRt = oneAntd.getAccuRate(); } } /** * Compute the best information gain for the specified antecedent * * @param instances the data based on which the infoGain is computed * @param defAcRt the default accuracy rate of data * @param antd the specific antecedent * @param numConds the number of antecedents in the rule so far * @return the data covered by the antecedent */ private Instances computeInfoGain(Instances instances, double defAcRt, Antd antd){ Instances data = instances; /* Split the data into bags. The information gain of each bag is also calculated in this procedure */ Instances[] splitData = antd.splitData(data, defAcRt, m_Consequent); /* Get the bag of data to be used for next antecedents */ if(splitData != null) return splitData[(int)antd.getAttrValue()]; else return null; } /** * Prune all the possible final sequences of the rule using the * pruning data. The measure used to prune the rule is based on * flag given. * * @param pruneData the pruning data used to prune the rule * @param useWhole flag to indicate whether use the error rate of * the whole pruning data instead of the data covered */ public void prune(Instances pruneData, boolean useWhole){ Instances data = pruneData; double total = data.sumOfWeights(); if(!Utils.gr(total, 0.0)) return; /* The default accurate # and rate on pruning data */ double defAccu=computeDefAccu(data); if(m_Debug) System.err.println("Pruning with " + defAccu + " positive data out of " + total + " instances"); int size=m_Antds.size(); if(size == 0) return; // Default rule before pruning double[] worthRt = new double[size]; double[] coverage = new double[size]; double[] worthValue = new double[size]; for(int w=0; w<size; w++){ worthRt[w]=coverage[w]=worthValue[w]=0.0; } /* Calculate accuracy parameters for all the antecedents in this rule */ double tn = 0.0; // True negative if useWhole for(int x=0; x<size; x++){ Antd antd=(Antd)m_Antds.elementAt(x); Attribute attr= antd.getAttr(); Instances newData = data; data = new Instances(newData, 0); // Make data empty for(int y=0; y<newData.numInstances(); y++){ Instance ins=newData.instance(y); if(antd.covers(ins)){ // Covered by this antecedent coverage[x] += ins.weight(); data.add(ins); // Add to data for further pruning if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction worthValue[x] += ins.weight(); } else if(useWhole){ // Not covered if((int)ins.classValue() != (int)m_Consequent) tn += ins.weight(); } } if(useWhole){ worthValue[x] += tn; worthRt[x] = worthValue[x] / total; } else // Note if coverage is 0, accuracy is 0.5 worthRt[x] = (worthValue[x]+1.0)/(coverage[x]+2.0); } double maxValue = (defAccu+1.0)/(total+2.0); int maxIndex = -1; for(int i=0; i<worthValue.length; i++){ if(m_Debug){ double denom = useWhole ? total : coverage[i]; System.err.println(i+"(useAccuray? "+!useWhole+"): " + worthRt[i] + "="+worthValue[i]+ "/"+denom); } if(worthRt[i] > maxValue){ // Prefer to the maxValue = worthRt[i]; // shorter rule maxIndex = i; } } /* Prune the antecedents according to the accuracy parameters */ for(int z=size-1;z>maxIndex;z--) m_Antds.removeElementAt(z); } /** * Prints this rule * * @param classAttr the class attribute in the data * @return a textual description of this rule */ public String toString(Attribute classAttr) { StringBuffer text = new StringBuffer(); if(m_Antds.size() > 0){ for(int j=0; j< (m_Antds.size()-1); j++) text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and "); text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")"); } text.append(" => " + classAttr.name() + "=" + classAttr.value((int)m_Consequent)); return text.toString(); } } /** * Builds Ripper in the order of class frequencies. For each class * it's built in two stages: building and optimization * * @param instances the training data * @exception Exception if classifier can't be built successfully */ public void buildClassifier(Instances instances) throws Exception { if(instances.numInstances() == 0) throw new Exception(" No instances with a class value!"); if (instances.checkForStringAttributes()) throw new UnsupportedAttributeTypeException(" Cannot handle string attributes!"); if (!instances.classAttribute().isNominal()) throw new UnsupportedClassTypeException(" Only nominal class, please."); m_Random = new Random(m_Seed); m_Total = RuleStats.numAllConditions(instances); if(m_Debug) System.err.println("Number of all possible conditions = "+m_Total); Instances data = null; m_Filter = new ClassOrder(); // Sth. to make the class order different each time in cross-validations Instance inst = instances.instance((int)(m_Random.nextDouble()*(double)instances.numInstances())); ((ClassOrder)m_Filter).setSeed((long)inst.toString().hashCode()); ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND); m_Filter.setInputFormat(instances); data = Filter.useFilter(instances, m_Filter); if(data == null) throw new Exception(" Unable to randomize the class orders."); data.deleteWithMissingClass(); if(data.numInstances() == 0) throw new Exception(" No instances with a class value!"); if(data.numInstances() < m_Folds) throw new Exception(" Not enough data for REP."); m_Class = data.classAttribute(); m_Ruleset = new FastVector(); m_RulesetStats = new FastVector(); m_Distributions = new FastVector(); // Sort by classes frequency double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts(); if(m_Debug){ System.err.println("Sorted classes:"); for(int x=0; x < m_Class.numValues(); x++) System.err.println(x+": "+m_Class.value(x) + " has " + orderedClasses[x] + " instances."); } // Iterate from less prevalent class to more frequent one oneClass: for(int y=0; y < data.numClasses()-1; y++){ // For each class double classIndex = (double)y; if(m_Debug){ int ci = (int)classIndex; System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): " + orderedClasses[y] + "instances\n"+ "=====================================\n"); } if(Utils.eq(orderedClasses[y],0.0)) // No data for this class continue oneClass; // The expected FP/err is the proportion of the class double all = 0; for(int i=y; i<orderedClasses.length; i++) all += orderedClasses[i]; double expFPRate = orderedClasses[y] / all; double classYWeights = 0, totalWeights = 0; for(int j=0; j < data.numInstances(); j++){ Instance datum = data.instance(j); totalWeights += datum.weight(); if((int)datum.classValue() == y){ classYWeights += datum.weight(); } } // DL of default rule, no theory DL, only data DL double defDL; if(classYWeights > 0) defDL = RuleStats.dataDL(expFPRate, 0.0, totalWeights, 0.0, classYWeights); else continue oneClass; // Subsumed by previous rules if(Double.isNaN(defDL) || Double.isInfinite(defDL)) throw new Exception("Should never happen: "+ "defDL NaN or infinite!"); if(m_Debug) System.err.println("The default DL = "+defDL); data = rulesetForOneClass(expFPRate, data, classIndex, defDL); } // Set the default rule RipperRule defRule = new RipperRule(); defRule.setConsequent((double)(data.numClasses()-1)); m_Ruleset.addElement(defRule); RuleStats defRuleStat = new RuleStats(); defRuleStat.setData(data); defRuleStat.setNumAllConds(m_Total); defRuleStat.addAndUpdate(defRule); m_RulesetStats.addElement(defRuleStat); for(int z=0; z < m_RulesetStats.size(); z++){ RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z); for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){ double[] classDist = oneClass.getDistributions(xyz); Utils.normalize(classDist); if(classDist != null) m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist)); } } } /** * Classify the test instance with the rule learner and provide * the class distributions * * @param datum the instance to be classified * @return the distribution */ public double[] distributionForInstance(Instance datum){ try{ for(int i=0; i < m_Ruleset.size(); i++){ RipperRule rule = (RipperRule)m_Ruleset.elementAt(i); if(rule.covers(datum)) return (double[])m_Distributions.elementAt(i); } }catch(Exception e){ System.err.println(e.getMessage()); e.printStackTrace(); } System.err.println("Should never happen!"); return new double[datum.classAttribute().numValues()]; } /** Build a ruleset for the given class according to the given data * * @param expFPRate the expected FP/(FP+FN) used in DL calculation * @param data the given data * @param classIndex the given class index * @param defDL the default DL in the data * @exception if the ruleset can be built properly */ protected Instances rulesetForOneClass(double expFPRate, Instances data, double classIndex, double defDL) throws Exception { Instances newData = data, growData, pruneData; boolean stop = false; FastVector ruleset = new FastVector(); double dl = defDL, minDL = defDL; RuleStats rstats = null; double[] rst; // Check whether data have positive examples boolean defHasPositive = true; // No longer used boolean hasPositive = defHasPositive; /********************** Building stage ***********************/ if(m_Debug) System.err.println("\n*** Building stage ***"); while((!stop) && hasPositive){ // Generate new rules until // stopping criteria met RipperRule oneRule; if(m_UsePruning){ /* Split data into Grow and Prune*/ // We should have stratified the data, but ripper seems // to have a bug that makes it not to do so. In order // to simulate it more precisely, we do the same thing. //newData.randomize(m_Random); newData = RuleStats.stratify(newData, m_Folds, m_Random); Instances[] part = RuleStats.partition(newData, m_Folds); growData=part[0]; pruneData=part[1]; //growData=newData.trainCV(m_Folds, m_Folds-1); //pruneData=newData.testCV(m_Folds, m_Folds-1); oneRule = new RipperRule(); oneRule.setConsequent(classIndex); // Must set first if(m_Debug) System.err.println("\nGrowing a rule ..."); oneRule.grow(growData); // Build the rule if(m_Debug) System.err.println("One rule found before pruning:"+ oneRule.toString(m_Class)); if(m_Debug) System.err.println("\nPruning the rule ..."); oneRule.prune(pruneData, false); // Prune the rule if(m_Debug) System.err.println("One rule found after pruning:"+ oneRule.toString(m_Class)); } else{ oneRule = new RipperRule(); oneRule.setConsequent(classIndex); // Must set first if(m_Debug) System.err.println("\nNo pruning: growing a rule ..."); oneRule.grow(newData); // Build the rule if(m_Debug) System.err.println("No pruning: one rule found:\n"+ oneRule.toString(m_Class)); } // Compute the DL of this ruleset if(rstats == null){ // First rule rstats = new RuleStats(); rstats.setNumAllConds(m_Total); rstats.setData(newData); } rstats.addAndUpdate(oneRule); int last = rstats.getRuleset().size()-1; // Index of last rule dl += rstats.relativeDL(last, expFPRate, m_CheckErr); if(Double.isNaN(dl) || Double.isInfinite(dl)) throw new Exception("Should never happen: dl in "+ "building stage NaN or infinite!"); if(m_Debug) System.err.println("Before optimization("+last+ "): the dl = "+dl+" | best: "+minDL); if(dl < minDL) minDL = dl; // The best dl so far rst = rstats.getSimpleStats(last); if(m_Debug) System.err.println("The rule covers: "+rst[0]+ " | pos = " + rst[2] + " | neg = " + rst[4]+ "\nThe rule doesn't cover: "+rst[1]+ " | pos = " + rst[5]); stop = checkStop(rst, minDL, dl); if(!stop){ ruleset.addElement(oneRule); // Accepted newData = rstats.getFiltered(last)[1];// Data not covered hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining? if(m_Debug) System.err.println("One rule added: has positive? " +hasPositive); } else{ if(m_Debug) System.err.println("Quit rule"); rstats.removeLast(); // Remove last to be re-used } }// while !stop /******************** Optimization stage *******************/ RuleStats finalRulesetStat = null; if(m_UsePruning){ for(int z=0; z < m_Optimizations; z++){ if(m_Debug) System.err.println("\n*** Optimization: run #" +z+" ***"); newData = data; finalRulesetStat = new RuleStats(); finalRulesetStat.setData(newData); finalRulesetStat.setNumAllConds(m_Total); int position=0; stop = false; boolean isResidual = false; hasPositive = defHasPositive; dl = minDL = defDL; oneRule: while(!stop && hasPositive){ isResidual = (position>=ruleset.size()); // Cover residual positive examples // Re-do shuffling and stratification //newData.randomize(m_Random); newData = RuleStats.stratify(newData, m_Folds, m_Random); Instances[] part = RuleStats.partition(newData, m_Folds); growData=part[0]; pruneData=part[1]; //growData=newData.trainCV(m_Folds, m_Folds-1); //pruneData=newData.testCV(m_Folds, m_Folds-1); RipperRule finalRule; if(m_Debug) System.err.println("\nRule #"+position + "| isResidual?" + isResidual+ "| data size: "+newData.sumOfWeights()); if(isResidual){ RipperRule newRule = new RipperRule(); newRule.setConsequent(classIndex); if(m_Debug) System.err.println("\nGrowing and pruning"+ " a new rule ..."); newRule.grow(growData); newRule.prune(pruneData, false); finalRule = newRule; if(m_Debug) System.err.println("\nNew rule found: "+ newRule.toString(m_Class)); } else{ RipperRule oldRule = (RipperRule)ruleset.elementAt(position); boolean covers = false; // Test coverage of the next old rule for(int i=0; i<newData.numInstances(); i++) if(oldRule.covers(newData.instance(i))){ covers = true; break; } if(!covers){// Null coverage, no variants can be generated finalRulesetStat.addAndUpdate(oldRule); position++; continue oneRule; } // 2 variants if(m_Debug) System.err.println("\nGrowing and pruning"+ " Replace ..."); RipperRule replace = new RipperRule(); replace.setConsequent(classIndex); replace.grow(growData); // Remove the pruning data covered by the following // rules, then simply compute the error rate of the // current rule to prune it. According to Ripper, // it's equivalent to computing the error of the // whole ruleset -- is it true? pruneData = RuleStats.rmCoveredBySuccessives(pruneData,ruleset, position); replace.prune(pruneData, true); if(m_Debug) System.err.println("\nGrowing and pruning"+ " Revision ..."); RipperRule revision = (RipperRule)oldRule.copy(); // For revision, first rm the data covered by the old rule Instances newGrowData = new Instances(growData, 0); for(int b=0; b<growData.numInstances(); b++){ Instance inst = growData.instance(b); if(revision.covers(inst)) newGrowData.add(inst); } revision.grow(newGrowData); revision.prune(pruneData, true); double[][] prevRuleStats = new double[position][6]; for(int c=0; c < position; c++) prevRuleStats[c] = finalRulesetStat.getSimpleStats(c); // Now compare the relative DL of variants FastVector tempRules = (FastVector)ruleset.copyElements(); tempRules.setElementAt(replace, position); RuleStats repStat = new RuleStats(data, tempRules); repStat.setNumAllConds(m_Total); repStat.countData(position, newData, prevRuleStats); //repStat.countData(); rst = repStat.getSimpleStats(position); if(m_Debug) System.err.println("Replace rule covers: "+rst[0]+ " | pos = " + rst[2] + " | neg = " + rst[4]+ "\nThe rule doesn't cover: "+rst[1]+ " | pos = " + rst[5]); double repDL = repStat.relativeDL(position, expFPRate, m_CheckErr); if(m_Debug) System.err.println("\nReplace: "+ replace.toString(m_Class) +" |dl = "+repDL); if(Double.isNaN(repDL) || Double.isInfinite(repDL)) throw new Exception("Should never happen: repDL"+ "in optmz. stage NaN or "+ "infinite!"); tempRules.setElementAt(revision, position); RuleStats revStat = new RuleStats(data, tempRules); revStat.setNumAllConds(m_Total); revStat.countData(position, newData, prevRuleStats); //revStat.countData(); double revDL = revStat.relativeDL(position, expFPRate, m_CheckErr); if(m_Debug) System.err.println("Revision: " + revision.toString(m_Class) +" |dl = "+revDL); if(Double.isNaN(revDL) || Double.isInfinite(revDL)) throw new Exception("Should never happen: revDL"+ "in optmz. stage NaN or "+ "infinite!"); rstats = new RuleStats(data, ruleset); rstats.setNumAllConds(m_Total); rstats.countData(position, newData, prevRuleStats); //rstats.countData(); double oldDL = rstats.relativeDL(position, expFPRate, m_CheckErr); if(Double.isNaN(oldDL) || Double.isInfinite(oldDL)) throw new Exception("Should never happen: oldDL"+ "in optmz. stage NaN or "+ "infinite!"); if(m_Debug) System.err.println("Old rule: "+ oldRule.toString(m_Class) +" |dl = "+oldDL); if(m_Debug) System.err.println("\nrepDL: "+repDL+ "\nrevDL: "+revDL+ "\noldDL: "+oldDL); if((oldDL <= revDL) && (oldDL <= repDL)) finalRule = oldRule; // Old the best else if(revDL <= repDL) finalRule = revision; // Revision the best else finalRule = replace; // Replace the best } finalRulesetStat.addAndUpdate(finalRule); rst = finalRulesetStat.getSimpleStats(position); if(isResidual){ dl += finalRulesetStat.relativeDL(position, expFPRate, m_CheckErr); if(m_Debug) System.err.println("After optimization: the dl" +"="+dl+" | best: "+minDL); if(dl < minDL) minDL = dl; // The best dl so far stop = checkStop(rst, minDL, dl); if(!stop) ruleset.addElement(finalRule); // Accepted else{ finalRulesetStat.removeLast(); // Remove last to be re-used position--; } } else ruleset.setElementAt(finalRule, position); // Accepted if(m_Debug){ System.err.println("The rule covers: "+rst[0]+ " | pos = " + rst[2] + " | neg = " + rst[4]+ "\nThe rule doesn't cover: "+rst[1]+ " | pos = " + rst[5]); System.err.println("\nRuleset so far: "); for(int x=0; x<ruleset.size(); x++) System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class)); System.err.println(); } //Data not covered if(finalRulesetStat.getRulesetSize() > 0)// If any rules newData = finalRulesetStat.getFiltered(position)[1]; hasPositive = Utils.gr(rst[5], 0.0); //Positives remaining? position++; } // while !stop && hasPositive if(ruleset.size() > (position+1)){ // Hasn't gone through yet for(int k=position+1; k<ruleset.size(); k++) finalRulesetStat.addAndUpdate((Rule)ruleset.elementAt(k)); } if(m_Debug) System.err.println("\nDeleting rules to decrease"+ " DL of the whole ruleset ..."); finalRulesetStat.reduceDL(expFPRate, m_CheckErr); if(m_Debug){ int del = ruleset.size() - finalRulesetStat.getRulesetSize(); System.err.println(del+" rules are deleted"+ " after DL reduction procedure"); } ruleset = finalRulesetStat.getRuleset(); rstats = finalRulesetStat; } // For each run of optimization } // if pruning is used // Concatenate the ruleset for this class to the whole ruleset if(m_Debug){ System.err.println("\nFinal ruleset: "); for(int x=0; x<ruleset.size(); x++) System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class)); System.err.println(); } m_Ruleset.appendElements(ruleset); m_RulesetStats.addElement(rstats); if(ruleset.size() > 0)// If any rules for this class return rstats.getFiltered(ruleset.size()-1)[1]; // Data not else // covered return data; } /** * Check whether the stopping criterion meets * * @param rst the statistic of the ruleset * @param minDL the min description length so far * @param dl the current description length of the ruleset * @return true if stop criterion meets, false otherwise */ private boolean checkStop(double[] rst, double minDL, double dl){ if(dl > minDL+MAX_DL_SURPLUS){ if(m_Debug) System.err.println("DL too large: "+dl+" | "+minDL); return true; } else if(!Utils.gr(rst[2], 0.0)){// Covered positives if(m_Debug) System.err.println("Too few positives."); return true; } else if((rst[4]/rst[0]) >= 0.5){// Err rate if(m_CheckErr){ if(m_Debug) System.err.println("Error too large: "+ rst[4] + "/" + rst[0]); return true; } else return false; } else{// Not stops if(m_Debug) System.err.println("Continue."); return false; } } /** * Prints the all the rules of the rule learner. * * @return a textual description of the classifier */ public String toString() { if (m_Ruleset == null) return "JRIP: No model built yet."; StringBuffer sb = new StringBuffer("JRIP rules:\n"+ "===========\n\n"); for(int j=0; j<m_RulesetStats.size(); j++){ RuleStats rs = (RuleStats)m_RulesetStats.elementAt(j); FastVector rules = rs.getRuleset(); for(int k=0; k<rules.size(); k++){ double[] simStats = rs.getSimpleStats(k); sb.append(((RipperRule)rules.elementAt(k)).toString(m_Class) + " ("+simStats[0]+"/"+simStats[4]+")\n"); } } if(m_Debug){ System.err.println("Inside m_Ruleset"); for(int i=0; i<m_Ruleset.size(); i++) System.err.println(((RipperRule)m_Ruleset.elementAt(i)).toString(m_Class)); } sb.append("\nNumber of Rules : " + m_Ruleset.size() + "\n"); return sb.toString(); } /** * Main method. * * @param args the options for the classifier */ public static void main(String[] args) { try { System.out.println(Evaluation.evaluateModel(new JRip(), args)); } catch (Exception e) { e.printStackTrace(); System.err.println(e.getMessage()); } } }