/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* JRip.java
* Copyright (C) 2001 Xin Xu, Eibe Frank
*/
package weka.classifiers.rules;
import java.util.Enumeration;
import java.util.Random;
import java.util.Vector;
import weka.core.FastVector;
import weka.core.Instances;
import weka.core.Instance;
import weka.core.Attribute;
import weka.core.AttributeStats;
import weka.core.Utils;
import weka.core.OptionHandler;
import weka.core.Option;
import weka.core.Copyable;
import weka.core.WeightedInstancesHandler;
import weka.core.AdditionalMeasureProducer;
import weka.core.UnsupportedAttributeTypeException;
import weka.core.UnsupportedClassTypeException;
import weka.filters.supervised.attribute.ClassOrder;
import weka.filters.Filter;
import weka.classifiers.DistributionClassifier;
import weka.classifiers.Evaluation;
/**
* This class implements a propositional rule learner, Repeated Incremental
* Pruning to Produce Error Reduction (RIPPER), which is proposed by William
* W. Cohen as an optimized version of IREP. <p>
*
* The algorithm is briefly described as follows: <p>
* Initialize RS = {}, and for each class from the less prevalent one to
* the more frequent one, DO: <p>
*
* 1. Building stage: repeat 1.1 and 1.2 until the description length (DL)
* of the ruleset and examples is 64 bits greater than the smallest DL
* met so far, or there are no positive examples, or the error rate >= 50%.
* <p>
* 1.1. Grow phase:<br>
* Grow one rule by greedily adding antecedents (or conditions) to
* the rule until the rule is perfect (i.e. 100% accurate). The
* procedure tries every possible value of each attribute and selects
* the condition with highest information gain: p(log(p/t)-log(P/T)).
* <p>
* 1.2. Prune phase:<br>
* Incrementally prune each rule and allow the pruning of any
* final sequences of the antecedents;<br>
* The pruning metric is (p-n)/(p+n) -- but it's actually
* 2p/(p+n) -1, so in this implementation we simply use p/(p+n)
* (actually (p+1)/(p+n+2), thus if p+n is 0, it's 0.5).<p>
*
* 2. Optimization stage: after generating the initial ruleset {Ri},
* generate and prune two variants of each rule Ri from randomized data
* using procedure 1.1 and 1.2. But one variant is generated from an
* empty rule while the other is generated by greedily adding antecedents
* to the original rule. Moreover, the pruning metric used here is
* (TP+TN)/(P+N).<br>
* Then the smallest possible DL for each variant and the original rule
* is computed. The variant with the minimal DL is selected as the final
* representative of Ri in the ruleset. <br>
* After all the rules in {Ri} have been examined and if there are still
* residual positives, more rules are generated based on the residual
* positives using Building Stage again. <p>
*
* 3. Delete the rules from the ruleset that would increase the DL of the
* whole ruleset if it were in it, and add the resultant ruleset to RS. <p>
*
* ENDDO<p>
*
* Note that there seem to be 2 bugs in the ripper program that would
* affect the ruleset size and accuracy slightly. This implementation avoids
* these bugs and thus is a little bit different from Cohen's original
* implementation. Even after fixing the bugs, since the order of classes with
* the same frequency is not defined in ripper, there still seems to be
* some trivial difference between this implementation and the original ripper,
* especially for audiology data in UCI repository, where there are lots of
* classes of few instances.<p>
*
* If wrapped by other classes, typical usage of this class is:<br>
*
* <code>JRip rip = new JRip();
* Instances data = ... // Data from somewhere
* double[] orderedClasses = ... // Get the ordered class counts for the data
* double expFPRate = ... // Calculate the expected FP/(FP+FN) rate
* double classIndex = ... // The class index for which ruleset is built
* // DL of default rule, no theory DL, only data DL
* double defDL = RuleStats.dataDL(expFPRate, 0.0, data.sumOfWeights(),
* 0.0, orderedClasses[(int)classIndex]);
*
* rip.rulesetForOneClass(expFPRate, data, classIndex, defDL);
* RuleStats rulesetStats = rip.getRuleStats(0);
*
* // Can get heaps of information from RuleStats, e.g. combined DL,
* // simpleStats, etc.
* double comDL = rulesetStats.combinedDL(expFPRate, classIndex);
* int whichRule = ... // Want simple stats of which rule?
* double[] simpleStats = rulesetStats.getSimpleStats(whichRule);
* ...
* </code>
*
* Details please see "Fast Effective Rule Induction", William W. Cohen,
* 'Machine Learning: Proceedings of the Twelfth International Conference'
* (ML95). <p>
*
* PS. We have compared this implementation with the original ripper
* implementation in aspects of accuracy, ruleset size and running time
* on both artificial data "ab+bcd+defg" and UCI datasets. In all these
* aspects it seems to be quite comparable to the original ripper
* implementation. However, we didn't consider memory consumption
* optimization in this implementation.<p>
*
* @author Xin Xu (xx5@cs.waikato.ac.nz)
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @version $Revision: 1.1.1.1 $
*/
public class JRip extends DistributionClassifier
implements OptionHandler,
AdditionalMeasureProducer,
WeightedInstancesHandler{
/** The limit of description length surplus in ruleset generation */
private static double MAX_DL_SURPLUS = 64.0;
/** The class attribute of the data */
private Attribute m_Class;
/** The ruleset (one RipperRule per element, last one is the default rule) */
private FastVector m_Ruleset;
/** The predicted class distribution, one entry per rule in m_Ruleset */
private FastVector m_Distributions;
/** Number of runs of optimizations */
private int m_Optimizations = 2;
/** Random object used in this class */
private Random m_Random = null;
/** Number of all the possible conditions in a rule */
private double m_Total = 0;
/** The seed used to perform randomization */
private long m_Seed = 1;
/** The number of folds to split data into Grow and Prune for IREP */
private int m_Folds = 3;
/** The minimal number of instance weights within a split */
private double m_MinNo = 2.0;
/** Whether we are in debug mode (prints trace output to stderr) */
private boolean m_Debug = false;
/** Whether to check the error rate >= 0.5 in the stopping criteria */
private boolean m_CheckErr = true;
/** Whether to use pruning, i.e. whether the data is clean or not */
private boolean m_UsePruning = true;
/** The filter used to randomize the class order */
private Filter m_Filter = null;
/** The RuleStats for the ruleset of each class value */
private FastVector m_RulesetStats;
/**
 * Returns an enumeration describing the available options
 * Valid options are: <p>
 *
 * -F number <br>
 * The number of folds for reduced error pruning. One fold is
 * used as the pruning set. (Default: 3) <p>
 *
 * -N number <br>
 * The minimal weights of instances within a split.
 * (Default: 2) <p>
 *
 * -O number <br>
 * Set the number of runs of optimizations. (Default: 2)<p>
 *
 * -D <br>
 * Whether turn on the debug mode <p>
 *
 * -S number <br>
 * The seed of randomization used in Ripper.(Default: 1)<p>
 *
 * -E <br>
 * Whether NOT check the error rate >= 0.5 in stopping criteria.
 * (default: check)<p>
 *
 * -P <br>
 * Whether NOT use pruning. (default: use pruning)<p>
 *
 * @return an enumeration of all the available options
 */
public Enumeration listOptions() {
// Seven options are added below, so size the vector accordingly
// (the original capacity hint of 3 was wrong, though only a hint).
Vector newVector = new Vector(7);
newVector.addElement(new Option("\tSet number of folds for REP\n" +
"\tOne fold is used as pruning set.\n" +
"\t(default 3)","F", 1, "-F <number of folds>"));
newVector.addElement(new Option("\tSet the minimal weights of instances\n" +
"\twithin a split.\n" +
"\t(default 2.0)","N", 1, "-N <min. weights>"));
newVector.addElement(new Option("\tSet the number of runs of\n"+
"\toptimizations. (Default: 2)", "O",
1,"-O <number of runs>"));
newVector.addElement(new Option("\tSet whether turn on the\n"+
"\tdebug mode (Default: false)", "D",
0,"-D"));
newVector.addElement(new Option("\tThe seed of randomization\n"+
"\t(Default: 1)", "S",
1,"-S <seed>"));
// The two descriptions below were missing the leading tab that every
// other option uses, which misaligned the printed option listing.
newVector.addElement(new Option("\tWhether NOT check the error rate>=0.5\n"
+"\tin stopping criteria "
+"\t(default: check)", "E",
0, "-E"));
newVector.addElement(new Option("\tWhether NOT use pruning\n"
+"\t(default: use pruning)", "P",
0, "-P"));
return newVector.elements();
}
/**
 * Parses a given list of options. Any option that is absent falls back
 * to its documented default value.
 *
 * @param options the list of options as an array of strings
 * @exception Exception if an option is not supported
 */
public void setOptions(String[] options) throws Exception {
String foldsStr = Utils.getOption('F', options);
m_Folds = (foldsStr.length() == 0) ? 3 : Integer.parseInt(foldsStr);
String minWeightStr = Utils.getOption('N', options);
m_MinNo = (minWeightStr.length() == 0) ? 2.0 : Double.parseDouble(minWeightStr);
String seedStr = Utils.getOption('S', options);
m_Seed = (seedStr.length() == 0) ? 1 : Long.parseLong(seedStr);
String optRunsStr = Utils.getOption('O', options);
m_Optimizations = (optRunsStr.length() == 0) ? 2 : Integer.parseInt(optRunsStr);
// Flag options: -E and -P are "negative" switches, so their presence
// turns the corresponding feature OFF.
m_Debug = Utils.getFlag('D', options);
m_CheckErr = !Utils.getFlag('E', options);
m_UsePruning = !Utils.getFlag('P', options);
}
/**
 * Gets the current settings of the Classifier. The returned array always
 * has 11 entries; unused trailing slots are filled with empty strings.
 *
 * @return an array of strings suitable for passing to setOptions
 */
public String [] getOptions() {
String[] opts = new String[11];
int pos = 0;
// Options that always carry a value
opts[pos++] = "-F"; opts[pos++] = "" + m_Folds;
opts[pos++] = "-N"; opts[pos++] = "" + m_MinNo;
opts[pos++] = "-O"; opts[pos++] = "" + m_Optimizations;
opts[pos++] = "-S"; opts[pos++] = "" + m_Seed;
// Flag options, emitted only when set
if(m_Debug)
opts[pos++] = "-D";
if(!m_CheckErr)
opts[pos++] = "-E";
if(!m_UsePruning)
opts[pos++] = "-P";
// Pad the remainder so callers never see null entries
for(; pos < opts.length; pos++)
opts[pos] = "";
return opts;
}
/**
 * Returns an enumeration of the additional measure names. Only
 * "measureNumRules" is supported by this learner.
 *
 * @return an enumeration of the measure names
 */
public Enumeration enumerateMeasures() {
Vector measures = new Vector(1);
measures.addElement("measureNumRules");
return measures.elements();
}
/**
 * Returns the value of the named measure
 *
 * @param additionalMeasureName the name of the measure to query for its value
 * @return the value of the named measure
 * @exception IllegalArgumentException if the named measure is not supported
 */
public double getMeasure(String additionalMeasureName) {
// Only the rule-count measure is supported
if (additionalMeasureName.equals("measureNumRules"))
return m_Ruleset.size();
throw new IllegalArgumentException(additionalMeasureName+" not supported (RIPPER)");
}
/** Sets the number of folds used for REP (one fold is the pruning set). */
public void setFolds(int fold){ m_Folds = fold; }
/** Gets the number of folds used for REP. */
public int getFolds(){ return m_Folds; }
/** Sets the minimal total instance weight allowed within a split. */
public void setMinNo(double m){ m_MinNo = m; }
/** Gets the minimal total instance weight allowed within a split. */
public double getMinNo(){ return m_MinNo; }
/** Sets the seed used for randomization. */
public void setSeed(long s){ m_Seed = s; }
/** Gets the seed used for randomization. */
public long getSeed(){ return m_Seed; }
/** Sets the number of optimization runs. */
public void setOptimizations(int run){ m_Optimizations = run; }
/** Gets the number of optimization runs. */
public int getOptimizations(){ return m_Optimizations; }
/** Sets whether debug output is printed to stderr. */
public void setDebug(boolean d){m_Debug = d;}
/** Gets whether debug output is printed to stderr. */
public boolean getDebug(){ return m_Debug; }
/** Sets whether the error rate >= 0.5 check is used as a stopping criterion. */
public void setCheckErrorRate(boolean d){ m_CheckErr = d;}
/** Gets whether the error rate >= 0.5 check is used as a stopping criterion. */
public boolean getCheckErrorRate(){ return m_CheckErr; }
/** Sets whether pruning is used when building rules. */
public void setUsePruning(boolean d){ m_UsePruning = d;}
/** Gets whether pruning is used when building rules. */
public boolean getUsePruning(){ return m_UsePruning; }
/**
 * Get the ruleset generated by Ripper
 *
 * @return the ruleset
 */
public FastVector getRuleset(){ return m_Ruleset; }
/**
 * Get the statistics of the ruleset in the given position
 *
 * @param pos the position of the stats, assuming correct
 * @return the rule statistics stored at the given position
 */
public RuleStats getRuleStats(int pos) {
return (RuleStats)m_RulesetStats.elementAt(pos);
}
/**
 * The single antecedent in the rule, which is composed of an attribute and
 * the corresponding value. There are two inherited classes, namely NumericAntd
 * and NominalAntd in which the attributes are numeric and nominal respectively.
 */
private abstract class Antd
implements WeightedInstancesHandler, Copyable{
/** The attribute of the antecedent */
protected Attribute att;
/** The attribute value of the antecedent.
For numeric attribute, value is either 0(1st bag) or 1(2nd bag) */
protected double value;
/** The maximum infoGain achieved by this antecedent test
 * in the growing data */
protected double maxInfoGain;
/** The accuracy rate of this antecedent test on the growing data */
protected double accuRate;
/** The coverage (sum of weights) of this antecedent in the growing data */
protected double cover;
/** The accurately-predicted weight for this antecedent in the growing data */
protected double accu;
/**
 * Constructor: stores the attribute and marks all statistics as
 * "not yet computed" (NaN), with the info gain starting at zero.
 *
 * @param a the attribute this antecedent tests
 */
public Antd(Attribute a){
att=a;
value=Double.NaN;
maxInfoGain = 0;
accuRate = Double.NaN;
cover = Double.NaN;
accu = Double.NaN;
}
/* The abstract members for inheritance */
/**
 * Splits the data into bags according to this antecedent's attribute,
 * recording the best info gain found; subclasses define the semantics.
 *
 * @param data the growing data to split
 * @param defAcRt the default accuracy rate of the data
 * @param cla the class label being predicted
 * @return the bags of instances after the split
 */
public abstract Instances[] splitData(Instances data, double defAcRt,
double cla);
/** Whether the given instance satisfies this antecedent. */
public abstract boolean covers(Instance inst);
/** Textual description of this antecedent. */
public abstract String toString();
/** Implements Copyable */
public abstract Object copy();
/* Get functions of this antecedent */
public Attribute getAttr(){ return att; }
public double getAttrValue(){ return value; }
public double getMaxInfoGain(){ return maxInfoGain; }
public double getAccuRate(){ return accuRate; }
public double getAccu(){ return accu; }
public double getCover(){ return cover; }
}
/**
 * The antecedent with numeric attribute
 */
private class NumericAntd extends Antd{
/** The split point for this numeric antecedent */
private double splitPoint;
/** Constructor: split point starts out undefined. */
public NumericAntd(Attribute a){
super(a);
splitPoint = Double.NaN;
}
/** Get split point of this numeric antecedent */
public double getSplitPoint(){ return splitPoint; }
/** Implements Copyable */
public Object copy(){
NumericAntd na = new NumericAntd(getAttr());
na.value = this.value;
na.splitPoint = this.splitPoint;
return na;
}
/**
 * Implements the splitData function.
 * This procedure is to split the data into two bags according
 * to the information gain of the numeric attribute value.
 * The maximum infoGain is also calculated.
 *
 * @param insts the data to be split
 * @param defAcRt the default accuracy rate for data
 * @param cl the class label to be predicted
 * @return the array of data after split, or null if all values missing
 */
public Instances[] splitData(Instances insts, double defAcRt,
double cl){
Instances data = insts;
int total=data.numInstances();// Total number of instances without
// missing value for att
int split=1; // Current split position
int prev=0; // Previous split position
int finalSplit=split; // Final split position
maxInfoGain = 0;
value = 0;
double fstCover=0, sndCover=0, fstAccu=0, sndAccu=0;
// NOTE(review): sorts the caller's Instances in place by this attribute;
// callers appear to rely on working with the sorted data afterwards.
data.sort(att);
// Find the last instance without missing value. All instances start in
// the second bag; assumes missing values are sorted to the end.
for(int x=0; x<data.numInstances(); x++){
Instance inst = data.instance(x);
if(inst.isMissing(att)){
total = x;
break;
}
sndCover += inst.weight();
if(Utils.eq(inst.classValue(), cl))
sndAccu += inst.weight();
}
if(total == 0) return null; // Data all missing for the attribute
splitPoint = data.instance(total-1).value(att);
for(; split <= total; split++){
if((split == total) ||
(data.instance(split).value(att) > // Can't split within
data.instance(prev).value(att))){ // same value
// Move instances [prev, split) from the second bag's statistics
// into the first bag's statistics
for(int y=prev; y<split; y++){
Instance inst = data.instance(y);
fstCover += inst.weight();
if(Utils.eq(data.instance(y).classValue(), cl)){
fstAccu += inst.weight(); // First bag positive# ++
}
}
// +1 smoothing keeps the rates defined when a bag is empty
double fstAccuRate = (fstAccu+1.0)/(fstCover+1.0),
sndAccuRate = (sndAccu+1.0)/(sndCover+1.0);
/* Which bag has higher information gain? */
boolean isFirst;
double fstInfoGain, sndInfoGain;
double accRate, infoGain, coverage, accurate;
fstInfoGain =
//Utils.eq(defAcRt, 1.0) ?
//fstAccu/(double)numConds :
fstAccu*(Utils.log2(fstAccuRate)-Utils.log2(defAcRt));
sndInfoGain =
//Utils.eq(defAcRt, 1.0) ?
//sndAccu/(double)numConds :
sndAccu*(Utils.log2(sndAccuRate)-Utils.log2(defAcRt));
if(fstInfoGain > sndInfoGain){
isFirst = true;
infoGain = fstInfoGain;
accRate = fstAccuRate;
accurate = fstAccu;
coverage = fstCover;
}
else{
isFirst = false;
infoGain = sndInfoGain;
accRate = sndAccuRate;
accurate = sndAccu;
coverage = sndCover;
}
/* Check whether so far the max infoGain */
if(infoGain > maxInfoGain){
splitPoint = data.instance(prev).value(att);
value = (isFirst) ? 0 : 1;
accuRate = accRate;
accu = accurate;
cover = coverage;
maxInfoGain = infoGain;
finalSplit = (isFirst) ? split : prev;
}
// Remove the moved instances from the second bag's statistics
for(int y=prev; y<split; y++){
Instance inst = data.instance(y);
sndCover -= inst.weight();
if(Utils.eq(data.instance(y).classValue(), cl)){
sndAccu -= inst.weight(); // Second bag positive# --
}
}
prev=split;
}
}
/* Split the data at the best position found */
Instances[] splitData = new Instances[2];
splitData[0] = new Instances(data, 0, finalSplit);
splitData[1] = new Instances(data, finalSplit, total-finalSplit);
return splitData;
}
/**
 * Whether the instance is covered by this antecedent
 *
 * @param inst the instance in question
 * @return the boolean value indicating whether the instance is covered
 * by this antecedent
 */
public boolean covers(Instance inst){
boolean isCover=true;
if(!inst.isMissing(att)){
if((int)value == 0){ // First bag
if(inst.value(att) > splitPoint)
isCover=false;
}
else if(inst.value(att) < splitPoint) // Second bag
isCover=false;
}
else
isCover = false; // Missing values are never covered
return isCover;
}
/**
 * Prints this antecedent
 *
 * @return a textual description of this antecedent
 */
public String toString() {
String symbol = ((int)value == 0) ? " <= " : " >= ";
return (att.name() + symbol + Utils.doubleToString(splitPoint, 6));
}
}
/**
 * The antecedent with nominal attribute
 */
private class NominalAntd extends Antd{
/** The parameters of infoGain calculated for each attribute value
 * in the growing data */
private double[] accurate;
private double[] coverage;
/** Constructor: one accuracy/coverage slot per nominal value. */
public NominalAntd(Attribute a){
super(a);
int bag = att.numValues();
accurate = new double[bag];
coverage = new double[bag];
}
/** Implements Copyable */
public Object copy(){
Antd antec = new NominalAntd(getAttr());
antec.value = this.value;
return antec;
}
/**
 * Implements the splitData function.
 * This procedure is to split the data into bags according
 * to the nominal attribute value.
 * The infoGain for each bag is also calculated.
 *
 * @param data the data to be split
 * @param defAcRt the default accuracy rate for data
 * @param cl the class label to be predicted
 * @return the array of data after split
 */
public Instances[] splitData(Instances data, double defAcRt,
double cl){
int bag = att.numValues();
Instances[] splitData = new Instances[bag];
for(int x=0; x<bag; x++){
splitData[x] = new Instances(data, data.numInstances());
accurate[x] = 0;
coverage[x] = 0;
}
// Distribute instances into bags by attribute value; instances with a
// missing value for this attribute are excluded from every bag
for(int x=0; x<data.numInstances(); x++){
Instance inst=data.instance(x);
if(!inst.isMissing(att)){
int v = (int)inst.value(att);
splitData[v].add(inst);
coverage[v] += inst.weight();
if((int)inst.classValue() == (int)cl)
accurate[v] += inst.weight();
}
}
// Pick the attribute value whose bag yields the highest information
// gain; p and t are smoothed by +1 so empty bags stay well-defined
for(int x=0; x<bag; x++){
double t = coverage[x]+1.0;
double p = accurate[x] + 1.0;
double infoGain =
//Utils.eq(defAcRt, 1.0) ?
//accurate[x]/(double)numConds :
accurate[x]*(Utils.log2(p/t)-Utils.log2(defAcRt));
if(infoGain > maxInfoGain){
maxInfoGain = infoGain;
cover = coverage[x];
accu = accurate[x];
accuRate = p/t;
value = (double)x;
}
}
return splitData;
}
/**
 * Whether the instance is covered by this antecedent
 *
 * @param inst the instance in question
 * @return the boolean value indicating whether the instance is
 * covered by this antecedent
 */
public boolean covers(Instance inst){
boolean isCover=false;
if(!inst.isMissing(att)){
if((int)inst.value(att) == (int)value)
isCover=true;
}
return isCover;
}
/**
 * Prints this antecedent
 *
 * @return a textual description of this antecedent
 */
public String toString() {
return (att.name() + " = " +att.value((int)value));
}
}
/**
 * This class implements a single rule that predicts specified class.
 *
 * A rule consists of antecedents "AND"ed together and the consequent
 * (class value) for the classification.
 * In this class, the Information Gain (p*[log(p/t) - log(P/T)]) is used to
 * select an antecedent and Reduced Error Pruning (REP) with the metric
 * of accuracy rate p/(p+n) or (TP+TN)/(P+N) is used to prune the rule.
 */
protected class RipperRule extends Rule{
/** The internal representation of the class label to be predicted */
private double m_Consequent = -1;
/** The vector of antecedents of this rule */
protected FastVector m_Antds = null;
/** Sets the internal class value this rule predicts. */
public void setConsequent(double cl){ m_Consequent = cl; }
/** Gets the internal class value this rule predicts. */
public double getConsequent(){ return m_Consequent; }
/** Constructor */
public RipperRule(){
m_Antds = new FastVector();
}
/**
 * Get a shallow copy of this rule
 *
 * @return the copy
 */
public Object copy(){
RipperRule copy = new RipperRule();
copy.setConsequent(getConsequent());
copy.m_Antds = (FastVector)this.m_Antds.copyElements();
return copy;
}
/**
 * Whether the instance is covered by this rule, i.e. whether it
 * satisfies every antecedent
 *
 * @param datum the instance in question
 * @return the boolean value indicating whether the instance
 * is covered by this rule
 */
public boolean covers(Instance datum){
boolean isCover=true;
for(int i=0; i<m_Antds.size(); i++){
Antd antd = (Antd)m_Antds.elementAt(i);
if(!antd.covers(datum)){
isCover = false;
break;
}
}
return isCover;
}
/**
 * Whether this rule has antecedents, i.e. whether it is a default rule
 *
 * @return the boolean value indicating whether the rule has antecedents
 */
public boolean hasAntds(){
if (m_Antds == null)
return false;
else
return (m_Antds.size() > 0);
}
/**
 * The number of antecedents of the rule
 *
 * @return the size of this rule
 */
public double size(){ return (double)m_Antds.size(); }
/**
 * Private function to compute default number of accurate instances
 * in the specified data for the consequent of the rule
 *
 * @param data the data in question
 * @return the default accuracy number: the summed weight of instances
 * whose class equals the consequent
 */
private double computeDefAccu(Instances data){
double defAccu=0;
for(int i=0; i<data.numInstances(); i++){
Instance inst = data.instance(i);
if((int)inst.classValue() == (int)m_Consequent)
defAccu += inst.weight();
}
return defAccu;
}
/**
 * Build one rule using the growing data
 *
 * @param data the growing data used to build the rule
 * @exception Exception if the consequent is not set yet
 */
public void grow(Instances data) throws Exception {
if(m_Consequent == -1)
throw new Exception(" Consequent not set yet.");
Instances growData = data;
double sumOfWeights = growData.sumOfWeights();
if(!Utils.gr(sumOfWeights, 0.0))
return;
/* Compute the default accuracy rate of the growing data */
double defAccu = computeDefAccu(growData);
double defAcRt = (defAccu+1.0)/(sumOfWeights+1.0);
/* Keep the record of which attributes have already been used */
boolean[] used=new boolean [growData.numAttributes()];
for (int k=0; k<used.length; k++)
used[k]=false;
int numUnused=used.length;
// If there are already antecedents existing, mark their (nominal)
// attributes as used; numeric attributes may be reused
for(int j=0; j < m_Antds.size(); j++){
Antd antdj = (Antd)m_Antds.elementAt(j);
if(!antdj.getAttr().isNumeric()){
used[antdj.getAttr().index()]=true;
numUnused--;
}
}
double maxInfoGain;
// Keep adding antecedents while growing data remains, unused
// attributes remain, and the rule is not yet perfectly accurate
while (Utils.gr(growData.numInstances(), 0.0) &&
(numUnused > 0)
&& Utils.sm(defAcRt, 1.0)
){
// We require that infoGain be positive
/*if(numAntds == originalSize)
maxInfoGain = 0.0; // At least one condition allowed
else
maxInfoGain = Utils.eq(defAcRt, 1.0) ?
defAccu/(double)numAntds : 0.0; */
maxInfoGain = 0.0;
/* Build a list of antecedents */
Antd oneAntd=null;
Instances coverData = null;
Enumeration enumAttr=growData.enumerateAttributes();
int index=-1;
/* Build one condition based on all attributes not used yet */
while (enumAttr.hasMoreElements()){
Attribute att= (Attribute)(enumAttr.nextElement());
index++;
if(m_Debug)
System.err.println("\nOne condition: size = "
+ growData.sumOfWeights());
Antd antd =null;
if(att.isNumeric())
antd = new NumericAntd(att);
else
antd = new NominalAntd(att);
if(!used[index]){
/* Compute the best information gain for each attribute,
it's stored in the antecedent formed by this attribute.
This procedure returns the data covered by the antecedent */
Instances coveredData = computeInfoGain(growData, defAcRt,
antd);
if(coveredData != null){
double infoGain = antd.getMaxInfoGain();
if(m_Debug)
System.err.println("Test of \'"+antd.toString()+
"\': infoGain = "+
infoGain + " | Accuracy = " +
antd.getAccuRate()+
"="+antd.getAccu()
+"/"+antd.getCover()+
" def. accuracy: "+defAcRt);
if(infoGain > maxInfoGain){
oneAntd=antd;
coverData = coveredData;
maxInfoGain = infoGain;
}
}
}
}
if(oneAntd == null) break; // Cannot find antds
if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break;// Too low coverage
// Numeric attributes can be used more than once
if(!oneAntd.getAttr().isNumeric()){
used[oneAntd.getAttr().index()]=true;
numUnused--;
}
m_Antds.addElement(oneAntd);
growData = coverData;// Grow data size is shrinking
defAcRt = oneAntd.getAccuRate();
}
}
/**
 * Compute the best information gain for the specified antecedent
 *
 * @param instances the data based on which the infoGain is computed
 * @param defAcRt the default accuracy rate of data
 * @param antd the specific antecedent
 * @return the data covered by the antecedent, or null if the data could
 * not be split (e.g. all values of the attribute are missing)
 */
private Instances computeInfoGain(Instances instances, double defAcRt,
Antd antd){
Instances data = instances;
/* Split the data into bags.
The information gain of each bag is also calculated in this procedure */
Instances[] splitData = antd.splitData(data, defAcRt,
m_Consequent);
/* Get the bag of data to be used for next antecedents */
if(splitData != null)
return splitData[(int)antd.getAttrValue()];
else return null;
}
/**
 * Prune all the possible final sequences of the rule using the
 * pruning data. The measure used to prune the rule is based on
 * flag given.
 *
 * @param pruneData the pruning data used to prune the rule
 * @param useWhole flag to indicate whether use the error rate of
 * the whole pruning data instead of the data covered
 */
public void prune(Instances pruneData, boolean useWhole){
Instances data = pruneData;
double total = data.sumOfWeights();
if(!Utils.gr(total, 0.0))
return;
/* The default accurate # and rate on pruning data */
double defAccu=computeDefAccu(data);
if(m_Debug)
System.err.println("Pruning with " + defAccu +
" positive data out of " + total +
" instances");
int size=m_Antds.size();
if(size == 0) return; // Default rule before pruning
double[] worthRt = new double[size];
double[] coverage = new double[size];
double[] worthValue = new double[size];
for(int w=0; w<size; w++){
worthRt[w]=coverage[w]=worthValue[w]=0.0;
}
/* Calculate accuracy parameters for all the antecedents in this rule */
// tn accumulates across antecedents: instances rejected by an earlier
// antecedent stay "not covered" for all later prefixes of the rule
double tn = 0.0; // True negative if useWhole
for(int x=0; x<size; x++){
Antd antd=(Antd)m_Antds.elementAt(x);
Attribute attr= antd.getAttr();
Instances newData = data;
data = new Instances(newData, 0); // Make data empty
for(int y=0; y<newData.numInstances(); y++){
Instance ins=newData.instance(y);
if(antd.covers(ins)){ // Covered by this antecedent
coverage[x] += ins.weight();
data.add(ins); // Add to data for further pruning
if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction
worthValue[x] += ins.weight();
}
else if(useWhole){ // Not covered
if((int)ins.classValue() != (int)m_Consequent)
tn += ins.weight();
}
}
if(useWhole){
worthValue[x] += tn;
worthRt[x] = worthValue[x] / total;
}
else // Note if coverage is 0, accuracy is 0.5
worthRt[x] = (worthValue[x]+1.0)/(coverage[x]+2.0);
}
// Baseline is the empty rule's (smoothed) worth; only strictly better
// prefixes are kept, so ties favour the shorter rule
double maxValue = (defAccu+1.0)/(total+2.0);
int maxIndex = -1;
for(int i=0; i<worthValue.length; i++){
if(m_Debug){
double denom = useWhole ? total : coverage[i];
System.err.println(i+"(useAccuray? "+!useWhole+"): "
+ worthRt[i] +
"="+worthValue[i]+
"/"+denom);
}
if(worthRt[i] > maxValue){ // Prefer to the
maxValue = worthRt[i]; // shorter rule
maxIndex = i;
}
}
/* Prune the antecedents according to the accuracy parameters */
for(int z=size-1;z>maxIndex;z--)
m_Antds.removeElementAt(z);
}
/**
 * Prints this rule
 *
 * @param classAttr the class attribute in the data
 * @return a textual description of this rule
 */
public String toString(Attribute classAttr) {
StringBuffer text = new StringBuffer();
if(m_Antds.size() > 0){
for(int j=0; j< (m_Antds.size()-1); j++)
text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and ");
text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")");
}
text.append(" => " + classAttr.name() +
"=" + classAttr.value((int)m_Consequent));
return text.toString();
}
}
/**
 * Builds Ripper in the order of class frequencies. For each class
 * it's built in two stages: building and optimization
 *
 * @param instances the training data
 * @exception Exception if classifier can't be built successfully
 */
public void buildClassifier(Instances instances) throws Exception {
if(instances.numInstances() == 0)
throw new Exception(" No instances with a class value!");
if (instances.checkForStringAttributes())
throw new UnsupportedAttributeTypeException(" Cannot handle string attributes!");
if (!instances.classAttribute().isNominal())
throw new UnsupportedClassTypeException(" Only nominal class, please.");
m_Random = new Random(m_Seed);
m_Total = RuleStats.numAllConditions(instances);
if(m_Debug)
System.err.println("Number of all possible conditions = "+m_Total);
Instances data = null;
m_Filter = new ClassOrder();
// Derive the filter seed from a randomly picked training instance so
// the class order differs between cross-validation runs
Instance inst =
instances.instance((int)(m_Random.nextDouble()*(double)instances.numInstances()));
((ClassOrder)m_Filter).setSeed((long)inst.toString().hashCode());
((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
m_Filter.setInputFormat(instances);
data = Filter.useFilter(instances, m_Filter);
if(data == null)
throw new Exception(" Unable to randomize the class orders.");
data.deleteWithMissingClass();
if(data.numInstances() == 0)
throw new Exception(" No instances with a class value!");
if(data.numInstances() < m_Folds)
throw new Exception(" Not enough data for REP.");
m_Class = data.classAttribute();
m_Ruleset = new FastVector();
m_RulesetStats = new FastVector();
m_Distributions = new FastVector();
// Classes are now sorted by ascending frequency
double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts();
if(m_Debug){
System.err.println("Sorted classes:");
for(int x=0; x < m_Class.numValues(); x++)
System.err.println(x+": "+m_Class.value(x) + " has " +
orderedClasses[x] + " instances.");
}
// Iterate from less prevalent class to more frequent one; the most
// frequent class is left for the default rule
oneClass:
for(int y=0; y < data.numClasses()-1; y++){ // For each class
double classIndex = (double)y;
if(m_Debug){
int ci = (int)classIndex;
System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): "
+ orderedClasses[y] + "instances\n"+
"=====================================\n");
}
if(Utils.eq(orderedClasses[y],0.0)) // No data for this class
continue oneClass;
// The expected FP/err is the proportion of the class among the
// classes not yet covered by earlier rulesets
double all = 0;
for(int i=y; i<orderedClasses.length; i++)
all += orderedClasses[i];
double expFPRate = orderedClasses[y] / all;
double classYWeights = 0, totalWeights = 0;
for(int j=0; j < data.numInstances(); j++){
Instance datum = data.instance(j);
totalWeights += datum.weight();
if((int)datum.classValue() == y){
classYWeights += datum.weight();
}
}
// DL of default rule, no theory DL, only data DL
double defDL;
if(classYWeights > 0)
defDL = RuleStats.dataDL(expFPRate,
0.0,
totalWeights,
0.0,
classYWeights);
else
continue oneClass; // Subsumed by previous rules
if(Double.isNaN(defDL) || Double.isInfinite(defDL))
throw new Exception("Should never happen: "+
"defDL NaN or infinite!");
if(m_Debug)
System.err.println("The default DL = "+defDL);
data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
}
// Set the default rule, which predicts the most frequent class
RipperRule defRule = new RipperRule();
defRule.setConsequent((double)(data.numClasses()-1));
m_Ruleset.addElement(defRule);
RuleStats defRuleStat = new RuleStats();
defRuleStat.setData(data);
defRuleStat.setNumAllConds(m_Total);
defRuleStat.addAndUpdate(defRule);
m_RulesetStats.addElement(defRuleStat);
// Collect each rule's class distribution, mapped back to the original
// class order.
// BUGFIX: test for null BEFORE normalizing. The original code called
// Utils.normalize(classDist) first and checked for null afterwards,
// so the guard could never prevent the NPE it was written to avoid.
for(int z=0; z < m_RulesetStats.size(); z++){
RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z);
for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){
double[] classDist = oneClass.getDistributions(xyz);
if(classDist != null){
Utils.normalize(classDist);
m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist));
}
}
}
}
/**
 * Classify the test instance with the rule learner and provide
 * the class distributions
 *
 * @param datum the instance to be classified
 * @return the class distribution of the first rule covering the instance
 */
public double[] distributionForInstance(Instance datum){
    try{
        // Rules are ordered: the first rule that covers the instance
        // determines the returned distribution
        int numRules = m_Ruleset.size();
        for(int index = 0; index < numRules; index++){
            RipperRule current = (RipperRule)m_Ruleset.elementAt(index);
            if(current.covers(datum)){
                return (double[])m_Distributions.elementAt(index);
            }
        }
    }catch(Exception e){
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
    // The default rule has no antecedents, so it should always fire;
    // reaching this point is unexpected
    System.err.println("Should never happen!");
    return new double[datum.classAttribute().numValues()];
}
/** Build a ruleset for the given class according to the given data.
* Runs the IREP-style building stage (grow/prune rules until the
* stopping criterion in checkStop() is met) followed by the RIPPER
* optimization stage (replace/revise each rule if that lowers the
* relative description length).
*
* @param expFPRate the expected FP/(FP+FN) used in DL calculation
* @param data the given data
* @param classIndex the given class index
* @param defDL the default DL in the data
* @return the instances not covered by the ruleset built for this class
* @exception Exception if the ruleset cannot be built properly
*/
protected Instances rulesetForOneClass(double expFPRate,
Instances data,
double classIndex,
double defDL)
throws Exception {
Instances newData = data, growData, pruneData;
boolean stop = false;
FastVector ruleset = new FastVector();
double dl = defDL, minDL = defDL;
RuleStats rstats = null;
// Simple stats of one rule (from RuleStats.getSimpleStats), as used by
// the debug output below: rst[0]=weight covered, rst[1]=weight not
// covered, rst[2]=positives covered, rst[4]=negatives covered,
// rst[5]=positives not covered
double[] rst;
// Check whether data have positive examples
boolean defHasPositive = true; // No longer used
boolean hasPositive = defHasPositive;
/********************** Building stage ***********************/
if(m_Debug)
System.err.println("\n*** Building stage ***");
while((!stop) && hasPositive){ // Generate new rules until
// stopping criteria met
RipperRule oneRule;
if(m_UsePruning){
/* Split data into Grow and Prune*/
// We should have stratified the data, but ripper seems
// to have a bug that makes it not to do so. In order
// to simulate it more precisely, we do the same thing.
//newData.randomize(m_Random);
newData = RuleStats.stratify(newData, m_Folds, m_Random);
Instances[] part = RuleStats.partition(newData, m_Folds);
growData=part[0];
pruneData=part[1];
//growData=newData.trainCV(m_Folds, m_Folds-1);
//pruneData=newData.testCV(m_Folds, m_Folds-1);
oneRule = new RipperRule();
oneRule.setConsequent(classIndex); // Must set first
if(m_Debug)
System.err.println("\nGrowing a rule ...");
oneRule.grow(growData); // Build the rule
if(m_Debug)
System.err.println("One rule found before pruning:"+
oneRule.toString(m_Class));
if(m_Debug)
System.err.println("\nPruning the rule ...");
oneRule.prune(pruneData, false); // Prune the rule
if(m_Debug)
System.err.println("One rule found after pruning:"+
oneRule.toString(m_Class));
}
else{
oneRule = new RipperRule();
oneRule.setConsequent(classIndex); // Must set first
if(m_Debug)
System.err.println("\nNo pruning: growing a rule ...");
oneRule.grow(newData); // Build the rule
if(m_Debug)
System.err.println("No pruning: one rule found:\n"+
oneRule.toString(m_Class));
}
// Compute the DL of this ruleset
if(rstats == null){ // First rule
rstats = new RuleStats();
rstats.setNumAllConds(m_Total);
rstats.setData(newData);
}
rstats.addAndUpdate(oneRule);
int last = rstats.getRuleset().size()-1; // Index of last rule
dl += rstats.relativeDL(last, expFPRate, m_CheckErr);
if(Double.isNaN(dl) || Double.isInfinite(dl))
throw new Exception("Should never happen: dl in "+
"building stage NaN or infinite!");
if(m_Debug)
System.err.println("Before optimization("+last+
"): the dl = "+dl+" | best: "+minDL);
if(dl < minDL)
minDL = dl; // The best dl so far
rst = rstats.getSimpleStats(last);
if(m_Debug)
System.err.println("The rule covers: "+rst[0]+
" | pos = " + rst[2] +
" | neg = " + rst[4]+
"\nThe rule doesn't cover: "+rst[1]+
" | pos = " + rst[5]);
stop = checkStop(rst, minDL, dl);
if(!stop){
ruleset.addElement(oneRule); // Accepted
newData = rstats.getFiltered(last)[1];// Data not covered
hasPositive = Utils.gr(rst[5], 0.0); // Positives remaining?
if(m_Debug)
System.err.println("One rule added: has positive? "
+hasPositive);
}
else{
if(m_Debug)
System.err.println("Quit rule");
rstats.removeLast(); // Remove last to be re-used
}
}// while !stop
/******************** Optimization stage *******************/
RuleStats finalRulesetStat = null;
if(m_UsePruning){
for(int z=0; z < m_Optimizations; z++){
if(m_Debug)
System.err.println("\n*** Optimization: run #"
+z+" ***");
newData = data;
finalRulesetStat = new RuleStats();
finalRulesetStat.setData(newData);
finalRulesetStat.setNumAllConds(m_Total);
int position=0;
stop = false;
boolean isResidual = false;
hasPositive = defHasPositive;
dl = minDL = defDL;
oneRule:
while(!stop && hasPositive){
isResidual = (position>=ruleset.size()); // Cover residual positive examples
// Re-do shuffling and stratification
//newData.randomize(m_Random);
newData = RuleStats.stratify(newData, m_Folds, m_Random);
Instances[] part = RuleStats.partition(newData, m_Folds);
growData=part[0];
pruneData=part[1];
//growData=newData.trainCV(m_Folds, m_Folds-1);
//pruneData=newData.testCV(m_Folds, m_Folds-1);
RipperRule finalRule;
if(m_Debug)
System.err.println("\nRule #"+position +
"| isResidual?" + isResidual+
"| data size: "+newData.sumOfWeights());
if(isResidual){
// Past the end of the built ruleset: grow brand-new rules
// to cover the remaining positive examples
RipperRule newRule = new RipperRule();
newRule.setConsequent(classIndex);
if(m_Debug)
System.err.println("\nGrowing and pruning"+
" a new rule ...");
newRule.grow(growData);
newRule.prune(pruneData, false);
finalRule = newRule;
if(m_Debug)
System.err.println("\nNew rule found: "+
newRule.toString(m_Class));
}
else{
RipperRule oldRule = (RipperRule)ruleset.elementAt(position);
boolean covers = false;
// Test coverage of the next old rule
for(int i=0; i<newData.numInstances(); i++)
if(oldRule.covers(newData.instance(i))){
covers = true;
break;
}
if(!covers){// Null coverage, no variants can be generated
finalRulesetStat.addAndUpdate(oldRule);
position++;
continue oneRule;
}
// 2 variants
// Variant 1 ("replace"): grow and prune a brand-new rule
// in place of the old one
if(m_Debug)
System.err.println("\nGrowing and pruning"+
" Replace ...");
RipperRule replace = new RipperRule();
replace.setConsequent(classIndex);
replace.grow(growData);
// Remove the pruning data covered by the following
// rules, then simply compute the error rate of the
// current rule to prune it. According to Ripper,
// it's equivalent to computing the error of the
// whole ruleset -- is it true?
pruneData = RuleStats.rmCoveredBySuccessives(pruneData,ruleset, position);
replace.prune(pruneData, true);
// Variant 2 ("revision"): extend a copy of the old rule
if(m_Debug)
System.err.println("\nGrowing and pruning"+
" Revision ...");
RipperRule revision = (RipperRule)oldRule.copy();
// For revision, restrict the grow data to the instances
// already covered by the old rule before growing further
Instances newGrowData = new Instances(growData, 0);
for(int b=0; b<growData.numInstances(); b++){
Instance inst = growData.instance(b);
if(revision.covers(inst))
newGrowData.add(inst);
}
revision.grow(newGrowData);
revision.prune(pruneData, true);
double[][] prevRuleStats = new double[position][6];
for(int c=0; c < position; c++)
prevRuleStats[c] = finalRulesetStat.getSimpleStats(c);
// Now compare the relative DL of variants
FastVector tempRules = (FastVector)ruleset.copyElements();
tempRules.setElementAt(replace, position);
RuleStats repStat = new RuleStats(data, tempRules);
repStat.setNumAllConds(m_Total);
repStat.countData(position, newData, prevRuleStats);
//repStat.countData();
rst = repStat.getSimpleStats(position);
if(m_Debug)
System.err.println("Replace rule covers: "+rst[0]+
" | pos = " + rst[2] +
" | neg = " + rst[4]+
"\nThe rule doesn't cover: "+rst[1]+
" | pos = " + rst[5]);
double repDL = repStat.relativeDL(position, expFPRate,
m_CheckErr);
if(m_Debug)
System.err.println("\nReplace: "+
replace.toString(m_Class)
+" |dl = "+repDL);
if(Double.isNaN(repDL) || Double.isInfinite(repDL))
throw new Exception("Should never happen: repDL"+
"in optmz. stage NaN or "+
"infinite!");
tempRules.setElementAt(revision, position);
RuleStats revStat = new RuleStats(data, tempRules);
revStat.setNumAllConds(m_Total);
revStat.countData(position, newData, prevRuleStats);
//revStat.countData();
double revDL = revStat.relativeDL(position, expFPRate,
m_CheckErr);
if(m_Debug)
System.err.println("Revision: "
+ revision.toString(m_Class)
+" |dl = "+revDL);
if(Double.isNaN(revDL) || Double.isInfinite(revDL))
throw new Exception("Should never happen: revDL"+
"in optmz. stage NaN or "+
"infinite!");
rstats = new RuleStats(data, ruleset);
rstats.setNumAllConds(m_Total);
rstats.countData(position, newData, prevRuleStats);
//rstats.countData();
double oldDL = rstats.relativeDL(position, expFPRate,
m_CheckErr);
if(Double.isNaN(oldDL) || Double.isInfinite(oldDL))
throw new Exception("Should never happen: oldDL"+
"in optmz. stage NaN or "+
"infinite!");
if(m_Debug)
System.err.println("Old rule: "+
oldRule.toString(m_Class)
+" |dl = "+oldDL);
if(m_Debug)
System.err.println("\nrepDL: "+repDL+
"\nrevDL: "+revDL+
"\noldDL: "+oldDL);
// Keep whichever variant has the smallest relative DL;
// ties favor the old rule, then the revision
if((oldDL <= revDL) && (oldDL <= repDL))
finalRule = oldRule; // Old the best
else if(revDL <= repDL)
finalRule = revision; // Revision the best
else
finalRule = replace; // Replace the best
}
finalRulesetStat.addAndUpdate(finalRule);
rst = finalRulesetStat.getSimpleStats(position);
if(isResidual){
dl += finalRulesetStat.relativeDL(position,
expFPRate,
m_CheckErr);
if(m_Debug)
System.err.println("After optimization: the dl"
+"="+dl+" | best: "+minDL);
if(dl < minDL)
minDL = dl; // The best dl so far
stop = checkStop(rst, minDL, dl);
if(!stop)
ruleset.addElement(finalRule); // Accepted
else{
finalRulesetStat.removeLast(); // Remove last to be re-used
position--;
}
}
else
ruleset.setElementAt(finalRule, position); // Accepted
if(m_Debug){
System.err.println("The rule covers: "+rst[0]+
" | pos = " + rst[2] +
" | neg = " + rst[4]+
"\nThe rule doesn't cover: "+rst[1]+
" | pos = " + rst[5]);
System.err.println("\nRuleset so far: ");
for(int x=0; x<ruleset.size(); x++)
System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
System.err.println();
}
//Data not covered
if(finalRulesetStat.getRulesetSize() > 0)// If any rules
newData = finalRulesetStat.getFiltered(position)[1];
hasPositive = Utils.gr(rst[5], 0.0); //Positives remaining?
position++;
} // while !stop && hasPositive
if(ruleset.size() > (position+1)){ // Hasn't gone through yet
for(int k=position+1; k<ruleset.size(); k++)
finalRulesetStat.addAndUpdate((Rule)ruleset.elementAt(k));
}
if(m_Debug)
System.err.println("\nDeleting rules to decrease"+
" DL of the whole ruleset ...");
finalRulesetStat.reduceDL(expFPRate, m_CheckErr);
if(m_Debug){
int del = ruleset.size() -
finalRulesetStat.getRulesetSize();
System.err.println(del+" rules are deleted"+
" after DL reduction procedure");
}
ruleset = finalRulesetStat.getRuleset();
rstats = finalRulesetStat;
} // For each run of optimization
} // if pruning is used
// Concatenate the ruleset for this class to the whole ruleset
if(m_Debug){
System.err.println("\nFinal ruleset: ");
for(int x=0; x<ruleset.size(); x++)
System.err.println(x+": "+((RipperRule)ruleset.elementAt(x)).toString(m_Class));
System.err.println();
}
m_Ruleset.appendElements(ruleset);
m_RulesetStats.addElement(rstats);
if(ruleset.size() > 0)// If any rules for this class
return rstats.getFiltered(ruleset.size()-1)[1]; // Data not
else // covered
return data;
}
/**
 * Check whether the stopping criterion meets
 *
 * @param rst the statistic of the ruleset
 * @param minDL the min description length so far
 * @param dl the current description length of the ruleset
 * @return true if stop criterion meets, false otherwise
 */
private boolean checkStop(double[] rst, double minDL, double dl){
    // Stop when the description length exceeds the best seen so far
    // by more than the allowed surplus
    if(dl > minDL+MAX_DL_SURPLUS){
        if(m_Debug)
            System.err.println("DL too large: "+dl+" | "+minDL);
        return true;
    }
    // Stop when the rule covers no positive examples
    if(!Utils.gr(rst[2], 0.0)){
        if(m_Debug)
            System.err.println("Too few positives.");
        return true;
    }
    // High error rate only stops the search when error checking is on
    if((rst[4]/rst[0]) >= 0.5){
        if(!m_CheckErr)
            return false;
        if(m_Debug)
            System.err.println("Error too large: "+
                               rst[4] + "/" + rst[0]);
        return true;
    }
    // No criterion met: keep generating rules
    if(m_Debug)
        System.err.println("Continue.");
    return false;
}
/**
 * Prints the all the rules of the rule learner.
 *
 * @return a textual description of the classifier
 */
public String toString() {
    if (m_Ruleset == null)
        return "JRIP: No model built yet.";
    StringBuffer buffer = new StringBuffer("JRIP rules:\n"+
                                           "===========\n\n");
    // One section of rules per class, each annotated with
    // (covered weight / covered negatives)
    for(int statIndex = 0; statIndex < m_RulesetStats.size(); statIndex++){
        RuleStats stats = (RuleStats)m_RulesetStats.elementAt(statIndex);
        FastVector classRules = stats.getRuleset();
        for(int ruleIndex = 0; ruleIndex < classRules.size(); ruleIndex++){
            double[] simStats = stats.getSimpleStats(ruleIndex);
            RipperRule rule = (RipperRule)classRules.elementAt(ruleIndex);
            buffer.append(rule.toString(m_Class)
                          + " ("+simStats[0]+"/"+simStats[4]+")\n");
        }
    }
    if(m_Debug){
        System.err.println("Inside m_Ruleset");
        for(int i=0; i<m_Ruleset.size(); i++)
            System.err.println(((RipperRule)m_Ruleset.elementAt(i)).toString(m_Class));
    }
    buffer.append("\nNumber of Rules : "
                  + m_Ruleset.size() + "\n");
    return buffer.toString();
}
/**
 * Main method.
 *
 * @param args the options for the classifier
 */
public static void main(String[] args) {
    try {
        String result = Evaluation.evaluateModel(new JRip(), args);
        System.out.println(result);
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println(e.getMessage());
    }
}
}