/* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* * ConjunctiveRule.java * Copyright (C) 2001 Xin Xu * */ package weka.classifiers.rules; import java.io.*; import java.util.*; import weka.core.*; import weka.classifiers.*; /** * This class implements a single conjunctive rule learner that can predict * for numeric and nominal class labels.<p> * * A rule consists of antecedents "AND"ed together and the consequent (class value) * for the classification/regression. In this case, the consequent is the * distribution of the available classes (or numeric value) in the dataset. * If the test instance is not covered by this rule, then it's predicted * using the default class distributions/value of the data not covered by the * rule in the training data. <br> * This learner selects an antecedent by computing the Information Gain of each * antecendent and prunes the generated rule using Reduced Error Prunning (REP). <p> * * For classification, the Information of one antecedent is the weighted average of * the entropies of both the data covered and not covered by the rule. <br> * * For regression, the Information is the weighted average of the mean-squared errors * of both the data covered and not covered by the rule. <p> * * In pruning, weighted average of accuracy rate of the pruning data is used * for classification while the weighted average of the mean-squared errors * of the pruning data is used for regression. <p> * * @author: Xin XU (xx5@cs.waikato.ac.nz) * @version $Revision: 1.1.1.1 $ */ public class ConjunctiveRule extends DistributionClassifier implements OptionHandler, WeightedInstancesHandler{ /** The number of folds to split data into Grow and Prune for REP*/ private int m_Folds = 3; /** The class attribute of the data*/ private Attribute m_ClassAttribute; /** The vector of antecedents of this rule*/ protected FastVector m_Antds = null; /** The default rule distribution of the data not covered*/ protected double[] m_DefDstr = null; /** The consequent of this rule */ protected double[] m_Cnsqt = null; /** Number of classes in the training data */ private int m_NumClasses = 0; /** The seed to perform randomization */ private long m_Seed = 1; /** The Random object used for randomization */ private Random m_Random = null; /** Whether randomize the data */ private boolean m_IsRandomized = true; /** The predicted classes recorded for each antecedent in the growing data */ private FastVector m_Targets; /** Whether to use exlusive expressions for nominal attributes */ private boolean m_IsExclude = false; /** The minimal number of instance weights within a split*/ private double m_MinNo = 2.0; /** The number of antecedents in pre-pruning */ private int m_NumAntds = -1; /** * The single antecedent in the rule, which is composed of an attribute and * the corresponding value. There are two inherited classes, namely NumericAntd * and NominalAntd in which the attributes are numeric and nominal respectively. */ private abstract class Antd{ /** The attribute of the antecedent */ protected Attribute att; /** The attribute value of the antecedent. For numeric attribute, value is either 0(1st bag) or 1(2nd bag) */ protected double value; /** The maximum infoGain achieved by this antecedent test */ protected double maxInfoGain; /** The information of this antecedent test on the growing data */ protected double inform; /** The parameter related to the meanSquaredError of the data not covered by the previous antecedents when the class is numeric */ protected double uncoverWtSq, uncoverWtVl, uncoverSum; /** The parameters related to the data not covered by the previous antecedents when the class is nominal */ protected double[] uncover; /** Constructor for nominal class */ public Antd(Attribute a, double[] unc){ att=a; value=Double.NaN; maxInfoGain = 0; inform = Double.NaN; uncover = unc; } /* Constructor for numeric class */ public Antd(Attribute a, double uncoveredWtSq, double uncoveredWtVl, double uncoveredWts){ att=a; value=Double.NaN; maxInfoGain = 0; inform = Double.NaN; uncoverWtSq = uncoveredWtSq; uncoverWtVl = uncoveredWtVl; uncoverSum = uncoveredWts; } /* The abstract members for inheritance */ public abstract Instances[] splitData(Instances data, double defInfo); public abstract boolean isCover(Instance inst); public abstract String toString(); /* Get functions of this antecedent */ public Attribute getAttr(){ return att; } public double getAttrValue(){ return value; } public double getMaxInfoGain(){ return maxInfoGain; } public double getInfo(){ return inform;} /** * Function used to calculate the weighted mean squared error, * i.e., sum[x-avg(x)]^2 based on the given elements of the formula: * meanSquaredError = sum(Wi*Xi^2) - (sum(WiXi))^2/sum(Wi) * * @param weightedSq sum(Wi*Xi^2) * @param weightedValue sum(WiXi) * @param sum sum of weights * @return the weighted mean-squared error */ protected double wtMeanSqErr(double weightedSq, double weightedValue, double sum){ if(Utils.smOrEq(sum, 1.0E-6)) return 0; return (weightedSq - (weightedValue * weightedValue) / sum); } /** * Function used to calculate the entropy of given vector of values * entropy = (1/sum)*{-sigma[i=1..P](Xi*log2(Xi)) + sum*log2(sum)} * where P is the length of the vector * * @param value the given vector of values * @param sum the sum of the given values. It's provided just for efficiency. * @return the entropy */ protected double entropy(double[] value, double sum){ if(Utils.smOrEq(sum, 1.0E-6)) return 0; double entropy = 0; for(int i=0; i < value.length; i++){ if(!Utils.eq(value[i],0)) entropy -= value[i] * Utils.log2(value[i]); } entropy += sum * Utils.log2(sum); entropy /= sum; return entropy; } } /** * The antecedent with numeric attribute */ private class NumericAntd extends Antd{ /* The split point for this numeric antecedent */ private double splitPoint; /* Constructor for nominal class */ public NumericAntd(Attribute a, double[] unc){ super(a, unc); splitPoint = Double.NaN; } /* Constructor for numeric class */ public NumericAntd(Attribute a, double sq, double vl, double wts){ super(a, sq, vl, wts); splitPoint = Double.NaN; } /* Get split point of this numeric antecedent */ public double getSplitPoint(){ return splitPoint; } /** * Implements the splitData function. * This procedure is to split the data into two bags according * to the information gain of the numeric attribute value * the data with missing values are stored in the last split. * The maximum infoGain is also calculated. * * @param insts the data to be split * @param defInfo the default information for data * @return the array of data after split */ public Instances[] splitData(Instances insts, double defInfo){ Instances data = new Instances(insts); data.sort(att); int total=data.numInstances();// Total number of instances without // missing value for att maxInfoGain = 0; value = 0; // Compute minimum number of Instances required in each split double minSplit; if(m_ClassAttribute.isNominal()){ minSplit = 0.1 * (data.sumOfWeights()) / ((double)m_ClassAttribute.numValues()); if (Utils.smOrEq(minSplit,m_MinNo)) minSplit = m_MinNo; else if (Utils.gr(minSplit,25)) minSplit = 25; } else minSplit = m_MinNo; double[] fst=null, snd=null, missing=null; if(m_ClassAttribute.isNominal()){ fst = new double[m_NumClasses]; snd = new double[m_NumClasses]; missing = new double[m_NumClasses]; for(int v=0; v < m_NumClasses; v++) fst[v]=snd[v]=missing[v]=0.0; } double fstCover=0, sndCover=0, fstWtSq=0, sndWtSq=0, fstWtVl=0, sndWtVl=0; int split=1; // Current split position int prev=0; // Previous split position int finalSplit=split; // Final split position for(int x=0; x<data.numInstances(); x++){ Instance inst = data.instance(x); if(inst.isMissing(att)){ total = x; break; } sndCover += inst.weight(); if(m_ClassAttribute.isNominal()) // Nominal class snd[(int)inst.classValue()] += inst.weight(); else{ // Numeric class sndWtSq += inst.weight() * inst.classValue() * inst.classValue(); sndWtVl += inst.weight() * inst.classValue(); } } // Enough Instances with known values? if (Utils.sm(sndCover,(2*minSplit))) return null; double msingWtSq=0, msingWtVl=0; Instances missingData = new Instances(data, 0); for(int y=total; y < data.numInstances(); y++){ Instance inst = data.instance(y); missingData.add(inst); if(m_ClassAttribute.isNominal()) missing[(int)inst.classValue()] += inst.weight(); else{ msingWtSq += inst.weight() * inst.classValue() * inst.classValue(); msingWtVl += inst.weight() * inst.classValue(); } } if(total == 0) return null; // Data all missing for the attribute splitPoint = data.instance(total-1).value(att); for(; split < total; split++){ if(!Utils.eq(data.instance(split).value(att), // Can't split data.instance(prev).value(att))){// within same value // Move the split point for(int y=prev; y<split; y++){ Instance inst = data.instance(y); fstCover += inst.weight(); sndCover -= inst.weight(); if(m_ClassAttribute.isNominal()){ // Nominal class fst[(int)inst.classValue()] += inst.weight(); snd[(int)inst.classValue()] -= inst.weight(); } else{ // Numeric class fstWtSq += inst.weight() * inst.classValue() * inst.classValue(); fstWtVl += inst.weight() * inst.classValue(); sndWtSq -= inst.weight() * inst.classValue() * inst.classValue(); sndWtVl -= inst.weight() * inst.classValue(); } } if(Utils.sm(fstCover, minSplit) || Utils.sm(sndCover, minSplit)){ prev=split; // Cannot split because either continue; // split has not enough data } double fstEntp = 0, sndEntp = 0; if(m_ClassAttribute.isNominal()){ fstEntp = entropy(fst, fstCover); sndEntp = entropy(snd, sndCover); } else{ fstEntp = wtMeanSqErr(fstWtSq, fstWtVl, fstCover)/fstCover; sndEntp = wtMeanSqErr(sndWtSq, sndWtVl, sndCover)/sndCover; } /* Which bag has higher information gain? */ boolean isFirst; double fstInfoGain, sndInfoGain; double info, infoGain, fstInfo, sndInfo; if(m_ClassAttribute.isNominal()){ double sum = data.sumOfWeights(); double otherCover, whole = sum + Utils.sum(uncover), otherEntropy; double[] other = null; // InfoGain of first bag other = new double[m_NumClasses]; for(int z=0; z < m_NumClasses; z++) other[z] = uncover[z] + snd[z] + missing[z]; otherCover = whole - fstCover; otherEntropy = entropy(other, otherCover); // Weighted average fstInfo = (fstEntp*fstCover + otherEntropy*otherCover)/whole; fstInfoGain = defInfo - fstInfo; // InfoGain of second bag other = new double[m_NumClasses]; for(int z=0; z < m_NumClasses; z++) other[z] = uncover[z] + fst[z] + missing[z]; otherCover = whole - sndCover; otherEntropy = entropy(other, otherCover); // Weighted average sndInfo = (sndEntp*sndCover + otherEntropy*otherCover)/whole; sndInfoGain = defInfo - sndInfo; } else{ double sum = data.sumOfWeights(); double otherWtSq = (sndWtSq + msingWtSq + uncoverWtSq), otherWtVl = (sndWtVl + msingWtVl + uncoverWtVl), otherCover = (sum - fstCover + uncoverSum); fstInfo = Utils.eq(fstCover, 0) ? 0 : (fstEntp * fstCover); fstInfo += wtMeanSqErr(otherWtSq, otherWtVl, otherCover); fstInfoGain = defInfo - fstInfo; otherWtSq = (fstWtSq + msingWtSq + uncoverWtSq); otherWtVl = (fstWtVl + msingWtVl + uncoverWtVl); otherCover = sum - sndCover + uncoverSum; sndInfo = Utils.eq(sndCover, 0) ? 0 : (sndEntp * sndCover); sndInfo += wtMeanSqErr(otherWtSq, otherWtVl, otherCover); sndInfoGain = defInfo - sndInfo; } if(Utils.gr(fstInfoGain,sndInfoGain) || (Utils.eq(fstInfoGain,sndInfoGain)&&(Utils.sm(fstEntp,sndEntp)))){ isFirst = true; infoGain = fstInfoGain; info = fstInfo; } else{ isFirst = false; infoGain = sndInfoGain; info = sndInfo; } boolean isUpdate = Utils.gr(infoGain, maxInfoGain); /* Check whether so far the max infoGain */ if(isUpdate){ splitPoint = ((data.instance(split).value(att)) + (data.instance(prev).value(att)))/2.0; value = ((isFirst) ? 0 : 1); inform = info; maxInfoGain = infoGain; finalSplit = split; } prev=split; } } /* Split the data */ Instances[] splitData = new Instances[3]; splitData[0] = new Instances(data, 0, finalSplit); splitData[1] = new Instances(data, finalSplit, total-finalSplit); splitData[2] = new Instances(missingData); return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered * by this antecedent */ public boolean isCover(Instance inst){ boolean isCover=false; if(!inst.isMissing(att)){ if(Utils.eq(value, 0)){ if(Utils.smOrEq(inst.value(att), splitPoint)) isCover=true; } else if(Utils.gr(inst.value(att), splitPoint)) isCover=true; } return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { String symbol = Utils.eq(value, 0.0) ? " <= " : " > "; return (att.name() + symbol + Utils.doubleToString(splitPoint, 6)); } } /** * The antecedent with nominal attribute */ class NominalAntd extends Antd{ /* The parameters of infoGain calculated for each attribute value */ private double[][] stats; private double[] coverage; private boolean isIn; /* Constructor for nominal class */ public NominalAntd(Attribute a, double[] unc){ super(a, unc); int bag = att.numValues(); stats = new double[bag][m_NumClasses]; coverage = new double[bag]; isIn = true; } /* Constructor for numeric class */ public NominalAntd(Attribute a, double sq, double vl, double wts){ super(a, sq, vl, wts); int bag = att.numValues(); stats = null; coverage = new double[bag]; isIn = true; } /** * Implements the splitData function. * This procedure is to split the data into bags according * to the nominal attribute value * the data with missing values are stored in the last bag. * The infoGain for each bag is also calculated. * * @param data the data to be split * @param defInfo the default information for data * @return the array of data after split */ public Instances[] splitData(Instances data, double defInfo){ int bag = att.numValues(); Instances[] splitData = new Instances[bag+1]; double[] wSq = new double[bag]; double[] wVl = new double[bag]; double totalWS=0, totalWV=0, msingWS=0, msingWV=0, sum=data.sumOfWeights(); double[] all = new double[m_NumClasses]; double[] missing = new double[m_NumClasses]; for(int w=0; w < m_NumClasses; w++) all[w] = missing[w] = 0; for(int x=0; x<bag; x++){ coverage[x] = wSq[x] = wVl[x] = 0; if(stats != null) for(int y=0; y < m_NumClasses; y++) stats[x][y] = 0; splitData[x] = new Instances(data, data.numInstances()); } splitData[bag] = new Instances(data, data.numInstances()); // Record the statistics of data for(int x=0; x<data.numInstances(); x++){ Instance inst=data.instance(x); if(!inst.isMissing(att)){ int v = (int)inst.value(att); splitData[v].add(inst); coverage[v] += inst.weight(); if(m_ClassAttribute.isNominal()){ // Nominal class stats[v][(int)inst.classValue()] += inst.weight(); all[(int)inst.classValue()] += inst.weight(); } else{ // Numeric class wSq[v] += inst.weight() * inst.classValue() * inst.classValue(); wVl[v] += inst.weight() * inst.classValue(); totalWS += inst.weight() * inst.classValue() * inst.classValue(); totalWV += inst.weight() * inst.classValue(); } } else{ splitData[bag].add(inst); if(m_ClassAttribute.isNominal()){ // Nominal class all[(int)inst.classValue()] += inst.weight(); missing[(int)inst.classValue()] += inst.weight(); } else{ // Numeric class totalWS += inst.weight() * inst.classValue() * inst.classValue(); totalWV += inst.weight() * inst.classValue(); msingWS += inst.weight() * inst.classValue() * inst.classValue(); msingWV += inst.weight() * inst.classValue(); } } } // The total weights of the whole grow data double whole; if(m_ClassAttribute.isNominal()) whole = sum + Utils.sum(uncover); else whole = sum + uncoverSum; // Find the split double minEntrp=Double.MAX_VALUE; maxInfoGain = 0; // Check if >=2 splits have more than the minimal data int count=0; for(int x=0; x<bag; x++) if(Utils.grOrEq(coverage[x], m_MinNo)) ++count; if(count < 2){ // Don't split maxInfoGain = 0; inform = defInfo; value = Double.NaN; return null; } for(int x=0; x<bag; x++){ double t = coverage[x], entrp, infoGain; if(Utils.sm(t, m_MinNo)) continue; if(m_ClassAttribute.isNominal()){ // Nominal class double[] other = new double[m_NumClasses]; for(int y=0; y < m_NumClasses; y++) other[y] = all[y] - stats[x][y] + uncover[y]; double otherCover = whole - t; // Entropies of data covered and uncovered entrp = entropy(stats[x], t); double uncEntp = entropy(other, otherCover); // Weighted average infoGain = defInfo - (entrp*t + uncEntp*otherCover)/whole; } else{ // Numeric class double weight = (whole - t); entrp = wtMeanSqErr(wSq[x], wVl[x], t)/t; infoGain = defInfo - (entrp * t) - wtMeanSqErr((totalWS-wSq[x]+uncoverWtSq), (totalWV-wVl[x]+uncoverWtVl), weight); } // Test the exclusive expression boolean isWithin =true; if(m_IsExclude){ double infoGain2, entrp2; if(m_ClassAttribute.isNominal()){ // Nominal class double[] other2 = new double[m_NumClasses]; double[] notIn = new double[m_NumClasses]; for(int y=0; y < m_NumClasses; y++){ other2[y] = stats[x][y] + missing[y] + uncover[y]; notIn[y] = all[y] - stats[x][y] - missing[y]; } double msSum = Utils.sum(missing); double otherCover2 = t + msSum + Utils.sum(uncover); entrp2 = entropy(notIn, (sum-t-msSum)); double uncEntp2 = entropy(other2, otherCover2); infoGain2 = defInfo - (entrp2*(sum-t-msSum) + uncEntp2*otherCover2)/whole; } else{ // Numeric class double msWts = splitData[bag].sumOfWeights(); double weight2 = t + uncoverSum + msWts; entrp2 = wtMeanSqErr((totalWS-wSq[x]-msingWS), (totalWV-wVl[x]-msingWV),(sum-t-msWts)) /(sum-t-msWts); infoGain2 = defInfo - entrp2 * (sum-t-msWts) - wtMeanSqErr((wSq[x]+uncoverWtSq+msingWS), (wVl[x]+uncoverWtVl+msingWV), weight2); } // Use the exclusive expression? if (Utils.gr(infoGain2, infoGain) || (Utils.eq(infoGain2, infoGain) && Utils.sm(entrp2, entrp))){ infoGain = infoGain2; entrp = entrp2; isWithin =false; } } // Test this split if (Utils.gr(infoGain, maxInfoGain) || (Utils.eq(infoGain, maxInfoGain) && Utils.sm(entrp, minEntrp))){ value = (double)x; maxInfoGain = infoGain; inform = maxInfoGain - defInfo; minEntrp = entrp; isIn = isWithin; } } return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered * by this antecedent */ public boolean isCover(Instance inst){ boolean isCover=false; if(!inst.isMissing(att)){ if(isIn){ if(Utils.eq(inst.value(att), value)) isCover=true; } else if(!Utils.eq(inst.value(att), value)) isCover=true; } return isCover; } /** * Whether the expression is "att = value" or att != value" * for this nominal attribute. True if in the former expression, * otherwise the latter * * @return the boolean value */ public boolean isIn(){ return isIn; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { String symbol = isIn ? " = " : " != "; return (att.name() + symbol + att.value((int)value)); } } /** * Returns an enumeration describing the available options * Valid options are: <p> * * -N number <br> * Set number of folds for REP. One fold is * used as the pruning set. (Default: 3) <p> * * -R <br> * Set if NOT randomize the data before split to growing and * pruning data. If NOT set, the seed of randomization is * specified by the -S option. (Default: randomize) <p> * * -S <br> * Seed of randomization. (Default: 1)<p> * * -E <br> * Set whether consider the exclusive expressions for nominal * attribute split. (Default: false) <p> * * -M number <br> * Set the minimal weights of instances within a split. * (Default: 2) <p> * * -P number <br> * Set the number of antecedents allowed in the rule if pre-pruning * is used. If this value is other than -1, then pre-pruning will be * used, otherwise the rule uses REP. (Default: -1) <p> * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector newVector = new Vector(6); newVector.addElement(new Option("\tSet number of folds for REP\n" + "\tOne fold is used as pruning set.\n" + "\t(default 3)","N", 1, "-N <number of folds>")); newVector.addElement(new Option("\tSet if NOT uses randomization\n" + "\t(default:use randomization)","R", 0, "-R")); newVector.addElement(new Option("\tSet whether consider the exclusive\n" + "\texpressions for nominal attributes\n"+ "\t(default false)","E", 0, "-E")); newVector.addElement(new Option("\tSet the minimal weights of instances\n" + "\twithin a split.\n" + "\t(default 2.0)","M", 1, "-M <min. weights>")); newVector.addElement(new Option("\tSet number of antecedents for pre-pruning\n" + "\tif -1, then REP is used\n" + "\t(default -1)","P", 1, "-P <number of antecedents>")); newVector.addElement(new Option("\tSet the seed of randomization\n" + "\t(default 1)","S", 1, "-S <seed>")); return newVector.elements(); } /** * Parses a given list of options. * * @param options the list of options as an array of strings * @exception Exception if an option is not supported */ public void setOptions(String[] options) throws Exception { String numFoldsString = Utils.getOption('N', options); if (numFoldsString.length() != 0) m_Folds = Integer.parseInt(numFoldsString); else m_Folds = 3; String minNoString = Utils.getOption('M', options); if (minNoString.length() != 0) m_MinNo = Double.parseDouble(minNoString); else m_MinNo = 2.0; String seedString = Utils.getOption('S', options); if (seedString.length() != 0) m_Seed = Integer.parseInt(seedString); else m_Seed = 1; String numAntdsString = Utils.getOption('P', options); if (numAntdsString.length() != 0) m_NumAntds = Integer.parseInt(numAntdsString); else m_NumAntds = -1; m_IsRandomized = (!Utils.getFlag('R', options)); m_IsExclude = Utils.getFlag('E', options); } /** * Gets the current settings of the Classifier. * * @return an array of strings suitable for passing to setOptions */ public String [] getOptions() { String [] options = new String [10]; int current = 0; options[current++] = "-N"; options[current++] = "" + m_Folds; options[current++] = "-M"; options[current++] = "" + m_MinNo; options[current++] = "-P"; options[current++] = "" + m_NumAntds; options[current++] = "-S"; options[current++] = "" + m_Seed; if(!m_IsRandomized) options[current++] = "-R"; if(m_IsExclude) options[current++] = "-E"; while (current < options.length) options[current++] = ""; return options; } /** The access functions for parameters */ public void setFolds(int folds){ m_Folds = folds; } public int getFolds(){ return m_Folds; } public void setSeed(long s){ m_Seed = s; } public long getSeed(){ return m_Seed; } public boolean getRandomized(){ return m_IsRandomized;} public void setRandomized(boolean r){ m_IsRandomized = r;} public boolean getExclusive(){ return m_IsExclude;} public void setExclusive(boolean e){ m_IsExclude = e;} public void setMinNo(double m){ m_MinNo = m; } public double getMinNo(){ return m_MinNo; } public void setNumAntds(int n){ m_NumAntds = n; } public int getNumAntds(){ return m_NumAntds; } /** * Builds a single rule learner with REP dealing with nominal classes or * numeric classes. * For nominal classes, this rule learner predicts a distribution on * the classes. * For numeric classes, this learner predicts a single value. * * @param instances the training data * @exception Exception if classifier can't be built successfully */ public void buildClassifier(Instances instances) throws Exception { if (instances.checkForStringAttributes()) throw new UnsupportedAttributeTypeException("Cannot handle string attributes!"); Instances data = new Instances(instances); if(data.numInstances() == 0) throw new Exception("No training data!"); data.deleteWithMissingClass(); if(data.numInstances() == 0) throw new Exception("Not training data without missing class values."); if(data.numInstances() < m_Folds) throw new Exception("Not enough data for REP."); m_ClassAttribute = data.classAttribute(); if(m_ClassAttribute.isNominal()) m_NumClasses = m_ClassAttribute.numValues(); else m_NumClasses = 1; m_Antds = new FastVector(); m_DefDstr = new double[m_NumClasses]; m_Cnsqt = new double[m_NumClasses]; m_Targets = new FastVector(); m_Random = new Random(m_Seed); if(m_IsRandomized){ // Randomize the data data.randomize(m_Random); } if(m_NumAntds != -1){ grow(data); } else{ // Split data into Grow and Prune data.stratify(m_Folds); Instances growData=data.trainCV(m_Folds, m_Folds-1); Instances pruneData=data.testCV(m_Folds, m_Folds-1); grow(growData); // Build this rule prune(pruneData); // Prune this rule } if(m_ClassAttribute.isNominal()){ Utils.normalize(m_Cnsqt); if(Utils.gr(Utils.sum(m_DefDstr), 0)) Utils.normalize(m_DefDstr); } } /** * Computes class distribution for the given instance. * * @param instance the instance for which distribution is to be computed * @return the class distribution for the given instance */ public double[] distributionForInstance(Instance instance) throws Exception { if(instance == null) throw new Exception("Testing instance is NULL!"); if (isCover(instance)) return m_Cnsqt; else return m_DefDstr; } /** * Whether the instance covered by this rule * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered by this rule */ public boolean isCover(Instance datum){ boolean isCover=true; for(int i=0; i<m_Antds.size(); i++){ Antd antd = (Antd)m_Antds.elementAt(i); if(!antd.isCover(datum)){ isCover = false; break; } } return isCover; } /** * Whether this rule has antecedents, i.e. whether it is a default rule * * @return the boolean value indicating whether the rule has antecedents */ public boolean hasAntds(){ if (m_Antds == null) return false; else return (m_Antds.size() > 0); } /** * Build one rule using the growing data * * @param data the growing data used to build the rule */ private void grow(Instances data){ Instances growData = new Instances(data); double defInfo; double whole = data.sumOfWeights(); if(m_NumAntds != 0){ /* Class distribution for data both covered and not covered by one antecedent */ double[][] classDstr = new double[2][m_NumClasses]; /* Compute the default information of the growing data */ for(int j=0; j < m_NumClasses; j++){ classDstr[0][j] = 0; classDstr[1][j] = 0; } if(m_ClassAttribute.isNominal()){ for(int i=0; i < growData.numInstances(); i++){ Instance datum = growData.instance(i); classDstr[0][(int)datum.classValue()] += datum.weight(); } defInfo = ContingencyTables.entropy(classDstr[0]); } else{ for(int i=0; i < growData.numInstances(); i++){ Instance datum = growData.instance(i); classDstr[0][0] += datum.weight() * datum.classValue(); } // No need to be divided by the denomitor because // it's always the same double defMean = (classDstr[0][0] / whole); defInfo = meanSquaredError(growData, defMean) * growData.sumOfWeights(); } // Store the default class distribution double[][] tmp = new double[2][m_NumClasses]; for(int y=0; y < m_NumClasses; y++){ if(m_ClassAttribute.isNominal()){ tmp[0][y] = classDstr[0][y]; tmp[1][y] = classDstr[1][y]; } else{ tmp[0][y] = classDstr[0][y]/whole; tmp[1][y] = classDstr[1][y]; } } m_Targets.addElement(tmp); /* Keep the record of which attributes have already been used*/ boolean[] used=new boolean[growData.numAttributes()]; for (int k=0; k<used.length; k++) used[k]=false; int numUnused=used.length; double maxInfoGain, uncoveredWtSq=0, uncoveredWtVl=0, uncoveredWts=0; boolean isContinue = true; // The stopping criterion of this rule while (isContinue){ maxInfoGain = 0; // We require that infoGain be positive /* Build a list of antecedents */ Antd oneAntd=null; Instances coverData = null, uncoverData = null; Enumeration enumAttr=growData.enumerateAttributes(); int index=-1; /* Build one condition based on all attributes not used yet*/ while (enumAttr.hasMoreElements()){ Attribute att= (Attribute)(enumAttr.nextElement()); index++; Antd antd =null; if(m_ClassAttribute.isNominal()){ if(att.isNumeric()) antd = new NumericAntd(att, classDstr[1]); else antd = new NominalAntd(att, classDstr[1]); } else if(att.isNumeric()) antd = new NumericAntd(att, uncoveredWtSq, uncoveredWtVl, uncoveredWts); else antd = new NominalAntd(att, uncoveredWtSq, uncoveredWtVl, uncoveredWts); if(!used[index]){ /* Compute the best information gain for each attribute, it's stored in the antecedent formed by this attribute. This procedure returns the data covered by the antecedent*/ Instances[] coveredData = computeInfoGain(growData, defInfo, antd); if(coveredData != null){ double infoGain = antd.getMaxInfoGain(); boolean isUpdate = Utils.gr(infoGain, maxInfoGain); if(isUpdate){ oneAntd=antd; coverData = coveredData[0]; uncoverData = coveredData[1]; maxInfoGain = infoGain; } } } } if(oneAntd == null) break; //Numeric attributes can be used more than once if(!oneAntd.getAttr().isNumeric()){ used[oneAntd.getAttr().index()]=true; numUnused--; } m_Antds.addElement(oneAntd); growData = coverData;// Grow data size is shrinking for(int x=0; x < uncoverData.numInstances(); x++){ Instance datum = uncoverData.instance(x); if(m_ClassAttribute.isNumeric()){ uncoveredWtSq += datum.weight() * datum.classValue() * datum.classValue(); uncoveredWtVl += datum.weight() * datum.classValue(); uncoveredWts += datum.weight(); classDstr[0][0] -= datum.weight() * datum.classValue(); classDstr[1][0] += datum.weight() * datum.classValue(); } else{ classDstr[0][(int)datum.classValue()] -= datum.weight(); classDstr[1][(int)datum.classValue()] += datum.weight(); } } // Store class distribution of growing data tmp = new double[2][m_NumClasses]; for(int y=0; y < m_NumClasses; y++){ if(m_ClassAttribute.isNominal()){ tmp[0][y] = classDstr[0][y]; tmp[1][y] = classDstr[1][y]; } else{ tmp[0][y] = classDstr[0][y]/(whole-uncoveredWts); tmp[1][y] = classDstr[1][y]/uncoveredWts; } } m_Targets.addElement(tmp); defInfo = oneAntd.getInfo(); int numAntdsThreshold = (m_NumAntds == -1) ? Integer.MAX_VALUE : m_NumAntds; if(Utils.eq(growData.sumOfWeights(), 0.0) || (numUnused == 0) || (m_Antds.size() >= numAntdsThreshold)) isContinue = false; } } m_Cnsqt = ((double[][])(m_Targets.lastElement()))[0]; m_DefDstr = ((double[][])(m_Targets.lastElement()))[1]; } /** * Compute the best information gain for the specified antecedent * * @param data the data based on which the infoGain is computed * @param defInfo the default information of data * @param antd the specific antecedent * @return the data covered and not covered by the antecedent */ private Instances[] computeInfoGain(Instances instances, double defInfo, Antd antd){ Instances data = new Instances(instances); /* Split the data into bags. The information gain of each bag is also calculated in this procedure */ Instances[] splitData = antd.splitData(data, defInfo); Instances[] coveredData = new Instances[2]; /* Get the bag of data to be used for next antecedents */ Instances tmp1 = new Instances(data, 0); Instances tmp2 = new Instances(data, 0); if(splitData == null) return null; for(int x=0; x < (splitData.length-1); x++){ if(x == ((int)antd.getAttrValue())) tmp1 = splitData[x]; else{ for(int y=0; y < splitData[x].numInstances(); y++) tmp2.add(splitData[x].instance(y)); } } if(antd.getAttr().isNominal()){ // Nominal attributes if(((NominalAntd)antd).isIn()){ // Inclusive expression coveredData[0] = new Instances(tmp1); coveredData[1] = new Instances(tmp2); } else{ // Exclusive expression coveredData[0] = new Instances(tmp2); coveredData[1] = new Instances(tmp1); } } else{ // Numeric attributes coveredData[0] = new Instances(tmp1); coveredData[1] = new Instances(tmp2); } /* Add data with missing value */ for(int z=0; z<splitData[splitData.length-1].numInstances(); z++) coveredData[1].add(splitData[splitData.length-1].instance(z)); return coveredData; } /** * Prune the rule using the pruning data. * The weighted average of accuracy rate/mean-squared error is * used to prune the rule. * * @param pruneData the pruning data used to prune the rule */ private void prune(Instances pruneData){ Instances data=new Instances(pruneData); Instances otherData = new Instances(data, 0); double total = data.sumOfWeights(); /* The default accurate# and the the accuracy rate on pruning data */ double defAccu; if(m_ClassAttribute.isNumeric()) defAccu = meanSquaredError(pruneData, ((double[][])m_Targets.firstElement())[0][0]); else{ int predict = Utils.maxIndex(((double[][])m_Targets.firstElement())[0]); defAccu = computeAccu(pruneData, predict)/total; } int size=m_Antds.size(); if(size == 0){ m_Cnsqt = ((double[][])m_Targets.lastElement())[0]; m_DefDstr = ((double[][])m_Targets.lastElement())[1]; return; // Default rule before pruning } double[] worthValue = new double[size]; /* Calculate accuracy parameters for all the antecedents in this rule */ for(int x=0; x<size; x++){ Antd antd=(Antd)m_Antds.elementAt(x); Attribute attr= antd.getAttr(); Instances newData = new Instances(data); if(Utils.eq(newData.sumOfWeights(),0.0)) break; data = new Instances(newData, newData.numInstances()); // Make data empty for(int y=0; y<newData.numInstances(); y++){ Instance ins=newData.instance(y); if(antd.isCover(ins)) // Covered by this antecedent data.add(ins); // Add to data for further else otherData.add(ins); // Not covered by this antecedent } double covered, other; double[][] classes = (double[][])m_Targets.elementAt(x+1); // m_Targets has one more element if(m_ClassAttribute.isNominal()){ int coverClass = Utils.maxIndex(classes[0]), otherClass = Utils.maxIndex(classes[1]); covered = computeAccu(data, coverClass); other = computeAccu(otherData, otherClass); } else{ double coverClass = classes[0][0], otherClass = classes[1][0]; covered = (data.sumOfWeights())*meanSquaredError(data, coverClass); other = (otherData.sumOfWeights())*meanSquaredError(otherData, otherClass); } worthValue[x] = (covered + other)/total; } /* Prune the antecedents according to the accuracy parameters */ for(int z=(size-1); z > 0; z--){ // Treatment to avoid precision problems double valueDelta; if(m_ClassAttribute.isNominal()){ if(Utils.sm(worthValue[z], 1.0)) valueDelta = (worthValue[z] - worthValue[z-1]) / worthValue[z]; else valueDelta = worthValue[z] - worthValue[z-1]; } else{ if(Utils.sm(worthValue[z], 1.0)) valueDelta = (worthValue[z-1] - worthValue[z]) / worthValue[z]; else valueDelta = (worthValue[z-1] - worthValue[z]); } if(Utils.smOrEq(valueDelta, 0.0)){ m_Antds.removeElementAt(z); m_Targets.removeElementAt(z+1); } else break; } // Check whether this rule is a default rule if(m_Antds.size() == 1){ double valueDelta; if(m_ClassAttribute.isNominal()){ if(Utils.sm(worthValue[0], 1.0)) valueDelta = (worthValue[0] - defAccu) / worthValue[0]; else valueDelta = (worthValue[0] - defAccu); } else{ if(Utils.sm(worthValue[0], 1.0)) valueDelta = (defAccu - worthValue[0]) / worthValue[0]; else valueDelta = (defAccu - worthValue[0]); } if(Utils.smOrEq(valueDelta, 0.0)){ m_Antds.removeAllElements(); m_Targets.removeElementAt(1); } } m_Cnsqt = ((double[][])(m_Targets.lastElement()))[0]; m_DefDstr = ((double[][])(m_Targets.lastElement()))[1]; } /** * Private function to compute number of accurate instances * based on the specified predicted class * * @param data the data in question * @param clas the predicted class * @return the default accuracy number */ private double computeAccu(Instances data, int clas){ double accu = 0; for(int i=0; i<data.numInstances(); i++){ Instance inst = data.instance(i); if((int)inst.classValue() == clas) accu += inst.weight(); } return accu; } /** * Private function to compute the squared error of * the specified data and the specified mean * * @param data the data in question * @param mean the specified mean * @return the default mean-squared error */ private double meanSquaredError(Instances data, double mean){ if(Utils.eq(data.sumOfWeights(),0.0)) return 0; double mSqErr=0, sum = data.sumOfWeights(); for(int i=0; i < data.numInstances(); i++){ Instance datum = data.instance(i); mSqErr += datum.weight()* (datum.classValue() - mean)* (datum.classValue() - mean); } return (mSqErr / sum); } /** * Prints this rule with the specified class label * * @param att the string standing for attribute in the consequent of this rule * @param cl the string standing for value in the consequent of this rule * @return a textual description of this rule with the specified class label */ public String toString(String att, String cl) { StringBuffer text = new StringBuffer(); if(m_Antds.size() > 0){ for(int j=0; j< (m_Antds.size()-1); j++) text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()+ ") and "); text.append("("+((Antd)(m_Antds.lastElement())).toString() + ")"); } text.append(" => " + att + " = " + cl); return text.toString(); } /** * Prints this rule * * @return a textual description of this rule */ public String toString() { String title = "\n\nSingle conjunctive rule learner:\n"+ "--------------------------------\n", body = null; StringBuffer text = new StringBuffer(); if(m_ClassAttribute != null){ if(m_ClassAttribute.isNominal()){ body = toString(m_ClassAttribute.name(), m_ClassAttribute.value(Utils.maxIndex(m_Cnsqt))); text.append("\n\nClass distributions:\nCovered by the rule:\n"); for(int k=0; k < m_Cnsqt.length; k++) text.append(m_ClassAttribute.value(k)+ "\t"); text.append('\n'); for(int l=0; l < m_Cnsqt.length; l++) text.append(Utils.doubleToString(m_Cnsqt[l], 6)+"\t"); text.append("\n\nNot covered by the rule:\n"); for(int k=0; k < m_DefDstr.length; k++) text.append(m_ClassAttribute.value(k)+ "\t"); text.append('\n'); for(int l=0; l < m_DefDstr.length; l++) text.append(Utils.doubleToString(m_DefDstr[l], 6)+"\t"); } else body = toString(m_ClassAttribute.name(), Utils.doubleToString(m_Cnsqt[0], 6)); } return (title + body + text.toString()); } /** * Main method. * * @param args the options for the classifier */ public static void main(String[] args) { try { System.out.println(Evaluation.evaluateModel(new ConjunctiveRule(), args)); } catch (Exception e) { e.printStackTrace(); System.err.println(e.getMessage()); } } }