/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ /** * <p> * @author Written by Alberto Fern�ndez (University of Granada) 01/07/2008 * @author Modified by Xavi Sol� (La Salle, Ram�n Llull University - Barcelona) 03/12/2008 * @version 1.1 * @since JDK1.2 * </p> */ package keel.Algorithms.Rule_Learning.Ripper; import java.util.Vector; public class Ruleset { /** * Representation of a disjuction of rules with a common consecuent. * It may be represented as: <b>if (rule1 || rule2) then output=consecuent<\b> */ private Vector rules; //set of rules private String type; //class (consecuent) /** * * Constructs an empty ruleset. */ public Ruleset() { rules=new Vector(); } /** * Adds a new rule to the ruleset. * @param r Rule the new rule */ public void addRule(Rule r){ rules.add(r); } /** * It returns the number of true positives,true negatives,false positives and false negatives of the whole ruleset in a given dataset. * This methods takes into account the right part (consecuent) of the rules * @param data MyDataset the dataset * @return number of true positives, false positives, true negatives and false negatives of the whole ruleset in the following order: {tp,tn,fp,fn} */ public Stats apply(MyDataset data){ //int tp,tn,fp,fn; Stats stats=new Stats(); //It splits the positive and negative instances according to the consecuent Mask positives=new Mask(data.size()); data.filterByClass(positives,type); Mask negatives=positives.complement(); int npositives=positives.getnActive(); int nnegatives=negatives.getnActive(); for (int i=0;i<rules.size();i++){ //it extracts the instances covered by the i-th rule of the ruleset data.substract(positives,(Rule) rules.elementAt(i)); data.substract(negatives,(Rule) rules.elementAt(i)); } stats.fn=positives.getnActive(); //what remains are false positives stats.tp=npositives-stats.fn; //true positives stats.tn=negatives.getnActive(); //true negatives stats.fp=nnegatives-stats.tn; //false negatives return stats; } /** * It returns the number of true positives,true negatives,false positives and false negatives of the whole ruleset in a given dataset. * (This methods doesn't take into account the right part (consecuent) of the rules). * @param data MyDataset the dataset * @param positives active positive instances of data * @param negatives active negative instances of data * @return number of true positives, false positives, true negatives and false negatives of the whole ruleset in the following order: {tp,tn,fp,fn} */ public Stats apply(MyDataset data,Mask positives,Mask negatives){ Stats stats=new Stats(); int npositives=positives.getnActive(); int nnegatives=negatives.getnActive(); Mask p=positives.copy(); Mask n=negatives.copy(); for (int i=0;i<rules.size();i++){ //it extracts the instances covered by the i-th rule of the ruleset data.substract(p,(Rule) rules.elementAt(i)); data.substract(n,(Rule) rules.elementAt(i)); } stats.fn=p.getnActive(); //what remains are false positives stats.tp=npositives-stats.fn; //true positives stats.tn=n.getnActive(); //true negatives stats.fp=nnegatives-stats.tn; //false negatives return stats; } /** * Returns the exception cost for the Minimum Data Length of a dataset given a theory (this ruleset). See [Quinlan95] * @param data MyDataset the datasets * @param positives Mask active positive entries of data * @param negatives Mask active negative entries of data * @return the MDL of data given this ruleset. */ public double getExceptionCost(MyDataset data,Mask positives,Mask negatives){ if (rules.size()==0) return 0.0;// Double.MAX_VALUE; Stats quartet=apply(data,positives,negatives); double tp=quartet.tp,tn=quartet.tn,fp=quartet.fp,fn=quartet.fn; double tp_prob,tn_prob,fp_prob,fn_prob; double U=tn+fn,C=tp+fp; //uncovered & covered cases double D=U+C,e=fn+fp; double mdl=0.0;//Double.MAX_VALUE; double uncoverBits,coverBits; if (C > U){ /*mdl = Math.log(D + 1) + fp * ( -Math.log(e / (2 * C))) + (C - fp) * ( -Math.log(1 - (e / (2 * C)))) + fn * ( -Math.log(fn / U)) + (U - fn) * ( -Math.log(1 - fn / U)); */ coverBits=e/(2*C); uncoverBits=(U>0.0)?fn/U:0.0; } else{ /* mdl = Math.log(D + 1) + fn * ( -Math.log(e / (2 * U))) + (U - fn) * ( -Math.log(1 - e / (2 * U))) + fp * ( -Math.log(fp / C)) + (C - fp) * ( -Math.log(1 - (fp / C))); */ coverBits=(C>0.0)?fp/C:0.0; uncoverBits=e/(2*U); } tp_prob=(coverBits==0)?0.0:tp*(-Utilities.log2(1-coverBits)); fp_prob=(coverBits==0)?0.0:fp*(-Utilities.log2(coverBits)); tn_prob=(uncoverBits==0)?0.0:tn*(-Utilities.log2(1-uncoverBits)); fn_prob=(uncoverBits==0)?0.0:fn*(-Utilities.log2(uncoverBits)); mdl = Utilities.log2(D + 1) + tp_prob + tn_prob + fp_prob + fn_prob; return mdl; } /** * Returns the Minimum Data Length of a dataset given a theory (this ruleset). See [Quinlan95] * @param data MyDataset the datasets * @param positives Mask active positive entries of data * @param negatives Mask active negative entries of data * @return the MDL of data given this ruleset. */ public double getMDL(MyDataset data,Mask positives,Mask negatives){ return getTheoryCost(data)+getExceptionCost(data,positives,negatives); } /** * Returns the exception cost for the Minimum Data Length of a dataset given a theory (this ruleset). See [Quinlan95] * @param data MyDataset the datasets * @return the MDL of data given this ruleset. */ public double getExceptionCost(MyDataset data){ Mask positives=new Mask(data.size()); data.filterByClass(positives,this.type); Mask negatives=positives.complement(); return getExceptionCost(data,positives,negatives); } /** * Returns the Minimum Data Length of a dataset given a theory (this ruleset). See [Quinlan95] * @param data MyDataset the datasets * @return the MDL of data given this ruleset. */ public double getMDL(MyDataset data){ return getTheoryCost(data)+getExceptionCost(data); } /** * Returns the exception cost for the Minimum Data Length of a dataset given a theory (this ruleset). See [Quinlan95] * @param data MyDataset the datasets * @param positives Mask active positive entries of data * @param negatives Mask active negative entries of data * @param rulesetMask the combine mask of all rules in the ruleset. * @return the MDL of data given this ruleset. */ public double getExceptionCost(MyDataset data,Mask positives,Mask negatives,IncrementalMask rulesetMask){ int tp=rulesetMask.and(positives).getnActive(); //true positives int fp=rulesetMask.and(negatives).getnActive(); //false positives int fn=positives.getnActive()-tp; //false negatives int tn=negatives.getnActive()-fp; //true negatives double mdl_ruleset=Rule.getExceptionCost(data,tp,tn,fp,fn); return mdl_ruleset; } /** * Returns the Minimum Data Length of a dataset given a theory (this ruleset). See [Quinlan95] * @param data MyDataset the datasets * @param positives Mask active positive entries of data * @param negatives Mask active negative entries of data * @param rulesetMask the combine mask of all rules in the ruleset. * @return the MDL of data given this ruleset. */ public double getMDL(MyDataset data,Mask positives,Mask negatives,IncrementalMask rulesetMask){ return getTheoryCost(data)+getExceptionCost(data,positives,negatives,rulesetMask); } /** * The description length of the theory for the ruleset. * Computed as the addition of the theory cost for each rule:<br> * 0.5* [||k||+ S(t, k, k/t)]<br> * where k is the number of antecedents of the rule; t is the total * possible antecedents that could appear in a rule; ||K|| is the * universal prior for k , log2*(k) and S(t,k,p) = -k*log2(p)-(n-k)log2(1-p) * is the subset encoding length.<p> * @param data MyDataset the dataset * @return the description length of the theory for the ruleset */ public double getTheoryCost(MyDataset data){ double total=0.0; for (int i=0;i<size();i++) total+=getRule(i).theoryDL(data); return total; } /** * Returns the rule in the i-th position of the ruleset. * @param pos int position of the rule in the ruleset * @return the rule in the pos-th position of the ruleset. */ public Rule getRule(int pos){ return (Rule) rules.elementAt(pos); } /** * Returns the common output (consecuent) of the rules in the ruleset. * @return the common output (consecuent) of the rules in the ruleset. */ public String getType(){ return type; } /** * Returns the combine mask of all the rules in the set. * @param data the dataset * @return the combine mask of all the rules in the set. */ public IncrementalMask getRulesetMask(MyDataset data){ IncrementalMask rulesetMask=new IncrementalMask(data.size()); for(int i=0;i<rules.size();i++){ Mask ruleMask=new Mask(data.size()); data.filter(ruleMask,getRule(i)); rulesetMask.plus(ruleMask); } return rulesetMask; } /** * Inserts a new rule in a given position of the ruleset. * @param r Rule the new rule * @param pos int the position where r must be inserted */ public void insertRule(Rule r,int pos){ rules.insertElementAt(r,pos); } /** * Deletes a given rule of the ruleset. * @param pos int position of the rule in the ruleset. */ public void removeRule(int pos){ rules.remove(pos); } /** * Sets the common output (consecuent) of the rules in the ruleset. * @param type String the common output (consecuent) of the rules in the ruleset. */ public void setType(String type){ this.type=type; } /** * Removes the duplicated rules */ public void removeDuplicates(){ for (int i=0;i<rules.size();i++){ Rule current = (Rule) rules.elementAt(i); if (current.size()!=0){ for (int j = i + 1; j < rules.size(); j++) { if (current.isEqual( (Rule) rules.elementAt(j))) { rules.remove(j); j--; } } } else{ rules.remove(i); i--; } } } /** * Remove the rules that increase the DL value of the set. * @param data the dataset * @param positives the positives exemples * @param negatives the negatives exemples */ public void pulish(MyDataset data,Mask positives,Mask negatives){ IncrementalMask rulesetMask=new IncrementalMask(data.size()); Mask[] ruleMask=new Mask[rules.size()]; for(int i=0;i<rules.size();i++){ ruleMask[i]=new Mask(data.size()); data.filter(ruleMask[i],getRule(i)); rulesetMask.plus(ruleMask[i]); } double thCost=getTheoryCost(data); //theory cost int tp=rulesetMask.and(positives).getnActive(); //true positives int fp=rulesetMask.and(negatives).getnActive(); //false positives int fn=positives.getnActive()-tp; //false negatives int tn=negatives.getnActive()-fp; //true negatives double mdl_ruleset=thCost+Rule.getExceptionCost(data,tp,tn,fp,fn); for(int i=0;i<rules.size();i++){ rulesetMask.minus(ruleMask[i]); thCost-=getRule(i).theoryDL(data); tp=rulesetMask.and(positives).getnActive(); //true positives fp=rulesetMask.and(negatives).getnActive(); //false positives fn=positives.getnActive()-tp; //false negatives tn=negatives.getnActive()-fp; //true negatives double mdl_whithout_i=thCost+Rule.getExceptionCost(data,tp,tn,fp,fn); if (mdl_whithout_i<mdl_ruleset){ rules.remove(i); i--; mdl_ruleset=mdl_whithout_i; } else{ rulesetMask.plus(ruleMask[i]); thCost+=getRule(i).theoryDL(data); } } } /** * Returns the size (number of rules) of the ruleset. * @return the size (number of rules) of the ruleset. */ public int size(){return rules.size();} /** * Returns a string representation of this Ruleset, containing the String representation of each Rule. * @return a string representation of this Ruleset, containing the String representation of each Rule. */ public String toString(){ String output=""; for (int i=0;i<rules.size();i++) output+=((Rule) rules.elementAt(i)).toString()+" -> "+type+"\n"; return output; } }