FAT.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    FAT.java
 *    Copyright (C) 1999 Eibe Frank
 *    Modified by Prem Melville
 *
 */

package weka.classifiers.trees.j48;

import java.util.*;
import weka.core.*;
import weka.classifiers.*;

/**
 * Feature Aquisition Trees based on C4.5
 *
 * Class for generating an unpruned or a pruned C4.5 decision tree.
 * For more information, see<p>
 *
 * Ross Quinlan (1993). <i>C4.5: Programs for Machine Learning</i>, 
 * Morgan Kaufmann Publishers, San Mateo, CA. </p>
 *
 * Valid options are: <p>
 *
 * -U <br>
 * Use unpruned tree.<p>
 *
 * -C confidence <br>
 * Set confidence threshold for pruning. (Default: 0.25) <p>
 *
 * -M number <br>
 * Set minimum number of instances per leaf. (Default: 2) <p>
 *
 * -R <br>
 * Use reduced error pruning. No subtree raising is performed. <p>
 *
 * -N number <br>
 * Set number of folds for reduced error pruning. One fold is
 * used as the pruning set. (Default: 3) <p>
 *
 * -B <br>
 * Use binary splits for nominal attributes. <p>
 *
 * -S <br>
 * Don't perform subtree raising. <p>
 *
 * -L <br>
 * Do not clean up after the tree has been built. <p>
 *
 * -A <br>
 * If set, Laplace smoothing is used for predicted probabilites. <p>
 *
 *
 * @author Prem Melville (melville@cs.utexas.edu)
 * @version $Revision: 1.7 $
 */
public class FAT extends EnsembleClassifier implements OptionHandler, 
  Drawable, Matchable, Sourcable, WeightedInstancesHandler, Summarizable,
  AdditionalMeasureProducer, ActiveFeatureAcquirer {

    //=============== BEGIN EDIT melville ===============
    
    /** To use weighted sampling instead of deterministic selection of instance */ 
    protected boolean m_UseSampling = false;
    
    /** Random number generator - for sampling */
    protected Random m_Random = new Random(0);
    
    protected boolean m_Debug = true;

    /** Enumeration of selective sampling schemes */
    static final int LABEL_MARGIN = 0,
	RANDOM = 1,
	ABS_LABEL_MARGIN = 2,
	UNLABELED_MARGIN = 3,
	ENTROPY = 4,
	MAX_CERTAINTY = 5,
	HYBRID = 6,
	RANDOM_HYBRID = 7;
    
    /* LABEL_MARGIN = INCORRECT_CERTAINTY
       UNLABELED_MARGIN = UNCERTAINITY
       HYBRID = ASYMMETRIC CERTAINTY
       RANDOM_HYBRID = ERROR SAMPLING
    */
    
    /** The selective sampling scheme to use. */
    protected int m_SelectionScheme = LABEL_MARGIN;
    //=============== END EDIT melville ===============	
    
  // To maintain the same version number after adding m_ClassAttribute
  static final long serialVersionUID = -217733168393644444L;

  /** The decision tree */
  private ClassifierTree m_root;
  
  /** Unpruned tree? */
  private boolean m_unpruned = false;

  /** Confidence level */
  private float m_CF = 0.25f;

  /** Minimum number of instances */
  private int m_minNumObj = 2;

  /** Determines whether probabilities are smoothed using
      Laplace correction when predictions are generated */
  private boolean m_useLaplace = false;

  /** Use reduced error pruning? */
  private boolean m_reducedErrorPruning = false;

  /** Number of folds for reduced error pruning. */
  private int m_numFolds = 3;

  /** Binary splits on nominal attributes? */
  private boolean m_binarySplits = false;

  /** Subtree raising to be performed? */
  private boolean m_subtreeRaising = true;

  /** Cleanup after the tree has been built. */
  boolean m_noCleanup = false;
  
    
    /** 
     * Given a set of unlabeled examples, select a specified number of examples to be labeled.
     * @param activePool pool of unlabeled examples
     * @param num number of examples to selcted for labeling
     * @exception Exception if selective sampling fails
     */
    public int [] selectInstancesForFeatures(Instances activePool,int num) throws Exception{
	//Make a list of pairs of indices and the corresponding measure of informativenes of examples
	//Sort this in the order of informativeness and return the list of num indices
	int poolSize = activePool.numInstances();
	Pair []pairs = new Pair[poolSize];
	for(int i=0; i<poolSize; i++){
	    pairs[i] = new Pair(i,calculateScore(activePool.instance(i)));
	}
	
	//sort in AScending order
	Arrays.sort(pairs, new Comparator() {
                public int compare(Object o1, Object o2) {
		    double diff = ((Pair)o2).second - ((Pair)o1).second; 
		    return(diff < 0 ? 1 : diff > 0 ? -1 : 0);
		}
            });

	int []selected = new int[num];
	if(m_UseSampling){
	    if(pairs[0].first < 0)
		throw new Exception("Sampling with negative scores has not been defined.");
	    
	    //If sampling is being used then convert scores to
	    //probabilities (inversely proportional)
	    Vector v = new Vector(poolSize);
	    double sum = 0;
	    for(int i=0; i<poolSize; i++){
		if(pairs[i].second == 0){
		    pairs[i].second = Double.MAX_VALUE/poolSize;
		    //Account for probability values of 0 - to avoid divide-by-zero errors
		    //Divide by probs.length to make sure normalizing works properly
		}else{
		    pairs[i].second = 1.0 / pairs[i].second;
		}
		//Convert pairs from array to vector
		v.add(pairs[i]);
	    }

	    /* For j=1 to n
	     *     Create a cdf
	     *     Randomly pick instance based on cdf
	     *     Note index and remove element 
	     */
	    for(int j=0; j<num; j++){
		sum = 0;
		for(int i=0; i<v.size(); i++)
		    sum += ((Pair) v.get(i)).second;
		
		//normalize
		if (Double.isNaN(sum)) {
		    throw new IllegalArgumentException("Can't normalize array. Sum is NaN.");
		}
		if (sum == 0) {
		    throw new IllegalArgumentException("Can't normalize array. Sum is zero.");
		}
		for(int i=0; i<v.size(); i++)
		    ((Pair) v.get(i)).second /= sum;
		
		//create a cdf
		double []cdf = new double[v.size()];
		cdf[0] = ((Pair) v.get(0)).second;
		for(int i=1; i<v.size(); i++)
		    cdf[i] = ((Pair) v.get(i)).second + cdf[i-1];
		
		double rnd = m_Random.nextDouble();
		int index = 0;
		while(index < cdf.length && rnd > cdf[index]){
		    index++;
		}
		selected[j] = (int) (((Pair) v.get(index)).first);
		v.remove(index);
	    }
	    assert v.size()+num==poolSize : v.size()+" + "+num+" != "+poolSize+"\n";
	    if(m_Debug) {
		System.out.println("Sampled list:");
		for(int j=0; j<num; j++){
		    System.out.println("\t"+selected[j]);
		}
	    }
	    
	}else{//take the first num
	    if(m_Debug) System.out.println("Sorted list:");
	    for(int j=0; j<num; j++){
		if(m_Debug) System.out.println("\t"+pairs[j].second+"\t"+pairs[j].first);
		selected[j] = (int) pairs[j].first;
	    }
	}
	return selected;
    }

    /**
     * Calculate the feature acquisition score for 
     * given examples depending on the chosen selection scheme.
     * @param instance local instances from the current pool
     * @return score
     * @exception Exception if score could not be calculated properly */
    protected double calculateScore(Instance instance) throws Exception{
	double score;
	
	switch(m_SelectionScheme){
	case RANDOM_HYBRID:
	    score = calculateRandomHybridScore(instance);
	    break;
	case HYBRID:
	    score = calculateHybridScore(instance);
	    break;
	case ENTROPY:
	    score = -1.0*calculateEntropy(instance);
	    break;
	case UNLABELED_MARGIN:
	    score = calculateMargin(instance);
	    break;
	case MAX_CERTAINTY:
	    score = -1.0*calculateMargin(instance);
	    break;
	case ABS_LABEL_MARGIN:
	    score = Math.abs(calculateLabeledInstanceMargin(instance));
	    break;
	case LABEL_MARGIN:
	    score = calculateLabeledInstanceMargin(instance);
	    break;
	case RANDOM:
	    score = 0;//all instances are equal
	    break;
	default:
	    score = 0;//all instances are equal
	    break; 
	}
	return score;
    }   
 

  /**
   * Generates the classifier.
   *
   * @exception Exception if classifier can't be built successfully
   */
  public void buildClassifier(Instances instances) 
       throws Exception {

    ModelSelection modSelection;	 

    if (m_binarySplits)
      modSelection = new BinC45ModelSelection(m_minNumObj, instances);
    else
      modSelection = new C45ModelSelection(m_minNumObj, instances);
    if (!m_reducedErrorPruning)
      m_root = new C45PruneableClassifierTree(modSelection, !m_unpruned, m_CF,
					    m_subtreeRaising, !m_noCleanup);
    else
      m_root = new PruneableClassifierTree(modSelection, !m_unpruned, m_numFolds,
					   !m_noCleanup);
    m_root.buildClassifier(instances);
    if (m_binarySplits) {
      ((BinC45ModelSelection)modSelection).cleanup();
    } else {
      ((C45ModelSelection)modSelection).cleanup();
    }
  }

    //=============== BEGIN EDIT melville ===============
    /** Returns class predictions of each ensemble member */
    public double []getEnsemblePredictions(Instance instance) throws Exception{
	double preds[] = new double[1];
	preds[0] = classifyInstance(instance);
	
	return preds;
    }

    /** 
     * Returns vote weights of ensemble members.
     *
     * @return vote weights of ensemble members
     */
    public double []getEnsembleWts(){
	double []ensembleWts = {1.0};
	return ensembleWts;
    }
    
    /** Returns size of ensemble */
    public double getEnsembleSize(){
	return 1.0;
    }
    //=============== END EDIT melville ===============


  /**
   * Classifies an instance.
   *
   * @exception Exception if instance can't be classified successfully
   */
  public double classifyInstance(Instance instance) throws Exception {

    return m_root.classifyInstance(instance);
  }

  /** 
   * Returns class probabilities for an instance.
   *
   * @exception Exception if distribution can't be computed successfully
   */
  public final double [] distributionForInstance(Instance instance) 
       throws Exception {

    return m_root.distributionForInstance(instance, m_useLaplace);
  }

    /**
     *  Returns the type of graph this classifier
     *  represents.
     *  @return Drawable.TREE
     */   
    //  public int graphType() {
    //  	return Drawable.TREE;
    //    }
    
  /**
   * Returns graph describing the tree.
   *
   * @exception Exception if graph can't be computed
   */
  public String graph() throws Exception {

    return m_root.graph();
  }

  /**
   * Returns tree in prefix order.
   *
   * @exception Exception if something goes wrong
   */
  public String prefix() throws Exception {
    
    return m_root.prefix();
  }


  /**
   * Returns tree as an if-then statement.
   *
   * @return the tree as a Java if-then type statement
   * @exception Exception if something goes wrong
   */
  public String toSource(String className) throws Exception {

    StringBuffer [] source = m_root.toSource(className);
    return 
    "class " + className + " {\n\n"
    +"  public static double classify(Object [] i)\n"
    +"    throws Exception {\n\n"
    +"    double p = Double.NaN;\n"
    + source[0]  // Assignment code
    +"    return p;\n"
    +"  }\n"
    + source[1]  // Support code
    +"}\n";
  }

  /**
   * Returns an enumeration describing the available options.
   *
   * Valid options are: <p>
   *
   * -U <br>
   * Use unpruned tree.<p>
   *
   * -C confidence <br>
   * Set confidence threshold for pruning. (Default: 0.25) <p>
   *
   * -M number <br>
   * Set minimum number of instances per leaf. (Default: 2) <p>
   *
   * -R <br>
   * Use reduced error pruning. No subtree raising is performed. <p>
   *
   * -N number <br>
   * Set number of folds for reduced error pruning. One fold is
   * used as the pruning set. (Default: 3) <p>
   *
   * -B <br>
   * Use binary splits for nominal attributes. <p>
   *
   * -S <br>
   * Don't perform subtree raising. <p>
   *
   * -L <br>
   * Do not clean up after the tree has been built.
   *
   * -A <br>
   * If set, Laplace smoothing is used for predicted probabilites. <p>
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {

    Vector newVector = new Vector(9);

    newVector.
	addElement(new Option("\tUse unpruned tree.",
			      "U", 0, "-U"));
    newVector.
	addElement(new Option("\tSet confidence threshold for pruning.\n" +
			      "\t(default 0.25)",
			      "C", 1, "-C <pruning confidence>"));
    newVector.
	addElement(new Option("\tSet minimum number of instances per leaf.\n" +
			      "\t(default 2)",
			      "M", 1, "-M <minimum number of instances>"));
    newVector.
	addElement(new Option("\tUse reduced error pruning.",
			      "R", 0, "-R"));
    newVector.
	addElement(new Option("\tSet number of folds for reduced error\n" +
			      "\tpruning. One fold is used as pruning set.\n" +
			      "\t(default 3)",
			      "N", 1, "-N <number of folds>"));
    newVector.
	addElement(new Option("\tUse binary splits only.",
			      "B", 0, "-B"));
    newVector.
        addElement(new Option("\tDon't perform subtree raising.",
			      "S", 0, "-S"));
    newVector.
        addElement(new Option("\tDo not clean up after the tree has been built.",
			      "L", 0, "-L"));
    newVector.
        addElement(new Option("\tLaplace smoothing for predicted probabilities.",
			      "A", 0, "-A"));
    newVector.
	addElement(new Option("\tUse sampling for selection of instances.",
			      "P", 0, "-P"));
    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    
    // Other options
    String minNumString = Utils.getOption('M', options);
    if (minNumString.length() != 0) {
      m_minNumObj = Integer.parseInt(minNumString);
    } else {
      m_minNumObj = 2;
    }
    m_binarySplits = Utils.getFlag('B', options);
    m_useLaplace = Utils.getFlag('A', options);

    // Pruning options
    m_unpruned = Utils.getFlag('U', options);
    m_subtreeRaising = !Utils.getFlag('S', options);
    m_noCleanup = Utils.getFlag('L', options);
    if ((m_unpruned) && (!m_subtreeRaising)) {
      throw new Exception("Subtree raising doesn't need to be unset for unpruned tree!");
    }
    m_reducedErrorPruning = Utils.getFlag('R', options);
    if ((m_unpruned) && (m_reducedErrorPruning)) {
      throw new Exception("Unpruned tree and reduced error pruning can't be selected " +
			  "simultaneously!");
    }
    String confidenceString = Utils.getOption('C', options);
    if (confidenceString.length() != 0) {
      if (m_reducedErrorPruning) {
	throw new Exception("Setting the confidence doesn't make sense " +
			    "for reduced error pruning.");
      } else if (m_unpruned) {
	throw new Exception("Doesn't make sense to change confidence for unpruned "
			    +"tree!");
      } else {
	m_CF = (new Float(confidenceString)).floatValue();
	if ((m_CF <= 0) || (m_CF >= 1)) {
	  throw new Exception("Confidence has to be greater than zero and smaller " +
			      "than one!");
	}
      }
    } else {
      m_CF = 0.25f;
    }
    String numFoldsString = Utils.getOption('N', options);
    if (numFoldsString.length() != 0) {
      if (!m_reducedErrorPruning) {
	throw new Exception("Setting the number of folds" +
			    " doesn't make sense if" +
			    " reduced error pruning is not selected.");
      } else {
	m_numFolds = Integer.parseInt(numFoldsString);
      }
    } else {
      m_numFolds = 3;
    }
    
    //=============== BEGIN EDIT melville ===============
    String selectionScheme = Utils.getOption('E', options);
    if (selectionScheme.length() != 0) {
	setSelectionScheme(Integer.parseInt(selectionScheme));
    } else {
	setSelectionScheme(0);
    }
    
    m_UseSampling = Utils.getFlag('P', options);
    //=============== END EDIT melville ===============
  }
    
    //=============== BEGIN EDIT melville ===============
        
    /**
     * Get the value of useSampling.
     * @return value of useSampling.
     */
    public boolean getUseSampling() {
	return m_UseSampling;
    }
    
    /**
     * Set the value of useSampling.
     * @param v  Value to assign to useSampling.
     */
    public void setUseSampling(boolean  v) {
	m_UseSampling = v;
    }
    
    /**
     * Get the value of m_SelectionScheme.
     * @return value of m_SelectionScheme.
     */
    public int getSelectionScheme() {
	return m_SelectionScheme;
    }
    
    /**
     * Set the value of m_SelectionScheme.
     * @param v  Value to assign to m_SelectionScheme.
     */
    public void setSelectionScheme(int  v) {
	m_SelectionScheme = v;
    }
    //=============== END EDIT melville ===============
    
  /**
   * Gets the current settings of the Classifier.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String [] getOptions() {
      //=============== BEGIN EDIT melville ===============
    String [] options = new String [12];
    //=============== END EDIT melville ===============
    int current = 0;

    if (m_noCleanup) {
      options[current++] = "-L";
    }
    if (m_unpruned) {
      options[current++] = "-U";
    } else {
      if (!m_subtreeRaising) {
	options[current++] = "-S";
      }
      if (m_reducedErrorPruning) {
	options[current++] = "-R";
	options[current++] = "-N"; options[current++] = "" + m_numFolds;
      } else {
	options[current++] = "-C"; options[current++] = "" + m_CF;
      }
    }
    if (m_binarySplits) {
      options[current++] = "-B";
    }
    options[current++] = "-M"; options[current++] = "" + m_minNumObj;
    if (m_useLaplace) {
      options[current++] = "-A";
    }
    
    //=============== BEGIN EDIT melville ===============
    options[current++] = "-E"; options[current++] = "" + getSelectionScheme();
    if (m_UseSampling) {
	options[current++] = "-P";
    }
    //=============== END EDIT melville ===============
    
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }
  
  /**
   * Get the value of useLaplace.
   *
   * @return Value of useLaplace.
   */
  public boolean getUseLaplace() {
    
    return m_useLaplace;
  }
  
  /**
   * Set the value of useLaplace.
   *
   * @param newuseLaplace Value to assign to useLaplace.
   */
  public void setUseLaplace(boolean newuseLaplace) {
    
    m_useLaplace = newuseLaplace;
  }
  
  /**
   * Returns a description of the classifier.
   */
  public String toString() {

    if (m_root == null) {
      return "No classifier built";
    }
    if (m_unpruned)
      return "J48 unpruned tree\n------------------\n" + m_root.toString();
    else
      return "J48 pruned tree\n------------------\n" + m_root.toString();
  }

  /**
   * Returns a superconcise version of the model
   */
  public String toSummaryString() {

    return "Number of leaves: " + m_root.numLeaves() + "\n"
         + "Size of the tree: " + m_root.numNodes() + "\n";
  }

  /**
   * Returns the size of the tree
   * @return the size of the tree
   */
  public double measureTreeSize() {
    return m_root.numNodes();
  }

  /**
   * Returns the number of leaves
   * @return the number of leaves
   */
  public double measureNumLeaves() {
    return m_root.numLeaves();
  }

  /**
   * Returns the number of rules (same as number of leaves)
   * @return the number of rules
   */
  public double measureNumRules() {
    return m_root.numLeaves();
  }
  
  /**
   * Returns an enumeration of the additional measure names
   * @return an enumeration of the measure names
   */
  public Enumeration enumerateMeasures() {
    Vector newVector = new Vector(3);
    newVector.addElement("measureTreeSize");
    newVector.addElement("measureNumLeaves");
    newVector.addElement("measureNumRules");
    return newVector.elements();
  }

  /**
   * Returns the value of the named measure
   * @param measureName the name of the measure to query for its value
   * @return the value of the named measure
   * @exception IllegalArgumentException if the named measure is not supported
   */
  public double getMeasure(String additionalMeasureName) {
    if (additionalMeasureName.compareTo("measureNumRules") == 0) {
      return measureNumRules();
    } else if (additionalMeasureName.compareTo("measureTreeSize") == 0) {
      return measureTreeSize();
    } else if (additionalMeasureName.compareTo("measureNumLeaves") == 0) {
      return measureNumLeaves();
    } else {
      throw new IllegalArgumentException(additionalMeasureName 
			  + " not supported (j48)");
    }
  }

  /**
   * Get the value of unpruned.
   *
   * @return Value of unpruned.
   */
  public boolean getUnpruned() {
    
    return m_unpruned;
  }
  
  /**
   * Set the value of unpruned. Turns reduced-error pruning
   * off if set.
   * @param v  Value to assign to unpruned.
   */
  public void setUnpruned(boolean v) {

    if (v) {
      m_reducedErrorPruning = false;
    }
    m_unpruned = v;
  }
  
  /**
   * Get the value of CF.
   *
   * @return Value of CF.
   */
  public float getConfidenceFactor() {
    
    return m_CF;
  }
  
  /**
   * Set the value of CF.
   *
   * @param v  Value to assign to CF.
   */
  public void setConfidenceFactor(float v) {
    
    m_CF = v;
  }
  
  /**
   * Get the value of minNumObj.
   *
   * @return Value of minNumObj.
   */
  public int getMinNumObj() {
    
    return m_minNumObj;
  }
  
  /**
   * Set the value of minNumObj.
   *
   * @param v  Value to assign to minNumObj.
   */
  public void setMinNumObj(int v) {
    
    m_minNumObj = v;
  }
  
  /**
   * Get the value of reducedErrorPruning. 
   *
   * @return Value of reducedErrorPruning.
   */
  public boolean getReducedErrorPruning() {
    
    return m_reducedErrorPruning;
  }
  
  /**
   * Set the value of reducedErrorPruning. Turns
   * unpruned trees off if set.
   *
   * @param v  Value to assign to reducedErrorPruning.
   */
  public void setReducedErrorPruning(boolean v) {
    
    if (v) {
      m_unpruned = false;
    }
    m_reducedErrorPruning = v;
  }
  
  /**
   * Get the value of numFolds.
   *
   * @return Value of numFolds.
   */
  public int getNumFolds() {
    
    return m_numFolds;
  }
  
  /**
   * Set the value of numFolds.
   *
   * @param v  Value to assign to numFolds.
   */
  public void setNumFolds(int v) {
    
    m_numFolds = v;
  }
  
  /**
   * Get the value of binarySplits.
   *
   * @return Value of binarySplits.
   */
  public boolean getBinarySplits() {
    
    return m_binarySplits;
  }
  
  /**
   * Set the value of binarySplits.
   *
   * @param v  Value to assign to binarySplits.
   */
  public void setBinarySplits(boolean v) {
    
    m_binarySplits = v;
  }
  
  /**
   * Get the value of subtreeRaising.
   *
   * @return Value of subtreeRaising.
   */
  public boolean getSubtreeRaising() {
    
    return m_subtreeRaising;
  }
  
  /**
   * Set the value of subtreeRaising.
   *
   * @param v  Value to assign to subtreeRaising.
   */
  public void setSubtreeRaising(boolean v) {
    
    m_subtreeRaising = v;
  }
  
  /**
   * Check whether instance data is to be saved.
   *
   * @return true if instance data is saved
   */
  public boolean getSaveInstanceData() {
    
    return m_noCleanup;
  }
  
  /**
   * Set whether instance data is to be saved.
   * @param v true if instance data is to be saved
   */
  public void setSaveInstanceData(boolean v) {
    
    m_noCleanup = v;
  }
 
  /**
   * Main method for testing this class
   *
   * @param String options 
   */
  public static void main(String [] argv){

    try {
      System.out.println(Evaluation.evaluateModel(new FAT(), argv));
    } catch (Exception e) {
      System.err.println(e.getMessage());
    }
  }
}