SemiSupDecorate.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    SemiSupDecorate.java
 *    WARNING: UNDER DEVELOPMENT
 *    Copyright (C) 2004 Prem Melville
 *    
 */

package weka.classifiers.meta;
import weka.classifiers.*;
import java.util.*;
import weka.core.*;
import weka.experiment.*;

/**
 * SemiSupDecorate is an attempt to exploit unlabeled data to improve Decorate.
 *
 * DECORATE is a meta-learner for building diverse ensembles of
 * classifiers by using specially constructed artificial training
 * examples. Comprehensive experiments have demonstrated that this
 * technique is consistently more accurate than the base classifier, 
 * Bagging and Random Forests. Decorate also obtains higher accuracy than
 * Boosting on small training sets, and achieves comparable performance
 * on larger training sets.  For more
 * details see: <p>
 *
 * Prem Melville and Raymond J. Mooney. <i>Constructing diverse
 * classifier ensembles using artificial training examples.</i>
 * Proceedings of the Eighteenth International Joint Conference on
 * Artificial Intelligence 2003.<p>
 *
 * Prem Melville and Raymond J. Mooney. <i>Creating diversity in ensembles using artificial data.</i>
 * Journal of Information Fusion.<BR><BR>
 *
 * Valid options are:<p>
 *
 * -D <br>
 * Turn on debugging output.<p>
 *
 * -W classname <br>
 * Specify the full class name of a weak classifier as the basis for 
 * SemiSupDecorate (default weka.classifiers.trees.j48.J48()).<p>
 *
 * -I num <br>
 * Specify the desired size of the committee (default 15). <p>
 *
 * -M iterations <br>
 * Set the maximum number of SemiSupDecorate iterations (default 50). <p>
 *
 * -S seed <br>
 * Seed for random number generator. (default 0).<p>
 *
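 * -R factor <br>
 * Factor that determines number of artificial examples to generate,
 * specified as a proportion of the training set size (default 1.0). <p>
 *
 * -U <br>
 * Use the unlabeled data. <p>
 *
 * -A <br>
 * Use artificial data. <p>
 *
 * -Z method <br>
 * How unlabeled examples are used: 0 = all, 1 = ignore low-confidence,
 * 2 = ignore high-confidence, 3 = flip low-confidence (default 0). <p>
 *
 * -T threshold <br>
 * Confidence threshold for labeling unlabeled examples (default 0.9). <p>
 *
 * -L weight <br>
 * Weight of unlabeled examples versus labeled examples. <p>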
 *
 * Options after -- are passed to the designated classifier.<p>
 *
 * @author Prem Melville (melville@cs.utexas.edu)
 * @version $Revision: 1.4 $ */
public class SemiSupDecorate extends EnsembleClassifier implements OptionHandler,SemiSupClassifier{

    /**
     * Weight of unlabeled examples versus labeled examples. Applied to the
     * local copy of the unlabeled data in setUnlabeled() when it is not 1.0.
     */
    protected double m_Lambda = 1.0;
    
    /** Local copy of the unlabeled examples supplied through setUnlabeled(). */
    protected Instances m_Unlabeled;

    /** Set to true to add artificial examples to the training set in each iteration. */
    protected boolean m_UseArtificial = true;
    
    /** Set to true to add unlabeled examples (labeled by the ensemble) in each iteration. */
    protected boolean m_UseUnlabeled = true; 
    //When set to false this should be equivalent to Decorate
    
    /**
     * Types of unlabeled usage (dispatched on in addUnlabeled()):
     * ALL - label every unlabeled example with the ensemble's prediction;
     * IGNORE_LOW - use only examples whose predicted-class probability
     * reaches the threshold, labeled with the prediction;
     * IGNORE_HIGH - use only examples below the threshold, labeled
     * inversely to the ensemble's prediction;
     * FLIP_LOW - use all examples, keeping the prediction at or above the
     * threshold and an inverse label below it.
     */
    static final int ALL = 0,
	IGNORE_LOW = 1,
	IGNORE_HIGH = 2,
	FLIP_LOW = 3;
    
    /** Type of usage of unlabeled examples (one of the constants above). */
    protected int m_UnlabeledMethod = ALL;
    
    /** Confidence threshold on the predicted-class probability used when labeling unlabeled examples. */
    protected double m_Threshold = 0.9; 
    
    /** Set to true to get debugging output. */
    protected boolean m_Debug = false;

    /** The model base classifier to use. */
    protected Classifier m_Classifier = new weka.classifiers.trees.j48.J48();
      
    /** Vector of classifiers that make up the committee/ensemble; null until buildClassifier() has run. */
    protected Vector m_Committee = null;
    
    /** The desired ensemble size. */
    protected int m_DesiredSize = 15;

    /** The maximum number of SemiSupDecorate iterations to run. */
    protected int m_NumIterations = 50;
    
    /** The seed for random number generation; -1 requests an unseeded generator. */
    protected int m_Seed = 0;
    
    /** Amount of artificial/random instances to use - specified as a
        fraction of the training data size. */
    protected double m_ArtSize = 1.0 ;

    /** The random number generator; re-created from m_Seed at the start of buildClassifier(). */
    protected Random m_Random = new Random(0);
    
    /**
     * Attribute statistics - used for generating artificial examples.
     * Entry j holds a double[] for attribute j: cumulative value
     * probabilities for a nominal attribute, {mean, standard deviation}
     * for a numeric one (filled in by computeStats()).
     */
    protected Vector m_AttributeStats = null;


    /**
     * Set the weight of unlabeled examples versus labeled examples.
     * If unlabeled data has already been supplied it is re-weighted right
     * away, so the result does not depend on whether this method or
     * setUnlabeled is called first.
     *
     * @param v weight to assign to every unlabeled example
     */
    public void setLambda (double v) {
        m_Lambda = v;
        // Previously a lambda set after setUnlabeled() was silently ignored,
        // because weights were only applied at the time the data was stored.
        if (m_Unlabeled != null) {
            weightInstances(m_Unlabeled, v);
        }
    }

    /**
     * Get the weight of unlabeled examples versus labeled examples.
     *
     * @return the current lambda value
     */
    public double getLambda () {
        return this.m_Lambda;
    }

    /**
     * Returns the tip text for the lambda property.
     *
     * @return tip text for this property suitable for
     * displaying in the explorer/experimenter gui
     */
    public String lambdaTipText() {
        final String tip = "set weight of unlabeled examples vs. labeled";
        return tip;
    }
    /**
     * Reports whether artificial examples are added during training.
     *
     * @return true if artificial data is used
     */
    public boolean getUseArtificial() {
        return this.m_UseArtificial;
    }
    
    /**
     * Switches the use of artificial examples on or off.
     *
     * @param v true to use artificial data during training
     */
    public void setUseArtificial(boolean  v) {
        this.m_UseArtificial = v;
    }
    
    /**
     * Get the confidence threshold used when labeling unlabeled examples.
     *
     * @return the confidence threshold
     */
    public double getThreshold() {
        return this.m_Threshold;
    }
    
    /**
     * Set the confidence threshold used when labeling unlabeled examples.
     *
     * @param v the new confidence threshold
     */
    public void setThreshold(double  v) {
        this.m_Threshold = v;
    }
    
    /**
     * Get the way unlabeled examples are used.
     *
     * @return one of ALL, IGNORE_LOW, IGNORE_HIGH or FLIP_LOW
     */
    public int getUnlabeledMethod() {
        return this.m_UnlabeledMethod;
    }
    
    /**
     * Set the way unlabeled examples are used. The value is stored as
     * given; it is not checked against the known method constants here.
     *
     * @param v one of ALL, IGNORE_LOW, IGNORE_HIGH or FLIP_LOW
     */
    public void setUnlabeledMethod(int  v) {
        this.m_UnlabeledMethod = v;
    }
    
    /**
     * Reports whether unlabeled examples are added during training.
     *
     * @return true if unlabeled data is used
     */
    public boolean getUseUnlabeled() {
        return this.m_UseUnlabeled;
    }
    
    /**
     * Switches the use of unlabeled examples on or off.
     *
     * @param v true to use unlabeled data during training
     */
    public void setUseUnlabeled(boolean  v) {
        this.m_UseUnlabeled = v;
    }
    
    /**
     * Returns an enumeration describing the available options
     *
     * @return an enumeration of all the available options
     */
    public Enumeration listOptions() {
	Vector newVector = new Vector(8);
	newVector.addElement(new Option(
	      "\tTurn on debugging output.",
	      "D", 0, "-D"));
	newVector.addElement(new Option(
              "\tDesired size of ensemble.\n" 
              + "\t(default 15)",
              "I", 1, "-I"));
	newVector.addElement(new Option(
	      "\tMaximum number of SemiSupDecorate iterations.\n"
	      + "\t(default 50)",
	      "M", 1, "-M"));
	newVector.addElement(new Option(
	      "\tFull name of base classifier.\n"
	      + "\t(default weka.classifiers.trees.j48.J48)",
	      "W", 1, "-W"));
	newVector.addElement(new Option(
              "\tSeed for random number generator.\n"
	      +"\tIf set to -1, use a random seed.\n"
              + "\t(default 0)",
              "S", 1, "-S"));
	newVector.addElement(new Option(
				    "\tFactor that determines number of artificial examples to generate.\n"
				    +"\tSpecified proportional to training set size.\n" 
				    + "\t(default 1.0)",
				    "R", 1, "-R"));
    
	if ((m_Classifier != null) &&
	    (m_Classifier instanceof OptionHandler)) {
	    newVector.addElement(new Option(
					    "",
					    "", 0, "\nOptions specific to classifier "
					    + m_Classifier.getClass().getName() + ":"));
	    Enumeration enum = ((OptionHandler)m_Classifier).listOptions();
	    while (enum.hasMoreElements()) {
		newVector.addElement(enum.nextElement());
	    }
	}
	
	return newVector.elements();
    }

    
    /**
     * Parses a given list of options. Every option that is not given is
     * reset to its default. Valid options are:<p>
     *
     * -D <br>
     * Turn on debugging output.<p>
     *
     * -U <br>
     * Use unlabeled data (off when the flag is absent).<p>
     *
     * -A <br>
     * Use artificial data (off when the flag is absent).<p>
     *
     * -Z method <br>
     * How unlabeled examples are used: 0 = all, 1 = ignore low-confidence,
     * 2 = ignore high-confidence, 3 = flip low-confidence (default 0).<p>
     *
     * -T threshold <br>
     * Confidence threshold for labeling unlabeled examples (default 0.9).<p>
     *
     * -W classname <br>
     * Specify the full class name of a weak classifier as the basis for
     * SemiSupDecorate (required).<p>
     *
     * -I num <br>
     * Specify the desired size of the committee (default 15). <p>
     *
     * -M iterations <br>
     * Set the maximum number of SemiSupDecorate iterations (default 50). <p>
     *
     * -S seed <br>
     * Seed for random number generator. (default 0).<p>
     *
     * -R factor <br>
     * Factor that determines number of artificial examples to generate
     * (default 1.0). <p>
     *
     * -L weight <br>
     * Weight of unlabeled examples versus labeled examples (default 1.0). <p>
     *
     * Options after -- are passed to the designated classifier.<p>
     *
     * @param options the list of options as an array of strings
     * @exception Exception if an option is not supported, a numeric value
     * cannot be parsed, or no base classifier is given with -W
     */
    public void setOptions(String[] options) throws Exception {
        setUseUnlabeled(Utils.getFlag('U', options));
        setUseArtificial(Utils.getFlag('A', options));
        setDebug(Utils.getFlag('D', options));

        String unlabeledMethod = Utils.getOption('Z', options);
        if (unlabeledMethod.length() != 0) {
            setUnlabeledMethod(Integer.parseInt(unlabeledMethod));
        } else {
            setUnlabeledMethod(ALL);
        }

        String threshold = Utils.getOption('T', options);
        if (threshold.length() != 0) {
            setThreshold(Double.parseDouble(threshold));
        } else {
            setThreshold(0.9);
        }

        String desiredSize = Utils.getOption('I', options);
        if (desiredSize.length() != 0) {
            setDesiredSize(Integer.parseInt(desiredSize));
        } else {
            setDesiredSize(15);
        }

        String maxIterations = Utils.getOption('M', options);
        if (maxIterations.length() != 0) {
            setNumIterations(Integer.parseInt(maxIterations));
        } else {
            setNumIterations(50);
        }

        String seed = Utils.getOption('S', options);
        if (seed.length() != 0) {
            setSeed(Integer.parseInt(seed));
        } else {
            setSeed(0);
        }

        String artSize = Utils.getOption('R', options);
        if (artSize.length() != 0) {
            setArtificialSize(Double.parseDouble(artSize));
        } else {
            setArtificialSize(1.0);
        }

        String lambda = Utils.getOption('L', options);
        if (lambda.length() != 0) {
            setLambda(Double.parseDouble(lambda));
        } else {
            // Reset like every other option; previously a lambda from an
            // earlier setOptions call silently survived.
            setLambda(1.0);
        }

        String classifierName = Utils.getOption('W', options);
        if (classifierName.length() == 0) {
            throw new Exception("A classifier must be specified with"
                                + " the -W option.");
        }
        // Everything after "--" belongs to the base classifier
        setClassifier(Classifier.forName(classifierName,
                                         Utils.partitionOptions(options)));
    }
    
    /**
     * Gets the current settings of the Classifier. Unused trailing slots
     * of the returned array are filled with empty strings.
     *
     * @return an array of strings suitable for passing to setOptions
     */
    public String [] getOptions() {
        String [] classifierOptions = new String [0];
        if ((m_Classifier != null) &&
            (m_Classifier instanceof OptionHandler)) {
            classifierOptions = ((OptionHandler)m_Classifier).getOptions();
        }
        // Worst case for our own options: 3 flags (-D -U -A)
        // + 7 option/value pairs (14) + "-W" and class name (2) + "--" (1) = 20.
        // The former size of 19 overflowed when all three flags were set.
        String [] options = new String [classifierOptions.length + 20];
        int current = 0;
        if (getDebug()) {
            options[current++] = "-D";
        }
        if (getUseUnlabeled()) {
            options[current++] = "-U";
        }
        if (getUseArtificial()) {
            options[current++] = "-A";
        }
        options[current++] = "-T"; options[current++] = "" + getThreshold();
        options[current++] = "-Z"; options[current++] = "" + getUnlabeledMethod();
        options[current++] = "-S"; options[current++] = "" + getSeed();
        options[current++] = "-I"; options[current++] = "" + getDesiredSize();
        options[current++] = "-M"; options[current++] = "" + getNumIterations();
        options[current++] = "-R"; options[current++] = "" + getArtificialSize();
        options[current++] = "-L"; options[current++] = "" + getLambda();

        if (getClassifier() != null) {
            options[current++] = "-W";
            options[current++] = getClassifier().getClass().getName();
        }
        // Separator: everything after it is handed to the base classifier
        options[current++] = "--";
        System.arraycopy(classifierOptions, 0, options, current,
                         classifierOptions.length);
        current += classifierOptions.length;
        // Pad the remainder so the array holds no null entries
        while (current < options.length) {
            options[current++] = "";
        }
        return options;
    }
    

  /**
   * Returns the tip text for the desiredSize property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String desiredSizeTipText() {
      StringBuffer tip = new StringBuffer();
      tip.append("the desired number of member classifiers in the SemiSupDecorate ensemble. SemiSupDecorate may terminate ");
      tip.append("before this size is reached (depending on the value of numIterations). ");
      tip.append("Larger ensemble sizes usually lead to more accurate models, but increases ");
      tip.append("training time and model complexity.");
      return tip.toString();
  }
    
  /**
   * Returns the tip text for the numIterations property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String numIterationsTipText() {
    // Wording fix: the text used to read "greater than equal to".
    return "the maximum number of SemiSupDecorate iterations to run. Each iteration generates a classifier, "
        +"but does not necessarily add it to the ensemble. SemiSupDecorate stops when the desired ensemble "
        +"size is reached. This parameter should be greater than or "
        +"equal to the desiredSize. If the desiredSize is not being reached it may help to "
        +"increase this value.";
  }
    
  /**
   * Returns the tip text for the artificialSize property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String artificialSizeTipText() {
    final String firstPart =
        "determines the number of artificial examples to use during training. Specified as ";
    final String secondPart =
        "a proportion of the training data. Higher values can increase ensemble diversity.";
    return firstPart + secondPart;
  }

  /**
   * Returns the tip text for the seed property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String seedTipText() {
    final String purpose = "seed for random number generator used for creating artificial data.";
    final String hint = " Set to -1 to use a random seed.";
    return purpose + hint;
  }

  /**
   * Returns the tip text for the classifier property.
   *
   * @return tip text for this property suitable for
   * displaying in the explorer/experimenter gui
   */
  public String classifierTipText() {
    final String tip = "the desired base learner for the ensemble.";
    return tip;
  }

  /**
   * Returns a string describing classifier
   *
   * @return a description suitable for
   * displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
      // A space was missing after "Random Forests.", which fused two
      // sentences in the displayed text.
      return "DECORATE is a meta-learner for building diverse ensembles of "
          +"classifiers by using specially constructed artificial training "
          +"examples. Comprehensive experiments have demonstrated that this "
          +"technique is consistently more accurate than the base classifier, Bagging and Random Forests. "
          +"SemiSupDecorate also obtains higher accuracy than Boosting on small training sets, and achieves "
          +"comparable performance on larger training sets. "
          +"For more details see: P. Melville & R. J. Mooney. Constructing diverse classifier ensembles "
          +"using artificial training examples (IJCAI 2003).\n"
          +"P. Melville & R. J. Mooney. Creating diversity in ensembles using artificial data (submitted).";
  }


    /**
     * Switches debugging output on or off.
     *
     * @param debug true if debug output should be printed
     */
    public void setDebug(boolean debug) {
        this.m_Debug = debug;
    }
    
    /**
     * Reports whether debugging output is enabled.
     *
     * @return true if debugging output is on
     */
    public boolean getDebug() {
        return this.m_Debug;
    }
    
    /**
     * Choose the base classifier that SemiSupDecorate builds its
     * ensemble from.
     *
     * @param newClassifier the Classifier to use.
     */
    public void setClassifier(Classifier newClassifier) {
        this.m_Classifier = newClassifier;
    }

    /**
     * Get the classifier used as the base classifier.
     *
     * @return the base classifier
     */
    public Classifier getClassifier() {
        return this.m_Classifier;
    }

    /**
     * Gets the factor that determines how many artificial examples are
     * generated, as a fraction of the training data size.
     *
     * @return factor that determines number of artificial examples to generate
     */
    public double getArtificialSize() {
        return this.m_ArtSize;
    }
  
    /**
     * Sets the factor that determines how many artificial examples are
     * generated, as a fraction of the training data size.
     *
     * @param newArtSize factor that determines number of artificial examples to generate
     */
    public void setArtificialSize(double newArtSize) {
        this.m_ArtSize = newArtSize;
    }
    
    /**
     * Gets the desired size of the committee.
     *
     * @return the desired number of ensemble members
     */
    public int getDesiredSize() {
        return this.m_DesiredSize;
    }
    
    /**
     * Sets the desired size of the committee.
     *
     * @param newDesiredSize the desired number of ensemble members
     */
    public void setDesiredSize(int newDesiredSize) {
        this.m_DesiredSize = newDesiredSize;
    }
    
    /**
     * Sets the upper bound on the number of SemiSupDecorate iterations.
     *
     * @param numIterations  max number of SemiSupDecorate iterations to run
     */
    public void setNumIterations(int numIterations) {
        this.m_NumIterations = numIterations;
    }

    /**
     * Gets the upper bound on the number of SemiSupDecorate iterations.
     *
     * @return the max number of SemiSupDecorate iterations to run
     */
    public int getNumIterations() {
        return this.m_NumIterations;
    }
    
    /**
     * Set the seed for the random number generator. The generator itself
     * is not touched here.
     *
     * @param seed the random number seed
     */
    public void setSeed(int seed) {
        this.m_Seed = seed;
    }
    
    /**
     * Gets the seed for the random number generator.
     *
     * @return the random number seed
     */
    public int getSeed() {
        return this.m_Seed;
    }

    /**
     * Provide unlabeled data to the classifier. A local copy of the set
     * is stored; when lambda is not 1.0 every instance in that copy is
     * given lambda as its weight.
     *
     * @param unlabeled the unlabeled Instances
     */
    public void setUnlabeled(Instances unlabeled){
        Instances localCopy = new Instances(unlabeled);
        m_Unlabeled = localCopy;
        if (m_Lambda == 1.0) {
            // nothing to re-weight
            return;
        }
        weightInstances(m_Unlabeled, m_Lambda);
    }

    /**
     * Assigns the same weight to every instance of the given set.
     *
     * @param insts the instances to re-weight (modified in place)
     * @param weight the weight to assign to each instance
     */
    protected void weightInstances (Instances insts, double weight) {
        for (int idx = 0; idx < insts.numInstances(); idx++) {
            insts.instance(idx).setWeight(weight);
        }
    }

    
    /**
     * Build SemiSupDecorate classifier. Starts with one base classifier
     * trained on the labeled data, then repeatedly trains a candidate on
     * the labeled data plus (optionally) freshly generated artificial
     * examples and (optionally) ensemble-labeled unlabeled examples. A
     * candidate is kept only if it does not increase the ensemble's error
     * on the labeled data.
     *
     * If the use of unlabeled data is switched on but no unlabeled data
     * has been supplied, training proceeds without unlabeled examples.
     *
     * @param data the training data to be used for generating the classifier
     * @exception Exception if the classifier could not be built successfully
     */
    public void buildClassifier(Instances data) throws Exception {
        if (m_Classifier == null) {
            throw new Exception("A base classifier has not been specified!");
        }
        if (data.checkForStringAttributes()) {
            throw new UnsupportedAttributeTypeException("Cannot handle string attributes!");
        }
        if (data.classAttribute().isNumeric()) {
            throw new UnsupportedClassTypeException("SemiSupDecorate can't handle a numeric class!");
        }
        if (m_NumIterations < m_DesiredSize) {
            throw new Exception("Max number of iterations must be >= desired ensemble size!");
        }

        //initialize random number generator (-1 requests an unseeded one)
        if (m_Seed == -1) {
            m_Random = new Random();
        } else {
            m_Random = new Random(m_Seed);
        }

        //initialize ensemble wts to be equal
        m_EnsembleWts = new double [m_DesiredSize];
        for (int j = 0; j < m_DesiredSize; j++) {
            m_EnsembleWts[j] = 1.0;
        }
        initMeasures();

        //Unlabeled data can only be used if some was actually supplied;
        //without this check the first iteration failed with a
        //NullPointerException.
        boolean useUnlabeled = m_UseUnlabeled && (m_Unlabeled != null);
        if (m_Debug && m_UseUnlabeled && !useUnlabeled) {
            System.out.println("No unlabeled data supplied - building without unlabeled examples.");
        }

        int numUnlabeledUsed = 0;
        int i = 1;//current committee size
        int numTrials = 1;//number of SemiSupDecorate iterations
        Instances divData = new Instances(data);//local copy of data - diversity data
        divData.deleteWithMissingClass();
        //size divData must return to after each iteration's additions are removed
        int numLabeled = divData.numInstances();
        Instances artData = null;//artificial data

        //compute number of artficial instances to add at each iteration
        int artSize = (int) (Math.abs(m_ArtSize) * numLabeled);
        if (artSize == 0) artSize = 1;//atleast add one random example
        computeStats(data);//Compute training data stats for creating artificial examples

        //initialize new committee
        m_Committee = new Vector();
        Classifier newClassifier = m_Classifier;
        newClassifier.buildClassifier(divData);
        m_Committee.add(newClassifier);
        double eComm = computeError(divData);//compute ensemble error
        if (m_Debug) System.out.println("Initialize:\tClassifier "+i+" added to ensemble. Ensemble error = "+eComm);

        //repeat till desired committee size is reached OR the max number of iterations is exceeded
        while (i < m_DesiredSize && numTrials < m_NumIterations) {

            if (m_UseArtificial) {
                //Generate artificial training examples
                artData = generateArtificialData(artSize, data);
                //Label artificial examples
                labelData(artData);
                //Add new artificial data
                addInstances(divData, artData);
            }

            //Add unlabeled data
            if (useUnlabeled) numUnlabeledUsed = addUnlabeled(divData);

            //Build new classifier
            Classifier tmp[] = Classifier.makeCopies(m_Classifier, 1);
            newClassifier = tmp[0];
            newClassifier.buildClassifier(divData);

            //Remove in reverse order of addition: both removals take from
            //the end of divData, and the unlabeled examples were added last.
            if (useUnlabeled) removeInstances(divData, numUnlabeledUsed);

            //Remove all the artificial data
            if (m_UseArtificial) removeInstances(divData, artSize);

            //Compare against the labeled subset, not the raw input: data
            //may still contain instances with a missing class.
            assert (divData.numInstances() == numLabeled) : "Diversity data error!";

            //Test if the new classifier should be added to the ensemble
            m_Committee.add(newClassifier);//add new classifier to current committee
            double currError = computeError(divData);
            if (currError <= eComm) {//adding the new member did not increase the error
                i++;
                eComm = currError;
                if (m_Debug) System.out.println("Iteration: "+(1+numTrials)+"\tClassifier "+i+" added to ensemble. Ensemble error = "+eComm);
            } else {//reject the current classifier because it increased the ensemble error
                m_Committee.removeElementAt(m_Committee.size()-1);//pop the last member
            }
            numTrials++;
        }
    }

    /**
     * Add unlabeled data to training set. The unlabeled examples are
     * labeled according to the selected unlabeled method and appended to
     * the end of the given set.
     *
     * @param divData diversity data to add to.
     * @return number of unlabeled examples used (0 if no unlabeled data
     * is available or the method is not recognized).
     * @exception Exception if the examples cannot be labeled
     */
    protected int addUnlabeled(Instances divData) throws Exception{
        if (m_Unlabeled == null) {
            return 0;//nothing to add
        }
        //Set of unlabeled examples eventually used (some may be ignored).
        //It must start out EMPTY (header only): a full copy of divData here
        //made the filtering methods re-add every training example.
        Instances unlabeledUsed = new Instances(divData, 0);
        switch (m_UnlabeledMethod){
        case ALL:
            unlabeledUsed = labelAll(m_Unlabeled);
            break;
        case IGNORE_LOW:
            unlabeledUsed = labelIgnoreLow(m_Unlabeled, unlabeledUsed);
            break;
        case IGNORE_HIGH:
            unlabeledUsed = labelIgnoreHigh(m_Unlabeled, unlabeledUsed);
            break;
        case FLIP_LOW:
            unlabeledUsed = labelFlipLow(m_Unlabeled);
            break;
        default:
            System.err.println("Unrecognized unlabeled method selected!");
        }
        //Progress output only in debug mode, like the rest of the class
        if (m_Debug) {
            System.out.println("Unlabeled size = "+m_Unlabeled.numInstances()+" Used size = "+unlabeledUsed.numInstances());
        }
        addInstances(divData, unlabeledUsed);
        return unlabeledUsed.numInstances();
    }
    
    /**
     * Label examples as predicted by the current ensemble. The class
     * value of every given instance is overwritten in place.
     *
     * @param instances the examples to label
     * @return the same set that was passed in, now labeled
     * @exception Exception if an instance cannot be classified
     */
    protected Instances labelAll(Instances instances) throws Exception {
        for (int i = 0; i < instances.numInstances(); i++) {
            Instance curr = instances.instance(i);
            curr.setClassValue(classifyInstance(curr));
        }
        //Return what was labeled, not the m_Unlabeled field
        return instances;
    }

    /**
     * Label high-confidence examples with the ensemble's prediction.
     * Ignore low-confidence examples. An example counts as high-confidence
     * when the ensemble's probability for its predicted class is at least
     * the threshold; such examples are labeled in place and added to
     * <code>used</code>.
     *
     * @param instances the unlabeled examples to consider
     * @param used the set that selected examples are added to
     * @return <code>used</code>
     * @exception Exception if an instance cannot be classified
     */
    protected Instances labelIgnoreLow(Instances instances, Instances used) throws Exception {
        int idx = 0;
        while (idx < instances.numInstances()) {
            Instance example = instances.instance(idx++);
            //class membership probs predicted by the current ensemble
            double[] dist = distributionForInstance(example);
            int predicted = (int) classifyInstance(example);
            if (!(dist[predicted] >= m_Threshold)) {
                continue;//not confident enough - skip
            }
            example.setClassValue(predicted);
            used.add(example);
        }
        return used;
    }
    
    /**
     * Label low-confidence examples with inverse of ensemble's prediction.
     * Ignore high-confidence examples. An example counts as low-confidence
     * when the ensemble's probability for its predicted class is below the
     * threshold; such examples are labeled in place and added to
     * <code>used</code>.
     *
     * @param instances the unlabeled examples to consider
     * @param used the set that selected examples are added to
     * @return <code>used</code>
     * @exception Exception if an instance cannot be classified
     */
    protected Instances labelIgnoreHigh(Instances instances, Instances used) throws Exception {
        int idx = 0;
        while (idx < instances.numInstances()) {
            Instance example = instances.instance(idx++);
            //class membership probs predicted by the current ensemble
            double[] dist = distributionForInstance(example);
            int predicted = (int) classifyInstance(example);
            if (dist[predicted] < m_Threshold) {
                example.setClassValue(inverseLabel(dist));
                used.add(example);
            }
        }
        return used;
    }
    
    /**
     * Label low-confidence examples with inverse of ensemble's prediction.
     * Use ensemble's prediction for high-confidence examples. Every given
     * instance is labeled in place.
     *
     * @param instances the examples to label
     * @return the same set that was passed in, now labeled
     * @exception Exception if an instance cannot be classified
     */
    protected Instances labelFlipLow(Instances instances) throws Exception {
        int keptAsIs = 0;
        int flipped = 0;
        for (int i = 0; i < instances.numInstances(); i++) {
            Instance curr = instances.instance(i);
            //compute the class membership probs predicted by the current ensemble
            double[] probs = distributionForInstance(curr);
            int label = (int) classifyInstance(curr);

            if (probs[label] >= m_Threshold) {
                curr.setClassValue(label);
                keptAsIs++;
            } else {
                curr.setClassValue(inverseLabel(probs));
                flipped++;
            }
        }
        //Progress output only in debug mode, like the rest of the class
        if (m_Debug) {
            System.out.println("As is: "+keptAsIs+"\tFlipped: "+flipped);
        }
        //Return what was labeled, not the m_Unlabeled field
        return instances;
    }
    
    
    /**
     * Helper method to print arrays: writes the elements to standard
     * output on one line, each followed by a space, then ends the line.
     *
     * @param array the values to print
     */
    protected void printArray(double []array){
        int pos = 0;
        while (pos < array.length) {
            System.out.print(array[pos] + " ");
            pos++;
        }
        System.out.println();
    }
    
    /**
     * Returns class predictions of each ensemble member.
     *
     * @param instance the instance to classify
     * @return one prediction per committee member, in committee order
     * @exception Exception if a member cannot classify the instance
     */
    public double []getEnsemblePredictions(Instance instance) throws Exception{
        final int size = m_Committee.size();
        double[] votes = new double[size];
        int member = 0;
        while (member < size) {
            Classifier current = (Classifier) m_Committee.get(member);
            votes[member] = current.classifyInstance(instance);
            member++;
        }
        return votes;
    }
    
    /**
     * Returns vote weights of ensemble members. The internal array itself
     * is returned, not a copy.
     *
     * @return vote weights of ensemble members
     */
    public double []getEnsembleWts(){
        return this.m_EnsembleWts;
    }
    
    /**
     * Returns size of ensemble.
     *
     * @return the number of classifiers currently in the committee
     */
    public double getEnsembleSize(){
        final int members = m_Committee.size();
        return members;
    }
    

    
    /**
     * Compute and store statistics required for generating artificial data.
     * For a nominal attribute the stored array holds the cumulative
     * (Laplace-smoothed) probabilities of all values but the last; for a
     * numeric attribute it holds the mean and the standard deviation.
     * Entry j of the statistics vector always belongs to attribute j; an
     * attribute of any other type gets a null entry.
     *
     * @param data training instances
     * @exception Exception if statistics could not be calculated successfully
     */
    protected void computeStats(Instances data) throws Exception{
        int numAttributes = data.numAttributes();
        m_AttributeStats = new Vector(numAttributes);//use to map attributes to their stats

        for (int j = 0; j < numAttributes; j++) {
            double[] stats = null;
            if (data.attribute(j).isNominal()) {
                //Compute the probability of occurence of each distinct value
                int []nomCounts = (data.attributeStats(j)).nominalCounts;
                double []counts = new double[nomCounts.length];
                if (counts.length < 2) throw new Exception("Nominal attribute has less than two distinct values!");
                //Perform Laplace smoothing
                for (int i = 0; i < counts.length; i++)
                    counts[i] = nomCounts[i] + 1;
                Utils.normalize(counts);
                //The last cumulative value is left out; falling off the end
                //of the array during selection picks the last value.
                stats = new double[counts.length - 1];
                stats[0] = counts[0];
                //Calculate cumulative probabilities
                for (int i = 1; i < stats.length; i++)
                    stats[i] = stats[i-1] + counts[i];
            } else if (data.attribute(j).isNumeric()) {
                //Get mean and standard deviation from the training data
                stats = new double[2];
                stats[0] = data.meanOrMode(j);
                stats[1] = Math.sqrt(data.variance(j));
            } else {
                System.err.println("SemiSupDecorate can only handle numeric and nominal values.");
            }
            //Always store an entry (possibly null): skipping one used to make
            //the next add(j, ...) fail because index j was beyond the vector's size.
            m_AttributeStats.add(j, stats);
        }
    }

    /**
     * Generate artificial training examples. Nominal values are drawn
     * according to the stored cumulative value probabilities, numeric
     * values from a Gaussian with the stored mean and standard deviation.
     * Each generated instance has weight 1.0.
     *
     * @param artSize size of examples set to create
     * @param data training data
     * @return the set of unlabeled artificial examples
     */
    protected Instances generateArtificialData(int artSize, Instances data){
        final int width = data.numAttributes();
        Instances generated = new Instances(data, artSize);

        for (int n = 0; n < artSize; n++) {
            double[] values = new double[width];
            for (int a = 0; a < width; a++) {
                if (data.attribute(a).isNominal()) {
                    //pick a value index following the training frequencies
                    double[] cumulative = (double[]) m_AttributeStats.get(a);
                    values[a] = (double) selectIndexProbabilistically(cumulative);
                } else if (data.attribute(a).isNumeric()) {
                    //entry 0 is the mean, entry 1 the standard deviation
                    double[] meanAndStdDev = (double[]) m_AttributeStats.get(a);
                    values[a] = (m_Random.nextGaussian() * meanAndStdDev[1]) + meanAndStdDev[0];
                } else {
                    System.err.println("SemiSupDecorate can only handle numeric and nominal values.");
                }
            }
            generated.add(new Instance(1.0, values));
        }
        return generated;
    }
    
    
    /**
     * Labels the artificially generated data. Each instance receives a
     * class label chosen inversely to the current ensemble's predicted
     * class probabilities.
     *
     * @param artData the artificially generated instances
     * @exception Exception if instances cannot be labeled successfully
     */
    protected void labelData(Instances artData) throws Exception {
        int idx = 0;
        while (idx < artData.numInstances()) {
            Instance example = artData.instance(idx);
            //class membership probs predicted by the current ensemble
            double[] dist = distributionForInstance(example);
            example.setClassValue(inverseLabel(dist));
            idx++;
        }
    }
    

    /**
     * Select class label such that the probability of selection is
     * inversely proportional to the ensemble's predictions.
     *
     * @param probs class membership probabilities of instance
     * @return index of class label selected, always a valid index into
     * <code>probs</code>
     * @exception Exception if instances cannot be labeled successfully
     */
    protected int inverseLabel(double []probs) throws Exception{
        double []invProbs = new double[probs.length];
        //Produce probability distribution inversely proportional to the given
        for (int i = 0; i < probs.length; i++) {
            if (probs[i] == 0) {
                invProbs[i] = Double.MAX_VALUE/probs.length;
                //Account for probability values of 0 - to avoid divide-by-zero errors
                //Divide by probs.length to make sure normalizing works properly
            } else {
                invProbs[i] = 1.0 / probs[i];
            }
        }
        Utils.normalize(invProbs);
        double []cdf = new double[invProbs.length];
        //Compute cumulative probabilities
        cdf[0] = invProbs[0];
        for (int i = 1; i < invProbs.length; i++) {
            cdf[i] = invProbs[i] + cdf[i-1];
        }

        if (Double.isNaN(cdf[invProbs.length-1]))
            System.err.println("Cumulative class membership probability is NaN!");

        //The selector returns cdf.length when the random draw exceeds every
        //entry. With a full-length cdf that can happen if rounding leaves
        //the last entry slightly below 1, so clamp to the last class.
        int selected = selectIndexProbabilistically(cdf);
        return Math.min(selected, probs.length - 1);
    }
    
    /**
     * Given cumulative probabilities select a nominal attribute value index.
     * Draws one random number and returns the position of the first entry
     * that the draw does not exceed, or <code>cdf.length</code> when it
     * exceeds all of them.
     *
     * @param cdf array of cumulative probabilities
     * @return index of attribute selected based on the probability distribution
     */
    protected int selectIndexProbabilistically(double []cdf){
        final double draw = m_Random.nextDouble();
        for (int pos = 0; pos < cdf.length; pos++) {
            if (!(draw > cdf[pos])) {
                return pos;
            }
        }
        return cdf.length;
    }
    
    /**
     * Removes a specified number of instances from the given set of
     * instances, taking them from the end of the set.
     *
     * @param data given instances
     * @param numRemove number of instances to delete from the given instances
     */
    protected void removeInstances(Instances data, int numRemove){
        int last = data.numInstances() - 1;
        int remaining = numRemove;
        while (remaining > 0) {
            data.delete(last);
            last--;
            remaining--;
        }
    }
    
    /**
     * Add new instances to the given set of instances, in order, at the
     * end of the set.
     *
     * @param data given instances
     * @param newData set of instances to add to given instances
     */
    protected void addInstances(Instances data, Instances newData){
        int next = 0;
        while (next < newData.numInstances()) {
            data.add(newData.instance(next));
            next++;
        }
    }
    
    /**
     * Computes the error in classification on the given data: the
     * fraction of instances whose class value differs from the
     * ensemble's prediction.
     *
     * @param data the instances to be classified
     * @return classification error
     * @exception Exception if error can not be computed successfully
     */
    protected double computeError(Instances data) throws Exception {
        final int total = data.numInstances();
        double wrong = 0.0;

        for (int idx = 0; idx < total; idx++) {
            Instance example = data.instance(idx);
            double actual = example.classValue();
            int predicted = (int) classifyInstance(example);
            if (actual != predicted) {
                wrong += 1.0;
            }
        }
        return wrong / total;
    }
    
  /**
   * Calculates the class membership probabilities for the given test instance.
   * Members that are distribution classifiers contribute their predicted
   * distribution; other members contribute a single vote for their
   * predicted class. The totals are normalized unless they sum to zero.
   *
   * @param instance the instance to be classified
   * @return predicted class probability distribution
   * @exception Exception if distribution can't be computed successfully
   */
  public double[] distributionForInstance(Instance instance) throws Exception {
      if (instance.classAttribute().isNumeric()) {
          throw new UnsupportedClassTypeException("SemiSupDecorate can't handle a numeric class!");
      }
      double[] votes = new double[instance.numClasses()];

      int member = 0;
      while (member < m_Committee.size()) {
          Classifier voter = (Classifier) m_Committee.get(member++);
          if (!(voter instanceof DistributionClassifier)) {
              //hard vote for the predicted class
              votes[(int) voter.classifyInstance(instance)]++;
              continue;
          }
          double[] memberProbs = ((DistributionClassifier) voter).distributionForInstance(instance);
          for (int c = 0; c < memberProbs.length; c++) {
              votes[c] += memberProbs[c];
          }
      }

      if (!Utils.eq(Utils.sum(votes), 0)) {
          Utils.normalize(votes);
      }
      return votes;
  }
    
    /**
     * Returns description of the SemiSupDecorate classifier: the textual
     * form of every committee member followed by the committee size, or a
     * short notice when no model has been built.
     *
     * @return description of the SemiSupDecorate classifier as a string
     */
    public String toString() {
        if (m_Committee != null) {
            StringBuffer out = new StringBuffer("SemiSupDecorate base classifiers: \n\n");
            int member = 0;
            while (member < m_Committee.size()) {
                Classifier current = (Classifier) m_Committee.get(member);
                out.append(current.toString()).append("\n\n");
                member++;
            }
            out.append("Number of classifier in the ensemble: ");
            out.append(m_Committee.size());
            out.append("\n");
            return out.toString();
        }
        return "SemiSupDecorate: No model built yet.";
    }
    
    /**
     * Main method for testing this class. Evaluates a fresh
     * SemiSupDecorate with the given options and prints the result; if
     * that fails, the exception's message is printed to standard error.
     *
     * @param argv the options
     */
    public static void main(String [] argv) {
        try {
            SemiSupDecorate learner = new SemiSupDecorate();
            String report = Evaluation.evaluateModel(learner, argv);
            System.out.println(report);
        } catch (Exception failure) {
            System.err.println(failure.getMessage());
        }
    }
}