/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* SemiSupDecorate.java
* WARNING: UNDER DEVELOPMENT
* Copyright (C) 2004 Prem Melville
*
*/
package weka.classifiers.meta;
import weka.classifiers.*;
import java.util.*;
import weka.core.*;
import weka.experiment.*;
/**
* SemiSupDecorate is an attempt to exploit unlabeled data to improve Decorate.
*
* DECORATE is a meta-learner for building diverse ensembles of
* classifiers by using specially constructed artificial training
* examples. Comprehensive experiments have demonstrated that this
* technique is consistently more accurate than the base classifier,
* Bagging and Random Forests. Decorate also obtains higher accuracy than
* Boosting on small training sets, and achieves comparable performance
* on larger training sets. For more
* details see: <p>
*
* Prem Melville and Raymond J. Mooney. <i>Constructing diverse
* classifier ensembles using artificial training examples.</i>
* Proceedings of the Eighteenth International Joint Conference on
* Artificial Intelligence 2003.<p>
*
* Prem Melville and Raymond J. Mooney. <i>Creating diversity in ensembles using artificial data.</i>
* Journal of Information Fusion.<BR><BR>
*
* Valid options are:<p>
*
* -D <br>
* Turn on debugging output.<p>
*
* -W classname <br>
* Specify the full class name of a weak classifier as the basis for
* SemiSupDecorate (default weka.classifiers.trees.j48.J48()).<p>
*
* -I num <br>
* Specify the desired size of the committee (default 15). <p>
*
* -M iterations <br>
* Set the maximum number of SemiSupDecorate iterations (default 50). <p>
*
* -S seed <br>
* Seed for random number generator. (default 0).<p>
*
* -R factor <br>
* Factor that determines number of artificial examples to generate. <p>
*
* Options after -- are passed to the designated classifier.<p>
*
* @author Prem Melville (melville@cs.utexas.edu)
* @version $Revision: 1.4 $ */
public class SemiSupDecorate extends EnsembleClassifier implements OptionHandler,SemiSupClassifier{
/** Weight of unlabeled examples versus labeled examples */
protected double m_Lambda = 1.0;
/** Set of unlabeled examples */
protected Instances m_Unlabeled;
/** Set to true to use artificial data */
protected boolean m_UseArtificial = true;
/** Set to true to use unlabeled data */
protected boolean m_UseUnlabeled = true;
//When set to false this should be equivalent to Decorate
/** Types of unlabeled usage */
static final int ALL = 0,
IGNORE_LOW = 1,
IGNORE_HIGH = 2,
FLIP_LOW = 3;
/** Type of usage of unlabeled examples */
protected int m_UnlabeledMethod = ALL;
/** Confidence threshold for labeling unlabeled examples */
protected double m_Threshold = 0.9;
/** Set to true to get debugging output. */
protected boolean m_Debug = false;
/** The model base classifier to use. */
protected Classifier m_Classifier = new weka.classifiers.trees.j48.J48();
/** Vector of classifiers that make up the committee/ensemble. */
protected Vector m_Committee = null;
/** The desired ensemble size. */
protected int m_DesiredSize = 15;
/** The maximum number of SemiSupDecorate iterations to run. */
protected int m_NumIterations = 50;
/** The seed for random number generation. */
protected int m_Seed = 0;
/** Amount of artificial/random instances to use - specified as a
fraction of the training data size. */
protected double m_ArtSize = 1.0 ;
/** The random number generator. */
protected Random m_Random = new Random(0);
/** Attribute statistics - used for generating artificial examples. */
protected Vector m_AttributeStats = null;
public void setLambda (double v) {
m_Lambda = v;
}
public double getLambda () {
return m_Lambda;
}
public String lambdaTipText() {
return "set weight of unlabeled examples vs. labeled";
}
/**
* Get the value of UseArtificial.
* @return value of UseArtificial.
*/
public boolean getUseArtificial() {
return m_UseArtificial;
}
/**
* Set the value of UseArtificial.
* @param v Value to assign to UseArtificial.
*/
public void setUseArtificial(boolean v) {
m_UseArtificial = v;
}
/**
* Get the value of Threshold.
* @return value of Threshold.
*/
public double getThreshold() {
return m_Threshold;
}
/**
* Set the value of Threshold.
* @param v Value to assign to Threshold.
*/
public void setThreshold(double v) {
m_Threshold = v;
}
/**
* Get the value of UnlabeledMethod.
* @return value of UnlabeledMethod.
*/
public int getUnlabeledMethod() {
return m_UnlabeledMethod;
}
/**
* Set the value of UnlabeledMethod.
* @param v Value to assign to UnlabeledMethod.
*/
public void setUnlabeledMethod(int v) {
m_UnlabeledMethod = v;
}
/**
* Get the value of UseUnlabeled.
* @return value of UseUnlabeled.
*/
public boolean getUseUnlabeled() {
return m_UseUnlabeled;
}
/**
* Set the value of UseUnlabeled.
* @param v Value to assign to UseUnlabeled.
*/
public void setUseUnlabeled(boolean v) {
m_UseUnlabeled = v;
}
/**
* Returns an enumeration describing the available options
*
* @return an enumeration of all the available options
*/
public Enumeration listOptions() {
Vector newVector = new Vector(8);
newVector.addElement(new Option(
"\tTurn on debugging output.",
"D", 0, "-D"));
newVector.addElement(new Option(
"\tDesired size of ensemble.\n"
+ "\t(default 15)",
"I", 1, "-I"));
newVector.addElement(new Option(
"\tMaximum number of SemiSupDecorate iterations.\n"
+ "\t(default 50)",
"M", 1, "-M"));
newVector.addElement(new Option(
"\tFull name of base classifier.\n"
+ "\t(default weka.classifiers.trees.j48.J48)",
"W", 1, "-W"));
newVector.addElement(new Option(
"\tSeed for random number generator.\n"
+"\tIf set to -1, use a random seed.\n"
+ "\t(default 0)",
"S", 1, "-S"));
newVector.addElement(new Option(
"\tFactor that determines number of artificial examples to generate.\n"
+"\tSpecified proportional to training set size.\n"
+ "\t(default 1.0)",
"R", 1, "-R"));
if ((m_Classifier != null) &&
(m_Classifier instanceof OptionHandler)) {
newVector.addElement(new Option(
"",
"", 0, "\nOptions specific to classifier "
+ m_Classifier.getClass().getName() + ":"));
Enumeration enum = ((OptionHandler)m_Classifier).listOptions();
while (enum.hasMoreElements()) {
newVector.addElement(enum.nextElement());
}
}
return newVector.elements();
}
/**
* Parses a given list of options. Valid options are:<p>
*
* -D <br>
* Turn on debugging output.<p>
*
* -W classname <br>
* Specify the full class name of a weak classifier as the basis for
* SemiSupDecorate (required).<p>
*
* -I num <br>
* Specify the desired size of the committee (default 15). <p>
*
* -M iterations <br>
* Set the maximum number of SemiSupDecorate iterations (default 50). <p>
*
* -S seed <br>
* Seed for random number generator. (default 0).<p>
*
* -R factor <br>
* Factor that determines number of artificial examples to generate. <p>
*
* Options after -- are passed to the designated classifier.<p>
*
* @param options the list of options as an array of strings
* @exception Exception if an option is not supported
*/
public void setOptions(String[] options) throws Exception {
setUseUnlabeled(Utils.getFlag('U', options));
setUseArtificial(Utils.getFlag('A', options));
setDebug(Utils.getFlag('D', options));
String unlabeledMethod = Utils.getOption('Z', options);
if (unlabeledMethod.length() != 0) {
setUnlabeledMethod(Integer.parseInt(unlabeledMethod));
} else {
setUnlabeledMethod(0);
}
String threshold = Utils.getOption('T', options);
if (threshold.length() != 0) {
setThreshold(Double.parseDouble(threshold));
} else {
setThreshold(0.9);
}
String desiredSize = Utils.getOption('I', options);
if (desiredSize.length() != 0) {
setDesiredSize(Integer.parseInt(desiredSize));
} else {
setDesiredSize(15);
}
String maxIterations = Utils.getOption('M', options);
if (maxIterations.length() != 0) {
setNumIterations(Integer.parseInt(maxIterations));
} else {
setNumIterations(50);
}
String seed = Utils.getOption('S', options);
if (seed.length() != 0) {
setSeed(Integer.parseInt(seed));
} else {
setSeed(0);
}
String artSize = Utils.getOption('R', options);
if (artSize.length() != 0) {
setArtificialSize(Double.parseDouble(artSize));
} else {
setArtificialSize(1.0);
}
String lambda = Utils.getOption('L', options);
if (lambda.length() != 0) {
setLambda(Double.parseDouble(lambda));
}
String classifierName = Utils.getOption('W', options);
if (classifierName.length() == 0) {
throw new Exception("A classifier must be specified with"
+ " the -W option.");
}
setClassifier(Classifier.forName(classifierName,
Utils.partitionOptions(options)));
}
/**
* Gets the current settings of the Classifier.
*
* @return an array of strings suitable for passing to setOptions
*/
public String [] getOptions() {
String [] classifierOptions = new String [0];
if ((m_Classifier != null) &&
(m_Classifier instanceof OptionHandler)) {
classifierOptions = ((OptionHandler)m_Classifier).getOptions();
}
String [] options = new String [classifierOptions.length + 19];
int current = 0;
if (getDebug()) {
options[current++] = "-D";
}
if (getUseUnlabeled()) {
options[current++] = "-U";
}
if (getUseArtificial()) {
options[current++] = "-A";
}
options[current++] = "-T"; options[current++] = "" + getThreshold();
options[current++] = "-Z"; options[current++] = "" + getUnlabeledMethod();
options[current++] = "-S"; options[current++] = "" + getSeed();
options[current++] = "-I"; options[current++] = "" + getDesiredSize();
options[current++] = "-M"; options[current++] = "" + getNumIterations();
options[current++] = "-R"; options[current++] = "" + getArtificialSize();
options[current++] = "-L"; options[current++] = "" + getLambda();
if (getClassifier() != null) {
options[current++] = "-W";
options[current++] = getClassifier().getClass().getName();
}
options[current++] = "--";
System.arraycopy(classifierOptions, 0, options, current,
classifierOptions.length);
current += classifierOptions.length;
while (current < options.length) {
options[current++] = "";
}
return options;
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String desiredSizeTipText() {
return "the desired number of member classifiers in the SemiSupDecorate ensemble. SemiSupDecorate may terminate "
+"before this size is reached (depending on the value of numIterations). "
+"Larger ensemble sizes usually lead to more accurate models, but increases "
+"training time and model complexity.";
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String numIterationsTipText() {
return "the maximum number of SemiSupDecorate iterations to run. Each iteration generates a classifier, "
+"but does not necessarily add it to the ensemble. SemiSupDecorate stops when the desired ensemble "
+"size is reached. This parameter should be greater than "
+"equal to the desiredSize. If the desiredSize is not being reached it may help to "
+"increase this value.";
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String artificialSizeTipText() {
return "determines the number of artificial examples to use during training. Specified as "
+"a proportion of the training data. Higher values can increase ensemble diversity.";
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
*/
public String seedTipText() {
return "seed for random number generator used for creating artificial data."
+" Set to -1 to use a random seed.";
}
/**
* Returns the tip text for this property
* @return tip text for this property suitable for
* displaying in the explorer/experimenter gui
b */
public String classifierTipText() {
return "the desired base learner for the ensemble.";
}
/**
* Returns a string describing classifier
* @return a description suitable for
* displaying in the explorer/experimenter gui
*/
public String globalInfo() {
return "DECORATE is a meta-learner for building diverse ensembles of "
+"classifiers by using specially constructed artificial training "
+"examples. Comprehensive experiments have demonstrated that this "
+"technique is consistently more accurate than the base classifier, Bagging and Random Forests."
+"SemiSupDecorate also obtains higher accuracy than Boosting on small training sets, and achieves "
+"comparable performance on larger training sets. "
+"For more details see: P. Melville & R. J. Mooney. Constructing diverse classifier ensembles "
+"using artificial training examples (IJCAI 2003).\n"
+"P. Melville & R. J. Mooney. Creating diversity in ensembles using artificial data (submitted).";
}
/**
* Set debugging mode
*
* @param debug true if debug output should be printed
*/
public void setDebug(boolean debug) {
m_Debug = debug;
}
/**
* Get whether debugging is turned on
*
* @return true if debugging output is on
*/
public boolean getDebug() {
return m_Debug;
}
/**
* Set the base classifier for SemiSupDecorate.
*
* @param newClassifier the Classifier to use.
*/
public void setClassifier(Classifier newClassifier) {
m_Classifier = newClassifier;
}
/**
* Get the classifier used as the base classifier
*
* @return the classifier used as the classifier
*/
public Classifier getClassifier() {
return m_Classifier;
}
/**
* Factor that determines number of artificial examples to generate.
*
* @return factor that determines number of artificial examples to generate
*/
public double getArtificialSize() {
return m_ArtSize;
}
/**
* Sets factor that determines number of artificial examples to generate.
*
* @param newwArtSize factor that determines number of artificial examples to generate
*/
public void setArtificialSize(double newArtSize) {
m_ArtSize = newArtSize;
}
/**
* Gets the desired size of the committee.
*
* @return the desired size of the committee
*/
public int getDesiredSize() {
return m_DesiredSize;
}
/**
* Sets the desired size of the committee.
*
* @param newDesiredSize the desired size of the committee
*/
public void setDesiredSize(int newDesiredSize) {
m_DesiredSize = newDesiredSize;
}
/**
* Sets the max number of SemiSupDecorate iterations to run.
*
* @param numIterations max number of SemiSupDecorate iterations to run
*/
public void setNumIterations(int numIterations) {
m_NumIterations = numIterations;
}
/**
* Gets the max number of SemiSupDecorate iterations to run.
*
* @return the max number of SemiSupDecorate iterations to run
*/
public int getNumIterations() {
return m_NumIterations;
}
/**
* Set the seed for random number generator.
*
* @param seed the random number seed
*/
public void setSeed(int seed) {
m_Seed = seed;
}
/**
* Gets the seed for the random number generator.
*
* @return the seed for the random number generator
*/
public int getSeed() {
return m_Seed;
}
/**
* Provide unlabeled data to the classifier.
* @unlabeled the unlabeled Instances
*/
public void setUnlabeled(Instances unlabeled){
m_Unlabeled = new Instances(unlabeled);//make local copy of unlabeled data
//Reweight unlabeled instances if necessary
if (m_Lambda != 1.0)
weightInstances(m_Unlabeled, m_Lambda);
}
/** Weighted all given instances with given weight */
protected void weightInstances (Instances insts, double weight) {
Enumeration enumInsts = insts.enumerateInstances();
while (enumInsts.hasMoreElements()) {
Instance instance = (Instance) enumInsts.nextElement();
instance.setWeight(weight);
}
}
/**
* Build SemiSupDecorate classifier
*
* @param data the training data to be used for generating the classifier
* @exception Exception if the classifier could not be built successfully
*/
public void buildClassifier(Instances data) throws Exception {
if(m_Classifier == null) {
throw new Exception("A base classifier has not been specified!");
}
if(data.checkForStringAttributes()) {
throw new UnsupportedAttributeTypeException("Cannot handle string attributes!");
}
if(data.classAttribute().isNumeric()) {
throw new UnsupportedClassTypeException("SemiSupDecorate can't handle a numeric class!");
}
if(m_NumIterations < m_DesiredSize)
throw new Exception("Max number of iterations must be >= desired ensemble size!");
//initialize random number generator
if(m_Seed==-1) m_Random = new Random();
else m_Random = new Random(m_Seed);
//initialize ensemble wts to be equal
m_EnsembleWts = new double [m_DesiredSize];
for(int j=0; j<m_DesiredSize; j++)
m_EnsembleWts[j] = 1.0;
initMeasures();
int numUnlabeledUsed = 0;
int i = 1;//current committee size
int numTrials = 1;//number of SemiSupDecorate iterations
Instances divData = new Instances(data);//local copy of data - diversity data
divData.deleteWithMissingClass();
Instances artData = null;//artificial data
//compute number of artficial instances to add at each iteration
int artSize = (int) (Math.abs(m_ArtSize)*divData.numInstances());
if(artSize==0) artSize=1;//atleast add one random example
computeStats(data);//Compute training data stats for creating artificial examples
//initialize new committee
m_Committee = new Vector();
Classifier newClassifier = m_Classifier;
newClassifier.buildClassifier(divData);
m_Committee.add(newClassifier);
double eComm = computeError(divData);//compute ensemble error
if(m_Debug) System.out.println("Initialize:\tClassifier "+i+" added to ensemble. Ensemble error = "+eComm);
//repeat till desired committee size is reached OR the max number of iterations is exceeded
while(i<m_DesiredSize && numTrials<m_NumIterations){
if(m_UseArtificial){
//Generate artificial training examples
artData = generateArtificialData(artSize, data);
//Label artificial examples
labelData(artData);
//Add new artificial data
addInstances(divData, artData);
}
//Add unlabeled data
if(m_UseUnlabeled) numUnlabeledUsed = addUnlabeled(divData);
//Build new classifier
Classifier tmp[] = Classifier.makeCopies(m_Classifier,1);
newClassifier = tmp[0];
newClassifier.buildClassifier(divData);
//Remove unlabeled data
if(m_UseUnlabeled) removeInstances(divData, numUnlabeledUsed);
//Remove all the artificial data
if(m_UseArtificial) removeInstances(divData, artSize);
assert (divData.numInstances()==data.numInstances()) : "Diversity data error!";
//Test if the new classifier should be added to the ensemble
m_Committee.add(newClassifier);//add new classifier to current committee
double currError = computeError(divData);
if(currError <= eComm){//adding the new member did not increase the error
i++;
eComm = currError;
if(m_Debug) System.out.println("Iteration: "+(1+numTrials)+"\tClassifier "+i+" added to ensemble. Ensemble error = "+eComm);
}else{//reject the current classifier because it increased the ensemble error
m_Committee.removeElementAt(m_Committee.size()-1);//pop the last member
}
numTrials++;
}
}
/**
* Add unlabeled data to training set.
* @param divData diversity data to add to.
* @return number of unlabeled examples used.
*/
protected int addUnlabeled(Instances divData) throws Exception{
//set of unlabeled examples eventually used (some may be ignored).
Instances unlabeledUsed = new Instances(divData);
switch (m_UnlabeledMethod){
case ALL:
unlabeledUsed = labelAll(m_Unlabeled);
break;
case IGNORE_LOW:
unlabeledUsed = labelIgnoreLow(m_Unlabeled, unlabeledUsed);
break;
case IGNORE_HIGH:
unlabeledUsed = labelIgnoreHigh(m_Unlabeled, unlabeledUsed);
break;
case FLIP_LOW:
unlabeledUsed = labelFlipLow(m_Unlabeled);
break;
default:
System.err.println("Unrecognized unlabeled method selected!");
}
System.out.println("Unlabeled size = "+m_Unlabeled.numInstances()+" Used size = "+unlabeledUsed.numInstances());
addInstances(divData, unlabeledUsed);
return unlabeledUsed.numInstances();
}
/** Label examples as predicted by the current ensemble. */
protected Instances labelAll(Instances instances) throws Exception {
Instance curr;
for(int i=0; i<instances.numInstances(); i++){
curr = instances.instance(i);
curr.setClassValue(classifyInstance(curr));
}
return m_Unlabeled;
}
/**
* Label high-confidence examples with the ensemble's prediction.
* Ignore low-confidence examples.
*/
protected Instances labelIgnoreLow(Instances instances, Instances used) throws Exception {
Instance curr;
double []probs;
int label;
double highestProb;
for(int i=0; i<instances.numInstances(); i++){
curr = instances.instance(i);
//compute the class membership probs predicted by the current ensemble
probs = distributionForInstance(curr);
label = (int) classifyInstance(curr);
highestProb = probs[label];
if(highestProb >= m_Threshold){
curr.setClassValue(label);
used.add(curr);
}
}
return used;
}
/**
* Label low-confidence examples with inverse of ensemble's prediction.
* Ignore high-confidence examples.
*/
protected Instances labelIgnoreHigh(Instances instances, Instances used) throws Exception {
Instance curr;
double []probs;
int label;
double highestProb;
for(int i=0; i<instances.numInstances(); i++){
curr = instances.instance(i);
//compute the class membership probs predicted by the current ensemble
probs = distributionForInstance(curr);
label = (int) classifyInstance(curr);
highestProb = probs[label];
if(highestProb < m_Threshold){
curr.setClassValue(inverseLabel(probs));
used.add(curr);
}
}
return used;
}
/**
* Label low-confidence examples with inverse of ensemble's prediction.
* Use ensemble's prediction for high-confidence examples.
*/
protected Instances labelFlipLow(Instances instances) throws Exception {
Instance curr;
double []probs;
int label;
double highestProb;
int a=0, b=0;
for(int i=0; i<instances.numInstances(); i++){
curr = instances.instance(i);
//compute the class membership probs predicted by the current ensemble
probs = distributionForInstance(curr);
label = (int) classifyInstance(curr);
highestProb = probs[label];
if(highestProb >= m_Threshold){
curr.setClassValue(label);
a++;
}else{
curr.setClassValue(inverseLabel(probs));
b++;
}
}
System.out.println("As is: "+a+"\tFlipped: "+b);
return m_Unlabeled;
}
//Helper method to print arrays
protected void printArray(double []array){
for(int i=0; i<array.length; i++)
System.out.print(array[i]+" ");
System.out.println();
}
/** Returns class predictions of each ensemble member */
public double []getEnsemblePredictions(Instance instance) throws Exception{
double preds[] = new double [m_Committee.size()];
for(int i=0; i<m_Committee.size(); i++)
preds[i] = ((Classifier) m_Committee.get(i)).classifyInstance(instance);
return preds;
}
/**
* Returns vote weights of ensemble members.
*
* @return vote weights of ensemble members
*/
public double []getEnsembleWts(){
return m_EnsembleWts;
}
/** Returns size of ensemble */
public double getEnsembleSize(){
return m_Committee.size();
}
/**
* Compute and store statistics required for generating artificial data.
*
* @param data training instances
* @exception Exception if statistics could not be calculated successfully
*/
protected void computeStats(Instances data) throws Exception{
int numAttributes = data.numAttributes();
m_AttributeStats = new Vector(numAttributes);//use to map attributes to their stats
for(int j=0; j<numAttributes; j++){
if(data.attribute(j).isNominal()){
//Compute the probability of occurence of each distinct value
int []nomCounts = (data.attributeStats(j)).nominalCounts;
double []counts = new double[nomCounts.length];
if(counts.length < 2) throw new Exception("Nominal attribute has less than two distinct values!");
//Perform Laplace smoothing
for(int i=0; i<counts.length; i++)
counts[i] = nomCounts[i] + 1;
Utils.normalize(counts);
double []stats = new double[counts.length - 1];
stats[0] = counts[0];
//Calculate cumulative probabilities
for(int i=1; i<stats.length; i++)
stats[i] = stats[i-1] + counts[i];
m_AttributeStats.add(j,stats);
}else if(data.attribute(j).isNumeric()){
//Get mean and standard deviation from the training data
double []stats = new double[2];
stats[0] = data.meanOrMode(j);
stats[1] = Math.sqrt(data.variance(j));
m_AttributeStats.add(j,stats);
}else System.err.println("SemiSupDecorate can only handle numeric and nominal values.");
}
}
/**
* Generate artificial training examples.
* @param artSize size of examples set to create
* @param data training data
* @return the set of unlabeled artificial examples
*/
protected Instances generateArtificialData(int artSize, Instances data){
int numAttributes = data.numAttributes();
Instances artData = new Instances(data, artSize);
double []att;
Instance artInstance;
for(int i=0; i<artSize; i++){
att = new double[numAttributes];
for(int j=0; j<numAttributes; j++){
if(data.attribute(j).isNominal()){
//Select nominal value based on the frequency of occurence in the training data
double []stats = (double [])m_AttributeStats.get(j);
att[j] = (double) selectIndexProbabilistically(stats);
}
else if(data.attribute(j).isNumeric()){
//Generate numeric value from the Guassian distribution
//defined by the mean and std dev of the attribute
double []stats = (double [])m_AttributeStats.get(j);
att[j] = (m_Random.nextGaussian()*stats[1])+stats[0];
}else System.err.println("SemiSupDecorate can only handle numeric and nominal values.");
}
artInstance = new Instance(1.0, att);
artData.add(artInstance);
}
return artData;
}
/**
* Labels the artificially generated data.
*
* @param artData the artificially generated instances
* @exception Exception if instances cannot be labeled successfully
*/
protected void labelData(Instances artData) throws Exception {
Instance curr;
double []probs;
for(int i=0; i<artData.numInstances(); i++){
curr = artData.instance(i);
//compute the class membership probs predicted by the current ensemble
probs = distributionForInstance(curr);
//select class label inversely proportional to the ensemble predictions
curr.setClassValue(inverseLabel(probs));
}
}
/**
* Select class label such that the probability of selection is
* inversely proportional to the ensemble's predictions.
*
* @param probs class membership probabilities of instance
* @return index of class label selected
* @exception Exception if instances cannot be labeled successfully
*/
protected int inverseLabel(double []probs) throws Exception{
double []invProbs = new double[probs.length];
//Produce probability distribution inversely proportional to the given
for(int i=0; i<probs.length; i++){
if(probs[i]==0){
invProbs[i] = Double.MAX_VALUE/probs.length;
//Account for probability values of 0 - to avoid divide-by-zero errors
//Divide by probs.length to make sure normalizing works properly
}else{
invProbs[i] = 1.0 / probs[i];
}
}
Utils.normalize(invProbs);
double []cdf = new double[invProbs.length];
//Compute cumulative probabilities
cdf[0] = invProbs[0];
for(int i=1; i<invProbs.length; i++){
cdf[i] = invProbs[i]+cdf[i-1];
}
if(Double.isNaN(cdf[invProbs.length-1]))
System.err.println("Cumulative class membership probability is NaN!");
return selectIndexProbabilistically(cdf);
}
/**
* Given cumulative probabilities select a nominal attribute value index
*
* @param cdf array of cumulative probabilities
* @return index of attribute selected based on the probability distribution
*/
protected int selectIndexProbabilistically(double []cdf){
double rnd = m_Random.nextDouble();
int index = 0;
while(index < cdf.length && rnd > cdf[index]){
index++;
}
return index;
}
/**
* Removes a specified number of instances from the given set of instances.
*
* @param data given instances
* @param numRemove number of instances to delete from the given instances
*/
protected void removeInstances(Instances data, int numRemove){
int num = data.numInstances();
for(int i=num - 1; i>num - 1 - numRemove;i--){
data.delete(i);
}
}
/**
* Add new instances to the given set of instances.
*
* @param data given instances
* @param newData set of instances to add to given instances
*/
protected void addInstances(Instances data, Instances newData){
for(int i=0; i<newData.numInstances(); i++)
data.add(newData.instance(i));
}
/**
* Computes the error in classification on the given data.
*
* @param data the instances to be classified
* @return classification error
* @exception Exception if error can not be computed successfully
*/
protected double computeError(Instances data) throws Exception {
double error = 0.0;
int numInstances = data.numInstances();
Instance curr;
for(int i=0; i<numInstances; i++){
curr = data.instance(i);
//Check if the instance has been misclassified
if(curr.classValue() != ((int) classifyInstance(curr))) error++;
}
return (error/numInstances);
}
/**
* Calculates the class membership probabilities for the given test instance.
*
* @param instance the instance to be classified
* @return predicted class probability distribution
* @exception Exception if distribution can't be computed successfully
*/
public double[] distributionForInstance(Instance instance) throws Exception {
if (instance.classAttribute().isNumeric()) {
throw new UnsupportedClassTypeException("SemiSupDecorate can't handle a numeric class!");
}
double [] sums = new double [instance.numClasses()], newProbs;
Classifier curr;
for (int i = 0; i < m_Committee.size(); i++) {
curr = (Classifier) m_Committee.get(i);
if (curr instanceof DistributionClassifier) {
newProbs = ((DistributionClassifier)curr).distributionForInstance(instance);
for (int j = 0; j < newProbs.length; j++)
sums[j] += newProbs[j];
} else {
sums[(int)curr.classifyInstance(instance)]++;
}
}
if (Utils.eq(Utils.sum(sums), 0)) {
return sums;
} else {
Utils.normalize(sums);
return sums;
}
}
/**
* Returns description of the SemiSupDecorate classifier.
*
* @return description of the SemiSupDecorate classifier as a string
*/
public String toString() {
if (m_Committee == null) {
return "SemiSupDecorate: No model built yet.";
}
StringBuffer text = new StringBuffer();
text.append("SemiSupDecorate base classifiers: \n\n");
for (int i = 0; i < m_Committee.size(); i++)
text.append(((Classifier) m_Committee.get(i)).toString() + "\n\n");
text.append("Number of classifier in the ensemble: "+m_Committee.size()+"\n");
return text.toString();
}
/**
* Main method for testing this class.
*
* @param argv the options
*/
public static void main(String [] argv) {
try {
System.out.println(Evaluation.evaluateModel(new SemiSupDecorate(), argv));
} catch (Exception e) {
System.err.println(e.getMessage());
}
}
}