/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/*
CoForest.java
Isaac Triguero Velazquez.
Created by Isaac Triguero Velazquez 4/3/2011
Copyright (c) 2008 __MyCompanyName__. All rights reserved.
*/
package keel.Algorithms.Semi_Supervised_Learning.CoForest;
import keel.Algorithms.Semi_Supervised_Learning.Basic.C45.*;
import keel.Algorithms.Semi_Supervised_Learning.Basic.HandlerNB;
import keel.Algorithms.Semi_Supervised_Learning.Basic.HandlerSMO;
import keel.Algorithms.Semi_Supervised_Learning.Basic.PrototypeSet;
import keel.Algorithms.Semi_Supervised_Learning.Basic.PrototypeGenerator;
import keel.Algorithms.Semi_Supervised_Learning.Basic.Prototype;
import keel.Algorithms.Semi_Supervised_Learning.Basic.PrototypeGenerationAlgorithm;
import keel.Algorithms.Semi_Supervised_Learning.*;
import java.util.*;
import keel.Algorithms.Semi_Supervised_Learning.utilities.*;
import keel.Algorithms.Semi_Supervised_Learning.utilities.KNN.*;
import keel.Dataset.Attribute;
import keel.Dataset.Attributes;
import keel.Dataset.InstanceAttributes;
import keel.Dataset.InstanceSet;
import org.core.*;
import java.util.StringTokenizer;
/**
 * This class implements the Co-Forest algorithm (semi-supervised co-training
 * with a random forest), using random trees as the base classifiers.
 * @author triguero
 *
 */
public class CoForestGenerator extends PrototypeGenerator {
/*Own parameters of the algorithm*/
// Declared but never read in this class.
private int numberOfselectedExamples;
// Declared but never read in this class.
private int MaxIter;
// Number of random-tree members in the ensemble (read from Parameters).
private int num_classifier;
// Confidence threshold a pseudo-label must exceed to be accepted (read from Parameters).
private double threshold = 0.75;
// Size of the originally labeled set; fixed at the start of applyAlgorithm().
private int m_numOriginalLabeledInsts = 0;
/** Number of features to consider in random feature selection.
If less than 1 will use int(logM+1) ) */
protected int m_numFeatures = 0;
/** Final number of features that were considered in last build. */
protected int m_KValue = 0;
// Allocated in the constructor but never filled or read in this class.
private int [][] predictions;
// Allocated in the constructor but never filled or read in this class.
private double [][][] probabilities;
// private String final_classifier;
protected int numberOfPrototypes; // Particle size is the percentage
// Number of real classes; the extra class value numberOfClass marks "unlabeled" prototypes.
protected int numberOfClass;
/** Parameters of the initial reduction process. */
private String[] paramsOfInitialReducction = null;
// The ensemble of random trees built by applyAlgorithm().
RandomTree [] m_classifiers;
/**
 * Builds a new CoForestGenerator from a training set alone.
 * Only the training set is actually used; the remaining parameters are
 * accepted for interface compatibility with other KEEL generators and are
 * ignored by this constructor.
 *
 * @param _trainingDataSet Original prototype set to be reduced.
 * @param neigbors ignored.
 * @param poblacion ignored.
 * @param perc ignored.
 * @param iteraciones ignored.
 * @param c1 ignored.
 * @param c2 ignored.
 * @param vmax ignored.
 * @param wstart ignored.
 * @param wend ignored.
 */
public CoForestGenerator(PrototypeSet _trainingDataSet, int neigbors,int poblacion, int perc, int iteraciones, double c1, double c2, double vmax, double wstart, double wend)
{
super(_trainingDataSet);
algorithmName="CoForest";
}
/**
 * Builds a new CoForestGenerator for semi-supervised learning.
 *
 * @param t labeled training prototype set.
 * @param unlabeled unlabeled prototype set used during co-training.
 * @param test test prototype set.
 * @param parameters algorithm parameters; reads, in order, the number of
 *        random-tree classifiers and the confidence threshold.
 */
public CoForestGenerator(PrototypeSet t, PrototypeSet unlabeled, PrototypeSet test, Parameters parameters)
{
    super(t, unlabeled, test, parameters);
    algorithmName = "CoForest";

    this.num_classifier = parameters.getNextAsInt();
    this.threshold = parameters.getNextAsDouble();
    //this.final_classifier = parameters.getNextAsString();

    // Size the per-classifier buffers from the configured ensemble size
    // (the original code hard-coded 6 and 3 here).
    this.predictions = new int[this.num_classifier][];
    this.probabilities = new double[this.num_classifier][][];

    // Last class value is reserved for "unknown" (unlabeled) prototypes.
    this.numberOfClass = trainingDataSet.getPosibleValuesOfOutput().size();
}
/**
 * Out-of-bag class distribution for one prototype: sums the vote
 * distributions of every ensemble member that did NOT see the instance in
 * its bootstrap bag, excluding one additional member, then normalizes.
 *
 * @param inst prototype to classify.
 * @param idxInst index of the prototype inside the labeled set.
 * @param inbags inbags[i][j] is true when instance j was in classifier i's bag.
 * @param idExcluded ensemble member to leave out of the vote.
 * @return normalized vote distribution (all zeros when nobody voted).
 * @throws Exception propagated from the base classifiers.
 */
private double[] outOfBagDistributionForInstanceExcluded(Prototype inst, int idxInst, boolean[][] inbags, int idExcluded) throws Exception
{
    double[] votes = new double[this.numberOfClass];

    for (int member = 0; member < this.num_classifier; member++) {
        // Skip the excluded tree and any tree that trained on this instance.
        if (member == idExcluded || inbags[member][idxInst]) {
            continue;
        }
        double[] memberDistr = m_classifiers[member].distributionForInstance(inst);
        if (memberDistr == null) {
            continue;
        }
        for (int c = 0; c < this.numberOfClass; c++) {
            votes[c] += memberDistr[c];
        }
    }

    // Normalize only when there is something to normalize.
    double total = 0;
    for (double v : votes) {
        total += v;
    }
    if (total != 0) {
        for (int c = 0; c < votes.length; c++) {
            votes[c] /= total;
        }
    }
    return votes;
}
/**
 * Estimates the out-of-bag error of classifier {@code id} over the
 * originally labeled instances, counting only predictions whose confidence
 * exceeds the threshold.
 *
 * @param data labeled prototype set (only the first m_numOriginalLabeledInsts entries are used).
 * @param inbags inbags[i][j] is true when instance j was in classifier i's bootstrap bag.
 * @param id index of the classifier whose error is being measured.
 * @return weighted error rate among confident predictions, or
 *         {@code Double.MAX_VALUE} when no prediction was confident. (The
 *         original code produced NaN via 0/0 in that case; MAX_VALUE keeps
 *         the caller's "err &lt; err_prime" test false, matching the NaN
 *         behavior while staying well-defined.)
 * @throws Exception propagated from the base classifiers.
 */
private double measureError(PrototypeSet data, boolean[][] inbags, int id) throws Exception
{
    double err = 0;
    double count = 0;

    for (int i = 0; i < data.size() && i < m_numOriginalLabeledInsts; i++) {
        Prototype inst = data.get(i);
        double[] distr = outOfBagDistributionForInstanceExcluded(inst, i, inbags, id);

        // argmax over the out-of-bag distribution.
        double maximo = Double.NEGATIVE_INFINITY;
        int claseMax = 0;
        for (int j = 0; j < distr.length; j++) {
            if (distr[j] > maximo) {
                maximo = distr[j];
                claseMax = j;
            }
        }

        // Only confident predictions contribute to the error estimate.
        if (maximo > this.threshold) {
            count += inst.getWeight();
            if (claseMax != inst.getOutput(0)) {
                err += inst.getWeight();
            }
        }
    }

    if (count == 0) {
        // No confident out-of-bag prediction: report a pessimal error so the
        // caller skips this classifier this round (original code divided by 0).
        return Double.MAX_VALUE;
    }
    return err / count;
}
/**
 * Resamples {@code data} with probability proportional to the instance
 * weights, recording which source instances were drawn.
 *
 * NOTE(review): unlike the Weka routine this appears ported from, newData
 * starts as a FULL CLONE of data rather than an empty set, so the drawn
 * copies are appended after the cloned originals; and setWeight(1) below is
 * applied to newData.get(k), which indexes into the cloned prefix rather
 * than the just-appended copy. Both look like porting slips -- verify
 * against callers before changing, since the algorithm's results depend on
 * this exact behavior.
 *
 * @param data Instances -- the original data set
 * @param id of the classifier this bootstrap sample is built for (unused here)
 * @param sampled boolean[] -- the output parameter, indicating whether the instance is sampled
 * @return Instances
 */
public final PrototypeSet resampleWithWeights(PrototypeSet data, int id, boolean[] sampled)
{
// Snapshot the current instance weights.
double[] weights = new double[data.size()];
for (int i = 0; i < weights.length; i++) {
weights[i] = data.get(i).getWeight();
}
PrototypeSet newData = new PrototypeSet(data.clone());
if (data.size() == 0) {
return newData;
}
// Build a sorted sequence of random thresholds, rescaled to total weight.
double[] probabilities = new double[data.size()];
double sumProbs = 0, sumOfWeights=0;
for(int i=0; i<weights.length;i++){
sumOfWeights+=weights[i];
}
for (int i = 0; i < data.size(); i++) {
sumProbs += Randomize.Rand();
probabilities[i] = sumProbs;
}
for (int i = 0; i < probabilities.length; i++) {
probabilities[i] /= (sumProbs / sumOfWeights);
}
// Make sure that rounding errors don't mess things up
probabilities[data.size() - 1] = sumOfWeights;
// Walk both cumulative sequences: draw instance l for every threshold k
// that falls inside l's cumulative-weight interval.
int k = 0; int l = 0;
sumProbs = 0;
while ((k < data.size() && (l < data.size()))) {
if (weights[l] < 0) {
throw new IllegalArgumentException("Weights have to be positive.");
}
sumProbs += weights[l];
while ((k < data.size()) &&
(probabilities[k] <= sumProbs)) {
newData.add(data.get(l));
sampled[l] = true;
newData.get(k).setWeight(1);
k++;
}
l++;
}
return newData;
}
/**
 * Combined class distribution of the whole ensemble except one member.
 *
 * @param inst prototype to classify.
 * @param idExcluded index of the ensemble member to leave out.
 * @return normalized vote distribution over the classes; all zeros when no
 *         member produced votes (the original divided by zero here, yielding NaN).
 * @throws Exception propagated from the base classifiers.
 */
private double[] distributionForInstanceExcluded(Prototype inst, int idExcluded) throws Exception
{
    double[] distr = new double[this.numberOfClass];
    for (int i = 0; i < this.num_classifier; i++) {
        if (i == idExcluded) {
            continue;
        }
        double[] d = m_classifiers[i].distributionForInstance(inst);
        // Guard against a null distribution, consistently with
        // outOfBagDistributionForInstanceExcluded.
        if (d != null) {
            for (int iClass = 0; iClass < this.numberOfClass; iClass++) {
                distr[iClass] += d[iClass];
            }
        }
    }

    // Normalize only when there is something to normalize (avoids 0/0 -> NaN).
    double sum = 0;
    for (int i = 0; i < distr.length; i++) {
        sum += distr[i];
    }
    if (sum != 0) {
        for (int i = 0; i < distr.length; i++) {
            distr[i] /= sum;
        }
    }
    return distr;
}
/**
 * Decides whether the concomitant ensemble H* (all members except
 * {@code idExcluded}) labels {@code inst} with high enough confidence.
 * When it does, the prototype is relabeled with the winning class and its
 * weight is set to that confidence.
 *
 * @param inst the prototype under consideration (modified on success).
 * @param idExcluded index of the ensemble member excluded from the vote.
 * @return true when the winning-class confidence exceeds the threshold.
 * @throws Exception propagated from the base classifiers.
 */
protected boolean isHighConfidence(Prototype inst, int idExcluded) throws Exception
{
    double[] votes = distributionForInstanceExcluded(inst, idExcluded);

    // Locate the winning class and its confidence.
    int best = 0;
    double bestScore = Double.MIN_VALUE;
    for (int c = 0; c < votes.length; c++) {
        if (votes[c] > bestScore) {
            bestScore = votes[c];
            best = c;
        }
    }

    if (bestScore <= this.threshold) {
        return false;
    }
    inst.setFirstOutput(best);   // assign the predicted label
    inst.setWeight(bestScore);   // weight the example by its confidence
    return true;
}
/**
 * Majority-vote prediction of the full ensemble for one prototype: sums
 * every classifier's class distribution, normalizes the result, and returns
 * the index of the highest-scoring class.
 *
 * @param inst prototype to classify.
 * @return predicted class index.
 * @throws Exception propagated from the base classifiers.
 */
public int votingRule(Prototype inst) throws Exception{
    double[] scores = new double[this.numberOfClass];

    // Accumulate each tree's vote distribution.
    for (int member = 0; member < this.num_classifier; member++) {
        double[] distr = m_classifiers[member].distributionForInstance(inst);
        if (distr != null) {
            for (int c = 0; c < scores.length; c++) {
                scores[c] += distr[c];
            }
        }
    }

    // Normalize the accumulated votes.
    double total = 0;
    for (double s : scores) {
        total += s;
    }
    for (int c = 0; c < scores.length; c++) {
        scores[c] /= total;
    }

    // Argmax over the normalized scores.
    int winner = 0;
    double best = 0;
    for (int c = 0; c < scores.length; c++) {
        if (c == 0 || scores[c] > best) {
            best = scores[c];
            winner = c;
        }
    }
    return winner;
}
/**
 * Applies the Co-Forest method: builds an ensemble of random trees on
 * bootstrap samples of the labeled data, then repeatedly augments each
 * tree's training set with confidently pseudo-labeled unlabeled prototypes
 * while the estimated error keeps decreasing. Finally labels the
 * transductive and test sets by ensemble voting.
 *
 * @return pair (labeled transductive set, labeled test set).
 * @throws Exception propagated from the base classifiers.
 */
public Pair<PrototypeSet, PrototypeSet> applyAlgorithm() throws Exception
{
    System.out.print("\nThe algorithm CoForest is starting...\n Computing...\n");

    PrototypeSet labeled, unlabeled;

    double[] err = new double[this.num_classifier];        // e_i
    double[] err_prime = new double[this.num_classifier];  // e'_i
    double[] s_prime = new double[this.num_classifier];    // l'_i
    boolean[][] inbags = new boolean[this.num_classifier][];

    // Split the training set: prototypes tagged with the extra "unknown"
    // class value form the unlabeled pool.
    labeled = new PrototypeSet(trainingDataSet.getAllDifferentFromClass(this.numberOfClass));
    unlabeled = new PrototypeSet(trainingDataSet.getFromClass(this.numberOfClass));

    for (int j = 0; j < labeled.size(); j++) {
        labeled.get(j).setIndex(j);
    }
    for (int j = 0; j < unlabeled.size(); j++) {
        unlabeled.get(j).setIndex(j);
    }

    // Give every unlabeled prototype a valid class value so the base
    // learners do not choke on the "unknown" marker.
    for (int p = 0; p < unlabeled.size(); p++) {
        unlabeled.get(p).setFirstOutput(0);
    }

    m_numOriginalLabeledInsts = labeled.size(); // from the original labeled data set

    // Random features per tree: int(log2 M) + 1 unless explicitly configured.
    m_KValue = m_numFeatures;
    if (m_KValue < 1) {
        m_KValue = (int) (Math.log(labeled.get(0).numberOfInputs()) / Math.log(2)) + 1;
    }

    m_classifiers = new RandomTree[this.num_classifier];
    for (int i = 0; i < this.num_classifier; i++) {
        m_classifiers[i] = new RandomTree();
        m_classifiers[i].setKValue(m_KValue);
    }

    PrototypeSet[] labeleds = new PrototypeSet[this.num_classifier];
    int[] randSeeds = new int[this.num_classifier];

    for (int i = 0; i < this.num_classifier; i++) {
        // Give each tree a distinct seed. The original code left randSeeds
        // all zeros, so every tree shared seed 0.
        randSeeds[i] = (int) (this.SEED + i);
        ((RandomTree) m_classifiers[i]).setSeed(randSeeds[i]);

        inbags[i] = new boolean[labeled.size()];
        labeleds[i] = resampleWithWeights(labeled, i, inbags[i]);
        m_classifiers[i].buildClassifier(labeleds[i]);

        err_prime[i] = 0.5; // e'_i <-- 0.5
        s_prime[i] = 0;     // l'_i <-- 0
    }

    PrototypeSet[] Li = null;
    boolean bChanged = true;

    /** repeat until none of the ensemble members changes */
    while (bChanged) {
        bChanged = false;
        boolean[] bUpdate = new boolean[this.num_classifier];
        Li = new PrototypeSet[this.num_classifier];

        for (int i = 0; i < this.num_classifier; i++) {
            err[i] = measureError(labeled, inbags, i);
            Li[i] = new PrototypeSet();

            /** if (e_i < e'_i) */
            if (err[i] < err_prime[i]) {
                if (s_prime[i] == 0) {
                    s_prime[i] = Math.min(unlabeled.sumOfWeights() / 10, 100);
                }

                /** Subsample U for each h_i */
                double weight = 0;
                unlabeled.randomize(this.SEED);
                int numWeightsAfterSubsample = (int) Math.ceil(err_prime[i] * s_prime[i] / err[i] - 1);
                for (int k = 0; k < unlabeled.size(); k++) {
                    weight += unlabeled.get(k).getWeight();
                    if (weight > numWeightsAfterSubsample) {
                        break;
                    }
                    Li[i].add((Prototype) unlabeled.get(k));
                }

                /** for every x in U': keep only the confident examples.
                 * Iterate all the way down to index 0 -- the original loop
                 * used "j > 0" and never confidence-checked element 0. */
                for (int j = Li[i].size() - 1; j >= 0; j--) {
                    Prototype curInst = Li[i].get(j);
                    if (!isHighConfidence(curInst, i)) { // label assigned inside
                        Li[i].remove(j);
                    }
                }

                // NOTE(review): the reference implementation compares against
                // Li[i].sumOfWeights() rather than size() here -- verify.
                if (s_prime[i] < Li[i].size()) {
                    if (err[i] * Li[i].sumOfWeights() < err_prime[i] * s_prime[i]) {
                        bUpdate[i] = true;
                    }
                }
            }
        } // end of for i

        // Rebuild the classifiers flagged for update on Li union labeled.
        RandomTree[] newClassifier = new RandomTree[this.num_classifier];
        for (int i = 0; i < this.num_classifier; i++) {
            newClassifier[i] = new RandomTree();
            if (bUpdate[i]) {
                double size = Li[i].sumOfWeights();
                bChanged = true;

                m_classifiers[i] = newClassifier[i];
                ((RandomTree) m_classifiers[i]).setSeed(randSeeds[i]);

                for (int j = 0; j < labeled.size(); j++) { // combine labeled and Li
                    Li[i].add(new Prototype(labeled.get(j)));
                }
                m_classifiers[i].buildClassifier(Li[i]);

                err_prime[i] = err[i];
                s_prime[i] = size;
            }
        }
    } // end of while

    // ---- testing phase: label transductive and test sets by voting ----
    PrototypeSet tranductive = new PrototypeSet(this.transductiveDataSet.clone());
    PrototypeSet test = new PrototypeSet(this.testDataSet.clone());

    int traPrediction[] = new int[tranductive.size()];
    int tstPrediction[] = new int[test.size()];
    int aciertoTrs = 0;
    int aciertoTst = 0;

    // Transductive phase.
    for (int i = 0; i < tranductive.size(); i++) {
        traPrediction[i] = this.votingRule(tranductive.get(i));
        if (tranductive.get(i).getOutput(0) == traPrediction[i]) {
            aciertoTrs++;
        }
        tranductive.get(i).setFirstOutput(traPrediction[i]);
    }

    // Test phase.
    for (int i = 0; i < test.size(); i++) {
        tstPrediction[i] = this.votingRule(test.get(i));
        if (test.get(i).getOutput(0) == tstPrediction[i]) {
            aciertoTst++;
        }
        test.get(i).setFirstOutput(tstPrediction[i]);
    }

    System.out.println("% de acierto TRS = " + (aciertoTrs * 100.) / transductiveDataSet.size());
    System.out.println("% de acierto TST = " + (aciertoTst * 100.) / testDataSet.size());

    return new Pair<PrototypeSet, PrototypeSet>(tranductive, test);
}
/**
 * Empty entry point: CoForest is launched through the generic KEEL
 * prototype-generation driver rather than from this class directly.
 *
 * @param args Arguments of the main function (ignored).
 */
public static void main(String[] args)
{ }
}