/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
/**
*
* File: CoCoIS.java
*
* This class implements the Cooperative Coevolutionary Instance Selection model
* (CoCoIS)
*
* @author Written by Joaquin Derrac (University of Granada) 3/3/2010
* @version 1.1
* @since JDK1.5
*
*/
package keel.Algorithms.Preprocess.Instance_Selection.CoCoIS;
import keel.Algorithms.Preprocess.Basic.*;
import org.core.*;
import java.util.Arrays;
import java.util.StringTokenizer;
public class CoCoIS extends Metodo{
//Parameters (assigned in leerConfiguracion from the configuration file)
private static long seed; //seed of the random generator
private static int K; //K value of the K-NN classifier
private static int individuals; //size of the combinators population
private static int nSubpopulations; //number of subpopulations
private static int MAX_GENERATIONS; //Maximum number of generations
private static int M; //Generations per epoch of selector populations
private static int N; //Generations per epoch of combinators population
private static double W; //Weight of accuracy in the combinators fitness function (reduction gets 1-W)
private static double mutationProb; //Mutation probability
private static int subpopSize; //size of the subpopulations
private static double WError; //Weight for Error in fitness function
private static double WReduction; //Weight for Reduction in fitness function
private static double WDifference; //Weight for Difference in fitness function
private static double Elitism; //Percentage of members affected by elitism
private static double PRnn; //Probability of application of RNN mutation
private static double PRandom; //Probability of application of random mutation
private static double PBit; //Probability of bit change in random mutation
//Other data structures
private static double dataTrain [][]; //training data
private static int outputTrain []; //training output
private int assignation []; // assignment of instances to strata
private int strataSize []; // size of each stratum
private static int combinators [][]; // population of combinators (one selector index per subpopulation)
private static double cFitness[]; //fitness value of the population of combinators (-1.0 = pending evaluation)
private static double cAcc[]; // accuracy rate of the population of combinators
private static Subpopulation selectors []; //selector subpopulations
private static int ISSelection[]; //Current instance selection vector for K-NN
private static double minDist[]; //auxiliary vector for K-NN classifier
private static int nearestN[];//auxiliary vector for K-NN classifier
private static int selectedClasses[];//auxiliary vector for K-NN classifier
private static int generations; //current number of generations
private static int nClasses; //number of classes in the training set
/**
* Default constructor. Hands the configuration file over to the
* superclass constructor.
*
* NOTE(review): presumably the Metodo constructor is what invokes
* leerConfiguracion and loads the data sets - confirm in Metodo.
*
* @param ficheroScript Configuration file
*
*/
public CoCoIS (String ficheroScript) {
super (ficheroScript);
}//end-method
/**
 * Computes the strata distribution for the subpopulations.
 *
 * The training instances are first ordered by class, then dealt to the
 * strata in round-robin order (restarting at stratum 0 every time the class
 * changes), and finally reordered so that the instances of each stratum are
 * contiguous. Fills {@code assignation} and {@code strataSize}.
 */
private void computeStrata(){
    final int total = dataTrain.length;

    assignation = new int[total];
    strataSize = new int[nSubpopulations]; // counters start at zero

    // order the instances by class (exchange sort over all parallel arrays)
    for (int left = 0; left < total; left++) {
        for (int right = left + 1; right < total; right++) {
            if (outputTrain[left] > outputTrain[right]) {
                swapInstances(left, right);
            }
        }
    }

    // deal the instances to the strata, keeping the class distribution
    int stratum = 0;
    int currentClass = outputTrain[0];
    for (int inst = 0; inst < total; inst++) {
        if (outputTrain[inst] != currentClass) {
            // a new class starts: restart the round-robin at stratum 0
            currentClass = outputTrain[inst];
            stratum = 0;
        }
        assignation[inst] = stratum;
        strataSize[stratum]++;
        stratum = (stratum + 1) % nSubpopulations;
    }

    // order the instances by assigned stratum
    for (int left = 0; left < total; left++) {
        for (int right = left + 1; right < total; right++) {
            if (assignation[left] > assignation[right]) {
                swapInstances(left, right);
            }
        }
    }
}//end-method
/**
 * Exchanges two training instances in every parallel structure: the working
 * data, the real/nominal/missing-value matrices, the class labels and the
 * strata assignment.
 *
 * @param a Index of the first instance
 * @param b Index of the second instance
 */
private void swapInstances(int a,int b){
    final int nAttributes = dataTrain[0].length;

    // attribute values, one column at a time
    for (int att = 0; att < nAttributes; att++) {
        final double keptData = dataTrain[a][att];
        final double keptReal = realTrain[a][att];
        final int keptNominal = nominalTrain[a][att];
        final boolean keptMissing = nulosTrain[a][att];

        dataTrain[a][att] = dataTrain[b][att];
        realTrain[a][att] = realTrain[b][att];
        nominalTrain[a][att] = nominalTrain[b][att];
        nulosTrain[a][att] = nulosTrain[b][att];

        dataTrain[b][att] = keptData;
        realTrain[b][att] = keptReal;
        nominalTrain[b][att] = keptNominal;
        nulosTrain[b][att] = keptMissing;
    }

    // class label
    final int keptClass = outputTrain[a];
    outputTrain[a] = outputTrain[b];
    outputTrain[b] = keptClass;

    // stratum assignment
    final int keptStratum = assignation[a];
    assignation[a] = assignation[b];
    assignation[b] = keptStratum;
}//end-method
/**
* Executes CCIS.
*
* Copies the training data, distributes it into strata (computeStrata),
* initializes the combinators and one Subpopulation of selectors per stratum,
* and then runs MAX_GENERATIONS epochs. Each epoch evolves the combinators for
* N generations and the selectors for M generations. Finally, the instances
* selected by the combinator at index 0 after sorting (highest fitness) are
* written through OutputIS, followed by the test data.
*/
public void ejecutar () {
//data formatting
double conjS[][]; //selected data
double conjR[][]; //selected data (real values)
int conjN[][]; //selected data (nominal values)
boolean conjM[][]; //selected data (missing values)
int clasesS[]; //selected data (output values)
double strataData [][]; //strata distribution
int strataOutput []; //strata output values
int father, mother; //parents for the crossover operator
int up, down; //points of the crossover operator
int childA [], childB []; //offspring
int aux; //auxiliary variable
int count; //count variable
Randomize.setSeed (seed);
//work on private copies of the training data and class labels
outputTrain=new int[clasesTrain.length];
dataTrain=new double[datosTrain.length][datosTrain[0].length];
for(int i=0;i<datosTrain.length;i++){
for(int j=0;j<datosTrain[0].length;j++){
dataTrain[i][j]=datosTrain[i][j];
}
outputTrain[i]=clasesTrain[i];
}
//scratch structures of the K-NN classifier
ISSelection=new int [dataTrain.length];
nearestN = new int[K];
minDist = new double[K];
//reorders the instances so that each stratum is contiguous
computeStrata();
/*Getting the number of different classes*/
nClasses = 0;
for (int i=0; i<outputTrain.length; i++){
if (outputTrain[i] > nClasses){
nClasses = outputTrain[i];
}
}
//labels are used as array indexes, so the count is the highest label plus one
nClasses++;
selectedClasses= new int[nClasses];
//initialize population of combinators
combinators=new int [individuals][nSubpopulations];
cFitness=new double [individuals];
cAcc=new double [individuals];
for(int i=0;i<individuals;i++){
for(int j=0; j<nSubpopulations; j++){
//each gene is the index of a selector inside subpopulation j
combinators[i][j]= Randomize.RandintClosed(0, subpopSize-1);
}
//-1.0 marks a value that still has to be computed
cFitness[i]=-1.0;
cAcc[i]=-1.0;
}
//initialize populations of selectors
Subpopulation.setK(K);
Subpopulation.setSize(subpopSize);
Subpopulation.setWError(WError);
Subpopulation.setWReduction(WReduction);
Subpopulation.setWDifference(WDifference);
Subpopulation.setElitism(Elitism);
Subpopulation.setPRnn(PRnn);
Subpopulation.setPRandom(PRandom);
Subpopulation.setPBit(PBit);
selectors=new Subpopulation[nSubpopulations];
for(int strata=0;strata<nSubpopulations;strata++){
//obtain Strata
strataData=new double [strataSize[strata]][dataTrain[0].length];
strataOutput=new int [strataSize[strata]];
count=0;
for(int i=0;i<dataTrain.length;i++){
if(assignation[i]==strata){
for(int j=0;j<dataTrain[0].length;j++){
strataData[count][j]=dataTrain[i][j];
}
strataOutput[count]=outputTrain[i];
count++;
}
}
selectors[strata]=new Subpopulation(strata,strataData,strataOutput);
}
//start of the time measurement reported at the end of the method
long tiempo = System.currentTimeMillis();
/*******************************
*
* CCIS MAIN PROCEDURE
*
*******************************/
generations=0;
while(generations<MAX_GENERATIONS){
/*******************************
*
* Evolve combinators
*
*******************************/
for(int evolve=0;evolve<N; evolve++){
//evaluate
for(int i=0;i<individuals;i++){
if(cFitness[i]==-1.0){
cFitness[i]=fitnessFunction(i);
}
}
//sort individuals by fitness value
sortPop();
//selection of parents (two different individuals)
father = rouletteSelection(cFitness);
do {
mother = rouletteSelection(cFitness);
} while (mother == father);
//crossover
childA=new int[nSubpopulations];
childB=new int[nSubpopulations];
//select two points
up = Randomize.RandintClosed(0, nSubpopulations-1);
do {
down = Randomize.RandintClosed(0, nSubpopulations-1);
} while (up == down);
//make sure that down < up
if(up<down){
aux=up;
up=down;
down=aux;
}
//crossover: the segment [down,up) is exchanged between the parents
for(int i=0;i<down;i++){
childA[i]=combinators[father][i];
childB[i]=combinators[mother][i];
}
for(int i=down;i<up;i++){
childA[i]=combinators[mother][i];
childB[i]=combinators[father][i];
}
for(int i=up;i<nSubpopulations;i++){
childA[i]=combinators[father][i];
childB[i]=combinators[mother][i];
}
//replacement: the population is sorted best-first, so the offspring
//overwrite the two individuals with the lowest fitness
for(int i=0;i<nSubpopulations;i++){
combinators[individuals-2][i]=childA[i];
combinators[individuals-1][i]=childB[i];
}
cFitness[individuals-2]=-1.0;
cFitness[individuals-1]=-1.0;
//mutation: one gene is replaced by a selector drawn by its subpopulation
for(int i=0;i<individuals;i++){
if(Randomize.Rand()<mutationProb){
aux=Randomize.RandintClosed(0, nSubpopulations-1);
combinators[i][aux]=selectors[aux].rouletteSelection();
cFitness[i]=-1.0;
}
}
}//end for-combinators
//last evaluation to assess combinators for evaluation of selectors
for(int i=0;i<individuals;i++){
if(cFitness[i]==-1.0){
cFitness[i]=fitnessFunction(i);
}
}
//sort individuals by fitness value
sortPop();
/*******************************
*
* Evolve selectors
*
*******************************/
for(int evolve=0;evolve<M; evolve++){
for(int pop=0;pop<nSubpopulations;pop++){
selectors[pop].doGeneration();
}
//evaluate population of combinators
for(int i=0;i<individuals;i++){
if(cFitness[i]==-1.0){
cFitness[i]=fitnessFunction(i);
}
}
//sort individuals by fitness value
sortPop();
}
generations++;
}//end-while main loop
//last evaluation to assess combinators to obtain the final reduced subset
for(int i=0;i<individuals;i++){
if(cFitness[i]==-1.0){
cFitness[i]=fitnessFunction(i);
}
}
//sort individuals by fitness value
sortPop();
/*******************************
*
* Obtention of final reduced subset
*
*******************************/
//build individual (index 0 holds the highest fitness after sortPop)
int body []=buildIndividual(0);
int nSel=0;
for(int i=0;i<body.length;i++){
if(body[i]==1){
nSel++;
}
}
/*Building of S set from the best chromosome obtained*/
conjS = new double[nSel][dataTrain[0].length];
conjR = new double[nSel][dataTrain[0].length];
conjN = new int[nSel][dataTrain[0].length];
conjM = new boolean[nSel][dataTrain[0].length];
clasesS = new int[nSel];
for (int i=0, l=0; i<dataTrain.length; i++) {
if (body[i]==1) { //the instance must be copied to the solution
for (int j=0; j<dataTrain[0].length; j++) {
conjS[l][j] = dataTrain[i][j];
//NOTE(review): the next statement repeats the previous one; it is harmless.
//conjS is filled here but not read again in this method.
conjS[l][j] = dataTrain[i][j];
conjR[l][j] = realTrain[i][j];
conjN[l][j] = nominalTrain[i][j];
conjM[l][j] = nulosTrain[i][j];
}
clasesS[l] = outputTrain[i];
l++;
}
}
System.out.println("CCIS "+ relation + " " + (double)(System.currentTimeMillis()-tiempo)/1000.0 + "s");
//first output file: selected instances; second output file: the test data
OutputIS.escribeSalida(ficheroSalida[0], conjR, conjN, conjM, clasesS, entradas, salida, nEntradas, relation);
OutputIS.escribeSalida(ficheroSalida[1], test, entradas, salida, nEntradas, relation);
}//end-method
/**
 * Invalidates the fitness (sets it to -1.0) of every combinator that uses
 * the given selector in the given subpopulation, so that it is evaluated
 * again the next time fitness values are refreshed.
 *
 * @param pop Subpopulation selected
 * @param selector Selector given
 */
public static void RequestReevaluation(int pop, int selector){
    int member = 0;
    while (member < individuals) {
        if (combinators[member][pop] == selector) {
            cFitness[member] = -1.0;
        }
        member++;
    }
}//end-method
/**
 * Builds a complete individual by concatenating, in subpopulation order,
 * the body of the selector that the combinator references in each
 * subpopulation.
 *
 * @param index Combinator to build
 * @return Selection vector covering the whole training set
 */
private static int [] buildIndividual(int index){
    final int[] chromosome = new int[dataTrain.length];
    int offset = 0;

    for (int pop = 0; pop < nSubpopulations; pop++) {
        final int[] piece = selectors[pop].getBody(combinators[index][pop]);
        System.arraycopy(piece, 0, chromosome, offset, piece.length);
        offset += piece.length;
    }

    return chromosome;
}//end-method
/**
 * Builds an individual from the selectors of the populations, leaving out
 * the contribution of one population: its positions are kept at 0
 * (no instance selected).
 *
 * The array returned by {@code getBody} is never modified here. The former
 * implementation zeroed it in place, which would wipe the selector itself if
 * {@code getBody} hands out its internal array rather than a copy.
 *
 * @param index Individual to build
 * @param delete Population discarded
 * @return Complete individual
 */
private static int [] buildIndividualWithout(int index, int delete){
    // a new int array is all zeros, so a discarded stratum only needs to be skipped
    final int[] body = new int[dataTrain.length];
    int pointer = 0;

    for (int i = 0; i < nSubpopulations; i++) {
        final int[] fraction = selectors[i].getBody(combinators[index][i]);
        if (i != delete) {
            System.arraycopy(fraction, 0, body, pointer, fraction.length);
        }
        pointer += fraction.length;
    }

    return body;
} //end-method
/**
 * Fitness function for the combinators: {@code acc*W + reduction*(1-W)}.
 *
 * As a side effect the accuracy of the individual is stored in
 * {@code cAcc[index]}. An individual that selects no instance gets
 * accuracy 0.0 and fitness 0.0 without running the classifier.
 *
 * @param index Individual to be evaluated
 * @return Fitness value
 */
private static double fitnessFunction(int index){
    final int[] chromosome = buildIndividual(index);

    // reduction rate: fraction of instances that are NOT selected
    int selected = 0;
    for (int gene : chromosome) {
        if (gene == 1) {
            selected++;
        }
    }
    final double reduction = 1.0 - ((double) selected / (double) chromosome.length);

    // empty selection: nothing to classify with
    if (reduction == 1.0) {
        cAcc[index] = 0.0;
        return 0.0;
    }

    final double acc = computeAccuracy(chromosome);
    cAcc[index] = acc;

    return (acc * W) + (reduction * (1 - W));
}//end-method
/**
 * Computes the leave-one-out K-NN accuracy of a selection over the whole
 * training set. The selection is copied into {@code ISSelection}; each
 * instance is temporarily deselected while it is being classified.
 *
 * @param selection Body of the individual selected
 * @return Fraction of training instances classified correctly
 */
private static double computeAccuracy(int selection[]){
    final int total = dataTrain.length;

    // hand the selection over to the K-NN classifier
    System.arraycopy(selection, 0, ISSelection, 0, total);

    int correct = 0;
    for (int inst = 0; inst < total; inst++) {
        // leave-one-out: hide the instance from its own neighbourhood
        final int saved = ISSelection[inst];
        ISSelection[inst] = 0;
        if (knnClassify(inst) == outputTrain[inst]) {
            correct++;
        }
        ISSelection[inst] = saved;
    }

    return (double) correct / (double) total;
}//end-method
/**
 * K-NN classifier over the instances currently marked in
 * {@code ISSelection}.
 *
 * The neighbour buffers are reset on every call. When fewer than K
 * instances are selected only the neighbours actually found vote; the
 * former implementation kept indexes left over from earlier calls in the
 * unused slots and let them vote.
 *
 * @param index Training instance to classify
 * @return Class predicted, or -1 if no selected instance is available
 */
private static int knnClassify(int index){
    Arrays.fill(minDist, Double.MAX_VALUE);
    Arrays.fill(nearestN, -1); // -1 marks an empty neighbour slot

    // keep the K nearest selected instances, ordered by increasing distance
    for (int i = 0; i < dataTrain.length; i++) {
        if (ISSelection[i] != 1) {
            continue;
        }
        final double dist = euclideanDistance(index, i);
        for (int j = 0; j < K; j++) {
            if (dist < minDist[j]) {
                // shift the farther neighbours one slot down and insert
                for (int l = K - 1; l > j; l--) {
                    minDist[l] = minDist[l - 1];
                    nearestN[l] = nearestN[l - 1];
                }
                minDist[j] = dist;
                nearestN[j] = i;
                break;
            }
        }
    }

    // no neighbour found (or K < 1): there is nothing to vote with
    if (K < 1 || nearestN[0] < 0) {
        return -1;
    }
    if (K == 1) {
        return outputTrain[nearestN[0]];
    }

    // majority vote among the neighbours that were found
    Arrays.fill(selectedClasses, 0);
    for (int i = 0; i < K && nearestN[i] >= 0; i++) {
        selectedClasses[outputTrain[nearestN[i]]] += 1;
    }

    // ties are resolved in favour of the lowest class label
    int prediction = -1;
    int predictionValue = 0;
    for (int i = 0; i < nClasses; i++) {
        if (predictionValue < selectedClasses[i]) {
            predictionValue = selectedClasses[i];
            prediction = i;
        }
    }
    return prediction;
}//end-method
/**
 * Sorts the population of combinators by decreasing fitness, so that the
 * best individual ends up at index 0.
 */
private void sortPop(){
    final int size = combinators.length;
    int left = 0;
    while (left < size) {
        int right = left + 1;
        while (right < size) {
            if (cFitness[left] < cFitness[right]) {
                swapCombinators(left, right);
            }
            right++;
        }
        left++;
    }
}//end-method
/**
 * Swaps two combinators together with their cached fitness and accuracy.
 *
 * {@code cAcc} is indexed by combinator just like {@code cFitness}; it has
 * to move with the individual, otherwise the accuracies read later (for
 * instance in getContribution) would belong to a different combinator once
 * the population has been sorted.
 *
 * @param a First combinator
 * @param b Second combinator
 */
private void swapCombinators(int a,int b){
    // swap genes
    for (int i = 0; i < combinators[0].length; i++) {
        final int gene = combinators[a][i];
        combinators[a][i] = combinators[b][i];
        combinators[b][i] = gene;
    }

    // swap fitness
    final double fitness = cFitness[a];
    cFitness[a] = cFitness[b];
    cFitness[b] = fitness;

    // swap accuracy
    final double accuracy = cAcc[a];
    cAcc[a] = cAcc[b];
    cAcc[b] = accuracy;
}//end-method
/**
 * Roulette selection method. The last two positions of the fitness array
 * (the two worst individuals once the population is sorted) never take part.
 *
 * @param fitness Fitness array of the individuals
 * @return Index of the individual selected
 */
private int rouletteSelection(double fitness[]){
    final int candidates = fitness.length - 2;

    // cumulative fitness of the candidates
    final double[] cumulative = new double[candidates];
    cumulative[0] = fitness[0];
    for (int pos = 1; pos < candidates; pos++) {
        cumulative[pos] = cumulative[pos - 1] + fitness[pos];
    }

    // spin the wheel and find the slot in which it stops
    final double spin = Randomize.Randdouble(0.0, cumulative[candidates - 1]);
    int chosen;
    for (chosen = 0; cumulative[chosen] < spin; chosen++) {
        // advance until the cumulative fitness reaches the drawn value
    }
    return chosen;
}//end-method
/**
 * Computes the contribution of a given selector: the average, over every
 * combinator that uses it, of the accuracy lost when the selector's stratum
 * is left out of the combination.
 *
 * The number of combinators using the selector was formerly never counted,
 * so the method always returned 0.0.
 *
 * @param pop Population of the selector
 * @param selector Selector tested
 * @return Contribution, or 0.0 if no combinator uses the selector
 */
public static double getContribution(int pop, int selector){
    final boolean[] present = new boolean[individuals];
    int howMany = 0;

    // mark combinations where it is present
    for (int i = 0; i < individuals; i++) {
        if (combinators[i][pop] == selector) {
            present[i] = true;
            howMany++;
        }
    }
    if (howMany < 1) {
        return 0.0;
    }

    // compute old performance: the call refreshes cAcc[i] as a side effect.
    // The fitness value itself is deliberately not stored, so the individual
    // remains flagged (-1.0) for the regular evaluation pass.
    for (int i = 0; i < individuals; i++) {
        if (present[i] && cFitness[i] == -1.0) {
            fitnessFunction(i);
        }
    }

    // compute new performance and accumulate the accuracy difference
    double contrib = 0.0;
    for (int i = 0; i < individuals; i++) {
        if (present[i]) {
            final double accWithout = computeAccuracy(buildIndividualWithout(i, pop));
            contrib += (cAcc[i] - accWithout);
        }
    }

    return contrib / (double) howMany;
}//end-method
/**
 * Squared Euclidean distance between two training instances. The square
 * root is skipped to speed up the algorithm; the ordering of distances is
 * the same.
 *
 * @param a Index of the first instance
 * @param b Index of the second instance
 * @return Sum of squared attribute differences
 */
private static double euclideanDistance(int a, int b){
    final double[] first = dataTrain[a];
    final double[] second = dataTrain[b];
    final int nAttributes = dataTrain[0].length;

    double squared = 0.0;
    for (int att = 0; att < nAttributes; att++) {
        final double diff = first[att] - second[att];
        squared += diff * diff;
    }
    return squared;
}//end-method
/**
 * Process the configuration file.
 *
 * Expected layout, one "name = value" entry per non-empty line: a first
 * line that is skipped, the input files (training and test, each between
 * double quotes), the output files (two quoted names), and then the
 * parameters in this order: seed, K, individuals, nSubpopulations,
 * MAX_GENERATIONS, M, N, W, mutationProb, subpopSize, WError, WReduction,
 * WDifference, Elitism, PRnn, PRandom, PBit.
 *
 * Values are taken as the trimmed text after the first '=', so the amount
 * of white space around the value does not matter.
 *
 * @param ficheroScript Configuration file
 * @throws IllegalArgumentException if a line has no '=' or a quoted file
 *         name is missing
 * @throws NumberFormatException if a numeric parameter cannot be parsed
 * @throws java.util.NoSuchElementException if the file has too few lines
 */
public void leerConfiguracion (String ficheroScript) {
    ficheroSalida = new String[2];

    String fichero = Fichero.leeFichero (ficheroScript);
    StringTokenizer lineasFichero = new StringTokenizer (fichero,"\n\r");

    // the first line carries no parameter read by this class
    lineasFichero.nextToken();

    /*Getting the name of the training and test files*/
    String[] inputs = firstQuotedStrings(parameterValue(lineasFichero.nextToken()), 2);
    ficheroTraining = inputs[0];
    ficheroTest = inputs[1];

    /*Getting the names of the output files*/
    String[] outputs = firstQuotedStrings(parameterValue(lineasFichero.nextToken()), 2);
    ficheroSalida[0] = outputs[0];
    ficheroSalida[1] = outputs[1];

    /*Getting the seed*/
    seed = Long.parseLong(parameterValue(lineasFichero.nextToken()));

    /*Getting the K parameter*/
    K = Integer.parseInt(parameterValue(lineasFichero.nextToken()));

    /*Getting the number of individuals of the population of combinators*/
    individuals = Integer.parseInt(parameterValue(lineasFichero.nextToken()));

    /*Getting the number of subpopulations*/
    nSubpopulations = Integer.parseInt(parameterValue(lineasFichero.nextToken()));

    /*Getting the number of max generations*/
    MAX_GENERATIONS = Integer.parseInt(parameterValue(lineasFichero.nextToken()));

    /*Getting the M parameter*/
    M = Integer.parseInt(parameterValue(lineasFichero.nextToken()));

    /*Getting the N parameter*/
    N = Integer.parseInt(parameterValue(lineasFichero.nextToken()));

    /*Getting the W weight parameter*/
    W = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the mutation probability*/
    mutationProb = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the subpopulation size*/
    subpopSize = Integer.parseInt(parameterValue(lineasFichero.nextToken()));

    /*Getting the WError weight parameter*/
    WError = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the WReduction weight parameter*/
    WReduction = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the WDifference weight parameter*/
    WDifference = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the Elitism parameter*/
    Elitism = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the PRnn parameter*/
    PRnn = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the PRandom parameter*/
    PRandom = Double.parseDouble(parameterValue(lineasFichero.nextToken()));

    /*Getting the PBit parameter*/
    PBit = Double.parseDouble(parameterValue(lineasFichero.nextToken()));
}//end-method

/**
 * Returns the value part of a "name = value" configuration line: the text
 * after the first '=', without surrounding white space.
 */
private static String parameterValue(String line) {
    int equalsAt = line.indexOf('=');
    if (equalsAt < 0) {
        throw new IllegalArgumentException(
                "Malformed configuration line (missing '='): " + line);
    }
    return line.substring(equalsAt + 1).trim();
}

/**
 * Extracts the first {@code howMany} double-quoted strings of a text, in
 * order of appearance and without the quotes.
 */
private static String[] firstQuotedStrings(String text, int howMany) {
    String[] values = new String[howMany];
    int from = 0;
    for (int n = 0; n < howMany; n++) {
        int open = text.indexOf('"', from);
        int close = (open < 0) ? -1 : text.indexOf('"', open + 1);
        if (close < 0) {
            throw new IllegalArgumentException(
                    "Expected " + howMany + " quoted file names in: " + text);
        }
        values[n] = text.substring(open + 1, close);
        from = close + 1;
    }
    return values;
}
}//end-class