/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. S�nchez (luciano@uniovi.es)
J. Alcal�-Fdez (jalcala@decsai.ugr.es)
S. Garc�a (sglopez@ujaen.es)
A. Fern�ndez (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Discretizers.HeterDisc;
import java.util.*;
import keel.Algorithms.Discretizers.Basic.*;
import keel.Algorithms.Genetic_Rule_Learning.Globals.Parameters;
import keel.Dataset.Attribute;
import keel.Dataset.Attributes;
/**
* <p>
* This class implements the Heter-Disc discretizer
* </p>
*
* @author Written by Jose A. Saez (University of Granada), 21/12/2009
* @version 1.0
* @since JDK1.6
*/
public class HeterDisc extends Discretizer {
// this class lets to manipulate discretization schemes
public class DiscretizationScheme {
public int[] cpSelected; // indexes of the selected cutpoints
public double fitness; // fitness of the discretization
public DiscretizationScheme(int v[], int tam, double f){
cpSelected = new int[tam];
System.arraycopy(v, 0, cpSelected, 0, tam);
fitness = f;
}
}
//******************************************************************************************************
private double[] cutpoints; // possible cutpoints
private int numcp; // number of possible cutpoints
private int numInstances; // number of instances
private int[][] matrix; // quanta matrix
private Vector<DiscretizationScheme> CD; // vector with all best discretizations
private Vector<DiscretizationScheme> GD; // complete neighborhood of all discretizations at CD
private int[] solution; // best discretization found
private int[] numInterAtt; // number of intervals of each attribute
//******************************************************************************************************
/**
* <p>
* Constructor of the class
* </p>
*/
public HeterDisc(){
int i;
numInterAtt = new int[Parameters.numAttributes];
if(Parameters.setConfig){
for(i = 0 ; i < Parameters.numAttributes ; ++i){
Attribute att = Attributes.getAttribute(i);
if(att.getType() == Attribute.REAL || att.getType() == Attribute.INTEGER)
numInterAtt[i] = Parameters.numIntervals;
// default case
else
numInterAtt[i] = 0;
}
}
else{
String[] inter = Parameters.numIntrvls.split("_");
int cont = 0;
for(i = 0 ; i < Parameters.numAttributes ; ++i){
Attribute att = Attributes.getAttribute(i);
if(att.getType() == Attribute.REAL || att.getType() == Attribute.INTEGER)
numInterAtt[i] = Integer.parseInt(inter[cont++]);
// default case
else
numInterAtt[i] = 0;
}
}
}
//******************************************************************************************************
/**
* <p>
* Returns a vector with the discretized values
* </p>
* @param attribute index of the attribute to discretize
* @param values vector of indexes of the instances sorted from lowest to highest value of attribute
* @param begin index of the instance with the lowest value of attribute
* @param end index of the instance with the highest value of attribute
* @return vector with the discretized values
*/
protected Vector discretizeAttribute(int attribute, int []values, int begin, int end){
Boolean continueLoop;
int i, point; // loop indexes
numInstances = realValues[attribute].length; // number of instances
// 1) Form a set of all distinct values of attribute in ascending order
double[] valuesNoRepeated = new double[end+1];
int size = 0;
double value = realValues[attribute][values[begin]];
valuesNoRepeated[size++] = value;
for(i = begin+1 ; i <= end ; ++i){
if(value != realValues[attribute][values[i]]){
valuesNoRepeated[size++] = realValues[attribute][values[i]];
value = realValues[attribute][values[i]];
}
}
//System.out.println("att = " + attribute + ", size = " + size);
if(size == 1){
Vector cp = new Vector();
if(valuesNoRepeated[0] != realValues[attribute][values[end]])
cp.add(valuesNoRepeated[0]);
return cp;
}
// 2) Get the classes of each interval: -1 = no class, -2 = multiple classes, other = his class
int[] classOfInterval = new int[size-1];
for(i = 0 ; i < classOfInterval.length ; ++i)
classOfInterval[i] = -1;
//compute class of each interval
for(i = 0 ; i < numInstances ; ++i){
continueLoop = true;
for(point = 1 ; point < size && continueLoop ; ++point){
if(realValues[attribute][i] <= valuesNoRepeated[point]){
if(classOfInterval[point-1] == -1){
classOfInterval[point-1] = classOfInstances[i];
}
else{
if(classOfInterval[point-1] != classOfInstances[i])
classOfInterval[point-1] = -2;
}
continueLoop = false;
}
}
}
// 3) Join intervals if both have equal class and get possible cutpoints
cutpoints = new double[size];
numcp = 0;
int classInter = classOfInterval[0];
cutpoints[numcp++] = valuesNoRepeated[0];
for(i = 1 ; i < classOfInterval.length ; ++i){
if(classInter != classOfInterval[i] || classOfInterval[i] == -2){
cutpoints[numcp++] = valuesNoRepeated[i];
classInter = classOfInterval[i];
}
}
cutpoints[numcp++] = valuesNoRepeated[size-1];
// 4) Compute initial fitness
int[] selected = new int[numcp];
for(i = 0 ; i < numcp ; ++i)
selected[i] = 0;
int ni = 1;
selected[0] = 1; selected[numcp-1] = 1; // d0 y dn are selected
double GlobalOpt = computeCriterionFuction(selected, ni, attribute);
solution = new int[numcp];
System.arraycopy(selected, 0, solution, 0, numcp);
// 5) run loop
CD = new Vector<DiscretizationScheme>();
CD.add(new DiscretizationScheme(selected,numcp,GlobalOpt));
while(CD.size() > 0){
ni++;
// generate GD (all possible best neighbors from each discretization scheme in CD)
GD = new Vector<DiscretizationScheme>();
for(i = 0 ; i < CD.size() ; ++i)
generateNeighborhood(CD.get(i), ni, attribute, GlobalOpt);
// CD <- GD (all D in GD that criteriorFunction(D) > Globalopt)
CD = new Vector<DiscretizationScheme>();
double maxFitness = (-1)*Double.MIN_VALUE;
for(i = 0 ; i < GD.size() ; ++i){
CD.add(GD.get(i));
double fit = CD.get(i).fitness;
if(fit > maxFitness){
maxFitness = fit;
System.arraycopy(CD.get(i).cpSelected, 0, solution, 0, numcp);
}
}
// update GlobalOpt
GlobalOpt = maxFitness;
}
ni--;
solution[0] = solution[numcp-1] = 0;
Vector cp = new Vector();
for(i = 0 ; i < numcp ; ++i)
if(solution[i] == 1)
cp.add(cutpoints[i]);
return cp;
}
//******************************************************************************************************
/**
* <p>
* It generates the neighborhood of ds scheme discretization and adds each neighbor to variable GD
* </p>
* @param ds discretization scheme to generate its neighborhood
* @param ni number of intervals of the neighbors discretizations
* @param attribute index of the attribute to discretize
* @param GlobalOpt fitness of the best discretization scheme found
*/
public void generateNeighborhood(DiscretizationScheme ds, int ni, int attribute, double GlobalOpt){
int i;
double fitness, max = (-1)*Double.MIN_VALUE;
int[] best = new int[numcp];
for(i = 0 ; i < numcp ; ++i){
if(ds.cpSelected[i] == 0){
int[] v = new int[numcp];
System.arraycopy(ds.cpSelected, 0, v, 0, numcp);
v[i] = 1;
fitness = computeCriterionFuction(v, ni, attribute);
if(fitness > GlobalOpt){
GD.add(new DiscretizationScheme(v,numcp,fitness));
}
else{
// compute the better discretization with fitness lower than GlobalOpt
if(fitness > max){
System.arraycopy(v, 0, best, 0, numcp);
max = fitness;
}
}
}
}
// save the better discretization with fitness lower than GlobalOpt...
if(ni <= numInterAtt[attribute])
GD.add(new DiscretizationScheme(best,numcp,max));
}
//******************************************************************************************************
/**
* <p>
* It computes and returns the value of criterion function of the discretization scheme build with selectedp cutpoints
* </p>
* @param selectedp indexes of selected cutpoints
* @param ni number of intervals
* @param attribute index of the attribute
* @return the criterion function value
*/
public double computeCriterionFuction(int[] selectedp, int ni, int attribute){
int i, s;
double fitnessDiscr = 0;
// create quata matrix
CreateQuantaMatrix(ni, attribute, selectedp);
// create the conditional class probability vector for each interval
double[][] ccpv = new double[ni][Parameters.numClasses];
for(i = 0 ; i < ni ; ++i)
for(s = 0 ; s < Parameters.numClasses ; ++s)
ccpv[i][s] = (double) matrix[s][i] / (double) matrix[Parameters.numClasses][i];
for(i = 0 ; i < ni ; ++i)
fitnessDiscr += ((double)matrix[Parameters.numClasses][i]/(double)numInstances)*computeHeterCCPV(ccpv[i]);
return fitnessDiscr/ni;
}
//******************************************************************************************************
/**
* <p>
* It computes the heterogeneity for a conditional class probability vector given needed for compute
* the heterogeneity of a discretization scheme in criterion fuction calculus
* </p>
* @param ccpv conditional class probability vector
* @return the heterogeneity value
*/
public double computeHeterCCPV(double ccpv[]){
double total = 0, aux;
int s;
for(s = 0 ; s < Parameters.numClasses ; ++s){
aux = ccpv[s]-(double)(1.0/Parameters.numClasses);
total += Math.pow(aux,2);
}
return Math.sqrt(total);
}
//******************************************************************************************************
/**
* <p>
* It creates the quanta matrix basis of selected cutpoints array
* </p>
* @param ni number of intervals
* @param attribute index of the attribute
* @param selected vector with indexes of selected cut-points
*/
public void CreateQuantaMatrix(int ni, int attribute, int[] selected){
int i, j, point, clase; // loop indexes
// matrix initialization
matrix = new int[Parameters.numClasses+1][];
for(i = 0 ; i < Parameters.numClasses+1 ; ++i)
matrix[i] = new int[ni+1];
for(i = 0 ; i < Parameters.numClasses+1 ; ++i)
for(j = 0 ; j < ni+1 ; ++j)
matrix[i][j] = 0;
// create quanta matrix
boolean continuar = true;
int interval = 0;
for(i = 0 ; i < numInstances ; ++i){
continuar = true;
interval = 0;
for(point = 1 ; point < numcp && continuar ; ++point){
if(realValues[attribute][i] <= cutpoints[point] && selected[point] == 1){
matrix[classOfInstances[i]][interval]++;
continuar = false;
}
if(selected[point] == 1)
interval++;
}
}
//sumatory per classes
int suma;
for(clase = 0 ; clase < Parameters.numClasses ; ++clase){
suma = 0;
for(j = 0 ; j < ni ; ++j)
suma += matrix[clase][j];
matrix[clase][ni] = suma;
}
//sumatory per intervals
for(j = 0 ; j < ni ; ++j){
suma = 0;
for(clase = 0 ; clase < Parameters.numClasses ; ++clase)
suma += matrix[clase][j];
matrix[Parameters.numClasses][j] = suma;
}
}
}