/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Discretizers.Ameva_Discretizer;
import java.util.*;
import keel.Algorithms.Discretizers.Basic.*;
import keel.Algorithms.Genetic_Rule_Learning.Globals.*;
/**
 * <p>
 * Implements the Ameva discretization. Cut points are added greedily, one at a time: at
 * every step the candidate position with the highest Ameva value is chosen, and the process
 * stops when the Ameva value no longer improves (or, in the AmevaR mode, when in addition
 * the number of intervals has reached the number of classes).
 * </p>
 * <p>
 * The class reads the fields <code>realValues</code> and <code>classOfInstances</code>, which
 * are not declared here (presumably inherited from <code>Discretizer</code>), and the static
 * settings <code>Parameters.numClasses</code> and <code>Parameters.amevaR</code>.
 * </p>
 *
 * @author Written by Victoria Lopez Morales (University of Granada) 15/12/2009
 * @version 1.0
 * @since JDK1.5
 */
public class AmevaDiscretizer extends Discretizer {

    /**
     * <p>
     * Selects, for a given attribute, the real values that best discretize the attribute
     * according to the Ameva discretizer.
     * </p>
     * @param attribute Position of the attribute in the list of attributes whose best real values
     * for discretization are going to be selected
     * @param values Position of the corresponding attribute value in the real values matrix,
     * ordered by attribute value
     * @param begin First value that is considered to belong to the data considered, usually 0
     * @param end Last value that is considered to belong to the data considered, usually the last
     * value of the dataset
     * @return a vector with the real values (as Double, in ascending order) that best discretize
     * the attribute given according to the Ameva discretizer; empty if there is no candidate
     */
    protected Vector discretizeAttribute(int attribute, int[] values, int begin, int end) {
        Vector<Double> discretization = new Vector<Double>();
        Vector<Integer> positionCutPoints = new Vector<Integer>();
        double globalAmeva = 0.0;

        // Each round asks for the best remaining candidate; an empty answer means none is left
        Vector<Number> selected = selectNewCutPoint(attribute, values, begin, end, positionCutPoints);
        while (!selected.isEmpty()) {
            int posNewCutPoint = selected.elementAt(0).intValue();
            double ameva = selected.elementAt(1).doubleValue();

            // Basic mode: go on while the Ameva measure improves.
            // AmevaR mode: additionally go on while there are fewer intervals than classes.
            boolean improves = ameva > globalAmeva;
            boolean needsMoreIntervals = Parameters.amevaR
                    && (positionCutPoints.size() + 1 < Parameters.numClasses);
            if (!improves && !needsMoreIntervals) {
                break;
            }

            // Accept the cut point; the positions are kept sorted for the next evaluation
            positionCutPoints.addElement(Integer.valueOf(posNewCutPoint));
            Collections.sort(positionCutPoints);

            // The real cut value is the midpoint between the two neighbouring attribute values
            double below = realValues[attribute][values[posNewCutPoint - 1]];
            double above = realValues[attribute][values[posNewCutPoint]];
            discretization.addElement(Double.valueOf((below + above) / 2.0));
            globalAmeva = ameva;

            selected = selectNewCutPoint(attribute, values, begin, end, positionCutPoints);
        }

        // Cut values were appended in selection order; give them back sorted
        Collections.sort(discretization);
        return discretization;
    }

    /**
     * <p>
     * Gets the number of classes that are present in the data values.
     * </p>
     * @param values Position of the corresponding attribute value in the real values matrix,
     * ordered by attribute value
     * @param begin First value that is considered to belong to the data considered, usually 0
     * @param end Last value that is considered to belong to the data considered, usually the last
     * value of the dataset
     * @return the number of distinct classes found among the instances in [begin, end]
     */
    private int getNumClasses(int[] values, int begin, int end) {
        ArrayList<Integer> distinctClasses = new ArrayList<Integer>();
        for (int j = begin; j <= end; j++) {
            Integer instanceClass = Integer.valueOf(classOfInstances[values[j]]);
            if (!distinctClasses.contains(instanceClass)) {
                distinctClasses.add(instanceClass);
            }
        }
        return distinctClasses.size();
    }

    /**
     * <p>
     * Chooses the new best discretization value given a current discretization using the Ameva
     * criteria.
     * </p>
     * <p>
     * Positions that already belong to the current discretization are not evaluated again:
     * duplicating a cut point creates an empty interval, whose Ameva value is undefined.
     * </p>
     * @param attribute Position of the attribute in the list of attributes whose best real values
     * for discretization are going to be selected
     * @param values Position of the corresponding attribute value in the real values matrix,
     * ordered by attribute value
     * @param begin First value that is considered to belong to the data considered, usually 0
     * @param end Last value that is considered to belong to the data considered, usually the last
     * value of the dataset
     * @param posCutPoints Discretization proposed for the attribute containing the position of the
     * selected cut points
     * @return a vector holding the position of the best new cut point (Integer) followed by its
     * Ameva value (Double), or an empty vector if no unused candidate exists
     */
    private Vector<Number> selectNewCutPoint(int attribute, int[] values, int begin, int end,
            Vector<Integer> posCutPoints) {
        Vector<Number> result = new Vector<Number>();
        Vector<Integer> candidateCutPoints = getCandidateCutPoints(attribute, values, begin, end);

        boolean found = false;
        int posMax = 0;
        double amevaMax = 0.0;
        for (Integer candidate : candidateCutPoints) {
            if (posCutPoints.contains(candidate)) {
                continue;
            }
            int pos = candidate.intValue();
            double ameva = computeAmeva(attribute, values, begin, end, posCutPoints, pos);
            // The first unused candidate seeds the maximum; later ones must strictly beat it
            if (!found || ameva > amevaMax) {
                found = true;
                posMax = pos;
                amevaMax = ameva;
            }
        }

        if (found) {
            result.addElement(Integer.valueOf(posMax));
            result.addElement(Double.valueOf(amevaMax));
        }
        return result;
    }

    /**
     * <p>
     * Computes the Ameva measure for a given discretization including the new cut point:
     * the chi-square statistic of the class/interval contingency table divided by
     * k*(l-1), where k is the number of intervals and l the number of classes present.
     * </p>
     * @param attribute Position of the attribute in the list of attributes whose best real values
     * for discretization are going to be selected
     * @param values Position of the corresponding attribute value in the real values matrix,
     * ordered by attribute value
     * @param begin First value that is considered to belong to the data considered, usually 0
     * @param end Last value that is considered to belong to the data considered, usually the last
     * value of the dataset
     * @param posCutPoints Discretization proposed for the attribute containing the position of the
     * selected cut points (not modified)
     * @param posNewCutPoint Position of the cut point that is added to the current discretization
     * before the Ameva measure is computed
     * @return Ameva for the given discretization including the new cut point; 0.0 when fewer than
     * two classes are present, since the measure is undefined in that case
     */
    private double computeAmeva(int attribute, int[] values, int begin, int end,
            Vector<Integer> posCutPoints, int posNewCutPoint) {
        // Work on a sorted copy of the cut points that includes the new one
        Vector<Integer> extended = new Vector<Integer>(posCutPoints);
        extended.addElement(Integer.valueOf(posNewCutPoint));
        Collections.sort(extended);

        // Contingency table and its marginals
        Vector<Integer> jcd = jointClassDistribution(attribute, values, begin, end, extended);
        Vector<Integer> cd = classDistribution(jcd);
        Vector<Integer> dd = discretizationDistribution(jcd);
        int total = sumValues(cd);
        int l = getNumClasses(values, begin, end);
        int k = extended.size() + 1;

        // With a single class the denominator k*(l-1) is zero
        if (l < 2) {
            return 0.0;
        }

        double chiSquare = 0.0;
        for (int i = 0; i < Parameters.numClasses; i++) {
            int classTotal = cd.elementAt(i).intValue();
            if (classTotal == 0) {
                continue; // class index without instances: would yield 0/0
            }
            for (int j = 0; j < k; j++) {
                int intervalTotal = dd.elementAt(j).intValue();
                if (intervalTotal == 0) {
                    continue; // empty interval: would yield 0/0
                }
                double joint = jcd.elementAt(j * Parameters.numClasses + i).intValue();
                chiSquare += Math.pow(joint, 2) / ((double) classTotal * (double) intervalTotal);
            }
        }
        chiSquare = total * (-1 + chiSquare);

        return chiSquare / (double) (k * (l - 1));
    }

    /**
     * <p>
     * Adds up the integer values stored in a vector.
     * </p>
     * @param v Vector whose integer values are going to be added
     * @return sum of all integer values in the vector (0 for an empty vector)
     */
    private int sumValues(Vector<Integer> v) {
        int sum = 0;
        for (Integer value : v) {
            sum += value.intValue();
        }
        return sum;
    }

    /**
     * <p>
     * Obtains a vector of all the possible cut points for the attribute: every position in
     * (begin, end] whose attribute value differs from the value at the previous position.
     * </p>
     * @param attribute Position of the attribute in the list of attributes whose best real values
     * for discretization are going to be selected
     * @param values Position of the corresponding attribute value in the real values matrix,
     * ordered by attribute value
     * @param begin First value that is considered to belong to the data considered, usually 0
     * @param end Last value that is considered to belong to the data considered, usually the last
     * value of the dataset
     * @return a vector with all the possible cut points (positions, ascending) for the attribute;
     * empty when the range holds no instance or a single distinct value
     */
    private Vector<Integer> getCandidateCutPoints(int attribute, int[] values, int begin, int end) {
        Vector<Integer> cutPoints = new Vector<Integer>();
        if (begin > end) {
            return cutPoints;
        }

        double previous = realValues[attribute][values[begin]];
        // Position begin can never be a cut point (nothing precedes it), so start right after it
        for (int i = begin + 1; i <= end; i++) {
            double current = realValues[attribute][values[i]];
            if (current != previous) {
                cutPoints.addElement(Integer.valueOf(i));
            }
            previous = current;
        }
        return cutPoints;
    }

    /**
     * <p>
     * Obtains the class distribution of the data: for every class index, the number of
     * instances summed over all intervals.
     * </p>
     * @param jointClassDistribution A joint distribution depending on a discretization and the
     * class data, laid out interval by interval with Parameters.numClasses entries per interval
     * @return the class distribution of the data (Parameters.numClasses entries)
     */
    private Vector<Integer> classDistribution(Vector<Integer> jointClassDistribution) {
        Vector<Integer> distribution = new Vector<Integer>();
        for (int c = 0; c < Parameters.numClasses; c++) {
            int numIntervals = jointClassDistribution.size() / Parameters.numClasses;
            int count = 0;
            for (int interval = 0; interval < numIntervals; interval++) {
                count += jointClassDistribution.elementAt(Parameters.numClasses * interval + c).intValue();
            }
            distribution.addElement(Integer.valueOf(count));
        }
        return distribution;
    }

    /**
     * <p>
     * Obtains the distribution of the data conditioned by a discretization: for every
     * interval, the number of instances summed over all classes.
     * </p>
     * @param jointClassDistribution A joint distribution depending on a discretization and the
     * class data, laid out interval by interval with Parameters.numClasses entries per interval
     * @return the distribution of the data conditioned by a discretization (one entry per interval)
     */
    private Vector<Integer> discretizationDistribution(Vector<Integer> jointClassDistribution) {
        Vector<Integer> distribution = new Vector<Integer>();
        int numIntervals = jointClassDistribution.size() / Parameters.numClasses;
        for (int interval = 0; interval < numIntervals; interval++) {
            int count = 0;
            for (int c = 0; c < Parameters.numClasses; c++) {
                count += jointClassDistribution.elementAt(Parameters.numClasses * interval + c).intValue();
            }
            distribution.addElement(Integer.valueOf(count));
        }
        return distribution;
    }

    /**
     * <p>
     * Obtains a joint distribution of the data given a current discretization and the class the
     * data belongs to. Entry <code>Parameters.numClasses * interval + class</code> holds the
     * number of instances of that class inside that interval; an instance at position p lies in
     * the interval whose index equals the number of cut points that are &lt;= p.
     * </p>
     * @param attribute Position of the attribute in the list of attributes whose best real values
     * for discretization are going to be selected (not used by the counting itself)
     * @param values Position of the corresponding attribute value in the real values matrix,
     * ordered by attribute value
     * @param begin First value that is considered to belong to the data considered, usually 0
     * @param end Last value that is considered to belong to the data considered, usually the last
     * value of the dataset
     * @param posCutPoints Discretization proposed for the attribute containing the position of the
     * selected cut points, in ascending order
     * @return a joint distribution depending on the discretization and the class data, with
     * Parameters.numClasses * (number of cut points + 1) entries
     */
    private Vector<Integer> jointClassDistribution(int attribute, int[] values, int begin, int end,
            Vector<Integer> posCutPoints) {
        int numCutPoints = posCutPoints.size();
        int[] jointClassCount = new int[Parameters.numClasses * (numCutPoints + 1)];

        // Single pass over the instances, advancing the interval whenever a cut point is reached
        int interval = 0;
        for (int i = begin; i <= end; i++) {
            while (interval < numCutPoints && i >= posCutPoints.elementAt(interval).intValue()) {
                interval++;
            }
            jointClassCount[Parameters.numClasses * interval + classOfInstances[values[i]]]++;
        }

        Vector<Integer> res = new Vector<Integer>();
        for (int i = 0; i < jointClassCount.length; i++) {
            res.addElement(Integer.valueOf(jointClassCount[i]));
        }
        return res;
    }
}