/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. S�nchez (luciano@uniovi.es)
J. Alcal�-Fdez (jalcala@decsai.ugr.es)
S. Garc�a (sglopez@ujaen.es)
A. Fern�ndez (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Discretizers.MantarasDist_Discretizer;
import java.util.*;
import keel.Algorithms.Discretizers.Basic.*;
import keel.Algorithms.Genetic_Rule_Learning.Globals.*;
/**
* <p>
* This is the class with the operations of the Mantaras Distance-Based discretization. It
* adopts the behavior of the general discretizers and specifies its differences in this
* class, that has to extend the abstract methods.
*
* @author Written by Victoria Lopez Morales (University of Granada) 27/11/2009
* @version 1.0
* @since JDK1.5
* </p>
*/
public class MantarasDistDiscretizer extends Discretizer {
/**
* <p>
* Selects, for a given attribute, the real values that best discretize the attribute
* according to the Distance-Based discretizer by Mantaras
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @return a vector with the real values that best discretize the attribute given according to
* the Distance-Based discretizer by Mantaras
*/
protected Vector discretizeAttribute (int attribute, int []values, int begin, int end) {
Vector discretization = new Vector();
Vector positionCutPoints = new Vector();
int posNewCutPoint;
// Initially, select a cut point
posNewCutPoint = selectNewCutPoint (attribute, values, begin, end, positionCutPoints);
if (posNewCutPoint == -1) return discretization;
// While the discretization improves
while (improvesDiscretization (attribute, values, begin, end, positionCutPoints, posNewCutPoint)) {
// Add the new point to the discretization
positionCutPoints.addElement(new Integer (posNewCutPoint));
Collections.sort(positionCutPoints.subList(0,positionCutPoints.size()));
discretization.addElement(new Double ((realValues[attribute][values[posNewCutPoint-1]]+realValues[attribute][values[posNewCutPoint]])/2.0));
// Search for another cut point
posNewCutPoint = selectNewCutPoint (attribute, values, begin, end, positionCutPoints);
if (posNewCutPoint == -1) {
Collections.sort(discretization.subList(0,discretization.size()));
return discretization;
}
}
// Sort all discretization values before giving the final result
Collections.sort(discretization.subList(0,discretization.size()));
return discretization;
}
/**
* <p>
* Checks if adding posNewCutPoint to the current discretization improves the performance of the
* discretization according to the MDLP
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param posCutPoints Position of all the cut points selected until this moment
* @param posNewCutPoint Position of the new cut point that has to be evaluated in the discretization
* @return true, if adding posNewCutPoint to the discretization leads to a lower MDLP value, false otherwise
*/
private boolean improvesDiscretization (int attribute, int []values, int begin, int end, Vector posCutPoints, int posNewCutPoint) {
double lenDisc1, lenDisc2, lenClassesDisc1, lenClassesDisc2;
double partitionEntropy1, partitionEntropy2;
int p1 = posCutPoints.size();
int p2 = p1 + 1;
int N = classOfInstances.length;
int k = Parameters.numClasses;
Vector posCutPointsExtended = new Vector (posCutPoints);
// Calculate Len(Disc) for both Discretizations
partitionEntropy1 = computeClassWeightedEntropy (attribute, values, begin, end, posCutPoints);
lenDisc1 = ((double)p1 * Math.log(N-1)/Math.log(2)) + ((double)(p1+1)*k) + partitionEntropy1;
posCutPointsExtended.addElement(new Integer (posNewCutPoint));
Collections.sort(posCutPointsExtended.subList(0,posCutPointsExtended.size()));
partitionEntropy2 = computeClassWeightedEntropy (attribute, values, begin, end, posCutPointsExtended);
lenDisc2 = ((double)p2 * Math.log(N-1)/Math.log(2)) + ((double)(p2+1)*k) + partitionEntropy2;
// Calculate Len(Classes|Disc) for both Discretizations
lenClassesDisc1 = computeClassModifiedEntropy (attribute, values, begin, end, posCutPoints);
lenClassesDisc2 = computeClassModifiedEntropy (attribute, values, begin, end, posCutPointsExtended);
// Check if the new discretization improves the current discretization
if ((lenDisc1 + lenClassesDisc1) < (lenDisc2 + lenClassesDisc2)) {
return false;
}
else if ((lenDisc1 + lenClassesDisc1) > (lenDisc2 + lenClassesDisc2)) {
return true;
}
else {
System.out.println("The length of both solutions is the same");
return true;
}
}
/**
* <p>
* Computes a pseudo-entropy measure dependent of a given discretization
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param posCutPoints Discretization proposed for the attribute containing the position of the
* selected cut points
* @return the value of the pseudo-entropy measure
*/
private double computeClassModifiedEntropy (int attribute, int []values, int begin, int end, Vector posCutPoints) {
Vector cd;
int numValues;
double ent;
double partitionEntropy = 0.0;
if (posCutPoints.size() == 0) {
// We don't have two partitions, we only have one partition
cd = classDistribution (attribute, values, begin, end);
numValues = sumValues(cd);
partitionEntropy = (double)numValues * computeEntropy (cd, numValues);
}
else {
// Check a first partition of the data
cd = classDistribution (attribute, values, begin, ((Integer)posCutPoints.elementAt(0)).intValue() - 1);
numValues = sumValues(cd);
ent = computeEntropy (cd, numValues);
partitionEntropy += (double)numValues * ent;
// Check the central partitions of the data
for (int i=1; i<posCutPoints.size(); i++) {
cd = classDistribution (attribute, values, ((Integer)posCutPoints.elementAt(i-1)).intValue(), ((Integer)posCutPoints.elementAt(i)).intValue() - 1);
numValues = sumValues(cd);
ent = computeEntropy (cd, numValues);
partitionEntropy += ((double)numValues * ent);
}
// Check the last partition of the data
cd = classDistribution (attribute, values, ((Integer)posCutPoints.elementAt(posCutPoints.size()-1)).intValue(), end);
numValues = sumValues(cd);
ent = computeEntropy (cd, numValues);
partitionEntropy += (double)numValues * ent;
}
return partitionEntropy;
}
/**
* <p>
* Computes a pseudo-entropy measure dependent of a given discretization, which is an entropy measure
* weighted by the number of classes in the interval of the discretization
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param posCutPoints Discretization proposed for the attribute containing the position of the
* selected cut points
* @return the value of the pseudo-entropy measure weighted by the class in the interval
*/
private double computeClassWeightedEntropy (int attribute, int []values, int begin, int end, Vector posCutPoints) {
Vector cd;
int numValues;
double ent;
double partitionEntropy = 0.0;
if (posCutPoints.size() == 0) {
// We don't have two partitions, we only have one partition
cd = classDistribution (attribute, values, begin, end);
numValues = sumValues(cd);
partitionEntropy = (double)getNumClasses(values, begin, end) * computeEntropy (cd, numValues);
}
else {
// Check a first partition of the data
cd = classDistribution (attribute, values, begin, ((Integer)posCutPoints.elementAt(0)).intValue() - 1);
numValues = sumValues(cd);
ent = computeEntropy (cd, numValues);
partitionEntropy += ((double)getNumClasses(values, begin, ((Integer)posCutPoints.elementAt(0)).intValue() - 1) * ent);
// Check the central partitions of the data
for (int i=1; i<posCutPoints.size(); i++) {
cd = classDistribution (attribute, values, ((Integer)posCutPoints.elementAt(i-1)).intValue(), ((Integer)posCutPoints.elementAt(i)).intValue() - 1);
numValues = sumValues(cd);
ent = computeEntropy (cd, numValues);
partitionEntropy += ((double)getNumClasses(values, ((Integer)posCutPoints.elementAt(i-1)).intValue(), ((Integer)posCutPoints.elementAt(i)).intValue() - 1) * ent);
}
// Check the last partition of the data
cd = classDistribution (attribute, values, ((Integer)posCutPoints.elementAt(posCutPoints.size()-1)).intValue(), end);
numValues = sumValues(cd);
ent = computeEntropy (cd, numValues);
partitionEntropy += ((double)getNumClasses(values, ((Integer)posCutPoints.elementAt(posCutPoints.size()-1)).intValue(), end) * ent);
}
return partitionEntropy;
}
/**
* <p>
* Gets the number of classes that are present in the data values
* </p>
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @return the number of classes that there are in the data
*/
private int getNumClasses (int [] values, int begin, int end) {
ArrayList <Integer> diff_values;
diff_values = new ArrayList <Integer> ();
// Create a list with all the different possible values for the output class
for (int j=begin; j<=end; j++) {
double aux = classOfInstances[values[j]];
// If the class considered isn't in the diff_values list yet, add to that list
if (!diff_values.contains(new Integer((int)aux))) {
diff_values.add(new Integer((int)aux));
}
}
return diff_values.size();
}
/**
* <p>
* Chooses the new best discretization value given a current discretization using the Mantaras
* Distance criteria.
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param posCutPoints Discretization proposed for the attribute containing the position of the
* selected cut points
* @return the new best cut point for the current discretization (its position)
*/
private int selectNewCutPoint (int attribute, int []values, int begin, int end, Vector posCutPoints) {
// First, obtain all candidate cut points
Vector candidateCutPoints = getCandidateCutPoints(attribute,values,begin,end);
if(candidateCutPoints.size()==0) return -1;
// Initially, the best cut point is the first one
int posMin = ((Integer)candidateCutPoints.elementAt(0)).intValue();
double distMin = computeDistanceNewPartition(attribute, values, begin, end, posCutPoints, posMin);
// Check if there is a cut point better than the current best point selected
for(int i=1,size=candidateCutPoints.size();i<size;i++) {
int pos=((Integer)candidateCutPoints.elementAt(i)).intValue();
double dist=computeDistanceNewPartition(attribute, values, begin, end, posCutPoints, pos);
if(dist < distMin) {
distMin = dist;
posMin = pos;
}
}
// Return the best cut point found
return posMin;
}
/**
* <p>
* Computes the distance (Mantaras criteria) for a given discretization including the new cut point
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param posCutPoints Discretization proposed for the attribute containing the position of the
* selected cut points
* @param posNewCutPoint Position of the cut point that belongs to the current discretization whose
* distance is computed
* @return distance for a given discretization including a new cut point
*/
private double computeDistanceNewPartition(int attribute, int []values, int begin, int end, Vector posCutPoints, int posNewCutPoint) {
double entC, entD, jointEnt;
Vector cd;
int numValues;
Vector posCutPointsExtended = new Vector (posCutPoints);
// Compute the entropy associated to the class distribution
cd = classDistribution (attribute, values, begin, end);
numValues = sumValues(cd);
entC = computeEntropy (cd, numValues);
// Compute the entropy associated to the discretization distribution
posCutPointsExtended.addElement(new Integer (posNewCutPoint));
Collections.sort(posCutPointsExtended.subList(0,posCutPointsExtended.size()));
cd = discretizationDistribution (attribute, values, begin, end, posCutPointsExtended);
numValues = sumValues(cd);
entD = computeEntropy (cd, numValues);
// Compute the entropy associated to the joint distribution
jointEnt = computeJointEntropy (attribute, values, begin, end, posCutPointsExtended);
// Return the distance
return (2 - ((entC+entD)/jointEnt));
}
/**
* <p>
* Computes the joint entropy for a given discretization
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param Discretization Discretization proposed for the attribute containing the position of the
* selected cut points
* @return the entropy value corresponding to the joint class distribution of the discretization
*/
private double computeJointEntropy (int attribute, int [] values, int begin, int end, Vector Discretization) {
Vector cd;
int numValues;
cd = jointClassDistribution (attribute, values, begin, end, Discretization);
numValues = sumValues (cd);
return computeEntropy (cd, numValues);
}
/**
* <p>
* Computes the Shannon entropy for a set of values
* </p>
* @param v Set of values whose entropy is computed
* @param numValues Total number of values whose entropy is computed
* @return the Shannon entropy of the set of values
*/
private double computeEntropy(Vector v, int numValues) {
double ent=0;
for(int i=0,size=v.size();i<size;i++) {
double prob=((Integer)v.elementAt(i)).intValue();
// This is done to avoid computing invalid log values
if (prob != 0) {
prob/=(double)numValues;
ent+=prob*Math.log(prob)/Math.log(2);
}
}
return -ent;
}
/**
* <p>
* Adds up the integer values stored in a vector
* </p>
* @param v Vector whose integer values are going to be added
* @return sum of the addition of all integer values in the vector
*/
private int sumValues(Vector v) {
int sum=0;
for(int i=0,size=v.size();i<size;i++) {
sum+=((Integer)v.elementAt(i)).intValue();
}
return sum;
}
/**
* <p>
* Obtains a vector of all the possible cut points for the attribute
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @return a vector with all the possible cut points for the attribute
*/
private Vector getCandidateCutPoints(int attribute,int []values,int begin,int end) {
Vector cutPoints = new Vector();
double valueAnt=realValues[attribute][values[begin]];
// Add all the values different from its previous value
for(int i=begin;i<=end;i++) {
double val=realValues[attribute][values[i]];
if(val!=valueAnt) cutPoints.addElement(new Integer(i));
valueAnt=val;
}
return cutPoints;
}
/**
* <p>
* Obtains the class distribution of the data
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @return the class distribution of the data
*/
private Vector classDistribution(int attribute,int []values,int begin,int end) {
int []classCount = new int[Parameters.numClasses];
for(int i=0;i<Parameters.numClasses;i++) classCount[i]=0;
for(int i=begin;i<=end;i++) classCount[classOfInstances[values[i]]]++;
Vector res= new Vector();
for(int i=0;i<Parameters.numClasses;i++) {
if(classCount[i]>0) res.addElement(new Integer(classCount[i]));
}
return res;
}
/**
* <p>
* Obtains the distribution of the data given a current discretization
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param posCutPoints Discretization proposed for the attribute containing the position of the
* selected cut points
* @return the distribution of the data given a current discretization
*/
private Vector discretizationDistribution(int attribute, int []values, int begin, int end, Vector posCutPoints) {
int []distributionCount = new int[posCutPoints.size()+1];
distributionCount[0] = ((Integer)posCutPoints.elementAt(0)).intValue() - begin;
for (int i=1; i<posCutPoints.size(); i++) {
distributionCount[i] = ((Integer)posCutPoints.elementAt(i)).intValue() - ((Integer)posCutPoints.elementAt(i-1)).intValue();
}
distributionCount[posCutPoints.size()] = end - ((Integer)posCutPoints.elementAt(posCutPoints.size()-1)).intValue() + 1;
Vector res= new Vector();
for(int i=0;i<posCutPoints.size()+1;i++) {
if(distributionCount[i]>0) res.addElement(new Integer(distributionCount[i]));
}
return res;
}
/**
* <p>
* Obtains a joint distribution of the data given a current discretization and the class the data
* belongs to
* </p>
* @param attribute Position of the attribute in the list of attributes whose best real values
* for discretization are going to be selected
* @param values Position of the corresponding attribute value in the real values matrix,
* ordered by attribute value
* @param begin First value that is considered to belong to the data considered, usually 0
* @param end Last value that is considered to belong to the data considered, usually the last
* value of the dataset
* @param posCutPoints Discretization proposed for the attribute containing the position of the
* selected cut points
* @return a joint distribution depending on the discretization and the class data
*/
private Vector jointClassDistribution(int attribute, int []values, int begin, int end, Vector posCutPoints) {
int []jointClassCount = new int[Parameters.numClasses*(posCutPoints.size()+1)];
for(int i=0;i<Parameters.numClasses*(posCutPoints.size()+1);i++) jointClassCount[i]=0;
for(int i=begin; i<((Integer)posCutPoints.elementAt(0)).intValue(); i++) {
jointClassCount[classOfInstances[values[i]]]++;
}
for (int i=1; i<posCutPoints.size(); i++) {
for (int j=((Integer)posCutPoints.elementAt(i-1)).intValue(); j<((Integer)posCutPoints.elementAt(i)).intValue(); j++) {
jointClassCount[Parameters.numClasses*i+classOfInstances[values[j]]]++;
}
}
for(int i=((Integer)posCutPoints.elementAt(posCutPoints.size()-1)).intValue(); i<=end; i++) {
jointClassCount[Parameters.numClasses*posCutPoints.size()+classOfInstances[values[i]]]++;
}
Vector res= new Vector();
for(int i=0;i<Parameters.numClasses*(posCutPoints.size()+1);i++) {
res.addElement(new Integer(jointClassCount[i]));
}
return res;
}
}