/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program.  If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Discretizers.Bayesian_Discretizer;

import java.util.*;
import keel.Algorithms.Discretizers.Basic.*;
import keel.Algorithms.Genetic_Rule_Learning.Globals.*;

/**
 * <p>
 * This is the class with the operations of the Bayesian discretization. It adopts the behavior
 * of the general discretizers and specifies its differences in this class, that has to extend
 * the abstract methods.
* * @author Written by Victoria Lopez Morales (University of Granada) 21/12/2009 * @version 1.0 * @since JDK1.5 * </p> */ public class BayesianDiscretizer extends Discretizer { /** * <p> * Selects, for a given attribute, the real values that best discretize the attribute * according to the bayesian discretizer * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @return a vector with the real values that best discretize the attribute given according to * the bayesian discretizer */ protected Vector discretizeAttribute (int attribute, int []values, int begin, int end) { double Pcj; // First we obtain the class distribution, needed to estimate Pcj Vector cd = classDistribution(attribute,values,begin,end); if (cd.size() == 1) return new Vector(); int numValues = sumValues(cd); Vector cutPoints = new Vector(); Vector differentValues = getAttributeDifferentValues(attribute, values, begin, end); Vector frequencyValues = new Vector(); Vector conditionalClassDistribution; Vector f_x; Vector F_x = new Vector(); Vector lead; Vector all_leads = new Vector(); // For all the different values in the data, compute its frequency for (int j=0, size = differentValues.size()-1; j<size; j++) { // Count the number of presences for this value int frequency; frequency = ((Integer)differentValues.elementAt(j+1)).intValue() - ((Integer)differentValues.elementAt(j)).intValue(); frequencyValues.addElement (new Integer(frequency)); } frequencyValues.addElement (new Integer (((end-begin+1)-((Integer)differentValues.elementAt(differentValues.size()-1)).intValue()))); // For each 
class for (int i=0; i<Parameters.numClasses; i++) { // Compute the probability of each class Pcj = (double)(((Integer)cd.elementAt(i)).intValue()+1)/(double)(numValues+2); conditionalClassDistribution = new Vector(); f_x = new Vector(); // First, we compute the distribution for the whole class conditioned by values int frequency; for (int j=0, size = differentValues.size()-1; j<size; j++) { frequency = 0; for (int k=((Integer)differentValues.elementAt(j)).intValue(); k<((Integer)differentValues.elementAt(j+1)).intValue(); k++) { if (classOfInstances[values[k]] == i) { frequency++; } } conditionalClassDistribution.addElement(new Double((double)(frequency)/(double)(((Integer)frequencyValues.elementAt(j)).intValue()))); f_x.addElement(new Double((double)(frequency)/(double)(((Integer)frequencyValues.elementAt(j)).intValue())*Pcj)); } frequency = 0; for (int k=((Integer)differentValues.elementAt(differentValues.size()-1)).intValue(); k<(end-begin+1); k++) { if (classOfInstances[values[k]] == i) { frequency++; } } // Build the fj(x) curve conditionalClassDistribution.addElement(new Double((double)(frequency)/(double)(((Integer)frequencyValues.elementAt(differentValues.size()-1)).intValue()))); f_x.addElement(new Double((double)(frequency)/(double)(((Integer)frequencyValues.elementAt(differentValues.size()-1)).intValue())*Pcj)); F_x.add(f_x); } // Looking at all fj(x) curves, decide the cut points // First, we compute all the leads for (int j=0, size = differentValues.size(); j<size; j++) { lead = leadCurve (F_x, j); all_leads.add(lead); } // Then, we check the lead of each class for (int i=0; i<Parameters.numClasses; i++) { boolean is_leading; // Check the initial situation for the class in terms of leading if (((Vector)all_leads.get(0)).contains(new Integer(i))) { is_leading = true; } else { is_leading = false; } // Check the following situations for the class in terms of leading for (int j=1, size = differentValues.size()-1; j<size; j++) { if 
(((Vector)all_leads.get(j)).contains(new Integer(i))) { // If a class is leading right now and wan't leading before // Add this point as a cut point if (!is_leading) { int posMax = ((Integer)differentValues.elementAt(j)).intValue(); double cutPoint=(realValues[attribute][values[posMax-1]]+realValues[attribute][values[posMax]])/2.0; if (!cutPoints.contains(new Double (cutPoint))) { cutPoints.addElement(new Double(cutPoint)); } is_leading = true; } } else { // If a class is not leading right now and was leading before // Add this point as a cut point if (is_leading) { int posMax = ((Integer)differentValues.elementAt(j)).intValue(); double cutPoint=(realValues[attribute][values[posMax-1]]+realValues[attribute][values[posMax]])/2.0; if (!cutPoints.contains(new Double (cutPoint))) { cutPoints.addElement(new Double(cutPoint)); } is_leading = false; } } } } // Sort all discretization values before giving the final result Collections.sort(cutPoints.subList(0, cutPoints.size())); return cutPoints; } /** * <p> * Obtain the classes of the leading curves at a certain position of the values * </p> * @param function_points A vector of vector that contains the fj(x) curves for each class * @param x_value Position in the function from which we are obtaining the leading curves * @return all the classes that have the higher value of fj(x) at the x_value position */ private Vector leadCurve (Vector function_points, int x_value) { double high_value, value; Vector lead_classes = new Vector(); // The first class is supposed to be the leading class, its value is the hightest and // this class is added to the lead_classes list lead_classes.addElement(new Integer(0)); high_value = ((Double)((Vector)function_points.get(0)).elementAt(x_value)).doubleValue(); // Check the values of the other classes for (int i=1; i<Parameters.numClasses; i++) { value = ((Double)((Vector)function_points.get(i)).elementAt(x_value)).doubleValue(); // If a class has a higher value than the highest obtained value 
// Clear the lead_classes list, add this class to that list and store this highest value if (value > high_value) { lead_classes.clear(); lead_classes.addElement(new Integer(i)); high_value = value; } // If a class has a value as high as the highest obtained value // Add this class to the lead_classes list else if (value == high_value) { lead_classes.addElement(new Integer(i)); } } // Return the lead_classes list return lead_classes; } /** * <p> * Adds up the integer values stored in a vector * </p> * @param v Vector whose integer values are going to be added * @return sum of the addition of all integer values in the vector */ private int sumValues(Vector v) { int sum=0; for(int i=0,size=v.size();i<size;i++) { sum+=((Integer)v.elementAt(i)).intValue(); } return sum; } /** * <p> * Obtains a vector of all the different values for the attribute * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @return a vector with all the possible different values for the attribute */ private Vector getAttributeDifferentValues (int attribute, int []values, int begin, int end) { // Add the first value of the attribute (the attribute is ordered) Vector cutPoints = new Vector(); double valueAnt=realValues[attribute][values[begin]]; cutPoints.addElement(new Integer(begin)); // Add all the values different from its previous value for(int i=begin;i<=end;i++) { double val=realValues[attribute][values[i]]; if(val!=valueAnt) cutPoints.addElement(new Integer(i)); valueAnt=val; } return cutPoints; } /** * <p> * Obtains the class distribution of the data * </p> * @param attribute 
Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @return the class distribution of the data */ private Vector classDistribution(int attribute,int []values,int begin,int end) { int []classCount = new int[Parameters.numClasses]; for(int i=0;i<Parameters.numClasses;i++) classCount[i]=0; for(int i=begin;i<=end;i++) classCount[classOfInstances[values[i]]]++; Vector res= new Vector(); for(int i=0;i<Parameters.numClasses;i++) { res.addElement(new Integer(classCount[i])); } return res; } }