/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Discretizers.Zeta_Discretizer; import java.util.*; import keel.Algorithms.Discretizers.Basic.*; import keel.Algorithms.Genetic_Rule_Learning.Globals.*; /** * <p> * This is the class with the operations of the Zeta based discretization. It adopts the * behavior of the general discretizers and specifies its differences in this class, * that has to extend the abstract methods. * </p> * * @author Written by Victoria Lopez Morales (University of Granada) 06/12/2009 * @version 1.0 * @since JDK1.5 */ public class ZetaDiscretizer extends Discretizer { /** * <p> * Selects, for a given attribute, the real values that best discretize the attribute * according to the Zeta based discretizer * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @return a vector with the real values that best discretize the attribute given according to * the Zeta based discretizer */ protected Vector discretizeAttribute(int attribute, int []values, int begin, int end) { Vector classes = new Vector (); Vector cutPoints; // First, add all the classes to the classes vector for (int i=0; i<Parameters.numClasses; i++) classes.addElement(i); // Obtain all the cut points by dichotomising using the zeta measure cutPoints = dichotomiseZeta (attribute, values, begin, end, classes); // Return the obtained cutpoints return cutPoints; } /** * <p> * Dichotomise the data using the zeta measure, obtaining the cut points in a recursively process * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @param classes A vector containing the current available classes for dichotomise * @return a vector with all the values obtained that best divides classes */ private Vector dichotomiseZeta (int attribute, int []values, int begin, int end, Vector classes) { int posMax; double zetaMax; boolean []maskMax = new boolean[classes.size()]; Vector cutPoints = new Vector(); // If there is only one class, return no cut points if (classes.size() == 1) return cutPoints; // Obtain all candidate cut points Vector candidateCutPoints = getCandidateCutPoints(attribute,values,begin,end); if(candidateCutPoints.size()==0) return cutPoints; if (classes.size() == 2) { // If we only have two classes, we are in the basic case // Initially, the best cut point is the first one posMax = ((Integer)candidateCutPoints.elementAt(0)).intValue(); zetaMax = computeBasicZeta (attribute, values, begin, posMax, end, classes); // Check if there is a cut point better than the current best point selected for(int i=1,size=candidateCutPoints.size();i<size;i++) { int pos = ((Integer)candidateCutPoints.elementAt(i)).intValue(); double zeta = computeBasicZeta (attribute, values, begin, pos, end, classes); if (zeta > zetaMax) { zetaMax = zeta; posMax = pos; } } // Add the best cut point found, and return that value double cutPoint = (realValues[attribute][values[posMax-1]]+realValues[attribute][values[posMax]])/2.0; cutPoints.addElement(cutPoint); return cutPoints; } else { // We have more than one class, we have to select one class to be separated from the // other classes that are considered to be merged boolean [] mask; mask = new boolean[classes.size()]; // Initially, the best cut point is the first one posMax = ((Integer)candidateCutPoints.elementAt(0)).intValue(); zetaMax = computeZeta (attribute, values, begin, posMax, end, classes, mask); System.arraycopy(mask, 0, maskMax, 0, mask.length); // Check if there is a cut point better than the current best point selected for(int i=1,size=candidateCutPoints.size(); i<size; i++) { int pos = ((Integer)candidateCutPoints.elementAt(i)).intValue(); double zeta = computeZeta (attribute, values, begin, pos, end, classes, mask); if (zeta > zetaMax) { zetaMax = zeta; posMax = pos; System.arraycopy(mask, 0, maskMax, 0, mask.length); } } // The cut point found is added to the cutPoints vector double cutPoint = (realValues[attribute][values[posMax-1]]+realValues[attribute][values[posMax]])/2.0; cutPoints.addElement(cutPoint); // Compute recursively the other cut points boolean found = false; Vector new_classes = new Vector(classes); for (int i=0,size=classes.size(); i<size && !found; i++) { if (!maskMax[i]) { new_classes.remove(i); found = true; } } Vector otherCutPoints = dichotomiseZeta (attribute, values, begin, end, new_classes); cutPoints.addAll(otherCutPoints); // Sort all cut points values before giving the final result Collections.sort(cutPoints.subList(0, cutPoints.size())); return cutPoints; } } /** * <p> * Computes the basic zeta measure, this means, when we have two classes * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param midPoint Middle value that is considered to belong to the data considered, that * separates the data in two parts * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @param classes A vector containing the current available classes for the zeta computation * @return value of the zeta measure */ private double computeBasicZeta (int attribute, int []values, int begin, int midPoint, int end, Vector classes) { if (classes.size() == 2) { int N; int nii, nij; // Obtain the two class distributions for the given values Vector cd_below = classDistributionRestricted (attribute, values, begin, midPoint-1, classes); Vector cd_above = classDistributionRestricted (attribute, values, midPoint, end, classes); N = sumValues(cd_below) + sumValues(cd_above); nii = ((Integer)cd_below.elementAt(0)).intValue() + ((Integer)cd_above.elementAt(1)).intValue(); nij = ((Integer)cd_below.elementAt(1)).intValue() + ((Integer)cd_above.elementAt(0)).intValue(); // Compute the Zeta measure return ((double)(Math.max(nii, nij)))/((double)N); } else { System.err.println("The basic zeta computation is only proposed for k = 2"); System.exit(-1); return -1; } } /** * <p> * Computes the zeta measure, in a general case with more than two classes * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param midPoint Middle value that is considered to belong to the data considered, that * separates the data in two parts * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @param classes A vector containing the current available classes for the zeta computation * @param mask Auxiliar value that contains the class that must be considered separate at the * end of the computation * @return value of the zeta measure */ private double computeZeta (int attribute, int []values, int begin, int midPoint, int end, Vector classes, boolean []mask) { int N; int nii, nij, n_below2, n_above2; boolean []maskMax = new boolean[classes.size()]; double zetaMax; Vector aux; // Compute zeta considering the class alone is the first class maskMax[0] = false; Arrays.fill(maskMax, 1, classes.size(), true); Vector cd_below = classDistributionRestricted (attribute, values, begin, midPoint-1, classes); Vector cd_above = classDistributionRestricted (attribute, values, midPoint, end, classes); N = sumValues(cd_below) + sumValues(cd_above); aux = new Vector (cd_below); aux.remove(0); n_below2 = sumValues(aux); aux = new Vector (cd_above); aux.remove(0); n_above2 = sumValues(aux); nii = ((Integer)cd_below.elementAt(0)).intValue() + n_above2; nij = n_below2 + ((Integer)cd_above.elementAt(0)).intValue(); zetaMax = ((double)(Math.max(nii, nij)))/((double)N); // Compute zeta considering the other classes alone for(int i=1,size=classes.size();i<size;i++) { aux = new Vector (cd_below); aux.remove(i); n_below2 = sumValues(aux); aux = new Vector (cd_above); aux.remove(i); n_above2 = sumValues(aux); nii = ((Integer)cd_below.elementAt(i)).intValue() + n_above2; nij = n_below2 + ((Integer)cd_above.elementAt(i)).intValue(); double zeta = ((double)(Math.max(nii, nij)))/((double)N); if (zeta > zetaMax) { zetaMax = zeta; Arrays.fill(maskMax, true); maskMax[i] = false; } } System.arraycopy(maskMax, 0, mask, 0, maskMax.length); return zetaMax; } /** * <p> * Adds up the integer values stored in a vector * </p> * @param v Vector whose integer values are going to be added * @return sum of the addition of all integer values in the vector */ private int sumValues(Vector v) { int sum=0; for(int i=0,size=v.size();i<size;i++) { sum+=((Integer)v.elementAt(i)).intValue(); } return sum; } /** * <p> * Obtains a vector of all the possible cut points for the attribute * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @return a vector with all the possible cut points for the attribute */ private Vector getCandidateCutPoints(int attribute,int []values,int begin,int end) { Vector cutPoints = new Vector(); double valueAnt=realValues[attribute][values[begin]]; // Add all the values different from its previous value for(int i=begin;i<=end;i++) { double val=realValues[attribute][values[i]]; if(val!=valueAnt) cutPoints.addElement(new Integer(i)); valueAnt=val; } return cutPoints; } /** * <p> * Obtains the class distribution of the data, restricted to the classes that are in the classes param * </p> * @param attribute Position of the attribute in the list of attributes whose best real values * for discretization are going to be selected * @param values Position of the corresponding attribute value in the real values matrix, * ordered by attribute value * @param begin First value that is considered to belong to the data considered, usually 0 * @param end Last value that is considered to belong to the data considered, usually the last * value of the dataset * @param classes A vector containing the current available classes for the class distribution * @return the class distribution of the data of the given classes */ private Vector classDistributionRestricted (int attribute, int []values, int begin, int end, Vector classes) { int item_class; int []classCount = new int[classes.size()]; for(int i=0;i<classes.size();i++) classCount[i]=0; // Count only the frequency of the values belonging to a class in the classes vector for(int i=begin; i<=end; i++) { item_class = classes.indexOf(classOfInstances[values[i]]); if (item_class > -1) { classCount[item_class]++; } } Vector res= new Vector(); for(int i=0;i<classes.size();i++) { res.addElement(new Integer(classCount[i])); } return res; } }