/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Discretizers.HellingerBD; import java.util.*; import keel.Algorithms.Discretizers.Basic.*; import keel.Algorithms.Genetic_Rule_Learning.Globals.Parameters; import keel.Dataset.Attribute; import keel.Dataset.Attributes; /** * <p> * This class implements the HellingerBD discretizer * </p> * * @author Written by Jose A. Saez (University of Granada), 12/21/2009 * @version 1.0 * @since JDK1.6 */ public class HellingerBD extends Discretizer { private double[] cutpoints; // possible cutpoints private double[] classProb; // probability of occurrence of each distinct class private int[] selected; // selected cutpoints private int numInstances; // total number of instances private double[] entropyIntrvl; // entropy of each interval private int[] numInterAtt; //****************************************************************************************************** /** * <p> * Constructor of the class * </p> */ public HellingerBD(){ int i; numInterAtt = new int[Parameters.numAttributes]; if(Parameters.setConfig){ for(i = 0 ; i < Parameters.numAttributes ; ++i){ Attribute att = Attributes.getAttribute(i); if(att.getType() == Attribute.REAL || att.getType() == Attribute.INTEGER) if (Parameters.numIntervals > 0) numInterAtt[i] = Parameters.numIntervals; else numInterAtt[i] = (Parameters.numInstances / (100)) > Parameters.numClasses?Parameters.numInstances / (100):Parameters.numClasses; // default case else numInterAtt[i] = 0; } } else{ String[] inter = Parameters.numIntrvls.split("_"); int cont = 0; for(i = 0 ; i < Parameters.numAttributes ; ++i){ Attribute att = Attributes.getAttribute(i); if(att.getType() == Attribute.REAL || att.getType() == Attribute.INTEGER) numInterAtt[i] = Integer.parseInt(inter[cont++]); // default case else numInterAtt[i] = 0; } } } //****************************************************************************************************** /** * <p> * It returns a vector with the discretized values * </p> * @param attribute index of the attribute to discretize * @param values vector of the indexes of the instances sorted from the lowest to the highest value of attribute * @param begin index of the instance with the lowest value of attribute * @param end index of the instance with the highest value of attribute * @return vector with the discretized values */ protected Vector discretizeAttribute(int attribute, int []values, int begin, int end){ int i; numInstances = realValues[attribute].length; //number of instances // 1) Form a set of all distinct values in ascending order double[] valuesNoRepeated = new double[end+1]; int size = 0; double value = realValues[attribute][values[begin]]; valuesNoRepeated[size++] = value; for(i = begin+1 ; i <= end ; ++i){ if(value != realValues[attribute][values[i]]){ valuesNoRepeated[size++] = realValues[attribute][values[i]]; value = realValues[attribute][values[i]]; } } // 2) Calculate the midpoints of all the adjacent pairs in the set int numcp = size+1; // midpoints + minimun + maximun cutpoints = new double[numcp]; cutpoints[0] = realValues[attribute][values[begin]]; cutpoints[numcp-1] = realValues[attribute][values[end]]; for(i = 1 ; i < numcp-1 ; ++i) cutpoints[i] = (valuesNoRepeated[i-1]+valuesNoRepeated[i])/2; // 3) Compute the probability of each class classProb = new double[Parameters.numClasses]; for(i = 0 ; i < Parameters.numClasses ; ++i) classProb[i] = 0; for(i = 0 ; i < numInstances ; ++i) classProb[classOfInstances[i]]++; for(i = 0 ; i < Parameters.numClasses ; ++i) classProb[i] /= numInstances; // 4) Compute the entropy of each interval int numIntervals = numcp-1; entropyIntrvl = new double[numIntervals]; for(i = 0 ; i < numIntervals ; ++i) entropyIntrvl[i] = intervalEntropy(i, attribute); // 5) Compute the entropy for each cutpoint double[] entropyCutp = new double[numcp]; entropyCutp[0] = entropyCutp[numcp-1] = (-1)*Double.MIN_VALUE; for(i = 1 ; i < numcp-1 ; ++i) entropyCutp[i] = cutpointEntropy(i); // ... and sort these values int[] positions = Quicksort.sort(entropyCutp, numcp, Quicksort.LOWEST_FIRST); // 6) Lastly, repeat and quit the first size-maxIntervals cutpoints selected = new int[numcp]; for(i = 0 ; i < numcp ; ++i) selected[i] = 1; for(i = 0 ; i < numcp-(numInterAtt[attribute]-1) ; ++i) selected[positions[i]] = 0; // 7) return the selected cutpoints Vector cp = new Vector(); for(i = 0 ; i < numcp ; ++i) if(selected[i] == 1) cp.add(cutpoints[i]); return cp; } //****************************************************************************************************** /** * <p> * It computes the interval entropy * </p> * @param interval index of the interval * @param attribute index of the attribute to discretize * @return the entropy of the interval */ public double intervalEntropy(int interval, int attribute){ int i, numInst = 0; double total = 0; double bottomBound = cutpoints[interval]; double topBound = cutpoints[interval+1]; // compute the probability of each distinct class into the interval double[] probInterval = new double[Parameters.numClasses]; for(i = 0 ; i < Parameters.numClasses ; ++i) probInterval[i] = 0; for(i = 0 ; i < numInstances ; ++i){ // if it is the last interval if(interval == entropyIntrvl.length-1){ if(realValues[attribute][i] >= bottomBound){ probInterval[classOfInstances[i]]++; numInst++; } } else if(realValues[attribute][i] >= bottomBound && realValues[attribute][i] < topBound){ probInterval[classOfInstances[i]]++; numInst++; } } for(i = 0 ; i < Parameters.numClasses ; ++i) probInterval[i] /= numInst; // compute the entropy of the interval for(i = 0 ; i < Parameters.numClasses ; ++i) total += Math.pow((Math.sqrt(classProb[i]) - Math.sqrt(probInterval[i])), 2); return Math.sqrt(total); } //****************************************************************************************************** /** * <p> * It computes the cutpoint entropy * </p> * @param cutp index of the cutpoint * @return the entropy of the cutpoint */ public double cutpointEntropy(int cutp){ return Math.abs(entropyIntrvl[cutp-1] - entropyIntrvl[cutp]); } }