/*********************************************************************** This file is part of KEEL-software, the Data Mining tool for regression, classification, clustering, pattern mining and so on. Copyright (C) 2004-2010 F. Herrera (herrera@decsai.ugr.es) L. S�nchez (luciano@uniovi.es) J. Alcal�-Fdez (jalcala@decsai.ugr.es) S. Garc�a (sglopez@ujaen.es) A. Fern�ndez (alberto.fernandez@ujaen.es) J. Luengo (julianlm@decsai.ugr.es) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/ **********************************************************************/ package keel.Algorithms.Discretizers.DIBD; import java.util.*; import keel.Algorithms.Discretizers.Basic.*; import keel.Algorithms.Genetic_Rule_Learning.Globals.*; import keel.Dataset.Attribute; import keel.Dataset.Attributes; /** * <p> * This class implements the DIBD * </p> * * @author Written by Jose A. Saez (University of Granada), 21/12/2009 * @version 1.0 * @since JDK1.6 */ public class DIBD extends Discretizer { // tags definition static int LEFT = 0; static int RIGHT = 1; static int ALL_CLASSES = -1; // instance variables private double[] cutpoints; // possible cutpoints private int size; // number of possible cutpoints private int[] numInstValue; // number of instances per value private int[][] numInstClass; // number of instances per class and per value private int numInstances; //total number of instances private int[] selected; //cutpoints selected private int[] numInterAtt; // number of intervals of each attribute //****************************************************************************************************** /** * <p> * Constructor of the class * </p> */ public DIBD(){ int i; numInterAtt = new int[Parameters.numAttributes]; if(Parameters.setConfig){ for(i = 0 ; i < Parameters.numAttributes ; ++i){ Attribute att = Attributes.getAttribute(i); if(att.getType() == Attribute.REAL || att.getType() == Attribute.INTEGER) if (Parameters.numIntervals > 0) numInterAtt[i] = Parameters.numIntervals; else numInterAtt[i] = (Parameters.numInstances / (100)) > Parameters.numClasses?Parameters.numInstances / (100):Parameters.numClasses; // default case else numInterAtt[i] = 0; } } else{ String[] inter = Parameters.numIntrvls.split("_"); int cont = 0; for(i = 0 ; i < Parameters.numAttributes ; ++i){ Attribute att = Attributes.getAttribute(i); if(att.getType() == Attribute.REAL || att.getType() == Attribute.INTEGER) numInterAtt[i] = Integer.parseInt(inter[cont++]); // default case else numInterAtt[i] = 0; } } } //****************************************************************************************************** /** * <p> * Returns a vector with the discretized values * </p> * @param attribute number of the attribute * @param values vector of indexes of the instances sorted from lowest to highest * @param begin index of the instance with the lowest value of attribute * @param end index of the instance with the lowest value of attribute * @return vector with the discretized values */ protected Vector discretizeAttribute(int attribute, int []values, int begin, int end){ int i, j; numInstances = realValues[attribute].length; // number of instances // structures initialization cutpoints = new double[end+1]; numInstValue = new int[end+1]; numInstClass = new int[Parameters.numClasses][end+1]; size = 0; for(i = 0 ; i < end+1 ; ++i) numInstValue[i] = 0; for(i = 0 ; i< Parameters.numClasses ; ++i) for(j = 0 ; j < end+1 ; ++j) numInstClass[i][j] = 0; // 1) calculate the distribution numbers double value = realValues[attribute][values[begin]]; cutpoints[size++] = value; numInstValue[size-1]++; numInstClass[classOfInstances[values[begin]]][size-1]++; for(i = begin+1 ; i <= end ; ++i){ if(value!=realValues[attribute][values[i]]){ cutpoints[size++] = realValues[attribute][values[i]]; numInstValue[size-1]++; numInstClass[classOfInstances[values[i]]][size-1]++; value = realValues[attribute][values[i]]; } else{ numInstValue[size-1]++; numInstClass[classOfInstances[values[i]]][size-1]++; } } // 2) calculate dichotomic entropy for each cutpoint and determine the splitting point // set initial values int icn = 1; // interval control number int vstart = 0, vend = 0; // index of start cutpoint and end cutpoint (interval) selected = new int[size]; // splitting point sequence list int pos; // index of minimal entropy double min = 0; // minimal entropy int vstartMin = 0, vendMin = 0; // index of start and end cutpoints of minimal entropy (interval) double ni; // number of instances of the selected interval double ecom, ecomL, ecomR, max, decom; // selected cutpoints initialization for(i = 0 ; i < size ; ++i) selected[i] = 0; selected[0] = selected[size-1] = 1; boolean stopCondition = false; boolean continueLoop = true; do{ pos = -1; // this indicates that the value is the first // determine the splitting point for(i = 0 ; i < size ; ++i){ // if the cutpoint is begin of a interval, keep begin and end of this interval [vstart, vend] if(selected[i] == 1){ continueLoop = true; vstart = i; for(j = i+1 ; j < size && continueLoop ; ++j){ if(selected[j] == 1){ vend = j; continueLoop = false; } } } // otherwise, compute the entropy for this cutpoint else{ ni = computeIntervalNI(ALL_CLASSES, vstart, vend); if(ni == 0) ni = 0.0001; value = (double) (Eleft(i) + Eright(i)) / ni; // if value is lower than min or it is the first value (pos == -1), then update min entropy if(value < min || pos == -1){ min = value; pos = i; vstartMin = vstart; vendMin = vend; } } } // 3) see if the cutpoint enters into the splitting point sequence list if(pos == -1) stopCondition = true; else{ // calculate compound distributional index and compound decrement for the cutpoint of minimal entropy ecom = Ecom(vstartMin, vendMin); ecomL = Ecom(vstartMin, pos); ecomR = Ecom(pos+1, vendMin); max = ecomL>ecomR?ecomL:ecomR; decom = ecom*(ecom-max); // adaptative rule control if( (decom < 0.001) || (icn >= numInterAtt[attribute]) ){ stopCondition = true; } else{ icn++; selected[pos] = 1; // add the splitting point into the splitting point sequence list } } }while(!stopCondition); Vector cp = new Vector(); selected[0] = selected[size-1] = 0; for(i = 0 ; i < size ; ++i) if(selected[i] == 1) cp.add(cutpoints[i]); return cp; } //****************************************************************************************************** /** * <p> * It computes the number of instances of class class_ (or all classes) and attribute value <= or > than * value, according to option * </p> * @param class_ class of instances computed * @param value value to compare * @param option is equal to one tag: LEFT (for <=) or RIGHT (for >) * @return number of instances */ public int computeNI(int class_, int value, int option){ int res = 0; if(option == LEFT){ if(class_ == ALL_CLASSES){ for(int i = 0 ; i <= value ; ++i) res += numInstValue[i]; } else{ for(int i = 0 ; i <= value ; ++i) res += numInstClass[class_][i]; } } if(option == RIGHT){ if(class_ == ALL_CLASSES){ for(int i = value+1 ; i < size ; ++i) res += numInstValue[i]; } else{ for(int i = value+1 ; i < size ; ++i) res += numInstClass[class_][i]; } } return res; } //****************************************************************************************************** /** * <p> * It computes the number of instances with attribute value in the interval [start, end] and class class_ * </p> * @param class_ class of instances computed * @param start begin of the interval * @param end end of the interval * @return number of instances */ public int computeIntervalNI(int class_, int start, int end){ int res = 0; if(class_ == ALL_CLASSES){ for(int i = start ; i <= end ; ++i) res += numInstValue[i]; } else{ for(int i = start ; i <= end ; ++i) res += numInstClass[class_][i]; } return res; } //****************************************************************************************************** /** * <p> * It computes the Left decision distributional index needed to compute the entropy of a cutpoint * </p> * @param value index of the cutpoint studied * @return the value of the index */ public double Eleft(int value){ double res = 0, aux; for(int dk = 0 ; dk < Parameters.numClasses ; ++dk){ aux = computeNI(dk, value, LEFT); if(aux!=0){ double aux2 = (Math.log(aux/computeNI(ALL_CLASSES, value, LEFT)))/ (Math.log(2)); res += (-1)*aux*aux2; } } return res; } //****************************************************************************************************** /** * <p> * It computes the Right decision distributional index needed to compute the entropy of a cutpoint * </p> * @param value index of the cutpoint studied * @return the value of the index */ public double Eright(int value){ double res = 0, aux; for(int dk = 0 ; dk < Parameters.numClasses ; ++dk){ aux = computeNI(dk, value, RIGHT); if(aux!=0){ double aux2 = (Math.log(aux/computeNI(ALL_CLASSES, value, RIGHT)))/ (Math.log(2)); res += (-1)*aux*aux2; } } return res; } //****************************************************************************************************** /** * <p> * It computes the compound distributional index needed to compute the compound decrement of a cutpoint * </p> * @param start begin of the interval * @param end end of the interval * @return the value of the index */ public double Ecom(int start, int end){ double ed = Ed(start, end); double ev = Ev(start, end); double res = (ed - ev) / numInstances; return res; } //****************************************************************************************************** /** * <p> * It computes the decision distributional index needed to compute the compound distributional index (Ecom) * </p> * @param start begin of the interval * @param end end of the interval * @return the value of the index */ public double Ed(int start, int end){ double res = 0, aux, aux2; for(int dk = 0 ; dk < Parameters.numClasses ; ++dk){ aux = computeIntervalNI(dk, start, end); if(aux != 0){ aux2 = ( (Math.log(aux/computeIntervalNI(ALL_CLASSES, start, end))) / (Math.log(2)) ); res += (-1)*aux*aux2; } } return res; } //****************************************************************************************************** /** * <p> * It computes the value distributional index needed to compute the compound distributional index (Ecom) * </p> * @param start begin of the interval * @param end end of the interval * @return the value of the index */ public double Ev(int start, int end){ double res = 0, aux, resto; for(int i = start ; i < end ; ++i){ for(int dk = 0 ; dk < Parameters.numClasses ; ++dk){ aux = numInstClass[dk][i]; if(aux != 0){ resto = ( (Math.log(aux/numInstValue[i])) / (Math.log(2)) ); res += (-1)*aux*resto; } } } return res; } }