/***********************************************************************

    This file is part of KEEL-software, the Data Mining tool for regression,
    classification, clustering, pattern mining and so on.

    Copyright (C) 2004-2010

    F. Herrera (herrera@decsai.ugr.es)
    L. Sánchez (luciano@uniovi.es)
    J. Alcalá-Fdez (jalcala@decsai.ugr.es)
    S. García (sglopez@ujaen.es)
    A. Fernández (alberto.fernandez@ujaen.es)
    J. Luengo (julianlm@decsai.ugr.es)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Discretizers.ModifiedChi2_Discretizer;

import java.util.*;

/**
 * <p>
 * This class implements the Modified Chi2 discretizer.
 * </p>
 *
 * <p>
 * Starting from one interval per distinct attribute value, adjacent intervals
 * are merged bottom-up for as long as the lowest chi-square statistic among
 * all adjacent pairs stays below a critical value. The cut points are the
 * midpoints between the surviving neighbouring intervals.
 * </p>
 *
 * <p>
 * @author Written by Salvador García (University of Jaén - Jaén) 25/03/2009
 * @version 1.0
 * @since JDK1.5
 * </p>
 */
public class ModifiedChi2Discretizer extends Discretizer {

    /**
     * Creates a discretizer. No state is initialised here.
     */
    public ModifiedChi2Discretizer() {
    }

    /**
     * Computes the initial intervals for an attribute together with the class
     * distribution of each one. Initially every run of equal attribute values
     * forms its own interval.
     *
     * @param attribute index of the attribute being discretized
     * @param values instance indices; positions {@code begin..end} are expected
     *        to be ordered by the attribute value (NOTE(review): ordering is
     *        done by the caller - confirm)
     * @param begin first position of {@code values} to consider (inclusive)
     * @param end last position of {@code values} to consider (inclusive)
     * @return the initial intervals, in the order of {@code values}
     */
    protected Vector <Interval> obtainIntervals (int attribute,int []values,int begin,int end) {
        return mergeEqualValues(attribute, values, begin, end);
    }

    /**
     * Returns the cut points implied by the given intervals without merging
     * any of them.
     *
     * @param attribute index of the attribute being discretized
     * @param values instance indices referenced by the intervals
     * @param intervals intervals to derive the cut points from (not modified)
     * @return a vector of {@code Double} cut points, one per pair of
     *         neighbouring intervals
     */
    protected Vector discretizeAttributePreliminary(int attribute, int []values, Vector <Interval> intervals) {
        return midpointCutPoints(attribute, values, intervals);
    }

    /**
     * Merges adjacent intervals and returns the resulting cut points.
     *
     * <p>
     * On every pass the adjacent pair with the lowest chi-square value is
     * located (the first such pair wins ties). The pair is merged when that
     * value is below {@code critchi(levelSig, c - 1)}, where {@code c} is the
     * number of classes present in the pair; otherwise the process stops.
     * </p>
     *
     * @param attribute index of the attribute being discretized
     * @param values instance indices referenced by the intervals
     * @param intervals intervals to merge; this vector and its elements are
     *        modified in place
     * @param levelSig significance level handed to {@link #critchi(double, int)}
     * @return a vector of {@code Double} cut points for the merged intervals
     */
    protected Vector discretizeAttribute(int attribute,int []values, Vector <Interval> intervals, double levelSig) {
        while (intervals.size() > 1) {
            int posMin = -1;
            double chiMin = 0;
            int minNClasses = 0;

            for (int i = 0; i < intervals.size() - 1; i++) {
                Interval left = intervals.elementAt(i);
                Interval right = intervals.elementAt(i + 1);
                double chi2 = pairChiSquare(left, right);

                // Strict '<' keeps the earliest pair when several share the minimum.
                if (posMin == -1 || chi2 < chiMin) {
                    posMin = i;
                    chiMin = chi2;
                    minNClasses = classesPresentInPair(left, right);
                }
            }

            // With fewer than two classes present the degrees of freedom drop
            // below 1; pochisq then always yields 1.0, so for levelSig < 1
            // critchi converges towards CHI_MAX and the pair gets merged.
            double threshold = critchi(levelSig, minNClasses - 1);
            if (!(chiMin < threshold)) {
                break;
            }

            Interval kept = intervals.elementAt(posMin);
            Interval absorbed = intervals.elementAt(posMin + 1);
            kept.enlargeInterval(absorbed.end);
            intervals.removeElementAt(posMin + 1);
        }

        return midpointCutPoints(attribute, values, intervals);
    }

    /**
     * Builds one cut point per pair of neighbouring intervals: the mean of the
     * attribute value at the end of the left interval and the one at the
     * beginning of the right interval.
     */
    private Vector <Double> midpointCutPoints(int attribute, int []values, Vector <Interval> intervals) {
        Vector <Double> cutPoints = new Vector <Double>();
        for (int i = 0; i < intervals.size() - 1; i++) {
            Interval left = intervals.elementAt(i);
            Interval right = intervals.elementAt(i + 1);
            double lastOfLeft = realValues[attribute][values[left.end]];
            double firstOfRight = realValues[attribute][values[right.begin]];
            cutPoints.addElement(Double.valueOf((lastOfLeft + firstOfRight) / 2.0));
        }
        return cutPoints;
    }

    /**
     * Counts the classes that have at least one instance in either interval.
     */
    private int classesPresentInPair(Interval first, Interval second) {
        int present = 0;
        for (int k = 0; k < first.cd.length; k++) {
            if (first.cd[k] > 0 || second.cd[k] > 0) {
                present++;
            }
        }
        return present;
    }

    /**
     * Computes the chi-square statistic of the 2 x k contingency table formed
     * by the class distributions of two intervals.
     */
    private double pairChiSquare(Interval first, Interval second) {
        int nClasses = first.cd.length;
        double [][]observed = new double[2][nClasses];
        double []rowTotal = new double[2];
        double []colTotal = new double[nClasses];

        for (int k = 0; k < nClasses; k++) {
            observed[0][k] = first.cd[k];
            observed[1][k] = second.cd[k];
            rowTotal[0] += observed[0][k];
            rowTotal[1] += observed[1][k];
            colTotal[k] = observed[0][k] + observed[1][k];
        }
        double total = rowTotal[0] + rowTotal[1];

        double chi2 = 0;
        for (int j = 0; j < 2; j++) {
            for (int k = 0; k < nClasses; k++) {
                double expected = rowTotal[j] * colTotal[k] / total;
                // An empty row or column would give an expected count of zero
                // (or 0/0); a small constant is substituted to avoid dividing
                // by zero.
                if (rowTotal[j] == 0 || colTotal[k] == 0) {
                    expected = 0.1;
                }
                double diff = observed[j][k] - expected;
                chi2 += diff * diff / expected;
            }
        }
        return chi2;
    }

    /**
     * Splits positions {@code begin..end} of {@code values} into maximal runs
     * that share the same attribute value and wraps each run in an
     * {@code Interval}.
     *
     * @param attribute index of the attribute being discretized
     * @param values instance indices
     * @param begin first position to consider (inclusive)
     * @param end last position to consider (inclusive)
     * @return the intervals in positional order; at least one interval is
     *         always returned
     */
    Vector <Interval> mergeEqualValues(int attribute,int []values,int begin,int end) {
        Vector <Interval> intervals = new Vector <Interval> ();
        int runStart = begin;
        double runValue = realValues[attribute][values[begin]];

        for (int i = begin + 1; i <= end; i++) {
            double current = realValues[attribute][values[i]];
            if (current != runValue) {
                // The value changed: close the run that ended at i-1.
                intervals.addElement(new Interval(attribute, values, runStart, i - 1, classOfInstances));
                runStart = i;
                runValue = current;
            }
        }
        // Close the trailing run.
        intervals.addElement(new Interval(attribute, values, runStart, end, classOfInstances));
        return intervals;
    }

    /** Not referenced within this class. */
    final static double Z_EPSILON=0.000001 ;
    /** Magnitude of z at and beyond which {@link #poz_orig(double)} saturates. */
    final static double Z_MAX=6.0 ;

    /**
     * Polynomial approximation that maps a z value into [0, 1]; it returns 0.5
     * for {@code z == 0} and saturates at 0 or 1 once {@code |z| >= Z_MAX}.
     * NOTE(review): presumably the standard normal cumulative probability -
     * confirm against the routine this was ported from.
     *
     * @param z the z value
     * @return the approximated probability
     */
    double poz_orig (double z) {
        double y, x, w;

        if (z == 0.0) {
            x = 0.0;
        } else {
            y = 0.5 * Math.abs (z);
            if (y >= (Z_MAX * 0.5)) {
                x = 1.0;
            } else if (y < 1.0) {
                w = y*y;
                x = ((((((((0.000124818987 * w
                        -0.001075204047) * w +0.005198775019) * w
                        -0.019198292004) * w +0.059054035642) * w
                        -0.151968751364) * w +0.319152932694) * w
                        -0.531923007300) * w +0.797884560593) * y * 2.0;
            } else {
                y -= 2.0;
                x = (((((((((((((-0.000045255659 * y
                        +0.000152529290) * y -0.000019538132) * y
                        -0.000676904986) * y +0.001390604284) * y
                        -0.000794620820) * y -0.002034254874) * y
                        +0.006549791214) * y -0.010557625006) * y
                        +0.011630447319) * y -0.009279453341) * y
                        +0.005353579108) * y -0.002141268741) * y
                        +0.000535310849) * y +0.999936657524;
            }
        }
        return (z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5));
    }

    /** Not referenced within this class except as documentation of accuracy; critchi stops at this bracket width. */
    final static double CHI_EPSILON=0.000001;
    /** Upper end of the search bracket used by {@link #critchi(double, int)}. */
    final static double CHI_MAX=99999.0;
    /** Natural logarithm of the square root of pi. */
    final static double LOG_SQRT_PI=0.5723649429247000870717135;
    /** One over the square root of pi. */
    final static double I_SQRT_PI=0.5641895835477562869480795;
    /** Arguments below {@code -BIGX} make {@link #ex(double)} return 0. */
    final static double BIGX=20.0;

    /**
     * Exponential that is clamped to zero for very negative arguments.
     *
     * @param x the exponent
     * @return 0.0 when {@code x < -BIGX}, otherwise {@code Math.exp(x)}
     */
    double ex(double x) {
        return (((x) < -BIGX) ? 0.0 : Math.exp (x));
    }

    /**
     * Probability associated with a chi-square value. It returns 1.0 for
     * {@code x <= 0} or {@code df < 1}; {@link #critchi(double, int)} treats
     * it as decreasing in {@code x}.
     * NOTE(review): presumably the upper-tail probability - confirm.
     *
     * @param x the chi-square value
     * @param df degrees of freedom
     * @return the probability for {@code x} with {@code df} degrees of freedom
     */
    double pochisq(double x, int df) {
        double a, y=0, s;
        double e, c, z;
        boolean even; /* true if df is an even number */

        if (x <= 0.0 || df < 1) {
            return (1.0);
        }

        a = 0.5 * x;
        even = (2 * (df / 2)) == df;
        if (df > 1) {
            y = ex(-a);
        }
        s = (even ? y : (2.0 * poz_orig(-Math.sqrt(x))));
        if (df > 2) {
            /* from here on x is reused as the upper bound of the series index z */
            x = 0.5 * (df - 1.0);
            z = (even ? 1.0 : 0.5);
            if (a > BIGX) {
                /* large argument: accumulate the terms in log space */
                e = (even ? 0.0 : LOG_SQRT_PI);
                c = Math.log(a);
                while (z <= x) {
                    e = Math.log(z) + e;
                    s += ex(c * z - a - e);
                    z += 1.0;
                }
                return (s);
            } else {
                e = (even ? 1.0 : (I_SQRT_PI / Math.sqrt(a)));
                c = 0.0;
                while (z <= x) {
                    e = e * (a / z);
                    c = c + e;
                    z += 1.0;
                }
                return (c * y + s);
            }
        } else {
            return (s);
        }
    }

    /**
     * Finds, by bisection over [0, CHI_MAX], the chi-square value whose
     * {@link #pochisq(double, int)} probability matches {@code p}.
     *
     * @param p target probability; {@code p <= 0} yields {@code CHI_MAX} and
     *        {@code p >= 1} yields 0
     * @param df degrees of freedom
     * @return the critical chi-square value, located to within
     *         {@code CHI_EPSILON}
     */
    double critchi (double p, int df) {
        double minchisq = 0.0;
        double maxchisq = CHI_MAX;
        double chisqval;

        if (p <= 0.0) {
            return (maxchisq);
        } else if (p >= 1.0) {
            return (0.0);
        }

        chisqval = df / Math.sqrt (p); /* fair first value */
        while (maxchisq - minchisq > CHI_EPSILON) {
            if (pochisq (chisqval, df) < p) {
                maxchisq = chisqval;
            } else {
                minchisq = chisqval;
            }
            chisqval = (maxchisq + minchisq) * 0.5;
        }
        return (chisqval);
    }
}