/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Discretizers.CADD_Discretizer;
import java.util.*;
import keel.Algorithms.Discretizers.Basic.*;
import keel.Algorithms.Genetic_Rule_Learning.Globals.*;
/**
* <p>
* This class implements the CADD discretizer.
* </p>
*
* @author Written by Salvador García (University of Granada - Granada) 04/01/2008
* @author Modified by Xavi Solé (La Salle, Ramón Llull University - Barcelona) 03/12/2008
* @version 1.1
* @since JDK1.5
*/
public class CADDDiscretizer extends Discretizer {

    /** Width at which the bisection in {@link #critchi(double, double)} stops. */
    private static final double CHI_EPSILON = 0.000001;

    /** Upper end of the chi-square search range used by {@link #critchi(double, double)}. */
    private static final double CHI_MAX = 99999.0;

    /** log(sqrt(pi)). */
    private static final double LOG_SQRT_PI = 0.5723649429247000870717135;

    /** 1 / sqrt(pi). */
    private static final double I_SQRT_PI = 0.5641895835477562869480795;

    /** Largest |z| for which {@link #poz(double)} evaluates its polynomials. */
    private static final double Z_MAX = 6.0;

    /**
     * Magnitude limit for exponents: {@link #ex(double)} returns 0 below {@code -BIGX}
     * and {@link #pochisq(double, double)} switches to log-space sums above it.
     */
    double BIGX = 20.0;

    /** Probability handed to {@link #critchi(double, double)} when testing adjacent intervals. */
    double confidenceThreshold;

    /**
     * Requested number of intervals. When positive it may raise the initial number of
     * intervals, and the pruning step stops once only {@code numIntervals - 1} cut points remain.
     */
    int numIntervals;

    /**
     * Builder
     * @param _conf Confidence threshold
     * @param _nint Number of intervals
     */
    public CADDDiscretizer (double _conf, int _nint) {
        confidenceThreshold = _conf;
        numIntervals = _nint;
    }

    /**
     * Computes the cut points of one attribute in three steps: an equal-frequency
     * initial partition, a local search that moves single cut points to neighbouring
     * values while the fitness improves, and a pruning pass over adjacent intervals.
     *
     * NOTE(review): the first step indexes {@code values} from 0, not from {@code begin};
     * presumably callers always pass {@code begin == 0} - confirm against the base class.
     *
     * @param attribute index of the attribute in {@code realValues}
     * @param values instance indices; adjacent entries are compared by attribute value,
     *               so they are presumably sorted by that attribute (verify against caller)
     * @param begin first position of {@code values} to use (inclusive)
     * @param end last position of {@code values} to use (inclusive)
     * @return a {@code Vector} of {@code Double} cut points
     */
    protected Vector discretizeAttribute(int attribute,int []values,int begin,int end) {
        int numInstances = end - begin + 1;
        int numInt = numInstances / (3 * Parameters.numClasses);
        if (numIntervals > 0 && numInt < numIntervals && numIntervals <= numInstances) {
            numInt = numIntervals;
        }

        /*First step: Uniform Frequency discretizer with fixed num. intervals*/
        Vector <Double> cp = uniformFrequencyCutPoints(attribute, values, begin, end, numInt);

        // Work buffers are sized for the initial partition; later partitions never have more cut points.
        int quanta[][] = new int[Parameters.numClasses][cp.size()+1];
        int sumaAbajo[] = new int[cp.size()+1];
        int sumaDerecha[] = new int[Parameters.numClasses];
        int total[] = new int[1];
        int ordenados[] = new int[numInstances];
        for (int i = begin, j = 0; i <= end; i++, j++) {
            ordenados[j] = values[i];
        }

        /*Second step: Local Search changing the cut points*/
        cp = refineCutPoints(cp, quanta, sumaAbajo, sumaDerecha, total, ordenados, attribute);

        // The local search may leave coincident cut points; keep only strictly increasing ones.
        for (int i = 1; i < cp.size(); i++) {
            if (cp.elementAt(i-1).doubleValue() >= cp.elementAt(i).doubleValue()) {
                cp.remove(i);
                i--;
            }
        }

        /*Third step: remove intervals which are statistically independent*/
        pruneCutPoints(cp, quanta, sumaAbajo, sumaDerecha, total, ordenados, attribute);
        return cp;
    }

    /**
     * Builds the initial cut points by walking through {@code values} in steps of
     * {@code (end - begin + 1) / numInt} instances. When a boundary falls inside a run
     * of equal values it is moved to the nearer end of the run (ties are broken with
     * {@code Rand.getReal()}); the walk stops when the run cannot be left on either side.
     */
    private Vector <Double> uniformFrequencyCutPoints(int attribute, int []values, int begin, int end, int numInt) {
        Vector <Double> cp = new Vector <Double>();
        // Sentinel run length meaning "no different value exists in this direction".
        int unreachable = end - begin + 2;
        double quota = (end - begin + 1) / (double) numInt;
        double dBound = 0.0;
        int oldBound = 0;
        boolean saCabo = false;

        for (int i = 0; i < numInt - 1 && !saCabo; i++) {
            dBound += quota;
            int iBound = (int) Math.round(dBound);
            if (iBound <= oldBound) {
                continue;
            }
            if (realValues[attribute][values[iBound-1]] != realValues[attribute][values[iBound]]) {
                double cutPoint = realValues[attribute][values[iBound-1]];
                cp.addElement(Double.valueOf(cutPoint));
            } else {
                double val = realValues[attribute][values[iBound]];

                // Distance to the first different value after the boundary.
                int numFW = 1;
                while (iBound + numFW <= end && realValues[attribute][values[iBound + numFW]] == val) {
                    numFW++;
                }
                if (iBound + numFW > end) {
                    numFW = unreachable;
                }

                // Distance to the first different value before the boundary (not crossing the previous one).
                int numBW = 1;
                while (iBound - numBW > oldBound && realValues[attribute][values[iBound - numBW]] == val) {
                    numBW++;
                }
                if (iBound - numBW == oldBound) {
                    numBW = unreachable;
                }

                if (numFW < numBW) {
                    iBound += numFW;
                } else if (numBW < numFW) {
                    iBound -= numBW;
                } else {
                    if (numFW == unreachable) {
                        saCabo = true;
                    }
                    // The random draw happens even when finishing, as in the original sequence of Rand calls.
                    if (Rand.getReal() < 0.5) {
                        iBound += numFW;
                    } else {
                        iBound -= numBW;
                        iBound++;
                    }
                }
                if (!saCabo) {
                    double cutPoint = realValues[attribute][values[iBound-1]];
                    cp.addElement(Double.valueOf(cutPoint));
                }
            }
            oldBound = iBound;
        }
        return cp;
    }

    /**
     * Hill climbing over the cut points: in every round each cut point is tried one value
     * lower and one value higher, and the best strictly improving neighbour (if any)
     * becomes the new current solution. Stops when no neighbour improves the fitness.
     */
    private Vector <Double> refineCutPoints(Vector <Double> cp, int quanta[][], int sumaAbajo[], int sumaDerecha[], int total[], int ordenados[], int attribute) {
        Vector <Double> mejorCP = new Vector <Double>(cp);
        boolean improved = true;
        while (improved) {
            improved = false;
            double mejorFitness = evaluate(cp, quanta, sumaAbajo, sumaDerecha, total, ordenados, attribute);
            for (int i = 0; i < cp.size(); i++) {
                // Direction 0 moves the cut point down, direction 1 moves it up (same order as before).
                for (int direction = 0; direction < 2; direction++) {
                    Vector <Double> cpTmp = cambiaIntervalo(cp, ordenados, attribute, i, direction == 1);
                    double fitness = evaluate(cpTmp, quanta, sumaAbajo, sumaDerecha, total, ordenados, attribute);
                    if (fitness > mejorFitness) {
                        mejorFitness = fitness;
                        mejorCP = new Vector <Double>(cpTmp);
                        improved = true;
                    }
                }
            }
            cp = new Vector <Double>(mejorCP);
        }
        return cp;
    }

    /** Rebuilds the contingency table for {@code cutPoints} and returns its fitness. */
    private double evaluate(Vector <Double> cutPoints, int quanta[][], int sumaAbajo[], int sumaDerecha[], int total[], int ordenados[], int attribute) {
        construyeQuanta(quanta, sumaAbajo, sumaDerecha, total, cutPoints, ordenados, attribute);
        return computeFitness(quanta, sumaAbajo, sumaDerecha, total[0]);
    }

    /**
     * Repeatedly removes the first cut point whose two neighbouring intervals satisfy
     * {@code computeRCA >= computeTest}, rebuilding the table after each removal, until
     * no cut point qualifies or only {@code numIntervals - 1} cut points are left.
     *
     * NOTE(review): the cut point is dropped when the ratio REACHES the threshold;
     * confirm this direction of the comparison against the CADD paper.
     */
    private void pruneCutPoints(Vector <Double> cp, int quanta[][], int sumaAbajo[], int sumaDerecha[], int total[], int ordenados[], int attribute) {
        boolean removed = true;
        while (removed && cp.size() > (numIntervals-1)) {
            removed = false;
            construyeQuanta(quanta, sumaAbajo, sumaDerecha, total, cp, ordenados, attribute);
            for (int i = 0; i < cp.size(); i++) {
                double partialRCA = computeRCA(quanta, sumaAbajo, i);
                double test = computeTest(quanta, sumaAbajo, i);
                if (partialRCA >= test) {
                    cp.remove(i);
                    removed = true;
                    break;
                }
            }
        }
    }

    /**
     * Fills the class-by-interval contingency table and its marginals.
     *
     * @param quanta output, {@code quanta[c][k]} = instances of class c in interval k
     * @param sumaAbajo output, column sums (instances per interval)
     * @param sumaDerecha output, row sums (instances per class)
     * @param total output, {@code total[0]} = sum of all cells
     * @param cutPoints cut points in non-decreasing order; a value {@code >=} a cut point
     *                  belongs to the interval on its right
     * @param ordenados instance indices, visited in order (expected sorted by the attribute)
     * @param attribute index of the attribute in {@code realValues}
     */
    private void construyeQuanta (int quanta[][], int sumaAbajo[], int sumaDerecha[], int total[], Vector <Double> cutPoints, int ordenados[], int attribute) {
        for (int i = 0; i < quanta.length; i++) {
            for (int j = 0; j < quanta[i].length; j++) {
                quanta[i][j] = 0;
                sumaAbajo[j] = 0;
            }
            sumaDerecha[i] = 0;
        }
        total[0] = 0;

        int intervalo = 0;
        for (int k = 0; k < ordenados.length; k++) {
            double value = realValues[attribute][ordenados[k]];
            // Skip every cut point the value has reached. A single step is not enough when
            // two cut points coincide: the instance would be counted in a zero-width interval.
            while (intervalo < cutPoints.size() && value >= cutPoints.elementAt(intervalo)) {
                intervalo++;
            }
            quanta[classOfInstances[ordenados[k]]][intervalo]++;
        }

        for (int i = 0; i < quanta.length; i++) {
            for (int j = 0; j < quanta[i].length; j++) {
                sumaAbajo[j] += quanta[i][j];
                sumaDerecha[i] += quanta[i][j];
                total[0] += quanta[i][j];
            }
        }
    }

    /**
     * Fitness of a whole partition: mutual information between class and interval divided
     * by their joint entropy, both computed from the contingency table. The result is NaN
     * when the joint entropy is 0.
     */
    private double computeFitness (int quanta[][], int sumaAbajo[], int sumaDerecha[], int total) {
        int lastColumn = (quanta.length == 0) ? -1 : quanta[0].length - 1;
        return redundancy(quanta, sumaAbajo, sumaDerecha, total, 0, lastColumn);
    }

    /**
     * Same ratio as {@link #computeFitness}, restricted to the two adjacent intervals
     * {@code intervalo} and {@code intervalo + 1}; class sums and total are taken over
     * those two columns only.
     */
    private double computeRCA (int quanta[][], int sumaAbajo[], int intervalo) {
        int sumaDerecha[] = new int[Parameters.numClasses];
        int total = sumaAbajo[intervalo] + sumaAbajo[intervalo+1];
        for (int i = 0; i < quanta.length; i++) {
            for (int j = intervalo; j <= intervalo+1; j++) {
                sumaDerecha[i] += quanta[i][j];
            }
        }
        return redundancy(quanta, sumaAbajo, sumaDerecha, total, intervalo, intervalo + 1);
    }

    /**
     * Threshold compared against {@link #computeRCA}: the chi-square value returned by
     * {@code critchi(confidenceThreshold, numClasses - 1)} divided by
     * {@code 2 * total * H}, where total and the joint entropy H are taken over the
     * intervals {@code intervalo} and {@code intervalo + 1}.
     */
    private double computeTest (int quanta[][], int sumaAbajo[], int intervalo) {
        int total = sumaAbajo[intervalo] + sumaAbajo[intervalo+1];
        double HCA = jointEntropy(quanta, total, intervalo, intervalo + 1);
        return critchi(confidenceThreshold, Parameters.numClasses-1) / (2 * total * HCA);
    }

    /**
     * Mutual information divided by joint entropy over the columns
     * {@code firstColumn..lastColumn} of the table. Empty cells are skipped.
     */
    private double redundancy (int quanta[][], int columnSums[], int rowSums[], int total, int firstColumn, int lastColumn) {
        double ICA = 0;
        for (int i = 0; i < quanta.length; i++) {
            for (int j = firstColumn; j <= lastColumn; j++) {
                if (quanta[i][j] > 0) {
                    double pJoint = (double)quanta[i][j] / (double)total;
                    double pClass = (double)rowSums[i] / (double)total;
                    double pInterval = (double)columnSums[j] / (double)total;
                    ICA += pJoint * log2(pJoint / (pClass * pInterval));
                }
            }
        }
        return ICA / jointEntropy(quanta, total, firstColumn, lastColumn);
    }

    /** Joint entropy (base 2) of the cells in columns {@code firstColumn..lastColumn}. */
    private double jointEntropy (int quanta[][], int total, int firstColumn, int lastColumn) {
        double sum = 0;
        for (int i = 0; i < quanta.length; i++) {
            for (int j = firstColumn; j <= lastColumn; j++) {
                if (quanta[i][j] > 0) {
                    double pJoint = (double)quanta[i][j] / (double)total;
                    sum += pJoint * log2(pJoint);
                }
            }
        }
        return -1.0 * sum;
    }

    /**
     * Returns a copy of {@code cp} in which the cut point at position {@code intervalo}
     * is replaced by a neighbouring attribute value taken from {@code ordenados}.
     *
     * @param cp current cut points (not modified)
     * @param ordenados instance indices, scanned from the start
     * @param attribute index of the attribute in {@code realValues}
     * @param intervalo position of the cut point to move
     * @param sentido {@code true}: first value greater than the cut point (or the last
     *                instance if none); {@code false}: the value just before the first
     *                one that is not smaller than the cut point
     * @return the modified copy
     */
    private Vector <Double> cambiaIntervalo (Vector <Double> cp, int ordenados[], int attribute, int intervalo, boolean sentido) {
        Vector <Double> res = new Vector <Double>(cp);
        if (intervalo < 0 || intervalo >= cp.size()) {
            return res;
        }
        double v = cp.elementAt(intervalo);

        // First position whose value is not below the cut point.
        int j = 0;
        while (j < ordenados.length && realValues[attribute][ordenados[j]] < v) {
            j++;
        }
        if (sentido) {
            // Move past the run of values equal to the cut point, clamping to the last instance.
            while (j < ordenados.length && realValues[attribute][ordenados[j]] == v) {
                j++;
            }
            if (j == ordenados.length) {
                j--;
            }
        } else if (j > 0) {
            j--;
        }
        double moved = realValues[attribute][ordenados[j]];
        res.set(intervalo, Double.valueOf(moved));
        return res;
    }

    /** Base-2 logarithm. */
    private double log2 (double x) {
        return Math.log(x) / Math.log(2);
    }

    /**
     * Finds, by bisection on {@code [0, CHI_MAX]}, the chi-square value whose
     * {@link #pochisq(double, double)} probability is {@code p}.
     *
     * @param p target probability; {@code p <= 0} yields {@code CHI_MAX}, {@code p >= 1} yields 0
     * @param df degrees of freedom
     * @return the chi-square value, accurate to about {@code CHI_EPSILON}
     */
    private double critchi(double p, double df) {
        if (p <= 0.0) {
            return CHI_MAX;
        }
        if (p >= 1.0) {
            return 0.0;
        }
        double minchisq = 0.0;
        double maxchisq = CHI_MAX;
        double chisqval = df / Math.sqrt(p); /* fair first value */
        while ((maxchisq - minchisq) > CHI_EPSILON) {
            if (pochisq(chisqval, df) < p) {
                maxchisq = chisqval;
            } else {
                minchisq = chisqval;
            }
            chisqval = (maxchisq + minchisq) * 0.5;
        }
        return chisqval;
    }

    /**
     * Upper-tail probability of the chi-square value {@code x} with {@code df} degrees
     * of freedom (1.0 for {@code x <= 0} or {@code df < 1}).
     *
     * @param x chi-square value
     * @param df degrees of freedom; only its integer part decides between the even and odd series
     * @return the probability
     */
    private double pochisq(double x, double df) {
        if (x <= 0.0 || df < 1) {
            return 1.0;
        }
        double a = 0.5 * x;
        // Parity of the degrees of freedom selects the series. The former test
        // "df % 1 == 1" could never be true, so odd df was always treated as even.
        boolean even = (((long) df) % 2 == 0);
        double y = 0.0;
        if (df > 1) {
            y = ex(-a);
        }
        double s = (even ? y : (2.0 * poz(-Math.sqrt(x))));
        if (df <= 2) {
            return s;
        }

        double limit = 0.5 * (df - 1.0);
        double z = (even ? 1.0 : 0.5);
        if (a > BIGX) {
            // Large argument: accumulate the terms through their logarithms.
            double e = (even ? 0.0 : LOG_SQRT_PI);
            double c = Math.log(a);
            while (z <= limit) {
                e = Math.log(z) + e;
                s += ex(c * z - a - e);
                z += 1.0;
            }
            return s;
        }
        double e = (even ? 1.0 : (I_SQRT_PI / Math.sqrt(a)));
        double c = 0.0;
        while (z <= limit) {
            e = e * (a / z);
            c = c + e;
            z += 1.0;
        }
        return c * y + s;
    }

    /** {@code Math.exp(x)}, except that arguments below {@code -BIGX} give exactly 0. */
    private double ex(double x) {
        return (x < -BIGX) ? 0.0 : Math.exp(x);
    }

    /**
     * Polynomial approximation of the standard normal cumulative probability at {@code z}
     * (0.5 at {@code z == 0}; saturates to 0 or 1 once {@code |z| >= Z_MAX}).
     */
    private double poz(double z) {
        double x;
        if (z == 0.0) {
            x = 0.0;
        } else {
            double y = 0.5 * Math.abs(z);
            if (y >= (Z_MAX * 0.5)) {
                x = 1.0;
            } else if (y < 1.0) {
                double w = y * y;
                x = ((((((((0.000124818987 * w
                    - 0.001075204047) * w + 0.005198775019) * w
                    - 0.019198292004) * w + 0.059054035642) * w
                    - 0.151968751364) * w + 0.319152932694) * w
                    - 0.531923007300) * w + 0.797884560593) * y * 2.0;
            } else {
                y -= 2.0;
                x = (((((((((((((-0.000045255659 * y
                    + 0.000152529290) * y - 0.000019538132) * y
                    - 0.000676904986) * y + 0.001390604284) * y
                    - 0.000794620820) * y - 0.002034254874) * y
                    + 0.006549791214) * y - 0.010557625006) * y
                    + 0.011630447319) * y - 0.009279453341) * y
                    + 0.005353579108) * y - 0.002141268741) * y
                    + 0.000535310849) * y + 0.999936657524;
            }
        }
        return z > 0.0 ? ((x + 1.0) * 0.5) : ((1.0 - x) * 0.5);
    }
}