/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Discretizers.MVD;
import keel.Algorithms.Genetic_Rule_Learning.Globals.*;
import keel.Algorithms.Discretizers.Basic.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Vector;
import keel.Dataset.*;
/**
* <p>
* This class implements the MVD discretizer
* </p>
*
* <p>
* @author Written by Julian Luengo (SCI2S research group, DECSAI in ETSIIT, University of Granada), 18/04/2011
* @version 1.0
* @since JDK1.6
* </p>
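*
* <p>
* A minimal usage sketch (the readSet call and the file name are illustrative
* assumptions based on the keel.Dataset API; the KEEL global Parameters are
* assumed to be initialized when the default number of basic intervals is used):
* </p>
* <pre>
* InstanceSet train = new InstanceSet();
* train.readSet("train.dat", true);   // load a KEEL-format training file
* MVD mvd = new MVD(train, 0, 0.05);  // default basic intervals, alpha = 0.05
* mvd.discretizeAllAttributes();      // compute cutpoints for every numeric attribute
* </pre>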
*/
public class MVD extends Discretizer {
// tags
static final int LEFT = 0;
static final int RIGHT = 1;
// instance variables
private int numInstances; // number of instances
private int numAttributes; // number of attributes
private int numClasses;
private Vector[] cutpoints; // cutpoints of each continuous attribute
ArrayList<Integer> indexContinuousAtt;
protected int numBasicIntervals;
private InstanceSet is;
private double alpha;
protected int min = -1;
protected int max = -1;
int majoritaryGroup;
//******************************************************************************************************
/**
* <p>
* Returns the vector of cutpoints computed by discretizeAllAttributes for the given attribute
* </p>
* @param attribute index of the attribute to discretize
* @param values not used
* @param begin not used
* @param end not used
* @return vector with the computed cutpoints
*/
protected Vector discretizeAttribute(int attribute, int []values, int begin, int end){
return cutpoints[attribute];
}
//******************************************************************************************************
/**
* <p>
* Constructor of the class
* </p>
* @param _is set of instances
* @param _numBasicIntervals number of basic intervals for the initial equal-frequency partition (a non-positive value selects a default heuristic)
* @param _alpha significance level used in the chi-square merging test
*/
public MVD(InstanceSet _is, int _numBasicIntervals,double _alpha){
Attribute at;
int numContinuous = 0;
// initialize parameters
//Randomize.setSeed(Parameters.seed);
is = _is;
numInstances = _is.getNumInstances(); // number of instances
numAttributes = Attributes.getInputNumAttributes(); // number of attributes
numClasses = Attributes.getOutputAttribute(0).getNumNominalValues();
if (_numBasicIntervals > 0)
numBasicIntervals = _numBasicIntervals;
else //default heuristic: the larger of numInstances/100 and the number of classes
numBasicIntervals = Math.max(Parameters.numInstances / 100, Parameters.numClasses);
alpha = _alpha;
//create the index array of numeric attributes
indexContinuousAtt = new ArrayList<Integer>();
for(int i=0;i<numAttributes;i++){
at = Attributes.getInputAttribute(i);
if(at.getType() != Attribute.NOMINAL){
indexContinuousAtt.add(i);
}
}
}
//******************************************************************************************************
/**
* <p>
* Computes the cutpoints for each continuous variable
* </p>
*/
public void discretizeAllAttributes(){
Instance inst;
Attribute at;
Interval x,y,z;
int currentInterval,previouscp, index;
double cp,delta,sup_x,sup_y;
boolean eligibleMergingCP,distributionDifference,significantDifference;
ArrayList<Double> data;
ArrayList<Integer> indices;
Vector<ArrayList<Double>> cutpointsEdim = new Vector<ArrayList<Double>>(indexContinuousAtt.size());
Vector<ArrayList<Double>> continuousValues = new Vector<ArrayList<Double>>(indexContinuousAtt.size());
Vector<ArrayList<Interval>> intervalsEdim = new Vector<ArrayList<Interval>>(indexContinuousAtt.size());
ArrayList<Interval> allIntervals = new ArrayList<Interval>();
int instancesPerClass[] = new int[numClasses];
ArrayList<Interval> intervals;
int contingencyTable[][];
Chi2 chi2 = new Chi2();
int numIntervals = 0;
for(int j=0;j<indexContinuousAtt.size();j++){
continuousValues.add(new ArrayList<Double>());
cutpointsEdim.add(new ArrayList<Double>());
intervalsEdim.add(new ArrayList<Interval>());
}
//1) Create a fine partition of all continuous attributes
for(int i=0;i<is.getNumInstances();i++){
inst = is.getInstance(i);
for(int j=0;j<indexContinuousAtt.size();j++){
continuousValues.get(j).add(inst.getAllInputValues()[indexContinuousAtt.get(j)]);
}
instancesPerClass[(int)inst.getAllOutputValues()[0]]++;
}
//sort the values in order so we can get the initial cutpoints
for(int j=0;j<continuousValues.size();j++){
Collections.sort(continuousValues.get(j));
}
//use an equal-frequency discretization as the initial partitioning
for(int i = 0 ; i < continuousValues.size() ; i++){
cutpointsEdim.set(i, uniformFrequencyCutpoints(numBasicIntervals,continuousValues.get(i), i));
}
//2) iteratively select two adjacent intervals X and Y that have the minimum combined support and do not have a known
//discretization boundary between them as candidates for merging, for each continuous attribute
//first, build up the intervals from the cutpoints
for(int i=0;i<cutpointsEdim.size();i++){
data = cutpointsEdim.get(i);
//Dynamically convert the cutpoints to intervals
intervals = new ArrayList<Interval>();
//leftmost interval is unbounded below
intervals.add(new Interval(Double.NEGATIVE_INFINITY,data.get(0),indexContinuousAtt.get(i)));
for(int k=1;k<data.size();k++){
intervals.add(new Interval(data.get(k-1),data.get(k),indexContinuousAtt.get(i)));
}
//rightmost interval is unbounded above
intervals.add(new Interval(data.get(data.size()-1),Double.POSITIVE_INFINITY,indexContinuousAtt.get(i)));
intervalsEdim.set(i, intervals);
allIntervals.addAll(intervals);
numIntervals += intervals.size();
}
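//each continuous attribute is now covered by k+1 intervals (for k cutpoints) spanning the
//whole real line; allIntervals aggregates the intervals of every continuous attribute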
//evaluate the instances covered by each interval
for(int i=0;i<is.getNumInstances();i++){
inst = is.getInstance(i);
for(int j=0;j<intervalsEdim.size();j++){
intervals = intervalsEdim.get(j);
for(int k=0;k<intervals.size();k++){
x = intervals.get(k);
if(x.covers(inst))
x.addToCoveredInstances(i);
}
}
}
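//each interval now stores the indices of the instances it covers; these index
//sets feed the contingency tables used by the merging test below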
//now initiate the merging process
for(int i=0;i<intervalsEdim.size();i++){
currentInterval = 0;
previouscp = -1;
eligibleMergingCP = true;
intervals = intervalsEdim.get(i);
while(eligibleMergingCP && currentInterval < intervals.size()-1 && intervals.size() >= 2){
x = intervals.get(currentInterval);
y = intervals.get(currentInterval+1);
contingencyTable = new int[2][allIntervals.size()-2];
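//contingency table layout: one column per interval z other than x and y;
//row 0 counts the instances of z that also fall in x or y, row 1 those that do not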
for(int j=0,t=0;j<allIntervals.size();j++){
z = allIntervals.get(j);
if(!z.equals(x) && !z.equals(y)){
indices = z.getCoveredInstances();
for(int k=0;k<indices.size();k++){
index = indices.get(k);
if(x.covers(index) || y.covers(index)){
contingencyTable[0][t]++;
}else{
contingencyTable[1][t]++;
}
}
t++;
}
}
sup_x = x.support()/(double)numInstances;
sup_y = y.support()/(double)numInstances;
//check the difference between distributions
delta = 0.01*numInstances/Math.min(sup_x, sup_y);
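//delta scales inversely with the smaller relative support, so sparsely populated
//intervals require a larger count difference before they are kept apart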
distributionDifference = maximumSupportDifference(contingencyTable) >= delta;
//if the supports appear different by the previous criterion, check whether the difference is statistically significant
if(distributionDifference){
double chiValue = chiSquare(contingencyTable);
//alpha is divided by 4 and by the current number of intervals, an adjustment for the many tests performed
significantDifference = chi2.critchi(chiValue, contingencyTable[0].length-1)<((alpha/4.0)/(double)numIntervals);
}else
significantDifference = false;
//if either criterion fails to indicate a real difference, the two intervals can be merged
if(!distributionDifference || !significantDifference){
x.mergeIntervals(y);
intervals.remove(y);
numIntervals--;
if(currentInterval > 0)
currentInterval--;
}
else{
previouscp = currentInterval;
currentInterval++;
}
}
}
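//translate the surviving intervals into final cutpoints: each cutpoint is the
//midpoint between the upper bound of an interval and the lower bound of the next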
cutpoints = new Vector[numAttributes];
for(int i=0,j=0;i<numAttributes;i++){
at = Attributes.getInputAttribute(i);
if(at.getType() != Attribute.NOMINAL){
cutpoints[i] = new Vector();
intervals = intervalsEdim.get(j);
x = intervals.get(0);
for(int k=1;k<intervals.size();k++){
y = intervals.get(k);
cutpoints[i].add((x.upperbound+y.lowerbound)/2.0);
x = y;
}
j++;
}
}
}
//******************************************************************************************************
/**
* <p>
* It calculates the cutpoints with uniform (equal) frequency, skipping duplicate values
* </p>
* @param k number of cutpoints to compute
* @param data sorted values of the attribute
* @param att index of the attribute (currently not used)
* @return the cutpoints
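* <p>
* For example, with 12 sorted values and k = 3, a candidate cutpoint is taken
* every 12/(3+1) = 3 values; a candidate equal to the previous cutpoint is skipped.
* </p>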
*/
private ArrayList<Double> uniformFrequencyCutpoints(int k, ArrayList<Double> data, int att){
ArrayList<Double> cutpoints = new ArrayList<Double>(k);
int instInter = (int)((double)data.size()/(double)(k+1));
int numcp = 0, cont = 0;
for(int i = 0 ; i < data.size() && numcp<k; ++i){
cont++;
if(cont >= instInter){
if(cutpoints.size()==0){
cont = 0;
cutpoints.add(numcp++, data.get(i));
}else if(data.get(i).doubleValue()!=cutpoints.get(cutpoints.size()-1).doubleValue()){
cont = 0;
cutpoints.add(numcp++, data.get(i));
}
}
}
return cutpoints;
}
/**
* Computes the expected count of a cell of the contingency table under independence
* @param contingencyTable the observed counts
* @param i the index of the row
* @param j the index of the column
* @return the expected count for cell (i, j)
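* <p>
* E[i][j] = rowSum(i) * colSum(j) / numInstances, which matches the standard
* formula when the table entries sum to the number of instances. For example,
* with the 2x2 table {{10, 20}, {30, 40}} and numInstances = 100,
* E[0][0] = (10+20)*(10+30)/100 = 12.
* </p>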
*/
protected double expectedValue(int contingencyTable[][],int i, int j){
double expected,aux;
aux = 0;
for(int ii=0;ii<contingencyTable[i].length;ii++){
aux += contingencyTable[i][ii];
}
expected = aux;
aux = 0;
for(int ii=0;ii<contingencyTable.length;ii++){
aux += contingencyTable[ii][j];
}
expected *= aux;
expected /= numInstances;
return expected;
}
/**
* Obtains the chi-square value of this node using the given contingency table
* @param contingencyTable the observed counts
* @return the chi-square value
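* <p>
* chi^2 is the sum over all cells of (observed - expected)^2 / expected; with
* the 2x2 example above, cell (0,0) contributes (10-12)^2/12 = 0.33.
* </p>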
*/
public double chiSquare(int contingencyTable[][]){
double chi = 0;
double expected;
for(int i=0;i<contingencyTable.length;i++){
for(int j=0;j<contingencyTable[i].length;j++){
expected = expectedValue(contingencyTable, i, j);
chi += Math.pow(contingencyTable[i][j]-expected, 2)/expected;
}
}
return chi;
}
/**
* Gets the maximum support difference
* @param contingencyTable the observed counts
* @return the difference between the maximum and minimum support (counts in the first row) of the contingency table
*/
public int maximumSupportDifference(int contingencyTable[][]){
computeMaximumAndMinimumSupport(contingencyTable);
return (max-min);
}
/**
* Finds the maximum and minimum supports of all groups (the counts in the first row of the contingency table)
* @param contingencyTable the observed counts
*/
protected void computeMaximumAndMinimumSupport(int contingencyTable[][]){
max = Integer.MIN_VALUE;
min = Integer.MAX_VALUE;
for(int i=0;i<contingencyTable[0].length;i++){
if(max < contingencyTable[0][i]){
max = contingencyTable[0][i];
majoritaryGroup = i;
}
if(min > contingencyTable[0][i])
min = contingencyTable[0][i];
}
}
}