/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Discretizers.Khiops;

import java.util.ArrayList;
import java.util.Collections;
import java.util.PriorityQueue;
import java.util.Vector;

import keel.Algorithms.Discretizers.Basic.Discretizer;
import keel.Algorithms.Preprocess.Missing_Values.EventCovering.Stat.*;
import keel.Dataset.Attributes;

/**
 * Khiops Discretizer
 * 
 * Implemented by Julián Luengo, March 2010
 * julianlm@decsai.ugr.es
 * 
 * Based on the work of Marc Boullé:
 * 
 * M. Boullé.
 * Khiops: A Statistical Discretization Method of Continuous Attributes.
 * Machine Learning 55:1 (2004) 53-69
 * 
 * <p>
 * @author Written by Julián Luengo Martín 18/03/2010
 * @version 0.2
 * @since JDK 1.5
 * </p>
 */
public class Khiops extends Discretizer {

	int numClasses;
	long freqConstraint;
	ArrayList<Double> chi2Rows;
	int nj[];

	public Khiops(){
		numClasses = Attributes.getOutputAttribute(0).getNumNominalValues();
		chi2Rows = new ArrayList<Double>();
		freqConstraint = 5;
	}

	@Override
	protected Vector discretizeAttribute(int attribute, int[] values, int begin, int end) {
		ArrayList<Double> substr,intA,intB;
		Vector cp,tmp,bestcp;
		double actualChi2,bestConfidenceLevel,confidenceLevel,discCostVariation,rightChi2;
		DeltaValue variation,nextInt;
		int iter;
		PriorityQueue<DeltaValue> deltas;
		ArrayList<DeltaValue> pts;
		boolean improvement,control,allMetFreqConstraint;

		freqConstraint = Math.round(Math.max(5, Math.sqrt(end+1)));
		cp = new Vector();

		//create the initial discretization, with as many intervals as there are different values
		substr = new ArrayList<Double>(1);
		substr.add(realValues[attribute][values[0]]);
		for(int i=1;i<=end;i++){
			if(realValues[attribute][values[i]]!=substr.get(substr.size()-1)){
				cp.add(substr);
				substr = new ArrayList<Double>(1);
			}
			substr.add(realValues[attribute][values[i]]);
		}
		cp.add(substr);

		//now compute the confidence level of this initial discretization
		//and initialize the chi2 rows (one for each initial interval)
		bestConfidenceLevel = chi2RowValues(cp,values);
		bestConfidenceLevel = 1.0 - StatFunc.chiSquare(bestConfidenceLevel, (cp.size()-1)*(numClasses-1));

		//compute the delta-values related to all the possible merges
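		//Each candidate merge of two adjacent intervals is represented by a DeltaValue
		//holding both intervals, their current chi2 row contributions, the chi2 variation
		//(delta) that the merge would produce, and prev/next pointers to the neighbouring
		//candidates. The candidates are stored in a priority queue in reverse (descending)
		//order, so the merge with the highest delta, i.e. the one that degrades the global
		//chi2 value the least, is always at the top.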
		deltas = new PriorityQueue<DeltaValue>(end,Collections.reverseOrder());
		pts = new ArrayList<DeltaValue>();
		for(int i=0;i<cp.size()-1;i++){
			variation = new DeltaValue();
			variation.leftInterval = (ArrayList<Double>)cp.get(i);
			variation.rightInterval = (ArrayList<Double>)cp.get(i+1);
			variation.leftChi2Row = chi2Rows.get(i);
			variation.rightChi2Row = chi2Rows.get(i+1);
			if(variation.leftInterval.size()>=freqConstraint && variation.rightInterval.size()>=freqConstraint)
				variation.freqConstrMet = true;
			else
				variation.freqConstrMet = false;
			variation.index = i;
			if(i!=0){
				//build the linked list of pointers between adjacent candidate merges
				variation.prev = pts.get(pts.size()-1);
				variation.prev.next = variation;
			}
			variation.delta = mergeCostVariation(variation.leftInterval,i,variation.leftChi2Row,variation.rightInterval,i+1,variation.rightChi2Row,values);
			deltas.add(variation);
			pts.add(variation);
		}
		//sorting the possible merges in ascending order is not needed, since we use a priority queue

		//Now we optimize the initial discretization
		actualChi2 = 0;
		iter = 0;
		improvement = true;
		bestcp = new Vector();
		for(int i=0;i<cp.size();i++){
			substr = (ArrayList<Double>) cp.get(i);
			bestcp.add(substr.clone());
		}
		allMetFreqConstraint = false;
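		//Greedy bottom-up optimization: repeatedly poll the best candidate merge, apply it,
		//update the deltas of the two neighbouring candidates, and recompute the confidence
		//level of the resulting discretization. The best partition found so far is kept in
		//bestcp. Merges are forced while some interval violates the minimum frequency
		//constraint; afterwards they are only kept while the confidence level improves.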
		while(deltas.size()>0 && (!allMetFreqConstraint || improvement) ){
			//check for the best merge
			variation = deltas.poll(); //take the first item, i.e. the one with the highest delta chi2 value
			//do the intervals involved in this merge meet the minimum frequency constraint?
			allMetFreqConstraint = variation.freqConstrMet;

			intA = variation.leftInterval;
			intB = variation.rightInterval;
			//merge the intervals
			// index = cp.indexOf(intA); //for debugging purposes
			intA.addAll(intB);
			//remove interval B from the list of intervals (its contents are now in interval A)
			intB.clear(); //clear the no longer needed interval so that its (now empty) content is unique
			//(there are no empty intervals by definition) and the remove() call below cannot
			//confuse it with another interval and erase the wrong one
			control = cp.remove(intB);
			variation.leftChi2Row = variation.delta + variation.leftChi2Row + variation.rightChi2Row;
			control = chi2Rows.remove(variation.rightChi2Row);

			//update the list references
			nextInt = variation.next;
			if(nextInt!=null){
				variation.next = nextInt.next; //skip the removed interval B and point to the next interval C
				variation.rightInterval = nextInt.rightInterval;
				variation.rightChi2Row = nextInt.rightChi2Row;
				if(variation.next != null){
					//it is not the last interval in the list:
					//update the next interval's previous pointer to the new merged interval
					variation.next.prev = variation;
				}
			}
			//remove the merge of interval B with its subsequent interval from both
			//the priority queue and the pointer list
			control = deltas.remove(nextInt);
			control = pts.remove(nextInt);

			//compute the cost variation of the two intervals adjacent to the merge:
			//with the next one
			if(variation.rightInterval.size()!=0){
				variation.delta = mergeCostVariation(variation.leftInterval,variation.index,variation.leftChi2Row,variation.rightInterval,variation.index+variation.leftInterval.size(),variation.rightChi2Row,values);
				if(variation.leftInterval.size()>=freqConstraint && variation.rightInterval.size()>=freqConstraint)
					variation.freqConstrMet = true;
				//re-insert into the queue so this item gets re-ordered
				//(no need to remove it first: it was already extracted by poll() at the beginning)
				deltas.add(variation);
			}
			//with the previous one
			if(variation.prev != null){
				variation.prev.delta = mergeCostVariation(variation.prev.leftInterval,variation.prev.index,variation.prev.leftChi2Row,variation.leftInterval,variation.index,variation.leftChi2Row,values);
				if(variation.prev.leftInterval.size()>=freqConstraint && variation.prev.rightInterval.size()>=freqConstraint)
					variation.prev.freqConstrMet = true;
				//extract and re-insert into the queue so this item gets re-ordered
				control = deltas.remove(variation.prev);
				deltas.add(variation.prev);
			}
			if(variation.rightInterval.size()==0){
				deltas.remove(variation);
			}

			//recompute the chi2 value and the confidence level of the current discretization
			actualChi2 = 0;
			for(int i=0;i<chi2Rows.size();i++){
				actualChi2 += chi2Rows.get(i);
			}
			confidenceLevel = 1.0 - StatFunc.chiSquare(actualChi2, (cp.size()-1)*(numClasses-1));

			//the new discretization scheme is accepted if it decreases the confidence level
			//or if it has merged one or two intervals with fewer than "freqConstraint" elements
			if(confidenceLevel < bestConfidenceLevel || !allMetFreqConstraint){
				bestConfidenceLevel = confidenceLevel;
				bestcp = new Vector();
				for(int i=0;i<cp.size();i++){
					substr = (ArrayList<Double>) cp.get(i);
					bestcp.add(substr.clone());
				}
				improvement = true;
			}
			else
				improvement = false;

			//check whether all intervals meet the minimum frequency constraint, that is,
			//whether the merge at the top of the queue meets it (merges involving intervals
			//with fewer elements than the constraint are always on top)
			if(deltas.size()>0)
				allMetFreqConstraint = deltas.peek().freqConstrMet;
		}

		//return the best set of intervals
		return createCP(bestcp);
	}

	/**
	 * Computes the cost (chi square variation) derived from merging two adjacent intervals na and nb.
	 * @param na Left interval of the candidate merge
	 * @param indexna Rank of the first element of na in the whole sorted list of real values
	 * @param naChi2 Current chi square contribution (row value) of interval na
	 * @param nb Right interval of the candidate merge
	 * @param indexnb Rank of the first element of nb in the whole sorted list of real values
	 * @param nbChi2 Current chi square contribution (row value) of interval nb
	 * @param values Array whose position i holds the index of the instance whose explanatory (real) value has rank i after sorting
	 * @return The cost variation produced by the merge operation
	 */
	public double mergeCostVariation(ArrayList<Double> na,int indexna, double naChi2, ArrayList<Double> nb,int indexnb, double nbChi2,int values[]){
		double cost,newRowChi2;

		ArrayList<Double> merge = new ArrayList<Double>(na);
		merge.addAll(nb);

		newRowChi2 = mergedRowChi2Value(merge,indexna,values);
		cost = newRowChi2 - naChi2 - nbChi2;

		return cost;
	}

	/**
	 * Computes the chi square value of the initial discretization scheme.
	 * It also initializes the contribution of each interval to this value, so that
	 * merges of intervals can be quickly evaluated.
	 * @param disc the initial discretization scheme (one interval for each different value)
	 * @param values the global array of values (sorted)
	 * @return the chi square value for the initial discretization scheme
	 */
	public double chi2RowValues(Vector disc,int values[]){
		int n,I,J;
		int ni[];
		int nij[][];
		ArrayList<Double> interval;
		double chi2Value,eij;

		n = 0;
		I = disc.size();
		J = numClasses;
		ni = new int[I];
		nj = new int[J];
		nij = new int[I][J];
		chi2Rows = new ArrayList<Double>(I);

		//count the instances per interval (ni), per class (nj) and per interval and class (nij)
		for(int i=0,m=0;i<I;i++){
			interval = (ArrayList<Double>)disc.get(i);
			ni[i] = interval.size();
			n += ni[i];
			for(int j=0;j<ni[i];j++,m++){
				nj[classOfInstances[values[m]]]++;
				nij[i][classOfInstances[values[m]]]++;
			}
		}

		//TODO - optimize, as there are only intervals with one element (the second "for" could be avoided)
		chi2Value = 0;
		for(int i=0;i<I;i++){
			chi2Rows.add(0.0);
			for(int j=0;j<J;j++){
				eij = (double)(ni[i] * nj[j])/(n);	//expected count under class/interval independence
				chi2Rows.set(i,chi2Rows.get(i)+Math.pow(nij[i][j]-eij,2)/(double)eij);
			}
			chi2Value += chi2Rows.get(i);
		}

		return chi2Value;
	}
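	/**
	 * Illustrative sketch (not part of the original algorithm and not called by the
	 * discretizer): computes the chi square statistic of a complete contingency table
	 * in one go, which is the quantity that chi2RowValues accumulates row by row,
	 * using the expected counts eij = ni * nj / n. The method name is a hypothetical
	 * helper added only to make the statistic explicit.
	 * @param table contingency table: table[i][j] = number of instances of class j in interval i
	 * @return the chi square statistic of the table
	 */
	static double contingencyChi2(int table[][]){
		int numIntervals = table.length;
		int numCls = table[0].length;
		int rowTotals[] = new int[numIntervals];	//instances per interval (ni)
		int colTotals[] = new int[numCls];			//instances per class (nj)
		int total = 0;								//total number of instances (n)

		for(int i=0;i<numIntervals;i++){
			for(int j=0;j<numCls;j++){
				rowTotals[i] += table[i][j];
				colTotals[j] += table[i][j];
				total += table[i][j];
			}
		}

		double chi2 = 0.0;
		for(int i=0;i<numIntervals;i++){
			for(int j=0;j<numCls;j++){
				//expected count under class/interval independence
				double eij = (double)(rowTotals[i] * colTotals[j]) / total;
				chi2 += Math.pow(table[i][j]-eij,2) / eij;
			}
		}
		return chi2;
	}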
	/**
	 * Calculates the contribution to the global chi square value of a new interval
	 * (produced by merging two adjacent ones).
	 * @param mergedInterval the new interval
	 * @param index the rank of the first (left-most) element of the interval in the global array of values
	 * @param values the global array of values
	 * @return the contribution of this new interval to the global chi square value
	 */
	public double mergedRowChi2Value(ArrayList<Double> mergedInterval,int index,int values[]){
		int n,J;
		int ni,nij[];
		double rowChi2Value,eij;

		n = values.length;
		J = numClasses;
		ni = mergedInterval.size();
		nij = new int[J];

		for(int j=0;j<ni;j++){
			nij[classOfInstances[values[index+j]]]++;
		}

		rowChi2Value = 0;
		for(int j=0;j<J;j++){
			eij = (double)(ni * nj[j])/(n);
			rowChi2Value += Math.pow(nij[j]-eij,2)/(double)eij;
		}

		return rowChi2Value;
	}

	/**
	 * Constructs an array of cut points from the set of intervals.
	 * @param intervals Vector which contains the intervals in ArrayList<Double> format
	 * @return A Vector with double formatted cut points, each computed as the midpoint between the last value of an interval and the first value of the next one.
	 */
	public Vector createCP(Vector intervals){
		double cutPoint;
		Vector cp;
		ArrayList<Double> substr;

		cp = new Vector();
		for(int i=0;i<intervals.size()-1;i++){
			substr = (ArrayList<Double>)intervals.get(i);
			cutPoint = substr.get(substr.size()-1);
			substr = (ArrayList<Double>)intervals.get(i+1);
			cutPoint += substr.get(0);
			cutPoint /= 2.0;
			// if(cutPoint != substr.get(0))
			cp.add(new Double(cutPoint));
		}
		return cp;
	}
}
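/*
 * Standalone sketch, not part of the original KEEL sources: it illustrates, on a
 * hypothetical toy contingency table, the merge cost variation evaluated by
 * mergeCostVariation above (chi2 contribution of the merged row minus the
 * contributions of the two original rows). The class name and the counts below
 * are assumptions chosen only for demonstration; as in mergedRowChi2Value, the
 * global class totals and the grand total stay fixed when two rows are merged.
 */
class KhiopsMergeSketch {

	//chi2 contribution of a single interval (row): sum over j of (nij - eij)^2 / eij,
	//with eij = ni * nj / n
	static double rowChi2(int nij[], int nj[], int n){
		int ni = 0;
		for(int j=0;j<nij.length;j++)
			ni += nij[j];
		double chi2 = 0.0;
		for(int j=0;j<nij.length;j++){
			double eij = (double)(ni * nj[j]) / n;
			chi2 += Math.pow(nij[j]-eij,2) / eij;
		}
		return chi2;
	}

	public static void main(String[] args){
		int nj[] = {11, 9};		//global class totals
		int n = 20;				//total number of instances
		int rowA[] = {8, 2};	//class counts of the left interval
		int rowB[] = {3, 7};	//class counts of the right interval
		int merged[] = {11, 9};	//class counts of the merged interval (rowA + rowB)

		double delta = rowChi2(merged, nj, n) - rowChi2(rowA, nj, n) - rowChi2(rowB, nj, n);
		//candidate merges are ranked by this delta; the one with the highest value
		//(the merge that lowers the global chi2 value the least) is polled first
		System.out.println("merge cost variation = " + delta);
	}
}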