/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Discretizers.MODL;
import java.util.*;
import keel.Algorithms.Discretizers.Basic.*;
import keel.Dataset.Attributes;
/**
* MODL discretizer, based on the work of Marc Boullé:
* <p>
* M. Boullé.
* MODL: A Bayes optimal discretization method for continuous attributes.
* Machine Learning 65:1 (2006) 131-165.
* </p>
*
* @author Written by Julián Luengo Martín 07/05/2008
* @version 0.2
* @since JDK 1.5
*/
public class MODL extends Discretizer {
// Number of nominal values of output attribute 0; assigned once in the constructor.
int numClasses;
// Selected strategy code; compared against optimal/greedy in discretizeAttribute.
// Stays at the Java default 0 when the constructor does not recognise the requested name.
int discretizationApplied;
// Strategy code selected by the process type "optimal".
static int optimal = 1;
// Strategy code selected by the process type "greedy".
static int greedy = 2;
// Strategy code selected by the process type "optimized".
static int optimized = 3;
/**
 * Creates a MODL discretizer configured for one of the supported strategies.
 * An unrecognised name leaves the strategy code at its default value, in which
 * case {@code discretizeAttribute} falls through to the post-optimized variant.
 *
 * @param processType strategy name: "optimal", "greedy" or "optimized"
 */
public MODL(String processType){
    if (processType.equals("optimal")) {
        discretizationApplied = optimal;
    } else if (processType.equals("greedy")) {
        discretizationApplied = greedy;
    } else if (processType.equals("optimized")) {
        discretizationApplied = optimized;
    }
    // the class count is taken from the first (index 0) output attribute
    numClasses = Attributes.getOutputAttribute(0).getNumNominalValues();
}
/**
 * Dispatches the discretization of one attribute to the configured MODL variant.
 * Any strategy code other than {@code optimal} or {@code greedy} selects the
 * post-optimized variant.
 *
 * @param attribute the attribute being discretized
 * @param values the values of the attribute
 * @param begin the initial position of the values
 * @param end the final position of the values
 * @return the result of the selected MODL variant
 */
@Override
protected Vector discretizeAttribute(int attribute,int []values,int begin,int end) {
    final Vector cutPoints;
    if (discretizationApplied == optimal) {
        cutPoints = optimalMODL(attribute, values, begin, end);
    } else if (discretizationApplied == greedy) {
        cutPoints = greedyMODL(attribute, values, begin, end);
    } else {
        cutPoints = postOptimizationMODL(attribute, values, begin, end);
    }
    return cutPoints;
}
/**
 * Implements the post-optimization procedure for MODL: starting from the result of
 * {@code exhaustiveMerge}, repeatedly evaluates the Split, MergeSplit and
 * MergeMergeSplit neighbours of every interval and applies the one polled first from
 * the neighbour queue, until no neighbour lowers the cost by more than 1E-5.
 * <p>
 * The number of values {@code n} used in the partition cost is the total number of
 * values held by the intervals (the same quantity {@code modl} uses), not
 * {@code realValues.length}, whose first dimension is indexed by attribute.
 *
 * @param attribute the attribute which is being discretized
 * @param values the values of the attribute
 * @param begin the initial position of the values
 * @param end the final position of the values
 * @return the cut points built by {@code createCP} from the final intervals
 */
protected Vector postOptimizationMODL(int attribute,int []values,int begin,int end) {
    // cost variations closer to zero than this are treated as rounding noise
    final double tolerance = 1E-5;
    // NOTE(review): relies on Neighbour's natural ordering to poll the best (lowest cost) first
    PriorityQueue<Neighbour> bestNeighs = new PriorityQueue<Neighbour>();

    // compute the exhaustive merge
    Vector cp = exhaustiveMerge(attribute, values, begin, end);

    // total number of values; neighbours only move values between intervals, so it is constant
    int n = 0;
    for (int i = 0; i < cp.size(); i++) {
        n += ((ArrayList<Double>) cp.get(i)).size();
    }

    boolean improvement = true;
    while (improvement) {
        improvement = false;
        bestNeighs.clear();
        // position, in the sorted value list, of the first element of interval i
        int index = 0;
        for (int i = 0; i < cp.size(); i++) {
            final int I = cp.size();
            final ArrayList<Double> intA = (ArrayList<Double>) cp.get(i);
            final int step = intA.size();

            // Split neighbour over S(i)
            Neighbour neig = split(intA, index, n, I, values);
            neig.intervalPosition = i;
            neig.type = Neighbour.Split;
            if (neig.cost < -tolerance) {
                bestNeighs.add(neig);
            }

            ArrayList<Double> mergedTwo = null;
            double mergeCost = 0;

            // MergeSplit neighbour over {S(i)}+{S(i+1)}
            if (i < I - 1) {
                ArrayList<Double> intB = (ArrayList<Double>) cp.get(i + 1);
                mergedTwo = new ArrayList<Double>(intA);
                mergedTwo.addAll(intB);
                // partition cost variation
                mergeCost = partitionCost(I - 1, n);
                mergeCost -= partitionCost(I, n);
                // interval cost variation
                mergeCost += intervalCost(mergedTwo, index, values, I);
                mergeCost -= intervalCost(intA, index, values, I);
                mergeCost -= intervalCost(intB, index + intA.size(), values, I);
                // best split of the merged interval
                neig = split(mergedTwo, index, n, I - 1, values);
                neig.intervalPosition = i;
                neig.type = Neighbour.MergeSplit;
                neig.cost += mergeCost;
                if (neig.cost < -tolerance) {
                    bestNeighs.add(neig);
                }
            }

            // MergeMergeSplit neighbour over {S(i)}+{S(i+1)}+{S(i+2)}.
            // i < I-2 implies i < I-1, so mergedTwo and mergeCost already describe the first merge.
            if (i < I - 2) {
                ArrayList<Double> third = (ArrayList<Double>) cp.get(i + 2);
                // a fresh list, so the interval stored in the queued MergeSplit neighbour is not altered
                ArrayList<Double> mergedThree = new ArrayList<Double>(mergedTwo);
                mergedThree.addAll(third);
                // partition cost variation of the second merge
                mergeCost += partitionCost(I - 2, n);
                mergeCost -= partitionCost(I - 1, n);
                // interval cost variation of the second merge
                mergeCost += intervalCost(mergedThree, index, values, I);
                mergeCost -= intervalCost(mergedTwo, index, values, I);
                mergeCost -= intervalCost(third, index + mergedTwo.size(), values, I);
                // best split of the merged interval
                neig = split(mergedThree, index, n, I - 2, values);
                neig.intervalPosition = i;
                neig.type = Neighbour.MergeMergeSplit;
                neig.cost += mergeCost;
                if (neig.cost < -tolerance) {
                    bestNeighs.add(neig);
                }
            }

            // step to the next candidate interval
            index += step;
        }

        if (!bestNeighs.isEmpty()) {
            improvement = true;
            // take the first neighbour of the queue and apply it
            applyNeighbour(cp, bestNeighs.poll());
        }
    }
    // build the cutpoints
    return createCP(cp);
}
/**
 * Searches for the best cut point inside a given interval. Every position that
 * separates two different consecutive values is evaluated, including the one that
 * isolates the last element; the cost of a cut is the variation it produces with
 * respect to leaving the interval whole (negative means the cut is an improvement).
 *
 * @param interv the interval which could be partitioned
 * @param index index of the first element of the interval in the complete sorted value list
 * @param n total number of values
 * @param I current number of intervals
 * @param values mapping between sorted rank and instance number
 * @return a neighbour whose {@code index} is the best cut position, meaning sub-intervals
 *         [0,index) and [index,size), and whose {@code cost} is the associated variation;
 *         when no admissible cut exists, {@code index} is -1 and {@code cost} is
 *         {@code Double.MAX_VALUE}
 */
protected Neighbour split(ArrayList<Double> interv,int index,int n,int I,int values[]){
    final Neighbour neig = new Neighbour();
    // cost of having one more interval in the partition
    final double partitionDelta = partitionCost(I + 1, n) - partitionCost(I, n);
    final double wholeCost = intervalCost(interv, index, values, I);

    double bestCost = Double.MAX_VALUE;
    int splitIndex = -1;
    final int size = interv.size();
    for (int i = 1; i < size; i++) {
        // the cutpoints MUST split different values
        if (interv.get(i - 1).doubleValue() == interv.get(i).doubleValue()) {
            continue;
        }
        ArrayList<Double> left = new ArrayList<Double>(interv.subList(0, i));     // [0,i)
        ArrayList<Double> right = new ArrayList<Double>(interv.subList(i, size)); // [i,size)
        double cost = partitionDelta + intervalCost(left, index, values, I);
        cost += intervalCost(right, index + i, values, I);
        cost -= wholeCost;
        if (cost < bestCost) {
            splitIndex = i;
            bestCost = cost;
        }
    }
    neig.cost = bestCost;
    neig.index = splitIndex;
    neig.interval = interv;
    return neig;
}
/**
 * Applies a neighbour to the current interval set, modifying {@code cp} in place.
 * <ul>
 * <li>Split: the interval at the neighbour position keeps [0,cut) and a new interval
 *     holding [cut,size) is inserted right after it.</li>
 * <li>MergeSplit: the interval and its successor are concatenated and re-cut at the
 *     neighbour's cut position; both slots are replaced.</li>
 * <li>MergeMergeSplit: the interval and its two successors are concatenated and re-cut;
 *     the set shrinks by one interval.</li>
 * </ul>
 * Any other neighbour type leaves the set untouched.
 *
 * @param cp the interval set (a Vector of {@code ArrayList<Double>})
 * @param neig the neighbour (Split, MergeSplit or MergeMergeSplit) we want to apply
 */
public void applyNeighbour(Vector cp, Neighbour neig){
    final int position = neig.intervalPosition;
    final int cutpoint = neig.index;
    final ArrayList<Double> first = (ArrayList<Double>) cp.get(position);

    if (neig.type == Neighbour.Split) {
        ArrayList<Double> tail = new ArrayList<Double>(first.subList(cutpoint, first.size()));
        // drop the tail from the original interval in a single range removal
        first.subList(cutpoint, first.size()).clear();
        cp.insertElementAt(tail, position + 1);
    } else if (neig.type == Neighbour.MergeSplit) {
        ArrayList<Double> merged = new ArrayList<Double>(first);
        merged.addAll((ArrayList<Double>) cp.get(position + 1));
        ArrayList<Double> tail = new ArrayList<Double>(merged.subList(cutpoint, merged.size()));
        merged.subList(cutpoint, merged.size()).clear();
        cp.set(position, merged);
        cp.set(position + 1, tail);
    } else if (neig.type == Neighbour.MergeMergeSplit) {
        ArrayList<Double> merged = new ArrayList<Double>(first);
        merged.addAll((ArrayList<Double>) cp.get(position + 1));
        merged.addAll((ArrayList<Double>) cp.get(position + 2));
        ArrayList<Double> tail = new ArrayList<Double>(merged.subList(cutpoint, merged.size()));
        merged.subList(cutpoint, merged.size()).clear();
        // three intervals collapse into two: remove the third slot, overwrite the first two
        cp.remove(position + 2);
        cp.set(position, merged);
        cp.set(position + 1, tail);
    }
}
/**
 * Computes the partition term of the cost: the logarithm of the binomial
 * coefficient "n+I-1 over I-1".
 *
 * @param I the number of intervals
 * @param n the number of elements
 * @return {@code binomialLog(n+I-1, I-1)}
 */
public double partitionCost(int I,int n){
    final int upper = n + I - 1;
    final int lower = I - 1;
    return binomialLog(upper, lower);
}
/**
 * Computes the cost of a single interval: the log-binomial of
 * "size+numClasses-1 over numClasses-1", plus log(size!), minus log(count!) for the
 * per-class counts of the instances covered by the interval.
 *
 * @param interval the interval to be considered; only its size is read
 * @param index rank of the first element of the interval in the global array of values
 * @param values the global array of values (rank to instance number)
 * @param I the current number of intervals (not used by the computation)
 * @return the cost of the interval
 */
public double intervalCost(ArrayList<Double> interval,int index,int values[],int I){
    final int size = interval.size();
    double total = binomialLog(size + numClasses - 1, numClasses - 1);

    // class histogram of the instances ranked index .. index+size-1
    final int[] perClass = new int[numClasses];
    for (int offset = 0; offset < size; offset++) {
        perClass[classOfInstances[values[index + offset]]]++;
    }

    total += factorialLog(size);
    for (int count : perClass) {
        total -= factorialLog(count);
    }
    return total;
}
/**
 * Performs an exhaustive bottom-up merge: starting with one interval per distinct
 * value, adjacent intervals are merged one pair at a time until a single interval
 * remains, and the configuration with the lowest MODL value seen along the way
 * (including the initial one) is returned.
 * <p>
 * Each candidate merge is a {@code DeltaValue} linked to its neighbours; its
 * {@code index} holds the rank of the first element of its left interval in
 * {@code values}, which is what {@code mergeCostVariation} needs.
 * Ranks 0..end are processed; {@code begin} is not used.
 *
 * @param attribute the attribute of the data set we are discretizing
 * @param values mapping between sorted rank and instance number
 * @param begin first position of values (unused)
 * @param end last position of values to be considered
 * @return the best discretization configuration, as a Vector of {@code ArrayList<Double>}
 */
protected Vector exhaustiveMerge(int attribute,int []values,int begin,int end) {
    // initial discretization: one interval per run of equal values
    Vector cp = new Vector();
    ArrayList<Double> run = new ArrayList<Double>(1);
    run.add(realValues[attribute][values[0]]);
    for (int i = 1; i <= end; i++) {
        double value = realValues[attribute][values[i]];
        if (value != run.get(run.size() - 1).doubleValue()) {
            cp.add(run);
            run = new ArrayList<Double>(1);
        }
        run.add(value);
    }
    cp.add(run);

    double bestMODL = modl(cp, values);
    Vector bestcp = snapshotIntervals(cp);

    // one candidate merge per pair of adjacent intervals.
    // PriorityQueue rejects a capacity below 1, so clamp it for tiny inputs.
    // NOTE(review): relies on DeltaValue's natural ordering to poll the best merge first.
    PriorityQueue<DeltaValue> deltas = new PriorityQueue<DeltaValue>(Math.max(1, cp.size()));
    DeltaValue previous = null;
    int offset = 0; // rank of the first element of the current left interval
    for (int i = 0; i < cp.size() - 1; i++) {
        DeltaValue candidate = new DeltaValue();
        candidate.leftInterval = (ArrayList<Double>) cp.get(i);
        candidate.rightInterval = (ArrayList<Double>) cp.get(i + 1);
        candidate.index = offset;
        if (previous != null) { // make list pointers
            candidate.prev = previous;
            previous.next = candidate;
        }
        candidate.delta = mergeCostVariation(candidate.leftInterval, offset,
                candidate.rightInterval, offset + candidate.leftInterval.size(), cp.size(), values);
        deltas.add(candidate);
        previous = candidate;
        offset += candidate.leftInterval.size();
    }

    // every iteration removes exactly one interval and one queued candidate,
    // so the loop ends when a single interval is left
    DeltaValue variation = deltas.poll();
    while (cp.size() > 1 && variation != null) {
        ArrayList<Double> intA = variation.leftInterval;
        ArrayList<Double> intB = variation.rightInterval;
        // join the intervals' values
        intA.addAll(intB);
        // intervals are never empty, so once cleared intB is the only empty list in cp
        // and the equals-based remove() below cannot pick a different interval
        intB.clear();
        cp.remove(intB);

        // the candidate "B with its successor" disappears; this candidate takes its place
        DeltaValue nextInt = variation.next;
        if (nextInt != null) {
            variation.next = nextInt.next;
            variation.rightInterval = nextInt.rightInterval;
            if (variation.next != null) {
                variation.next.prev = variation;
            }
            deltas.remove(nextInt);
        }

        if (!variation.rightInterval.isEmpty()) {
            // cost of merging the new interval with its right neighbour
            variation.delta = mergeCostVariation(variation.leftInterval, variation.index,
                    variation.rightInterval, variation.index + variation.leftInterval.size(),
                    cp.size(), values);
            deltas.add(variation); // it was removed by poll()
        } else if (variation.prev != null) {
            // the merged interval is now the last one: unlink the exhausted candidate
            variation.prev.next = null;
        }

        if (variation.prev != null) {
            // cost of merging the left neighbour with the new interval;
            // take it out of the queue before changing its key, then re-insert
            DeltaValue left = variation.prev;
            deltas.remove(left);
            left.delta = mergeCostVariation(left.leftInterval, left.index,
                    variation.leftInterval, variation.index, cp.size(), values);
            deltas.add(left);
        }

        double actualMODL = modl(cp, values);
        if (actualMODL < bestMODL) {
            bestMODL = actualMODL;
            bestcp = snapshotIntervals(cp);
        }
        variation = deltas.poll();
    }
    // return the best set of intervals
    return bestcp;
}

/** Copies a set of intervals so that later in-place merges do not alter the copy. */
private static Vector snapshotIntervals(Vector intervals) {
    Vector copy = new Vector(intervals.size());
    for (int i = 0; i < intervals.size(); i++) {
        copy.add(new ArrayList<Double>((ArrayList<Double>) intervals.get(i)));
    }
    return copy;
}
/**
 * Implements the greedy version of the MODL discretizer: a bottom-up process that
 * starts with one interval per distinct value and keeps applying the candidate merge
 * polled from the queue while its total cost variation (partition term plus interval
 * term) is negative.
 * <p>
 * Each candidate merge is a {@code DeltaValue} linked to its neighbours; its
 * {@code index} holds the rank of the first element of its left interval in
 * {@code values}. The partition term uses the number of values being discretized
 * (ranks 0..end), not {@code realValues.length}, whose first dimension is indexed by
 * attribute. {@code begin} is not used.
 *
 * @param attribute the attribute which is being discretized
 * @param values the global array of values (sorted rank to instance number)
 * @param begin the initial position of the values (unused)
 * @param end the final position of the values to be discretized
 * @return the cut points built by {@code createCP} from the final intervals
 */
protected Vector greedyMODL(int attribute,int []values,int begin,int end) {
    // initial discretization: one interval per run of equal values
    Vector cp = new Vector();
    ArrayList<Double> run = new ArrayList<Double>(1);
    run.add(realValues[attribute][values[0]]);
    for (int i = 1; i <= end; i++) {
        double value = realValues[attribute][values[i]];
        if (value != run.get(run.size() - 1).doubleValue()) {
            cp.add(run);
            run = new ArrayList<Double>(1);
        }
        run.add(value);
    }
    cp.add(run);

    // number of values spread over the intervals (ranks 0..end)
    final int n = end + 1;

    // one candidate merge per pair of adjacent intervals.
    // PriorityQueue rejects a capacity below 1, so clamp it for tiny inputs.
    // NOTE(review): relies on DeltaValue's natural ordering to poll the best merge first.
    PriorityQueue<DeltaValue> deltas = new PriorityQueue<DeltaValue>(Math.max(1, cp.size()));
    DeltaValue previous = null;
    int offset = 0; // rank of the first element of the current left interval
    for (int i = 0; i < cp.size() - 1; i++) {
        DeltaValue candidate = new DeltaValue();
        candidate.leftInterval = (ArrayList<Double>) cp.get(i);
        candidate.rightInterval = (ArrayList<Double>) cp.get(i + 1);
        candidate.index = offset;
        if (previous != null) { // make list pointers
            candidate.prev = previous;
            previous.next = candidate;
        }
        candidate.delta = mergeCostVariation(candidate.leftInterval, offset,
                candidate.rightInterval, offset + candidate.leftInterval.size(), cp.size(), values);
        deltas.add(candidate);
        previous = candidate;
        offset += candidate.leftInterval.size();
    }

    DeltaValue variation = deltas.poll();
    while (cp.size() > 1 && variation != null) {
        // partitionCost(I-1,n) - partitionCost(I,n) reduces to log((I-1)/(n+I-1))
        double partitionCostVariation = Math.log((double) (cp.size() - 1) / (n + cp.size() - 1));
        double discCostVariation = partitionCostVariation + variation.delta;
        if (!(discCostVariation < 0)) {
            break; // the polled merge does not improve the criterion: stop
        }

        ArrayList<Double> intA = variation.leftInterval;
        ArrayList<Double> intB = variation.rightInterval;
        // join the intervals' values
        intA.addAll(intB);
        // intervals are never empty, so once cleared intB is the only empty list in cp
        // and the equals-based remove() below cannot pick a different interval
        intB.clear();
        cp.remove(intB);

        // the candidate "B with its successor" disappears; this candidate takes its place
        DeltaValue nextInt = variation.next;
        if (nextInt != null) {
            variation.next = nextInt.next;
            variation.rightInterval = nextInt.rightInterval;
            if (variation.next != null) {
                variation.next.prev = variation;
            }
            deltas.remove(nextInt);
        }

        if (!variation.rightInterval.isEmpty()) {
            // cost of merging the new interval with its right neighbour
            variation.delta = mergeCostVariation(variation.leftInterval, variation.index,
                    variation.rightInterval, variation.index + variation.leftInterval.size(),
                    cp.size(), values);
            deltas.add(variation); // it was removed by poll()
        } else if (variation.prev != null) {
            // the merged interval is now the last one: unlink the exhausted candidate
            variation.prev.next = null;
        }

        if (variation.prev != null) {
            // cost of merging the left neighbour with the new interval;
            // take it out of the queue before changing its key, then re-insert
            DeltaValue left = variation.prev;
            deltas.remove(left);
            left.delta = mergeCostVariation(left.leftInterval, left.index,
                    variation.leftInterval, variation.index, cp.size(), values);
            deltas.add(left);
        }

        variation = deltas.poll();
    }
    // compute the cutpoints from the intervals
    return createCP(cp);
}
/**
 * Searches for the discretization with the lowest MODL value by dynamic programming
 * over (prefix length, number of intervals). It is a VERY slow process (it stores a
 * table of (end+1) x (end+1) schemes and re-evaluates {@code modl} for every
 * candidate), so it is not recommended.
 * <p>
 * Only schemes made of non-empty intervals whose boundaries separate two different
 * values are considered, and the single-interval scheme competes with the others.
 * Ranks 0..end are processed; {@code begin} is not used.
 *
 * @param attribute the attribute to be discretized
 * @param values the global array of values (sorted rank to instance number)
 * @param begin the initial position of the array (unused)
 * @param end the final position in the array
 * @return the cut points built by {@code createCP} from the best scheme found
 */
protected Vector optimalMODL(int attribute,int []values,int begin,int end) {
    // disc[j][k]: best scheme splitting ranks 0..j into k+1 intervals, or null if none exists
    Vector[][] disc = new Vector[end + 1][end + 1];
    double bestMODL = Double.MAX_VALUE;
    int optimalIntervalNumber = -1;

    // for all possible numbers of cuts
    for (int k = 0; k <= end; k++) {
        double minMODL = Double.MAX_VALUE;
        // k+1 non-empty intervals need at least k+1 values, hence j starts at k;
        // after this loop minMODL refers to the full list (j == end)
        for (int j = k; j <= end; j++) {
            minMODL = Double.MAX_VALUE;
            if (k == 0) {
                // a single interval holding ranks 0..j
                ArrayList<Double> whole = new ArrayList<Double>(j + 1);
                for (int m = 0; m <= j; m++) {
                    whole.add(realValues[attribute][values[m]]);
                }
                Vector single = new Vector();
                single.add(whole);
                disc[j][0] = single;
                minMODL = modl(single, values);
            } else {
                // best of disc(S(0,i),k-1) U {S(i+1,j)} over every admissible boundary i
                for (int i = k - 1; i < j; i++) {
                    Vector prefix = disc[i][k - 1];
                    if (prefix == null) {
                        continue; // ranks 0..i cannot be split into k valid intervals
                    }
                    double leftValue = realValues[attribute][values[i]];
                    double rightValue = realValues[attribute][values[i + 1]];
                    if (leftValue == rightValue) {
                        continue; // a cut point must separate different values
                    }
                    Vector candidate = new Vector(prefix);
                    ArrayList<Double> last = new ArrayList<Double>(j - i);
                    for (int m = i + 1; m <= j; m++) {
                        last.add(realValues[attribute][values[m]]);
                    }
                    candidate.add(last);
                    // the scheme with the lowest MODL value is the optimum for k cuts
                    double actualMODL = modl(candidate, values);
                    if (actualMODL < minMODL) {
                        disc[j][k] = candidate;
                        minMODL = actualMODL;
                    }
                }
            }
        }
        // overall winner: the lowest MODL value across every number of intervals
        if (minMODL < bestMODL) {
            optimalIntervalNumber = k;
            bestMODL = minMODL;
        }
    }
    // compute the cutpoints of the winning scheme
    return createCP(disc[end][optimalIntervalNumber]);
}
/**
 * Computes the MODL value for a discretization scheme.
 * @param disc The discretization scheme to be evaluated. Comprises the intervals as ArrayList<Double> of values.
 * @param values Array in which position i there is the number of instance which explanatory (real) value has rank i after sorting
 * @return The MODL value corresponding to the discretization scheme
 */
public double modl(Vector disc,int values[]){
    return modlOfIntervals(disc, values);
}
/**
 * Computes the MODL value for a discretization scheme.
 * @param disc The discretization scheme to be evaluated. Comprises the intervals as ArrayList<Double> of values.
 * @param values Array in which position i there is the number of instance which explanatory (real) value has rank i after sorting
 * @return The MODL value corresponding to the discretization scheme
 */
public double modl(ArrayList<ArrayList<Double>> disc,int values[]){
    return modlOfIntervals(disc, values);
}
/**
 * Shared implementation of both {@code modl} overloads. Only the size of each
 * interval is read; the classes come from the instances ranked consecutively in
 * {@code values}, starting at rank 0.
 * The value is log(n) + log-binomial(n+I-1, I-1) + sum over intervals of
 * log-binomial(ni+J-1, J-1) + sum over intervals of log(ni! / prod_j nij!),
 * with n the total number of values, I the number of intervals and J the number of classes.
 */
private double modlOfIntervals(List<?> disc, int[] values) {
    final int I = disc.size();
    final int J = numClasses;
    final int[] ni = new int[I];
    final int[][] nij = new int[I][J];

    int n = 0;
    int rank = 0; // running position in values, across intervals
    for (int i = 0; i < I; i++) {
        ni[i] = ((List<?>) disc.get(i)).size();
        n += ni[i];
        for (int j = 0; j < ni[i]; j++, rank++) {
            nij[i][classOfInstances[values[rank]]]++;
        }
    }

    double modlValue = Math.log(n);
    modlValue += binomialLog(n + I - 1, I - 1);
    for (int i = 0; i < I; i++) {
        modlValue += binomialLog(ni[i] + J - 1, J - 1);
    }
    for (int i = 0; i < I; i++) {
        modlValue += factDivision(i, ni, nij);
    }
    return modlValue;
}
/**
 * Computes the interval-cost variation derived from merging two adjacent intervals
 * na and nb, i.e. cost(na+nb) - cost(na) - cost(nb) in terms of {@code intervalCost}:
 * log((|na|+|nb|+J-1)!) + log((J-1)!) - log((|na|+J-1)!) - log((|nb|+J-1)!)
 * - sum_j log-binomial(count of class j in na+nb, count of class j in na).
 * The partition-cost variation is not included; callers add it themselves.
 *
 * @param na Interval to the left to merge
 * @param indexna Index of the first element of na in the whole list of real values
 * @param nb Right interval to merge
 * @param indexnb Index of the first element of nb in the whole list of real values
 * @param I Current number of intervals (not used by the computation)
 * @param values Array in which position i there is the number of instance which explanatory (real) value has rank i after sorting
 * @return The cost variation produced by the merge operation
 */
public double mergeCostVariation(ArrayList<Double> na,int indexna, ArrayList<Double> nb,int indexnb,int I,int values[]){
    final int J = numClasses;
    final int sizeA = na.size();
    final int sizeB = nb.size();
    final int[] countA = new int[J];
    final int[] countAB = new int[J];

    // class histogram of the left interval (also contributes to the merged one)
    for (int k = 0; k < sizeA; k++) {
        int cls = classOfInstances[values[indexna + k]];
        countA[cls]++;
        countAB[cls]++;
    }
    // the right interval's instances start at indexnb, not at indexna
    for (int k = 0; k < sizeB; k++) {
        countAB[classOfInstances[values[indexnb + k]]]++;
    }

    double cost = factorialLog(sizeA + sizeB + J - 1);
    cost += factorialLog(J - 1);
    cost -= factorialLog(sizeA + J - 1);
    cost -= factorialLog(sizeB + J - 1);
    for (int j = 0; j < J; j++) {
        cost -= binomialLog(countAB[j], countA[j]);
    }
    return cost;
}
/**
 * Computes the logarithm of the division of factorials
 * (ni[i]! / (nij[i][0]! * nij[i][1]! *...* nij[i][J-1]!)).
 *
 * @param i The interval considered
 * @param ni Number of instances which belong to each interval
 * @param nij Number of instances of class j which belong to interval i
 * @return log(ni[i]!) minus the sum of log(nij[i][j]!) over every class j
 */
public double factDivision(int i,int ni[],int nij[][]){
    double logRatio = factorialLog(ni[i]);
    for (int classCount : nij[i]) {
        logRatio -= factorialLog(classCount);
    }
    return logRatio;
}
/**
 * Returns the natural logarithm of n!, computed as log(2) + log(3) + ... + log(n).
 * For any n below 2 (including negative values) the sum is empty and the result is 0.
 *
 * @param n argument
 * @return <code>log(n!)</code>
 */
public static double factorialLog(final int n) {
    double total = 0;
    int factor = 2;
    while (factor <= n) {
        total += Math.log((double) factor);
        factor++;
    }
    return total;
}
/**
 * Stirling-style approximation of log(n!) in O(1): n*log(n) - n + 1, reasonable if
 * n is big enough. For n below 2 the method returns 0, the same value
 * {@code factorialLog} yields for those arguments; evaluating the formula at n = 0
 * would give NaN, because it multiplies 0 by log(0).
 *
 * @param n Number to factorize
 * @return Stirling's approximation to log(n!)
 */
public static double stirling(int n){
    if (n <= 1) {
        return 0.0;
    }
    return (n * Math.log(n) - n + 1);
}
/**
 * Returns the natural logarithm of the binomial coefficient "m over n",
 * computed as log(m!) - log(n!) - log((m-n)!).
 *
 * @param m Upper argument
 * @param n Lower argument
 * @return log(m over n)
 */
public static double binomialLog(int m, int n){
    final double logUpper = factorialLog(m);
    final double logLower = factorialLog(n);
    final double logRest = factorialLog(m - n);
    return logUpper - logLower - logRest;
}
/**
 * Calculates the number of combinations of n elements taken m at a time, as the
 * running product of (n-m+i)/i for i = 1..m. Note the argument order: the first
 * parameter is the size of the selection, the second one the size of the set.
 * When m is below 1 the product is empty and the result is 1.
 *
 * @param m first integer (number of elements selected)
 * @param n second integer (number of elements available)
 * @return the number of combinations, as a double
 */
public static double combinatoria (int m, int n) {
    double product = 1;
    int term = 1;
    while (term <= m) {
        final double numerator = (double) (n - m + term);
        final double denominator = (double) term;
        product *= numerator / denominator;
        term++;
    }
    return product;
}
/**
 * Constructs the cut points from a set of intervals. Each cut point is the midpoint
 * between the last value of an interval and the first value of the following one,
 * so k intervals produce k-1 cut points (none for zero or one interval).
 *
 * @param intervals Vector which contains the intervals in ArrayList<Double> format;
 *        every interval must be non-empty
 * @return A Vector with the cut points as Double objects, in interval order
 */
public Vector createCP(Vector intervals){
    Vector<Double> cutPoints = new Vector<Double>();
    for (int i = 0; i < intervals.size() - 1; i++) {
        List<Double> current = (List<Double>) intervals.get(i);
        List<Double> following = (List<Double>) intervals.get(i + 1);
        double lastOfCurrent = current.get(current.size() - 1);
        double firstOfFollowing = following.get(0);
        cutPoints.add(Double.valueOf((lastOfCurrent + firstOfFollowing) / 2.0));
    }
    return cutPoints;
}
}