/***********************************************************************
This file is part of KEEL-software, the Data Mining tool for regression,
classification, clustering, pattern mining and so on.
Copyright (C) 2004-2010
F. Herrera (herrera@decsai.ugr.es)
L. Sánchez (luciano@uniovi.es)
J. Alcalá-Fdez (jalcala@decsai.ugr.es)
S. García (sglopez@ujaen.es)
A. Fernández (alberto.fernandez@ujaen.es)
J. Luengo (julianlm@decsai.ugr.es)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/
**********************************************************************/
package keel.Algorithms.Lazy_Learning.LazyDT;
import java.util.Arrays;
import keel.Algorithms.Lazy_Learning.LazyAlgorithm;
/**
*
* File: LazyDT.java
*
* The LazyDT algorithm doesn't build a decision tree model in a training phase and
* uses the model when we start classifying. In spite of that behaviour, it precomputes
* some of the operations that should be done, and only with some information advances
* to the classification step.
*
* @author Written by Victoria Lopez Morales (University of Granada) 24/08/2009
* @version 0.1
* @since JDK1.5
*/
public class LazyDT extends LazyAlgorithm {
//Parameters
//Constants
/**
* Tolerance factor applied to the lowest entropy found among the attributes: every attribute
* whose entropy is at most RANGE times that minimum is explored as a candidate path
*/
private final static double RANGE = 1.01;
/**
* Minimum number of training instances that a path must still hold to keep splitting; with
* fewer instances the exploration stops and the majority class is returned
*/
private final static int MIN_DATA = 5;
//Additional structures
/**
* Largest number of nominal values found among the input attributes (set by precompute)
*/
int maxNumValues;
/**
* Number of values that each attribute can have (all attributes must be categorical)
*/
int numValues[];
/**
* Denormalized copy of the training data: every value of trainData multiplied by
* (numValues[attribute]-1), filled in by precompute
*/
double originalData[][];
/**
* Builds a LazyDT classifier from a configuration script. The data files referenced by the
* script are loaded first; afterwards the per-attribute structure used by the algorithm is
* allocated and the timer is started.
*
* @param script The configuration script which contains the parameters of the algorithm
*/
public LazyDT (String script) {
    // Load the data sets; inputAtt is read right below, so this must come first
    readDataFiles(script);

    // Identify the algorithm
    this.name = "LazyDT";

    // One slot per input attribute; the actual sizes are filled in by precompute()
    this.numValues = new int[inputAtt];

    // Everything above is set-up work: time is measured from this point on
    setInitialTime();
} //end-method
/**
* Reads configuration script, to extract the parameter's values.
* This implementation is intentionally empty: the method body reads nothing from the script.
*
* @param script Name of the configuration script (not used by this implementation)
*
*/
protected void readParameters (String script) {
// Nothing to do: no parameter is extracted from the script here
}//end-method
/**
* Classifies a given example with the information stored using the LazyDT algorithm.
* The example is denormalized (each value multiplied by the number of values of its attribute
* minus one) and a path is then explored over the denormalized training data.
*
* @param example Attribute values of the instance to classify, in the same normalized form as
* the training data. The array is not modified by this method.
* @return the class assigned to the example; -1 when the exploration finds no candidate path
* supported by at least one training instance
*/
protected int evaluate (double example[]) {
    // denormalize() scales its argument in place. Working on a private copy keeps the
    // caller's array intact, so classifying the same array twice gives the same answer
    // instead of scaling the values a second time.
    final double[] denormalized = example.clone();
    denormalize(denormalized);

    // Find a path with the example data and keep only the predicted class
    return explore(originalData, trainOutput, denormalized, numValues).result();
}
/**
* Creates the path for an example following the LazyDT algorithm, looking at the attribute values
* and trying to find a class according to the values given.
*
* The exploration stops at the first of these criteria that holds (listed by precedence):
* 1. fewer than MIN_DATA instances remain: majority class of the remaining instances;
* 2. a single attribute remains: majority class among the instances sharing the example's value;
* 3. all the remaining instances have identical attribute values: majority class;
* 4. all the remaining instances belong to the same class: that class.
* Otherwise the entropy of every attribute is computed, every attribute whose entropy is at most
* RANGE times the minimum is followed recursively, and the answer with the largest support wins
* (on equal support the attribute with the lowest index is kept).
*
* @param data Original values of the attribute values
* @param output Original output class for all the instances
* @param example Data attribute values for the item we are trying to create a path
* @param attValues Maximum number of possible values for all the attributes
* @return a pair with the predicted class and the number of instances supporting it; the class
* is -1 (with support 0) when none of the candidate paths is supported by any instance
*/
private MyPair explore(double [][] data,int [] output,double example[], int [] attValues){
    // Stopping criteria, tested from highest to lowest precedence so that the first match can
    // be returned right away. Checking the size first also guarantees that data[0] and
    // output[0] exist in the tests below.
    if (data.length < MIN_DATA) {
        return majority(output);
    }
    if (data[0].length == 1) {
        return majority2(example[0], data, output);
    }
    if (equalData(data)) {
        return majority(output);
    }
    if (equalVector(output)) {
        return new MyPair(output[0], output.length);
    }

    // Entropy of every remaining attribute, remembering the lowest one
    final int numAtts = data[0].length;
    final double[] ent = new double[numAtts];
    double minEnt = Double.MAX_VALUE;
    for (int i = 0; i < numAtts; i++) {
        ent[i] = entrophy(data, output, i, attValues[i]);
        if (ent[i] < minEnt) {
            minEnt = ent[i];
        }
    }

    // Follow every attribute that is (almost) as good as the best one and keep the answer
    // backed by the largest number of instances
    final double threshold = RANGE * minEnt;
    int res = -1;
    int maxSupport = 0;
    for (int i = 0; i < numAtts; i++) {
        if (ent[i] <= threshold) {
            final MyPair partial = tryNewExplore(data, output, example, attValues, i);
            if (partial.instances() > maxSupport) {
                maxSupport = partial.instances();
                res = partial.result();
            }
        }
    }

    //get the best tree found
    return new MyPair(res, maxSupport);
}
/**
* Follows the path that splits on one chosen attribute: only the instances sharing the example's
* value for that attribute are kept, the attribute is dropped from the data, the example and the
* value counts, and the exploration continues on the reduced problem.
*
* @param data Original values of the attribute values
* @param output Original output class for all the instances
* @param example Data attribute values for the item we are trying to create a path
* @param attValues Maximum number of possible values for all the attributes
* @param selected Attribute chosen for the path to follow
* @return a pair with the predicted class; when no instance shares the example's value, the
* majority class of all the instances received
*/
private MyPair tryNewExplore(double [][] data,int [] output,double example[], int [] attValues,int selected){
    // How many instances agree with the example on the selected attribute?
    int kept = 0;
    for (int row = 0; row < data.length; row++) {
        if (example[selected] == data[row][selected]) {
            kept++;
        }
    }

    // Nobody agrees: there is no branch to follow, fall back to the overall majority
    if (kept == 0) {
        return majority(output);
    }

    final int width = data[0].length;
    final int tail = width - 1 - selected;   // number of columns to the right of the selected one

    // Reduced data set: matching rows only, selected column removed
    final double[][] subData = new double[kept][width - 1];
    final int[] subOutput = new int[kept];
    int target = 0;
    for (int row = 0; row < data.length; row++) {
        if (example[selected] != data[row][selected]) {
            continue;
        }
        System.arraycopy(data[row], 0, subData[target], 0, selected);
        System.arraycopy(data[row], selected + 1, subData[target], selected, tail);
        subOutput[target] = output[row];
        target++;
    }

    // The example and the per-attribute value counts lose the same column
    final double[] subExample = new double[width - 1];
    final int[] subAttValues = new int[width - 1];
    System.arraycopy(example, 0, subExample, 0, selected);
    System.arraycopy(example, selected + 1, subExample, selected, tail);
    System.arraycopy(attValues, 0, subAttValues, 0, selected);
    System.arraycopy(attValues, selected + 1, subAttValues, selected, tail);

    return explore(subData, subOutput, subExample, subAttValues);
}
/**
* Checks if a given array has the same value for all of its elements
*
* @param vector Array that is going to be checked
* @return true, if all the elements in the array are the same (an empty or single-element array
* trivially qualifies); false, otherwise
*/
private boolean equalVector(int vector[]){
    // The size test must come before reading vector[0], otherwise an empty array fails
    if (vector.length < 2) {
        return true;
    }
    final int value = vector[0];
    // Element 0 is the reference itself, so the comparison starts at 1
    for (int i = 1; i < vector.length; i++) {
        if (vector[i] != value) {
            return false;
        }
    }
    return true;
}
/**
* Finds the class that appears most often in an array of class labels
*
* @param vector Array of class labels (each one in the range [0, nClasses))
* @return a pair with the most frequent class and the number of times it appears; on a tie the
* class with the lowest index is returned
*/
private MyPair majority(int vector[]){
    // One counter per class; a freshly allocated int array already holds zeros
    final int[] votes = new int[nClasses];
    for (int label : vector) {
        votes[label]++;
    }

    // Strict comparison: the first class reaching the top count is the one kept
    int winner = -1;
    int winnerVotes = -1;
    for (int c = 0; c < votes.length; c++) {
        if (votes[c] > winnerVotes) {
            winnerVotes = votes[c];
            winner = c;
        }
    }
    return new MyPair(winner, winnerVotes);
}
/**
* Finds the most frequent class among the instances whose first attribute equals a given value
*
* @param value Value to find
* @param data Data matrix (only its first column is inspected)
* @param vector Class label of every instance in the data matrix
* @return a pair with the most frequent class among the matching instances and the number of
* matching instances of that class; on a tie the class with the lowest index is returned
*/
private MyPair majority2(double value,double data [][], int vector[]){
    // Mark the instances that carry the requested value
    final boolean[] matches = new boolean[data.length];
    for (int row = 0; row < data.length; row++) {
        matches[row] = (value == data[row][0]);
    }

    // Count the classes of the marked instances only
    final int[] votes = new int[nClasses];
    for (int row = 0; row < vector.length; row++) {
        if (matches[row]) {
            votes[vector[row]]++;
        }
    }

    // Strict comparison: the first class reaching the top count is the one kept
    int winner = -1;
    int winnerVotes = -1;
    for (int c = 0; c < votes.length; c++) {
        if (votes[c] > winnerVotes) {
            winnerVotes = votes[c];
            winner = c;
        }
    }
    return new MyPair(winner, winnerVotes);
}
/**
* Tells whether every row of a matrix is identical to the first one, comparing column by column
*
* @param data Matrix that is going to be checked
* @return true, if within each column all the elements are the same (always the case for a
* matrix with fewer than two rows); false, otherwise
*/
private boolean equalData(double data[][]){
    if (data.length >= 2) {
        final double[] reference = data[0];
        for (int col = 0; col < reference.length; col++) {
            final double expected = reference[col];
            for (int row = 1; row < data.length; row++) {
                if (data[row][col] != expected) {
                    return false;
                }
            }
        }
    }
    return true;
}
/**
* Calculates the entrophy for a possible split in an attribute.
* The value returned is -sum(p * log2(p)), where p is the fraction of instances (over all the
* instances received) that belong to one class and take one value of the attribute; the sum
* runs over every non-empty (class, value) combination. Classes are not weighted by their
* frequency.
*
* @param data Original values of the attribute values
* @param output Original output class for all the instances
* @param att Attribute that is going to be split
* @param valuesAtt Number of possible values of the attribute that is going to be split
* @return the entrophy for the data split for that attribute
*/
private double entrophy(double [][] data,int [] output, int att, int valuesAtt){
    // dataCount[c][v]: instances of class c taking value v (int arrays start zeroed)
    final int[][] dataCount = new int[nClasses][valuesAtt];
    for (int i = 0; i < data.length; i++) {
        // Round to the nearest code instead of truncating: the values are the product of a
        // normalized value and (number of values - 1), and floating-point error can leave
        // them just below the intended integer (e.g. (1.0/49)*49 == 0.9999999999999999),
        // which a plain cast would turn into the previous code.
        final int code = (int) Math.round(data[i][att]);
        dataCount[output[i]][code]++;
    }

    final double log2 = Math.log(2.0);
    final double total = (double) data.length;
    double value = 0.0;
    for (int i = 0; i < nClasses; i++) {
        double entr = 0.0;
        for (int j = 0; j < valuesAtt; j++) {
            // Empty cells contribute nothing (and log(0) must be avoided)
            if (dataCount[i][j] != 0) {
                final double fraction = (double) dataCount[i][j] / total;
                entr += fraction * (Math.log(fraction) / log2);
            }
        }
        value -= entr;
    }
    return value;
}
/**
* Prepares the structures needed before classifying: stores the number of nominal values of
* every input attribute (and the largest of them) and builds the denormalized copy of the
* training data, in which every value is multiplied by the number of values of its attribute
* minus one.
*/
public void precompute(){
    // Number of nominal values per attribute, tracking the largest one
    maxNumValues = 0;
    for (int att = 0; att < inputAtt; att++) {
        numValues[att] = train.getAttributeDefinitions().getInputAttribute(att).getNominalValuesList().size();
        maxNumValues = Math.max(maxNumValues, numValues[att]);
    }

    // Copy the training data row by row and denormalize each copied row
    final int columns = trainData[0].length;
    originalData = new double[trainData.length][columns];
    for (int row = 0; row < trainData.length; row++) {
        System.arraycopy(trainData[row], 0, originalData[row], 0, trainData[row].length);
        for (int col = 0; col < columns; col++) {
            originalData[row][col] = originalData[row][col] * (numValues[col] - 1);
        }
    }
}
/**
* Denormalizes an example in place: each value is multiplied by the number of values of its
* attribute minus one. As many positions are processed as columns the training data has.
*
* @param example Array with all the values that is going to be denormalized (it is modified)
*/
private void denormalize(double [] example){
    final int columns = trainData[0].length;
    int att = 0;
    while (att < columns) {
        example[att] = example[att] * (numValues[att] - 1);
        att++;
    }
}
/**
* Dumps a matrix to the standard output, one row per line with a blank after every value,
* followed by a line of asterisks. Meant as a debugging aid while developing the algorithm.
*
* @param m Matrix that is going to be printed
*/
private void printM(double[][] m){
    for (int row = 0; row < m.length; row++) {
        final StringBuilder line = new StringBuilder();
        // The width of the first row is used for every row
        for (int col = 0; col < m[0].length; col++) {
            line.append(m[row][col]).append(" ");
        }
        System.out.println(line.toString());
    }
    System.out.println("******");
}
/**
*
* Small nested class that is used in the LazyDT algorithm like a data structure that stores
* an output class and the number of instances that support that class
*
* @author Written by Victoria Lopez Morales (University of Granada) 26/08/2009
* @version 0.1
* @since JDK1.5
*/
private class MyPair{
/**
* Output class for the algorithm
*/
private int result;
/**
* Number of instances that support the class decided
*/
private int instances;
/**
* Creates a pair with empty values that we can identify
*/
public MyPair(){
result=-1;
instances=0;
}
/**
* Creates a pair with the output class assigned to it and the number of instances supporting it
*
* @param res The output class assigned to this pair
* @param ins The number of instances supporting the output class
*/
public MyPair(int res, int ins){
result=res;
instances=ins;
}
/**
* Gets the output class stored in this pair
*
* @return the output class stored in this pair
*/
public int result(){
return result;
}
/**
* Gets the number of instances that supports the output class stored
*
* @return the number of instances that supports the output class stored
*/
public int instances(){
return instances;
}
}
} //end-class