/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
    L. Sanchez (luciano@uniovi.es)
    J. Alcala-Fdez (jalcala@decsai.ugr.es)
    S. Garcia (sglopez@ujaen.es)
    A. Fernandez (alberto.fernandez@ujaen.es)
    J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program.  If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Discretizers.UCPD;

import keel.Algorithms.Genetic_Rule_Learning.Globals.*;
import keel.Algorithms.Discretizers.Basic.*;
import org.core.Randomize;
import java.util.Vector;
import keel.Dataset.*;


/**
 * <p>
 * This class implements the UCPD (Unsupervised Cutpoint Discretization)
 * algorithm: continuous attributes are normalized, projected with PCA,
 * discretized in the eigenspace, and the resulting cutpoints are mapped
 * back to the original attributes (either by KNN or by projection).
 * </p>
 *
 * @author Written by Jose A. Saez (University of Granada), 21/12/2009
 * @version 1.0
 * @since JDK1.6
 */
public class UCPD extends Discretizer {

	// tags identifying which side of a cutpoint getInstancesInto() selects
	static final int LEFT = 0;
	static final int RIGHT = 1;

	// instance variables
	private int numInstances;				// number of instances
	private int numAttributes;				// number of attributes
	private Instance[] instances;			// instances of the dataset
	private double[][] ContinuousValues;	// values of the continuous attributes (one column per attribute)
	private int numC;						// number of continuous attributes
	private int[][] DiscreteValues;			// values of the discrete attributes (one column per attribute)
	private int numD;						// number of discrete attributes
	private int[] numValuesD;				// number of different values of each discrete attribute
	private double[] MEAN;					// normalized mean (in [0,1]) of each continuous attribute
	private Vector[] cutpoints;				// cutpoints of each continuous attribute
	private int[] AttPositions;				// position of each original attribute inside
											// ContinuousValues / DiscreteValues

//******************************************************************************************************

	/**
	 * <p>
	 * It returns a vector with the discretized values
	 * </p>
	 * @param attribute index of the attribute to discretize
	 * @param values not used
	 * @param begin not used
	 * @param end not used
	 * @return vector with the discretized values (cutpoints of the attribute)
	 */
	protected Vector discretizeAttribute(int attribute, int[] values, int begin, int end) {
		return cutpoints[AttPositions[attribute]];
	}

//******************************************************************************************************

	/**
	 * <p>
	 * Constructor of the class. Splits the dataset into a continuous-value
	 * matrix and (optionally) a discrete-value matrix.
	 * </p>
	 * @param is set of instances
	 */
	public UCPD(InstanceSet is) {

		// initialize parameters
		Randomize.setSeed(Parameters.seed);
		int i, j;	// loop indexes

		instances = is.getInstances();				// set of instances
		numInstances = is.getNumInstances();		// number of instances
		numAttributes = Parameters.numAttributes;	// number of attributes

		AttPositions = new int[numAttributes];
		DiscreteValues = null;

		// compute the number of continuous and discrete attributes, recording
		// for each original attribute its column inside the per-type matrices
		numD = numC = 0;
		for (j = 0; j < numAttributes; ++j) {
			Attribute at = Attributes.getAttribute(j);
			if (at.getType() == Attribute.REAL || at.getType() == Attribute.INTEGER)
				AttPositions[j] = numC++;
			else {
				if (at.getType() == Attribute.NOMINAL)
					AttPositions[j] = numD++;
			}
		}

		// initialize matrix of continuous attributes
		ContinuousValues = new double[numInstances][numC];

		// if there are discrete attributes, also initialize matrix of discrete attributes
		if (numD > 0 && Parameters.useDiscrete) {

			DiscreteValues = new int[numInstances][numD];

			// fill the set of discrete values
			int cont;
			for (i = 0; i < numInstances; ++i) {
				cont = 0;
				for (j = 0; j < numAttributes; ++j) {
					Attribute at = Attributes.getAttribute(j);
					if (at.getType() == Attribute.NOMINAL)
						DiscreteValues[i][cont++] = instances[i].getInputNominalValuesInt(j);
				}
			}

			// create the array with the number of different values of each attribute
			numValuesD = new int[numD];
			cont = 0;
			for (i = 0; i < numAttributes; ++i) {
				Attribute at = Attributes.getAttribute(i);
				if (at.getType() == Attribute.NOMINAL)
					numValuesD[cont++] = at.getNumNominalValues();
			}
		}
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It computes the cutpoints for each continuous variable
	 * </p>
	 */
	public void discretizeAllAttributes() {

		int i, j, att;	// loop indexes

		// 1) normalize each continuous attribute in [0,1] and center on mean
		normalizeAndCenter();

		// 2) do PCA on all attributes in ContinuousValues
		PCA pca = new PCA(ContinuousValues);
		pca.ComputeParameters();

		double[][] eigenvector = pca.getEigenvectors(0.9);			// most significant eigenvectors
		double[] eigenvalues = pca.getEigenvalues();				// their eigenvalues
		int numDimensions = pca.getNumDimensions();					// number of retained dimensions
		double[][] FinalData = pca.DerivingNewData(eigenvector);	// data projected onto the eigenspace
		double[][] COVAR = pca.getCovarianceMatrix();				// the covariance matrix

		// 3) compute the number of cutpoints in each eigendimension,
		// proportionally to the variance the dimension explains
		double total = 0;
		for (i = 0; i < numDimensions; ++i)
			total += eigenvalues[i];

		int[] numCutpoints = new int[numDimensions];
		for (i = 0; i < numDimensions; ++i)
			numCutpoints[i] = (int) ((eigenvalues[i] / total) * (Parameters.maxIntervals - Parameters.minIntervals)
					+ Parameters.minIntervals - 1);

		// 4) compute the cutpoints on each eigendimension
		double[][] cutpointsEdim = new double[numDimensions][];

		// dataset with categorical attributes
		if (numD > 0 && Parameters.useDiscrete) {

			// compute association patterns on all categorical attributes
			Vector<Itemset> freqits = FrequentItemsets.getFrequentItemsets(DiscreteValues, numValuesD);

			// determine the initial cutpoints basing on uniform frequency
			for (i = 0; i < numDimensions; ++i)
				cutpointsEdim[i] = uniformFrequencyCutpoints(numCutpoints[i], FinalData, i);

			int[][] selectedCP = new int[numDimensions][];
			int[] inst1 = null;
			int[] inst2 = null;
			int numi1 = 0, numi2 = 0;
			boolean finish, conserve;
			Object[] resObj;

			// calculate frequent itemsets of each interval; merge adjacent
			// intervals whose frequent itemsets are similar
			for (int dim = 0; dim < numDimensions; ++dim) {

				// if the number of cutpoints exceeds the minimum, it can remove some of them
				if (numCutpoints[dim] > (Parameters.minIntervals - 1)) {

					finish = conserve = false;
					selectedCP[dim] = new int[numCutpoints[dim]];
					Vector<Itemset> fiInter1 = new Vector<Itemset>();
					Vector<Itemset> fiInter2 = new Vector<Itemset>();

					for (int sp = 0; sp < numCutpoints[dim]; ++sp)
						selectedCP[dim][sp] = 1;

					int numsp = numCutpoints[dim];
					finish = false;

					for (int sp = 0; sp < numCutpoints[dim] && !finish; ++sp) {

						if (!conserve) {
							resObj = getInstancesInto(FinalData, dim, selectedCP[dim], cutpointsEdim[dim],
									numCutpoints[dim], sp, LEFT);
							inst1 = (int[]) resObj[0];
							numi1 = (Integer) resObj[1];
							fiInter1 = frequentItemsetForInterval(freqits, inst1, numi1);
						} else {
							// the previous right interval becomes the new left interval
							inst1 = new int[numi2];
							System.arraycopy(inst2, 0, inst1, 0, numi2);
							fiInter1 = fiInter2;
						}

						resObj = getInstancesInto(FinalData, dim, selectedCP[dim], cutpointsEdim[dim],
								numCutpoints[dim], sp, RIGHT);
						inst2 = (int[]) resObj[0];
						numi2 = (Integer) resObj[1];
						// BUG FIX: was inst2.length, which is always numInstances (the buffer
						// size, padded with zeros) instead of the number of valid entries
						fiInter2 = frequentItemsetForInterval(freqits, inst2, numi2);

						if (areSimilar(fiInter1, fiInter2)) {
							conserve = false;
							if (numsp > (Parameters.minIntervals - 1)) {
								numsp--;
								selectedCP[dim][sp] = 0;	// deselect this cutpoint
							} else {
								finish = true;	// minimum number of intervals reached
							}
						} else
							conserve = true;
					}

					// compact the surviving cutpoints into a dense array.
					// BUG FIX: the original only scanned the first numsp positions and wrote
					// each value at its old index, losing selected cutpoints beyond numsp
					// and leaving zero-filled gaps where cutpoints had been deselected
					double[] cutpAUX = new double[numsp];
					int kept = 0;
					for (int sp = 0; sp < numCutpoints[dim]; ++sp)
						if (selectedCP[dim][sp] == 1)
							cutpAUX[kept++] = cutpointsEdim[dim][sp];

					cutpointsEdim[dim] = cutpAUX;
					numCutpoints[dim] = numsp;
				}
			}
		}

		// there are continuous attributes only
		else {
			for (i = 0; i < numDimensions; ++i)
				cutpointsEdim[i] = KMeans(numCutpoints[i] + 1, FinalData, i);
		}

		// 5) determine which eigenvector has more influence over each original dimension
		int pos;
		double maximum, aux;
		int[] whichEigenvector = new int[numC];

		for (att = 0; att < numC; ++att) {
			pos = 0;
			maximum = (eigenvector[att][0] * Math.sqrt(eigenvalues[0])) / Math.sqrt(COVAR[att][att]);
			for (i = 1; i < numDimensions; ++i) {
				aux = (eigenvector[att][i] * Math.sqrt(eigenvalues[i])) / Math.sqrt(COVAR[att][att]);
				if (aux > maximum) {
					pos = i;
					maximum = aux;
				}
			}
			whichEigenvector[att] = pos;
		}

		// 6) get the final cutpoints, mapping eigenspace cutpoints back to the
		// original attributes
		cutpoints = new Vector[numC];

		if (Parameters.mapType.equals("knn")) {

			att = 0;
			for (i = 0; i < numAttributes; ++i) {
				Attribute at = Attributes.getAttribute(i);
				if (at.getType() == Attribute.REAL || at.getType() == Attribute.INTEGER) {
					cutpoints[att] = new Vector();
					for (j = 0; j < numCutpoints[whichEigenvector[att]]; ++j) {
						double cutValue = KNN(i, cutpointsEdim[whichEigenvector[att]][j], FinalData,
								whichEigenvector[att]);
						cutpoints[att].add(cutValue);
					}
					att++;
				}
			}

			// sort cutpoints and remove duplicated cutpoints
			for (i = 0; i < numC; ++i) {
				int numa = cutpoints[i].size();
				if (numa == 0)
					continue;	// no cutpoints for this attribute: nothing to sort

				double[] cps = new double[numa];
				for (j = 0; j < numa; ++j)
					cps[j] = (Double) cutpoints[i].get(j);

				int[] order = Quicksort.sort(cps, numa, Quicksort.LOWEST_FIRST);
				cutpoints[i] = new Vector();
				double valuecp = cps[order[0]];
				cutpoints[i].add(valuecp);
				for (j = 1; j < numa; ++j)
					if (valuecp != cps[order[j]]) {
						valuecp = cps[order[j]];
						cutpoints[i].add(valuecp);
					}
			}
		}

		else if (Parameters.mapType.equals("projection")) {

			// BUG FIX: the original called Attributes.getAttribute(att) with the
			// continuous-attribute position, which is wrong whenever nominal and
			// continuous attributes interleave; iterate over the original
			// attribute indexes as the "knn" branch does
			att = 0;
			for (i = 0; i < numAttributes; ++i) {
				Attribute at = Attributes.getAttribute(i);
				if (at.getType() == Attribute.REAL || at.getType() == Attribute.INTEGER) {
					double producto = eigenvector[att][whichEigenvector[att]];
					cutpoints[att] = new Vector();
					double min = at.getMinAttribute();
					double max = at.getMaxAttribute();

					for (j = 0; j < numCutpoints[whichEigenvector[att]]; ++j) {
						double cutValue = cutpointsEdim[whichEigenvector[att]][j] * producto;
						cutValue += MEAN[att];		// undo mean centering
						cutValue *= (max - min);	// undo [0,1] normalization
						cutValue += min;
						cutpoints[att].add(cutValue);
					}
					att++;
				}
			}
		}
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It normalizes continuous attributes to [0,1] and centers them on their mean
	 * </p>
	 */
	public void normalizeAndCenter() {

		int i, j, cont;
		double min, max;

		// normalize real attributes to [0,1]
		for (i = 0; i < numInstances; ++i) {
			cont = 0;
			for (j = 0; j < numAttributes; ++j) {
				Attribute at = Attributes.getAttribute(j);
				if (at.getType() == Attribute.REAL || at.getType() == Attribute.INTEGER) {
					min = at.getMinAttribute();
					max = at.getMaxAttribute();
					ContinuousValues[i][cont++] = (instances[i].getInputRealValues(j) - min) / (max - min);
				}
			}
		}

		// compute the mean of each attribute
		MEAN = new double[numC];
		for (i = 0; i < numC; ++i)
			MEAN[i] = 0;

		for (i = 0; i < numInstances; ++i)
			for (j = 0; j < numC; ++j)
				MEAN[j] += ContinuousValues[i][j];

		for (j = 0; j < numC; ++j)
			MEAN[j] /= numInstances;

		// mean centralization of the data
		for (i = 0; i < numInstances; ++i)
			for (j = 0; j < numC; ++j)
				ContinuousValues[i][j] -= MEAN[j];
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It calculates the cutpoints using the K-Means algorithm
	 * </p>
	 * @param k number of intervals
	 * @param FinalData the mapped data with eigendimension
	 * @param dim the eigendimension to discretize
	 * @return the cutpoints (midpoints between consecutive centroids, sorted)
	 */
	public double[] KMeans(int k, double[][] FinalData, int dim) {

		int i, j;
		double[] cutpoints = new double[k - 1];		// cutpoints selected
		double[] centroids = new double[k];			// current centroids
		double[] groupDistance = new double[k];		// distance to each group
		int[] group = new int[numInstances];		// group of each instance

		for (i = 0; i < numInstances; ++i)
			group[i] = -1;

		// take k different random centroids
		Sampling sampl = new Sampling(numInstances);
		for (i = 0; i < k; ++i) {
			int rand = sampl.getSample();
			centroids[i] = FinalData[rand][dim];
		}

		boolean changes;
		do {
			changes = false;

			// for each object
			for (i = 0; i < numInstances; ++i) {

				// compute its distance to each centroid
				for (j = 0; j < k; ++j)
					groupDistance[j] = Math.abs(centroids[j] - FinalData[i][dim]);

				// assign it to the closest cluster
				double minor = groupDistance[0];
				int pos = 0;
				for (j = 1; j < k; ++j) {
					if (groupDistance[j] < minor) {
						pos = j;
						minor = groupDistance[j];
					}
				}

				if (group[i] != pos) {
					changes = true;
					group[i] = pos;
				}
			}

			// compute the new mean of each cluster
			double[] sum = new double[k];	// Java zero-initializes arrays
			int[] ni = new int[k];
			for (i = 0; i < numInstances; ++i) {
				sum[group[i]] += FinalData[i][dim];
				ni[group[i]]++;
			}
			for (i = 0; i < k; ++i)
				if (ni[i] > 0)	// BUG FIX: keep the old centroid for an empty cluster
					centroids[i] = sum[i] / ni[i];	// (the original produced NaN via 0/0)

		} while (changes);

		// compute the final cutpoints as midpoints between consecutive centroids
		double[] cutp = new double[k - 1];
		for (i = 0; i < k - 1; ++i)
			cutp[i] = (centroids[i] + centroids[i + 1]) / 2;

		// sort the cutpoints
		int[] positions = Quicksort.sort(cutp, k - 1, Quicksort.LOWEST_FIRST);
		for (i = 0; i < k - 1; ++i)
			cutpoints[i] = cutp[positions[i]];

		return cutpoints;
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It computes the cutpoint using the KNN algorithm: the cutpoint is the mean
	 * of the attribute values of the nearest neighbors of the given eigenspace value
	 * </p>
	 * @param att original index of the attribute to compute the cutpoint
	 * @param value value to find its nearest neighbors
	 * @param FinalData data matrix of PCA
	 * @param dim dimension to find the nearest neighbors
	 * @return the value of the cutpoint
	 */
	public double KNN(int att, double value, double[][] FinalData, int dim) {

		// compute the distance of each instance to value
		double[] distance = new double[numInstances];
		for (int i = 0; i < numInstances; ++i)
			distance[i] = Math.abs(FinalData[i][dim] - value);

		int[] pos = Quicksort.sort(distance, numInstances, Quicksort.LOWEST_FIRST);

		// average the original attribute values of the closest neighbors
		double total = 0;
		for (int i = 0; i < Parameters.Neighborhood; ++i)
			total += instances[pos[i]].getInputRealValues(att);

		total /= Parameters.Neighborhood;
		return total;
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It calculates the cutpoints with uniform frequency
	 * </p>
	 * @param k number of cutpoints to compute
	 * @param FinalData matrix of data of PCA
	 * @param att index of the dimension
	 * @return the cutpoints
	 */
	public double[] uniformFrequencyCutpoints(int k, double[][] FinalData, int att) {

		double[] cutpoints = new double[k];
		int instInter = (int) ((double) numInstances / (double) (k + 1));	// instances per interval

		double[] DataArray = new double[numInstances];
		for (int i = 0; i < numInstances; ++i)
			DataArray[i] = FinalData[i][att];

		int[] pos = Quicksort.sort(DataArray, numInstances, Quicksort.LOWEST_FIRST);

		// place a cutpoint every instInter instances (in sorted order)
		int numcp = 0, cont = 0;
		for (int i = 0; i < numInstances && numcp < k; ++i) {
			cont++;
			if (cont == instInter) {
				cont = 0;
				cutpoints[numcp++] = FinalData[pos[i]][att];
			}
		}

		return cutpoints;
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It computes the frequent itemsets of the given instances
	 * </p>
	 * @param its candidate itemsets
	 * @param instances indexes of the instances of the interval
	 * @param numi number of valid entries of instances
	 * @return the itemsets of its that are frequent within the given instances,
	 * with their support updated
	 */
	public Vector<Itemset> frequentItemsetForInterval(Vector<Itemset> its, int[] instances, int numi) {

		Vector<Itemset> res = new Vector<Itemset>();
		int[] votes = new int[its.size()];
		for (int i = 0; i < its.size(); ++i)
			votes[i] = 0;

		// compute the frequency of each itemset basing on given instances
		for (int i = 0; i < its.size(); ++i)
			for (int j = 0; j < numi; ++j)
				if (its.get(i).into(DiscreteValues[instances[j]]))
					votes[i]++;

		// see if they exceed the minimal support
		for (int i = 0; i < its.size(); ++i)
			if (votes[i] > Parameters.minSupport) {
				Itemset auxi = its.get(i);
				auxi.setSupport((double) ((double) votes[i] / (double) numi));
				res.add(auxi);
			}

		return res;
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It checks if two sets of frequent itemsets are similar
	 * </p>
	 * @param A first set of frequent itemsets
	 * @param B second set of frequent itemsets
	 * @return true if A and B are similar, false otherwise
	 */
	public boolean areSimilar(Vector<Itemset> A, Vector<Itemset> B) {

		double total = 0;
		int numEqualElements = 0;

		// accumulate a similarity score over the itemsets present in both sets
		for (int i = 0; i < A.size(); ++i) {
			for (int j = 0; j < B.size(); ++j) {
				if (A.get(i).equalsTo(B.get(j))) {
					total += Math.max(0,
							1 - (Parameters.scalingFactor * Math.abs(A.get(i).getSupport() - B.get(j).getSupport())));
					numEqualElements++;
				}
			}
		}

		// number of elements of A U B = |A| + |B| - |A & B|
		int numTotal = A.size() + B.size() - numEqualElements;
		double valorSim = total / (double) numTotal;

		return valorSim > Parameters.mergedThreshold;
	}

//******************************************************************************************************

	/**
	 * <p>
	 * It computes the indexes of instances that fall into the interval selected
	 * </p>
	 * @param FinalData matrix of data of PCA
	 * @param dim index of the dimension
	 * @param selected flags (1/0) marking which cutpoints are still selected
	 * @param cutp array of cutpoints
	 * @param ncp number of cutpoints
	 * @param sp index of the cutpoint to form the interval
	 * @param opt equals to LEFT to indicate the left interval of sp and equals to
	 * RIGHT to indicate the right interval
	 * @return a pair {int[] indexes of the selected instances, Integer number of them}
	 */
	public Object[] getInstancesInto(double[][] FinalData, int dim, int[] selected, double[] cutp,
			int ncp, int sp, int opt) {

		int[] inst = new int[numInstances];
		int numInst = 0;
		double bottomBound = 0, highBound = 0;
		boolean finish = false;

		// compute the bottom and the high bounds
		if (opt == LEFT) {
			// the left interval ends at cutpoint sp and starts at the closest
			// still-selected cutpoint before it (or -infinity if none)
			highBound = cutp[sp];
			bottomBound = (-1) * Double.MAX_VALUE;
			for (int i = sp - 1; i >= 0 && !finish; --i)
				if (selected[i] == 1) {
					bottomBound = cutp[i];	// BUG FIX: was 'i' (the index, not the cutpoint value)
					finish = true;
				}
		}

		if (opt == RIGHT) {
			// the right interval starts at cutpoint sp and ends at the closest
			// still-selected cutpoint after it (or +infinity if none)
			bottomBound = cutp[sp];
			highBound = Double.MAX_VALUE;
			for (int i = sp + 1; i < ncp && !finish; ++i)
				if (selected[i] == 1) {
					highBound = cutp[i];	// BUG FIX: was 'i' (the index, not the cutpoint value)
					finish = true;
				}
		}

		// compute the indexes of the instances inside [bottomBound, highBound)
		for (int p = 0; p < numInstances; ++p)
			if (FinalData[p][dim] >= bottomBound && FinalData[p][dim] < highBound)
				inst[numInst++] = p;

		Object[] solution = new Object[2];
		solution[0] = inst;
		solution[1] = numInst;

		return solution;
	}
}