/***********************************************************************

	This file is part of KEEL-software, the Data Mining tool for regression,
	classification, clustering, pattern mining and so on.

	Copyright (C) 2004-2010

	F. Herrera (herrera@decsai.ugr.es)
	L. Sánchez (luciano@uniovi.es)
	J. Alcalá-Fdez (jalcala@decsai.ugr.es)
	S. García (sglopez@ujaen.es)
	A. Fernández (alberto.fernandez@ujaen.es)
	J. Luengo (julianlm@decsai.ugr.es)

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program.  If not, see http://www.gnu.org/licenses/

**********************************************************************/

package keel.Algorithms.Discretizers.Cluster_Analysis;

import java.util.*;

import keel.Algorithms.Discretizers.Basic.*;
import keel.Algorithms.Genetic_Rule_Learning.Globals.*;
import keel.Dataset.Attribute;
import keel.Dataset.Attributes;
import keel.Dataset.Instance;
import keel.Dataset.InstanceSet;

/**
 * <p>
 * This class implements the Cluster Analysis discretizer.
 * </p>
 *
 * <p>
 * The numeric input attributes are normalized, the examples are grouped by an
 * agglomerative clustering that stops merging a cluster as soon as the merge
 * would lower the consistency level, and the per-attribute ranges of the
 * resulting clusters are turned into candidate cut points.
 * </p>
 *
 * <p>
 * @author Written by Salvador García (University of Jaén) 17/03/2011
 * @version 1.0
 * @since JDK1.5
 * </p>
 */
public class Cluster_Analysis extends Discretizer {

    /** Examples of the data set handed to the last {@link #buildCutPoints} call. */
    Instance []instances;

    /**
     * Builder
     */
    public Cluster_Analysis() {
    }

    /**
     * It computes the cutpoints of the given dataset.
     *
     * <p>
     * Steps: (1) consistency level of the raw data, (2) detection and
     * normalization of the numeric inputs, (3) squared Euclidean distance
     * matrix, (4) consistency-bounded cluster fusion, (5) projection of every
     * cluster on every numeric input and (6) construction of the cut points
     * through {@link #discretizeAttribute}.
     * </p>
     *
     * @param is the examples of the dataset
     */
    public void buildCutPoints(InstanceSet is) {
        instances = is.getInstances();
        final int n = instances.length;

        classOfInstances = new int[n];
        for (int e = 0; e < n; e++) {
            classOfInstances[e] = instances[e].getOutputNominalValuesInt(0);
        }

        cutPoints = new double[Parameters.numAttributes][];
        realAttributes = new boolean[Parameters.numAttributes];
        realValues = new double[Parameters.numAttributes][];

        /*Consistency level computation Lc*/
        double lc = datasetConsistency();

        /*Identification of the set of numeric attributes.
          The input-relative accessor is used because every value below is read
          with getInputRealValues(i), which is indexed among the inputs.*/
        int numReal = 0;
        for (int i = 0; i < Parameters.numAttributes; i++) {
            Attribute at = Attributes.getInputAttribute(i);
            if (at.getType() == Attribute.REAL || at.getType() == Attribute.INTEGER) {
                realAttributes[i] = true;
                numReal++;
            }
        }

        /*Standarization (Normalization) of numeric attributes*/
        double[][] examples = new double[numReal][n];      // normalized, used for distances
        double[][] examplesCopy = new double[numReal][n];  // raw values, used for boundaries
        for (int i = 0, r = 0; i < Parameters.numAttributes; i++) {
            if (!realAttributes[i]) {
                continue;
            }
            Attribute at = Attributes.getInputAttribute(i);
            double min = at.getMinAttribute();
            double range = at.getMaxAttribute() - min;
            // A constant attribute has no spread: map it to 0 instead of dividing by zero,
            // which would poison the whole distance matrix with NaN.
            boolean usable = range != 0.0 && !Double.isNaN(range);
            for (int e = 0; e < n; e++) {
                double value = instances[e].getInputRealValues(i);
                examplesCopy[r][e] = value;
                examples[r][e] = usable ? (value - min) / range : 0.0;
            }
            r++;
        }

        /*Distance matrix computation*/
        double[][] distanceMatrix = squaredDistances(examples, n);

        /*Clusters fusion until level of consistency degrades*/
        int[] clusters = fuseClusters(distanceMatrix, lc);

        //Computation and Identification of clusters and number of them
        boolean[] labelUsed = new boolean[n];
        for (int e = 0; e < n; e++) {
            labelUsed[clusters[e]] = true;
        }
        int numClusters = 0;
        int[] indexOfLabel = new int[n];  // cluster label -> dense cluster index
        for (int label = 0; label < n; label++) {
            if (labelUsed[label]) {
                indexOfLabel[label] = numClusters++;
            }
        }

        //Obtaining the min and max boundaries of the interval of each cluster,
        //together with the set of classes present in each cluster
        double[][][] clusterIntervals = new double[numClusters][numReal][2];
        for (int c = 0; c < numClusters; c++) {
            for (int r = 0; r < numReal; r++) {
                clusterIntervals[c][r][0] = Double.POSITIVE_INFINITY;
                clusterIntervals[c][r][1] = Double.NEGATIVE_INFINITY;
            }
        }
        boolean[][] clusterClasses = new boolean[numClusters][Parameters.numClasses];
        for (int e = 0; e < n; e++) {
            int c = indexOfLabel[clusters[e]];
            clusterClasses[c][classOfInstances[e]] = true;
            for (int r = 0; r < numReal; r++) {
                double value = examplesCopy[r][e];
                if (value < clusterIntervals[c][r][0]) {
                    clusterIntervals[c][r][0] = value;
                }
                if (value > clusterIntervals[c][r][1]) {
                    clusterIntervals[c][r][1] = value;
                }
            }
        }

        //Remove the clusters whose domain is a subdomain of other clusters for each attribute
        //and construct the set of cutpoints
        boolean bHit = false;
        int i = 0;        // index among the input attributes
        int atreal = 0;   // index among the numeric input attributes
        for (int a = 0; i < Parameters.numAttributes; a++) {
            Attribute at = Attributes.getAttribute(a);
            if (at.getDirectionAttribute() == Attribute.INPUT) {
                if (at.getType() == Attribute.REAL || at.getType() == Attribute.INTEGER) {
                    // One slot per cluster plus one for the global upper boundary.
                    // numClusters can be as large as n, hence n + 1.
                    realValues[i] = new double[n + 1];
                    int[] points = new int[n + 1];
                    int numPoints = 0;

                    for (int j = 0; j < numClusters; j++) {
                        if (!isSubsumed(j, atreal, clusterIntervals, clusterClasses)) {
                            points[numPoints++] = j;
                            realValues[i][j] = clusterIntervals[j][atreal][0];
                        }
                    }

                    //search the greatest value of right boundary among the clusters and include into realvalues
                    double max = Double.NEGATIVE_INFINITY;
                    for (int j = 0; j < numClusters; j++) {
                        if (clusterIntervals[j][atreal][1] > max) {
                            max = clusterIntervals[j][atreal][1];
                        }
                    }
                    // The upper boundary always closes the last interval; it must not depend
                    // on whether the last examined cluster happened to be redundant.
                    points[numPoints++] = numClusters;
                    realValues[i][numClusters] = max;

                    sortValues(i, points, 0, numPoints - 1);

                    Vector<Double> cp = discretizeAttribute(i, points, 0, numPoints - 1);
                    if (cp.size() > 0) {
                        cutPoints[i] = new double[cp.size()];
                        for (int j = 0; j < cutPoints[i].length; j++) {
                            cutPoints[i][j] = cp.elementAt(j).doubleValue();
                            LogManager.println("Cut point "+j+" of attribute "+i+" : "+cutPoints[i][j]);
                        }
                    } else {
                        cutPoints[i] = null;
                    }
                    LogManager.println("Number of cut points of attribute "+i+" : "+cp.size());
                    atreal++;
                } else {
                    realAttributes[i] = false;
                }
                i++;
            } else {
                iClassIndex = a;
                bHit = true;
            }
        }
        if (bHit == false) {
            iClassIndex = Parameters.numAttributes;
        }
    }

    /**
     * Computes the consistency level Lc of the original data: one minus the number
     * of example pairs that share every input value but differ in class, divided by
     * the number of examples.
     *
     * @return the consistency level (1 for an empty data set)
     */
    private double datasetConsistency() {
        int n = instances.length;
        if (n == 0) {
            return 1.0;
        }
        int numInputs = Attributes.getInputNumAttributes();
        double conflicts = 0;
        for (int i = 0; i < n; i++) {
            for (int j = i + 1; j < n; j++) {
                if (classOfInstances[i] != classOfInstances[j]
                        && sameInputs(instances[i], instances[j], numInputs)) {
                    conflicts++;
                }
            }
        }
        return 1 - conflicts / (double) n;
    }

    /** Tells whether two examples carry the same value in every input attribute. */
    private static boolean sameInputs(Instance a, Instance b, int numInputs) {
        for (int l = 0; l < numInputs; l++) {
            if (Attributes.getInputAttribute(l).getType() == Attribute.NOMINAL) {
                // Compared by value: '!=' on object-typed nominal values would compare references.
                if (!sameValue(a.getInputNominalValues(l), b.getInputNominalValues(l))) {
                    return false;
                }
            } else if (a.getInputRealValues(l) != b.getInputRealValues(l)) {
                return false;
            }
        }
        return true;
    }

    /** Null-safe value equality. */
    private static boolean sameValue(Object a, Object b) {
        return (a == null) ? (b == null) : a.equals(b);
    }

    /**
     * Builds the symmetric matrix of squared Euclidean distances between examples.
     * The diagonal is set to +infinity so an example is never paired with itself.
     *
     * @param examples normalized values, indexed [numeric attribute][example]
     * @param n number of examples
     */
    private static double[][] squaredDistances(double[][] examples, int n) {
        double[][] dist = new double[n][n];
        for (int i = 0; i < n; i++) {
            dist[i][i] = Double.POSITIVE_INFINITY;
            for (int j = i + 1; j < n; j++) {
                double sum = 0;
                for (int r = 0; r < examples.length; r++) {
                    double diff = examples[r][i] - examples[r][j];
                    sum += diff * diff;
                }
                dist[i][j] = sum;
                dist[j][i] = sum;
            }
        }
        return dist;
    }

    /**
     * Repeatedly fuses the two closest clusters while the consistency level of the
     * fused cluster does not fall below {@code lc}. A cluster whose best fusion is
     * rejected is frozen and no longer starts a fusion. The loop ends when every
     * row is frozen or no finite distance is left.
     *
     * @param dist squared distance matrix; it is updated in place
     * @param lc consistency level of the original data
     * @return for every example, the label of the cluster it ends up in
     */
    private int[] fuseClusters(double[][] dist, double lc) {
        final int n = dist.length;
        int[] clusters = new int[n];
        for (int i = 0; i < n; i++) {
            clusters[i] = i;
        }
        boolean[] active = new boolean[n];
        Arrays.fill(active, true);

        boolean stop = false;
        while (!stop) {
            // closest pair (posi < posj) whose first element is still active
            double minDist = Double.POSITIVE_INFINITY;
            int posi = -1;
            int posj = -1;
            for (int i = 0; i < n; i++) {
                if (!active[i]) {
                    continue;
                }
                for (int j = i + 1; j < n; j++) {
                    if (dist[i][j] < minDist) {
                        posi = i;
                        posj = j;
                        minDist = dist[i][j];
                    }
                }
            }
            if (posi < 0) {
                break;
            }

            int labelI = clusters[posi];
            int labelJ = clusters[posj];

            /*Consistency level computation LcK of the candidate fused cluster*/
            int cont = 0;
            double conflicts = 0;
            for (int i = 0; i < n; i++) {
                if (clusters[i] != labelI && clusters[i] != labelJ) {
                    continue;
                }
                cont++;
                for (int j = i + 1; j < n; j++) {
                    if ((clusters[j] == labelI || clusters[j] == labelJ)
                            && classOfInstances[i] != classOfInstances[j]) {
                        conflicts++;
                    }
                }
            }
            double lcK = 1 - conflicts / (double) cont;

            /*check the level of consistency in new partition K*/
            if (lcK < lc) {
                // Freeze by cluster label, not by row index, so that the row that was
                // just selected is always frozen and the loop is guaranteed to progress.
                for (int i = 0; i < n; i++) {
                    if (clusters[i] == labelI) {
                        active[i] = false;
                    }
                }
                stop = true;
                for (int i = 0; i < n && stop; i++) {
                    if (active[i]) {
                        stop = false;
                    }
                }
            } else {
                // Snapshot before any cell is overwritten: every row needs the ORIGINAL
                // distance between the two fused clusters.
                double dij = dist[posi][posj];

                for (int i = 0; i < n; i++) {
                    if (clusters[i] == labelJ) {
                        clusters[i] = labelI;
                    }
                }

                //distance re-computation: d(k, i+j) = d(k,i)/2 + d(k,j)/2 - d(i,j)/4
                for (int k = 0; k < n; k++) {
                    double updated;
                    if (clusters[k] == labelI) {
                        updated = Double.POSITIVE_INFINITY;  // same cluster: never a candidate pair
                    } else {
                        // read both operands before the row is modified below
                        updated = 0.5 * dist[k][posi] + 0.5 * dist[k][posj] - 0.25 * dij;
                    }
                    for (int j = 0; j < n; j++) {
                        if (clusters[j] == labelI) {
                            dist[k][j] = updated;
                            dist[j][k] = updated;
                        }
                    }
                }
            }
        }
        return clusters;
    }

    /**
     * Tells whether the range of cluster {@code j} on numeric attribute {@code r} lies
     * inside the range of another cluster such that both clusters together contain at
     * most one class. Of two clusters with identical ranges only the higher-indexed
     * one is reported, so they cannot eliminate each other.
     */
    private static boolean isSubsumed(int j, int r, double[][][] intervals, boolean[][] classes) {
        double lo = intervals[j][r][0];
        double hi = intervals[j][r][1];
        for (int l = 0; l < intervals.length; l++) {
            if (l == j) {
                continue;
            }
            double otherLo = intervals[l][r][0];
            double otherHi = intervals[l][r][1];
            if (lo < otherLo || hi > otherHi) {
                continue;  // not contained
            }
            if (lo == otherLo && hi == otherHi && l > j) {
                continue;  // identical ranges: keep the lower-indexed cluster
            }
            // classes are counted per pair, never accumulated across pairs
            int distinct = 0;
            for (int k = 0; k < classes[j].length; k++) {
                if (classes[j][k] || classes[l][k]) {
                    distinct++;
                }
            }
            if (distinct <= 1) {
                return true;
            }
        }
        return false;
    }

    /**
     * <p>
     * Returns a vector with the discretized values.
     * </p>
     * <p>
     * {@code values[begin..end]} index into {@code realValues[attribute]} and are
     * expected to be ordered by boundary value. Repeated boundaries are dropped
     * (this modifies {@code values}), the class distribution of each interval
     * between consecutive boundaries is counted, and an inner boundary is kept as
     * a cut point when the two intervals it separates have, taken together, an
     * entropy greater than zero.
     * </p>
     * @param attribute index of the input attribute
     * @param values indexes of the candidate boundaries in {@code realValues[attribute]}
     * @param begin first position of {@code values} to consider
     * @param end last position of {@code values} to consider (inclusive)
     * @return vector with the discretized values; empty when fewer than two boundaries are given
     */
    protected Vector <Double> discretizeAttribute(int attribute,int []values,int begin,int end) {
        Vector<Double> result = new Vector<Double>();
        if (end <= begin) {
            return result;  // no interval can be formed
        }

        /*Remove repetitions of boundaries*/
        int pos = begin;
        while (pos < end) {
            if (realValues[attribute][values[pos]] == realValues[attribute][values[pos+1]]) {
                for (int j = pos; j < end; j++) {
                    values[j] = values[j+1];
                }
                end--;
                // stay on the same position: the element shifted in may be a duplicate too
            } else {
                pos++;
            }
        }

        /*Computation of class distribution*/
        int numIntervals = end - begin;
        int[][] cd = new int[numIntervals][Parameters.numClasses];
        for (int e = 0; e < instances.length; e++) {
            double value = instances[e].getInputRealValues(attribute);
            for (int j = begin + 1; j <= end; j++) {
                double lower = realValues[attribute][values[j-1]];
                double upper = realValues[attribute][values[j]];
                // intervals are [lower, upper), except the last one which is closed
                boolean belowUpper = (j < end) ? (value < upper) : (value <= upper);
                if (belowUpper && value >= lower) {
                    cd[j-1-begin][classOfInstances[e]]++;
                }
            }
        }

        /*Merge interval with entropy equal to zero*/
        for (int k = begin + 1; k < end; k++) {
            if (computeEntropy(cd[k-1-begin], cd[k-begin]) > 0) {
                result.addElement(Double.valueOf(realValues[attribute][values[k]]));
            }
        }

        return result;
    }

    /**
     * <p>
     * Calculate the log base 2 of a number
     * </p>
     * @param value Number to apply log base2
     * @return log base 2
     */
    public double log2(double value) {
        return Math.log(value)/Math.log(2);
    }

    /**
     * Computes the entropy (base 2) of the class distribution obtained by adding
     * two class-count vectors of the same length.
     *
     * @param cd1 class counts of the first interval
     * @param cd2 class counts of the second interval
     * @return the entropy of the joint distribution; 0 when both intervals are empty
     */
    double computeEntropy(int cd1[],int cd2[]) {
        int numValues = 0;
        for (int i = 0; i < cd1.length; i++) {
            numValues += cd1[i] + cd2[i];
        }
        if (numValues == 0) {
            return 0.0;
        }

        double ent = 0;
        for (int i = 0; i < cd1.length; i++) {
            double prob = (double) (cd1[i] + cd2[i]) / (double) numValues;
            if (prob > 0.0) {
                ent += prob * log2(prob);
            }
        }
        return -ent;
    }
}