package com.yahoo.labs.samoa.evaluation.measures; /* * #%L * SAMOA * %% * Copyright (C) 2011 RWTH Aachen University, Germany * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.yahoo.labs.samoa.instances.Instance; import com.yahoo.labs.samoa.moa.cluster.Clustering; import com.yahoo.labs.samoa.moa.core.AutoExpandVector; import com.yahoo.labs.samoa.moa.core.DataPoint; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; /** * [CMM_GTAnalysis.java] * * CMM: Ground truth analysis * * Reference: Kremer et al., "An Effective Evaluation Measure for Clustering on Evolving Data Streams", KDD, 2011 * * @author Timm jansen * Data Management and Data Exploration Group, RWTH Aachen University */ /* * TODO: * - try to avoid calcualting the radius multiple times * - avoid the full distance map? * - knn functionality in clusters * - noise error */ public class CMM_GTAnalysis{ /** * the given ground truth clustering */ private Clustering gtClustering; /** * list of given points within the horizon */ private ArrayList<CMMPoint> cmmpoints; /** * the newly calculate ground truth clustering */ private ArrayList<GTCluster> gt0Clusters; /** * IDs of noise points */ private ArrayList<Integer> noise; /** * total number of points */ private int numPoints; /** * number of clusters of the original ground truth */ private int numGTClusters; /** * number of classes of the original ground truth, in case of a * micro clustering ground truth this differs from numGTClusters */ private int numGTClasses; /** * number of classes after we are done with the analysis */ private int numGT0Classes; /** * number of dimensions */ private int numDims; /** * mapping between true cluster ID/class label of the original ground truth * and the internal cluster ID/working class label. * * different original cluster IDs might map to the same new cluster ID due to merging of two clusters */ private HashMap<Integer, Integer> mapTrueLabelToWorkLabel; /** * log of how clusters have been merged (for debugging) */ private int[] mergeMap; /** * number of non-noise points that will create an error due to the underlying clustering model * (e.g. point being covered by two clusters representing different classes) */ private int noiseErrorByModel; /** * number of noise points that will create an error due to the underlying clustering model * (e.g. noise point being covered by a cluster) */ private int pointErrorByModel; /** * CMM debug mode */ private boolean debug = false; /******* CMM parameter ***********/ /** * defines how many nearest neighbors will be used */ private int knnNeighbourhood = 2; /** * the threshold which defines when ground truth clusters will be merged. * set to 1 to disable merging */ private double tauConnection = 0.5; /** * experimental (default: disabled) * separate k for points to cluster and cluster to cluster */ private double clusterConnectionMaxPoints = knnNeighbourhood; /** * experimental (default: disabled) * use exponential connectivity function to model different behavior: * closer points will have a stronger connection compared to the linear function. * Use ConnRefXValue and ConnX to better parameterize lambda, which controls * the decay of the connectivity */ private boolean useExpConnectivity = false; private double lambdaConnRefXValue = 0.01; private double lambdaConnX = 4; private double lamdaConn; /******************************************/ /** * Wrapper class for data points to store CMM relevant attributes * */ protected class CMMPoint extends DataPoint{ /** * Reference to original point */ protected DataPoint p = null; /** * point ID */ protected int pID = 0; /** * true class label */ protected int trueClass = -1; /** * the connectivity of the point to its cluster */ protected double connectivity = 1.0; /** * knn distnace within own cluster */ protected double knnInCluster = 0.0; /** * knn indices (for debugging only) */ protected ArrayList<Integer> knnIndices; public CMMPoint(DataPoint point, int id) { //make a copy, but keep reference super(point,point.getTimestamp()); p = point; pID = id; trueClass = (int)point.classValue(); } /** * Retruns the current working label of the cluster the point belongs to. * The label can change due to merging of clusters. * * @return the current working class label */ protected int workclass(){ if(trueClass == -1 ) return -1; else return mapTrueLabelToWorkLabel.get(trueClass); } } /** * Main class to model the new clusters that will be the output of the cluster analysis * */ protected class GTCluster{ /** points that are per definition in the cluster */ private ArrayList<Integer> points = new ArrayList<Integer>(); /** a new GT cluster consists of one or more "old" GT clusters. * Connected/overlapping clusters cannot be merged directly because of the * underlying cluster model. E.g. for merging two spherical clusters the new * cluster sphere can cover a lot more space then two separate smaller spheres. * To keep the original coverage we need to keep the orignal clusters and merge * them on an abstract level. */ private ArrayList<Integer> clusterRepresentations = new ArrayList<Integer>(); /** current work class (changes when merging) */ private int workclass; /** original work class */ private final int orgWorkClass; /** original class label*/ private final int label; /** clusters that have been merged into this cluster (debugging)*/ private ArrayList<Integer> mergedWorkLabels = null; /** average knn distance of all points in the cluster*/ private double knnMeanAvg = 0; /** average deviation of knn distance of all points*/ private double knnDevAvg = 0; /** connectivity of the cluster to all other clusters */ private ArrayList<Double> connections = new ArrayList<Double>(); private GTCluster(int workclass, int label, int gtClusteringID) { this.orgWorkClass = workclass; this.workclass = workclass; this.label = label; this.clusterRepresentations.add(gtClusteringID); } /** * The original class label the cluster represents * @return original class label */ protected int getLabel(){ return label; } /** * Calculate the probability of the point being covered through the cluster * @param point to calculate the probability for * @return probability of the point being covered through the cluster */ protected double getInclusionProbability(CMMPoint point){ double prob = Double.MIN_VALUE; //check all cluster representatives for coverage for (int c = 0; c < clusterRepresentations.size(); c++) { double tmp_prob = gtClustering.get(clusterRepresentations.get(c)).getInclusionProbability(point); if(tmp_prob > prob) prob = tmp_prob; } return prob; } /** * calculate knn distances of points within own cluster * + average knn distance and average knn distance deviation of all points */ private void calculateKnn(){ for (int p0 : points) { CMMPoint cmdp = cmmpoints.get(p0); if(!cmdp.isNoise()){ AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); //calculate nearest neighbours getKnnInCluster(cmdp, knnNeighbourhood, points, knnDist,knnPointIndex); //TODO: What to do if we have less then k neighbours? double avgKnn = 0; for (int i = 0; i < knnDist.size(); i++) { avgKnn+= knnDist.get(i); } if(knnDist.size()!=0) avgKnn/=knnDist.size(); cmdp.knnInCluster = avgKnn; cmdp.knnIndices = knnPointIndex; cmdp.p.setMeasureValue("knnAvg", cmdp.knnInCluster); knnMeanAvg+=avgKnn; knnDevAvg+=Math.pow(avgKnn,2); } } knnMeanAvg=knnMeanAvg/(double)points.size(); knnDevAvg=knnDevAvg/(double)points.size(); double variance = knnDevAvg-Math.pow(knnMeanAvg,2.0); // Due to numerical errors, small negative values can occur. if (variance <= 0.0) variance = 1e-50; knnDevAvg = Math.sqrt(variance); } /** * Calculate the connection of a cluster to this cluster * @param otherCid cluster id of the other cluster * @param initial flag for initial run */ private void calculateClusterConnection(int otherCid, boolean initial){ double avgConnection = 0; if(workclass==otherCid){ avgConnection = 1; } else{ AutoExpandVector<Double> kmax = new AutoExpandVector<Double>(); AutoExpandVector<Integer> kmaxIndexes = new AutoExpandVector<Integer>(); for(int p : points){ CMMPoint cmdp = cmmpoints.get(p); double con_p_Cj = getConnectionValue(cmmpoints.get(p), otherCid); double connection = cmdp.connectivity * con_p_Cj; if(initial){ cmdp.p.setMeasureValue("Connection to C"+otherCid, con_p_Cj); } //connection if(kmax.size() < clusterConnectionMaxPoints || connection > kmax.get(kmax.size()-1)){ int index = 0; while(index < kmax.size() && connection < kmax.get(index)) { index++; } kmax.add(index, connection); kmaxIndexes.add(index, p); if(kmax.size() > clusterConnectionMaxPoints){ kmax.remove(kmax.size()-1); kmaxIndexes.add(kmaxIndexes.size()-1); } } } //connection for (int k = 0; k < kmax.size(); k++) { avgConnection+= kmax.get(k); } avgConnection/=kmax.size(); } if(otherCid<connections.size()){ connections.set(otherCid, avgConnection); } else if(connections.size() == otherCid){ connections.add(avgConnection); } else System.out.println("Something is going really wrong with the connection listing!"+knnNeighbourhood+" "+tauConnection); } /** * Merge a cluster into this cluster * @param mergeID the ID of the cluster to be merged */ private void mergeCluster(int mergeID){ if(mergeID < gt0Clusters.size()){ //track merging (debugging) for (int i = 0; i < numGTClasses; i++) { if(mergeMap[i]==mergeID) mergeMap[i]=workclass; if(mergeMap[i]>mergeID) mergeMap[i]--; } GTCluster gtcMerge = gt0Clusters.get(mergeID); if(debug) System.out.println("Merging C"+gtcMerge.workclass+" into C"+workclass+ " with Con "+connections.get(mergeID)+" / "+gtcMerge.connections.get(workclass)); //update mapTrueLabelToWorkLabel mapTrueLabelToWorkLabel.put(gtcMerge.label, workclass); Iterator iterator = mapTrueLabelToWorkLabel.keySet().iterator(); while (iterator.hasNext()) { Integer key = (Integer)iterator.next(); //update pointer of already merged cluster int value = mapTrueLabelToWorkLabel.get(key); if(value == mergeID) mapTrueLabelToWorkLabel.put(key, workclass); if(value > mergeID) mapTrueLabelToWorkLabel.put(key, value-1); } //merge points from B into A points.addAll(gtcMerge.points); clusterRepresentations.addAll(gtcMerge.clusterRepresentations); if(mergedWorkLabels==null){ mergedWorkLabels = new ArrayList<Integer>(); } mergedWorkLabels.add(gtcMerge.orgWorkClass); if(gtcMerge.mergedWorkLabels!=null) mergedWorkLabels.addAll(gtcMerge.mergedWorkLabels); gt0Clusters.remove(mergeID); //update workclass labels for(int c=mergeID; c < gt0Clusters.size(); c++){ gt0Clusters.get(c).workclass = c; } //update knn distances calculateKnn(); for(int c=0; c < gt0Clusters.size(); c++){ gt0Clusters.get(c).connections.remove(mergeID); //recalculate connection from other clusters to the new merged one gt0Clusters.get(c).calculateClusterConnection(workclass,false); //and from new merged one to other clusters gt0Clusters.get(workclass).calculateClusterConnection(c,false); } } else{ System.out.println("Merge indices are not valid"); } } } /** * @param trueClustering the ground truth clustering * @param points data points * @param enableClassMerge allow class merging (should be set to true on default) */ public CMM_GTAnalysis(Clustering trueClustering, ArrayList<DataPoint> points, boolean enableClassMerge){ if(debug) System.out.println("GT Analysis Debug Output"); noiseErrorByModel = 0; pointErrorByModel = 0; if(!enableClassMerge){ tauConnection = 1.0; } lamdaConn = -Math.log(lambdaConnRefXValue)/Math.log(2)/lambdaConnX; this.gtClustering = trueClustering; numPoints = points.size(); numDims = points.get(0).numAttributes()-1; numGTClusters = gtClustering.size(); //init mappings between work and true labels mapTrueLabelToWorkLabel = new HashMap<Integer, Integer>(); //set up base of new clustering gt0Clusters = new ArrayList<GTCluster>(); int numWorkClasses = 0; //create label to worklabel mapping as real labels can be just a set of unordered integers for (int i = 0; i < numGTClusters; i++) { int label = (int)gtClustering.get(i).getGroundTruth(); if(!mapTrueLabelToWorkLabel.containsKey(label)){ gt0Clusters.add(new GTCluster(numWorkClasses,label,i)); mapTrueLabelToWorkLabel.put(label,numWorkClasses); numWorkClasses++; } else{ gt0Clusters.get(mapTrueLabelToWorkLabel.get(label)).clusterRepresentations.add(i); } } numGTClasses = numWorkClasses; mergeMap = new int[numGTClasses]; for (int i = 0; i < numGTClasses; i++) { mergeMap[i]=i; } //create cmd point wrapper instances cmmpoints = new ArrayList<CMMPoint>(); for (int p = 0; p < points.size(); p++) { CMMPoint cmdp = new CMMPoint(points.get(p), p); cmmpoints.add(cmdp); } //split points up into their GTClusters and Noise (according to class labels) noise = new ArrayList<Integer>(); for (int p = 0; p < numPoints; p++) { if(cmmpoints.get(p).isNoise()){ noise.add(p); } else{ gt0Clusters.get(cmmpoints.get(p).workclass()).points.add(p); } } //calculate initial knnMean and knnDev for (GTCluster gtc : gt0Clusters) { gtc.calculateKnn(); } //calculate cluster connections calculateGTClusterConnections(); //calculate point connections with own clusters calculateGTPointQualities(); if(debug) System.out.println("GT Analysis Debug End"); } /** * Calculate the connection of a point to a cluster * * @param cmmp the point to calculate the connection for * @param clusterID the corresponding cluster * @return the connection value */ //TODO: Cache the connection value for a point to the different clusters??? protected double getConnectionValue(CMMPoint cmmp, int clusterID){ AutoExpandVector<Double> knnDist = new AutoExpandVector<Double>(); AutoExpandVector<Integer> knnPointIndex = new AutoExpandVector<Integer>(); //calculate the knn distance of the point to the cluster getKnnInCluster(cmmp, knnNeighbourhood, gt0Clusters.get(clusterID).points, knnDist, knnPointIndex); //TODO: What to do if we have less then k neighbors? double avgDist = 0; for (int i = 0; i < knnDist.size(); i++) { avgDist+= knnDist.get(i); } //what to do if we only have a single point??? if(knnDist.size()!=0) avgDist/=knnDist.size(); else return 0; //get the upper knn distance of the cluster double upperKnn = gt0Clusters.get(clusterID).knnMeanAvg + gt0Clusters.get(clusterID).knnDevAvg; /* calculate the connectivity based on knn distance of the point within the cluster and the upper knn distance of the cluster*/ if(avgDist < upperKnn){ return 1; } else{ //value that should be reached at upperKnn distance //Choose connection formula double conn; if(useExpConnectivity) conn = Math.pow(2,-lamdaConn*(avgDist-upperKnn)/upperKnn); else conn = upperKnn/avgDist; if(Double.isNaN(conn)) System.out.println("Connectivity NaN at "+cmmp.p.getTimestamp()); return conn; } } /** * @param cmmp point to calculate knn distance for * @param k number of nearest neighbors to look for * @param pointIDs list of point IDs to check * @param knnDist sorted list of smallest knn distances (can already be filled to make updates possible) * @param knnPointIndex list of corresponding knn indices */ private void getKnnInCluster(CMMPoint cmmp, int k, ArrayList<Integer> pointIDs, AutoExpandVector<Double> knnDist, AutoExpandVector<Integer> knnPointIndex) { //iterate over every point in the choosen cluster, cal distance and insert into list for (int p1 = 0; p1 < pointIDs.size(); p1++) { int pid = pointIDs.get(p1); if(cmmp.pID == pid) continue; double dist = distance(cmmp,cmmpoints.get(pid)); if(knnDist.size() < k || dist < knnDist.get(knnDist.size()-1)){ int index = 0; while(index < knnDist.size() && dist > knnDist.get(index)) { index++; } knnDist.add(index, dist); knnPointIndex.add(index,pid); if(knnDist.size() > k){ knnDist.remove(knnDist.size()-1); knnPointIndex.remove(knnPointIndex.size()-1); } } } } /** * calculate initial connectivities */ private void calculateGTPointQualities(){ for (int p = 0; p < numPoints; p++) { CMMPoint cmdp = cmmpoints.get(p); if(!cmdp.isNoise()){ cmdp.connectivity = getConnectionValue(cmdp, cmdp.workclass()); cmdp.p.setMeasureValue("Connectivity", cmdp.connectivity); } } } /** * Calculate connections between clusters and merge clusters accordingly as * long as connections exceed threshold */ private void calculateGTClusterConnections(){ for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { for (int c1 = 0; c1 < gt0Clusters.size(); c1++) { gt0Clusters.get(c0).calculateClusterConnection(c1, true); } } boolean changedConnection = true; while(changedConnection){ if(debug){ System.out.println("Cluster Connection"); for (int c = 0; c < gt0Clusters.size(); c++) { System.out.print("C"+gt0Clusters.get(c).label+" --> "); for (int c1 = 0; c1 < gt0Clusters.get(c).connections.size(); c1++) { System.out.print(" C"+gt0Clusters.get(c1).label+": "+gt0Clusters.get(c).connections.get(c1)); } System.out.println(""); } System.out.println(""); } double max = 0; int maxIndexI = -1; int maxIndexJ = -1; changedConnection = false; for (int c0 = 0; c0 < gt0Clusters.size(); c0++) { for (int c1 = c0+1; c1 < gt0Clusters.size(); c1++) { if(c0==c1) continue; double min =Math.min(gt0Clusters.get(c0).connections.get(c1), gt0Clusters.get(c1).connections.get(c0)); if(min > max){ max = min; maxIndexI = c0; maxIndexJ = c1; } } } if(maxIndexI!=-1 && max > tauConnection){ gt0Clusters.get(maxIndexI).mergeCluster(maxIndexJ); if(debug) System.out.println("Merging "+maxIndexI+" and "+maxIndexJ+" because of connection "+max); changedConnection = true; } } numGT0Classes = gt0Clusters.size(); } /** * Calculates how well the original clusters are separable. * Small values indicate bad separability, values close to 1 indicate good separability * @return index of seperability */ public double getClassSeparability(){ // int totalConn = numGTClasses*(numGTClasses-1)/2; // int mergedConn = 0; // for(GTCluster gt : gt0Clusters){ // int merged = gt.clusterRepresentations.size(); // if(merged > 1) // mergedConn+=merged * (merged-1)/2; // } // if(totalConn == 0) // return 0; // else // return 1-mergedConn/(double)totalConn; return numGT0Classes/(double)numGTClasses; } /** * Calculates how well noise is separable from the given clusters * Small values indicate bad separability, values close to 1 indicate good separability * @return index of noise separability */ public double getNoiseSeparability(){ if(noise.isEmpty()) return 1; double connectivity = 0; for(int p : noise){ CMMPoint npoint = cmmpoints.get(p); double maxConnection = 0; //TODO: some kind of pruning possible. what about weighting? for (int c = 0; c < gt0Clusters.size(); c++) { double connection = getConnectionValue(npoint, c); if(connection > maxConnection) maxConnection = connection; } connectivity+=maxConnection; npoint.p.setMeasureValue("MaxConnection", maxConnection); } return 1-(connectivity / noise.size()); } /** * Calculates the relative number of errors being caused by the underlying cluster model * @return quality of the model */ public double getModelQuality(){ for(int p = 0; p < numPoints; p++){ CMMPoint cmdp = cmmpoints.get(p); for(int hc = 0; hc < numGTClusters;hc++){ if(gtClustering.get(hc).getGroundTruth() != cmdp.trueClass){ if(gtClustering.get(hc).getInclusionProbability(cmdp) >= 1){ if(!cmdp.isNoise()) pointErrorByModel++; else noiseErrorByModel++; break; } } } } if(debug) System.out.println("Error by model: noise "+noiseErrorByModel+" point "+pointErrorByModel); return 1-((pointErrorByModel + noiseErrorByModel)/(double) numPoints); } /** * Get CMM internal point * @param index of the point * @return cmm point */ protected CMMPoint getPoint(int index){ return cmmpoints.get(index); } /** * Return cluster * @param index of the cluster to return * @return cluster */ protected GTCluster getGT0Cluster(int index){ return gt0Clusters.get(index); } /** * Number of classes/clusters of the new clustering * @return number of new clusters */ protected int getNumberOfGT0Classes() { return numGT0Classes; } /** * Calculates Euclidian distance * @param inst1 point as double array * @param inst2 point as double array * @return euclidian distance */ private double distance(Instance inst1, Instance inst2){ return distance(inst1, inst2.toDoubleArray()); } /** * Calculates Euclidian distance * @param inst1 point as an instance * @param inst2 point as double array * @return euclidian distance */ private double distance(Instance inst1, double[] inst2){ double distance = 0.0; for (int i = 0; i < numDims; i++) { double d = inst1.value(i) - inst2[i]; distance += d * d; } return Math.sqrt(distance); } /** * String with main CMM parameters * @return main CMM parameter */ public String getParameterString(){ String para = ""; para+="k="+knnNeighbourhood+";"; if(useExpConnectivity){ para+="lambdaConnX="+lambdaConnX+";"; para+="lambdaConn="+lamdaConn+";"; para+="lambdaConnRef="+lambdaConnRefXValue+";"; } para+="m="+clusterConnectionMaxPoints+";"; para+="tauConn="+tauConnection+";"; return para; } }