package edu.isistan.uima.unified.algorithms.clustering; import java.util.ArrayList; import java.util.List; import edu.isistan.uima.unified.algorithms.clustering.data.Cluster; import edu.isistan.uima.unified.algorithms.clustering.data.CompositeCluster; import edu.isistan.uima.unified.algorithms.clustering.data.DataPoint; import edu.isistan.uima.unified.algorithms.clustering.data.DefaultDataPoint; import edu.isistan.uima.unified.algorithms.clustering.distance.DefaultDistanceMeasure; import edu.isistan.uima.unified.algorithms.clustering.distance.DistanceMeasure; import edu.isistan.uima.unified.algorithms.clustering.linkage.LinkageMethod; public class CMCMClusterer { /* * Private empty constructor to prevent instantiation */ private CMCMClusterer() {} /** * Main clustering method. Performs clustering according to the * given linkage method. Maximizes the cohesion intra-cluster and * minimizes the copling inter-cluster. It doesn't need a fixed * number of desired clusters as input. Throws an * IllegalArgumentException if parameters don't make sense. * @param dataPoints The data points to cluster * @param method The linkage method to use * @return A list of clusters */ public static List<Cluster> cluster(List<DataPoint> dataPoints, LinkageMethod method, DistanceMeasure measure, double minClusterDistance) { // check if parameters are fine, throw exception if not if (!configIsValid(dataPoints)) { throw new IllegalArgumentException(); } // at the beginning each data point is one cluster ArrayList<Cluster> results = new ArrayList<Cluster>(); results.addAll(dataPoints); boolean merged = true; // loop while we still have too many clusters while (merged) { // collect best values here double minDistance = minClusterDistance; Cluster bestC1 = null; Cluster bestC2 = null; // do the following for each pair for (Cluster c1 : results) { for (Cluster c2 : results) { if (c1.equals(c2)) { continue; } // compute distance double dist = method.computeDistance(c1, c2, measure); // found better? update best values! if (dist < minDistance ) { minDistance = dist; bestC1 = c1; bestC2 = c2; } } } if(bestC1 != null && bestC2 != null) { // create new cluster and add the two others found CompositeCluster newComp = new CompositeCluster(); newComp.add(bestC1); newComp.add(bestC2); // remove them from the results set results.remove(bestC1); results.remove(bestC2); // add the new cluster results.add(newComp); } else { merged = false; } } return results; } public static List<Cluster> recluster(List<Cluster> results, LinkageMethod method, DistanceMeasure measure, double minClusterDistance) { boolean merged = true; // loop while we still have too many clusters while (merged) { // collect best values here double minDistance = minClusterDistance; Cluster bestC1 = null; Cluster bestC2 = null; // do the following for each pair for (Cluster c1 : results) { for (Cluster c2 : results) { if (c1.equals(c2)) { continue; } // compute distance double dist = method.computeDistance(c1, c2, measure); // found better? update best values! if (dist < minDistance ) { minDistance = dist; bestC1 = c1; bestC2 = c2; } } } if(bestC1 != null && bestC2 != null) { // create new cluster and add the two others found CompositeCluster newComp = new CompositeCluster(); newComp.add(bestC1); newComp.add(bestC2); // remove them from the results set results.remove(bestC1); results.remove(bestC2); // add the new cluster results.add(newComp); } else { merged = false; } } return results; } /** * Helper method which performs sanity check on input parameters. * @param dataPoints The data points * @return <code>true</code> if parameters are fine, <code>false</code> otherwise */ private static boolean configIsValid(List<DataPoint> dataPoints) { return !dataPoints.isEmpty(); } /** * Convenience method to cluster a list of doubles. * @param dataPoints The doubles to cluster * @param method The linkage method to use * @param numOfClusters The desired number of clusters * @return A list of clusters */ public static List<Cluster> clusterDoubles(List<Double> dataPoints, LinkageMethod method, double minClusterDistance) { // make data points for the doubles ArrayList<DataPoint> realDataPoints = new ArrayList<DataPoint>(); for (Double d : dataPoints) { realDataPoints.add(new DefaultDataPoint(d)); } // call main cluster method return cluster(realDataPoints, method, new DefaultDistanceMeasure(), minClusterDistance); } }