/* Copyright (C) 2007 Niels Ott Copyright (C) 2007 Ramon Ziai This file is part of Clusterlib. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 */ package edu.isistan.uima.unified.algorithms.clustering; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import edu.isistan.uima.unified.algorithms.clustering.data.Cluster; import edu.isistan.uima.unified.algorithms.clustering.data.CompositeCluster; import edu.isistan.uima.unified.algorithms.clustering.data.DataPoint; import edu.isistan.uima.unified.algorithms.clustering.data.DefaultDataPoint; import edu.isistan.uima.unified.algorithms.clustering.distance.DefaultDistanceMeasure; import edu.isistan.uima.unified.algorithms.clustering.distance.DistanceMeasure; import edu.isistan.uima.unified.algorithms.clustering.linkage.LinkageMethod; /** * This class implements general agglomerative clustering. Convenience methods * exist for clustering lists of doubles and for clustering into one final cluster. * Input data is always sorted before clustering in order to ensure reproducible * results regardless of input order. * This implementation is based on the pseudocode * given by * <a href="http://www.schulteimwalde.de/phd-thesis.html">Schulte im Walde (2003, p. 186)</a>:<br> * @author Niels Ott * @author Ramon Ziai * @version $Id: HierarchicalClusterer.java 121 2008-04-17 11:27:23Z ramon $ */ public class HierarchicalClusterer { /* * Private empty constructor to prevent instantiation */ private HierarchicalClusterer() {} /** * Main clustering method. Performs agglomerative clustering according to the * given linkage method and desired number of clusters. Throws an * IllegalArgumentException if parameters don't make sense. * @param dataPoints The data points to cluster * @param method The linkage method to use * @param numOfClusters The desired number of clusters * @return A list of clusters */ public static List<Cluster> cluster(List<DataPoint> dataPoints, LinkageMethod method, DistanceMeasure measure, int numOfClusters) { // check if parameters are fine, throw exception if not if (!configIsValid(dataPoints, numOfClusters)) { throw new IllegalArgumentException(); } // at the beginning each data point is one cluster ArrayList<Cluster> results = new ArrayList<Cluster>(); results.addAll(dataPoints); // sort input data according to values Collections.sort(results, new Comparator<Cluster>() { public int compare(Cluster o1, Cluster o2) { // cast to data points, we know that at this point // there can be no composites DataPoint dp1 = (DataPoint) o1; DataPoint dp2 = (DataPoint) o2; // defer comparing to double values return Double.compare(dp1.getValue(), dp2.getValue()); } }); // loop while we still have too many clusters while (results.size() > numOfClusters) { // collect best values here double minDistance = Double.MAX_VALUE; Cluster bestC1 = null; Cluster bestC2 = null; // do the following for each pair for (Cluster c1 : results) { for (Cluster c2 : results) { if (c1.equals(c2)) { continue; } // compute distance double dist = method.computeDistance(c1, c2, measure); // found better? update best values! if (dist < minDistance) { minDistance = dist; bestC1 = c1; bestC2 = c2; } } } // create new cluster and add the two others found CompositeCluster newComp = new CompositeCluster(); newComp.add(bestC1); newComp.add(bestC2); // remove them from the results set results.remove(bestC1); results.remove(bestC2); // add the new cluster results.add(newComp); } return results; } /** * Helper method which performs sanity check on input parameters. * @param dataPoints The data points * @param numOfClusters The desired number of clusters * @return <code>true</code> if parameters are fine, <code>false</code> otherwise */ private static boolean configIsValid(List<DataPoint> dataPoints, int numOfClusters) { return !dataPoints.isEmpty() && numOfClusters >= 1 && dataPoints.size() >= numOfClusters; } /** * Convenience method for clustering into one big cluster. * @param dataPoints The data points to cluster * @param method The linkage method to use * @return A cluster that contains all others */ public static Cluster cluster(List<DataPoint> dataPoints, LinkageMethod method, DistanceMeasure measure) { return cluster(dataPoints, method, measure, 1).get(0); } /** * Convenience method to cluster a list of doubles. * @param dataPoints The doubles to cluster * @param method The linkage method to use * @param numOfClusters The desired number of clusters * @return A list of clusters */ public static List<Cluster> clusterDoubles(List<Double> dataPoints, LinkageMethod method, int numOfClusters) { // make data points for the doubles ArrayList<DataPoint> realDataPoints = new ArrayList<DataPoint>(); for (Double d : dataPoints) { realDataPoints.add(new DefaultDataPoint(d)); } // call main cluster method return cluster(realDataPoints, method, new DefaultDistanceMeasure(), numOfClusters); } /** * Convenience method to cluster a list of doubles into one big cluster. * @param dataPoints The doubles to cluster * @param method The linkage method to use * @return A cluster that contains all others */ public static Cluster clusterDoubles(List<Double> dataPoints, LinkageMethod method) { return clusterDoubles(dataPoints, method, 1).get(0); } }