/*
 * File:                CustomClusteringExample.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 *
 * Copyright September 11, 2008, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government. Export
 * of this program may require a license from the United States Government.
 * See CopyrightHistory.txt for complete details.
 *
 */

package examples;

import gov.sandia.cognition.collection.CollectionUtil;
import gov.sandia.cognition.learning.algorithm.clustering.AgglomerativeClusterer;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.CentroidCluster;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.ClusterCreator;
import gov.sandia.cognition.learning.algorithm.clustering.divergence.ClusterMeanLinkDivergenceFunction;
import gov.sandia.cognition.math.Metric;
import gov.sandia.cognition.math.matrix.mtj.Vector3;
import gov.sandia.cognition.statistics.distribution.UnivariateGaussian;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Random;

/**
 * This is an example of using a custom distance metric and cluster creator with
 * agglomerative clustering.
 *
 * @author Justin Basilico
 * @since  2.1
 */
public class CustomClusteringExample
{

    /**
     * Main method. Generates random example data, clusters it with an
     * agglomerative clusterer that uses the custom metric and the custom
     * cluster creator defined in this class, and prints the resulting
     * clusters to standard out.
     *
     * @param   args
     *      We don't take any command-line arguments.
     */
    public static void main(
        final String... args)
    {
        // Step 1: Get the data.
        // First we generate some data to cluster. Typically, one already has
        // the data loaded. The important thing is that it gets turned into a
        // Collection.
        final int numClusters = 4;
        final Random random = new Random();
        final Collection<ExampleData> data = generateData(numClusters, random);

        // Step 2: Create our metric.
        // We are going to use a custom metric, which is defined below.
        final Metric<ExampleData> metric = new CustomMetric();

        // Step 3: Choose the cluster creator.
        // We also need to provide a cluster creator for our algorithm. Here
        // we make use of our custom cluster creator. Note that we could have
        // used the DefaultClusterCreator with the divergence function we are
        // using, but by using our own custom creator, we can pull out
        // information regarding the centroid after clustering is complete.
        final ClusterCreator<CentroidCluster<ExampleData>, ExampleData>
            clusterCreator = new CustomClusterCreator();

        // Step 4: Choose the type of agglomerative clustering.
        // The key to using the custom metric is that it needs to be given to
        // the method of cluster-to-cluster divergence that we will provide to
        // the agglomerative clustering algorithm. Here we pick "mean link"
        // as our type of clustering, but we could alternatively have chosen
        // one of the others such as "complete link" or "single link".
        final ClusterMeanLinkDivergenceFunction<CentroidCluster<ExampleData>, ExampleData>
            clusterDivergence =
                new ClusterMeanLinkDivergenceFunction<CentroidCluster<ExampleData>, ExampleData>(
                    metric);

        // Step 5: Create and initialize the clustering algorithm.
        // Next we populate the agglomerative clustering algorithm with the
        // cluster divergence and cluster creator that we just created.
        final AgglomerativeClusterer<ExampleData, CentroidCluster<ExampleData>>
            clusterer =
                new AgglomerativeClusterer<ExampleData, CentroidCluster<ExampleData>>(
                    clusterDivergence, clusterCreator);

        // Now we set the minimum number of clusters to create. Alternatively,
        // we could set the "maximum minimum distance", which is a threshold of
        // the maximum allowed minimum distance between clusters, which stops
        // the clustering when the clusters are too far away.
        clusterer.setMinNumClusters(numClusters);

        // Step 6: Run the clustering.
        // Now we run the clustering by passing our data to the clustering
        // algorithm.
        final Collection<CentroidCluster<ExampleData>> clusters =
            clusterer.learn(data);

        // Step 7: Use the clusters.
        // Now we can use the clusters that we've created.
        printClusters(clusters);
    }

    /**
     * The example data holds a three-dimensional position value, plus an
     * integer score associated with that position.
     */
    public static class ExampleData
    {

        /** The three-dimensional position of the example. */
        private Vector3 position;

        /** The score of the example. */
        private int score;

        /**
         * Creates a new example data instance.
         *
         * @param   position The position.
         * @param   score The score.
         */
        public ExampleData(
            final Vector3 position,
            final int score)
        {
            super();

            this.setPosition(position);
            this.setScore(score);
        }

        /**
         * Gets the position.
         *
         * @return The position.
         */
        public Vector3 getPosition()
        {
            return position;
        }

        /**
         * Sets the position.
         *
         * @param   position The position.
         */
        public void setPosition(
            final Vector3 position)
        {
            this.position = position;
        }

        /**
         * Gets the score.
         *
         * @return The score.
         */
        public int getScore()
        {
            return score;
        }

        /**
         * Sets the score.
         *
         * @param   score The score.
         */
        public void setScore(
            final int score)
        {
            this.score = score;
        }

        /**
         * Renders the example as its position followed by its score.
         *
         * @return The string form of this example.
         */
        @Override
        public String toString()
        {
            return "position = " + this.getPosition()
                + ", score = " + this.getScore();
        }

    }

    /**
     * This implements our custom metric. The Metric interface specifies that
     * the value returned must fulfill the requirements of being a mathematical
     * metric, which are:
     *
     *     g(x, y) + g(y, z) >= g(x, z)
     *     g(x, y) == g(y, x)
     *     g(x, x) == 0
     */
    public static class CustomMetric
        extends AbstractCloneableSerializable
        implements Metric<ExampleData>
    {

        /**
         * Computes the distance between two examples as a weighted sum of
         * the euclidean distance between their positions and the absolute
         * difference between their scores.
         *
         * @param   first The first example.
         * @param   second The second example.
         * @return
         *      Two times the position distance plus half of the absolute
         *      score difference.
         */
        @Override
        public double evaluate(
            final ExampleData first,
            final ExampleData second)
        {
            // To compute the metric, we use the euclidean distance in position
            // and the difference in the score.

            // First get the euclidean distance between the positions. We
            // already know that euclidean distance fulfills the contract for
            // being a Metric.
            final double positionDifference =
                first.getPosition().euclideanDistance(second.getPosition());

            // Now compute the difference in score. Note that we need to use
            // the absolute value here in order to fulfill the contract for
            // a Metric. The subtraction is done in long arithmetic so that
            // extreme scores cannot overflow int and yield a negative
            // "absolute" value, which would violate that contract.
            final long scoreDifference = Math.abs(
                (long) first.getScore() - (long) second.getScore());

            // We weight the difference in position higher than a difference
            // in score for some extra customization.
            return 2.0 * positionDifference + 0.5 * scoreDifference;
        }

    }

    /**
     * This implements a custom cluster creator. We make use of the
     * ClusterCreator interface to create our own CentroidCluster, which is
     * the type of cluster that has an object representing its centroid. In
     * our case, the centroid is the mean element of the cluster. To implement
     * this, we just have to compute the mean over the two fields in our
     * data type: the position and the score.
     */
    public static class CustomClusterCreator
        extends AbstractCloneableSerializable
        implements ClusterCreator<CentroidCluster<ExampleData>, ExampleData>
    {

        /**
         * Creates a cluster whose centroid is the mean of the given members:
         * the mean position paired with the mean score rounded to the
         * nearest integer.
         *
         * @param   members The members to build the cluster from.
         * @return
         *      The cluster containing the members and their centroid, or
         *      null if there are no members.
         * @throws NullPointerException If members is null.
         */
        @Override
        public CentroidCluster<ExampleData> createCluster(
            final Collection<? extends ExampleData> members)
        {
            if (members == null)
            {
                // Error: Members is null.
                throw new NullPointerException("The members cannot be null.");
            }
            else if (members.isEmpty())
            {
                // No members to create the cluster from.
                return null;
            }

            // We need to compute the mean position and mean score for the
            // centroid example of our cluster.
            final Vector3 meanPosition = new Vector3();
            double scoreSum = 0.0;

            // Go through all the examples and add up the sum of positions and
            // scores.
            for (ExampleData example : members)
            {
                meanPosition.plusEquals(example.getPosition());
                scoreSum += example.getScore();
            }

            // Now convert the sums into means by dividing by the number of
            // members. The mean score is rounded since scores are integers.
            final int numMembers = members.size();
            meanPosition.scaleEquals(1.0 / (double) numMembers);
            final int meanScore =
                (int) Math.round(scoreSum / (double) numMembers);

            // Create the centroid object.
            final ExampleData centroid =
                new ExampleData(meanPosition, meanScore);

            // Return the centroid cluster.
            return new CentroidCluster<ExampleData>(centroid, members);
        }

    }

    /**
     * Generates the data we use for clustering. A set of cluster prototypes
     * is drawn at random and then 100 examples are sampled by picking a
     * random prototype and adding noise to its position and score.
     *
     * @param   numClusters The number of clusters; must be positive.
     * @param   random The random number generator.
     * @return The generated data.
     */
    public static LinkedList<ExampleData> generateData(
        final int numClusters,
        final Random random)
    {
        final LinkedList<ExampleData> result = new LinkedList<ExampleData>();

        // We're going to generate some clusters from which we will sample
        // the example data. Note that there is more noise in the score than
        // in the position.
        final int numToGenerate = 100;
        final UnivariateGaussian.CDF positionNoiseCDF =
            new UnivariateGaussian.CDF(0.0, 1.0);
        final UnivariateGaussian.CDF scoreNoiseCDF =
            new UnivariateGaussian.CDF(0.0, 5.0);

        // Create the clusters to sample from.
        final ExampleData[] clusters = new ExampleData[numClusters];
        for (int i = 0; i < numClusters; i++)
        {
            final Vector3 position = new Vector3(
                25.0 * random.nextDouble(),
                25.0 * random.nextDouble(),
                25.0 * random.nextDouble());
            final int score = random.nextInt(10);
            clusters[i] = new ExampleData(position, score);
        }

        // Generate the examples.
        for (int i = 0; i < numToGenerate; i++)
        {
            // Get the cluster information. The position is cloned so that
            // adding noise does not modify the cluster prototype itself.
            final ExampleData cluster = clusters[random.nextInt(numClusters)];
            final Vector3 position = cluster.getPosition().clone();
            int score = cluster.getScore();

            // Add the noise.
            position.plusEquals(new Vector3(
                positionNoiseCDF.sample(random),
                positionNoiseCDF.sample(random),
                positionNoiseCDF.sample(random)));
            score += (int) Math.round(scoreNoiseCDF.sample(random));

            // Create the example and add it to the result.
            final ExampleData example = new ExampleData(position, score);
            result.add(example);
        }

        return result;
    }

    /**
     * Prints the given clusters to standard out. Each cluster is printed as
     * a header line with its index and centroid, followed by one line per
     * member.
     *
     * @param   clusters The clusters to print.
     */
    public static void printClusters(
        final Collection<CentroidCluster<ExampleData>> clusters)
    {
        // Walk the collection once with a running index rather than looking
        // each cluster up by position, which would re-traverse a collection
        // that is not random-access on every lookup.
        int index = 0;
        for (CentroidCluster<ExampleData> cluster : clusters)
        {
            System.out.println("Cluster " + index + ":" + cluster.getCentroid());

            for (ExampleData example : cluster.getMembers())
            {
                System.out.println(" " + example);
            }

            index++;
        }
    }

}