// CustomClusteringExample.java example

/*
 * File:                CustomClusteringExample.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright September 11, 2008, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive 
 * license for use of this work by or on behalf of the U.S. Government. Export 
 * of this program may require a license from the United States Government. 
 * See CopyrightHistory.txt for complete details.
 * 
 */

package examples;

import gov.sandia.cognition.collection.CollectionUtil;
import gov.sandia.cognition.learning.algorithm.clustering.AgglomerativeClusterer;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.CentroidCluster;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.ClusterCreator;
import gov.sandia.cognition.learning.algorithm.clustering.divergence.ClusterMeanLinkDivergenceFunction;
import gov.sandia.cognition.math.Metric;
import gov.sandia.cognition.math.matrix.mtj.Vector3;
import gov.sandia.cognition.statistics.distribution.UnivariateGaussian;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Random;

/**
 * This is an example of using a custom distance metric and cluster creator with
 * agglomerative clustering. 
 * 
 * @author  Justin Basilico
 * @since   2.1
 */
public class CustomClusteringExample
{
    /**
     * Runs the example from start to finish: generates sample data, sets up
     * agglomerative clustering with the custom metric and the custom cluster
     * creator, runs the clustering, and prints the resulting clusters.
     *
     * @param args Command-line arguments; they are not used.
     */
    public static void main(
        final String... args)
    {
        // Step 1: Get the data.
        // The data is generated here for the sake of the example. In a real
        // application it would usually already be loaded; all that matters
        // is that it ends up in a Collection.
        final int targetClusterCount = 4;
        final Random rng = new Random();
        final Collection<ExampleData> examples =
            generateData(targetClusterCount, rng);


        // Step 2: Create our metric.
        // This is the custom metric defined further down in this file.
        final Metric<ExampleData> distance = new CustomMetric();


        // Step 3: Choose the cluster creator.
        // The algorithm also needs a cluster creator. We supply our own.
        // The DefaultClusterCreator would also work with the divergence
        // function chosen below, but the custom creator lets us read the
        // centroid of each cluster once clustering has finished.
        final ClusterCreator<CentroidCluster<ExampleData>, ExampleData>
            creator = new CustomClusterCreator();


        // Step 4: Choose the type of agglomerative clustering.
        // The custom metric takes effect by handing it to the
        // cluster-to-cluster divergence function that the clusterer uses.
        // "Mean link" is used here; "complete link" or "single link" are
        // alternatives that could have been chosen instead.
        final ClusterMeanLinkDivergenceFunction<CentroidCluster<ExampleData>, ExampleData>
            meanLink =
                new ClusterMeanLinkDivergenceFunction<CentroidCluster<ExampleData>, ExampleData>(
                    distance);

        // Step 5: Create and initialize the clustering algorithm.
        // The clusterer is built from the cluster divergence and the cluster
        // creator that were just created.
        final AgglomerativeClusterer<ExampleData, CentroidCluster<ExampleData>>
            algorithm = new AgglomerativeClusterer<ExampleData, CentroidCluster<ExampleData>>(
                meanLink, creator);

        // Now we set the minimum number of clusters to create. Alternatively,
        // we could set the "maximum minimum distance", a threshold on the
        // largest allowed minimum distance between clusters, which stops the
        // clustering once the remaining clusters are too far apart.
        algorithm.setMinNumClusters(targetClusterCount);


        // Step 6: Run the clustering.
        // Clustering runs by passing the data to the algorithm's learn method.
        final Collection<CentroidCluster<ExampleData>> learned =
            algorithm.learn(examples);


        // Step 7: Use the clusters.
        // Here we simply print them.
        printClusters(learned);
    }
    

    /**
     * The example data holds a three-dimensional position value, plus an
     * integer score associated with that position.
     */
    public static class ExampleData
    {
        /** The three-dimensional position of the example. */
        private Vector3 position;

        /** The score of the example. */
        private int score;

        /**
         * Creates a new example data instance.
         * 
         * @param position The position. It is stored by reference, not copied.
         * @param score The score.
         */
        public ExampleData(
            final Vector3 position,
            final int score)
        {
            // Assign the fields directly instead of going through the public
            // setters: this class is not final, so a subclass could override
            // a setter and have it run before the subclass's own fields are
            // initialized.
            this.position = position;
            this.score = score;
        }

        /**
         * Gets the position.
         * 
         * @return The position.
         */
        public Vector3 getPosition()
        {
            return this.position;
        }

        /**
         * Sets the position.
         * 
         * @param position The position.
         */
        public void setPosition(
            final Vector3 position)
        {
            this.position = position;
        }

        /**
         * Gets the score.
         * 
         * @return The score.
         */
        public int getScore()
        {
            return this.score;
        }

        /**
         * Sets the score.
         * 
         * @param score The score.
         */
        public void setScore(
            final int score)
        {
            this.score = score;
        }

        /**
         * Formats the example as {@code "position = <position>, score = <score>"}.
         * 
         * @return The string form of this example.
         */
        @Override
        public String toString()
        {
            return "position = " + this.getPosition() + ", score = " 
                + this.getScore();
        }

    }

    /**
     * This implements our custom metric. The Metric interface specifies that
     * the value returned must fulfill the requirements of being a mathematical
     * metric, which are:
     * 
     *     g(x, y) + g(y, z) >= g(x, z)
     *               g(x, y) == g(y, x)
     *               g(x, x) == 0
     */
    public static class CustomMetric
        extends AbstractCloneableSerializable
        implements Metric<ExampleData>
    {

        public double evaluate(
            final ExampleData first,
            final ExampleData second)
        {
            // To compute the metric, we use the euclidean distance in position
            // and the difference in the score.
            
            // First get he euclidean distance between the position. We
            // already know that euclidean distance fulfills the contract for
            // being a Metric.
            final double positionDifference =
                first.getPosition().euclideanDistance(
                second.getPosition());

            // Now compute the difference in score. Note that we need to use
            // the absolute value here in order to fulfill the contract for
            // a Metric.
            final int scoreDifference =
                Math.abs(first.getScore() - second.getScore());

            // We weight the difference in position higher than a difference
            // in score for some extra customization.
            return 2.0 * positionDifference + 0.5 * scoreDifference;
        }

    }
    
    /**
     * A custom cluster creator. It implements the ClusterCreator interface
     * to build a CentroidCluster, which is a cluster that carries an object
     * representing its centroid. Here the centroid is the mean element of the
     * cluster, obtained by averaging the two fields of the data type: the
     * position and the score.
     */
    public static class CustomClusterCreator
        extends AbstractCloneableSerializable
        implements ClusterCreator<CentroidCluster<ExampleData>, ExampleData>
    {
        /**
         * Creates a centroid cluster whose centroid is the mean of the given
         * members. The mean score is rounded to the nearest integer.
         * 
         * @param members The members of the cluster to create.
         * @return The new cluster, or null if there are no members.
         * @throws NullPointerException If members is null.
         */
        @Override
        public CentroidCluster<ExampleData> createCluster(
            final Collection<? extends ExampleData> members)
        {
            // Guard: a null collection is an error.
            if (members == null)
            {
                throw new NullPointerException("The members cannot be null.");
            }

            // Guard: nothing to build a cluster from.
            if (members.isEmpty())
            {
                return null;
            }

            // Accumulate the sum of the positions and the sum of the scores
            // over all of the members.
            final Vector3 positionTotal = new Vector3();
            double scoreTotal = 0.0;
            for (ExampleData member : members)
            {
                positionTotal.plusEquals(member.getPosition());
                scoreTotal += member.getScore();
            }

            // Turn the sums into means by dividing by the member count. The
            // position vector is scaled in place, so from here on it holds
            // the mean position.
            final int memberCount = members.size();
            positionTotal.scaleEquals(1.0 / (double) memberCount);
            final double roundedMeanScore =
                Math.round(scoreTotal / (double) memberCount);

            // The centroid is an example holding the mean position and the
            // rounded mean score; wrap it up with the members as a cluster.
            return new CentroidCluster<ExampleData>(
                new ExampleData(positionTotal, (int) roundedMeanScore),
                members);
        }
    }


    /**
     * Generates the data we use for clustering. A set of cluster prototypes
     * is drawn at random, and 100 examples are then produced by picking a
     * prototype at random and adding noise to its position and score.
     * 
     * @param numClusters The number of clusters.
     * @param random The random number generator.
     * @return The generated data.
     */
    public static LinkedList<ExampleData> generateData(
        final int numClusters,
        final Random random)
    {
        // Noise distributions used to perturb the prototypes. The score
        // distribution is built with a larger second parameter (5.0 versus
        // 1.0), so there is more noise in the score than in the position.
        final UnivariateGaussian.CDF positionNoise =
            new UnivariateGaussian.CDF(0.0, 1.0);
        final UnivariateGaussian.CDF scoreNoise =
            new UnivariateGaussian.CDF(0.0, 5.0);
        final int exampleCount = 100;

        // Draw the prototypes: each coordinate comes from [0, 25) and each
        // score from 0 through 9.
        final ExampleData[] prototypes = new ExampleData[numClusters];
        for (int p = 0; p < prototypes.length; p++)
        {
            final double x = 25.0 * random.nextDouble();
            final double y = 25.0 * random.nextDouble();
            final double z = 25.0 * random.nextDouble();
            final int prototypeScore = random.nextInt(10);
            prototypes[p] =
                new ExampleData(new Vector3(x, y, z), prototypeScore);
        }

        // Sample the examples from the prototypes.
        final LinkedList<ExampleData> examples = new LinkedList<ExampleData>();
        for (int n = 0; n < exampleCount; n++)
        {
            // Pick the prototype this example is derived from.
            final ExampleData prototype =
                prototypes[random.nextInt(numClusters)];

            // Perturb a copy of the prototype's position, so the prototype
            // itself is left untouched.
            final Vector3 noisyPosition = prototype.getPosition().clone();
            final Vector3 offset = new Vector3(
                positionNoise.sample(random),
                positionNoise.sample(random),
                positionNoise.sample(random));
            noisyPosition.plusEquals(offset);

            // Perturb the score by a rounded noise sample.
            final int noisyScore = prototype.getScore()
                + (int) Math.round(scoreNoise.sample(random));

            examples.add(new ExampleData(noisyPosition, noisyScore));
        }

        return examples;
    }
    
    /**
     * Prints the given clusters to standard out. Each cluster is printed as
     * a header line with its index and centroid, followed by one indented
     * line per member.
     * 
     * @param clusters The clusters to print.
     */
    public static void printClusters(
        final Collection<CentroidCluster<ExampleData>> clusters)
    {
        // Walk the collection once with a running index instead of looking
        // each cluster up by position, which for a general Collection would
        // mean re-iterating from the start for every element.
        int index = 0;
        for (CentroidCluster<ExampleData> cluster : clusters)
        {
            System.out.println("Cluster " + index + ":" + cluster.getCentroid());
            
            for (ExampleData example : cluster.getMembers())
            {
                System.out.println("    " + example);
            }

            index++;
        }
    }

}