SimpleKMeansExample.java example

Explorer
Foundry-master
- Components
/*
 * File:                SimpleKMeansExample.java
 * Authors:             Justin Basilico
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright April 28, 2010, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive 
 * license for use of this work by or on behalf of the U.S. Government. Export 
 * of this program may require a license from the United States Government. 
 * See CopyrightHistory.txt for complete details.
 * 
 */

package examples;

import gov.sandia.cognition.learning.algorithm.clustering.KMeansClusterer;
import gov.sandia.cognition.learning.algorithm.clustering.KMeansFactory;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.CentroidCluster;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.ClusterCreator;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.VectorMeanCentroidClusterCreator;
import gov.sandia.cognition.learning.algorithm.clustering.divergence.CentroidClusterDivergenceFunction;
import gov.sandia.cognition.learning.algorithm.clustering.divergence.ClusterDivergenceFunction;
import gov.sandia.cognition.learning.algorithm.clustering.initializer.GreedyClusterInitializer;
import gov.sandia.cognition.learning.function.distance.CosineDistanceMetric;
import gov.sandia.cognition.learning.function.distance.EuclideanDistanceSquaredMetric;
import gov.sandia.cognition.learning.function.distance.ManhattanDistanceMetric;
import gov.sandia.cognition.math.Semimetric;
import gov.sandia.cognition.math.matrix.Vector;
import gov.sandia.cognition.math.matrix.VectorFactory;
import gov.sandia.cognition.math.matrix.Vectorizable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Random;

/**
 * A walk-through of the k-means clustering support in the learning package
 * of the Cognitive Foundry. The same random data set is clustered four times,
 * each time configuring the clusterer in a different way, from the simplest
 * factory call up to wiring every collaborator by hand.
 * 
 * @author  Justin Basilico
 * @since   3.0
 */
public class SimpleKMeansExample
{

    /**
     * Program entry point: generates the sample data, then runs and prints
     * each of the four clustering variants in turn.
     *
     * @param   arguments
     *      The command-line arguments (ignored).
     */
    public static void main(
        final String... arguments)
    {
        // One generator is shared by the data creation and by every
        // clusterer, since k-means initializes its clusters randomly.
        final Random random = new Random();
        final Collection<Vector> points = createRandomData(random);

        // Version 1:
        // The "easy" route: hand the factory only the number of clusters
        // (the "k" in k-means) and accept the default divergence metric,
        // Euclidean distance. Note that the default k-means implementation
        // uses multi-threaded parallelism.
        final int k = 2;
        KMeansClusterer<Vector, CentroidCluster<Vector>> clusterer =
            KMeansFactory.create(k, random);
        clusterAndPrint("Version 1: ", clusterer, points);

        // The same clusterer can be reused with a different k.
        clusterer.setNumRequestedClusters(10);
        clusterAndPrint("Version 1 (k=10): ", clusterer, points);

        // Version 2:
        // Still the factory, but with a different metric: Manhattan
        // distance.
        clusterer = KMeansFactory.create(
            k, ManhattanDistanceMetric.INSTANCE, random);
        clusterAndPrint("Version 2: ", clusterer, points);

        // Version 3:
        // Build a non-parallel k-means directly instead of going through
        // the factory. This exposes every piece that can be swapped on the
        // clusterer, and also shows why the factory exists for the common
        // cases. The cosine distance metric is used this time.
        final Semimetric<Vectorizable> cosine = CosineDistanceMetric.INSTANCE;
        final ClusterCreator<CentroidCluster<Vector>, Vector> clusterCreator =
            VectorMeanCentroidClusterCreator.INSTANCE;
        final GreedyClusterInitializer<CentroidCluster<Vector>, Vector>
            greedyInitializer =
                new GreedyClusterInitializer<CentroidCluster<Vector>, Vector>(
                    cosine, clusterCreator, random);
        final ClusterDivergenceFunction<CentroidCluster<Vector>, Vector>
            cosineDivergence =
                new CentroidClusterDivergenceFunction<Vector>(cosine);
        final int iterationLimit = 200;
        clusterer = new KMeansClusterer<Vector, CentroidCluster<Vector>>(
            k, iterationLimit,
            greedyInitializer, cosineDivergence, clusterCreator);
        clusterAndPrint("Version 3: ", clusterer, points);

        // Version 4:
        // Parameters can also be changed after construction; this would
        // work with all three versions above. Here the number of requested
        // clusters (k) and the maximum number of iterations are changed, and
        // the distance is switched to the euclidean distance squared, which
        // saves computing a square-root in the distance computation.
        clusterer.setNumRequestedClusters(10);
        clusterer.setMaxIterations(20);
        final CentroidClusterDivergenceFunction<Vector> squaredEuclidean =
            new CentroidClusterDivergenceFunction<Vector>(
                EuclideanDistanceSquaredMetric.INSTANCE);
        clusterer.setDivergenceFunction(squaredEuclidean);
        clusterAndPrint("Version 4: ", clusterer, points);
    }

    /**
     * Builds the sample data set: 100 random 2-dimensional vectors with
     * values in [-1, +1], created through the default vector factory.
     *
     * @param   random
     *      The random number generator to draw the values from.
     * @return
     *      A new collection holding the generated vectors.
     */
    private static Collection<Vector> createRandomData(
        final Random random)
    {
        final Collection<Vector> points = new ArrayList<Vector>();
        final VectorFactory<?> factory = VectorFactory.getDefault();
        for (int created = 0; created < 100; created++)
        {
            final Vector point =
                factory.createUniformRandom(2, -1.0, +1.0, random);
            points.add(point);
        }
        return points;
    }

    /**
     * Runs the given clusterer on the data and prints the resulting
     * clusters under the given title.
     *
     * @param   title
     *      The title to print.
     * @param   clusterer
     *      The configured k-means clusterer to run.
     * @param   points
     *      The data to cluster.
     */
    private static void clusterAndPrint(
        final String title,
        final KMeansClusterer<Vector, CentroidCluster<Vector>> clusterer,
        final Collection<Vector> points)
    {
        final Collection<CentroidCluster<Vector>> learned =
            clusterer.learn(points);
        printClusters(title, learned);
    }

    /**
     * Writes the title, the number of clusters, and then one indexed line
     * per cluster showing its centroid to standard output.
     *
     * @param   title
     *      The title to print.
     * @param   clusters
     *      The clusters whose centers are printed.
     */
    public static void printClusters(
        final String title,
        final Collection<CentroidCluster<Vector>> clusters)
    {
        System.out.print(title);
        final int count = clusters.size();
        System.out.println("There are " + count + " clusters.");

        int position = 0;
        for (final CentroidCluster<Vector> current : clusters)
        {
            // Another useful method on a cluster is: current.getMembers()
            final String line =
                "    " + position + ": " + current.getCentroid();
            System.out.println(line);
            position += 1;
        }
    }

}