/* * File: SimpleKMeansExample.java * Authors: Justin Basilico * Company: Sandia National Laboratories * Project: Cognitive Foundry * * Copyright April 28, 2010, Sandia Corporation. * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive * license for use of this work by or on behalf of the U.S. Government. Export * of this program may require a license from the United States Government. * See CopyrightHistory.txt for complete details. * */ package examples; import gov.sandia.cognition.learning.algorithm.clustering.KMeansClusterer; import gov.sandia.cognition.learning.algorithm.clustering.KMeansFactory; import gov.sandia.cognition.learning.algorithm.clustering.cluster.CentroidCluster; import gov.sandia.cognition.learning.algorithm.clustering.cluster.ClusterCreator; import gov.sandia.cognition.learning.algorithm.clustering.cluster.VectorMeanCentroidClusterCreator; import gov.sandia.cognition.learning.algorithm.clustering.divergence.CentroidClusterDivergenceFunction; import gov.sandia.cognition.learning.algorithm.clustering.divergence.ClusterDivergenceFunction; import gov.sandia.cognition.learning.algorithm.clustering.initializer.GreedyClusterInitializer; import gov.sandia.cognition.learning.function.distance.CosineDistanceMetric; import gov.sandia.cognition.learning.function.distance.EuclideanDistanceSquaredMetric; import gov.sandia.cognition.learning.function.distance.ManhattanDistanceMetric; import gov.sandia.cognition.math.Semimetric; import gov.sandia.cognition.math.matrix.Vector; import gov.sandia.cognition.math.matrix.VectorFactory; import gov.sandia.cognition.math.matrix.Vectorizable; import java.util.ArrayList; import java.util.Collection; import java.util.Random; /** * This example demonstrates how to use the k-means clustering from the * learning package in the Cognitive Foundry. It demonstrates several different * ways to initialize k-means, depending on how much customization of the * algorithm you wish to perform. * * @author Justin Basilico * @since 3.0 */ public class SimpleKMeansExample { /** * The main method where execution begins. * * @param arguments * The command-line arguments (ignored). */ public static void main( final String... arguments) { // We're going to need a random number generator both to create some // random data and because k-means does random initialization of the // clusters. final Random random = new Random(); // Lets make up some random 2-dimensional data with values in [-1, +1]. Collection<Vector> data = new ArrayList<Vector>(); VectorFactory<?> vectorFactory = VectorFactory.getDefault(); for (int i = 0; i < 100; i++) { data.add(vectorFactory.createUniformRandom(2, -1.0, +1.0, random)); } // Version 1: // Lets use the "easy" way to create a clusterer by just giving it // the number of clusters we want and use the default divergence metric // Euclidean distance. Note that the default k-means implementation uses // multi-threaded parallelism. int numRequestedClusters = 2; // The "k" in k-means. KMeansClusterer<Vector, CentroidCluster<Vector>> kMeans = KMeansFactory.create(numRequestedClusters, random); // Now run the clustering to create the clusters. Collection<CentroidCluster<Vector>> clusters = kMeans.learn(data); printClusters("Version 1: ", clusters); kMeans.setNumRequestedClusters(10); clusters = kMeans.learn(data); printClusters("Version 1 (k=10): ", clusters); // Version 2: // Now lets use a different metric, manhattan distance. kMeans = KMeansFactory.create(numRequestedClusters, ManhattanDistanceMetric.INSTANCE, random); // Now run the clustering to create the clusters. clusters = kMeans.learn(data); printClusters("Version 2: ", clusters); // Version 3: // Lets create a non-parallel k-means implementation directly instead // of using the factory class. It shows all of the different things you // can change on the clusterer, but also shows why we provided the // factory class for common use-cases. We will also use the cosine // distance metric instead. Semimetric<Vectorizable> metric = CosineDistanceMetric.INSTANCE; int maxIterations = 200; ClusterCreator<CentroidCluster<Vector>, Vector> creator = VectorMeanCentroidClusterCreator.INSTANCE; GreedyClusterInitializer<CentroidCluster<Vector>, Vector> initializer = new GreedyClusterInitializer<CentroidCluster<Vector>, Vector>( metric, creator, random); ClusterDivergenceFunction<CentroidCluster<Vector>, Vector> clusterDivergence = new CentroidClusterDivergenceFunction<Vector>(metric); kMeans = new KMeansClusterer<Vector, CentroidCluster<Vector>>( numRequestedClusters, maxIterations, initializer, clusterDivergence, creator); // Now run the clustering to create the clusters. clusters = kMeans.learn(data); printClusters("Version 3: ", clusters); // Version 4: // You can also change the parameters of the clustering after you // created it. This would work with all three versions. We're changing // the distance to the euclidean distance squared, which saves us // computing a square-root when doing the distance computation. We're // also changing the number of requested clusters (k) and the maximum // number of iterations to allow it to run for. kMeans.setNumRequestedClusters(10); kMeans.setMaxIterations(20); kMeans.setDivergenceFunction( new CentroidClusterDivergenceFunction<Vector>( EuclideanDistanceSquaredMetric.INSTANCE)); clusters = kMeans.learn(data); printClusters("Version 4: ", clusters); } /** * Prints out the cluster centers. * * @param title * The title to print. * @param clusters * The cluster centers. */ public static void printClusters( final String title, final Collection<CentroidCluster<Vector>> clusters) { System.out.print(title); System.out.println("There are " + clusters.size() + " clusters."); int index = 0; for (CentroidCluster<Vector> cluster : clusters) { System.out.println(" " + index + ": " + cluster.getCentroid()); index++; // Another useful method on a cluster is: cluster.getMembers() } } }