ParallelizedKMeansClusterer.java example

Explorer
Foundry-master
- Components
/*
 * File:                ParallelizedKMeansClusterer.java
 * Authors:             Kevin R. Dixon
 * Company:             Sandia National Laboratories
 * Project:             Cognitive Foundry
 * 
 * Copyright Oct 6, 2008, Sandia Corporation.
 * Under the terms of Contract DE-AC04-94AL85000, there is a non-exclusive
 * license for use of this work by or on behalf of the U.S. Government. 
 * Export of this program may require a license from the United States
 * Government. See CopyrightHistory.txt for complete details.
 * 
 */

package gov.sandia.cognition.learning.algorithm.clustering;

import gov.sandia.cognition.algorithm.ParallelAlgorithm;
import gov.sandia.cognition.algorithm.ParallelUtil;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.Cluster;
import gov.sandia.cognition.learning.algorithm.clustering.cluster.ClusterCreator;
import gov.sandia.cognition.learning.algorithm.clustering.divergence.ClusterDivergenceFunction;
import gov.sandia.cognition.learning.algorithm.clustering.initializer.FixedClusterInitializer;
import gov.sandia.cognition.learning.data.SequentialDataMultiPartitioner;
import java.util.ArrayList;
import java.util.Collection;
import java.util.concurrent.Callable;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * This is a parallel implementation of the k-means clustering algorithm.  The
 * default is to use n-1 available cores/hyperthreads on a machine and spread
 * the data-point-to-cluster assignment (E-step) and the cluster re-estimation
 * (M-step) across these computational units.  The output of this algorithm is
 * exact, and should return the same results as the serial version of k-means
 * for an identical dataset and random seed.
 *
 * @param   <DataType> The type of the data to cluster. This is typically 
 *          defined by the divergence function used.
 * @param   <ClusterType> The type of {@code Cluster} created by the algorithm.
 *          This is typically defined by the cluster creator function used.
 * @author Kevin R. Dixon
 * @since 3.0
 */
@PublicationReference(
    author="Halil Bisgin",
    title="Parallel Clustering Algorithms with Application to Climatology",
    type=PublicationType.Thesis,
    year=2007,
    url="http://www.halilbisgin.com/thesis/thesis.pdf"
)
public class ParallelizedKMeansClusterer<DataType, ClusterType extends Cluster<DataType>>
    extends KMeansClusterer<DataType,ClusterType>
    implements ParallelAlgorithm
{

    /**
     * Parallel tasks that assign the data points to clusters
     */
    private ArrayList<Callable<int[]>> assignmentTasks;
    
    /**
     * Parallel tasks that creates clusters from the assigned data points
     */
    private ArrayList<Callable<ClusterType>> clusterCreatorTask;
    
    /**
     * Thread pool used to parallelize the computation
     */
    private transient ThreadPoolExecutor threadPool;    
    
    /**
     * ArrayList of assignments from the subtasks
     */
    private Collection<int[]> assignmentList;    
    
    /**
     * Array of new assignments
     */
    private int[] newAssignments;
    
    /**
     * Default constructor
     */
    public ParallelizedKMeansClusterer()
    {
        this( DEFAULT_NUM_REQUESTED_CLUSTERS, DEFAULT_MAX_ITERATIONS,
            null, null, null, null );
    }
    
    /** 
     * Creates a new instance of ParallelizedKMeansClusterer2 
     * @param numRequestedClusters The number of clusters requested (k).
     * @param maxIterations Maximum number of iterations before stopping
     * @param threadPool Thread pool to use for parallelization
     * @param initializer The initializer for the clusters.
     * @param divergenceFunction The divergence function.
     * @param creator The cluster creator.
     */
    public ParallelizedKMeansClusterer(
        int numRequestedClusters,
        int maxIterations,
        ThreadPoolExecutor threadPool,
        FixedClusterInitializer<ClusterType, DataType> initializer,
        ClusterDivergenceFunction<? super ClusterType, ? super DataType> divergenceFunction,
        ClusterCreator<ClusterType, DataType> creator )
    {
        super( numRequestedClusters, maxIterations, initializer, divergenceFunction, creator );
        this.setThreadPool( threadPool );
    }

    @Override
    public ParallelizedKMeansClusterer<DataType, ClusterType> clone()
    {
        return (ParallelizedKMeansClusterer<DataType, ClusterType>) super.clone();
    }
    
    public ThreadPoolExecutor getThreadPool()
    {
        if( this.threadPool == null )
        {
            this.setThreadPool( ParallelUtil.createThreadPool() );
        }
        
        return this.threadPool;
    }

    public void setThreadPool(
        final ThreadPoolExecutor threadPool )
    {
        this.threadPool = threadPool;
    }
    
    public int getNumThreads()
    {
        return ParallelUtil.getNumThreads( this );
    }
    
    /**
     * Creates the assignment tasks given the number of threads requested
     */
    protected void createAssignmentTasks()
    {
        
        int numThreads = this.getNumThreads();
        
        // if the number of requested components is less than 1, then
        ArrayList<ArrayList<DataType>> partitions =
            SequentialDataMultiPartitioner.create( this.getData(), numThreads );
        
        this.assignmentTasks = new ArrayList<Callable<int[]>>( numThreads );
        for( int i = 0; i < numThreads; i++ )
        {
            this.assignmentTasks.add( new AssignDataToCluster( partitions.get( i ) ) );
        }
        
        // Create the tasks that will assign data point onto clusters to
        // re-estimate the cluster locations
        int numClusters = this.getNumClusters();
        this.clusterCreatorTask = new ArrayList<Callable<ClusterType>>( numClusters );
        for( int i = 0; i < numClusters; i++ )
        {
            this.clusterCreatorTask.add( new CreateClustersFromAssignments() );
        }
        
    }

    @Override
    protected boolean initializeAlgorithm()
    {
        boolean superRetval = super.initializeAlgorithm();
        this.createAssignmentTasks();
        this.newAssignments = new int[ this.data.size() ];
        return superRetval;
    }

    @Override
    protected int[] assignDataToClusters(
        Collection<? extends DataType> data )
    {
        
        try
        {
            // Execute the assignments in parallel.  The k-means algorithm
            // typically spends the vast majority of its time executing
            // this loop.
            this.assignmentList = ParallelUtil.executeInParallel(
                this.assignmentTasks, this.getThreadPool() );
        }
        catch (Exception ex)
        {
            Logger.getLogger( ParallelizedKMeansClusterer.class.getName() ).log( Level.SEVERE, null, ex );
        }
        
        // Put the assignment array back together.
        // We're just going to re-use the "newAssignment" member as the super
        // method does a element-wise copy, not a pointer reassignment to
        // minimize memory thrashing.
        int index = 0;
        for( int[] subAssignment : this.assignmentList )
        {
            for( int i = 0; i < subAssignment.length; i++ )
            {
                int assignment = subAssignment[i];
                this.newAssignments[index] = assignment;
                index++;
            }
        }
        
        return this.newAssignments;
        
    }

    @Override
    @SuppressWarnings("unchecked")
    protected void createClustersFromAssignments()
    {
        // Loop through the clusters and initialize their membership lists
        // based on who is in them.
        // The k-means algoirhtm typically spends much less time in this loop
        // than the assignment step, but that also depends on the type of
        // cluster being estimated.
        int numClusters = this.getNumClusters();
        ArrayList<ArrayList<DataType>> clustersMembers = this.assignDataFromIndices();
        for( int i = 0; i < numClusters; i++ )
        {
            ((CreateClustersFromAssignments) this.clusterCreatorTask.get(i)).data = 
                clustersMembers.get( i );
        }

        Collection<ClusterType> results = null;
        try
        {
            // Execute the assignments in parallel
            results = ParallelUtil.executeInParallel(
                this.clusterCreatorTask, this.getThreadPool() );
        }
        catch (Exception ex)
        {
            Logger.getLogger( ParallelizedKMeansClusterer.class.getName() ).log( Level.SEVERE, null, ex );
        }
        
        int index = 0;
        for( ClusterType cluster : results )
        {
            this.getClusters().set( index, cluster );
            index++;
        }
        
    }
    
    
    /**
     * Callable task for that creates clusters from assigned data
     */
    protected class CreateClustersFromAssignments
        implements Callable<ClusterType>
    {
     
        /**
         * Data set to use for the task
         */
        public ArrayList<DataType> data;

        /**
         * Creates a new instance of CreateClustersFromAssignments
         */
        public CreateClustersFromAssignments()
        {
            this.data = null;
        }
        
        public ClusterType call()
        {
            return ParallelizedKMeansClusterer.this.getCreator().createCluster( this.data );
        }
        
    }
    
    /**
     * Callable task for the evaluate() method.
     */
    protected class AssignDataToCluster
        implements Callable<int[]>
    {
        
        /**
         * local data
         */
        private Collection<DataType> localData;
        
        /**
         * Creates a new instance of AssignDataToCluster
         * @param localData
         * Local data
         */
        public AssignDataToCluster(
            Collection<DataType> localData )
        {
            this.localData = localData;
        }

        public int[] call()
        {
            return ParallelizedKMeansClusterer.super.assignDataToClusters( 
                this.localData );
        }
        
    }    
    
}