package mil.nga.giat.geowave.analytic.mapreduce.kmeans.runner; import java.util.Collection; import java.util.HashSet; import java.util.Set; import java.util.UUID; import mil.nga.giat.geowave.analytic.PropertyManagement; import mil.nga.giat.geowave.analytic.SimpleFeatureItemWrapperFactory; import mil.nga.giat.geowave.analytic.clustering.ClusteringUtils; import mil.nga.giat.geowave.analytic.distance.FeatureCentroidDistanceFn; import mil.nga.giat.geowave.analytic.extract.SimpleFeatureCentroidExtractor; import mil.nga.giat.geowave.analytic.extract.SimpleFeatureGeometryExtractor; import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobController; import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobRunner; import mil.nga.giat.geowave.analytic.mapreduce.clustering.runner.ClusteringRunner; import mil.nga.giat.geowave.analytic.param.CentroidParameters; import mil.nga.giat.geowave.analytic.param.CommonParameters; import mil.nga.giat.geowave.analytic.param.FormatConfiguration; import mil.nga.giat.geowave.analytic.param.GlobalParameters; import mil.nga.giat.geowave.analytic.param.ParameterEnum; import mil.nga.giat.geowave.analytic.param.SampleParameters; import org.apache.hadoop.conf.Configuration; import org.opengis.feature.simple.SimpleFeature; /** * The KMeans Parallel algorithm,labeled Algorithm 2 within in section 3.3 of * * Bahmani, Kumar, Moseley, Vassilvitskii and Vattani. Scalable K-means++. VLDB * Endowment Vol. 5, No. 7. 2012. * * @formatter:off Couple things to note: * * (1) Updating the cost of each sampled point occurs as the * first step within sampling loop; the initial sample is * performed outside the loop. * * (2) A final update cost occurs outside the sampling loop just * prior to stripping off the top 'K' centers. * * @formatter:on * */ public class KMeansParallelJobRunner extends MapReduceJobController implements ClusteringRunner { final SampleMultipleSetsJobRunner<SimpleFeature> sampleSetsRunner = new SampleMultipleSetsJobRunner<SimpleFeature>(); final StripWeakCentroidsRunner<SimpleFeature> stripWeakCentroidsRunner = new StripWeakCentroidsRunner<SimpleFeature>(); final KMeansIterationsJobRunner<SimpleFeature> kmeansJobRunner = new KMeansIterationsJobRunner<SimpleFeature>(); private int currentZoomLevel = 1; public KMeansParallelJobRunner() { // defaults setZoomLevel(1); // sts of child runners init( new MapReduceJobRunner[] { sampleSetsRunner, stripWeakCentroidsRunner, // run this one more time with // 'smaller' size kmeansJobRunner }, new PostOperationTask[] { DoNothingTask, DoNothingTask, new PostOperationTask() { @Override public void runTask( final Configuration config, final MapReduceJobRunner runner ) { kmeansJobRunner.setReducerCount(stripWeakCentroidsRunner.getCurrentCentroidCount()); } }, DoNothingTask }); } @Override public void setZoomLevel( final int zoomLevel ) { currentZoomLevel = zoomLevel; sampleSetsRunner.setZoomLevel(zoomLevel); } @Override public void setInputFormatConfiguration( final FormatConfiguration inputFormatConfiguration ) { sampleSetsRunner.setInputFormatConfiguration(inputFormatConfiguration); kmeansJobRunner.setInputFormatConfiguration(inputFormatConfiguration); } @Override public int run( final Configuration configuration, final PropertyManagement propertyManagement ) throws Exception { return runJob( configuration, propertyManagement); } private int runJob( final Configuration config, final PropertyManagement propertyManagement ) throws Exception { propertyManagement.store( CentroidParameters.Centroid.ZOOM_LEVEL, currentZoomLevel); propertyManagement.storeIfEmpty( GlobalParameters.Global.BATCH_ID, UUID.randomUUID().toString()); propertyManagement.storeIfEmpty( CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS, SimpleFeatureItemWrapperFactory.class); propertyManagement.storeIfEmpty( CommonParameters.Common.DISTANCE_FUNCTION_CLASS, FeatureCentroidDistanceFn.class); propertyManagement.storeIfEmpty( CentroidParameters.Centroid.EXTRACTOR_CLASS, SimpleFeatureCentroidExtractor.class); propertyManagement.storeIfEmpty( CommonParameters.Common.DIMENSION_EXTRACT_CLASS, SimpleFeatureGeometryExtractor.class); stripWeakCentroidsRunner.setRange( propertyManagement.getPropertyAsInt( SampleParameters.Sample.MIN_SAMPLE_SIZE, 2), propertyManagement.getPropertyAsInt( SampleParameters.Sample.MAX_SAMPLE_SIZE, 1000)); ClusteringUtils.createAdapter(propertyManagement); ClusteringUtils.createIndex(propertyManagement); return super.run( config, propertyManagement); } @Override public Collection<ParameterEnum<?>> getParameters() { final Set<ParameterEnum<?>> params = new HashSet<ParameterEnum<?>>(); params.addAll(kmeansJobRunner.getParameters()); params.addAll(sampleSetsRunner.getParameters()); // while override params.remove(CentroidParameters.Centroid.ZOOM_LEVEL); return params; } }