package mil.nga.giat.geowave.analytic.mapreduce.kmeans.runner; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Set; import java.util.UUID; import org.apache.hadoop.conf.Configuration; import org.opengis.feature.simple.SimpleFeature; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import mil.nga.giat.geowave.analytic.AnalyticItemWrapperFactory; import mil.nga.giat.geowave.analytic.PropertyManagement; import mil.nga.giat.geowave.analytic.SimpleFeatureItemWrapperFactory; import mil.nga.giat.geowave.analytic.clustering.ClusteringUtils; import mil.nga.giat.geowave.analytic.clustering.DistortionGroupManagement; import mil.nga.giat.geowave.analytic.distance.FeatureCentroidDistanceFn; import mil.nga.giat.geowave.analytic.extract.SimpleFeatureCentroidExtractor; import mil.nga.giat.geowave.analytic.extract.SimpleFeatureGeometryExtractor; import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobController; import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobRunner; import mil.nga.giat.geowave.analytic.mapreduce.clustering.runner.ClusteringRunner; import mil.nga.giat.geowave.analytic.param.CentroidParameters; import mil.nga.giat.geowave.analytic.param.ClusteringParameters; import mil.nga.giat.geowave.analytic.param.CommonParameters; import mil.nga.giat.geowave.analytic.param.FormatConfiguration; import mil.nga.giat.geowave.analytic.param.GlobalParameters; import mil.nga.giat.geowave.analytic.param.JumpParameters; import mil.nga.giat.geowave.analytic.param.MapReduceParameters; import mil.nga.giat.geowave.analytic.param.ParameterEnum; import mil.nga.giat.geowave.analytic.param.SampleParameters; import mil.nga.giat.geowave.analytic.param.StoreParameters; import mil.nga.giat.geowave.analytic.param.StoreParameters.StoreParam; import mil.nga.giat.geowave.analytic.store.PersistableStore; import mil.nga.giat.geowave.core.index.sfc.data.NumericRange; import mil.nga.giat.geowave.core.store.operations.remote.options.DataStorePluginOptions; /** * The KMeans Jump algorithm * * Catherine A. Sugar and Gareth M. James (2003). * "Finding the number of clusters in a data set: An information theoretic approach" * Journal of the American Statistical Association 98 (January): 750–763 * * @formatter:off Couple things to note: * * * @formatter:on * */ public class KMeansJumpJobRunner extends MapReduceJobController implements ClusteringRunner { final static Logger LOGGER = LoggerFactory.getLogger(KMeansJumpJobRunner.class); final KMeansDistortionJobRunner jumpRunner = new KMeansDistortionJobRunner(); final KMeansParallelJobRunnerDelegate kmeansRunner = new KMeansParallelJobRunnerDelegate(); private int currentZoomLevel = 1; public KMeansJumpJobRunner() { // defaults setZoomLevel(1); // child runners init( new MapReduceJobRunner[] { kmeansRunner, jumpRunner, }, new PostOperationTask[] { DoNothingTask, DoNothingTask }); } @Override public void setZoomLevel( final int zoomLevel ) { currentZoomLevel = zoomLevel; kmeansRunner.setZoomLevel(zoomLevel); } @Override public void setInputFormatConfiguration( final FormatConfiguration inputFormatConfiguration ) { jumpRunner.setInputFormatConfiguration(inputFormatConfiguration); kmeansRunner.setInputFormatConfiguration(inputFormatConfiguration); } @Override @SuppressWarnings("unchecked") public int run( final Configuration configuration, final PropertyManagement propertyManagement ) throws Exception { propertyManagement.store( CentroidParameters.Centroid.ZOOM_LEVEL, currentZoomLevel); propertyManagement.storeIfEmpty( GlobalParameters.Global.BATCH_ID, UUID.randomUUID().toString()); propertyManagement.storeIfEmpty( CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS, SimpleFeatureItemWrapperFactory.class); propertyManagement.storeIfEmpty( CommonParameters.Common.DISTANCE_FUNCTION_CLASS, FeatureCentroidDistanceFn.class); propertyManagement.storeIfEmpty( CentroidParameters.Centroid.EXTRACTOR_CLASS, SimpleFeatureCentroidExtractor.class); propertyManagement.storeIfEmpty( CommonParameters.Common.DIMENSION_EXTRACT_CLASS, SimpleFeatureGeometryExtractor.class); propertyManagement.copy( CentroidParameters.Centroid.DATA_TYPE_ID, SampleParameters.Sample.DATA_TYPE_ID); propertyManagement.copy( CentroidParameters.Centroid.INDEX_ID, SampleParameters.Sample.INDEX_ID); ClusteringUtils.createAdapter(propertyManagement); ClusteringUtils.createIndex(propertyManagement); final String currentBatchId = propertyManagement.getPropertyAsString( GlobalParameters.Global.BATCH_ID, UUID.randomUUID().toString()); try { final NumericRange rangeOfIterations = propertyManagement.getPropertyAsRange( JumpParameters.Jump.RANGE_OF_CENTROIDS, new NumericRange( 2, 200)); propertyManagement.store( GlobalParameters.Global.PARENT_BATCH_ID, currentBatchId); final DataStorePluginOptions dataStoreOptions = ((PersistableStore) propertyManagement .getProperty(StoreParam.INPUT_STORE)).getDataStoreOptions(); final DistortionGroupManagement distortionGroupManagement = new DistortionGroupManagement( dataStoreOptions.createDataStore(), dataStoreOptions.createIndexStore(), dataStoreOptions.createAdapterStore()); for (int k = (int) Math.max( 2, Math.round(rangeOfIterations.getMin())); k < Math.round(rangeOfIterations.getMax()); k++) { // regardless of the algorithm, the sample set is fixed in size propertyManagement.store( SampleParameters.Sample.MIN_SAMPLE_SIZE, k); propertyManagement.store( SampleParameters.Sample.MAX_SAMPLE_SIZE, k); propertyManagement.store( SampleParameters.Sample.SAMPLE_SIZE, k); jumpRunner.setCentroidsCount(k); jumpRunner.setDataStoreOptions(dataStoreOptions); final String iterationBatchId = currentBatchId + "_" + k; propertyManagement.store( GlobalParameters.Global.BATCH_ID, iterationBatchId); jumpRunner.setReducerCount(k); final int status = super.run( configuration, propertyManagement); if (status != 0) { return status; } } propertyManagement.store( GlobalParameters.Global.BATCH_ID, currentBatchId); @SuppressWarnings("rawtypes") final Class<AnalyticItemWrapperFactory> analyticItemWrapperFC = propertyManagement.getPropertyAsClass( CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS, AnalyticItemWrapperFactory.class); /** * Associate the batch id with the best set of groups so the caller * can find the clusters for the given batch */ final int result = distortionGroupManagement.retainBestGroups( (AnalyticItemWrapperFactory<SimpleFeature>) analyticItemWrapperFC.newInstance(), propertyManagement.getPropertyAsString(CentroidParameters.Centroid.DATA_TYPE_ID), propertyManagement.getPropertyAsString(CentroidParameters.Centroid.INDEX_ID), currentBatchId, currentZoomLevel); return result; } catch (final Exception ex) { LOGGER.error( "Cannot create distortions", ex); return 1; } } @Override public Collection<ParameterEnum<?>> getParameters() { final Set<ParameterEnum<?>> params = new HashSet<ParameterEnum<?>>(); params.addAll(kmeansRunner.singleSamplekmeansJobRunner.getParameters()); params.addAll(kmeansRunner.parallelJobRunner.getParameters()); params.addAll(Arrays.asList(new ParameterEnum<?>[] { JumpParameters.Jump.RANGE_OF_CENTROIDS, JumpParameters.Jump.KPLUSPLUS_MIN, ClusteringParameters.Clustering.MAX_REDUCER_COUNT, CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS, CentroidParameters.Centroid.INDEX_ID, CentroidParameters.Centroid.DATA_TYPE_ID, CentroidParameters.Centroid.DATA_NAMESPACE_URI, CentroidParameters.Centroid.EXTRACTOR_CLASS, CommonParameters.Common.DISTANCE_FUNCTION_CLASS, CommonParameters.Common.DIMENSION_EXTRACT_CLASS, StoreParameters.StoreParam.INPUT_STORE, GlobalParameters.Global.BATCH_ID })); params.addAll(MapReduceParameters.getParameters()); params.remove(CentroidParameters.Centroid.ZOOM_LEVEL); params.remove(SampleParameters.Sample.DATA_TYPE_ID); params.remove(SampleParameters.Sample.INDEX_ID); return params; } private static class KMeansParallelJobRunnerDelegate implements MapReduceJobRunner { final KMeansSingleSampleJobRunner<SimpleFeature> singleSamplekmeansJobRunner = new KMeansSingleSampleJobRunner<SimpleFeature>(); final KMeansParallelJobRunner parallelJobRunner = new KMeansParallelJobRunner(); @Override public int run( final Configuration config, final PropertyManagement runTimeProperties ) throws Exception { final int k = runTimeProperties.getPropertyAsInt( SampleParameters.Sample.SAMPLE_SIZE, 1); final int minkplusplus = runTimeProperties.getPropertyAsInt( JumpParameters.Jump.KPLUSPLUS_MIN, 3); if (k >= minkplusplus) { return parallelJobRunner.run( config, runTimeProperties); } else { return singleSamplekmeansJobRunner.run( config, runTimeProperties); } } public void setZoomLevel( final int zoomLevel ) { parallelJobRunner.setZoomLevel(zoomLevel); singleSamplekmeansJobRunner.setZoomLevel(zoomLevel); } public void setInputFormatConfiguration( final FormatConfiguration inputFormatConfiguration ) { parallelJobRunner.setInputFormatConfiguration(inputFormatConfiguration); singleSamplekmeansJobRunner.setInputFormatConfiguration(inputFormatConfiguration); } } }