package mil.nga.giat.geowave.analytic.mapreduce.kmeans.runner; import java.util.UUID; import mil.nga.giat.geowave.analytic.PropertyManagement; import mil.nga.giat.geowave.analytic.clustering.NestedGroupCentroidAssignment; import mil.nga.giat.geowave.analytic.mapreduce.GeoWaveAnalyticJobRunner; import mil.nga.giat.geowave.analytic.mapreduce.GeoWaveOutputFormatConfiguration; import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobRunner; import mil.nga.giat.geowave.analytic.mapreduce.kmeans.KSamplerMapReduce; import mil.nga.giat.geowave.analytic.param.CentroidParameters; import mil.nga.giat.geowave.analytic.param.GlobalParameters; import mil.nga.giat.geowave.analytic.param.ParameterEnum; import mil.nga.giat.geowave.analytic.param.SampleParameters; import mil.nga.giat.geowave.analytic.sample.function.RandomSamplingRankFunction; import mil.nga.giat.geowave.analytic.sample.function.SamplingRankFunction; import mil.nga.giat.geowave.core.geotime.ingest.SpatialTemporalDimensionalityTypeProvider; import mil.nga.giat.geowave.core.index.ByteArrayId; import mil.nga.giat.geowave.core.store.adapter.AdapterStore; import mil.nga.giat.geowave.core.store.adapter.DataAdapter; import mil.nga.giat.geowave.core.store.index.IndexStore; import mil.nga.giat.geowave.core.store.index.PrimaryIndex; import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputKey; import mil.nga.giat.geowave.mapreduce.output.GeoWaveOutputKey; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.ObjectWritable; import org.apache.hadoop.mapreduce.Job; /** * * Samples 'K' number of data items by evaluating a {@link SamplingRankFunction} * * For KMeans Parallel, the initial step requires seeding the centroids with a * single point. In this case, K=1 and the rank function is random. This means * the top selected geometry is random. In addition, each subsequent iteration * samples based on probability function and K is some provided sample size. * * */ public class KSamplerJobRunner extends GeoWaveAnalyticJobRunner implements MapReduceJobRunner { protected int zoomLevel = 1; private Class<? extends SamplingRankFunction> samplingRankFunctionClass = RandomSamplingRankFunction.class; public KSamplerJobRunner() { super.setOutputFormatConfiguration(new GeoWaveOutputFormatConfiguration()); } public void setSamplingRankFunctionClass( final Class<? extends SamplingRankFunction> samplingRankFunctionClass ) { this.samplingRankFunctionClass = samplingRankFunctionClass; } public void setZoomLevel( final int zoomLevel ) { this.zoomLevel = zoomLevel; } @Override public Class<?> getScope() { return KSamplerMapReduce.class; } @Override public void configure( final Job job ) throws Exception { job.setMapperClass(KSamplerMapReduce.SampleMap.class); job.setMapOutputKeyClass(GeoWaveInputKey.class); job.setMapOutputValueClass(ObjectWritable.class); job.setReducerClass(KSamplerMapReduce.SampleReducer.class); job.setPartitionerClass(KSamplerMapReduce.SampleKeyPartitioner.class); job.setReduceSpeculativeExecution(false); job.setOutputKeyClass(GeoWaveOutputKey.class); job.setOutputValueClass(Object.class); } private DataAdapter<?> getAdapter( final PropertyManagement runTimeProperties ) throws Exception { final AdapterStore adapterStore = super.getAdapterStore(runTimeProperties); return adapterStore.getAdapter(new ByteArrayId( runTimeProperties.getPropertyAsString( SampleParameters.Sample.DATA_TYPE_ID, "sample"))); } private PrimaryIndex getIndex( final PropertyManagement runTimeProperties ) throws Exception { final IndexStore indexStore = super.getIndexStore(runTimeProperties); return (PrimaryIndex) indexStore.getIndex(new ByteArrayId( runTimeProperties.getPropertyAsString( SampleParameters.Sample.INDEX_ID, "index"))); } @Override public int run( final Configuration config, final PropertyManagement runTimeProperties ) throws Exception { runTimeProperties.storeIfEmpty( GlobalParameters.Global.BATCH_ID, UUID.randomUUID().toString()); runTimeProperties.storeIfEmpty( SampleParameters.Sample.DATA_TYPE_ID, "sample"); runTimeProperties.store( CentroidParameters.Centroid.ZOOM_LEVEL, zoomLevel); runTimeProperties.storeIfEmpty( SampleParameters.Sample.INDEX_ID, new SpatialTemporalDimensionalityTypeProvider().createPrimaryIndex().getId()); runTimeProperties.setConfig( new ParameterEnum[] { GlobalParameters.Global.BATCH_ID, SampleParameters.Sample.INDEX_ID, SampleParameters.Sample.SAMPLE_SIZE, SampleParameters.Sample.DATA_TYPE_ID, CentroidParameters.Centroid.EXTRACTOR_CLASS, CentroidParameters.Centroid.WRAPPER_FACTORY_CLASS, CentroidParameters.Centroid.ZOOM_LEVEL }, config, getScope()); ((ParameterEnum<Class<?>>) SampleParameters.Sample.SAMPLE_RANK_FUNCTION).getHelper().setValue( config, getScope(), samplingRankFunctionClass); NestedGroupCentroidAssignment.setParameters( config, getScope(), runTimeProperties); addDataAdapter( config, getAdapter(runTimeProperties)); addIndex( config, getIndex(runTimeProperties)); super.setReducerCount(zoomLevel); return super.run( config, runTimeProperties); } @Override protected String getJobName() { return "K-Sampler"; } }