package mil.nga.giat.geowave.analytic.mapreduce.clustering.runner; import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.Set; import mil.nga.giat.geowave.analytic.PropertyManagement; import mil.nga.giat.geowave.analytic.extract.SimpleFeatureCentroidExtractor; import mil.nga.giat.geowave.analytic.extract.SimpleFeatureGeometryExtractor; import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobController; import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobRunner; import mil.nga.giat.geowave.analytic.mapreduce.SequenceFileInputFormatConfiguration; import mil.nga.giat.geowave.analytic.mapreduce.SequenceFileOutputFormatConfiguration; import mil.nga.giat.geowave.analytic.param.CentroidParameters; import mil.nga.giat.geowave.analytic.param.ClusteringParameters.Clustering; import mil.nga.giat.geowave.analytic.param.CommonParameters; import mil.nga.giat.geowave.analytic.param.ExtractParameters; import mil.nga.giat.geowave.analytic.param.GlobalParameters.Global; import mil.nga.giat.geowave.analytic.param.HullParameters; import mil.nga.giat.geowave.analytic.param.MapReduceParameters; import mil.nga.giat.geowave.analytic.param.ParameterEnum; import mil.nga.giat.geowave.core.geotime.ingest.SpatialDimensionalityTypeProvider; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.geotools.feature.type.BasicFeatureTypes; /** * Runs a clustering at multiple levels. Lower levels cluster within each * cluster of the higher level. * * Steps: * * @formatter: off * * (1) Extract and deduplicate items from geowave. * * (2) Cluster item within their assigned groups. Initially, items * are all part of the same group. * * (3) Assign to each point the cluster (group id). * * (4) Repeat steps 2 to 3 for each lower level. * * @formatter: on * */ public abstract class MultiLevelClusteringJobRunner extends MapReduceJobController implements MapReduceJobRunner { final GroupAssigmentJobRunner groupAssignmentRunner = new GroupAssigmentJobRunner(); final GeoWaveAnalyticExtractJobRunner jobExtractRunner = new GeoWaveAnalyticExtractJobRunner(); final ConvexHullJobRunner hullRunner = new ConvexHullJobRunner(); public MultiLevelClusteringJobRunner() { init( new MapReduceJobRunner[] {}, new PostOperationTask[] {}); } protected abstract ClusteringRunner getClusteringRunner(); @Override public Collection<ParameterEnum<?>> getParameters() { final Set<ParameterEnum<?>> params = new HashSet<ParameterEnum<?>>(); params.addAll(jobExtractRunner.getParameters()); params.addAll(hullRunner.getParameters()); params.addAll(getClusteringRunner().getParameters()); params.addAll(Arrays.asList(new ParameterEnum<?>[] { Clustering.ZOOM_LEVELS, Global.BATCH_ID })); params.addAll(MapReduceParameters.getParameters()); // the output data type is used for centroid management params.remove(CentroidParameters.Centroid.DATA_TYPE_ID); params.remove(CentroidParameters.Centroid.DATA_NAMESPACE_URI); return params; } @Override public int run( final Configuration configuration, final PropertyManagement propertyManagement ) throws Exception { return runJob( configuration, propertyManagement); } private int runJob( final Configuration config, final PropertyManagement propertyManagement ) throws Exception { final ClusteringRunner clusteringRunner = getClusteringRunner(); final Integer zoomLevels = propertyManagement.getPropertyAsInt( Clustering.ZOOM_LEVELS, 1); jobExtractRunner.setConf(config); final String dataTypeId = propertyManagement.getPropertyAsString( ExtractParameters.Extract.OUTPUT_DATA_TYPE_ID, "centroid"); final String namespaceURI = propertyManagement.getPropertyAsString( ExtractParameters.Extract.DATA_NAMESPACE_URI, BasicFeatureTypes.DEFAULT_NAMESPACE); propertyManagement.storeIfEmpty( ExtractParameters.Extract.DATA_NAMESPACE_URI, namespaceURI); propertyManagement.storeIfEmpty( ExtractParameters.Extract.OUTPUT_DATA_TYPE_ID, dataTypeId); propertyManagement.storeIfEmpty( CentroidParameters.Centroid.EXTRACTOR_CLASS, SimpleFeatureCentroidExtractor.class); propertyManagement.storeIfEmpty( CommonParameters.Common.DIMENSION_EXTRACT_CLASS, SimpleFeatureGeometryExtractor.class); propertyManagement.store( CentroidParameters.Centroid.DATA_TYPE_ID, dataTypeId); propertyManagement.store( CentroidParameters.Centroid.DATA_NAMESPACE_URI, namespaceURI); // TODO: set out index type for extracts? propertyManagement.storeIfEmpty( CentroidParameters.Centroid.INDEX_ID, new SpatialDimensionalityTypeProvider().createPrimaryIndex().getId().getString()); propertyManagement.storeIfEmpty( HullParameters.Hull.INDEX_ID, new SpatialDimensionalityTypeProvider().createPrimaryIndex().getId().getString()); // first. extract data int status = jobExtractRunner.run( config, propertyManagement); final Path extractPath = jobExtractRunner.getHdfsOutputPath(); groupAssignmentRunner.setInputFormatConfiguration(new SequenceFileInputFormatConfiguration( extractPath)); clusteringRunner.setInputFormatConfiguration(new SequenceFileInputFormatConfiguration( extractPath)); hullRunner.setInputFormatConfiguration(new SequenceFileInputFormatConfiguration( extractPath)); final boolean retainGroupAssigments = propertyManagement.getPropertyAsBoolean( Clustering.RETAIN_GROUP_ASSIGNMENTS, false); // run clustering for each level final String outputBaseDir = propertyManagement.getPropertyAsString( MapReduceParameters.MRConfig.HDFS_BASE_DIR, "/tmp"); FileSystem fs = null; try { fs = FileSystem.get(config); for (int i = 0; (status == 0) && (i < zoomLevels); i++) { final int zoomLevel = i + 1; clusteringRunner.setZoomLevel(zoomLevel); hullRunner.setZoomLevel(zoomLevel); // need to get this removed at some point. propertyManagement.store( CentroidParameters.Centroid.ZOOM_LEVEL, zoomLevel); status = clusteringRunner.run( config, propertyManagement); if (status == 0) { final Path nextPath = new Path( outputBaseDir + "/" + "level_" + zoomLevel); if (fs.exists(nextPath)) { fs.delete( nextPath, true); } groupAssignmentRunner.setOutputFormatConfiguration(new SequenceFileOutputFormatConfiguration( nextPath)); groupAssignmentRunner.setZoomLevel(zoomLevel); status = retainGroupAssigments ? groupAssignmentRunner.run( config, propertyManagement) : 0; if (status == 0) { status = hullRunner.run( config, propertyManagement); } if (retainGroupAssigments) { clusteringRunner.setInputFormatConfiguration(new SequenceFileInputFormatConfiguration( nextPath)); hullRunner.setInputFormatConfiguration(new SequenceFileInputFormatConfiguration( nextPath)); groupAssignmentRunner.setInputFormatConfiguration(new SequenceFileInputFormatConfiguration( nextPath)); } } } return status; } finally { if (fs != null) fs.close(); } } }