package mil.nga.giat.geowave.analytic.mapreduce.dbscan;

import java.util.Arrays;
import java.util.Collection;

import mil.nga.giat.geowave.analytic.AdapterWithObjectWritable;
import mil.nga.giat.geowave.analytic.AnalyticFeature;
import mil.nga.giat.geowave.analytic.Projection;
import mil.nga.giat.geowave.analytic.PropertyManagement;
import mil.nga.giat.geowave.analytic.SimpleFeatureProjection;
import mil.nga.giat.geowave.analytic.clustering.ClusteringUtils;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNJobRunner;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce.PartitionDataWritable;
import mil.nga.giat.geowave.analytic.param.ClusteringParameters;
import mil.nga.giat.geowave.analytic.param.ClusteringParameters.Clustering;
import mil.nga.giat.geowave.analytic.param.GlobalParameters;
import mil.nga.giat.geowave.analytic.param.GlobalParameters.Global;
import mil.nga.giat.geowave.analytic.param.HullParameters;
import mil.nga.giat.geowave.analytic.param.HullParameters.Hull;
import mil.nga.giat.geowave.analytic.param.ParameterEnum;
import mil.nga.giat.geowave.analytic.param.PartitionParameters.Partition;
import mil.nga.giat.geowave.mapreduce.JobContextAdapterStore;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputKey;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.Job;
import org.geotools.feature.type.BasicFeatureTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Runs a single DBSCAN job, producing micro-clusters over a set of neighbors.
 */
public class DBScanJobRunner extends NNJobRunner {

    private static final Logger LOGGER = LoggerFactory.getLogger(DBScanJobRunner.class);

    // Map-output compression codecs in ascending order of preference; the
    // highest-ranked codec that is usable on this cluster wins.
    private static final String[] CodecsRank = new String[] {
        "BZip2",
        // "Gzip",
        "Lz4",
        "Snappy",
        "Lzo",
    };

    private boolean firstIteration = true;
    private long memInMB = 4096;

    @Override
    public void configure(
            final Job job )
            throws Exception {
        super.configure(job);
        job.setMapperClass(NNMapReduce.NNMapper.class);
        job.setReducerClass(DBScanMapReduce.DBScanMapHullReducer.class);
        job.setMapOutputKeyClass(PartitionDataWritable.class);
        job.setMapOutputValueClass(AdapterWithObjectWritable.class);
        job.setOutputKeyClass(GeoWaveInputKey.class);
        job.setOutputValueClass(ObjectWritable.class);
        job.setSpeculativeExecution(false);

        final Configuration conf = job.getConfiguration();
        conf.set(
                "mapreduce.map.java.opts",
                "-Xmx" + memInMB + "m");
        conf.set(
                "mapreduce.reduce.java.opts",
                "-Xmx" + memInMB + "m");
        // roughly 33 minutes; DBSCAN reducers can run long
        conf.setLong(
                "mapred.task.timeout",
                2000000);
        conf.setInt(
                "mapreduce.task.io.sort.mb",
                250);
        conf.setBoolean(
                "mapreduce.reduce.speculative",
                false);

        // Select the best available codec for map-output compression,
        // preferring codecs that appear later in CodecsRank. Codecs that
        // cannot be instantiated or configured in this context (e.g., native
        // libraries not installed) are skipped.
        Class<? extends CompressionCodec> bestCodecClass = org.apache.hadoop.io.compress.DefaultCodec.class;
        int rank = 0;
        for (final Class<? extends CompressionCodec> codecClass : CompressionCodecFactory.getCodecClasses(conf)) {
            int r = 1;
            for (final String codecs : CodecsRank) {
                if (codecClass.getName().contains(
                        codecs)) {
                    break;
                }
                r++;
            }
            if ((rank < r) && (r <= CodecsRank.length)) {
                try {
                    final CompressionCodec codec = codecClass.newInstance();
                    if (Configurable.class.isAssignableFrom(codecClass)) {
                        ((Configurable) codec).setConf(conf);
                    }
                    // throws an exception if not configurable in this context
                    CodecPool.getCompressor(codec);
                    bestCodecClass = codecClass;
                    rank = r;
                }
                catch (final Throwable ex) {
                    // occurs when the codec is not installed
                    LOGGER.warn(
                            "Not configurable in this context",
                            ex);
                }
            }
        }
        LOGGER.warn("Compression with " + bestCodecClass.toString());
        conf.setClass(
                "mapreduce.map.output.compress.codec",
                bestCodecClass,
                CompressionCodec.class);
        conf.setBoolean(
                "mapreduce.map.output.compress",
                true);
        conf.setBooleanIfUnset(
                "first.iteration",
                firstIteration);
    }

    public void setMemoryInMB(
            final long memInMB ) {
        this.memInMB = memInMB;
    }

    protected void setFirstIteration(
            final boolean firstIteration ) {
        this.firstIteration = firstIteration;
    }
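    /**
     * Registers a geometry feature adapter for the hull output (the data type
     * ID defaults to "concave_hull"), instantiates and sets up the configured
     * projection function (default {@link SimpleFeatureProjection}), copies
     * the hull, clustering, and partition parameters into the Hadoop
     * configuration, and then delegates to the parent NN job runner.
     */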
    @Override
    public int run(
            final Configuration config,
            final PropertyManagement runTimeProperties )
            throws Exception {
        runTimeProperties.storeIfEmpty(
                HullParameters.Hull.DATA_TYPE_ID,
                "concave_hull");
        final String adapterID = runTimeProperties.getPropertyAsString(
                HullParameters.Hull.DATA_TYPE_ID,
                "concave_hull");
        final String namespaceURI = runTimeProperties.storeIfEmpty(
                HullParameters.Hull.DATA_NAMESPACE_URI,
                BasicFeatureTypes.DEFAULT_NAMESPACE).toString();
        JobContextAdapterStore.addDataAdapter(
                config,
                AnalyticFeature.createGeometryFeatureAdapter(
                        adapterID,
                        new String[0],
                        namespaceURI,
                        ClusteringUtils.CLUSTERING_CRS));

        final Projection<?> projectionFunction = runTimeProperties.getClassInstance(
                HullParameters.Hull.PROJECTION_CLASS,
                Projection.class,
                SimpleFeatureProjection.class);
        projectionFunction.setup(
                runTimeProperties,
                getScope(),
                config);

        runTimeProperties.setConfig(
                new ParameterEnum[] {
                    HullParameters.Hull.PROJECTION_CLASS,
                    GlobalParameters.Global.BATCH_ID,
                    HullParameters.Hull.ZOOM_LEVEL,
                    HullParameters.Hull.ITERATION,
                    HullParameters.Hull.DATA_TYPE_ID,
                    HullParameters.Hull.DATA_NAMESPACE_URI,
                    ClusteringParameters.Clustering.MINIMUM_SIZE,
                    Partition.GEOMETRIC_DISTANCE_UNIT,
                    Partition.DISTANCE_THRESHOLDS,
                    Partition.MAX_MEMBER_SELECTION
                },
                config,
                getScope());

        return super.run(
                config,
                runTimeProperties);
    }

    @Override
    public Collection<ParameterEnum<?>> getParameters() {
        final Collection<ParameterEnum<?>> params = super.getParameters();
        params.addAll(Arrays.asList(new ParameterEnum<?>[] {
            Partition.PARTITIONER_CLASS,
            Partition.MAX_DISTANCE,
            Partition.MAX_MEMBER_SELECTION,
            Global.BATCH_ID,
            Hull.DATA_TYPE_ID,
            Hull.PROJECTION_CLASS,
            Clustering.MINIMUM_SIZE,
            Partition.GEOMETRIC_DISTANCE_UNIT,
            Partition.DISTANCE_THRESHOLDS
        }));
        return params;
    }
}
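/*
 * Minimal usage sketch (hypothetical wiring; within GeoWave this runner is
 * normally driven iteratively, e.g. by DBScanIterationsJobRunner, which
 * populates the run-time properties from the analytic command line):
 *
 *   final DBScanJobRunner runner = new DBScanJobRunner();
 *   runner.setMemoryInMB(8192);
 *   final PropertyManagement properties = new PropertyManagement();
 *   // store Partition.MAX_DISTANCE, Clustering.MINIMUM_SIZE, Global.BATCH_ID,
 *   // and the input/output connection parameters, then:
 *   final int status = runner.run(new Configuration(), properties);
 */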