package mil.nga.giat.geowave.analytic.mapreduce.dbscan;

import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.UUID;

import mil.nga.giat.geowave.adapter.vector.FeatureDataAdapter;
import mil.nga.giat.geowave.adapter.vector.FeatureWritable;
import mil.nga.giat.geowave.analytic.AdapterWithObjectWritable;
import mil.nga.giat.geowave.analytic.AnalyticFeature;
import mil.nga.giat.geowave.analytic.Projection;
import mil.nga.giat.geowave.analytic.ScopedJobConfiguration;
import mil.nga.giat.geowave.analytic.SimpleFeatureProjection;
import mil.nga.giat.geowave.analytic.clustering.ClusteringUtils;
import mil.nga.giat.geowave.analytic.distance.CoordinateCircleDistanceFn;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.ClusterNeighborList.ClusterNeighborListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.ClusterUnionList.ClusterUnionListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.PreProcessSingleItemClusterList.PreProcessSingleItemClusterListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.SingleItemClusterList.SingleItemClusterListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce.NNReducer;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce.PartitionDataWritable;
import mil.nga.giat.geowave.analytic.nn.NNProcessor;
import mil.nga.giat.geowave.analytic.nn.NNProcessor.CompleteNotifier;
import mil.nga.giat.geowave.analytic.nn.NeighborList;
import mil.nga.giat.geowave.analytic.nn.NeighborListFactory;
import mil.nga.giat.geowave.analytic.nn.TypeConverter;
import mil.nga.giat.geowave.analytic.param.ClusteringParameters;
import mil.nga.giat.geowave.analytic.param.GlobalParameters;
import mil.nga.giat.geowave.analytic.param.HullParameters;
import mil.nga.giat.geowave.analytic.partitioner.Partitioner.PartitionData;
import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.mapreduce.HadoopWritableSerializer;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputKey;

import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.geotools.feature.type.BasicFeatureTypes;
import org.opengis.feature.simple.SimpleFeature;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * The approach differs from the one commonly documented (e.g.
 * https://en.wikipedia.org/wiki/DBSCAN). This approach does not maintain a
 * queue of viable neighbors to navigate.
 *
 * Clusters are merged if they share neighbors in common and both clusters meet
 * the minimum size constraints.
 *
 * Clusters may be made up of points or geometries. When processing geometries,
 * the closest two points are included in the cluster, not the entire geometry.
 * The reason for this is that geometries may span large areas. This technique
 * has the disadvantage of misrepresenting dense segments as a dense set of
 * points.
 *
 * The design uses two-level partitioning, working within the confines of
 * {@link NNProcessor}. Performance gains and adherence to memory constraints
 * are accomplished through a pre-processing step.
 *
 * Pre-processing first finds dense clusters, replacing each dense cluster with
 * a concave polygon. Although not very scientific, the condensing process uses
 * a minimum condensed cluster size between 50 and 200, depending on the
 * setting of the minimum owners. The choice is somewhat arbitrary.
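 *
 * For example (illustrative arithmetic only, mirroring the default
 * calculateCondensingMinimum() and calculateTossMinimum() below): with a
 * minimum cluster size of 10, the condensing minimum is
 * min(max(10, 200), 10 * 10) = 100 and the toss minimum is 10 - 2 = 8.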
 *
 * Retaining individual points for clusters larger than 200 often creates
 * memory concerns. However, there is little value in condensing below 50, as
 * that indicates a fairly small cluster, which does not contribute to a
 * performance concern. Override 'calculateCondensingMinimum()' to provide a
 * different approach.
 *
 * Pre-processing also finds cluster centers that have fewer than the minimum
 * number of neighbors and tosses those centers. There is a caution here:
 * clusters of this type can fall on the 'edge' of dense clusters, thus
 * 'tightening' the dense regions. It does effectively remove outliers. Alter
 * the approach by overriding 'calculateTossMinimum()' (e.g. make it a smaller
 * number like 0 or 1).
 *
 */
public class DBScanMapReduce
{
	protected static final Logger LOGGER = LoggerFactory.getLogger(DBScanMapReduce.class);

	public abstract static class DBScanMapReducer<KEYOUT, VALUEOUT> extends
			NNReducer<ClusterItem, KEYOUT, VALUEOUT, Map<ByteArrayId, Cluster>>
	{
		protected int minOwners = 0;

		@Override
		protected Map<ByteArrayId, Cluster> createSummary() {
			return new HashMap<ByteArrayId, Cluster>();
		}

		@Override
		protected void processNeighbors(
				final PartitionData partitionData,
				final ByteArrayId primaryId,
				final ClusterItem primary,
				final NeighborList<ClusterItem> neighbors,
				final Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context,
				final Map<ByteArrayId, Cluster> index )
				throws IOException,
				InterruptedException {
			if (LOGGER.isTraceEnabled()) {
				LOGGER.trace(
						"Finish {} ",
						primaryId);
			}
			if (neighbors == null) {
				return;
			}
			final Cluster cluster = ((ClusterNeighborList) neighbors).getCluster();
			if (cluster == null) {
				return;
			}

			if (cluster.size() < minOwners) {
				LOGGER.trace(
						"Invalidate {} ",
						primaryId);
				cluster.invalidate();
				return;
			}
			cluster.finish();
		}

		@Override
		protected void setup(
				final Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context )
				throws IOException,
				InterruptedException {
			super.setup(context);
			final ScopedJobConfiguration config = new ScopedJobConfiguration(
					context.getConfiguration(),
					NNMapReduce.class);

			// first run must at least form a triangle
			minOwners = config.getInt(
					ClusteringParameters.Clustering.MINIMUM_SIZE,
					2);

			LOGGER.info(
					"Minimum owners = {}",
					minOwners);
		}
	}

	public static class SimpleFeatureToClusterItemConverter implements
			TypeConverter<ClusterItem>
	{
		final Projection<SimpleFeature> projection;

		public SimpleFeatureToClusterItemConverter(
				final Projection<SimpleFeature> projection ) {
			super();
			this.projection = projection;
		}

		@Override
		public ClusterItem convert(
				final ByteArrayId id,
				final Object o ) {
			final SimpleFeature feature = (SimpleFeature) o;
			final Long count = (Long) feature.getAttribute(AnalyticFeature.ClusterFeatureAttribute.COUNT.attrName());

			return new ClusterItem(
					feature.getID(),
					projection.getProjection(feature),
					count == null ? 1 : count,
					false);
		}
	}

	public static class DBScanMapHullReducer extends
			DBScanMapReducer<GeoWaveInputKey, ObjectWritable>
	{
		private String batchID;
		private int zoomLevel = 1;
		private int iteration = 1;
		private FeatureDataAdapter outputAdapter;

		private final ObjectWritable output = new ObjectWritable();
		private boolean firstIteration = true;

		protected int calculateCondensingMinimum() {
			return Math.min(
					Math.max(
							minOwners,
							200),
					minOwners * 10);
		}

		protected int calculateTossMinimum() {
			return (minOwners - 2);
		}

		/**
		 * Find the large clusters and condense them down. Find the points that
		 * cannot reach viable clusters and remove them.
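		 * <p>
		 * This step runs only on the first DBSCAN iteration and is skipped
		 * when the partition holds fewer than twice the condensing minimum,
		 * since compression would not pay for its cost.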
		 *
		 * @throws InterruptedException
		 * @throws IOException
		 */
		@Override
		protected void preprocess(
				final Reducer<PartitionDataWritable, AdapterWithObjectWritable, GeoWaveInputKey, ObjectWritable>.Context context,
				final NNProcessor<Object, ClusterItem> processor,
				final Map<ByteArrayId, Cluster> index )
				throws IOException,
				InterruptedException {

			if (!firstIteration) {
				return;
			}
			processor.trimSmallPartitions(calculateTossMinimum());

			// If the partition holds fewer than 2.0 times the minimum
			// compression size, compression is unlikely to improve
			// performance, so pre-processing buys little.
			if (processor.size() < (calculateCondensingMinimum() * 2.0)) {
				return;
			}

			processor.process(
					new ClusterNeighborListFactory(
							new PreProcessSingleItemClusterListFactory(
									index),
							index),
					new CompleteNotifier<ClusterItem>() {

						final int condenseSize = calculateCondensingMinimum();
						final int tossSize = calculateTossMinimum();

						@Override
						public void complete(
								final ByteArrayId id,
								final ClusterItem value,
								final NeighborList<ClusterItem> list ) {
							final Cluster cluster = ((ClusterNeighborList) list).getCluster();
							// this basically excludes points that cannot
							// contribute to extending the network.
							// may be a BAD idea.
							if (cluster.size() < tossSize) {
								processor.remove(id);
							}
							// this is a condensing component
							else if (cluster.size() > condenseSize) {
								cluster.finish();
								value.setGeometry(cluster.getGeometry());
								value.setCount(list.size());
								value.setCompressed();
								final Iterator<ByteArrayId> it = cluster.getLinkedClusters().iterator();
								while (it.hasNext()) {
									final ByteArrayId idToRemove = it.next();
									processor.remove(idToRemove);
									it.remove();
								}
							}
							else {
								cluster.clear();
							}
							context.progress();
						}
					});
			index.clear();
		}

		@Override
		protected void processSummary(
				final PartitionData partitionData,
				final Map<ByteArrayId, Cluster> summary,
				final Reducer<PartitionDataWritable, AdapterWithObjectWritable, GeoWaveInputKey, ObjectWritable>.Context context )
				throws IOException,
				InterruptedException {
			final HadoopWritableSerializer<SimpleFeature, FeatureWritable> serializer = outputAdapter
					.createWritableSerializer();
			final Set<Cluster> processed = new HashSet<Cluster>();
			final Iterator<Map.Entry<ByteArrayId, Cluster>> clusterIt = summary.entrySet().iterator();
			while (clusterIt.hasNext()) {
				final Cluster cluster = clusterIt.next().getValue();
				clusterIt.remove();
				if (cluster.isCompressed() && !processed.contains(cluster)) {
					processed.add(cluster);
					final SimpleFeature newPolygonFeature = AnalyticFeature.createGeometryFeature(
							outputAdapter.getFeatureType(),
							batchID,
							UUID.randomUUID().toString(),
							cluster.getId().getString(), // name
							partitionData.getGroupId() != null ? partitionData.getGroupId().toString() : cluster
									.getId()
									.getString(), // group
							0.0,
							cluster.getGeometry(),
							new String[0],
							new double[0],
							zoomLevel,
							iteration,
							cluster.size());
					output.set(serializer.toWritable(newPolygonFeature));
					if (LOGGER.isTraceEnabled()) {
						LOGGER.trace(
								"Generating {}",
								newPolygonFeature.toString());
					}
					// ShapefileTool.writeShape(
					// cluster.getId().getString() + iteration,
					// new File(
					// "./target/testdb_" + cluster.getId().getString() +
					// iteration),
					// new Geometry[] {
					// (Geometry) cluster.get()
					// });
					context.write(
							new GeoWaveInputKey(
									outputAdapter.getAdapterId(),
									new ByteArrayId(
											newPolygonFeature.getID())),
							output);
				}
			}
		}

		@Override
		public NeighborListFactory<ClusterItem> createNeighborsListFactory(
				final Map<ByteArrayId, Cluster> summary ) {
			return new ClusterNeighborListFactory(
					(firstIteration) ?
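							// first pass: each point seeds its own single-item cluster;
							// later passes union previously formed clusters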
							new SingleItemClusterListFactory(
									summary) :
							new ClusterUnionListFactory(
									summary),
					summary);
		}

		@SuppressWarnings("unchecked")
		@Override
		protected void setup(
				final Reducer<PartitionDataWritable, AdapterWithObjectWritable, GeoWaveInputKey, ObjectWritable>.Context context )
				throws IOException,
				InterruptedException {
			final ScopedJobConfiguration config = new ScopedJobConfiguration(
					context.getConfiguration(),
					NNMapReduce.class);
			super.setup(context);
			DBScanClusterList.getHullTool().setDistanceFnForCoordinate(
					new CoordinateCircleDistanceFn());
			DBScanClusterList.setMergeSize(minOwners);
			batchID = config.getString(
					GlobalParameters.Global.BATCH_ID,
					UUID.randomUUID().toString());
			zoomLevel = config.getInt(
					HullParameters.Hull.ZOOM_LEVEL,
					1);
			iteration = config.getInt(
					HullParameters.Hull.ITERATION,
					1);
			firstIteration = context.getConfiguration().getBoolean(
					"first.iteration",
					true);

			final String polygonDataTypeId = config.getString(
					HullParameters.Hull.DATA_TYPE_ID,
					"concave_hull");

			outputAdapter = AnalyticFeature.createGeometryFeatureAdapter(
					polygonDataTypeId,
					new String[0],
					config.getString(
							HullParameters.Hull.DATA_NAMESPACE_URI,
							BasicFeatureTypes.DEFAULT_NAMESPACE),
					ClusteringUtils.CLUSTERING_CRS);

			Projection<SimpleFeature> projectionFunction;
			try {
				projectionFunction = config.getInstance(
						HullParameters.Hull.PROJECTION_CLASS,
						Projection.class,
						SimpleFeatureProjection.class);
			}
			catch (InstantiationException | IllegalAccessException e) {
				throw new IOException(
						e);
			}

			super.typeConverter = new SimpleFeatureToClusterItemConverter(
					projectionFunction);
			distanceProfileFn = new ClusterItemDistanceFn();
			super.distanceFn = new ClusterItemDistanceFn();
		}
	}
}
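/*
 * A minimal sketch, not part of the GeoWave API: the package-private class
 * below mirrors the pre-processing threshold formulas of DBScanMapHullReducer
 * so their effect for a given minimum cluster size can be inspected in
 * isolation. The class and method names are hypothetical; the formulas are
 * copied from calculateCondensingMinimum() and calculateTossMinimum() above.
 */
class DBScanThresholdSketch
{
	// mirrors DBScanMapHullReducer.calculateCondensingMinimum(): clusters
	// larger than this are condensed to a single concave-hull item
	static int condensingMinimum(
			final int minOwners ) {
		return Math.min(
				Math.max(
						minOwners,
						200),
				minOwners * 10);
	}

	// mirrors DBScanMapHullReducer.calculateTossMinimum(): neighbor lists
	// smaller than this are tossed as outliers during pre-processing
	static int tossMinimum(
			final int minOwners ) {
		return minOwners - 2;
	}

	public static void main(
			final String[] args ) {
		for (final int minOwners : new int[] {
			2,
			5,
			10,
			20
		}) {
			System.out.println("minOwners=" + minOwners + ", condensingMinimum=" + condensingMinimum(minOwners)
					+ ", tossMinimum=" + tossMinimum(minOwners));
		}
	}
}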