package mil.nga.giat.geowave.analytic.mapreduce.dbscan;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import mil.nga.giat.geowave.adapter.vector.FeatureDataAdapter;
import mil.nga.giat.geowave.adapter.vector.FeatureWritable;
import mil.nga.giat.geowave.analytic.AdapterWithObjectWritable;
import mil.nga.giat.geowave.analytic.AnalyticFeature;
import mil.nga.giat.geowave.analytic.Projection;
import mil.nga.giat.geowave.analytic.ScopedJobConfiguration;
import mil.nga.giat.geowave.analytic.SimpleFeatureProjection;
import mil.nga.giat.geowave.analytic.clustering.ClusteringUtils;
import mil.nga.giat.geowave.analytic.distance.CoordinateCircleDistanceFn;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.ClusterNeighborList.ClusterNeighborListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.ClusterUnionList.ClusterUnionListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.PreProcessSingleItemClusterList.PreProcessSingleItemClusterListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.dbscan.SingleItemClusterList.SingleItemClusterListFactory;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce.NNReducer;
import mil.nga.giat.geowave.analytic.mapreduce.nn.NNMapReduce.PartitionDataWritable;
import mil.nga.giat.geowave.analytic.nn.NNProcessor;
import mil.nga.giat.geowave.analytic.nn.NNProcessor.CompleteNotifier;
import mil.nga.giat.geowave.analytic.nn.NeighborList;
import mil.nga.giat.geowave.analytic.nn.NeighborListFactory;
import mil.nga.giat.geowave.analytic.nn.TypeConverter;
import mil.nga.giat.geowave.analytic.param.ClusteringParameters;
import mil.nga.giat.geowave.analytic.param.GlobalParameters;
import mil.nga.giat.geowave.analytic.param.HullParameters;
import mil.nga.giat.geowave.analytic.partitioner.Partitioner.PartitionData;
import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.mapreduce.HadoopWritableSerializer;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputKey;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapreduce.Reducer;
import org.geotools.feature.type.BasicFeatureTypes;
import org.opengis.feature.simple.SimpleFeature;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This implementation differs from the commonly documented approach (e.g.
* https://en.wikipedia.org/wiki/DBSCAN) in that it does not maintain a
* queue of viable neighbors to navigate.
*
* Clusters are merged if they share neighbors in common and both clusters
* meet the minimum size constraints.
*
* Clusters may be made up of points or geometries. When processing
* geometries, the closest two points are included in the cluster, not the
* entire geometry, since geometries may span large areas. A disadvantage of
* this technique is that it can misrepresent dense segments as a dense set
* of points.
*
* The design uses two-level partitioning, working within the confines of
* {@link NNProcessor}. Performance gains and memory constraints are
* addressed through a pre-processing step.
*
* Pre-processing first finds dense clusters, replacing each dense cluster
* with a concave polygon. Although not very scientific, the condensing step
* uses a minimum condensed cluster size between 50 and 200, depending on
* the minimum-owners setting. The choice is somewhat arbitrary: retaining
* individual points for clusters larger than 200 often creates memory
* concerns, while there is little value in condensing below 50, since that
* indicates a fairly small cluster that does not raise a performance
* concern. Override {@code calculateCondensingMinimum()} to supply a
* different approach.
*
* Pre-processing also finds cluster centers that have fewer than the
* minimum number of neighbors and tosses those centers. A caution here:
* clusters of this type can fall on the 'edge' of dense clusters, thus
* 'tightening' the dense regions. It does, however, effectively remove
* outliers. Alter the approach by overriding {@code calculateTossMinimum()}
* (e.g. make it a smaller number such as 0 or 1).
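*
* A minimal sketch of a subclass that tunes both thresholds (the class name
* and the values here are hypothetical; pick values suited to the data and
* the available memory):
*
* <pre>{@code
* public static class TunedHullReducer extends
*     DBScanMapReduce.DBScanMapHullReducer
* {
*     @Override
*     protected int calculateCondensingMinimum() {
*         // condense sooner than the default of min(200, minOwners * 10)
*         return Math.max(minOwners, 100);
*     }
*
*     @Override
*     protected int calculateTossMinimum() {
*         // keep every center rather than tossing sparse ones
*         return 0;
*     }
* }
* }</pre>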
*/
public class DBScanMapReduce
{
protected static final Logger LOGGER = LoggerFactory.getLogger(DBScanMapReduce.class);
public abstract static class DBScanMapReducer<KEYOUT, VALUEOUT> extends
NNReducer<ClusterItem, KEYOUT, VALUEOUT, Map<ByteArrayId, Cluster>>
{
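// minimum number of members required for a cluster to be retained;
// set from ClusteringParameters.Clustering.MINIMUM_SIZE in setup()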
protected int minOwners = 0;
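/**
* The summary maps each item's identifier to its assigned cluster; it is
* shared with the neighbor list factories created for each partition.
*/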
@Override
protected Map<ByteArrayId, Cluster> createSummary() {
return new HashMap<ByteArrayId, Cluster>();
}
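/**
* Invoked once all neighbors of the primary item have been visited:
* invalidate the primary's cluster if it is smaller than the minimum
* size, otherwise mark it finished.
*/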
@Override
protected void processNeighbors(
final PartitionData partitionData,
final ByteArrayId primaryId,
final ClusterItem primary,
final NeighborList<ClusterItem> neighbors,
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context,
final Map<ByteArrayId, Cluster> index )
throws IOException,
InterruptedException {
if (LOGGER.isTraceEnabled()) {
LOGGER.trace(
"Finish {} ",
primaryId);
}
if (neighbors == null) {
return;
}
final Cluster cluster = ((ClusterNeighborList) neighbors).getCluster();
if (cluster == null) {
return;
}
if (cluster.size() < minOwners) {
LOGGER.trace(
"Invalidate {} ",
primaryId);
cluster.invalidate();
return;
}
cluster.finish();
}
@Override
protected void setup(
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context )
throws IOException,
InterruptedException {
super.setup(context);
final ScopedJobConfiguration config = new ScopedJobConfiguration(
context.getConfiguration(),
NNMapReduce.class);
// the first run must at least form a triangle (a primary item plus two
// neighbors)
minOwners = config.getInt(
ClusteringParameters.Clustering.MINIMUM_SIZE,
2);
LOGGER.info(
"Minumum owners = {}",
minOwners);
}
}
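/**
* Converts a SimpleFeature into a {@link ClusterItem}, applying the
* configured projection and carrying over the feature's COUNT attribute
* when present (defaulting to a count of one).
*/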
public static class SimpleFeatureToClusterItemConverter implements
TypeConverter<ClusterItem>
{
final Projection<SimpleFeature> projection;
public SimpleFeatureToClusterItemConverter(
final Projection<SimpleFeature> projection ) {
super();
this.projection = projection;
}
@Override
public ClusterItem convert(
final ByteArrayId id,
final Object o ) {
final SimpleFeature feature = (SimpleFeature) o;
final Long count = (Long) feature.getAttribute(AnalyticFeature.ClusterFeatureAttribute.COUNT.attrName());
return new ClusterItem(
feature.getID(),
projection.getProjection(feature),
count == null ? 1 : count,
false);
}
}
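/**
* Runs DBSCAN over each partition and emits a concave hull feature for
* every cluster that satisfies the minimum size.
*/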
public static class DBScanMapHullReducer extends
DBScanMapReducer<GeoWaveInputKey, ObjectWritable>
{
private String batchID;
private int zoomLevel = 1;
private int iteration = 1;
private FeatureDataAdapter outputAdapter;
private final ObjectWritable output = new ObjectWritable();
private boolean firstIteration = true;
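// The threshold above which a cluster is condensed into its hull during
// pre-processing: effectively the lesser of 200 and ten times the
// minimum cluster size (for any minimum below 200).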
protected int calculateCondensingMinimum() {
return Math.min(
Math.max(
minOwners,
200),
minOwners * 10);
}
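// Clusters smaller than this are tossed during pre-processing; two less
// than the minimum cluster size by default.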
protected int calculateTossMinimum() {
return (minOwners - 2);
}
/**
* Find the large clusters and condense them down. Find the points that
* cannot reach viable clusters and remove them.
*
* @throws InterruptedException
* @throws IOException
*/
@Override
protected void preprocess(
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, GeoWaveInputKey, ObjectWritable>.Context context,
final NNProcessor<Object, ClusterItem> processor,
final Map<ByteArrayId, Cluster> index )
throws IOException,
InterruptedException {
if (!firstIteration) {
return;
}
processor.trimSmallPartitions(calculateTossMinimum());
// If the partition holds fewer than 2.0 times the minimum condensing
// size, compression is unlikely to improve performance, so skip
// pre-processing.
if (processor.size() < (calculateCondensingMinimum() * 2.0)) {
return;
}
processor.process(
new ClusterNeighborListFactory(
new PreProcessSingleItemClusterListFactory(
index),
index),
new CompleteNotifier<ClusterItem>() {
final int condenseSize = calculateCondensingMinimum();
final int tossSize = calculateTossMinimum();
@Override
public void complete(
final ByteArrayId id,
final ClusterItem value,
final NeighborList<ClusterItem> list ) {
final Cluster cluster = ((ClusterNeighborList) list).getCluster();
// Exclude points that cannot contribute to extending the network;
// this may be a BAD idea.
if (cluster.size() < tossSize) {
processor.remove(id);
}
// condense a large cluster: replace its members with the hull geometry
else if (cluster.size() > condenseSize) {
cluster.finish();
value.setGeometry(cluster.getGeometry());
value.setCount(list.size());
value.setCompressed();
final Iterator<ByteArrayId> it = cluster.getLinkedClusters().iterator();
while (it.hasNext()) {
final ByteArrayId idToRemove = it.next();
processor.remove(idToRemove);
it.remove();
}
}
else {
cluster.clear();
}
context.progress();
}
});
index.clear();
}
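/**
* Emit one feature per compressed cluster remaining in the summary,
* writing the cluster's hull geometry under the output (hull) adapter.
*/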
@Override
protected void processSummary(
final PartitionData partitionData,
final Map<ByteArrayId, Cluster> summary,
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, GeoWaveInputKey, ObjectWritable>.Context context )
throws IOException,
InterruptedException {
final HadoopWritableSerializer<SimpleFeature, FeatureWritable> serializer = outputAdapter
.createWritableSerializer();
final Set<Cluster> processed = new HashSet<Cluster>();
final Iterator<Map.Entry<ByteArrayId, Cluster>> clusterIt = summary.entrySet().iterator();
while (clusterIt.hasNext()) {
final Cluster cluster = clusterIt.next().getValue();
clusterIt.remove();
if (cluster.isCompressed() && !processed.contains(cluster)) {
processed.add(cluster);
final SimpleFeature newPolygonFeature = AnalyticFeature.createGeometryFeature(
outputAdapter.getFeatureType(),
batchID,
UUID.randomUUID().toString(),
cluster.getId().getString(), // name
partitionData.getGroupId() != null ? partitionData.getGroupId().toString() : cluster
.getId()
.getString(), // group
0.0,
cluster.getGeometry(),
new String[0],
new double[0],
zoomLevel,
iteration,
cluster.size());
output.set(serializer.toWritable(newPolygonFeature));
if (LOGGER.isTraceEnabled()) {
LOGGER.trace(
"Generating {}",
newPolygonFeature.toString());
}
// ShapefileTool.writeShape(
// cluster.getId().getString() + iteration,
// new File(
// "./target/testdb_" + cluster.getId().getString() +
// iteration),
// new Geometry[] {
// (Geometry) cluster.get()
// });
context.write(
new GeoWaveInputKey(
outputAdapter.getAdapterId(),
new ByteArrayId(
newPolygonFeature.getID())),
output);
}
}
}
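/**
* On the first iteration, items start as single-item clusters; on later
* iterations, previously formed clusters are unioned as they are found
* to be neighbors.
*/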
@Override
public NeighborListFactory<ClusterItem> createNeighborsListFactory(
final Map<ByteArrayId, Cluster> summary ) {
return new ClusterNeighborListFactory(
(firstIteration) ? new SingleItemClusterListFactory(
summary) : new ClusterUnionListFactory(
summary),
summary);
}
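// Configure the hull distance and merge settings, then read the batch
// id, zoom level, iteration, hull parameters, and projection class from
// the scoped job configuration.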
@SuppressWarnings("unchecked")
@Override
protected void setup(
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, GeoWaveInputKey, ObjectWritable>.Context context )
throws IOException,
InterruptedException {
final ScopedJobConfiguration config = new ScopedJobConfiguration(
context.getConfiguration(),
NNMapReduce.class);
super.setup(context);
DBScanClusterList.getHullTool().setDistanceFnForCoordinate(
new CoordinateCircleDistanceFn());
DBScanClusterList.setMergeSize(minOwners);
batchID = config.getString(
GlobalParameters.Global.BATCH_ID,
UUID.randomUUID().toString());
zoomLevel = config.getInt(
HullParameters.Hull.ZOOM_LEVEL,
1);
iteration = config.getInt(
HullParameters.Hull.ITERATION,
1);
firstIteration = context.getConfiguration().getBoolean(
"first.iteration",
true);
final String polygonDataTypeId = config.getString(
HullParameters.Hull.DATA_TYPE_ID,
"concave_hull");
outputAdapter = AnalyticFeature.createGeometryFeatureAdapter(
polygonDataTypeId,
new String[0],
config.getString(
HullParameters.Hull.DATA_NAMESPACE_URI,
BasicFeatureTypes.DEFAULT_NAMESPACE),
ClusteringUtils.CLUSTERING_CRS);
Projection<SimpleFeature> projectionFunction;
try {
projectionFunction = config.getInstance(
HullParameters.Hull.PROJECTION_CLASS,
Projection.class,
SimpleFeatureProjection.class);
}
catch (InstantiationException | IllegalAccessException e) {
throw new IOException(
e);
}
super.typeConverter = new SimpleFeatureToClusterItemConverter(
projectionFunction);
distanceProfileFn = new ClusterItemDistanceFn();
super.distanceFn = new ClusterItemDistanceFn();
}
}
}