package mil.nga.giat.geowave.analytic.mapreduce.nn;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import mil.nga.giat.geowave.analytic.AdapterWithObjectWritable;
import mil.nga.giat.geowave.analytic.PropertyManagement;
import mil.nga.giat.geowave.analytic.ScopedJobConfiguration;
import mil.nga.giat.geowave.analytic.distance.DistanceFn;
import mil.nga.giat.geowave.analytic.distance.FeatureGeometryDistanceFn;
import mil.nga.giat.geowave.analytic.nn.DefaultNeighborList;
import mil.nga.giat.geowave.analytic.nn.DistanceProfile;
import mil.nga.giat.geowave.analytic.nn.DistanceProfileGenerateFn;
import mil.nga.giat.geowave.analytic.nn.NNProcessor;
import mil.nga.giat.geowave.analytic.nn.NNProcessor.CompleteNotifier;
import mil.nga.giat.geowave.analytic.nn.NeighborList;
import mil.nga.giat.geowave.analytic.nn.NeighborListFactory;
import mil.nga.giat.geowave.analytic.nn.TypeConverter;
import mil.nga.giat.geowave.analytic.param.CommonParameters;
import mil.nga.giat.geowave.analytic.param.ParameterEnum;
import mil.nga.giat.geowave.analytic.param.ParameterHelper;
import mil.nga.giat.geowave.analytic.param.PartitionParameters;
import mil.nga.giat.geowave.analytic.param.PartitionParameters.Partition;
import mil.nga.giat.geowave.analytic.partitioner.OrthodromicDistancePartitioner;
import mil.nga.giat.geowave.analytic.partitioner.Partitioner;
import mil.nga.giat.geowave.analytic.partitioner.Partitioner.PartitionData;
import mil.nga.giat.geowave.analytic.partitioner.Partitioner.PartitionDataCallback;
import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.mapreduce.HadoopWritableSerializationTool;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputFormat;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputKey;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.opengis.feature.simple.SimpleFeature;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.primitives.SignedBytes;
/**
* Find the nearest neighbors to each item.
*
* The solution represented here partitions the data using a partitioner. The
* nearest neighbors are inspected within those partitions. Each partition is
* processed in memory. If the partitioner is agnostic to density, then the
* number of nearest neighbors inspected in a partition may exceed memory.
* Selecting the appropriate partitioning is critical. It may be best to work
* bottom up, partitioning at a finer grain and iterating through larger
* partitions.
*
* The reducer has four extension points:
*
* @formatter:off
*
* (1) createSetForNeighbors() create a set for primary and
* secondary neighbor lists. The set implementation can control
* the amount of memory used. The algorithm loads the primary and
* secondary sets before performing the neighbor analysis. An
* implementer can constrain the set size, removing items not
* considered relevant.
*
* (2) createSummary() permits extensions to create a summary
* object for the entire partition
*
* (3) processNeighbors() permits extensions to process the
* neighbor list for each primary item and update the summary
* object
*
* (4) processSummary() permits the reducer to produce an output
* from the summary object
*
* @formatter:on
*
* Properties:
*
* @formatter:off "NNMapReduce.Partition.PartitionerClass" ->
* {@link mil.nga.giat.geowave.analytic.partitioner.Partitioner}
* <p/>
* "NNMapReduce.Common.DistanceFunctionClass" -> Used to
* determine the distance between simple features
* {@link mil.nga.giat.geowave.analytic.distance.DistanceFn}
* <p/>
* "NNMapReduce.Partition.PartitionerClass" ->
* {@link mil.nga.giat.geowave.analytic.partitioner.Partitioner}
* <p/>
* "NNMapReduce.Partition.MaxMemberSelection" -> Maximum number
* of neighbors (pick the top K closest, where this variable is
* K) (integer)
* <p/>
* "NNMapReduce.Partition.PartitionDistance" -> Maximum distance
* between item and its neighbors. (double)
*
*
* @formatter:on
*/
public class NNMapReduce
{
protected static final Logger LOGGER = LoggerFactory.getLogger(NNMapReduce.class);
/**
* Nearest neighbors...take one
*
*/
public static class NNMapper<T> extends
Mapper<GeoWaveInputKey, Object, PartitionDataWritable, AdapterWithObjectWritable>
{
// Assigns each item to one or more partitions; instantiated in setup().
protected Partitioner<T> partitioner;
// Converts between Writable form and the adapter's native object form.
protected HadoopWritableSerializationTool serializationTool;
// Reused output instances to avoid per-record allocation (standard Hadoop mapper idiom).
final protected AdapterWithObjectWritable outputValue = new AdapterWithObjectWritable();
final protected PartitionDataWritable partitionDataWritable = new PartitionDataWritable();
/**
 * Unwraps the incoming value (when it arrives as an ObjectWritable) and asks
 * the partitioner for every partition the item belongs to, emitting one
 * (partitionData, item) pair per partition via the callback.
 */
@Override
protected void map(
final GeoWaveInputKey key,
final Object value,
final Mapper<GeoWaveInputKey, Object, PartitionDataWritable, AdapterWithObjectWritable>.Context context )
throws IOException,
InterruptedException {
@SuppressWarnings("unchecked")
final T unwrappedValue = (T) ((value instanceof ObjectWritable) ? serializationTool.fromWritable(
key.getAdapterId(),
(ObjectWritable) value) : value);
try {
partitioner.partition(
unwrappedValue,
new PartitionDataCallback() {
@Override
public void partitionWith(
final PartitionData partitionData )
throws Exception {
outputValue.setAdapterId(key.getAdapterId());
AdapterWithObjectWritable.fillWritableWithAdapter(
serializationTool,
outputValue,
key.getAdapterId(),
key.getDataId(),
partitionData.isPrimary(),
unwrappedValue);
partitionDataWritable.setPartitionData(partitionData);
context.write(
partitionDataWritable,
outputValue);
}
});
}
// rethrow IOException directly so it is not double-wrapped by the catch below
catch (final IOException e) {
throw e;
}
catch (final Exception e) {
throw new IOException(
e);
}
}
@SuppressWarnings("unchecked")
@Override
protected void setup(
final Mapper<GeoWaveInputKey, Object, PartitionDataWritable, AdapterWithObjectWritable>.Context context )
throws IOException,
InterruptedException {
super.setup(context);
// Scoped view of the job configuration keyed by the NNMapReduce class.
final ScopedJobConfiguration config = new ScopedJobConfiguration(
context.getConfiguration(),
NNMapReduce.class,
LOGGER);
serializationTool = new HadoopWritableSerializationTool(
GeoWaveInputFormat.getJobContextAdapterStore(context));
try {
// "NNMapReduce.Partition.PartitionerClass"; defaults to orthodromic-distance partitioning.
partitioner = config.getInstance(
PartitionParameters.Partition.PARTITIONER_CLASS,
Partitioner.class,
OrthodromicDistancePartitioner.class);
partitioner.initialize(
context,
NNMapReduce.class);
}
catch (final Exception e1) {
throw new IOException(
e1);
}
}
}
/**
 * Processes one partition's worth of items in memory: loads primary and
 * secondary items into an {@link NNProcessor}, computes each primary item's
 * neighbor list, and delegates to the four extension points documented on
 * {@link NNMapReduce} (createSetForNeighbors, createSummary, processNeighbors,
 * processSummary).
 *
 * @param <VALUEIN> native item type after conversion from serialized form
 * @param <KEYOUT> reducer output key type
 * @param <VALUEOUT> reducer output value type
 * @param <PARTITION_SUMMARY> summary object accumulated over the partition
 */
public abstract static class NNReducer<VALUEIN, KEYOUT, VALUEOUT, PARTITION_SUMMARY> extends
Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>
{
protected HadoopWritableSerializationTool serializationTool;
// Measures the distance between two items; configured in setup().
protected DistanceFn<VALUEIN> distanceFn;
// "NNMapReduce.Partition.PartitionDistance": maximum distance between an item and its neighbors.
protected double maxDistance = 1.0;
// "NNMapReduce.Partition.MaxMemberSelection": upper bound on members retained per partition.
protected int maxNeighbors = Integer.MAX_VALUE;
// Secondary partitioner applied within a reduce partition (defaults to passthru in setup()).
protected Partitioner<Object> partitioner;
// Default conversion is an identity cast; extensions may supply their own converter.
protected TypeConverter<VALUEIN> typeConverter = new TypeConverter<VALUEIN>() {
@SuppressWarnings("unchecked")
@Override
public VALUEIN convert(
final ByteArrayId id,
final Object o ) {
return (VALUEIN) o;
}
};
protected DistanceProfileGenerateFn<?, VALUEIN> distanceProfileFn = new LocalDistanceProfileGenerateFn();
/**
 * Loads every value of the partition into the processor, runs the
 * nearest-neighbor analysis, and feeds each primary item's neighbor list
 * plus the partition summary to the extension points.
 */
@Override
protected void reduce(
final PartitionDataWritable key,
final Iterable<AdapterWithObjectWritable> values,
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context )
throws IOException,
InterruptedException {
final NNProcessor<Object, VALUEIN> processor = new NNProcessor<Object, VALUEIN>(
partitioner,
typeConverter,
distanceProfileFn,
maxDistance,
key.partitionData);
processor.setUpperBoundPerPartition(maxNeighbors);
final PARTITION_SUMMARY summary = createSummary();
for (final AdapterWithObjectWritable inputValue : values) {
final Object value = AdapterWithObjectWritable.fromWritableWithAdapter(
serializationTool,
inputValue);
processor.add(
inputValue.getDataId(),
key.partitionData.isPrimary(),
value);
}
preprocess(
context,
processor,
summary);
processor.process(
this.createNeighborsListFactory(summary),
new CompleteNotifier<VALUEIN>() {
@Override
public void complete(
final ByteArrayId id,
final VALUEIN value,
final NeighborList<VALUEIN> primaryList )
throws IOException,
InterruptedException {
// report liveness to the framework; partitions can be long-running
context.progress();
processNeighbors(
key.partitionData,
id,
value,
primaryList,
context,
summary);
// release the processed item to bound memory usage
processor.remove(id);
}
});
processSummary(
key.partitionData,
summary,
context);
}
/**
 * @param summary the partition summary
 * @return the factory producing the neighbor list used for each primary item
 */
public NeighborListFactory<VALUEIN> createNeighborsListFactory(
final PARTITION_SUMMARY summary ) {
return new DefaultNeighborList.DefaultNeighborListFactory<VALUEIN>();
}
/**
 * Hook invoked after the processor has been populated but before the
 * neighbor analysis runs; the default implementation does nothing.
 *
 * @param context the reducer context
 * @param processor the populated processor for this partition
 * @param summary the partition summary
 */
protected void preprocess(
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context,
final NNProcessor<Object, VALUEIN> processor,
final PARTITION_SUMMARY summary )
throws IOException,
InterruptedException {}
/**
 * @return an object that represents a summary of the neighbors processed
 */
protected abstract PARTITION_SUMMARY createSummary();
/**
 * Allow extended classes to do some final processing for the partition.
 *
 * @param partitionData the partition being completed
 * @param summary the accumulated partition summary
 * @param context the reducer context
 */
protected abstract void processSummary(
PartitionData partitionData,
PARTITION_SUMMARY summary,
Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context )
throws IOException,
InterruptedException;
/**
 * Allow the extending classes to return sets with constraints and
 * management algorithms (e.g. bounded memory).
 */
protected Set<VALUEIN> createSetForNeighbors(
final boolean isSetForPrimary ) {
return new HashSet<VALUEIN>();
}
/**
 * Process one primary item's neighbor list and update the summary.
 */
protected abstract void processNeighbors(
PartitionData partitionData,
ByteArrayId primaryId,
VALUEIN primary,
NeighborList<VALUEIN> neighbors,
Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context,
PARTITION_SUMMARY summary )
throws IOException,
InterruptedException;
@SuppressWarnings("unchecked")
@Override
protected void setup(
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, KEYOUT, VALUEOUT>.Context context )
throws IOException,
InterruptedException {
final ScopedJobConfiguration config = new ScopedJobConfiguration(
context.getConfiguration(),
NNMapReduce.class,
NNMapReduce.LOGGER);
serializationTool = new HadoopWritableSerializationTool(
GeoWaveInputFormat.getJobContextAdapterStore(context));
try {
// "NNMapReduce.Common.DistanceFunctionClass"; defaults to feature-geometry distance.
distanceFn = config.getInstance(
CommonParameters.Common.DISTANCE_FUNCTION_CLASS,
DistanceFn.class,
FeatureGeometryDistanceFn.class);
}
catch (InstantiationException | IllegalAccessException e) {
throw new IOException(
e);
}
maxDistance = config.getDouble(
PartitionParameters.Partition.MAX_DISTANCE,
1.0);
try {
LOGGER.info("Using secondary partitioning");
partitioner = config.getInstance(
PartitionParameters.Partition.SECONDARY_PARTITIONER_CLASS,
Partitioner.class,
PassthruPartitioner.class);
// Double.valueOf avoids the deprecated Double(double) boxing constructor
((ParameterHelper<Double>) Partition.PARTITION_PRECISION.getHelper()).setValue(
context.getConfiguration(),
NNMapReduce.class,
Double.valueOf(1.0));
partitioner.initialize(
context,
NNMapReduce.class);
}
catch (final Exception e1) {
throw new IOException(
e1);
}
maxNeighbors = config.getInt(
PartitionParameters.Partition.MAX_MEMBER_SELECTION,
NNProcessor.DEFAULT_UPPER_BOUND_PARTIION_SIZE);
LOGGER.info(
"Maximum Neighbors = {}",
maxNeighbors);
}
/**
 * Default distance profile generator that wraps {@link #distanceFn}.
 * Reuses a single profile instance to reduce GC pressure; NOT thread-safe.
 */
protected class LocalDistanceProfileGenerateFn implements
DistanceProfileGenerateFn<Object, VALUEIN>
{
// for GC concerns in the default NN case
DistanceProfile<Object> singleNotThreadSafeImage = new DistanceProfile<Object>();
@Override
public DistanceProfile<Object> computeProfile(
final VALUEIN item1,
final VALUEIN item2 ) {
singleNotThreadSafeImage.setDistance(distanceFn.measure(
item1,
item2));
return singleNotThreadSafeImage;
}
}
}
/**
 * Concrete reducer that emits, for each primary feature, a comma-separated
 * list of its neighbors' feature IDs.
 *
 * Output: key = primary feature ID (UTF-8), value = CSV of neighbor IDs.
 */
public static class NNSimpleFeatureIDOutputReducer extends
NNReducer<SimpleFeature, Text, Text, Boolean>
{
// Reused output buffers to avoid per-record allocation.
final Text primaryText = new Text();
final Text neighborsText = new Text();
// Separator between neighbor IDs (',').
final byte[] sepBytes = new byte[] {
0x2c
};
@Override
protected void processNeighbors(
final PartitionData partitionData,
final ByteArrayId primaryId,
final SimpleFeature primary,
final NeighborList<SimpleFeature> neighbors,
final Reducer<PartitionDataWritable, AdapterWithObjectWritable, Text, Text>.Context context,
final Boolean summary )
throws IOException,
InterruptedException {
// nothing to emit for isolated items
if ((neighbors == null) || (neighbors.size() == 0)) {
return;
}
primaryText.clear();
neighborsText.clear();
// getBytes(Charset) cannot throw UnsupportedEncodingException, unlike
// getBytes("UTF-8"), so no impossible catch block is needed.
byte[] utfBytes = primary.getID().getBytes(
StandardCharsets.UTF_8);
primaryText.append(
utfBytes,
0,
utfBytes.length);
for (final Map.Entry<ByteArrayId, SimpleFeature> neighbor : neighbors) {
if (neighborsText.getLength() > 0) {
neighborsText.append(
sepBytes,
0,
sepBytes.length);
}
utfBytes = neighbor.getValue().getID().getBytes(
StandardCharsets.UTF_8);
neighborsText.append(
utfBytes,
0,
utfBytes.length);
}
context.write(
primaryText,
neighborsText);
}
@Override
protected Boolean createSummary() {
// no summary state is needed when only emitting IDs
return Boolean.TRUE;
}
@Override
protected void processSummary(
final PartitionData partitionData,
final Boolean summary,
final org.apache.hadoop.mapreduce.Reducer.Context context ) {
// do nothing
}
}
/**
 * Writable shuffle key wrapping a {@link PartitionData}; ordered by the
 * partition id bytes first, then (when both sides have one) by group id bytes.
 */
public static class PartitionDataWritable implements
Writable,
WritableComparable<PartitionDataWritable>
{
protected PartitionData partitionData;

public PartitionDataWritable() {}

public PartitionDataWritable(
final PartitionData partitionData ) {
this.partitionData = partitionData;
}

protected void setPartitionData(
final PartitionData partitionData ) {
this.partitionData = partitionData;
}

public PartitionData getPartitionData() {
return partitionData;
}

@Override
public void readFields(
final DataInput input )
throws IOException {
final PartitionData freshData = new PartitionData();
freshData.readFields(input);
partitionData = freshData;
}

@Override
public void write(
final DataOutput output )
throws IOException {
partitionData.write(output);
}

@Override
public int compareTo(
final PartitionDataWritable o ) {
final int idOrder = SignedBytes.lexicographicalComparator().compare(
partitionData.getId().getBytes(),
o.partitionData.getId().getBytes());
// NOTE(review): when only one side carries a group id, ties are left
// unbroken, which may be inconsistent with equals(); confirm this is the
// intended shuffle/grouping semantics.
if ((idOrder != 0) || (partitionData.getGroupId() == null) || (o.partitionData.getGroupId() == null)) {
return idOrder;
}
return SignedBytes.lexicographicalComparator().compare(
partitionData.getGroupId().getBytes(),
o.partitionData.getGroupId().getBytes());
}

@Override
public String toString() {
return partitionData.toString();
}

@Override
public int hashCode() {
// equivalent to the 31 * 1 + hash expansion of the usual prime formula
return 31 + ((partitionData == null) ? 0 : partitionData.hashCode());
}

@Override
public boolean equals(
final Object obj ) {
if (this == obj) {
return true;
}
if ((obj == null) || (getClass() != obj.getClass())) {
return false;
}
final PartitionDataWritable that = (PartitionDataWritable) obj;
return (partitionData == null) ? (that.partitionData == null) : partitionData.equals(that.partitionData);
}
}
/**
 * A {@link Partitioner} that places every entry in one fixed primary
 * partition, effectively disabling secondary partitioning.
 */
public static class PassthruPartitioner<T> implements
Partitioner<T>
{
private static final long serialVersionUID = -1022316020113365561L;

// Every entry lands in this single primary partition (id "1").
private static final List<PartitionData> FixedPartition = Collections.singletonList(new PartitionData(
new ByteArrayId(
"1"),
true));

@Override
public void initialize(
final JobContext context,
final Class<?> scope )
throws IOException {
// no configuration needed
}

@Override
public List<PartitionData> getCubeIdentifiers(
final T entry ) {
return FixedPartition;
}

@Override
public void partition(
final T entry,
final PartitionDataCallback callback )
throws Exception {
callback.partitionWith(FixedPartition.get(0));
}

@Override
public Collection<ParameterEnum<?>> getParameters() {
// passthru partitioning takes no parameters
return Collections.emptyList();
}

@Override
public void setup(
final PropertyManagement runTimeProperties,
final Class<?> scope,
final Configuration configuration ) {
// nothing to configure
}
}
}