package mil.nga.giat.geowave.analytic.mapreduce.clustering.runner;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.ToolRunner;
import mil.nga.giat.geowave.adapter.vector.FeatureDataAdapter;
import mil.nga.giat.geowave.analytic.AnalyticFeature;
import mil.nga.giat.geowave.analytic.IndependentJobRunner;
import mil.nga.giat.geowave.analytic.PropertyManagement;
import mil.nga.giat.geowave.analytic.ScopedJobConfiguration;
import mil.nga.giat.geowave.analytic.clustering.ClusteringUtils;
import mil.nga.giat.geowave.analytic.extract.DimensionExtractor;
import mil.nga.giat.geowave.analytic.extract.SimpleFeatureGeometryExtractor;
import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobController;
import mil.nga.giat.geowave.analytic.mapreduce.MapReduceJobRunner;
import mil.nga.giat.geowave.analytic.mapreduce.clustering.SimpleFeatureOutputReducer;
import mil.nga.giat.geowave.analytic.param.ExtractParameters;
import mil.nga.giat.geowave.analytic.param.GlobalParameters;
import mil.nga.giat.geowave.analytic.param.MapReduceParameters;
import mil.nga.giat.geowave.analytic.param.ParameterEnum;
import mil.nga.giat.geowave.analytic.param.StoreParameters;
import mil.nga.giat.geowave.analytic.param.StoreParameters.StoreParam;
import mil.nga.giat.geowave.analytic.store.PersistableStore;
import mil.nga.giat.geowave.core.store.query.DistributableQuery;
import mil.nga.giat.geowave.mapreduce.GeoWaveConfiguratorBase;
import mil.nga.giat.geowave.mapreduce.dedupe.GeoWaveDedupeJobRunner;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputFormat;
import mil.nga.giat.geowave.mapreduce.output.GeoWaveOutputFormat;
/**
*
* Runs a MapReduce job that extracts a population of data from GeoWave
* (Accumulo), removes duplicates, and outputs a SimpleFeature carrying the ID
* and the extracted geometry of each GeoWave data item.
*
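* <p>
* A minimal usage sketch (assumes {@code props} is a {@link PropertyManagement}
* already populated with the input store, query, and extract parameters):
*
* <pre>
* final GeoWaveAnalyticExtractJobRunner runner = new GeoWaveAnalyticExtractJobRunner();
* final int status = runner.run(props); // 0 indicates success
* </pre>
*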
*/
public class GeoWaveAnalyticExtractJobRunner extends
GeoWaveDedupeJobRunner implements
MapReduceJobRunner,
IndependentJobRunner
{
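// Defaults; both values are re-read from the job configuration in configure()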
private String outputBaseDir = "/tmp";
private int reducerCount = 1;
public GeoWaveAnalyticExtractJobRunner() {
super(
null); // data store options are resolved later, in run(), from the input store
}
@Override
protected int getNumReduceTasks() {
return reducerCount;
}
@Override
protected String getHdfsOutputBase() {
return outputBaseDir;
}
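/**
* Pulls the reducer count and HDFS base directory from the scoped job
* configuration, registers the output feature adapter with
* {@link GeoWaveOutputFormat}, and names the job. Speculative reduce
* execution is disabled (duplicate reducers could otherwise write the same
* output features).
*/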
@Override
protected void configure(
final Job job )
throws Exception {
final ScopedJobConfiguration configWrapper = new ScopedJobConfiguration(
job.getConfiguration(),
SimpleFeatureOutputReducer.class);
reducerCount = Math.max(
configWrapper.getInt(
ExtractParameters.Extract.REDUCER_COUNT,
8),
1);
outputBaseDir = configWrapper.getString(
MapReduceParameters.MRConfig.HDFS_BASE_DIR,
"/tmp");
LOGGER.info("Output base directory " + outputBaseDir);
super.configure(job);
@SuppressWarnings("rawtypes")
final Class<? extends DimensionExtractor> dimensionExtractorClass = job.getConfiguration().getClass(
GeoWaveConfiguratorBase.enumToConfKey(
SimpleFeatureOutputReducer.class,
ExtractParameters.Extract.DIMENSION_EXTRACT_CLASS),
SimpleFeatureGeometryExtractor.class,
DimensionExtractor.class);
GeoWaveOutputFormat.addDataAdapter(
job.getConfiguration(),
createAdapter(
job.getConfiguration().get(
GeoWaveConfiguratorBase.enumToConfKey(
SimpleFeatureOutputReducer.class,
ExtractParameters.Extract.OUTPUT_DATA_TYPE_ID)),
job.getConfiguration().get(
GeoWaveConfiguratorBase.enumToConfKey(
SimpleFeatureOutputReducer.class,
ExtractParameters.Extract.DATA_NAMESPACE_URI)),
dimensionExtractorClass));
job.setJobName("GeoWave Extract (" + dataStoreOptions.getGeowaveNamespace() + ")");
job.setReduceSpeculativeExecution(false);
}
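/**
* Builds a {@link FeatureDataAdapter} for the output feature type: the given
* type ID and namespace, the dimension names reported by the extractor, and
* the shared clustering CRS.
*/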
private FeatureDataAdapter createAdapter(
final String outputDataTypeID,
final String namespaceURI,
@SuppressWarnings("rawtypes")
final Class<? extends DimensionExtractor> dimensionExtractorClass )
throws InstantiationException,
IllegalAccessException {
final DimensionExtractor<?> extractor = dimensionExtractorClass.newInstance();
return AnalyticFeature.createGeometryFeatureAdapter(
outputDataTypeID,
extractor.getDimensionNames(),
namespaceURI,
ClusteringUtils.CLUSTERING_CRS);
}
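// Output lands under the configured HDFS base directory, suffixed "_dedupe"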
@Override
public Path getHdfsOutputPath() {
return new Path(
getHdfsOutputBase() + "/" + dataStoreOptions.getGeowaveNamespace() + "_dedupe");
}
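// Swap in a reducer that emits the de-duplicated entries as SimpleFeatures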
@Override
@SuppressWarnings("rawtypes")
protected Class<? extends Reducer> getReducer() {
return SimpleFeatureOutputReducer.class;
}
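/**
* Copies the run-time properties into the Hadoop configuration (group/batch
* IDs, extractor class, query, split counts, and store options), deletes any
* previous output, and launches the job through {@link ToolRunner}.
*/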
@Override
public int run(
final Configuration config,
final PropertyManagement runTimeProperties )
throws Exception {
runTimeProperties.storeIfEmpty(
ExtractParameters.Extract.OUTPUT_DATA_TYPE_ID,
"centroid");
runTimeProperties.setConfig(
new ParameterEnum[] {
MapReduceParameters.MRConfig.HDFS_BASE_DIR,
ExtractParameters.Extract.REDUCER_COUNT,
ExtractParameters.Extract.DATA_NAMESPACE_URI,
ExtractParameters.Extract.OUTPUT_DATA_TYPE_ID
},
config,
SimpleFeatureOutputReducer.class);
config.set(
GeoWaveConfiguratorBase.enumToConfKey(
SimpleFeatureOutputReducer.class,
ExtractParameters.Extract.GROUP_ID),
runTimeProperties.getPropertyAsString(
ExtractParameters.Extract.GROUP_ID,
UUID.randomUUID().toString()));
config.set(
GeoWaveConfiguratorBase.enumToConfKey(
SimpleFeatureOutputReducer.class,
GlobalParameters.Global.BATCH_ID),
runTimeProperties.getPropertyAsString(
GlobalParameters.Global.BATCH_ID,
UUID.randomUUID().toString()));
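// Prefer a query set directly on this runner; otherwise fall back to the
// query stored in the run-time properties.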
DistributableQuery myQuery = query;
if (myQuery == null) {
myQuery = runTimeProperties.getPropertyAsQuery(ExtractParameters.Extract.QUERY);
}
setMinInputSplits(runTimeProperties.getPropertyAsInt(
ExtractParameters.Extract.MIN_INPUT_SPLIT,
1));
setMaxInputSplits(runTimeProperties.getPropertyAsInt(
ExtractParameters.Extract.MAX_INPUT_SPLIT,
10000));
if (myQuery != null) {
GeoWaveInputFormat.setQuery(
config,
myQuery);
}
if (minInputSplits != null) {
GeoWaveInputFormat.setMinimumSplitCount(
config,
minInputSplits);
}
if (maxInputSplits != null) {
GeoWaveInputFormat.setMaximumSplitCount(
config,
maxInputSplits);
}
setConf(config);
config.setClass(
GeoWaveConfiguratorBase.enumToConfKey(
SimpleFeatureOutputReducer.class,
ExtractParameters.Extract.DIMENSION_EXTRACT_CLASS),
runTimeProperties.getPropertyAsClass(
ExtractParameters.Extract.DIMENSION_EXTRACT_CLASS,
DimensionExtractor.class,
SimpleFeatureGeometryExtractor.class),
DimensionExtractor.class);
final PersistableStore store = (PersistableStore) runTimeProperties.getProperty(StoreParam.INPUT_STORE);
setQueryOptions(runTimeProperties.getPropertyAsQueryOptions(ExtractParameters.Extract.QUERY_OPTIONS));
dataStoreOptions = store.getDataStoreOptions();
GeoWaveInputFormat.setStoreOptions(
config,
dataStoreOptions);
GeoWaveOutputFormat.setStoreOptions(
config,
dataStoreOptions);
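// Clear output from any previous run so the job will not fail on an
// existing path.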
try (final FileSystem fs = FileSystem.get(config)) {
if (fs.exists(this.getHdfsOutputPath())) {
fs.delete(
getHdfsOutputPath(),
true);
}
return ToolRunner.run(
config,
this,
new String[] {});
}
}
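// The full set of parameters this runner consumes, including the common
// MapReduce parameters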
@Override
public Collection<ParameterEnum<?>> getParameters() {
final Set<ParameterEnum<?>> params = new HashSet<ParameterEnum<?>>();
params.addAll(Arrays.asList(new ParameterEnum<?>[] {
ExtractParameters.Extract.REDUCER_COUNT,
ExtractParameters.Extract.OUTPUT_DATA_TYPE_ID,
ExtractParameters.Extract.DATA_NAMESPACE_URI,
ExtractParameters.Extract.DIMENSION_EXTRACT_CLASS,
ExtractParameters.Extract.MIN_INPUT_SPLIT,
ExtractParameters.Extract.MAX_INPUT_SPLIT,
ExtractParameters.Extract.QUERY,
ExtractParameters.Extract.QUERY_OPTIONS,
StoreParam.INPUT_STORE,
GlobalParameters.Global.BATCH_ID
}));
params.addAll(MapReduceParameters.getParameters());
return params;
}
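// IndependentJobRunner entry point: derive a Configuration from the
// properties and delegate to run(Configuration, PropertyManagement)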
@Override
public int run(
final PropertyManagement runTimeProperties )
throws Exception {
return this.run(
MapReduceJobController.getConfiguration(runTimeProperties),
runTimeProperties);
}
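/**
* Builds, configures, and synchronously runs the Hadoop job; failures are
* logged and rethrown wrapped in a {@link ParseException}.
*/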
@Override
public boolean runOperation(
final String[] args )
throws ParseException {
try {
final Job job = Job.getInstance(
super.getConf());
job.setJarByClass(this.getClass());
configure(job);
return job.waitForCompletion(true);
}
catch (final Exception e) {
LOGGER.error(
"Unable to run job",
e);
throw new ParseException(
e.getMessage());
}
}
}