package mil.nga.giat.geowave.mapreduce.dedupe;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import mil.nga.giat.geowave.core.cli.operations.config.options.ConfigOptions;
import mil.nga.giat.geowave.core.cli.parser.CommandLineOperationParams;
import mil.nga.giat.geowave.core.cli.parser.OperationParser;
import mil.nga.giat.geowave.core.store.operations.remote.options.DataStorePluginOptions;
import mil.nga.giat.geowave.core.store.operations.remote.options.StoreLoader;
import mil.nga.giat.geowave.mapreduce.AbstractGeoWaveJobRunner;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputFormat;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputKey;
/**
 * This class can run a basic job that queries GeoWave, deduplicates the
 * results, and writes the final set of key/value pairs to a sequence file. It
 * can be extended for more advanced capabilities or job chaining.
 */
public class GeoWaveDedupeJobRunner extends
		AbstractGeoWaveJobRunner
{
	/**
	 * @param dataStoreOptions
	 *            plugin options identifying the GeoWave data store to query
	 */
	public GeoWaveDedupeJobRunner(
			final DataStorePluginOptions dataStoreOptions ) {
		super(
				dataStoreOptions);
	}

	/**
	 * Configures the dedupe job: GeoWave input, dedupe mapper/combiner/reducer
	 * keyed by {@link GeoWaveInputKey}, and sequence-file output under
	 * {@link #getHdfsOutputPath()}. Any pre-existing output at that path is
	 * removed first.
	 *
	 * @param job
	 *            the Hadoop job to configure
	 * @throws Exception
	 *             if the filesystem cannot be accessed
	 */
	@Override
	protected void configure(
			final Job job )
			throws Exception {
		job.setJobName("GeoWave Dedupe (" + dataStoreOptions.getGeowaveNamespace() + ")");

		job.setMapperClass(GeoWaveDedupeMapper.class);
		job.setCombinerClass(GeoWaveDedupeCombiner.class);
		job.setReducerClass(getReducer());
		job.setMapOutputKeyClass(GeoWaveInputKey.class);
		job.setMapOutputValueClass(ObjectWritable.class);
		job.setOutputKeyClass(GeoWaveInputKey.class);
		job.setOutputValueClass(ObjectWritable.class);

		job.setInputFormatClass(GeoWaveInputFormat.class);
		job.setOutputFormatClass(getOutputFormatClass());
		job.setNumReduceTasks(getNumReduceTasks());

		// speculative duplicate attempts would defeat the purpose of a
		// dedupe job, so they are explicitly disabled
		job.setSpeculativeExecution(false);

		// NOTE(review): FileSystem.get() usually hands back a JVM-wide cached
		// instance, so the implicit close() here also closes it for any other
		// user of the same configuration — confirm this is intended before
		// embedding this runner in a larger pipeline.
		try (final FileSystem fs = FileSystem.get(job.getConfiguration())) {
			final Path outputPath = getHdfsOutputPath();
			// recursively clear output from any previous run; delete() simply
			// returns false when the path does not exist
			fs.delete(
					outputPath,
					true);
			FileOutputFormat.setOutputPath(
					job,
					outputPath);
		}
	}

	/** Base HDFS directory under which the dedupe output directory is created. */
	protected String getHdfsOutputBase() {
		return "/tmp";
	}

	/** Reducer performing the final dedupe; override to post-process results. */
	@SuppressWarnings("rawtypes")
	protected Class<? extends Reducer> getReducer() {
		return GeoWaveDedupeReducer.class;
	}

	/**
	 * @return the HDFS output path,
	 *         {@code <base>/<geowaveNamespace>_dedupe}
	 */
	public Path getHdfsOutputPath() {
		return new Path(
				getHdfsOutputBase() + "/" + dataStoreOptions.getGeowaveNamespace() + "_dedupe");
	}

	/** Output format for the deduped results; defaults to a sequence file. */
	@SuppressWarnings("rawtypes")
	protected Class<? extends OutputFormat> getOutputFormatClass() {
		return SequenceFileOutputFormat.class;
	}

	/** Number of reduce tasks; override to tune parallelism. */
	protected int getNumReduceTasks() {
		return 8;
	}

	/**
	 * Command-line entry point. The first positional argument names the data
	 * store configuration to load from the GeoWave config file; remaining
	 * arguments are forwarded to the job runner via {@link ToolRunner}.
	 *
	 * @param args
	 *            datastore name followed by runner arguments
	 * @throws Exception
	 *             if parsing, store loading, or the job itself fails
	 */
	public static void main(
			final String[] args )
			throws Exception {
		final ConfigOptions opts = new ConfigOptions();
		final MainParameterHolder holder = new MainParameterHolder();
		final OperationParser parser = new OperationParser();
		parser.addAdditionalObject(opts);
		parser.addAdditionalObject(holder);

		final CommandLineOperationParams params = parser.parse(args);

		// the datastore name is required as the first positional argument
		if (holder.getMainParameter().isEmpty()) {
			throw new ParameterException(
					"Must specify datastore name as first argument.");
		}

		// resolve the properties file referenced by the config options
		opts.prepare(params);

		final StoreLoader loader = new StoreLoader(
				holder.getMainParameter().get(
						0));
		// NOTE(review): loadFromConfig's result is ignored here; if it can
		// report a missing store, consider checking it to fail fast instead
		// of failing later inside the job runner.
		loader.loadFromConfig((File) params.getContext().get(
				ConfigOptions.PROPERTIES_FILE_CONTEXT));

		final int res = ToolRunner.run(
				new Configuration(),
				new GeoWaveDedupeJobRunner(
						loader.getDataStorePlugin()),
				args);
		System.exit(res);
	}

	/** Captures positional (main) command-line arguments for JCommander. */
	public static class MainParameterHolder
	{
		@Parameter
		private List<String> mainParameter = new ArrayList<String>();

		public List<String> getMainParameter() {
			return mainParameter;
		}
	}
}