package mil.nga.giat.geowave.core.ingest.hdfs.mapreduce;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.core.index.ByteArrayUtils;
import mil.nga.giat.geowave.core.index.Persistable;
import mil.nga.giat.geowave.core.index.PersistenceUtils;
import mil.nga.giat.geowave.core.ingest.DataAdapterProvider;
import mil.nga.giat.geowave.core.store.adapter.WritableDataAdapter;
import mil.nga.giat.geowave.core.store.index.PrimaryIndex;
import mil.nga.giat.geowave.core.store.operations.remote.options.DataStorePluginOptions;
import mil.nga.giat.geowave.core.store.operations.remote.options.IndexPluginOptions;
import mil.nga.giat.geowave.core.store.operations.remote.options.VisibilityOptions;
import mil.nga.giat.geowave.mapreduce.output.GeoWaveOutputFormat;
/**
* This class can be sub-classed to run map-reduce jobs within the ingest
* framework using plugins provided by types that are discovered through SPI.
*
* @param <T>
* The type of map-reduce ingest plugin that can be persisted to the
* map-reduce job configuration and used by the mapper and/or reducer
* to ingest data
*/
abstract public class AbstractMapReduceIngest<T extends Persistable & DataAdapterProvider<?>> extends
		Configured implements
		Tool
{
	/** Configuration key under which the serialized ingest plugin is stored. */
	public static final String INGEST_PLUGIN_KEY = "INGEST_PLUGIN";
	/** Configuration key for the global visibility expression applied to ingested data. */
	public static final String GLOBAL_VISIBILITY_KEY = "GLOBAL_VISIBILITY";
	/** Configuration key holding a comma-separated list of primary index IDs. */
	public static final String PRIMARY_INDEX_IDS_KEY = "PRIMARY_INDEX_IDS";
	// format: <typeName> ingest from <inputFile> to namespace <namespace> (<description>)
	private static final String JOB_NAME = "%s ingest from %s to namespace %s (%s)";
	protected final DataStorePluginOptions dataStoreOptions;
	protected final List<IndexPluginOptions> indexOptions;
	protected final VisibilityOptions ingestOptions;
	protected final Path inputFile;
	protected final String typeName;
	protected final IngestFromHdfsPlugin<?, ?> parentPlugin;
	protected final T ingestPlugin;

	/**
	 * @param dataStoreOptions
	 *            options describing the destination GeoWave data store
	 * @param indexOptions
	 *            the set of indices to write to; each option that yields a
	 *            non-null primary index is registered with the output format
	 * @param ingestOptions
	 *            visibility options; a non-null visibility string is passed to
	 *            the job configuration and to the plugin's data adapters
	 * @param inputFile
	 *            HDFS path to the Avro input data
	 * @param typeName
	 *            name of the data type being ingested (used in the job name)
	 * @param parentPlugin
	 *            the HDFS ingest plugin providing the Avro schema and any
	 *            required indices
	 * @param ingestPlugin
	 *            the map-reduce ingest plugin, serialized into the job
	 *            configuration for use by the mapper/reducer
	 */
	public AbstractMapReduceIngest(
			final DataStorePluginOptions dataStoreOptions,
			final List<IndexPluginOptions> indexOptions,
			final VisibilityOptions ingestOptions,
			final Path inputFile,
			final String typeName,
			final IngestFromHdfsPlugin<?, ?> parentPlugin,
			final T ingestPlugin ) {
		this.dataStoreOptions = dataStoreOptions;
		this.indexOptions = indexOptions;
		this.ingestOptions = ingestOptions;
		this.inputFile = inputFile;
		this.typeName = typeName;
		this.parentPlugin = parentPlugin;
		this.ingestPlugin = ingestPlugin;
	}

	/**
	 * Builds a human-readable job name from the type name, input path,
	 * destination namespace and the subclass-provided ingest description.
	 *
	 * @return the formatted map-reduce job name
	 */
	public String getJobName() {
		return String.format(
				JOB_NAME,
				typeName,
				inputFile.toString(),
				dataStoreOptions.getGeowaveNamespace(),
				getIngestDescription());
	}

	/**
	 * @return a short description of this ingest method, used in the job name
	 */
	abstract protected String getIngestDescription();

	/**
	 * Reads the primary index IDs previously stored in the job configuration
	 * under {@link #PRIMARY_INDEX_IDS_KEY} (see {@link #run(String[])}).
	 *
	 * @param conf
	 *            the map-reduce job configuration
	 * @return the list of primary index IDs; empty if none were configured
	 */
	protected static List<ByteArrayId> getPrimaryIndexIds(
			final Configuration conf ) {
		final String primaryIndexIdStr = conf.get(AbstractMapReduceIngest.PRIMARY_INDEX_IDS_KEY);
		final List<ByteArrayId> primaryIndexIds = new ArrayList<ByteArrayId>();
		if ((primaryIndexIdStr != null) && !primaryIndexIdStr.isEmpty()) {
			final String[] indexIds = primaryIndexIdStr.split(",");
			for (final String indexId : indexIds) {
				primaryIndexIds.add(new ByteArrayId(
						indexId));
			}
		}
		return primaryIndexIds;
	}

	/**
	 * Configures and runs the map-reduce ingest job: serializes the ingest
	 * plugin and visibility into the configuration, registers the configured
	 * and required indices and the plugin's data adapters with the GeoWave
	 * output format, wires the Avro input, and delegates mapper/reducer setup
	 * to the subclass.
	 *
	 * @param args
	 *            unused; all configuration comes from the constructor and
	 *            {@code getConf()}
	 * @return 0 on job success, -1 on failure
	 * @throws Exception
	 *             if job setup or execution fails
	 */
	@Override
	public int run(
			final String[] args )
			throws Exception {
		final Configuration conf = getConf();
		conf.set(
				INGEST_PLUGIN_KEY,
				ByteArrayUtils.byteArrayToString(PersistenceUtils.toBinary(ingestPlugin)));
		if (ingestOptions.getVisibility() != null) {
			conf.set(
					GLOBAL_VISIBILITY_KEY,
					ingestOptions.getVisibility());
		}
		// Job.getInstance() replaces the deprecated Job(Configuration, String)
		// constructor
		final Job job = Job.getInstance(
				conf,
				getJobName());
		final StringBuilder indexIds = new StringBuilder();
		for (final IndexPluginOptions indexOption : indexOptions) {
			final PrimaryIndex primaryIndex = indexOption.createPrimaryIndex();
			if (primaryIndex != null) {
				// add index
				GeoWaveOutputFormat.addIndex(
						job.getConfiguration(),
						primaryIndex);
				if (indexIds.length() != 0) {
					indexIds.append(",");
				}
				indexIds.append(primaryIndex.getId().getString());
			}
		}
		// persist the index IDs so mappers/reducers can recover them via
		// getPrimaryIndexIds()
		job.getConfiguration().set(
				PRIMARY_INDEX_IDS_KEY,
				indexIds.toString());
		job.setJarByClass(AbstractMapReduceIngest.class);
		job.setInputFormatClass(AvroKeyInputFormat.class);
		AvroJob.setInputKeySchema(
				job,
				parentPlugin.getAvroSchema());
		FileInputFormat.setInputPaths(
				job,
				inputFile);
		setupMapper(job);
		setupReducer(job);
		// set geowave output format
		job.setOutputFormatClass(GeoWaveOutputFormat.class);
		GeoWaveOutputFormat.setStoreOptions(
				job.getConfiguration(),
				dataStoreOptions);
		final WritableDataAdapter<?>[] dataAdapters = ingestPlugin.getDataAdapters(ingestOptions.getVisibility());
		for (final WritableDataAdapter<?> dataAdapter : dataAdapters) {
			GeoWaveOutputFormat.addDataAdapter(
					job.getConfiguration(),
					dataAdapter);
		}
		// speculative attempts would write duplicate rows to the data store
		job.setSpeculativeExecution(false);
		// add required indices
		final PrimaryIndex[] requiredIndices = parentPlugin.getRequiredIndices();
		if (requiredIndices != null) {
			for (final PrimaryIndex requiredIndex : requiredIndices) {
				GeoWaveOutputFormat.addIndex(
						job.getConfiguration(),
						requiredIndex);
			}
		}
		return job.waitForCompletion(true) ? 0 : -1;
	}

	/**
	 * Configures the mapper (class, output key/value types) for this ingest
	 * job.
	 *
	 * @param job
	 *            the job being configured
	 */
	abstract protected void setupMapper(
			Job job );

	/**
	 * Configures the reducer (class, number of reduce tasks) for this ingest
	 * job.
	 *
	 * @param job
	 *            the job being configured
	 */
	abstract protected void setupReducer(
			Job job );
}