package mil.nga.giat.geowave.core.ingest.hdfs.mapreduce;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import mil.nga.giat.geowave.core.ingest.DataAdapterProvider;
import mil.nga.giat.geowave.core.ingest.IngestUtils;
import mil.nga.giat.geowave.core.store.operations.remote.options.DataStorePluginOptions;
import mil.nga.giat.geowave.core.store.operations.remote.options.IndexPluginOptions;
import mil.nga.giat.geowave.core.store.operations.remote.options.VisibilityOptions;
import mil.nga.giat.geowave.mapreduce.GeoWaveConfiguratorBase;

/**
 * This class executes the ingestion into GeoWave of intermediate data that has
 * previously been staged in HDFS.
 */
public class IngestFromHdfsDriver {
    private final static Logger LOGGER = LoggerFactory.getLogger(IngestFromHdfsDriver.class);
    private final static int NUM_CONCURRENT_JOBS = 5;
    private final static int DAYS_TO_AWAIT_COMPLETION = 999;

    protected final DataStorePluginOptions storeOptions;
    protected final List<IndexPluginOptions> indexOptions;
    protected final VisibilityOptions ingestOptions;
    private final MapReduceCommandLineOptions mapReduceOptions;
    private final Map<String, IngestFromHdfsPlugin<?, ?>> ingestPlugins;
    private final String hdfsHostPort;
    private final String basePath;
    private static ExecutorService singletonExecutor;

    public IngestFromHdfsDriver(
            DataStorePluginOptions storeOptions,
            List<IndexPluginOptions> indexOptions,
            VisibilityOptions ingestOptions,
            MapReduceCommandLineOptions mapReduceOptions,
            Map<String, IngestFromHdfsPlugin<?, ?>> ingestPlugins,
            String hdfsHostPort,
            String basePath ) {
        this.storeOptions = storeOptions;
        this.indexOptions = indexOptions;
        this.ingestOptions = ingestOptions;
        this.mapReduceOptions = mapReduceOptions;
        this.ingestPlugins = ingestPlugins;
        this.hdfsHostPort = hdfsHostPort;
        this.basePath = basePath;
    }

    // lazily create (or re-create after shutdown) the shared executor that runs
    // submitted ingest jobs
    private static synchronized ExecutorService getSingletonExecutorService() {
        if ((singletonExecutor == null) || singletonExecutor.isShutdown()) {
            singletonExecutor = Executors.newFixedThreadPool(NUM_CONCURRENT_JOBS);
        }
        return singletonExecutor;
    }

    // verify that the plugin's data adapter provider is compatible with every
    // requested index/dimensionality type
    private boolean checkIndexesAgainstProvider(
            String providerName,
            DataAdapterProvider<?> adapterProvider ) {
        boolean valid = true;
        for (IndexPluginOptions option : indexOptions) {
            if (!IngestUtils.isCompatible(
                    adapterProvider,
                    option)) {
                // HP Fortify "Log Forging" false positive
                // What Fortify considers "user input" comes only
                // from users with OS-level access anyway
                LOGGER.warn("HDFS file ingest plugin for ingest type '" + providerName
                        + "' does not support dimensionality '" + option.getType() + "'");
                valid = false;
            }
        }
        return valid;
    }

    public boolean runOperation() {
        final Path hdfsBaseDirectory = new Path(
                basePath);
        try {
            final Configuration conf = new Configuration();
            GeoWaveConfiguratorBase.setRemoteInvocationParams(
                    hdfsHostPort,
                    mapReduceOptions.getJobTrackerOrResourceManagerHostPort(),
                    conf);
            try (FileSystem fs = FileSystem.get(conf)) {
                if (!fs.exists(hdfsBaseDirectory)) {
                    LOGGER.error(
                            "HDFS base directory {} does not exist",
                            hdfsBaseDirectory);
                    return false;
                }
                for (Entry<String, IngestFromHdfsPlugin<?, ?>> pluginProvider : ingestPlugins.entrySet()) {
                    // if an appropriate sequence file does not exist, continue
                    // TODO: we should probably clean up the type name to make it
                    // HDFS path safe in case there are invalid characters
                    final Path inputFile = new Path(
                            hdfsBaseDirectory,
                            pluginProvider.getKey());
                    if (!fs.exists(inputFile)) {
                        LOGGER.warn("HDFS file '" + inputFile + "' does not exist for ingest type '"
                                + pluginProvider.getKey() + "'");
                        continue;
                    }
                    IngestFromHdfsPlugin<?, ?> ingestFromHdfsPlugin = pluginProvider.getValue();
                    IngestWithReducer ingestWithReducer = null;
                    IngestWithMapper ingestWithMapper = null;
                    // first find one preferred method of ingest from HDFS
                    // (exclusively setting one or the other instance above)
                    if (ingestFromHdfsPlugin.isUseReducerPreferred()) {
                        ingestWithReducer = ingestFromHdfsPlugin.ingestWithReducer();
                        if (ingestWithReducer == null) {
                            LOGGER.warn("Plugin provider '" + pluginProvider.getKey()
                                    + "' prefers ingest with reducer but it is unimplemented");
                        }
                    }
                    if (ingestWithReducer == null) {
                        // check for ingest with mapper
                        ingestWithMapper = ingestFromHdfsPlugin.ingestWithMapper();
                        if ((ingestWithMapper == null) && !ingestFromHdfsPlugin.isUseReducerPreferred()) {
                            // the plugin prefers a mapper but does not implement one;
                            // fall back to a reducer if available
                            ingestWithReducer = ingestFromHdfsPlugin.ingestWithReducer();
                            if (ingestWithReducer == null) {
                                LOGGER.warn("Plugin provider '" + pluginProvider.getKey()
                                        + "' does not support ingest from HDFS");
                                continue;
                            }
                            else {
                                LOGGER.warn("Plugin provider '" + pluginProvider.getKey()
                                        + "' prefers ingest with mapper but it is unimplemented");
                            }
                        }
                    }
                    AbstractMapReduceIngest jobRunner = null;
                    if (ingestWithReducer != null) {
                        if (!checkIndexesAgainstProvider(
                                pluginProvider.getKey(),
                                ingestWithReducer)) {
                            continue;
                        }
                        jobRunner = new IngestWithReducerJobRunner(
                                storeOptions,
                                indexOptions,
                                ingestOptions,
                                inputFile,
                                pluginProvider.getKey(),
                                ingestFromHdfsPlugin,
                                ingestWithReducer);
                    }
                    else if (ingestWithMapper != null) {
                        if (!checkIndexesAgainstProvider(
                                pluginProvider.getKey(),
                                ingestWithMapper)) {
                            continue;
                        }
                        jobRunner = new IngestWithMapperJobRunner(
                                storeOptions,
                                indexOptions,
                                ingestOptions,
                                inputFile,
                                pluginProvider.getKey(),
                                ingestFromHdfsPlugin,
                                ingestWithMapper);
                    }
                    if (jobRunner != null) {
                        try {
                            runJob(
                                    conf,
                                    jobRunner);
                        }
                        catch (final Exception e) {
                            LOGGER.warn(
                                    "Error running ingest job",
                                    e);
                            return false;
                        }
                    }
                }
            }
        }
        catch (final IOException e) {
            LOGGER.warn(
                    "Error accessing HDFS file system",
                    e);
            return false;
        }
        finally {
            final ExecutorService executorService = getSingletonExecutorService();
            executorService.shutdown();
            // do we want to just exit once our jobs are submitted or wait?
            // for now let's just wait a REALLY long time until all of the
            // submitted jobs complete
            try {
                executorService.awaitTermination(
                        DAYS_TO_AWAIT_COMPLETION,
                        TimeUnit.DAYS);
            }
            catch (final InterruptedException e) {
                LOGGER.error(
                        "Error waiting for submitted jobs to complete",
                        e);
            }
        }
        // we really do not know if the service failed...bummer
        return true;
    }

    // submit the job asynchronously on the shared executor so multiple ingest
    // jobs can run concurrently
    private void runJob(
            final Configuration conf,
            final AbstractMapReduceIngest jobRunner )
            throws Exception {
        final ExecutorService executorService = getSingletonExecutorService();
        executorService.execute(new Runnable() {
            @Override
            public void run() {
                try {
                    final int res = ToolRunner.run(
                            conf,
                            jobRunner,
                            new String[0]);
                    if (res != 0) {
                        LOGGER.error("Ingest job '" + jobRunner.getJobName()
                                + "' exited with error code: " + res);
                    }
                }
                catch (final Exception e) {
                    LOGGER.error(
                            "Error running ingest job: " + jobRunner.getJobName(),
                            e);
                }
            }
        });
    }
}
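
// A minimal usage sketch of this driver: the option objects and plugin map are
// assumed to be supplied by the GeoWave ingest command line, and the host/port
// and staging path values below are hypothetical placeholders.
//
// final IngestFromHdfsDriver driver = new IngestFromHdfsDriver(
//         storeOptions,      // DataStorePluginOptions for the target data store
//         indexOptions,      // one or more IndexPluginOptions (e.g. spatial)
//         ingestOptions,     // VisibilityOptions applied to ingested data
//         mapReduceOptions,  // MapReduceCommandLineOptions (job tracker / resource manager)
//         ingestPlugins,     // Map of ingest type name to IngestFromHdfsPlugin
//         "namenode.example.com:8020", // hdfsHostPort (placeholder)
//         "/geowave/stage");           // basePath where intermediate data was staged (placeholder)
// final boolean success = driver.runOperation();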