package mil.nga.giat.geowave.core.ingest.local;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.core.ingest.GeoWaveData;
import mil.nga.giat.geowave.core.store.CloseableIterator;
import mil.nga.giat.geowave.core.store.DataStore;
import mil.nga.giat.geowave.core.store.adapter.WritableDataAdapter;
import mil.nga.giat.geowave.core.store.index.PrimaryIndex;
import mil.nga.giat.geowave.core.store.operations.remote.options.DataStorePluginOptions;
import mil.nga.giat.geowave.core.store.operations.remote.options.IndexPluginOptions;
import mil.nga.giat.geowave.core.store.operations.remote.options.VisibilityOptions;

/**
 * This extends the local file driver to directly ingest data into GeoWave,
 * utilizing the LocalFileIngestPlugins that are discovered by the system.
 */
public class LocalFileIngestDriver extends
		AbstractLocalFileDriver<LocalFileIngestPlugin<?>, LocalIngestRunData>
{
	public final static int INGEST_BATCH_SIZE = 500;
	private final static Logger LOGGER = LoggerFactory.getLogger(LocalFileIngestDriver.class);

	protected DataStorePluginOptions storeOptions;
	protected List<IndexPluginOptions> indexOptions;
	protected VisibilityOptions ingestOptions;
	protected Map<String, LocalFileIngestPlugin<?>> ingestPlugins;
	protected int threads;
	protected ExecutorService ingestExecutor;

	public LocalFileIngestDriver(
			final DataStorePluginOptions storeOptions,
			final List<IndexPluginOptions> indexOptions,
			final Map<String, LocalFileIngestPlugin<?>> ingestPlugins,
			final VisibilityOptions ingestOptions,
			final LocalInputCommandLineOptions inputOptions,
			final int threads ) {
		super(
				inputOptions);
		this.storeOptions = storeOptions;
		this.indexOptions = indexOptions;
		this.ingestOptions = ingestOptions;
		this.ingestPlugins = ingestPlugins;
		this.threads = threads;
	}

	public boolean runOperation(
			final String inputPath ) {
		// First collect the local file ingest plugins that can handle the
		// configured indexes, along with the data adapters they provide.
		final Map<String, LocalFileIngestPlugin<?>> localFileIngestPlugins = new HashMap<String, LocalFileIngestPlugin<?>>();
		final List<WritableDataAdapter<?>> adapters = new ArrayList<WritableDataAdapter<?>>();
		for (final Entry<String, LocalFileIngestPlugin<?>> pluginEntry : ingestPlugins.entrySet()) {
			if (!checkIndexesAgainstProvider(
					pluginEntry.getKey(),
					pluginEntry.getValue(),
					indexOptions)) {
				continue;
			}
			localFileIngestPlugins.put(
					pluginEntry.getKey(),
					pluginEntry.getValue());
			adapters.addAll(Arrays.asList(pluginEntry.getValue().getDataAdapters(
					ingestOptions.getVisibility())));
		}

		final DataStore dataStore = storeOptions.createDataStore();
		try (LocalIngestRunData runData = new LocalIngestRunData(
				adapters,
				dataStore)) {
			startExecutor();

			processInput(
					inputPath,
					localFileIngestPlugins,
					runData);

			// We call shutdownExecutor() here, and not only in the finally
			// block, because of the way try-with-resources works: we want to
			// wait for our ingesting threads to finish before the index
			// writers cached in LocalIngestRunData are closed. If we relied on
			// the finally block alone, the index writers would be closed
			// before the workers finished processing the file entries.
			shutdownExecutor();
		}
		catch (final IOException e) {
			LOGGER.error(
					"Unexpected I/O exception when reading input files",
					e);
			return false;
		}
		finally {
			shutdownExecutor();
		}
		return true;
	}
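	/*
	 * A minimal usage sketch. This is illustrative only: the wiring of the
	 * options objects and the input path below are assumptions for the
	 * example, not values taken from this file.
	 *
	 * DataStorePluginOptions store = ...;            // configured elsewhere
	 * List<IndexPluginOptions> indexes = ...;        // e.g. a spatial index
	 * Map<String, LocalFileIngestPlugin<?>> plugins = ...; // discovered plugins
	 * VisibilityOptions visibility = ...;
	 * LocalInputCommandLineOptions input = ...;
	 *
	 * final LocalFileIngestDriver driver = new LocalFileIngestDriver(
	 *         store,
	 *         indexes,
	 *         plugins,
	 *         visibility,
	 *         input,
	 *         4); // four ingest worker threads
	 * final boolean success = driver.runOperation("/data/to/ingest"); // hypothetical path
	 */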
	/**
	 * Create a basic thread pool to ingest file data. It is limited to the
	 * number of threads specified on the command line.
	 */
	private void startExecutor() {
		ingestExecutor = Executors.newFixedThreadPool(threads);
	}

	/**
	 * Stop accepting new tasks and wait, polling in 10-second intervals, for
	 * already submitted tasks to complete.
	 */
	private void shutdownExecutor() {
		if (ingestExecutor != null) {
			try {
				ingestExecutor.shutdown();
				while (!ingestExecutor.awaitTermination(
						10,
						TimeUnit.SECONDS)) {
					LOGGER.debug("Waiting for ingest executor to terminate");
				}
			}
			catch (final InterruptedException e) {
				LOGGER.error("Failed to terminate executor service");
				// Restore the interrupt status so callers can observe it.
				Thread.currentThread().interrupt();
			}
			finally {
				ingestExecutor = null;
			}
		}
	}

	@Override
	protected void processFile(
			final File file,
			final String typeName,
			final LocalFileIngestPlugin<?> plugin,
			final LocalIngestRunData ingestRunData )
			throws IOException {
		LOGGER.info(String.format(
				"Beginning ingest for file: [%s]",
				file.getName()));

		// This loads the primary indexes that are specified on the command
		// line -- usually spatial or spatial-temporal.
		final Map<ByteArrayId, PrimaryIndex> specifiedPrimaryIndexes = new HashMap<ByteArrayId, PrimaryIndex>();
		for (final IndexPluginOptions dimensionType : indexOptions) {
			final PrimaryIndex primaryIndex = dimensionType.createPrimaryIndex();
			if (primaryIndex == null) {
				LOGGER.error("Could not get index instance, getIndex() returned null");
				throw new IOException(
						"Could not get index instance, getIndex() returned null");
			}
			specifiedPrimaryIndexes.put(
					primaryIndex.getId(),
					primaryIndex);
		}

		// This gets the list of required indexes from the plugin. If a
		// GeoWaveData specifies an index that isn't already in the
		// specifiedPrimaryIndexes map, this map is used to determine whether
		// the plugin supports it; if it does, we allow the index to be
		// created.
		final Map<ByteArrayId, PrimaryIndex> requiredIndexMap = new HashMap<ByteArrayId, PrimaryIndex>();
		final PrimaryIndex[] requiredIndices = plugin.getRequiredIndices();
		if ((requiredIndices != null) && (requiredIndices.length > 0)) {
			for (final PrimaryIndex requiredIndex : requiredIndices) {
				requiredIndexMap.put(
						requiredIndex.getId(),
						requiredIndex);
			}
		}

		// Create our queue. We will post GeoWaveData items to this queue until
		// there are no more items, at which point we tell the workers to
		// complete. The ingest batch size is the maximum number of items the
		// queue holds at a time for the worker threads to consume.
		final BlockingQueue<GeoWaveData<?>> queue = LocalIngestRunData.createBlockingQueue(INGEST_BATCH_SIZE);

		// Create our jobs. We submit as many jobs as we have executor threads;
		// each of them reads from the blocking queue.
		LOGGER.debug(String.format(
				"Creating [%d] threads to ingest file: [%s]",
				threads,
				file.getName()));
		final List<IngestTask> ingestTasks = new ArrayList<IngestTask>();
		try {
			for (int i = 0; i < threads; i++) {
				final String id = String.format(
						"%s-%d",
						file.getName(),
						i);
				final IngestTask task = new IngestTask(
						id,
						ingestRunData,
						specifiedPrimaryIndexes,
						requiredIndexMap,
						queue);
				ingestTasks.add(task);
				ingestExecutor.submit(task);
			}
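			// Producer/consumer handoff: this thread is the single producer,
			// parsing the file into GeoWaveData items, while the IngestTask
			// workers drain the queue and write entries through the index
			// writers cached in the run data. The bounded queue
			// (INGEST_BATCH_SIZE entries) applies backpressure, so parsing can
			// never run more than one batch ahead of the writers.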
			// Read GeoWaveData entries from the file until the iterator is
			// exhausted.
			try (CloseableIterator<?> geowaveDataIt = plugin.toGeoWaveData(
					file,
					specifiedPrimaryIndexes.keySet(),
					ingestOptions.getVisibility())) {
				while (geowaveDataIt.hasNext()) {
					final GeoWaveData<?> geowaveData = (GeoWaveData<?>) geowaveDataIt.next();
					try {
						while (!queue.offer(
								geowaveData,
								100,
								TimeUnit.MILLISECONDS)) {
							// Determine if we have any workers left. The point
							// of this check is to avoid hanging after all the
							// workers have exited (before the file is done)
							// due to some unhandled exception.
							boolean workerAlive = false;
							for (final IngestTask task : ingestTasks) {
								if (!task.isFinished()) {
									workerAlive = true;
									break;
								}
							}

							// If any worker is still alive, simply retry the
							// offer; this loops until either the item is
							// accepted or no workers are left.
							if (workerAlive) {
								LOGGER.debug("Worker threads are overwhelmed, retrying the offer");
							}
							else {
								final String message = "Datastore error, all workers have terminated! Aborting...";
								LOGGER.error(message);
								throw new RuntimeException(
										message);
							}
						}
					}
					catch (final InterruptedException e) {
						// This should only happen if the process is
						// interrupted externally, e.g. killed.
						throw new RuntimeException(
								"Interrupted placing GeoWaveData on queue",
								e);
					}
				}
			}
		}
		finally {
			// Terminate our ingest tasks so the workers drain the remaining
			// queue entries and exit.
			for (final IngestTask task : ingestTasks) {
				task.terminate();
			}
		}

		LOGGER.info(String.format(
				"Finished ingest for file: [%s]",
				file.getName()));
	}
}
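/*
 * For reference, a rough sketch of the worker-side contract this driver relies
 * on. The real IngestTask lives elsewhere in this package; everything below
 * (field names, poll timeout, writer lookup) is an assumption for
 * illustration, not the actual implementation.
 *
 * public class IngestTask implements Runnable {
 *     private final BlockingQueue<GeoWaveData<?>> queue; // shared with the driver
 *     private volatile boolean terminated = false;
 *     private volatile boolean finished = false;
 *
 *     // Called by the driver's finally block once the file is fully read.
 *     public void terminate() { terminated = true; }
 *
 *     // Polled by the driver to detect dead workers while offering items.
 *     public boolean isFinished() { return finished; }
 *
 *     @Override
 *     public void run() {
 *         try {
 *             // Drain until terminated AND the queue is empty.
 *             while (!terminated || !queue.isEmpty()) {
 *                 final GeoWaveData<?> data = queue.poll(100, TimeUnit.MILLISECONDS);
 *                 if (data != null) {
 *                     // Resolve the index writer from the run data and write
 *                     // the entry.
 *                 }
 *             }
 *         } catch (final Exception e) {
 *             // Fall through so finished becomes true and the driver aborts.
 *         } finally {
 *             finished = true;
 *         }
 *     }
 * }
 */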