package mil.nga.giat.geowave.format.gdelt; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.ByteBuffer; import java.nio.file.Files; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.List; import java.util.zip.ZipInputStream; import mil.nga.giat.geowave.adapter.vector.ingest.AbstractSimpleFeatureIngestPlugin; import mil.nga.giat.geowave.adapter.vector.ingest.DataSchemaOptionProvider; import mil.nga.giat.geowave.adapter.vector.utils.SimpleFeatureUserDataConfigurationSet; import mil.nga.giat.geowave.core.geotime.store.dimension.GeometryWrapper; import mil.nga.giat.geowave.core.geotime.store.dimension.Time; import mil.nga.giat.geowave.core.index.ByteArrayId; import mil.nga.giat.geowave.core.index.StringUtils; import mil.nga.giat.geowave.core.ingest.GeoWaveData; import mil.nga.giat.geowave.core.ingest.IngestPluginBase; import mil.nga.giat.geowave.core.ingest.avro.WholeFile; import mil.nga.giat.geowave.core.ingest.hdfs.mapreduce.IngestWithMapper; import mil.nga.giat.geowave.core.ingest.hdfs.mapreduce.IngestWithReducer; import mil.nga.giat.geowave.core.store.CloseableIterator; import mil.nga.giat.geowave.core.store.index.CommonIndexValue; import mil.nga.giat.geowave.core.store.index.PrimaryIndex; import org.apache.avro.Schema; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.geotools.feature.simple.SimpleFeatureBuilder; import org.opengis.feature.simple.SimpleFeature; import org.opengis.feature.simple.SimpleFeatureType; import com.vividsolutions.jts.geom.Coordinate; import com.vividsolutions.jts.geom.GeometryFactory; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; /* */ public class GDELTIngestPlugin extends AbstractSimpleFeatureIngestPlugin<WholeFile> { private final static Logger LOGGER = LoggerFactory.getLogger(GDELTIngestPlugin.class); private SimpleFeatureBuilder gdeltEventBuilder; private SimpleFeatureType gdeltEventType; private final ByteArrayId eventKey; private boolean includeSupplementalFields; public GDELTIngestPlugin() { // default to reduced data format setIncludeSupplementalFields(false); eventKey = new ByteArrayId( StringUtils.stringToBinary(GDELTUtils.GDELT_EVENT_FEATURE)); } public GDELTIngestPlugin( DataSchemaOptionProvider dataSchemaOptionProvider ) { setIncludeSupplementalFields(dataSchemaOptionProvider.includeSupplementalFields()); eventKey = new ByteArrayId( StringUtils.stringToBinary(GDELTUtils.GDELT_EVENT_FEATURE)); } private void setIncludeSupplementalFields( final boolean includeSupplementalFields ) { this.includeSupplementalFields = includeSupplementalFields; gdeltEventType = GDELTUtils.createGDELTEventDataType(includeSupplementalFields); gdeltEventBuilder = new SimpleFeatureBuilder( gdeltEventType); } @Override protected SimpleFeatureType[] getTypes() { return new SimpleFeatureType[] { SimpleFeatureUserDataConfigurationSet.configureType(gdeltEventType) }; } @Override public String[] getFileExtensionFilters() { return new String[] { "zip" }; } @Override public void init( final File baseDirectory ) { } @Override public boolean supportsFile( final File file ) { return GDELTUtils.validate(file); } @Override public Schema getAvroSchema() { return WholeFile.getClassSchema(); } @Override public WholeFile[] toAvroObjects( final File input ) { final WholeFile avroFile = new WholeFile(); avroFile.setOriginalFilePath(input.getAbsolutePath()); try { avroFile.setOriginalFile(ByteBuffer.wrap(Files.readAllBytes(input.toPath()))); } catch (final IOException e) { LOGGER.warn( "Unable to read GDELT file: " + input.getAbsolutePath(), e); return new WholeFile[] {}; } return new WholeFile[] { avroFile }; } @Override public boolean isUseReducerPreferred() { return false; } @Override public IngestWithMapper<WholeFile, SimpleFeature> ingestWithMapper() { return new IngestGDELTFromHdfs( this); } @Override public IngestWithReducer<WholeFile, ?, ?, SimpleFeature> ingestWithReducer() { // unsupported right now throw new UnsupportedOperationException( "GDELT events cannot be ingested with a reducer"); } @Override @SuppressFBWarnings(value = { "REC_CATCH_EXCEPTION" }, justification = "Intentionally catching any possible exception as there may be unknown format issues in a file and we don't want to error partially through parsing") protected CloseableIterator<GeoWaveData<SimpleFeature>> toGeoWaveDataInternal( final WholeFile hfile, final Collection<ByteArrayId> primaryIndexIds, final String globalVisibility ) { final List<GeoWaveData<SimpleFeature>> featureData = new ArrayList<GeoWaveData<SimpleFeature>>(); final InputStream in = new ByteArrayInputStream( hfile.getOriginalFile().array()); final ZipInputStream zip = new ZipInputStream( in); try { // Expected input is zipped single files (exactly one entry) zip.getNextEntry(); } catch (final IOException e) { LOGGER.error( "Failed to read ZipEntry from GDELT input file: " + hfile.getOriginalFilePath(), e); } final InputStreamReader isr = new InputStreamReader( zip, StringUtils.UTF8_CHAR_SET); final BufferedReader br = new BufferedReader( isr); final GeometryFactory geometryFactory = new GeometryFactory(); Date timeStamp = null; String timestring = ""; String eventId = ""; int actionGeoType; double lat = 0; double lon = 0; String actor1Name = ""; String actor2Name = ""; String countryCode = ""; String sourceUrl = ""; String actor1CC = ""; String actor2CC = ""; String numMentions = ""; String numSources = ""; String numArticles = ""; String avgTone = ""; String line; int lineNumber = 0; try { while ((line = br.readLine()) != null) { lineNumber++; try { final String[] vals = line.split("\t"); if ((vals.length < GDELTUtils.GDELT_MIN_COLUMNS) || (vals.length > GDELTUtils.GDELT_MAX_COLUMNS)) { LOGGER.debug("Invalid GDELT line length: " + vals.length + " tokens found on line " + lineNumber + " of " + hfile.getOriginalFilePath()); continue; } actionGeoType = Integer.parseInt(vals[GDELTUtils.GDELT_ACTION_GEO_TYPE_COLUMN_ID]); if (actionGeoType == 0) { // No geo associated with this event continue; } eventId = vals[GDELTUtils.GDELT_EVENT_ID_COLUMN_ID]; try { final Pair<Double, Double> latLon = GDELTUtils.parseLatLon(vals); if (latLon == null) { LOGGER .debug("No spatial data on line " + lineNumber + " of " + hfile.getOriginalFilePath()); continue; } lat = latLon.getLeft(); lon = latLon.getRight(); } catch (final Exception e) { LOGGER.debug( "Error reading GDELT lat/lon on line " + lineNumber + " of " + hfile.getOriginalFilePath(), e); continue; } final Coordinate cord = new Coordinate( lon, lat); gdeltEventBuilder.set( GDELTUtils.GDELT_GEOMETRY_ATTRIBUTE, geometryFactory.createPoint(cord)); gdeltEventBuilder.set( GDELTUtils.GDELT_EVENT_ID_ATTRIBUTE, eventId); timestring = vals[GDELTUtils.GDELT_TIMESTAMP_COLUMN_ID]; timeStamp = GDELTUtils.parseDate(timestring); gdeltEventBuilder.set( GDELTUtils.GDELT_TIMESTAMP_ATTRIBUTE, timeStamp); gdeltEventBuilder.set( GDELTUtils.GDELT_LATITUDE_ATTRIBUTE, lat); gdeltEventBuilder.set( GDELTUtils.GDELT_LONGITUDE_ATTRIBUTE, lon); actor1Name = vals[GDELTUtils.ACTOR_1_NAME_COLUMN_ID]; if ((actor1Name != null) && !actor1Name.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.ACTOR_1_NAME_ATTRIBUTE, actor1Name); } actor2Name = vals[GDELTUtils.ACTOR_2_NAME_COLUMN_ID]; if ((actor2Name != null) && !actor2Name.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.ACTOR_2_NAME_ATTRIBUTE, actor2Name); } countryCode = vals[GDELTUtils.ACTION_COUNTRY_CODE_COLUMN_ID]; if ((countryCode != null) && !countryCode.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.ACTION_COUNTRY_CODE_ATTRIBUTE, countryCode); } if (vals.length > GDELTUtils.SOURCE_URL_COLUMN_ID) { sourceUrl = vals[GDELTUtils.SOURCE_URL_COLUMN_ID]; } if ((sourceUrl != null) && !sourceUrl.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.SOURCE_URL_ATTRIBUTE, sourceUrl); } if (includeSupplementalFields) { actor1CC = vals[GDELTUtils.ACTOR_1_COUNTRY_CODE_COLUMN_ID]; if ((actor1CC != null) && !actor1CC.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.ACTOR_1_COUNTRY_CODE_ATTRIBUTE, actor1CC); } actor2CC = vals[GDELTUtils.ACTOR_2_COUNTRY_CODE_COLUMN_ID]; if ((actor2CC != null) && !actor2CC.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.ACTOR_2_COUNTRY_CODE_ATTRIBUTE, actor2CC); } numMentions = vals[GDELTUtils.NUM_MENTIONS_COLUMN_ID]; if ((numMentions != null) && !numMentions.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.NUM_MENTIONS_ATTRIBUTE, Integer.parseInt(numMentions)); } numSources = vals[GDELTUtils.NUM_SOURCES_COLUMN_ID]; if ((numSources != null) && !numSources.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.NUM_SOURCES_ATTRIBUTE, Integer.parseInt(numSources)); } numArticles = vals[GDELTUtils.NUM_ARTICLES_COLUMN_ID]; if ((numArticles != null) && !numArticles.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.NUM_ARTICLES_ATTRIBUTE, Integer.parseInt(numArticles)); } avgTone = vals[GDELTUtils.AVG_TONE_COLUMN_ID]; if ((avgTone != null) && !avgTone.isEmpty()) { gdeltEventBuilder.set( GDELTUtils.AVG_TONE_ATTRIBUTE, Double.parseDouble(avgTone)); } } featureData.add(new GeoWaveData<SimpleFeature>( eventKey, primaryIndexIds, gdeltEventBuilder.buildFeature(eventId))); } catch (final Exception e) { LOGGER.error( "Error parsing line: " + line, e); continue; } } } catch (final IOException e) { LOGGER.warn( "Error reading line from GDELT file: " + hfile.getOriginalFilePath(), e); } finally { IOUtils.closeQuietly(br); IOUtils.closeQuietly(isr); IOUtils.closeQuietly(in); } return new CloseableIterator.Wrapper<GeoWaveData<SimpleFeature>>( featureData.iterator()); } @Override public PrimaryIndex[] getRequiredIndices() { return new PrimaryIndex[] {}; } @Override public IngestPluginBase<WholeFile, SimpleFeature> getIngestWithAvroPlugin() { return new IngestGDELTFromHdfs( this); } public static class IngestGDELTFromHdfs extends AbstractIngestSimpleFeatureWithMapper<WholeFile> { public IngestGDELTFromHdfs() { this( new GDELTIngestPlugin()); } public IngestGDELTFromHdfs( final GDELTIngestPlugin parentPlugin ) { super( parentPlugin); } } @Override public Class<? extends CommonIndexValue>[] getSupportedIndexableTypes() { return new Class[] { GeometryWrapper.class, Time.class }; } }