package mil.nga.giat.geowave.format.twitter;
import com.vividsolutions.jts.geom.Coordinate;
import com.vividsolutions.jts.geom.GeometryFactory;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import mil.nga.giat.geowave.adapter.vector.ingest.AbstractSimpleFeatureIngestPlugin;
import mil.nga.giat.geowave.adapter.vector.utils.SimpleFeatureUserDataConfigurationSet;
import mil.nga.giat.geowave.core.geotime.store.dimension.GeometryWrapper;
import mil.nga.giat.geowave.core.geotime.store.dimension.Time;
import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.core.index.StringUtils;
import mil.nga.giat.geowave.core.ingest.GeoWaveData;
import mil.nga.giat.geowave.core.ingest.IngestPluginBase;
import mil.nga.giat.geowave.core.ingest.avro.WholeFile;
import mil.nga.giat.geowave.core.ingest.hdfs.mapreduce.IngestWithMapper;
import mil.nga.giat.geowave.core.ingest.hdfs.mapreduce.IngestWithReducer;
import mil.nga.giat.geowave.core.store.CloseableIterator;
import mil.nga.giat.geowave.core.store.index.CommonIndexValue;
import mil.nga.giat.geowave.core.store.index.PrimaryIndex;
import org.apache.avro.Schema;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.geotools.feature.simple.SimpleFeatureBuilder;
import org.opengis.feature.simple.SimpleFeature;
import org.opengis.feature.simple.SimpleFeatureType;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.zip.GZIPInputStream;
import javax.json.Json;
import javax.json.JsonObject;
import javax.json.JsonReader;
/*
*/
/**
 * Ingest plugin that reads gzip-compressed files containing one tweet JSON
 * object per line and converts each tweet that carries a point coordinate and
 * a parseable creation date into a {@link SimpleFeature} of the Twitter
 * feature type created by {@link TwitterUtils#createTwitterEventDataType()}.
 *
 * <p>
 * The intermediate (Avro) representation of an input file is a
 * {@link WholeFile} holding the raw, still-compressed file bytes.
 *
 * <p>
 * NOTE(review): a single {@link SimpleFeatureBuilder} instance is shared by
 * all calls on this plugin; confirm callers do not invoke the same plugin
 * instance from multiple threads.
 */
public class TwitterIngestPlugin extends
        AbstractSimpleFeatureIngestPlugin<WholeFile>
{
    private final static Logger LOGGER = LoggerFactory.getLogger(TwitterIngestPlugin.class);

    /** Builder reused for every tweet; {@code buildFeature} is called once per tweet. */
    private final SimpleFeatureBuilder twitterSftBuilder;

    /** Feature type describing a single tweet. */
    private final SimpleFeatureType twitterSft;

    /** Binary form of {@link TwitterUtils#TWITTER_SFT_NAME}, attached to every emitted {@link GeoWaveData}. */
    private final ByteArrayId sftNameKey;

    /**
     * Creates the plugin, building the Twitter feature type, its feature
     * builder and the feature type name key.
     */
    public TwitterIngestPlugin() {
        twitterSft = TwitterUtils.createTwitterEventDataType();
        twitterSftBuilder = new SimpleFeatureBuilder(
                twitterSft);
        sftNameKey = new ByteArrayId(
                StringUtils.stringToBinary(TwitterUtils.TWITTER_SFT_NAME));
    }

    /**
     * @return a single-element array holding the Twitter feature type after it
     *         has been passed through
     *         {@link SimpleFeatureUserDataConfigurationSet#configureType}
     */
    @Override
    protected SimpleFeatureType[] getTypes() {
        return new SimpleFeatureType[] {
            SimpleFeatureUserDataConfigurationSet.configureType(twitterSft)
        };
    }

    /**
     * @return the file extensions this plugin handles: only {@code "gz"}
     */
    @Override
    public String[] getFileExtensionFilters() {
        return new String[] {
            "gz"
        };
    }

    /**
     * No initialization is required for this plugin; the base directory is
     * ignored.
     *
     * @param baseDirectory
     *            unused
     */
    @Override
    public void init(
            final File baseDirectory ) {}

    /**
     * @param file
     *            candidate input file
     * @return the result of {@link TwitterUtils#validate(File)} for the file
     */
    @Override
    public boolean supportsFile(
            final File file ) {
        return TwitterUtils.validate(file);
    }

    /**
     * @return the Avro schema of {@link WholeFile}
     */
    @Override
    public Schema getAvroSchema() {
        return WholeFile.getClassSchema();
    }

    /**
     * Wraps the raw bytes of the input file, together with its absolute path,
     * in a {@link WholeFile}.
     *
     * @param input
     *            file to read
     * @return a single-element array with the wrapped file, or an empty array
     *         if the file could not be read (the failure is logged as a
     *         warning)
     */
    @Override
    public WholeFile[] toAvroObjects(
            final File input ) {
        final WholeFile avroFile = new WholeFile();
        avroFile.setOriginalFilePath(input.getAbsolutePath());
        try {
            avroFile.setOriginalFile(ByteBuffer.wrap(Files.readAllBytes(input.toPath())));
        }
        catch (final IOException e) {
            LOGGER.warn(
                    "Unable to read Twitter file: " + input.getAbsolutePath(),
                    e);
            return new WholeFile[] {};
        }
        return new WholeFile[] {
            avroFile
        };
    }

    /**
     * @return {@code false}; this plugin ingests with a mapper only
     */
    @Override
    public boolean isUseReducerPreferred() {
        return false;
    }

    /**
     * @return a mapper-based ingester backed by this plugin instance
     */
    @Override
    public IngestWithMapper<WholeFile, SimpleFeature> ingestWithMapper() {
        return new IngestTwitterFromHdfs(
                this);
    }

    /**
     * Reducer-based ingest is not supported.
     *
     * @throws UnsupportedOperationException
     *             always
     */
    @Override
    public IngestWithReducer<WholeFile, ?, ?, SimpleFeature> ingestWithReducer() {
        // unsupported right now
        throw new UnsupportedOperationException(
                "Twitter events cannot be ingested with a reducer");
    }

    /**
     * Decompresses the gzip content of the given file and converts every line
     * that parses as a tweet with a coordinate and a creation date into a
     * feature. Lines that cannot be converted are logged and skipped; an I/O
     * failure while reading stops processing and returns the features
     * collected so far.
     *
     * @param hfile
     *            the gzip-compressed file content and its original path
     * @param primaryIndexIds
     *            index ids attached to every emitted {@link GeoWaveData}
     * @param globalVisibility
     *            not used by this implementation
     * @return an iterator over the features built from the file; possibly
     *         empty, never {@code null}
     */
    @Override
    protected CloseableIterator<GeoWaveData<SimpleFeature>> toGeoWaveDataInternal(
            final WholeFile hfile,
            final Collection<ByteArrayId> primaryIndexIds,
            final String globalVisibility ) {
        final List<GeoWaveData<SimpleFeature>> featureData = new ArrayList<GeoWaveData<SimpleFeature>>();
        final GeometryFactory geometryFactory = new GeometryFactory();

        try (InputStream in = openOriginalFile(hfile);
                GZIPInputStream zip = new GZIPInputStream(
                        in);
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(
                                zip,
                                StringUtils.UTF8_CHAR_SET))) {
            try {
                String line;
                int lineNumber = 0;
                while ((line = br.readLine()) != null) {
                    lineNumber++;
                    // Blank lines carry no tweet; skip them instead of
                    // reporting a JSON parse error for each one.
                    if (line.trim().isEmpty()) {
                        continue;
                    }
                    final SimpleFeature tweetSft = parseTweet(
                            line,
                            lineNumber,
                            hfile,
                            geometryFactory);
                    if (tweetSft != null) {
                        featureData.add(new GeoWaveData<SimpleFeature>(
                                sftNameKey,
                                primaryIndexIds,
                                tweetSft));
                    }
                }
            }
            catch (final IOException e) {
                // e.g. a truncated gzip stream: keep what was read so far
                LOGGER.warn(
                        "Error reading line from Twitter file: " + hfile.getOriginalFilePath(),
                        e);
            }
        }
        catch (final IOException e) {
            LOGGER.error(
                    "Failed to read gz entry: " + hfile.getOriginalFilePath(),
                    e);
        }

        return new CloseableIterator.Wrapper<GeoWaveData<SimpleFeature>>(
                featureData.iterator());
    }

    /**
     * Opens a stream over exactly the readable bytes of the file's buffer.
     * {@link ByteBuffer#array()} alone would ignore the buffer's position,
     * limit and array offset, and fails for buffers without an accessible
     * backing array.
     */
    private static InputStream openOriginalFile(
            final WholeFile hfile ) {
        // duplicate so the caller's buffer position is left untouched
        final ByteBuffer contents = hfile.getOriginalFile().duplicate();
        if (contents.hasArray()) {
            return new ByteArrayInputStream(
                    contents.array(),
                    contents.arrayOffset() + contents.position(),
                    contents.remaining());
        }
        final byte[] bytes = new byte[contents.remaining()];
        contents.get(bytes);
        return new ByteArrayInputStream(
                bytes);
    }

    /**
     * Parses one line of tweet JSON into a feature.
     *
     * @return the feature, or {@code null} if the line has no usable
     *         coordinate, no parseable date, or cannot be parsed at all (each
     *         case is logged)
     */
    @SuppressFBWarnings(value = {
        "REC_CATCH_EXCEPTION"
    }, justification = "Intentionally catching any possible exception as there may be unknown format issues in a file and we don't want to error partially through parsing")
    private SimpleFeature parseTweet(
            final String line,
            final int lineNumber,
            final WholeFile hfile,
            final GeometryFactory geometryFactory ) {
        try (JsonReader jsonReader = Json.createReader(new StringReader(
                line))) {
            final JsonObject tweet = jsonReader.readObject();

            // The point is read as [longitude, latitude] from
            // coordinates.coordinates; tweets without it are skipped.
            final double lon;
            final double lat;
            try {
                final JsonObject point = tweet.getJsonObject("coordinates");
                lon = point.getJsonArray(
                        "coordinates").getJsonNumber(
                        0).doubleValue();
                lat = point.getJsonArray(
                        "coordinates").getJsonNumber(
                        1).doubleValue();
                LOGGER.debug("line " + lineNumber + " at POINT(" + lon + " " + lat + ")");
            }
            catch (final Exception e) {
                LOGGER.debug(
                        "Error reading twitter coordinate on line " + lineNumber + " of "
                                + hfile.getOriginalFilePath() + "\n" + line,
                        e);
                return null;
            }
            final Coordinate coord = new Coordinate(
                    lon,
                    lat);

            final Date dtg;
            try {
                dtg = TwitterUtils.parseDate(tweet.getString("created_at"));
            }
            catch (final Exception e) {
                LOGGER.warn(
                        "Error reading tweet date on line " + lineNumber + " of "
                                + hfile.getOriginalFilePath(),
                        e);
                return null;
            }

            // Required fields: a missing value raises an exception that is
            // handled by the outer catch, dropping the line.
            final JsonObject user = tweet.getJsonObject("user");
            final String tweetId = tweet.getString("id_str");
            final String userid = user.getString("id_str");
            final String userName = user.getString("name");
            final String tweetText = tweet.getString("text");
            final int retweetCount = tweet.getInt("retweet_count");

            // Optional fields: getString(name, default) yields the default
            // both when the key is absent and when its value is JSON null,
            // whereas isNull(name) throws for an absent key.
            final String inReplyUser = tweet.getString(
                    "in_reply_to_user_id_str",
                    "");
            final String inReplyStatus = tweet.getString(
                    "in_reply_to_status_id_str",
                    "");
            final String lang = tweet.getString(
                    "lang",
                    "");

            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_USERID_ATTRIBUTE,
                    userid);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_USERNAME_ATTRIBUTE,
                    userName);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_TEXT_ATTRIBUTE,
                    tweetText);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_INREPLYTOUSER_ATTRIBUTE,
                    inReplyUser);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_INREPLYTOSTATUS_ATTRIBUTE,
                    inReplyStatus);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_RETWEETCOUNT_ATTRIBUTE,
                    retweetCount);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_LANG_ATTRIBUTE,
                    lang);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_DTG_ATTRIBUTE,
                    dtg);
            twitterSftBuilder.set(
                    TwitterUtils.TWITTER_GEOMETRY_ATTRIBUTE,
                    geometryFactory.createPoint(coord));

            // the tweet id doubles as the feature id
            return twitterSftBuilder.buildFeature(tweetId);
        }
        catch (final Exception e) {
            LOGGER.error(
                    "Error parsing line: " + line,
                    e);
            return null;
        }
    }

    /**
     * @return an empty array; this plugin does not require any particular
     *         index
     */
    @Override
    public PrimaryIndex[] getRequiredIndices() {
        return new PrimaryIndex[] {};
    }

    /**
     * @return an Avro ingest plugin backed by this plugin instance
     */
    @Override
    public IngestPluginBase<WholeFile, SimpleFeature> getIngestWithAvroPlugin() {
        return new IngestTwitterFromHdfs(
                this);
    }

    /**
     * Mapper-based ingester that delegates to a {@link TwitterIngestPlugin}.
     */
    public static class IngestTwitterFromHdfs extends
            AbstractIngestSimpleFeatureWithMapper<WholeFile>
    {
        /**
         * Creates an ingester backed by a new {@link TwitterIngestPlugin}.
         */
        public IngestTwitterFromHdfs() {
            this(
                    new TwitterIngestPlugin());
        }

        /**
         * @param parentPlugin
         *            plugin passed to the superclass constructor
         */
        public IngestTwitterFromHdfs(
                final TwitterIngestPlugin parentPlugin ) {
            super(
                    parentPlugin);
        }
    }

    /**
     * @return the index value types this plugin's features support:
     *         {@link GeometryWrapper} and {@link Time}
     */
    @Override
    @SuppressWarnings("unchecked")
    public Class<? extends CommonIndexValue>[] getSupportedIndexableTypes() {
        // Generic arrays cannot be created directly, hence the raw Class[].
        return new Class[] {
            GeometryWrapper.class,
            Time.class
        };
    }
}