package org.opensextant.examples.twitter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.text.ParseException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.sf.json.JSONObject;
import org.opensextant.ConfigException;
import org.opensextant.data.TextInput;
import org.opensextant.extraction.ExtractionResult;
import org.opensextant.extraction.TextMatch;
import org.opensextant.extractors.geo.PlaceGeocoder;
import org.opensextant.extractors.xcoord.XConstants;
import org.opensextant.extractors.xcoord.XCoord;
import org.opensextant.output.FormatterFactory;
import org.opensextant.output.GISDataFormatter;
import org.opensextant.output.OpenSextantSchema;
import org.opensextant.processing.Parameters;
import org.opensextant.processing.ProcessingException;
import org.opensextant.util.FileUtility;
import org.opensextant.util.TextUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
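/**
 * Example application that geocodes tweets read from a file of JSON records
 * (one tweet per line), writing places found in the tweet text and in the
 * author's profile location to two separate GIS outputs.
 *
 * @author Marc C. Ubaldino, MITRE, ubaldino at mitre dot org
 */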
public class TweetGeocoder {
/** Logger for this class. */
private final Logger log = LoggerFactory.getLogger(TweetGeocoder.class);
/** Debug-enabled state of the logger, captured once when the instance is created. */
private final boolean debug = log.isDebugEnabled();
/** Number of tweets handed to geocodeTweet() so far. */
private int recordCount = 0;
/** Progress is logged each time recordCount reaches a multiple of this value. */
int batch = 10000;
/** Place name geocoder applied to tweet text and to author profile locations. */
private PlaceGeocoder geocoder = null;
/** Coordinate extractor applied to the author's profile coordinate/location values. */
XCoord userlocX;
/** Output format name; set from the -f command line option, defaulted to "CSV" by the constructor. */
private static String formatType = null;
/** Formatter that receives places found in tweet text. */
GISDataFormatter tweetOutput;
/** Formatter that receives places found in author profile data. */
GISDataFormatter userOutput;
/** Terms (compared in lower case) whose matches are filtered out; loaded from /twitter/tweet-not-places.txt. */
private static Set<String> tweet_stop;
/** Terms (compared in lower case) that bypass the stop list; loaded from /twitter/tweet-places.txt. */
private static Set<String> tweet_pass;
/**
 * Sets up a tweet geocoding job: configures the coordinate extractor used on
 * author profile data, loads the tweet stop/pass term lists, creates and starts
 * the two output formatters (tweets and users) and configures the place geocoder.
 *
 * Output is written under "./output"; the output format is the static
 * formatType, defaulting to "CSV" when it has not been set.
 *
 * @param job job name; used as-is for the tweet output and with a "_Users"
 *            suffix for the user output
 * @throws IOException declared; presumably raised by dictionary loading,
 *             directory creation or formatter setup -- TODO confirm
 * @throws ConfigException declared; note that a ConfigException raised while
 *             configuring XCoord is caught and rethrown as ProcessingException
 * @throws ProcessingException if XCoord configuration fails or a formatter
 *             cannot be created
 */
public TweetGeocoder(String job) throws IOException, ConfigException, ProcessingException {
//
// These static parameter/flags inform XCoord API how to detect and return coordinates.
//
// NOTE(review): RUNTIME_FLAGS is accessed as a static field, so this assignment is
// shared beyond this instance.
Parameters.RUNTIME_FLAGS = Parameters.FLAG_NO_COORDINATES;//| Parameters.FLAG_ALLOW_LOWERCASE_ABBREV;
// XOR toggles the FLAG_EXTRACT_CONTEXT bit relative to the value just assigned.
Parameters.RUNTIME_FLAGS ^= Parameters.FLAG_EXTRACT_CONTEXT;
userlocX = new XCoord();
try {
userlocX.configure(TweetGeocoder.class.getResource("/twitter/tweet-xcoord.cfg"));
userlocX.disableAll();
// Explicitly enable DD
// Note -- for parsing coordinates in Tweet metadata
// we need to turn off the normal Decimal degree filters.
// Decimal degrees are really the only thing we want out of tweets,
// so we need to carefully undo DD filters.
//
userlocX.match_DD(true);
// NOTE(review): "^=" toggles a bit rather than clearing it. These flags are accessed
// statically and are not reassigned here first, so constructing a second
// TweetGeocoder would flip both bits back -- confirm this is acceptable.
XCoord.RUNTIME_FLAGS ^= XConstants.DD_FILTERS_ON; // Be less strict with Decimal degrees.
XCoord.RUNTIME_FLAGS ^= XConstants.FLAG_EXTRACT_CONTEXT; // ignore text context.
} catch (ConfigException xcerr) {
throw new ProcessingException(xcerr);
}
// Term lists consulted by enrichResults(); stored in static fields.
tweet_stop = FileUtility.loadDictionary("/twitter/tweet-not-places.txt", true);
tweet_pass = FileUtility.loadDictionary("/twitter/tweet-places.txt", true);
geocoder = new PlaceGeocoder();
Parameters tweetJob = new Parameters();
Parameters userJob = new Parameters();
// Fill out the basic I/O parameters.
tweetJob.outputDir = "./output";
FileUtility.makeDirectory(tweetJob.outputDir);
if (formatType == null) {
formatType = "CSV";
}
tweetJob.setJobName(job);
boolean overwrite = true;
// createFormatter() rejects Parameters that are still flagged as default.
tweetJob.isdefault = false;
// Caller should be sure "timestamp" field does not overwrite existing field.
// DEFINE this once.
OpenSextantSchema.addDateField("timestamp");
OpenSextantSchema.addTextField("tweet");
OpenSextantSchema.addTextField("author");
// Given the job parameters you can then create the default tweetOutput formatter
// which takes the Parameters as guidance on file paths, locations, output filters, etc.
tweetOutput = createFormatter(formatType, tweetJob);
tweetOutput.overwrite = overwrite;
tweetOutput.includeOffsets = false;
tweetOutput.includeCoordinate = true;
tweetOutput.setGisDataModel();
// The user output shares the tweet output directory, under its own job name.
userJob.outputDir = tweetJob.outputDir;
userJob.setJobName(job + "_Users");
userJob.isdefault = false;
userOutput = createFormatter(formatType, userJob);
userOutput.overwrite = overwrite;
// Tune User profile geo schema -- very few fields that matter.
userOutput.includeOffsets = false;
userOutput.includeCoordinate = true;
userOutput.setGisDataModel();
// Swap these fields.
// The field changes below are made after setGisDataModel() and before start().
tweetOutput.removeField("context");
tweetOutput.addField("tweet");
tweetOutput.addField("timestamp");
tweetOutput.addField("author");
userOutput.removeField("start");
userOutput.removeField("end");
// userOutput.field_order.remove("method");
userOutput.removeField("placename");
userOutput.removeField("confidence");
userOutput.addField("timestamp");
userOutput.addField("author");
userOutput.removeField("context");
userOutput.addField("tweet");
// Create output files, by starting the job formatters.
// This creates the IO streams and sets the schema for those files.
//
tweetOutput.start(tweetJob.getJobName());
userOutput.start(userJob.getJobName());
// The geocoder is given the tweet job parameters, then configured.
geocoder.setParameters(tweetJob);
geocoder.configure();
}
/**
 * Creates a GIS output formatter for the requested format and binds it to the
 * given job parameters. The output file name is the job name followed by the
 * formatter's own output extension.
 *
 * @param outputFormat format name handed to FormatterFactory, e.g. "CSV"
 * @param p job parameters; must not be flagged as default
 * @return a formatter with its parameters and output file name set
 * @throws IOException declared for callers; presumably raised by formatter
 *             setup -- TODO confirm
 * @throws ProcessingException if {@code p} is still flagged as default, if no
 *             formatter is available for {@code outputFormat}, or if the
 *             formatter found is not a GISDataFormatter
 */
public static GISDataFormatter createFormatter(String outputFormat, Parameters p)
        throws IOException, ProcessingException {
    if (p.isdefault) {
        throw new ProcessingException("Caller is required to use non-default Parameters; "
                + "\nat least set the output options, folder, jobname, etc.");
    }
    Object candidate = FormatterFactory.getInstance(outputFormat);
    if (candidate == null) {
        throw new ProcessingException("Wrong formatter?");
    }
    // The format name can come straight from the command line; report a
    // non-GIS formatter as a ProcessingException instead of letting an
    // unchecked ClassCastException escape.
    if (!(candidate instanceof GISDataFormatter)) {
        throw new ProcessingException("Formatter for format '" + outputFormat
                + "' is not a GIS data formatter: " + candidate.getClass().getName());
    }
    GISDataFormatter formatter = (GISDataFormatter) candidate;
    formatter.setParameters(p);
    // formatter.setOutputDir(params.outputDir);
    formatter.setOutputFilename(p.getJobName() + formatter.outputExtension);
    return formatter;
}
/**
 * Geocodes the author's profile data for a tweet and writes any result to
 * the user output.
 *
 * <pre>
 * If the profile has a coordinate value (author_xy_val):
 *     extract coordinates from it
 * else if the profile has a location (author_location):
 *     extract coordinates from the location text
 * If no coordinate was found and a location is present:
 *     geocode the location as a place name
 * Write the result if anything matched.
 * </pre>
 *
 * Nothing is written when the tweet carries neither value, or when nothing
 * matched. A failure in the place name geocoder is logged and treated as
 * "no match".
 *
 * @param tw tweet whose author profile fields are geocoded
 */
public void geocodeTweetUser(Tweet tw) {
    final boolean hasCoordinate = tw.author_xy_val != null;
    final boolean hasLocation = tw.author_location != null;
    // Skip only when there is nothing at all to work with. Requiring both
    // values would make the location-only branch below unreachable.
    if (!hasCoordinate && !hasLocation) {
        return;
    }
    ExtractionResult res = new ExtractionResult(tw.id);
    res.addAttribute("timestamp", tw.pub_date);
    res.addAttribute("author", tw.author);
    res.addAttribute("tweet", tw.getText());
    /*
     * If User profile location or geo coord is a Coordinate... parse and add to matched locations
     */
    if (hasCoordinate) {
        res.matches = userlocX.extract(new TextInput(tw.id, tw.author_xy_val));
    } else {
        res.matches = userlocX.extract(new TextInput(tw.id, tw.author_location));
    }
    /*
     * If User profile is a place name, attempt to match it and disambiguate.
     * Only possible when a location string is actually present.
     */
    if (hasLocation && (res.matches == null || res.matches.isEmpty())) {
        try {
            res.matches = geocoder.extract(new TextInput(tw.id, tw.author_location));
        } catch (Exception userErr) {
            log.error("Geocoding error with Users?", userErr);
        }
    }
    if (res.matches == null || res.matches.isEmpty()) {
        return;
    }
    userOutput.writeGeocodingResult(res);
}
/**
 * Counts the tweet and, when its status text is non-empty, tags all places in
 * that text, filters the matches and writes them to the tweet output. Any
 * exception raised while doing so is logged and does not propagate.
 * Progress and memory usage are reported every {@code batch} records.
 *
 * @param tw tweet to geocode
 */
public void geocodeTweet(Tweet tw) {
    recordCount += 1;

    final boolean hasText = !(tw.getText() == null || tw.getText().isEmpty());
    if (hasText) {
        try {
            final ExtractionResult tagged = new ExtractionResult(tw.id);
            // Place name tagger may not work if content has mostly lower case proper names.!!!! TODO: allow mixed case;
            tagged.matches = geocoder.extract(new TextInput(tw.id, tw.getText()));
            tagged.addAttribute("timestamp", tw.pub_date);
            tagged.addAttribute("tweet", tw.getText());
            tagged.addAttribute("author", tw.author);
            enrichResults(tagged.matches);
            tweetOutput.writeGeocodingResult(tagged);
        } catch (Exception failure) {
            log.error("Geocoding error?", failure);
        }
    }

    final boolean atBatchBoundary = recordCount % batch == 0 && recordCount > 0;
    if (!atBatchBoundary) {
        return;
    }
    log.info("ROW #" + recordCount);
    geocoder.reportMemory();
}
/** Lower-cased match texts already seen in the list currently being filtered. */
private Set<String> distinct_names = new HashSet<String>();

/**
 * Marks matches that should not appear in the GIS output as filtered out.
 *
 * A match is filtered out when its lower-cased text
 * contains " tnt ", repeats an earlier match in the same list, or
 * is on the stop list and not on the pass list.
 *
 * Filtering of short terms that are not countries is still to be done
 * (pending name tagger refactoring); short terms currently pass unchanged.
 *
 * @param matches matches from a single tweet; entries are flagged in place
 */
private void enrichResults(List<TextMatch> matches) {
    distinct_names.clear();
    for (TextMatch match : matches) {
        final String name = match.getText().toLowerCase();

        if (name.contains(" tnt ")) {
            match.setFilteredOut(true);
        }

        // Set.add() reports false when the name was already tracked, i.e. a duplicate.
        final boolean firstOccurrence = distinct_names.add(name);
        if (!firstOccurrence) {
            match.setFilteredOut(true);
        }

        // The pass list wins over the stop list.
        final boolean stopped = !tweet_pass.contains(name) && tweet_stop.contains(name);
        if (stopped) {
            match.setFilteredOut(true);
            if (debug) {
                log.debug("Filter out:" + name);
            }
        }
        // TBD: terms shorter than 4 characters that are neither countries nor
        // administrative places are not filtered yet.
    }
}
/**
 * Cleans up status text in three passes: TextUtils.fast_replace with "\n\r"
 * and a single space (line ending removal), then emoticon removal, then
 * symbol removal.
 *
 * @param x raw text
 * @return the text after all three passes
 */
private String scrubText(String x) {
    return TextUtils.removeSymbols(
            TextUtils.removeEmoticons(
                    TextUtils.fast_replace(x, "\n\r", " ")));
}
/**
 * Inserts a space after every '#' so that the hash mark is separated from
 * the text that follows it.
 *
 * @param t text to process
 * @return {@code t} with each "#" replaced by "# "
 */
private String separateHashMark(String t) {
    final String hashMark = "#";
    final String detachedHashMark = "# ";
    return t.replace(hashMark, detachedHashMark);
}
/**
 * Parses one JSON record into {@code tw} and replaces its status text with a
 * cleaned-up version (hash marks separated, line endings, emoticons and
 * symbols removed). A tweet without status text is left as parsed.
 *
 * Need references to current methodologies for what data is available,
 * reliable, etc and where/when to use it.
 *
 * @param json one JSON tweet record
 * @param tw tweet object to populate
 * @throws ParseException if the record cannot be parsed or cleaned up; the
 *             underlying exception is attached as the cause
 */
private void readTweet(String json, Tweet tw) throws ParseException {
    try {
        JSONObject twj = JSONObject.fromObject(json.trim());
        tw.fromJSON(twj);
        // RESET using a cleaned up status text.
        // A missing text is not a parse failure: geocodeTweet() skips empty
        // text and the author profile can still be geocoded.
        String text = tw.getText();
        if (text != null) {
            tw.setText(scrubText(separateHashMark(text)));
        }
    } catch (Exception twerr) {
        ParseException failure = new ParseException("Failed to parse Tweet " + twerr.getMessage(), 0);
        // ParseException has no cause-taking constructor; keep the original stack trace.
        failure.initCause(twerr);
        throw failure;
    }
}
/** process() skips input lines whose 1-based line count is below this value. */
public static int START_ROW = 0;
/** process() stops once this many records were handled; only applied when greater than zero. */
public static int MAX_ROWS = -10000;
/**
 * Reads the input file, one JSON tweet per line, and geocodes both the tweet
 * text and the author profile of every record. Lines before START_ROW are
 * skipped and processing stops after MAX_ROWS records when MAX_ROWS is
 * positive. Lines that fail to parse are logged and skipped.
 *
 * The input is decoded as UTF-8. The reader is closed and shutdown() is
 * called when processing ends, whether it ends normally or with an exception.
 *
 * @param path path of the input file
 * @throws IOException if the file cannot be opened or read
 * @throws ProcessingException declared for callers; not raised directly here
 */
public void process(String path) throws IOException, ProcessingException {
    File input = new File(path);
    LineNumberReader io = null;
    try {
        // JSON text is UTF-8; do not depend on the platform default charset.
        io = new LineNumberReader(new InputStreamReader(new FileInputStream(input), "UTF-8"));
        String line;
        Tweet tw = new Tweet();
        int linecount = 0;
        while ((line = io.readLine()) != null) {
            ++linecount;
            if (linecount < START_ROW) {
                continue;
            }
            try {
                tw.reset();
                readTweet(line, tw);
                geocodeTweet(tw);
                geocodeTweetUser(tw);
            } catch (ParseException err) {
                // throw new ProcessingException(err);
                log.error("At line #" + linecount + " we failed to parse " + line, err);
            }
            if (recordCount >= MAX_ROWS && MAX_ROWS > 0) {
                break;
            }
        }
    } finally {
        // Release the input and finish the outputs even when reading failed,
        // so that results written so far are not lost.
        try {
            if (io != null) {
                io.close();
            }
        } finally {
            shutdown();
        }
    }
}
/**
 * Closes connections and saves the output: cleans up the geocoder, then
 * finishes the tweet output and the user output. Each step is attempted even
 * if an earlier one throws, so a failing geocoder cleanup does not leave the
 * output files unfinished; the exception still propagates afterwards.
 */
public void shutdown() {
    try {
        if (geocoder != null) {
            geocoder.cleanup();
        }
    } finally {
        try {
            if (tweetOutput != null) {
                tweetOutput.finish();
            }
        } finally {
            if (userOutput != null) {
                userOutput.finish();
            }
        }
    }
}
/**
 * Command line entry point.
 *
 * <pre>
 * Options:
 *   -n jobname    required; names the output files
 *   -i inputfile  required; file with one JSON tweet per line
 *   -f format     optional; output format name, "CSV" when omitted
 * </pre>
 *
 * When a required option is missing, a usage message is printed to standard
 * error and nothing is processed. Any exception raised by the job is printed
 * with its stack trace.
 *
 * @param args command line arguments
 */
public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("TweetGeocoder", args, "n:i:f:");
    try {
        String jobname = null;
        String inputfile = null;
        int c;
        while ((c = opts.getopt()) != -1) {
            switch (c) {
            case 'n':
                jobname = opts.getOptarg();
                break;
            case 'i':
                inputfile = opts.getOptarg();
                break;
            case 'f':
                formatType = opts.getOptarg();
                break;
            default:
                // Unrecognized options are ignored, as before.
                break;
            }
        }
        // Without these the job would write "null"-named output files or
        // fail with a NullPointerException when opening the input.
        if (jobname == null || inputfile == null) {
            System.err.println("Usage: TweetGeocoder -n <jobname> -i <inputfile> [-f <format>]");
            return;
        }
        final TweetGeocoder job = new TweetGeocoder(jobname);
        job.process(inputfile);
    } catch (Exception err) {
        err.printStackTrace();
    }
}
/**
 * A Tweet that additionally carries a set of names. The set is emptied
 * whenever the tweet is reset, so one instance can be reused across records.
 */
class TweetPlus extends Tweet {
    /** Names tracked for the current tweet. */
    public Set<String> names = new HashSet<String>();

    /** Creates an empty tweet; the Tweet no-argument constructor runs implicitly. */
    public TweetPlus() {
    }

    /** Resets the inherited tweet state, then discards all tracked names. */
    @Override
    public void reset() {
        super.reset();
        this.names.clear();
    }
}
}