package edu.cmu.test.geocoderTrainingGen; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.util.List; import twitter4j.GeoLocation; import twitter4j.Status; import twitter4j.TwitterException; import twitter4j.json.DataObjectFactory; import edu.cmu.geolocator.GlobalParam; import edu.cmu.geolocator.io.GetReader; import edu.cmu.geolocator.io.GetWriter; import edu.cmu.geolocator.model.LocEntityAnnotation; import edu.cmu.geolocator.model.Tweet; import edu.cmu.geolocator.parser.ParserFactory; public class TweetGen { public static String norm(String text) { return text.replace("\n", " ").replace("\r", "").replace("\t", " "); } public static void main(String argv[]) throws FileNotFoundException, UnsupportedEncodingException { GlobalParam.setGazIndex("GazIndex"); GlobalParam.setGeoNames("GeoNames"); String path = "/Users/Indri/Documents/Research_data/Disambiguation/additionalData/"; BufferedWriter bw = null; try { bw = GetWriter .getFileWriter(path+"all.txt"); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } File folder = new File( path+"coord/"); BufferedReader br = null; for (final File fileEntry : folder.listFiles()) { if (fileEntry.getName().equals(".DS_Store")) continue; try { br = GetReader.getUTF8FileReader(path+"coord/"+fileEntry.getName()); } catch (FileNotFoundException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } catch (UnsupportedEncodingException e2) { // TODO Auto-generated catch block e2.printStackTrace(); } String line = null; Status tweet = null; String text, uloc = "", udsc = "", tzone = "", pname = "", pcnty = "", ptype = ""; double lat, lon; StringBuilder parsedlocs; List<LocEntityAnnotation> locations,gazentries; GeoLocation[][] pbox = null; String bbox = null; try { while ((line = br.readLine()) != null) { text = ""; uloc = ""; udsc = ""; tzone = ""; pname = ""; pcnty = ""; ptype = ""; bbox = ""; lat = 0; lon = 0; parsedlocs = new StringBuilder(); locations = null; pbox = null; if (line.trim().length() == 0) continue; try { tweet = DataObjectFactory.createStatus(line); // System.out.println(tweet.getText().trim()); } catch (TwitterException e) { // TODO Auto-generated catch block System.err.println("Not parserable"); e.printStackTrace(); continue; } text = tweet.getText(); text = TweetGen.norm(text); if (text.trim().length() < 2) continue; if (tweet.getGeoLocation() != null) { lat = tweet.getGeoLocation().getLatitude(); lon = tweet.getGeoLocation().getLongitude(); } if (tweet.getUser() != null) { uloc = tweet.getUser().getLocation() != null ? tweet.getUser().getLocation() : ""; udsc = tweet.getUser().getDescription() != null ? tweet.getUser().getDescription() : ""; tzone = tweet.getUser().getTimeZone() != null ? tweet.getUser().getTimeZone() : ""; uloc = TweetGen.norm(uloc); udsc = TweetGen.norm(udsc); tzone = TweetGen.norm(tzone); } if (tweet.getPlace() != null) { pname = tweet.getPlace().getFullName() != null ? tweet.getPlace().getFullName() : ""; pcnty = tweet.getPlace().getCountry() != null ? tweet.getPlace().getCountry() : ""; ptype = tweet.getPlace().getPlaceType() != null ? tweet.getPlace().getPlaceType() : ""; pname = TweetGen.norm(pname); pcnty = TweetGen.norm(pcnty); ptype = TweetGen.norm(ptype); pbox = tweet.getPlace().getBoundingBoxCoordinates() != null ? tweet.getPlace() .getBoundingBoxCoordinates() : null; if (pbox != null) { bbox += "[" + pbox[0][0].getLatitude() + " " + pbox[0][0].getLongitude() + "] [" + pbox[0][1].getLatitude() + " " + pbox[0][1].getLongitude() + "] [" + pbox[0][2].getLatitude() + " " + pbox[0][2].getLongitude() + "] [" + pbox[0][3].getLatitude() + " " + pbox[0][3].getLongitude() + "]"; bbox = "\"type\"\":\"\"Polygon\"\" \"\"coordinates\"\":[[" + bbox + "]]\"\"\""; System.out.println(bbox); } } Tweet t = new Tweet(tweet); // locations = ParserFactory.getEnNERParser().parse(t); gazentries = ParserFactory.getEnToponymParser().parse(t); if (gazentries.size()==0) continue; System.out.println("Tweet is : " + tweet.getText()); System.out.println("Locations Recognized are: " + gazentries.size()); parsedlocs = new StringBuilder(); for (LocEntityAnnotation loc : gazentries) { if(gazentries.contains(loc)) parsedlocs.append("tp{").append(loc.getTokenString()).append("[ , ]").append("}tp"); } if (parsedlocs.length()==0) continue; bw.write(text + "\t" + parsedlocs.toString() + "\t" + lat + "\t" + lon + "\t" + uloc + "\t" + tzone + "\t" + udsc + "\tnull\t" + pname + "\t" + pcnty + "\t" + ptype + "\t" + bbox + "\n"); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); try { br.close(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } try { br.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } try { bw.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } }