package eu.socialsensor.twcollect.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import twitter4j.Status;
import twitter4j.TwitterException;
import twitter4j.json.DataObjectFactory;

/**
 * Command-line utility that reads files containing one JSON-serialized tweet
 * per line and either extracts a subset of fields from them or prints
 * aggregate statistics about them.
 */
public class SerializedStreamReader {

	/**
	 * Collects every file in the current working directory whose name contains
	 * {@code ".json."} and writes the id, creation timestamp and author screen
	 * name of each tweet created inside a fixed time window to
	 * {@code rehearsal_meta_filtered.txt}.
	 *
	 * @param args ignored
	 * @throws IOException if the working directory cannot be listed or a file
	 *         cannot be read or written
	 * @throws TwitterException if a line cannot be parsed as a tweet
	 */
	public static void main(String[] args) throws IOException, TwitterException {
		File jsonDir = new File("./");
		File[] files = jsonDir.listFiles(new FilenameFilter() {
			@Override
			public boolean accept(File dir, String name) {
				return name.contains(".json.");
			}
		});
		// File.listFiles returns null (not an empty array) when the path is
		// not a directory or an I/O error occurs; fail with a clear message
		// instead of a NullPointerException.
		if (files == null) {
			throw new IOException("Cannot list files in directory: " + jsonDir.getAbsolutePath());
		}

		List<String> jsonFileList = new ArrayList<String>(files.length);
		for (File file : files) {
			jsonFileList.add(file.getCanonicalPath());
		}

		// Inclusive bounds in epoch milliseconds; the window is exactly 24 hours
		// (2014-02-17 18:00 UTC to 2014-02-18 18:00 UTC).
		final long minTime = 1392660000000L;
		final long maxTime = 1392746400000L;

		//printJsonFileSummary(jsonFileList);
		extractSubsetOfTweetFields(jsonFileList, "rehearsal_meta_filtered.txt",
				new StatusTransformer() {
					@Override
					public String extractLine(Status status) {
						String id = String.valueOf(status.getId());
						String timestamp = String.valueOf(status.getCreatedAt().getTime());
						String username = status.getUser().getScreenName();
						return id + "\t" + timestamp + "\t" + username;
					}

					@Override
					public Map<String, String> extractKeyValues(Status status) {
						// we don't care about this
						return new HashMap<String, String>();
					}
				},
				new StatusFilterer() {
					@Override
					public boolean acceptStatus(Status status) {
						long created = status.getCreatedAt().getTime();
						return created >= minTime && created <= maxTime;
					}
				});
	}

	/**
	 * Reads every line of every given file as a JSON tweet and, for each tweet
	 * accepted by {@code filterer}, writes the line produced by
	 * {@code transformer} followed by a line separator to {@code outputFile}.
	 * The path of each input file is printed to standard output before it is
	 * processed, and the output is flushed after each input file.
	 *
	 * @param jsonFiles paths of the input files, one JSON tweet per line
	 * @param outputFile path of the file to (over)write
	 * @param transformer produces the output line for an accepted tweet
	 * @param filterer decides which tweets are written
	 * @throws IOException if a file cannot be opened, read or written
	 * @throws TwitterException if a line cannot be parsed as a tweet
	 */
	public static void extractSubsetOfTweetFields(List<String> jsonFiles, String outputFile,
			StatusTransformer transformer, StatusFilterer filterer) throws IOException, TwitterException {
		BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream(outputFile), FileUtil.UTF8));
		try {
			for (String jsonFile : jsonFiles) {
				System.out.println(jsonFile);
				BufferedReader reader = openReader(jsonFile);
				try {
					String line;
					while ((line = reader.readLine()) != null) {
						Status status = DataObjectFactory.createStatus(line);
						if (filterer.acceptStatus(status)) {
							writer.append(transformer.extractLine(status));
							writer.newLine();
						}
					}
				} finally {
					// Close even when parsing or a callback throws, so the
					// file handle is not leaked.
					reader.close();
				}
				writer.flush();
			}
		} finally {
			// Closing also flushes whatever was written before a failure.
			writer.close();
		}
	}

	/**
	 * Prints summary statistics for the tweets stored in the given files to
	 * standard output: the covered time period, and the number of tweets,
	 * geo-tagged tweets, retweets, replies and distinct users. The path of
	 * each input file is printed before it is processed.
	 *
	 * @param jsonFiles paths of the input files, one JSON tweet per line
	 * @throws IOException if a file cannot be opened or read
	 * @throws TwitterException if a line cannot be parsed as a tweet
	 */
	public static void printJsonFileSummary(List<String> jsonFiles) throws IOException, TwitterException {
		// aggregators
		int count = 0;
		int countGeo = 0;
		int countRetweet = 0;
		int countResponses = 0;
		long minTime = Long.MAX_VALUE;
		long maxTime = 0;
		Date minDate = null;
		Date maxDate = null;
		Set<Long> users = new HashSet<Long>();

		for (String jsonFile : jsonFiles) {
			System.out.println(jsonFile);
			BufferedReader reader = openReader(jsonFile);
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					Status status = DataObjectFactory.createStatus(line);
					count++;
					users.add(status.getUser().getId());
					if (status.isRetweet()) {
						countRetweet++;
					}
					if (status.getInReplyToStatusId() > 0) {
						countResponses++;
					}
					if (status.getGeoLocation() != null) {
						countGeo++;
					}
					Date createdAt = status.getCreatedAt();
					long tstamp = createdAt.getTime();
					if (tstamp < minTime) {
						minTime = tstamp;
						minDate = createdAt;
					}
					if (tstamp > maxTime) {
						maxTime = tstamp;
						maxDate = createdAt;
					}
				}
			} finally {
				reader.close();
			}
		}

		if (count == 0) {
			// No tweet was read, so there is no period to report; the date
			// fields are still null and must not be dereferenced.
			System.out.println("Period: [n/a,n/a]");
		} else {
			System.out.println("Period: [" + new Timestamp(minTime) + "," + new Timestamp(maxTime) + "]");
			System.out.println("Period: [" + minDate.toString() + "," + maxDate.toString() + "]");
		}
		System.out.println("#tweets: " + count);
		System.out.println("#geo: " + countGeo);
		System.out.println("#retweets: " + countRetweet);
		System.out.println("#replies: " + countResponses);
		System.out.println("#users: " + users.size());
	}

	/** Opens the file at {@code path} for buffered reading, decoded with {@code FileUtil.UTF8}. */
	private static BufferedReader openReader(String path) throws IOException {
		return new BufferedReader(new InputStreamReader(new FileInputStream(path), FileUtil.UTF8));
	}
}