package eu.socialsensor.twcollect;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
/**
 * Utility class for downloading a set of tweets (using their IDs) and keeping
 * track of some statistics.
 * <p>
 * Two entry points are offered: {@link #downloadIds(String, String, boolean)}
 * fetches tweets one after the other, while
 * {@link #downloadIdsMultiThread(String, String, boolean, int)} fetches them
 * through a fixed-size thread pool. Both write responses to a temporary file
 * ({@code <responsesLogFile>.tmp}) which replaces the responses log file once
 * the run is over.
 * <p>
 * Instances are not thread-safe: the pending-task counter is a plain
 * {@code int} and is meant to be touched by the submitting thread only.
 *
 * @author kleinmind
 */
public class TweetCorpusDownloader {

    /** Name of the charset used for reading and writing the responses log file. */
    protected static final String UTF8 = "UTF-8";

    // fields used by the multi-threading fetching method
    private final ExecutorService downloadExecutor;
    private final CompletionService<TweetFieldsResponse> pool;
    /** Number of tasks that were submitted but whose result has not been consumed yet. */
    private int numPendingTasks;
    /** Upper bound for {@link #numPendingTasks}; see {@link #canAcceptMoreTasks()}. */
    private final int maxNumPendingTasks;

    /**
     * Creates a downloader backed by a fixed thread pool. At most
     * {@code nrThreads * 10} fetch tasks may be pending at any time.
     *
     * @param nrThreads This matters if one wants to use the multi-threading method
     */
    public TweetCorpusDownloader(int nrThreads) {
        downloadExecutor = Executors.newFixedThreadPool(nrThreads);
        pool = new ExecutorCompletionService<TweetFieldsResponse>(downloadExecutor);
        numPendingTasks = 0;
        maxNumPendingTasks = nrThreads * 10;
    }

    // very simple example of multi-threading downloading
    public static void main(String[] args) {
        String idFile = "tweets_200.txt";
        String responseFile = "responses.txt";
        try {
            downloadIdsMultiThread(idFile, responseFile, true, 10);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Lists the names of the files in a directory that are assumed to contain
     * tweet IDs, i.e. the files whose name ends with "txt".
     *
     * @param idFileDirectory path of the directory to scan
     * @return the matching file names (not full paths); an empty array if the
     *         path is not a directory or the directory cannot be listed
     */
    protected static String[] getIdFiles(String idFileDirectory) {
        File idFileDir = new File(idFileDirectory);
        if (!idFileDir.isDirectory()) {
            return new String[0];
        }
        String[] files = idFileDir.list(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith("txt");
            }
        });
        // File.list() returns null when an I/O error occurs, even for a directory
        return files != null ? files : new String[0];
    }

    /**
     * Downloads the tweets listed in {@code idsFile} sequentially.
     * <p>
     * I/O problems are reported on standard error and do not propagate. If the
     * IDs file or the temporary output file cannot be opened, the responses log
     * file is left untouched. If reading or writing fails midway, whatever was
     * written to the temporary file still replaces the responses log file, so
     * that a later run with {@code resume == true} can pick up from there.
     *
     * @param idsFile          file with tweet IDs, one tweet ID per line; blank
     *                         lines are skipped and surrounding whitespace is ignored
     * @param responsesLogFile file where responses will be logged - if it
     *                         already exists, it will be used for resuming
     * @param resume           if true, the responsesLogFile will be used as a
     *                         starting point (no redownloading); if false, all
     *                         IDs will be redownloaded and the responsesLogFile
     *                         will be overwritten
     */
    public static void downloadIds(String idsFile, String responsesLogFile,
            final boolean resume) {
        String tempOutput = responsesLogFile + ".tmp";
        boolean tempOpened = false;

        try (BufferedReader reader = new BufferedReader(new FileReader(idsFile))) {
            Set<String> existingIds = new HashSet<String>();
            if (resume && new File(responsesLogFile).exists()) {
                existingIds = TweetFieldsResponse.readIds(responsesLogFile);
                // copy "existingIds" lines to temporary output file
                copyExistingIds(responsesLogFile, tempOutput, existingIds);
            }

            try (BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(tempOutput, resume), UTF8))) {
                tempOpened = true;
                TweetFieldsFetcher fetcher = new TweetFieldsFetcher();
                int countLine = 0;
                String line;
                while ((line = reader.readLine()) != null) {
                    countLine++;
                    String tweetId = line.trim();
                    if (tweetId.isEmpty() || existingIds.contains(tweetId)) {
                        continue;
                    }
                    System.out.println(countLine);
                    TweetFieldsResponse response = fetcher.fetchTweetFields(tweetId);
                    writer.write(response.toString());
                    writer.newLine();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        if (!tempOpened) {
            // nothing was (re)written, so there is nothing to promote
            return;
        }
        // copy temporary file to original response file
        try {
            Files.move(Paths.get(tempOutput), Paths.get(responsesLogFile),
                    StandardCopyOption.REPLACE_EXISTING);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Same as {@link #downloadIds(String, String, boolean)}, but the tweets are
     * fetched concurrently. Use with caution!
     * <p>
     * IDs are read lazily from {@code idsFile}: new fetch tasks are submitted
     * as long as the downloader accepts them, otherwise the method blocks until
     * a pending task finishes. The method returns once the IDs file has been
     * read completely and every submitted task has either completed or failed.
     * The temporary file replaces the responses log file only when the loop
     * finishes normally.
     *
     * @param idsFile          file with tweet IDs, one tweet ID per line; blank
     *                         lines are skipped and surrounding whitespace is ignored
     * @param responsesLogFile file where responses will be logged - if it
     *                         already exists, it will be used for resuming
     * @param resume           if true, IDs already found in the responsesLogFile
     *                         are not downloaded again
     * @param nrThreads        number of threads used to fetch twitter URLs
     * @throws Exception if the IDs file cannot be read, the calling thread is
     *                   interrupted, or the temporary file cannot be moved
     */
    public static void downloadIdsMultiThread(String idsFile,
            String responsesLogFile, boolean resume, int nrThreads) throws Exception {
        String tempOutput = responsesLogFile + ".tmp";

        try (BufferedReader reader = new BufferedReader(new FileReader(idsFile))) {
            Set<String> existingIds = new HashSet<String>();
            if (resume && new File(responsesLogFile).exists()) {
                existingIds = TweetFieldsResponse.readIds(responsesLogFile);
                System.out.println("Loaded " + existingIds.size() + " ids");
                // copy "existingIds" lines to temporary output file
                copyExistingIds(responsesLogFile, tempOutput, existingIds);
            }

            TweetCorpusDownloader downloader = new TweetCorpusDownloader(nrThreads);
            // Not yet sure whether a ParallelWriter is really necessary.
            ParallelWriter pwriter = new ParallelWriter(new File(tempOutput), resume);
            new Thread(pwriter).start();

            int submittedCounter = 0;
            int completedCounter = 0;
            int failedCounter = 0;
            // becomes true once readLine() has signalled the end of the IDs file
            boolean idsExhausted = false;
            long start = System.currentTimeMillis();
            try {
                while (true) {
                    // if there are more tasks to submit and the downloader can accept more tasks then submit
                    while (!idsExhausted && downloader.canAcceptMoreTasks()) {
                        String line = reader.readLine();
                        if (line == null) {
                            idsExhausted = true;
                            break;
                        }
                        String tweetId = line.trim();
                        if (tweetId.isEmpty() || existingIds.contains(tweetId)) {
                            continue;
                        }
                        submittedCounter++;
                        downloader.submitTweetFetchTask(tweetId);
                    }

                    if (completedCounter + failedCounter < submittedCounter) {
                        // there are submitted tasks that are pending completion: consume one
                        try {
                            TweetFieldsResponse response = downloader.getTweetFieldsResponseWait();
                            pwriter.append(response.toString());
                            completedCounter++;
                            System.out.println(completedCounter + " downloads completed!");
                        } catch (InterruptedException e) {
                            // no task was consumed; restore the flag and abort the run
                            Thread.currentThread().interrupt();
                            throw e;
                        } catch (Exception e) {
                            failedCounter++;
                            System.out.println(failedCounter + " downloads failed!");
                            System.out.println(e.getMessage());
                        }
                    } else if (idsExhausted) {
                        // every submitted task has been consumed and no IDs are left
                        break;
                    }
                }
                long end = System.currentTimeMillis();
                System.out.println("Total time: " + (end - start) + " ms");
                System.out.println("Downloaded tweets: " + completedCounter);
                System.out.println("Failed tweets: " + failedCounter);
            } finally {
                // release the worker threads and the writer on every exit path
                try {
                    downloader.shutDown();
                } finally {
                    // NOTE(review): presumably end() stops the writer thread and flushes
                    // pending lines before returning - confirm against ParallelWriter
                    pwriter.end();
                }
            }
        }

        // copy temporary file to original response file
        Files.move(Paths.get(tempOutput), Paths.get(responsesLogFile),
                StandardCopyOption.REPLACE_EXISTING);
    }

    // helper methods used by the multi-threading fetching method

    /**
     * Copies to {@code tempFile} those lines of {@code responseFile} whose
     * tweet ID is contained in {@code existingIds}. Any previous content of
     * {@code tempFile} is overwritten. Lines for which no response or no tweet
     * can be obtained are skipped. I/O problems are reported on standard error
     * and do not propagate.
     *
     * @param responseFile responses log file to read from (UTF-8)
     * @param tempFile     file to write the retained lines to (UTF-8)
     * @param existingIds  IDs of the tweets whose lines should be kept
     */
    protected static void copyExistingIds(String responseFile, String tempFile, Set<String> existingIds) {
        try (BufferedReader outReader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(responseFile), UTF8));
             BufferedWriter outWriter = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(tempFile), UTF8))) {
            String line;
            while ((line = outReader.readLine()) != null) {
                TweetFieldsResponse response = TweetFieldsResponse.fromString(line);
                if (response == null || response.getTweet() == null) {
                    // defensive: a line without a tweet cannot belong to existingIds
                    continue;
                }
                if (existingIds.contains(response.getTweet().getId())) {
                    // if the id has been "properly" downloaded then copy it
                    outWriter.write(line);
                    outWriter.newLine();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Submits a fetch task for the given tweet to the thread pool and counts
     * it as pending.
     *
     * @param tweetId ID of the tweet to fetch
     */
    protected void submitTweetFetchTask(String tweetId) {
        Callable<TweetFieldsResponse> call = new TweetFetch(tweetId);
        pool.submit(call);
        numPendingTasks++;
    }

    /**
     * Retrieves the result of a finished fetch task without blocking.
     *
     * @return the response of a finished task, or {@code null} if no task has
     *         finished yet
     * @throws Exception if the finished task failed; the task is nevertheless
     *                   no longer counted as pending
     */
    protected TweetFieldsResponse getTweetFieldsResponse() throws Exception {
        Future<TweetFieldsResponse> future = pool.poll();
        if (future == null) { // no completed tasks in the pool
            return null;
        }
        try {
            return future.get();
        } finally {
            // in any case (Exception or not) the numPendingTask should be reduced
            numPendingTasks--;
        }
    }

    /**
     * Retrieves the result of the next finished fetch task, waiting for one to
     * finish if necessary.
     *
     * @return the response of the finished task
     * @throws Exception if the finished task failed (the task is then no longer
     *                   counted as pending), or if the calling thread is
     *                   interrupted while waiting (the pending count is then
     *                   left unchanged because no task was consumed)
     */
    protected TweetFieldsResponse getTweetFieldsResponseWait() throws Exception {
        // take() is kept outside the try block: if it is interrupted, no task
        // has been removed from the pool and the counter must not be reduced
        Future<TweetFieldsResponse> future = pool.take();
        try {
            return future.get();
        } finally {
            // in any case (Exception or not) the numPendingTask should be reduced
            numPendingTasks--;
        }
    }

    /**
     * Tells whether another fetch task may be submitted.
     *
     * @return {@code true} while fewer than the maximum number of tasks are pending
     */
    protected boolean canAcceptMoreTasks() {
        return numPendingTasks < maxNumPendingTasks;
    }

    /**
     * Stops accepting new tasks and waits up to 10 seconds for the running
     * ones to terminate.
     *
     * @throws InterruptedException if interrupted while waiting
     */
    protected void shutDown() throws InterruptedException {
        downloadExecutor.shutdown();
        downloadExecutor.awaitTermination(10, TimeUnit.SECONDS);
    }
}