/**
* The main class of the persister related to downloading of data from persisted collections - all REST APIs call this class methods
* to generate CSV, JSON, TXT-JSON files with or without user specified filters.
*
* @author Imran, Koushik
*/
package qa.qcri.aidr.utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import net.minidev.json.JSONObject;
import org.apache.commons.io.input.ReversedLinesFileReader;
import org.apache.commons.io.output.FileWriterWithEncoding;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.text.translate.UnicodeEscaper;
import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
import org.apache.log4j.Logger;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.io.ICsvBeanWriter;
import org.supercsv.io.ICsvMapWriter;
import qa.qcri.aidr.common.filter.FilterQueryMatcher;
import qa.qcri.aidr.common.filter.JsonQueryList;
import qa.qcri.aidr.common.filter.NominalLabel;
import qa.qcri.aidr.dbmanager.dto.HumanLabeledDocumentDTO;
import qa.qcri.aidr.dbmanager.dto.HumanLabeledDocumentList;
import qa.qcri.aidr.entity.FacebookDataFeed;
import qa.qcri.aidr.io.FileSystemOperations;
import qa.qcri.aidr.io.ReadWriteCSV;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonReader;
public class JsonDeserializer {
// Class-wide log4j logger for progress and error reporting.
private static Logger logger = Logger.getLogger(JsonDeserializer.class.getName());
private static final int BUFFER_SIZE = 10 * 1024 * 1024; // buffer size to use for buffered r/w
// Number of tweets buffered in memory before a batch is flushed to the CSV writers.
private static final int LIST_BUFFER_SIZE = 50000;
// File-name fragments used to compose the generated download file names.
private static final String HUMAN_TAGGED_FILE_PREFIX = "-human_labeled_filtered-";
private static final String ZIP_EXTENSION = ".zip";
private static final String CSV_EXTENSION = ".csv";
private static final String FACEBOOK_PREFIX = "_fb_posts";
private static final String TWEETS_PREFIX = "_last_100k_tweets";
private static final String TWEET_IDS_PREFIX = "_tweetIds_filtered";
private static final String TWEET_IDS = "_tweetIds";
private static final String CLASSIFIED = "Classified_";
private static final String FILE_SEPARATOR = "/";
private static final String STRING_SEPARATOR = "-";
// Markers distinguishing exports that keep or drop retweets.
private static final String WITH_RETWEET = "with-retweet";
private static final String WITHOUT_RETWEET = "without-retweet";
// Substring identifying volume files inside a collection folder.
private static final String VOLUME_STRING = "vol";
// Hash generator used to build per-user file-name suffixes (see the *Filtered methods).
private final MD5HashGenerator MD5Hash;
/**
 * Creates a deserializer with its own MD5 hash generator for deriving
 * user-specific download file names.
 */
public JsonDeserializer() {
MD5Hash = new MD5HashGenerator();
}
/**
 * Generates a CSV file containing the tweet IDs of every tweet persisted in the
 * JSON volumes of the given collection.
 * <p>
 * Tweets are buffered in memory ({@code LIST_BUFFER_SIZE} at a time) and flushed
 * to the CSV file in batches. When {@code downloadLimited} is true, writing stops
 * once the configured {@code DEFAULT_TWEETID_VOLUME_LIMIT} is reached.
 *
 * @param collectionCode code of the persisted collection to export
 * @param downloadLimited whether to cap the export at the configured volume limit
 * @return UI wrapper map with keys "fileName" (generated CSV file name) and
 *         "count" (number of tweet IDs written)
 */
public Map<String, Object> generateJson2TweetIdsCSV(String collectionCode, boolean downloadLimited) {
    List<String> fileNames = FileSystemOperations.getAllJSONFileVolumes(collectionCode);
    List<Tweet> tweetsList = new ArrayList<Tweet>(LIST_BUFFER_SIZE);
    ICsvBeanWriter beanWriter = null;
    String fileNameforCSVGen = collectionCode + TWEET_IDS;
    String fileName = fileNameforCSVGen + CSV_EXTENSION;
    long lastCount = 0;
    long currentCount = 0;
    int totalCount = 0;
    // The volume limit is constant for the whole run: parse it once instead of
    // re-reading and re-parsing the configuration on every line (previous code
    // did the parse inside the per-line loop).
    final int volumeLimit = downloadLimited
            ? Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
                    PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))
            : Integer.MAX_VALUE;
    try {
        ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>();
        String fileToDelete = PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH) + collectionCode + FILE_SEPARATOR + collectionCode
                + TWEET_IDS + CSV_EXTENSION;
        FileSystemOperations.deleteFile(fileToDelete); // delete if there exists a csv file with the same name
        for (String file : fileNames) {
            if (totalCount >= volumeLimit) {
                break; // limit reached: no need to open further volumes
            }
            String fileLocation = PersisterConfigurator.getInstance().getProperty(
                    PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                    + collectionCode + FILE_SEPARATOR + file;
            BufferedReader br = null;
            try {
                br = new BufferedReader(new FileReader(fileLocation));
                String line;
                while ((line = br.readLine()) != null) {
                    if (totalCount >= volumeLimit) {
                        tweetsList.clear();
                        break;
                    }
                    Tweet tweet = getTweet(line);
                    if (tweet != null) {
                        if (tweetsList.size() < LIST_BUFFER_SIZE) {
                            tweetsList.add(tweet);
                        } else {
                            // Buffer full: flush at most the remaining allowance.
                            int countToWrite = Math.min(tweetsList.size(), volumeLimit - totalCount);
                            if (countToWrite > 0) {
                                beanWriter = csv.writeCollectorTweetIDSCSV(beanWriter, tweetsList.subList(0, countToWrite),
                                        collectionCode, fileName);
                                totalCount += countToWrite;
                                lastCount = currentCount;
                                currentCount += countToWrite; // fixed: previously advanced by buffer size, not rows written
                                logger.info(collectionCode + ": Writing_tweetIds: " + lastCount + " to " + currentCount);
                            }
                            tweetsList.clear();
                            tweetsList.add(tweet);
                        }
                    }
                }
            } catch (FileNotFoundException ex) {
                logger.error(collectionCode + ": couldn't find file = " + fileLocation);
            } catch (IOException ex) {
                logger.error(collectionCode + ": IO Exception for file = " + fileLocation);
            } finally {
                // fixed: the reader used to leak whenever an exception interrupted the read loop
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException ex) {
                        logger.error(collectionCode + ": IOException closing file = " + fileLocation);
                    }
                }
            }
        } // end for
        // Flush whatever is left in the buffer, still honoring the limit.
        int countToWrite = Math.min(tweetsList.size(), volumeLimit - totalCount);
        if (countToWrite > 0) {
            beanWriter = csv.writeCollectorTweetIDSCSV(beanWriter, tweetsList.subList(0, countToWrite), collectionCode, fileName);
            totalCount += countToWrite;
            tweetsList.clear();
        }
    } finally {
        if (beanWriter != null) {
            try {
                beanWriter.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException for csv file write ");
            }
        }
    }
    tweetsList.clear();
    return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Generates a CSV of tweet IDs (with classifier columns) for every tweet in the
 * classified JSON volumes of the collection that carries at least one nominal label.
 * <p>
 * The CSV header is derived lazily from the first buffered tweet. When
 * {@code downloadLimited} is true, writing stops at the configured
 * {@code DEFAULT_TWEETID_VOLUME_LIMIT}.
 *
 * @param collectionCode code of the persisted collection to export
 * @param downloadLimited whether to cap the export at the configured volume limit
 * @return UI wrapper map with keys "fileName" (generated CSV file name) and
 *         "count" (number of rows written)
 */
public Map<String, Object> generateClassifiedJson2TweetIdsCSV(String collectionCode, final boolean downloadLimited) {
    ICsvMapWriter writer = null;
    String fileNameforCSVGen = CLASSIFIED + collectionCode + TWEET_IDS;
    String fileName = fileNameforCSVGen + CSV_EXTENSION;
    int totalCount = 0;
    // Parse the configured limit once (previously re-parsed for every line read).
    final int volumeLimit = downloadLimited
            ? Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
                    PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))
            : Integer.MAX_VALUE;
    try {
        List<String> fileNames = FileSystemOperations.getClassifiedFileVolumes(collectionCode);
        List<ClassifiedTweet> tweetsList = new ArrayList<ClassifiedTweet>(LIST_BUFFER_SIZE);
        ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>();
        String[] runningHeader = null;
        String fileToDelete = PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                + collectionCode + FILE_SEPARATOR + CLASSIFIED + collectionCode + TWEET_IDS + CSV_EXTENSION;
        logger.info(collectionCode + ": Deleteing file : " + fileToDelete);
        FileSystemOperations.deleteFile(fileToDelete); // delete if there exists a csv file with same name
        long lastCount = 0;
        long currentCount = 0;
        for (String file : fileNames) {
            String fileLocation = PersisterConfigurator.getInstance().getProperty(
                    PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                    + collectionCode + FILE_SEPARATOR + file;
            logger.info(collectionCode + ": Reading file " + fileLocation);
            if (totalCount >= volumeLimit) { // fixed: was ">", inconsistent with the per-line ">=" check
                break;
            }
            BufferedReader br = null;
            try {
                br = new BufferedReader(new FileReader(fileLocation));
                String line;
                while ((line = br.readLine()) != null) {
                    if (totalCount >= volumeLimit) {
                        tweetsList.clear();
                        break;
                    }
                    // Keep only tweets that the classifier actually labeled.
                    ClassifiedTweet tweet = getClassifiedTweet(line);
                    if (tweet != null && !tweet.getNominalLabels().isEmpty()) {
                        if (tweetsList.size() < LIST_BUFFER_SIZE) {
                            tweetsList.add(tweet);
                        } else {
                            // Buffer full: flush at most the remaining allowance.
                            int countToWrite = Math.min(tweetsList.size(), volumeLimit - totalCount);
                            if (countToWrite > 0) {
                                if (0 == totalCount) {
                                    // First batch: derive the CSV header from a sample tweet.
                                    runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetIDCSVHeader,
                                            ReadWriteCSV.FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweetsList.get(0));
                                }
                                writer = csv.writeClassifiedTweetIDsCSV(runningHeader, writer, tweetsList.subList(0, countToWrite),
                                        collectionCode, fileName);
                                lastCount = currentCount;
                                currentCount += countToWrite;
                                logger.info(collectionCode + ": Writing_tweetIds: " + lastCount + " to " + currentCount);
                                totalCount += countToWrite;
                            }
                            tweetsList.clear();
                            tweetsList.add(tweet);
                        }
                    }
                }
            } catch (FileNotFoundException ex) {
                logger.error(collectionCode + ": couldn't find file = " + fileLocation);
            } catch (IOException ex) {
                logger.error(collectionCode + ": IO Exception for file = " + fileLocation);
            } finally {
                // fixed: the reader used to leak whenever an exception interrupted the read loop
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException ex) {
                        logger.error(collectionCode + ": IOException closing file = " + fileLocation);
                    }
                }
            }
        } // end for
        // Final flush of the remaining buffered tweets, capped to the limit.
        int countToWrite = Math.min(tweetsList.size(), volumeLimit - totalCount);
        if (countToWrite > 0) {
            if (0 == totalCount) {
                runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetIDCSVHeader,
                        ReadWriteCSV.FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweetsList.get(0));
            }
            writer = csv.writeClassifiedTweetIDsCSV(runningHeader, writer, tweetsList.subList(0, countToWrite), collectionCode,
                    fileName);
            totalCount += countToWrite;
            tweetsList.clear();
        }
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException for csv file write ");
            }
        }
    }
    return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Generates a CSV of tweet IDs together with their classifier output for the given
 * collection, filtered by the user supplied query constraints, then zips the result
 * and returns its download URL.
 *
 * @param collectionCode code of the persisted collection
 * @param queryList user provided filter constraints; may be null for "no filtering"
 * @param downloadLimited whether to cap the export at the configured
 *            DEFAULT_TWEETID_VOLUME_LIMIT
 * @param userName requesting user; an MD5 hash of it is appended to the file name
 * @return UI wrapper map with "fileName" (download URL of the zipped CSV) and
 *         "count" (number of rows written)
 */
public Map<String, Object> generateClassifiedJson2TweetIdsAndClassifiersCSVFiltered(final String collectionCode, final JsonQueryList queryList,
final boolean downloadLimited, String userName) {
ICsvMapWriter writer = null;
// File name carries a per-user MD5 suffix so concurrent users do not clobber each
// other's downloads; fall back to the plain name if hashing fails.
String fileNameforCSVGen = null;
try {
fileNameforCSVGen = collectionCode + TWEET_IDS_PREFIX + STRING_SEPARATOR + MD5Hash.getMD5Hash(userName);
} catch (Exception e) {
fileNameforCSVGen = collectionCode + TWEET_IDS_PREFIX;
}
String fileName = fileNameforCSVGen + CSV_EXTENSION;
String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode;
List<ClassifiedTweet> tweetsList = new ArrayList<ClassifiedTweet>(LIST_BUFFER_SIZE);
String fileToDelete = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode + FILE_SEPARATOR + fileName;
int totalCount = 0;
try {
List<String> fileNames = FileSystemOperations.getAllJSONFileVolumes(collectionCode);
// Newest volumes first: sort ascending by name, then reverse.
Collections.sort(fileNames);
Collections.reverse(fileNames);
ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>(collectionCode);
String[] runningHeader = null;
// Each volume is read bottom-up so the most recent tweets are exported first.
ReversedLinesFileReader br = null;
logger.info(collectionCode + ": Deleteing file : " + fileToDelete + ZIP_EXTENSION);
FileSystemOperations.deleteFile(fileToDelete + ZIP_EXTENSION); // delete if there exists a csv file with same name
// Build the filter matcher once from the user constraints.
FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
if (queryList != null)
tweetFilter.queryList.setConstraints(queryList);
tweetFilter.buildMatcherArray();
for (String file : fileNames) {
if (downloadLimited
&& totalCount >= Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))) {
break;
}
String fileLocation = PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH) + collectionCode + FILE_SEPARATOR + file;
logger.info(collectionCode + ": Reading file " + fileLocation);
try {
File f = new File(fileLocation);
br = new ReversedLinesFileReader(f);
String line;
while ((line = br.readLine()) != null) {
if (downloadLimited
&& totalCount >= Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))) {
tweetsList.clear();
break;
}
// Otherwise add to write buffer
ClassifiedTweet tweet = getClassifiedTweet(line, collectionCode, Boolean.FALSE);
// Lazily create the writer and header from the very first parsed line.
// NOTE(review): this uses ClassifiedTweetCSVHeader / writeClassifiedTweetsCSV
// while all later flushes use the tweet-ID header and writeClassifiedTweetIDsCSV —
// looks inconsistent; confirm which header this export is meant to have.
// NOTE(review): 'tweet' may be null here if the first line fails to parse —
// verify setClassifiedTweetHeader tolerates null.
if (0 == totalCount && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet);
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList, collectionCode, fileName, writer);
}
if (tweet != null && satisfiesFilter(queryList, tweetFilter, tweet)) {
// NOTE(review): the volume-limit bound below is applied even when
// downloadLimited is false — confirm that is intended.
if (tweetsList.size() < LIST_BUFFER_SIZE
&& tweetsList.size() < Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))) {
tweetsList.add(tweet);
} else {
// Buffer full: flush at most the remaining allowance.
int countToWrite;
if (downloadLimited) {
countToWrite = Math.min(
tweetsList.size(),
Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))
- totalCount);
} else {
countToWrite = tweetsList.size();
}
if (countToWrite > 0) {
writer = csv.writeClassifiedTweetIDsCSV(runningHeader, writer, tweetsList.subList(0, countToWrite),
collectionCode, fileName);
totalCount += countToWrite;
}
tweetsList.clear();
tweetsList.add(tweet);
}
}
}
br.close();
} catch (FileNotFoundException ex) {
logger.error(collectionCode + ": couldn't find file = " + fileLocation);
} catch (IOException ex) {
// NOTE(review): 'br' is not closed on this path — potential reader leak.
logger.error(collectionCode + ": IO Exception for file = " + fileLocation);
}
} // end for
// Final flush of the remaining buffer, capped to the limit when applicable.
int countToWrite = tweetsList.size();
if (downloadLimited) {
countToWrite = Math.min(
tweetsList.size(),
Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))
- totalCount);
}
if (countToWrite > 0 && !tweetsList.isEmpty()) {
if (0 == totalCount && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetIDCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweetsList.get(0));
}
writer = csv.writeClassifiedTweetIDsCSV(runningHeader, writer, tweetsList.subList(0, countToWrite), collectionCode,
fileName);
totalCount += countToWrite;
tweetsList.clear();
}
// In case if there wasn't any tweet. Just create an empty csv file.
if (countToWrite == 0 && tweetsList.isEmpty() && 0 == totalCount && runningHeader == null && writer == null) {
runningHeader = csv.resetClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetIDCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, 0);
writer = csv.writeClassifiedTweetIDsCSV(runningHeader, writer, tweetsList, collectionCode, fileName);
}
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException ex) {
logger.error(collectionCode + ": IOException for csv file write ");
}
}
}
tweetsList.clear();
// Zip the CSV and expose it through the persister download URL.
FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileName);
fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
+ FILE_SEPARATOR + compressor.zip();
logger.info(collectionCode + ": Deleteing file : " + fileToDelete);
FileSystemOperations.deleteFile(fileToDelete); // delete if there exist a csv file with same name
return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Generates a CSV containing ONLY tweet IDs for the given collection, filtered by
 * the user supplied query constraints and capped at {@code exportLimit}, then zips
 * the result and returns its download URL.
 *
 * @param collectionCode code of the persisted collection
 * @param exportLimit maximum number of tweet IDs to export
 * @param queryList user provided filter constraints; may be null for "no filtering"
 * @param userName requesting user; an MD5 hash of it is appended to the file name
 * @param removeRetweet if true, retweets are excluded while parsing
 * @return UI wrapper map with "fileName" (download URL of the zipped CSV) and
 *         "count" (number of tweet IDs written)
 */
public Map<String, Object> generateClassifiedJson2TweetIdsOnlyCSVFiltered(final String collectionCode, final Integer exportLimit, final JsonQueryList queryList,
String userName, final boolean removeRetweet) {
ICsvMapWriter writer = null;
// File name carries a per-user MD5 suffix; fall back to the plain name if hashing fails.
String fileNameforCSVGen = null;
try {
fileNameforCSVGen = collectionCode + TWEET_IDS_PREFIX + STRING_SEPARATOR + MD5Hash.getMD5Hash(userName);
} catch (Exception e) {
fileNameforCSVGen = collectionCode + TWEET_IDS_PREFIX;
}
String fileName = fileNameforCSVGen + CSV_EXTENSION;
String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode;
List<ClassifiedTweet> tweetsList = new ArrayList<ClassifiedTweet>(LIST_BUFFER_SIZE);
String fileToDelete = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode + FILE_SEPARATOR + fileName;
// NOTE(review): exportLimit is auto-unboxed in the comparisons below; a null value
// would throw NullPointerException — confirm callers never pass null.
int totalCount = 0;
try {
List<String> fileNames = FileSystemOperations.getAllJSONFileVolumes(collectionCode);
// Newest volumes first: sort ascending by name, then reverse.
Collections.sort(fileNames);
Collections.reverse(fileNames);
ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>(collectionCode);
String[] runningHeader = null;
// Each volume is read bottom-up so the most recent tweets are exported first.
ReversedLinesFileReader br = null;
logger.info(collectionCode + ": Deleteing file : " + fileToDelete + ZIP_EXTENSION);
FileSystemOperations.deleteFile(fileToDelete + ZIP_EXTENSION); // delete if there exists a csv file with same name
// Build the filter matcher once from the user constraints.
FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
if (queryList != null)
tweetFilter.queryList.setConstraints(queryList);
tweetFilter.buildMatcherArray();
for (String file : fileNames) {
if (totalCount >= exportLimit) {
break;
}
String fileLocation = PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH) + collectionCode + FILE_SEPARATOR + file;
logger.info(collectionCode + ": Reading file " + fileLocation);
try {
File f = new File(fileLocation);
br = new ReversedLinesFileReader(f);
String line;
while ((line = br.readLine()) != null) {
if (totalCount >= exportLimit) {
tweetsList.clear();
break;
}
// Otherwise add to write buffer
ClassifiedTweet tweet = getClassifiedTweet(line, collectionCode, removeRetweet);
// Lazily create the writer; the header is just the single tweetID column.
if (0 == totalCount && runningHeader == null && writer == null) {
runningHeader = new String[]{"tweetID"};
writer = csv.writeClassifiedTweetIDsOnlyCSV(runningHeader, writer, tweetsList,
collectionCode, fileName);
}
if (tweet != null && satisfiesFilter(queryList, tweetFilter, tweet)) {
if (tweetsList.size() < LIST_BUFFER_SIZE
&& tweetsList.size() < exportLimit) {
tweetsList.add(tweet);
} else {
// Buffer full: flush at most the remaining allowance.
int countToWrite;
countToWrite = Math.min(tweetsList.size(), exportLimit - totalCount);
if (countToWrite > 0) {
writer = csv.writeClassifiedTweetIDsOnlyCSV(runningHeader, writer, tweetsList.subList(0, countToWrite),
collectionCode, fileName);
totalCount += countToWrite;
}
tweetsList.clear();
tweetsList.add(tweet);
}
}
}
br.close();
} catch (FileNotFoundException ex) {
logger.error(collectionCode + ": couldn't find file = " + fileLocation);
} catch (IOException ex) {
// NOTE(review): 'br' is not closed on this path — potential reader leak.
logger.error(collectionCode + ": IO Exception for file = " + fileLocation);
}
} // end for
// Final flush of the remaining buffer, capped to the export limit.
int countToWrite = tweetsList.size();
countToWrite = Math.min(tweetsList.size(), exportLimit - totalCount);
if (countToWrite > 0 && !tweetsList.isEmpty()) {
if (0 == totalCount && runningHeader == null && writer == null) {
runningHeader = new String[]{"tweetID"};
}
writer = csv.writeClassifiedTweetIDsOnlyCSV(runningHeader, writer, tweetsList.subList(0, countToWrite), collectionCode,
fileName);
totalCount += countToWrite;
tweetsList.clear();
}
// In case if there wasn't any tweet. Just create an empty csv file.
if (countToWrite == 0 && tweetsList.isEmpty() && 0 == totalCount && runningHeader == null && writer == null) {
runningHeader = new String[]{"tweetID"};
writer = csv.writeClassifiedTweetIDsOnlyCSV(runningHeader, writer, tweetsList, collectionCode, fileName);
}
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException ex) {
logger.error(collectionCode + ": IOException for csv file write ");
}
}
}
tweetsList.clear();
// Zip the CSV and expose it through the persister download URL.
FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileName);
fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
+ FILE_SEPARATOR + compressor.zip();
logger.info(collectionCode + ": Deleteing file : " + fileToDelete);
FileSystemOperations.deleteFile(fileToDelete); // delete if there exist a csv file with same name
return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Returns today's date formatted as {@code yyyyMMdd}.
 * A fresh SimpleDateFormat is built per call because the class is not thread-safe.
 */
private final static String getDateTime() {
    return new SimpleDateFormat("yyyyMMdd").format(new Date());
}
/**
 * Generates a CSV of up to {@code exportLimit} collector tweets read from the JSON
 * volume files of a collection, oldest volume first.
 *
 * @param collectionCode code of the persisted collection
 * @param exportLimit maximum number of tweets to write to the CSV
 * @return the generated CSV file name, or "" if generation failed before naming
 */
public String generateJSON2CSV_100K_BasedOnTweetCount(String collectionCode, int exportLimit) {
    BufferedReader br = null;
    String fileName = "";
    ICsvBeanWriter beanWriter = null;
    try {
        String folderLocation = PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                + collectionCode;
        String fileNameforCSVGen = collectionCode + TWEETS_PREFIX;
        fileName = fileNameforCSVGen + CSV_EXTENSION;
        FileSystemOperations.deleteFile(PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                + collectionCode + FILE_SEPARATOR + fileName);
        File folder = new File(folderLocation);
        File[] listOfFiles = folder.listFiles();
        // fixed: listFiles() returns null for a missing/unreadable folder; the old code NPE'd
        if (listOfFiles == null) {
            logger.error(collectionCode + ": persister folder not found or unreadable: " + folderLocation);
            return fileName;
        }
        // Oldest volume files first (ascending last-modified time).
        Arrays.sort(listOfFiles, new Comparator<File>() {
            @Override
            public int compare(File f1, File f2) {
                return Long.valueOf(f1.lastModified()).compareTo(f2.lastModified());
            }
        });
        List<Tweet> tweetsList = new ArrayList<Tweet>();
        ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>();
        int currentSize = 0;
        createTweetList: {
            for (File f : listOfFiles) {
                String currentFileName = f.getName();
                if (currentFileName.endsWith(".json") && currentFileName.contains("vol")) {
                    logger.info(collectionCode + ": Reading file : " + f.getAbsolutePath());
                    InputStream is = new FileInputStream(f.getAbsolutePath());
                    br = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
                    String line;
                    while ((line = br.readLine()) != null) {
                        try {
                            Tweet tweet = getTweet(line);
                            if (tweet != null) {
                                if (tweetsList.size() < LIST_BUFFER_SIZE && currentSize <= exportLimit) {
                                    tweetsList.add(tweet);
                                } else {
                                    // Buffer full: flush at most the remaining allowance.
                                    int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
                                    if (countToWrite > 0) {
                                        beanWriter = csv.writeCollectorTweetsCSV(tweetsList.subList(0, countToWrite), collectionCode,
                                                fileName, beanWriter);
                                        currentSize += countToWrite;
                                    }
                                    tweetsList.clear();
                                    tweetsList.add(tweet);
                                }
                                if (beanWriter != null && currentSize >= exportLimit) {
                                    break createTweetList;
                                }
                            }
                        } catch (Exception ex) {
                            logger.error("Error while parsing the json" + ex);
                        }
                    }
                    // Per-file flush, now capped to the export limit. The old code wrote
                    // the whole buffer here uncapped AND never advanced currentSize, so
                    // the export could exceed exportLimit.
                    int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
                    if (countToWrite > 0) {
                        beanWriter = csv.writeCollectorTweetsCSV(tweetsList.subList(0, countToWrite), collectionCode, fileName, beanWriter);
                        currentSize += countToWrite;
                        logger.info(collectionCode + ": final beanWriter : " + beanWriter.getRowNumber());
                    }
                    tweetsList.clear();
                    br.close();
                    br = null;
                    if (currentSize >= exportLimit) {
                        break createTweetList;
                    }
                }
            }
        }
        // Tail flush (only reachable with buffered tweets after an early break).
        int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
        if (countToWrite > 0) {
            beanWriter = csv.writeCollectorTweetsCSV(tweetsList.subList(0, countToWrite), collectionCode, fileName, beanWriter);
            tweetsList.clear();
        }
    } catch (FileNotFoundException ex) {
        logger.error(collectionCode + ": couldn't find file");
    } catch (IOException ex) {
        logger.error(collectionCode + ": IO Exception for file");
    } finally {
        // fixed: the reader used to leak when an exception or early break skipped br.close()
        if (br != null) {
            try {
                br.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException closing json volume file");
            }
        }
        if (beanWriter != null) {
            try {
                beanWriter.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException for csv file write ");
            }
        }
    }
    return fileName;
}
/**
 * Generates a CSV of up to {@code exportLimit} classified tweets (those carrying at
 * least one nominal label) from the Classified_* JSON volume files of a collection,
 * newest volume first.
 *
 * @param collectionCode code of the persisted collection
 * @param exportLimit maximum number of tweets to write to the CSV
 * @return the generated CSV file name, or "" if generation failed before naming
 */
public String taggerGenerateJSON2CSV_100K_BasedOnTweetCount(String collectionCode, int exportLimit) {
BufferedReader br = null;
String fileName = "";
ICsvMapWriter writer = null;
try {
String folderLocation = PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode;
String fileNameforCSVGen = CLASSIFIED + collectionCode + TWEETS_PREFIX;
fileName = fileNameforCSVGen + CSV_EXTENSION;
FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileName);
File folder = new File(folderLocation);
File[] listOfFiles = folder.listFiles();
// to get only Tagger's files
// NOTE(review): listOfFiles may be null if the folder is missing/unreadable —
// the loop below would then throw NullPointerException; confirm the folder
// always exists at this point.
ArrayList<File> taggerFilesList = new ArrayList<File>();
for (int i = 0; i < listOfFiles.length; i++) {
if (StringUtils.startsWith(listOfFiles[i].getName(), CLASSIFIED)
&& StringUtils.containsIgnoreCase(listOfFiles[i].getName(), "vol")) {
taggerFilesList.add(listOfFiles[i]);
}
}
Object[] objectsArray = taggerFilesList.toArray();
File[] taggerFiles = Arrays.copyOf(objectsArray, objectsArray.length, File[].class);
// Newest files first (descending last-modified time).
Arrays.sort(taggerFiles, new Comparator<File>() {
@Override
public int compare(File f1, File f2) {
return Long.valueOf(f2.lastModified()).compareTo(f1.lastModified()); // koushik: changed sort order
}
});
List<ClassifiedTweet> tweetsList = new ArrayList<ClassifiedTweet>(LIST_BUFFER_SIZE);
ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>(collectionCode);
String[] runningHeader = null;
int currentSize = 0;
createTweetList: {
for (int i = 0; i < taggerFiles.length; i++) {
File f = taggerFiles[i];
String currentFileName = f.getName();
if (currentFileName.endsWith(".json")) {
String line;
logger.info(collectionCode + ": Reading file : " + f.getAbsolutePath());
InputStream is = new FileInputStream(f.getAbsolutePath());
br = new BufferedReader(new InputStreamReader(is, Charset.forName("UTF-8")));
while ((line = br.readLine()) != null) {
ClassifiedTweet tweet = getClassifiedTweet(line);
// Lazily create the writer and header from the first parsed line.
// NOTE(review): 'tweet' may be null here if the first line fails to
// parse — verify setClassifiedTweetHeader tolerates null.
if (0 == currentSize && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet);
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList, collectionCode, fileName, writer);
}
// Keep only tweets that the classifier actually labeled.
if (tweet != null && !tweet.getNominalLabels().isEmpty()) {
if (tweetsList.size() < LIST_BUFFER_SIZE) {
tweetsList.add(tweet);
} else {
// Buffer full: flush at most the remaining allowance.
int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
if (countToWrite > 0) {
// buffer full, write to csv file
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList.subList(0, countToWrite),
collectionCode, fileName, writer);
currentSize += countToWrite;
}
tweetsList.clear();
if (currentSize >= exportLimit) {
// NOTE(review): breaking out of the labeled block skips the
// br.close() below — potential reader leak; same if readLine throws.
break createTweetList;
}
// Otherwise add current tweet and continue
tweetsList.add(tweet);
}
}
}
br.close();
}
}
}
// Final flush of the remaining buffer, capped to the export limit.
int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
if (countToWrite > 0) {
if (0 == currentSize && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweetsList.get(0));
}
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList.subList(0, countToWrite), collectionCode, fileName, writer);
tweetsList.clear();
}
} catch (FileNotFoundException ex) {
logger.error(collectionCode + ": File not found.");
} catch (IOException ex) {
logger.error(collectionCode + ": IO Exception for file read");
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException ex) {
logger.error(collectionCode + ": IOException for csv file write ");
}
}
}
return fileName;
}
/**
 * Generates a CSV of classified tweets for a collection, capped at
 * {@code exportLimit} rows and filtered by the user-provided query constraints.
 *
 * @param collectionCode code of the persisted collection
 * @param exportLimit maximum number of tweets to export
 * @param queryList user-provided filter constraints applied to each tweet
 * @param userName requesting user, used to build a user-specific file name
 * @param removeRetweet if true, retweets are excluded from the export
 * @return map with the generated (zipped) file name and the exported tweet count
 */
public Map<String, Object> taggerGenerateJSON2CSVBasedOnTweetCountFiltered(String collectionCode, int exportLimit,
final JsonQueryList queryList, String userName, boolean removeRetweet) {
// BufferedReader br = null;
ReversedLinesFileReader br = null;
int currentSize = 0;
String fileName = "";
ICsvMapWriter writer = null;
String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode;
String fileNameforCSVGen = null;
Long currentTimeStamp = System.currentTimeMillis();
String exclusionPattern = VOLUME_STRING;
fileNameforCSVGen = currentTimeStamp + STRING_SEPARATOR + exportLimit + STRING_SEPARATOR + (removeRetweet ? WITHOUT_RETWEET : WITH_RETWEET);
try {
fileNameforCSVGen = fileNameforCSVGen + STRING_SEPARATOR + MD5Hash.getMD5Hash(userName);
} catch (Exception e) {
logger.info("Issue in generating user hash for user : " + userName);
}
fileName = fileNameforCSVGen + CSV_EXTENSION;
try {
FileSystemOperations.deleteFiles(folderLocation, CSV_EXTENSION + ZIP_EXTENSION, exclusionPattern);
logger.info("Deleting exsiting file: " + folderLocation + FILE_SEPARATOR + fileName + ZIP_EXTENSION);
File folder = new File(folderLocation);
File[] listOfFiles = folder.listFiles();
// to get only Tagger's files
ArrayList<File> taggerFilesList = new ArrayList<File>();
for (int i = 0; i < listOfFiles.length; i++) {
if (StringUtils.startsWith(listOfFiles[i].getName(), (collectionCode + "_"))
&& StringUtils.containsIgnoreCase(listOfFiles[i].getName(), "vol")) {
logger.info("Added to list, file: " + listOfFiles[i]);
taggerFilesList.add(listOfFiles[i]);
}
}
Object[] objectsArray = taggerFilesList.toArray();
File[] taggerFiles = Arrays.copyOf(objectsArray, objectsArray.length, File[].class);
Arrays.sort(taggerFiles, new Comparator<File>() {
@Override
public int compare(File f1, File f2) {
return Long.valueOf(f1.lastModified()).compareTo(f2.lastModified());
}
});
List<ClassifiedTweet> tweetsList = new ArrayList<ClassifiedTweet>(LIST_BUFFER_SIZE);
ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>(collectionCode);
String[] runningHeader = null;
// writer = csv.writeClassifiedTweetsCSV(runningHeader, null,
// collectionCode, fileName, writer);
// First build the FilterQueryMatcher
FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
if (queryList != null)
tweetFilter.queryList.setConstraints(queryList);
tweetFilter.buildMatcherArray();
//int j = 0;
for (int i = taggerFiles.length - 1; i >= 0; i--) {
File f = taggerFiles[i];
String currentFileName = f.getName();
if (currentFileName.endsWith(".json")) {
String line;
logger.info("Reading file : " + f.getAbsolutePath());
// InputStream is = new
// FileInputStream(f.getAbsolutePath());
// br = new BufferedReader(new InputStreamReader(is,
// Charset.forName("UTF-8")));
br = new ReversedLinesFileReader(f);
if (currentSize < exportLimit) {
while ((line = br.readLine()) != null) {
ClassifiedTweet tweet = getClassifiedTweet(line, collectionCode, removeRetweet);
if (0 == currentSize && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet);
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList, collectionCode, fileName, writer);
}
// logger.info("Parsed tweet = " + tweet.getTweetID() + "," + tweet.getMessage());
if (tweet != null && satisfiesFilter(queryList, tweetFilter, tweet)) {
if (tweetsList.size() < LIST_BUFFER_SIZE && tweetsList.size() < exportLimit) {
// Apply filter on tweet
tweetsList.add(tweet);
} else {
// write-buffer full, write to csv file
int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
if (countToWrite > 0 && currentSize < exportLimit) {
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList.subList(0, countToWrite),
collectionCode, fileName, writer);
currentSize += countToWrite;
logger.info("currentSize = " + currentSize + ", countToWrite = " + countToWrite);
// logger.info(tweet.toJsonString());
// clear contents from tweetsList buffer
tweetsList.clear();
countToWrite = 0;
if (currentSize >= exportLimit) {
break; // we are done
} else {
// Otherwise add the tweet to fresh buffer and continue
tweetsList.add(tweet);
}
}
}
}
} // end while
} else {
break;
}
}
br.close();
logger.info("Done processing file : " + f.getAbsolutePath());
} // end for
if (currentSize < exportLimit) {
int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
// logger.info("Outside for loop: currentSize = " + currentSize + ", countToWrite = " + countToWrite + " tweetsList size = " + tweetsList.size());
if (countToWrite > 0 && tweetsList.size() >= countToWrite) {
// logger.info("Outside loop, writing residual list: exportLimit = " + exportLimit + ", currentSize = " + currentSize + ", countToWrite = " + countToWrite);
if (0 == currentSize && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweetsList.get(0));
}
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList.subList(0, countToWrite), collectionCode, fileName,
writer);
currentSize += countToWrite;
tweetsList.clear();
}
// In case there wasn't any tweet. Just create an empty csv file.
if (countToWrite == 0 && tweetsList.size() == 0 && 0 == currentSize && runningHeader == null && writer == null) {
runningHeader = csv.resetClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, 0);
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList.subList(0, countToWrite), collectionCode, fileName,
writer);
}
}
} catch (FileNotFoundException ex) {
logger.error(collectionCode + ": couldn't find file");
} catch (IOException ex) {
logger.error(collectionCode + ": IO Exception for file read");
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException ex) {
logger.error(collectionCode + ": IOException for csv file write ");
}
}
}
FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileName);
String fileToDelete = fileName;
fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
+ FILE_SEPARATOR + compressor.zip();
FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileToDelete);
logger.info("Deleted raw created file: " + folderLocation + FILE_SEPARATOR + fileNameforCSVGen + CSV_EXTENSION);
Map<String, Object> resultMap = new HashMap<String, Object>();
resultMap.put("fileName", fileName);
resultMap.put("currentSize", currentSize);
return resultMap;
}
/**
 * Converts one raw collector JSON line into a lightweight {@link Tweet} bean.
 * Only the fields needed for id/permalink exports are extracted (id_str, text,
 * created_at and a few user attributes); each field is optional and the bean
 * keeps its default when the key is absent.
 *
 * @param line a single JSON-encoded tweet
 * @return the populated bean, or {@code null} if the line could not be parsed
 */
private Tweet getTweet(String line) {
	Tweet parsed = new Tweet();
	try {
		JsonObject root = (JsonObject) new JsonParser().parse(line);
		if (root.get("id_str") != null) {
			parsed.setTweetID(root.get("id_str").getAsString());
		}
		if (root.get("text") != null) {
			parsed.setMessage(root.get("text").getAsString());
		}
		if (root.get("created_at") != null) {
			parsed.setCreatedAt(root.get("created_at").getAsString());
		}
		if (root.get("user") != null) {
			JsonObject user = root.get("user").getAsJsonObject();
			if (user.get("id") != null) {
				parsed.setUserID(user.get("id").getAsString());
			}
			if (user.get("screen_name") != null) {
				parsed.setUserName(user.get("screen_name").getAsString());
				// Reconstruct the canonical permalink from screen name + tweet id.
				parsed.setTweetURL("https://twitter.com/" + parsed.getUserName() + "/status/" + parsed.getTweetID());
			}
			if (user.get("url") != null) {
				// NOTE(review): toString() keeps the surrounding JSON quotes on the
				// URL (unlike getAsString()) — confirm downstream expects that.
				parsed.setUserURL(user.get("url").toString());
			}
		}
	} catch (Exception ex) {
		logger.error("Exception while parsing the json to tweet" + ex);
		return null;
	}
	return parsed;
}
/**
 * Convenience overload of {@link #getClassifiedTweet(String, String, boolean)}:
 * parses the line with no collection code and without retweet filtering.
 *
 * @param line a single JSON-encoded classified tweet
 * @return the parsed tweet, or {@code null} on parse failure
 */
public ClassifiedTweet getClassifiedTweet(String line) {
	// Delegate to the full variant: no collection context, retweets retained.
	return getClassifiedTweet(line, null, false);
}
/**
 * Parses one classifier-output JSON line into a {@link ClassifiedTweet},
 * including the AIDR classification metadata found under the "aidr" element
 * ("crisis_name", "crisis_code" and the "nominal_labels" array).
 *
 * For each nominal label, two parallel representations are built: (a) ';'-joined
 * string columns used by the CSV export setters (attribute/label names, codes,
 * descriptions, confidences, human-labeled flags), and (b) a {@link NominalLabel}
 * DTO appended to the tweet's label list. When no labels are present, dummy
 * nominal labels are created from the collection code.
 *
 * @param line           a single JSON-encoded classified tweet
 * @param collectionCode collection used to create dummy labels when the line
 *                       carries no nominal labels (may be null)
 * @param removeRetweet  when true, lines containing "retweeted_status" are
 *                       treated as retweets and dropped
 * @return the populated tweet, or {@code null} if the line is a filtered-out
 *         retweet or could not be parsed
 */
public ClassifiedTweet getClassifiedTweet(String line, String collectionCode, boolean removeRetweet) {
	ClassifiedTweet tweet = new ClassifiedTweet();
	try {
		StringReader reader = new StringReader(line.trim());
		JsonReader jsonReader = new JsonReader(reader);
		// Lenient mode tolerates minor deviations from strict JSON in persisted lines.
		jsonReader.setLenient(true);
		//Gson jsonObject = new GsonBuilder().serializeNulls().disableHtmlEscaping().serializeSpecialFloatingPointValues().create();
		JsonParser parser = new JsonParser();
		JsonObject jsonObj = (JsonObject) parser.parse(jsonReader);
		// Presence of "retweeted_status" marks the line as a retweet.
		if(removeRetweet && jsonObj.get("retweeted_status") != null) {
			return null;
		}
		if (jsonObj.get("id_str") != null) {
			tweet.setTweetID(jsonObj.get("id_str").getAsString());
		}
		if (jsonObj.get("text") != null) {
			tweet.setMessage(jsonObj.get("text").getAsString());
		}
		if (jsonObj.get("created_at") != null) {
			tweet.setCreatedAtString(jsonObj.get("created_at").getAsString());
			// Derive the Date from the timestamp parsed out of the created_at string.
			tweet.setCreateAt(new Date(tweet.getTimestamp()));
		}
		JsonObject jsonUserObj = null;
		if (jsonObj.get("user") != null) {
			jsonUserObj = jsonObj.get("user").getAsJsonObject();
			if (jsonUserObj.get("id") != null) {
				tweet.setUserID(jsonUserObj.get("id").getAsString());
			}
			if (jsonUserObj.get("screen_name") != null) {
				tweet.setUserName(jsonUserObj.get("screen_name").getAsString());
				// Canonical permalink reconstructed from screen name + tweet id.
				tweet.setTweetURL("https://twitter.com/" + tweet.getUserName() + "/status/" + tweet.getTweetID());
			}
			if (jsonUserObj.get("url") != null) {
				// NOTE(review): toString() keeps the surrounding JSON quotes, unlike
				// getAsString() — confirm downstream expects that.
				tweet.setUserURL(jsonUserObj.get("url").toString());
			}
		}
		JsonObject aidrObject = null;
		if (jsonObj.get("aidr") != null) {
			aidrObject = jsonObj.get("aidr").getAsJsonObject();
			if (aidrObject.get("crisis_name") != null) {
				tweet.setCrisisName(aidrObject.get("crisis_name").getAsString());
			}
			if (aidrObject.get("crisis_code") != null) {
				tweet.setCrisisCode(aidrObject.get("crisis_code").getAsString());
			}
			if (aidrObject.has("nominal_labels") && (aidrObject.get("nominal_labels") != null)) {
				// JSONArray nominalLabels = (JSONArray)
				// aidrObject.get("nominal_labels");
				JsonArray nominalLabels = aidrObject.get("nominal_labels").getAsJsonArray();
				// ';'-joined flattened columns consumed by the *_1 CSV setters below.
				StringBuffer allAttributeNames = new StringBuffer();
				StringBuffer allAttributeCodes = new StringBuffer();
				StringBuffer allLabelNames = new StringBuffer();
				StringBuffer allLabelCodes = new StringBuffer();
				StringBuffer allLabelDescriptions = new StringBuffer();
				StringBuffer allConfidences = new StringBuffer();
				StringBuffer humanLabeled = new StringBuffer();
				for (int i = 0; i < nominalLabels.size(); i++) {
					// JSONObject label = (JSONObject) nominalLabels.get(i);
					JsonObject label = nominalLabels.get(i).getAsJsonObject();
					// These appends assume the key exists (only guard against JSON null);
					// the NominalLabel block below additionally guards with has().
					allAttributeNames.append(!label.get("attribute_name").isJsonNull() ? label.get("attribute_name").getAsString()
							: "null");
					allAttributeCodes.append(!label.get("attribute_code").isJsonNull() ? label.get("attribute_code").getAsString()
							: "null");
					allLabelNames.append(!label.get("label_name").isJsonNull() ? label.get("label_name").getAsString() : "null");
					allLabelCodes.append(!label.get("label_code").isJsonNull() ? label.get("label_code").getAsString() : "null");
					allLabelDescriptions.append(!label.get("label_description").isJsonNull() ? label.get("label_description")
							.getAsString() : "null");
					allConfidences.append(!label.get("confidence").isJsonNull() ? label.get("confidence").getAsFloat() : 0);
					humanLabeled.append(!label.get("from_human").isJsonNull() ? label.get("from_human").getAsBoolean() : false);
					// Added by koushik
					// Structured DTO representation of the same label, with missing-key defaults.
					NominalLabel nLabel = new NominalLabel();
					nLabel.attribute_code = (label.has("attribute_code") && !label.get("attribute_code").isJsonNull()) ? label.get(
							"attribute_code").getAsString() : "null";
					nLabel.label_code = (label.has("label_code") && !label.get("label_code").isJsonNull()) ? label.get("label_code")
							.getAsString() : "null";
					nLabel.confidence = (label.has("confidence") && !label.get("confidence").isJsonNull()) ? Float.parseFloat(label
							.get("confidence").getAsString()) : 0;
					nLabel.attribute_name = (label.has("attribute_name") && !label.get("attribute_name").isJsonNull()) ? label.get(
							"attribute_name").getAsString() : "null";
					nLabel.label_name = (label.has("label_name") && !label.get("label_name").isJsonNull()) ? label.get("label_name")
							.getAsString() : "null";
					nLabel.attribute_description = (label.has("attribute_description") && !label.get("attribute_description")
							.isJsonNull()) ? label.get("attribute_description").getAsString() : "null";
					nLabel.label_description = (label.has("label_description") && !label.get("label_description").isJsonNull()) ? label
							.get("label_description").getAsString() : "null";
					nLabel.from_human = (label.has("from_human") && !label.get("from_human").isJsonNull()) ? Boolean.parseBoolean(label
							.get("from_human").getAsString()) : false;
					tweet.getNominalLabels().add(nLabel);
					// remove the ugly ';' from end-of-list
					// NOTE(review): allLabelCodes never receives a ';' separator here and is
					// never written to the tweet below — looks like dead code; verify.
					if (i < nominalLabels.size() - 1) {
						allAttributeNames.append(";");
						allAttributeCodes.append(";");
						allLabelNames.append(";");
						allLabelDescriptions.append(";");
						allConfidences.append(";");
						humanLabeled.append(";");
					}
				}
				tweet.setAttributeName_1(allAttributeNames.toString());
				tweet.setAttributeCode_1(allAttributeCodes.toString());
				tweet.setLabelName_1(allLabelNames.toString());
				tweet.setLabelDescription_1(allLabelDescriptions.toString());
				tweet.setConfidence_1(allConfidences.toString());
				tweet.setHumanLabeled_1(humanLabeled.toString());
			} else {
				// "aidr" present but no labels: synthesize placeholders for the collection.
				tweet.createDummyNominalLabels(collectionCode);
			}
		} else {
			// No "aidr" element at all: synthesize placeholders for the collection.
			tweet.createDummyNominalLabels(collectionCode);
		}
		return tweet;
	} catch (Exception ex) {
		logger.error("Exception while parsing the json to classiffied tweet for the collection: " + collectionCode + "\t" + ex);
		return null;
	}
}
/**
 * Concatenates the collection's raw tweet JSON volume files into a single
 * download file, oldest volume first, stopping at the configured
 * {@code TWEETS_EXPORT_LIMIT}. For {@link DownloadJsonType#JSON_OBJECT} the
 * output is wrapped in a JSON array ("[item, item, ...]"); otherwise lines are
 * written as-is, one per row.
 *
 * Fixes vs. previous revision: strict '<' bound (the old '<=' wrote one extra
 * line and emitted the boundary element without its ", " separator, producing a
 * malformed JSON array), the loop-invariant export limit is parsed once, and the
 * per-volume reader is closed in a finally block instead of leaking on error.
 *
 * @param collectionCode collection whose volumes are concatenated
 * @param jsonType       output flavor (JSON array vs. plain line-per-tweet)
 * @return the generated file name (without path); may name an empty/partial
 *         file if an I/O error was logged
 */
public String generateJSON2JSON_100K_BasedOnTweetCount(String collectionCode, DownloadJsonType jsonType) {
	String fileName = "";
	BufferedWriter beanWriter = null;
	String extension = DownloadJsonType.getSuffixString(jsonType);
	if (null == extension) {
		extension = DownloadJsonType.defaultSuffix();
	}
	boolean jsonObjectClosed = false;
	try {
		String folderLocation = PersisterConfigurator.getInstance().getProperty(
				PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
				+ collectionCode;
		String fileNameforJsonGen = collectionCode + TWEETS_PREFIX;
		fileName = fileNameforJsonGen + extension;
		// Remove any previously generated export so we always start fresh.
		FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforJsonGen + extension);
		File folder = new File(folderLocation);
		File[] listOfFiles = folder.listFiles();
		// Ascending last-modified order: oldest volume is exported first.
		Arrays.sort(listOfFiles, new Comparator<File>() {
			@Override
			public int compare(File f1, File f2) {
				return Long.valueOf(f1.lastModified()).compareTo(f2.lastModified());
			}
		});
		// Hoisted: the export limit is loop-invariant (was re-parsed on every line).
		// A malformed property value now fails fast instead of being logged per line.
		final int exportLimit = Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
				PersisterConfigurationProperty.TWEETS_EXPORT_LIMIT));
		long currentSize = 0;
		StringBuffer outputFile = new StringBuffer().append(folderLocation).append(FILE_SEPARATOR).append(fileName);
		beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
		if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
			beanWriter.write("[");
		}
		boolean isDone = false;
		for (File f : listOfFiles) {
			String currentFileName = f.getName();
			if (!currentFileName.endsWith(".json") || !currentFileName.contains("vol")) {
				continue; // only persisted tweet volumes are concatenated
			}
			logger.info("Reading file : " + f.getAbsolutePath());
			BufferedReader br = new BufferedReader(new InputStreamReader(
					new FileInputStream(f.getAbsolutePath()), Charset.forName("UTF-8")));
			try {
				String line;
				while ((line = br.readLine()) != null) {
					if (currentSize >= exportLimit) {
						isDone = true;
						break;
					}
					if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && currentSize > 0) {
						beanWriter.write(", "); // separator before every element except the first
					}
					beanWriter.write(line);
					beanWriter.newLine();
					++currentSize;
				}
			} finally {
				br.close(); // fix: previously leaked when a write failed mid-file
			}
			if (isDone) {
				break;
			}
		}
	} catch (FileNotFoundException ex) {
		logger.error(collectionCode + ": couldn't find file");
	} catch (IOException ex) {
		logger.error(collectionCode + ": IO Exception for file read");
	} finally {
		// Close the JSON array (if requested) exactly once, then the writer.
		if (beanWriter != null) {
			try {
				if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
					beanWriter.write("]");
					jsonObjectClosed = true;
				}
				beanWriter.close();
			} catch (IOException ex) {
				logger.error(collectionCode + ": IOException for JSON file write ");
			}
		}
	}
	return fileName;
}
/**
 * Extracts the tweet id from every line of the collection's raw JSON volumes
 * and writes them as {@code {"id": ...}} records into a single download file.
 *
 * Fixes vs. previous revision: the volume limit is parsed once instead of per
 * line, the limit check uses '>=' (consistent with the classified variant; the
 * old '>' let one extra id through and could drop the JSON-array separator at
 * the boundary), and the per-volume reader is closed in a finally block.
 *
 * @param collectionCode  collection whose volumes are scanned
 * @param downloadLimited when true, stop after DEFAULT_TWEETID_VOLUME_LIMIT ids
 * @param jsonType        JSON_OBJECT wraps the output in a JSON array
 * @return UI wrapper map carrying "fileName" and "count"
 */
public Map<String, Object> generateJson2TweetIdsJson(String collectionCode, final boolean downloadLimited, DownloadJsonType jsonType) {
	String fileName = "";
	BufferedWriter beanWriter = null;
	int totalCount = 0;
	String extension = DownloadJsonType.getSuffixString(jsonType);
	if (null == extension) {
		extension = DownloadJsonType.defaultSuffix();
	}
	boolean jsonObjectClosed = false;
	try {
		String folderLocation = PersisterConfigurator.getInstance().getProperty(
				PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
				+ collectionCode;
		String fileNameforJsonGen = collectionCode + TWEET_IDS;
		fileName = fileNameforJsonGen + extension;
		// Remove any stale export with the same name before regenerating.
		FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforJsonGen + extension);
		File folder = new File(folderLocation);
		File[] listOfFiles = folder.listFiles();
		// Ascending last-modified order: oldest volume first.
		Arrays.sort(listOfFiles, new Comparator<File>() {
			@Override
			public int compare(File f1, File f2) {
				return Long.valueOf(f1.lastModified()).compareTo(f2.lastModified());
			}
		});
		// Hoisted loop-invariant limit (was re-parsed on every line).
		final int volumeLimit = Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
				PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT));
		StringBuffer outputFile = new StringBuffer().append(folderLocation).append(FILE_SEPARATOR).append(fileName);
		beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
		if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
			beanWriter.write("[");
		}
		volumes:
		for (File f : listOfFiles) {
			String currentFileName = f.getName();
			if (!currentFileName.endsWith(".json") || !currentFileName.contains("vol")) {
				continue;
			}
			logger.info("Reading file : " + f.getAbsolutePath());
			BufferedReader br = new BufferedReader(new InputStreamReader(
					new FileInputStream(f.getAbsolutePath()), Charset.forName("UTF-8")));
			try {
				String line;
				while ((line = br.readLine()) != null) {
					if (downloadLimited && totalCount >= volumeLimit) {
						break volumes; // finally below still closes the reader
					}
					try {
						Tweet tweet = getTweet(line);
						if (tweet != null && tweet.getTweetID() != null) {
							JSONObject obj = new JSONObject();
							obj.put("id", tweet.getTweetID());
							if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && totalCount > 0) {
								beanWriter.write(", "); // separator before all but the first element
							}
							beanWriter.write(obj.toJSONString());
							beanWriter.newLine();
							++totalCount;
						}
					} catch (Exception ex) {
						// Best effort: skip unparseable lines and keep exporting the rest.
						logger.error("JSON file parsing exception" + ex);
					}
				}
			} finally {
				br.close(); // fix: previously leaked when a write failed mid-file
			}
		}
		if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
			beanWriter.write("]");
			jsonObjectClosed = true;
		}
		beanWriter.flush();
	} catch (FileNotFoundException ex) {
		logger.error(collectionCode + ": couldn't find file");
	} catch (IOException ex) {
		logger.error(collectionCode + ": IO Exception for file read");
	} finally {
		if (beanWriter != null) {
			try {
				if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
					beanWriter.write("]");
					jsonObjectClosed = true;
				}
				beanWriter.close();
			} catch (IOException ex) {
				logger.error(collectionCode + ": IOException for JSON file write ");
			}
		}
	}
	return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Builds a single JSON download file from the classifier-output ("Classified_")
 * volumes of a collection, newest volume first, writing at most
 * min(exportLimit, configured TWEETS_EXPORT_LIMIT) lines.
 *
 * Fixes vs. previous revision: the effective limit is computed once instead of
 * re-parsing the config property on every line, and the per-volume reader is
 * closed in a finally block instead of leaking when a write fails mid-file.
 *
 * @param collectionCode collection whose classified volumes are concatenated
 * @param exportLimit    caller-requested maximum number of lines
 * @param jsonType       JSON_OBJECT wraps the output in a JSON array
 * @return the generated file name (without path)
 */
public String taggerGenerateJSON2JSON_100K_BasedOnTweetCount(String collectionCode, int exportLimit, DownloadJsonType jsonType) {
	String fileName = "";
	BufferedWriter beanWriter = null;
	String extension = DownloadJsonType.getSuffixString(jsonType);
	if (null == extension) {
		extension = DownloadJsonType.defaultSuffix();
	}
	boolean jsonObjectClosed = false;
	try {
		String folderLocation = PersisterConfigurator.getInstance().getProperty(
				PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
				+ collectionCode;
		logger.info("For collection: " + collectionCode + ", will create file from folder: " + folderLocation);
		String fileNameforJsonGen = CLASSIFIED + collectionCode + TWEETS_PREFIX;
		fileName = fileNameforJsonGen + extension;
		FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforJsonGen + extension);
		logger.info("Deleted existing file: " + folderLocation + FILE_SEPARATOR + fileNameforJsonGen + extension);
		File folder = new File(folderLocation);
		File[] listOfFiles = folder.listFiles();
		// Keep only the classifier-output volume files.
		ArrayList<File> taggerFilesList = new ArrayList<File>();
		for (File candidate : listOfFiles) {
			if (StringUtils.startsWith(candidate.getName(), CLASSIFIED)
					&& StringUtils.containsIgnoreCase(candidate.getName(), "vol")) {
				taggerFilesList.add(candidate);
			}
		}
		File[] taggerFiles = taggerFilesList.toArray(new File[0]);
		// Descending last-modified order: export the most recent tweets first.
		Arrays.sort(taggerFiles, new Comparator<File>() {
			@Override
			public int compare(File f1, File f2) {
				return Long.valueOf(f2.lastModified()).compareTo(f1.lastModified());
			}
		});
		StringBuffer outputFile = new StringBuffer().append(folderLocation).append(FILE_SEPARATOR).append(fileName);
		beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
		if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
			beanWriter.write("[");
		}
		// Hoisted: caller's limit bounded by the configured maximum (was re-parsed per line).
		final long effectiveLimit = Math.min((long) exportLimit, Long.parseLong(PersisterConfigurator.getInstance().getProperty(
				PersisterConfigurationProperty.TWEETS_EXPORT_LIMIT)));
		long currentSize = 0;
		boolean isDone = false;
		for (File f : taggerFiles) {
			String currentFileName = f.getName();
			if (!currentFileName.endsWith(".json") || !currentFileName.contains("vol")) {
				continue;
			}
			logger.info("Reading file : " + f.getAbsolutePath());
			BufferedReader br = new BufferedReader(new InputStreamReader(
					new FileInputStream(f.getAbsolutePath()), Charset.forName("UTF-8")));
			try {
				String line;
				while ((line = br.readLine()) != null) {
					if (currentSize >= effectiveLimit) {
						isDone = true;
						break;
					}
					if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && currentSize > 0) {
						beanWriter.write(", "); // separator before all but the first element
					}
					beanWriter.write(line);
					beanWriter.newLine();
					++currentSize;
				}
			} finally {
				br.close(); // fix: previously leaked when a write failed mid-file
			}
			if (isDone) {
				break;
			}
		}
	} catch (FileNotFoundException ex) {
		logger.error(collectionCode + ": couldn't find file");
	} catch (IOException ex) {
		logger.error(collectionCode + ": IO Exception for file read");
	} catch (NullPointerException ex) {
		logger.error(collectionCode + ": empty list of files to read");
	} finally {
		if (beanWriter != null) {
			try {
				if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
					beanWriter.write("]");
					jsonObjectClosed = true;
				}
				beanWriter.close();
			} catch (IOException ex) {
				logger.error(collectionCode + ": IOException for JSON file write ");
			}
		}
	}
	return fileName;
}
/**
 * Writes id records (via {@code createJsonClassifiedTweetIDString}) for every
 * labeled tweet in the collection's classified volumes into one download file.
 *
 * Fixes vs. previous revision: the volume limit is parsed once instead of per
 * line, the per-volume reader is closed in a finally block (it leaked when an
 * IOException fired before the old close call), and the array separator no
 * longer depends on the limit when {@code downloadLimited} is false (the old
 * condition silently stopped emitting ", " past the configured limit, producing
 * a malformed JSON array on unlimited JSON_OBJECT exports).
 *
 * @param collectionCode  collection whose classified volumes are scanned
 * @param downloadLimited when true, stop after DEFAULT_TWEETID_VOLUME_LIMIT tweets
 * @param jsonType        JSON_OBJECT wraps the output in a JSON array
 * @return UI wrapper map carrying "fileName" and "count"
 */
public Map<String, Object> generateClassifiedJson2TweetIdsJSON(String collectionCode, final boolean downloadLimited,
		DownloadJsonType jsonType) {
	String extension = DownloadJsonType.getSuffixString(jsonType);
	if (null == extension) {
		extension = DownloadJsonType.defaultSuffix();
	}
	boolean jsonObjectClosed = false;
	BufferedWriter beanWriter = null;
	String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
			+ collectionCode;
	String fileNameforJsonGen = CLASSIFIED + collectionCode + TWEET_IDS;
	String fileName = fileNameforJsonGen + extension;
	int totalCount = 0;
	try {
		List<String> fileNames = FileSystemOperations.getClassifiedFileVolumes(collectionCode);
		String fileToDelete = folderLocation + FILE_SEPARATOR + fileNameforJsonGen + extension;
		logger.info("Deleteing file : " + fileToDelete);
		FileSystemOperations.deleteFile(fileToDelete); // delete a stale export with the same name
		// Hoisted loop-invariant limit (was re-parsed on every line).
		final int volumeLimit = Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
				PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT));
		StringBuffer outputFile = new StringBuffer().append(folderLocation).append(FILE_SEPARATOR).append(fileName);
		beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
		if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
			beanWriter.write("[");
		}
		for (String file : fileNames) {
			if (downloadLimited && totalCount >= volumeLimit) {
				break;
			}
			String fileLocation = folderLocation + FILE_SEPARATOR + file;
			try {
				BufferedReader br = new BufferedReader(new FileReader(fileLocation));
				try {
					String line;
					while ((line = br.readLine()) != null) {
						if (downloadLimited && totalCount >= volumeLimit) {
							break;
						}
						ClassifiedTweet tweet = getClassifiedTweet(line);
						if (tweet != null && tweet.getTweetID() != null) {
							// Only tweets carrying nominal labels are written out.
							if (!tweet.getNominalLabels().isEmpty()) {
								if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && totalCount > 0) {
									beanWriter.write(", "); // separator before all but the first element
								}
								beanWriter.write(createJsonClassifiedTweetIDString(tweet));
								beanWriter.newLine();
							}
							// NOTE(review): counts every tweet with an id, written or not —
							// confirm "count" is meant to include label-less tweets.
							++totalCount;
						}
					}
				} finally {
					br.close(); // fix: previously leaked when an IOException hit mid-file
				}
			} catch (FileNotFoundException ex) {
				logger.error(collectionCode + ": couldn't find file");
			} catch (IOException ex) {
				logger.error(collectionCode + ": IO Exception for file read");
			}
		}
		if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
			beanWriter.write("]");
			jsonObjectClosed = true;
		}
		beanWriter.flush();
	} catch (FileNotFoundException ex) {
		logger.error(collectionCode + ": couldn't find file");
	} catch (IOException ex) {
		logger.error(collectionCode + ": IO Exception for file read");
	} finally {
		if (beanWriter != null) {
			try {
				if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
					beanWriter.write("]");
					jsonObjectClosed = true;
				}
				beanWriter.close();
			} catch (IOException ex) {
				logger.error(collectionCode + ": IOException for JSON file write ");
			}
		}
	}
	return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Exports classified tweets of a collection to a zipped JSON/TXT-JSON file,
 * applying the caller's filter constraints and an export limit. Volumes are
 * sorted ascending by last-modified time but iterated from the end, and each
 * file is read bottom-up via ReversedLinesFileReader, so the newest tweets are
 * exported first. The raw output file is zipped and then deleted; the returned
 * "fileName" is the public download URL of the zip.
 *
 * @param collectionCode collection whose classified volumes are read
 * @param exportLimit    maximum number of tweets to write
 * @param queryList      optional filter constraints; null means no filtering
 * @param jsonType       JSON_OBJECT produces a JSON array, otherwise one tweet per line
 * @param userName       hashed (MD5) into the output file name so exports are per-user
 * @param removeRetweet  when true, retweets are skipped during parsing
 * @return map with "fileName" (download URL of the zip) and "currentSize"
 *         (number of tweets written)
 */
public Map<String, Object> taggerGenerateJSON2JSONBasedOnTweetCountFiltered(String collectionCode, int exportLimit, JsonQueryList queryList,
		DownloadJsonType jsonType, String userName, boolean removeRetweet) {
	// BufferedReader br = null;
	ReversedLinesFileReader br = null;
	String fileName = "";
	long currentSize = 0;
	BufferedWriter beanWriter = null;
	String extension = DownloadJsonType.getSuffixString(jsonType);
	if (null == extension) {
		extension = DownloadJsonType.defaultSuffix();
	}
	String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
			+ collectionCode + FILE_SEPARATOR;
	String fileNameforJsonGen = null;
	Long currentTimeStamp = System.currentTimeMillis();
	// Human-tagged exports are excluded from the bulk cleanup below.
	String exclusionPattern = HUMAN_TAGGED_FILE_PREFIX;
	// Output name encodes timestamp, limit and retweet mode, plus the user hash.
	fileNameforJsonGen = currentTimeStamp + STRING_SEPARATOR + exportLimit + STRING_SEPARATOR + (removeRetweet ? WITHOUT_RETWEET : WITH_RETWEET);
	try {
		fileNameforJsonGen = fileNameforJsonGen + STRING_SEPARATOR + MD5Hash.getMD5Hash(userName);
	} catch (Exception e) {
		// Non-fatal: fall back to a name without the user hash.
		logger.info("Issue in generating user hash for user : " + userName);
	}
	fileName = fileNameforJsonGen + extension;
	// Tracks whether the closing ']' of the JSON array has been written.
	boolean jsonObjectClosed = false;
	try {
		// Clean up older zipped exports of this flavor (except human-tagged ones).
		FileSystemOperations.deleteFiles(folderLocation, extension + ZIP_EXTENSION, exclusionPattern);
		File folder = new File(folderLocation);
		File[] listOfFiles = folder.listFiles();
		// to get only Tagger's files
		ArrayList<File> taggerFilesList = new ArrayList<File>();
		for (int i = 0; i < listOfFiles.length; i++) {
			if (StringUtils.startsWith(listOfFiles[i].getName(), (collectionCode + "_"))
					&& StringUtils.containsIgnoreCase(listOfFiles[i].getName(), "vol")) {
				taggerFilesList.add(listOfFiles[i]);
				// logger.info("Added to list, file: " + listOfFiles[i]);
			}
		}
		Object[] objectsArray = taggerFilesList.toArray();
		File[] taggerFiles = Arrays.copyOf(objectsArray, objectsArray.length, File[].class);
		Arrays.sort(taggerFiles, new Comparator<File>() {
			@Override
			public int compare(File f1, File f2) {
				return Long.valueOf(f1.lastModified()).compareTo(f2.lastModified());
				// koushik: changed sort order to create list in ascending order of modified time
			}
		});
		StringBuffer outputFile = new StringBuffer().append(folderLocation).append(fileName);
		beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
		if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
			beanWriter.write("[");
		}
		// First build the FilterQueryMatcher
		FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
		if (queryList != null)
			tweetFilter.queryList.setConstraints(queryList);
		tweetFilter.buildMatcherArray();
		boolean isDone = false;
		// Iterate the ascending-sorted array backwards: newest volume first.
		for (int i = taggerFiles.length - 1; i >= 0; i--) {
			File f = taggerFiles[i];
			String currentFileName = f.getName();
			if (currentFileName.endsWith(".json") && currentFileName.contains("vol")) {
				String line;
				logger.info("Reading file : " + f.getAbsolutePath());
				// InputStream is = new
				// FileInputStream(f.getAbsolutePath());
				// br = new BufferedReader(new InputStreamReader(is,
				// Charset.forName("UTF-8")));
				// Bottom-up read so the most recent tweets in the file come first.
				br = new ReversedLinesFileReader(f);
				while ((line = br.readLine()) != null) {
					try {
						ClassifiedTweet tweet = getClassifiedTweet(line, null, removeRetweet);
						if(tweet != null) {
							if (currentSize < exportLimit) {
								// Apply filter on tweet
								if (satisfiesFilter(queryList, tweetFilter, tweet)) {
									if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && currentSize > 0) {
										beanWriter.write(", ");
									}
									// write to file
									//Decode unicode hex value to normal characters
									UnicodeUnescaper u = new UnicodeUnescaper();
									line = u.translate(line);
									//Encoding angular brackets < >
									// Re-escape '<' (60) and '>' (62) so they survive as \\u escapes.
									line = UnicodeEscaper.between(60,60).translate(line);
									line = UnicodeEscaper.between(62,62).translate(line);
									beanWriter.write(line);
									beanWriter.newLine();
									++currentSize;
								}
							} else {
								// Export limit reached: close the array and stop.
								if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
									beanWriter.write("]");
									jsonObjectClosed = true;
								}
								beanWriter.flush();
								isDone = true;
								break;
							}
						}
					} catch (Exception ex) {
						logger.error("JSON file parsing exception" + ex);
					}
				} // end while
				br.close();
				if (isDone) {
					beanWriter.newLine();
					beanWriter.close();
					break;
				}
			}
		} // end for
	} catch (FileNotFoundException ex) {
		logger.error(collectionCode + ": couldn't find file");
	} catch (IOException ex) {
		logger.error(collectionCode + ": IO Exception for file read");
	} catch (NullPointerException ex) {
		logger.error(collectionCode + ": empty list of files to read");
	} finally {
		// Safety net: ensure the array is closed and the writer released.
		if (beanWriter != null) {
			try {
				if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
					beanWriter.write("]");
					beanWriter.newLine();
					jsonObjectClosed = true;
				}
				beanWriter.close();
			} catch (IOException ex) {
				logger.error(collectionCode + ": IOException for JSON file write ");
			}
		}
	}
	// Zip the raw export, expose its public URL, then delete the raw file.
	FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileName);
	String fileToDelete = fileName;
	fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
			+ FILE_SEPARATOR + compressor.zip();
	FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileToDelete);
	logger.info("Deleted created raw file: " + folderLocation + FILE_SEPARATOR + fileNameforJsonGen + extension);
	Map<String, Object> resultMap = new HashMap<String, Object>();
	resultMap.put("fileName", fileName);
	resultMap.put("currentSize", currentSize);
	return resultMap;
}
/**
 * Generates a JSON (or JSON-object array) file of classified tweet IDs for a collection,
 * applying the user-supplied filter constraints, then zips the file and returns its
 * download URL.
 *
 * @param collectionCode  code of the persisted collection to export
 * @param downloadLimited when TRUE, caps the export at DEFAULT_TWEETID_VOLUME_LIMIT entries;
 *                        null is treated as unlimited (previously auto-unboxing threw an NPE)
 * @param queryList       filter constraints applied to each tweet (null/empty = no filtering)
 * @param jsonType        output flavor; JSON_OBJECT wraps entries in a JSON array
 * @param userName        requesting user; its MD5 hash is embedded in the file name
 * @return UI wrapper map with the zipped file's download URL ("fileName") and the number of
 *         exported entries ("count")
 */
public Map<String, Object> generateClassifiedJson2TweetIdsJSONFiltered(String collectionCode, Boolean downloadLimited,
        JsonQueryList queryList, DownloadJsonType jsonType, String userName) {
    String extension = DownloadJsonType.getSuffixString(jsonType);
    if (null == extension) {
        extension = DownloadJsonType.defaultSuffix();
    }
    // Parse the configured volume limit once, instead of re-parsing it on every line read.
    final int volumeLimit = Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
            PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT));
    // Guard against a null Boolean (auto-unboxing the flag directly would throw an NPE).
    final boolean limited = Boolean.TRUE.equals(downloadLimited);
    boolean jsonObjectClosed = false;
    BufferedWriter beanWriter = null;
    String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode + FILE_SEPARATOR;
    String fileNameforJsonGen = null;
    try {
        fileNameforJsonGen = collectionCode + TWEET_IDS_PREFIX + STRING_SEPARATOR + MD5Hash.getMD5Hash(userName);
    } catch (Exception e) {
        // Fall back to a non-user-specific name if hashing the user name fails.
        fileNameforJsonGen = collectionCode + TWEET_IDS_PREFIX;
    }
    String fileName = fileNameforJsonGen + extension;
    int totalCount = 0;
    try {
        // Newest volumes first, so a limited download returns the most recent tweets.
        List<String> fileNames = FileSystemOperations.getAllJSONFileVolumes(collectionCode);
        Collections.sort(fileNames);
        Collections.reverse(fileNames);
        String fileToDelete = PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                + collectionCode + FILE_SEPARATOR + fileName;
        logger.info("Deleteing file : " + fileToDelete + ZIP_EXTENSION);
        FileSystemOperations.deleteFile(fileToDelete + ZIP_EXTENSION); // delete any stale zip with the same name
        StringBuffer outputFile = new StringBuffer().append(folderLocation).append(fileName);
        beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
        if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
            beanWriter.write("[");
        }
        // Build the filter matcher once, up front.
        FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
        if (queryList != null) {
            tweetFilter.queryList.setConstraints(queryList);
        }
        tweetFilter.buildMatcherArray();
        for (String file : fileNames) {
            if (limited && totalCount >= volumeLimit) {
                break;
            }
            String fileLocation = PersisterConfigurator.getInstance().getProperty(
                    PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                    + collectionCode + FILE_SEPARATOR + file;
            logger.info("Reading file " + fileLocation);
            // try-with-resources ensures the reader is closed even if a read throws,
            // which the previous explicit close() did not guarantee (resource leak).
            try (ReversedLinesFileReader br = new ReversedLinesFileReader(new File(fileLocation))) {
                String line;
                while ((line = br.readLine()) != null) {
                    if (limited && totalCount >= volumeLimit) {
                        break;
                    }
                    ClassifiedTweet tweet = getClassifiedTweet(line);
                    if (tweet != null && tweet.getTweetID() != null && satisfiesFilter(queryList, tweetFilter, tweet)) {
                        // JSON_OBJECT entries need a separator before every element but the
                        // first. The old "totalCount < limit" condition dropped separators in
                        // the unlimited case once the count passed the configured limit.
                        if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && totalCount > 0) {
                            beanWriter.write(", ");
                        }
                        beanWriter.write(createJsonClassifiedTweetIDString(tweet));
                        beanWriter.newLine();
                        ++totalCount;
                    }
                }
            } catch (FileNotFoundException ex) {
                logger.error(collectionCode + ": couldn't find file");
            } catch (IOException ex) {
                logger.error(collectionCode + ": IO Exception for file read");
            }
        } // end for
        if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
            beanWriter.write("]");
            jsonObjectClosed = true;
        }
        beanWriter.flush();
        beanWriter.close();
    } catch (FileNotFoundException ex) {
        logger.error(collectionCode + ": couldn't find file");
    } catch (IOException ex) {
        logger.error(collectionCode + ": IO Exception for file read");
    } finally {
        if (beanWriter != null) {
            try {
                // Close the JSON array if an exception prevented the normal close above.
                if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
                    beanWriter.write("]");
                    jsonObjectClosed = true;
                }
                beanWriter.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException for JSON file write ");
            }
        }
    }
    // Zip the generated file, expose its download URL, and delete the raw file.
    FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileName);
    String fileToDelete = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode + FILE_SEPARATOR + fileName;
    fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
            + FILE_SEPARATOR + compressor.zip();
    logger.info("Deleteing file : " + fileToDelete);
    FileSystemOperations.deleteFile(fileToDelete); // delete the raw file post compression
    return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Generates a JSON (or JSON-object array) file containing only the tweet IDs (as minimal
 * {"id": ...} objects) of classified tweets for a collection, applying the user-supplied
 * filter constraints, then zips the file and returns its download URL.
 *
 * @param collectionCode code of the persisted collection to export
 * @param exportLimit    maximum number of IDs to export; null is treated as unlimited
 *                       (previously auto-unboxing threw an NPE)
 * @param queryList      filter constraints applied to each tweet (null/empty = no filtering)
 * @param jsonType       output flavor; JSON_OBJECT wraps entries in a JSON array
 * @param userName       requesting user; its MD5 hash is embedded in the file name
 * @param removeRetweet  currently unused — TODO(review): confirm whether retweet filtering
 *                       should be applied here
 * @return UI wrapper map with the zipped file's download URL ("fileName") and the number of
 *         exported entries ("count")
 */
public Map<String, Object> generateClassifiedJson2TweetIdsOnlyJSONFiltered(String collectionCode, Integer exportLimit,
        JsonQueryList queryList, DownloadJsonType jsonType, String userName, Boolean removeRetweet) {
    String extension = DownloadJsonType.getSuffixString(jsonType);
    if (null == extension) {
        extension = DownloadJsonType.defaultSuffix();
    }
    // Guard against a null limit (auto-unboxing the Integer directly would throw an NPE).
    final int limit = (exportLimit != null) ? exportLimit : Integer.MAX_VALUE;
    boolean jsonObjectClosed = false;
    BufferedWriter beanWriter = null;
    String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode + FILE_SEPARATOR;
    String fileNameforJsonGen = null;
    try {
        fileNameforJsonGen = collectionCode + TWEET_IDS_PREFIX + STRING_SEPARATOR + MD5Hash.getMD5Hash(userName);
    } catch (Exception e) {
        // Fall back to a non-user-specific name if hashing the user name fails.
        fileNameforJsonGen = collectionCode + TWEET_IDS_PREFIX;
    }
    String fileName = fileNameforJsonGen + extension;
    int totalCount = 0;
    try {
        // Newest volumes first, so a limited download returns the most recent tweets.
        List<String> fileNames = FileSystemOperations.getAllJSONFileVolumes(collectionCode);
        Collections.sort(fileNames);
        Collections.reverse(fileNames);
        String fileToDelete = PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                + collectionCode + FILE_SEPARATOR + fileName;
        logger.info("Deleteing file : " + fileToDelete + ZIP_EXTENSION);
        FileSystemOperations.deleteFile(fileToDelete + ZIP_EXTENSION); // delete any stale zip with the same name
        StringBuffer outputFile = new StringBuffer().append(folderLocation).append(fileName);
        beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
        if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
            beanWriter.write("[");
        }
        // Build the filter matcher once, up front.
        FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
        if (queryList != null) {
            tweetFilter.queryList.setConstraints(queryList);
        }
        tweetFilter.buildMatcherArray();
        for (String file : fileNames) {
            if (totalCount >= limit) {
                break;
            }
            String fileLocation = PersisterConfigurator.getInstance().getProperty(
                    PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
                    + collectionCode + FILE_SEPARATOR + file;
            logger.info("Reading file " + fileLocation);
            // try-with-resources ensures the reader is closed even if a read throws,
            // which the previous explicit close() did not guarantee (resource leak).
            try (ReversedLinesFileReader br = new ReversedLinesFileReader(new File(fileLocation))) {
                String line;
                while ((line = br.readLine()) != null) {
                    if (totalCount >= limit) {
                        break;
                    }
                    ClassifiedTweet tweet = getClassifiedTweet(line);
                    if (tweet != null && tweet.getTweetID() != null && satisfiesFilter(queryList, tweetFilter, tweet)) {
                        // JSON_OBJECT entries need a separator before every element but the first.
                        if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && totalCount > 0) {
                            beanWriter.write(", ");
                        }
                        // Emit a minimal {"id": ...} object per tweet.
                        JSONObject obj = new JSONObject();
                        obj.put("id", tweet.getTweetID());
                        beanWriter.write(obj.toJSONString());
                        beanWriter.newLine();
                        ++totalCount;
                    }
                }
            } catch (FileNotFoundException ex) {
                logger.error(collectionCode + ": couldn't find file");
            } catch (IOException ex) {
                logger.error(collectionCode + ": IO Exception for file read");
            }
        } // end for
        if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
            beanWriter.write("]");
            jsonObjectClosed = true;
        }
        beanWriter.flush();
        beanWriter.close();
    } catch (FileNotFoundException ex) {
        logger.error(collectionCode + ": couldn't find file");
    } catch (IOException ex) {
        logger.error(collectionCode + ": IO Exception for file read");
    } finally {
        if (beanWriter != null) {
            try {
                // Close the JSON array if an exception prevented the normal close above.
                if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
                    beanWriter.write("]");
                    jsonObjectClosed = true;
                }
                beanWriter.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException for JSON file write ");
            }
        }
    }
    // Zip the generated file, expose its download URL, and delete the raw file.
    FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileName);
    String fileToDelete = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode + FILE_SEPARATOR + fileName;
    fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
            + FILE_SEPARATOR + compressor.zip();
    logger.info("Deleteing file : " + fileToDelete);
    FileSystemOperations.deleteFile(fileToDelete); // delete the raw file post compression
    return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Serializes a classified tweet into a flat JSON string containing its ID, crisis name,
 * and one indexed group of fields per nominal label.
 *
 * @param tweet the classified tweet to serialize
 * @return the JSON string representation
 */
public String createJsonClassifiedTweetIDString(ClassifiedTweet tweet) {
    JSONObject json = new JSONObject();
    json.put("id", tweet.getTweetID());
    json.put("crisis_name", tweet.getCrisisName());
    List<NominalLabel> labels = tweet.getNominalLabels();
    if (labels != null) {
        int index = 0;
        // Flatten each label into suffixed keys: attribute_name_0, label_name_0, ...
        for (NominalLabel label : labels) {
            json.put("attribute_name_" + index, label.attribute_name);
            json.put("attribute_code_" + index, label.attribute_code);
            json.put("label_name_" + index, label.label_name);
            json.put("label_description_" + index, label.label_description);
            json.put("label_code_" + index, label.label_code);
            json.put("confidence_" + index, label.confidence);
            json.put("humanLabeled_" + index, label.from_human);
            index++;
        }
    }
    return json.toJSONString();
}
/**
 * Decides whether a classified tweet passes the user-supplied filter constraints.
 *
 * @param queryList   the constraint list; null or empty means no filtering is in effect
 * @param tweetFilter matcher built from the same constraints
 * @param tweet       the tweet to check
 * @return true when no constraints are configured, or when the tweet has nominal labels
 *         and the matcher accepts it; false otherwise
 */
public boolean satisfiesFilter(final JsonQueryList queryList, final FilterQueryMatcher tweetFilter, final ClassifiedTweet tweet) {
    // No constraints configured: every tweet passes.
    if (queryList == null || queryList.getConstraints().isEmpty()) {
        return true;
    }
    // Only tweets carrying at least one nominal label can match a filter.
    return !tweet.getNominalLabels().isEmpty() && tweetFilter.getMatcherResult(tweet);
}
/**
 * Generates a CSV file (at most exportLimit rows) of human-labeled classified tweets for a
 * collection, applying the user-supplied filter constraints, then zips the CSV and returns
 * the download URL of the compressed file.
 *
 * @param collectionCode code of the persisted collection being exported
 * @param exportLimit maximum number of tweet rows to write to the CSV
 * @param queryList filter constraints applied to each tweet (null/empty = no filtering)
 * @param labeledItems the human-labeled documents to export
 * @param userName requesting user; its MD5 hash is embedded in the generated file name
 * @return download URL of the zipped CSV file, or null on failure
 */
public String generateClassifiedList2CSV_100K_BasedOnTweetCountFiltered(String collectionCode, int exportLimit,
JsonQueryList queryList, HumanLabeledDocumentList labeledItems, String userName) {
ICsvMapWriter writer = null;
String fileName = null;
String fileNameforCSVGen = null;
// Build a per-user file name; abort the export if the user-name hash cannot be computed.
try {
fileNameforCSVGen = collectionCode + HUMAN_TAGGED_FILE_PREFIX + MD5Hash.getMD5Hash(userName) + CSV_EXTENSION;
fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL)
+ FILE_SEPARATOR + collectionCode + FILE_SEPARATOR + fileNameforCSVGen;
} catch (Exception e) {
logger.error("Exception while generating MD5Hash for user: " + userName);
return null;
}
String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode;
// Remove any stale zip left over from a previous export with the same name.
FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforCSVGen + ZIP_EXTENSION);
logger.info("Deleted existing file: " + folderLocation + FILE_SEPARATOR + fileNameforCSVGen + ZIP_EXTENSION);
try {
// Tweets are accumulated in memory (up to LIST_BUFFER_SIZE) and flushed to CSV in batches.
List<ClassifiedTweet> tweetsList = new ArrayList<ClassifiedTweet>(LIST_BUFFER_SIZE);
ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>(collectionCode);
String[] runningHeader = null;
// Number of rows already written to the CSV file.
int currentSize = 0;
// First build the FilterQueryMatcher
FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
if (queryList != null)
tweetFilter.queryList.setConstraints(queryList);
tweetFilter.buildMatcherArray();
for (HumanLabeledDocumentDTO dto : labeledItems.getItems()) {
ClassifiedTweet tweet = new ClassifiedTweet();
tweet.toClassifiedTweetFromLabeledDoc(dto, collectionCode);
// Lazily derive the CSV header from the first tweet and open the writer
// (tweetsList is still empty at this point, so this call writes no data rows).
if (0 == currentSize && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet);
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList, collectionCode, fileNameforCSVGen, writer);
}
if (tweet != null && satisfiesFilter(queryList, tweetFilter, tweet)) {
if (tweetsList.size() < exportLimit && tweetsList.size() < LIST_BUFFER_SIZE) {
// Buffer has room: just accumulate the tweet.
tweetsList.add(tweet);
} else {
// write buffer full, write to csv file — but never more rows than the
// remaining export budget (exportLimit - currentSize) allows.
int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
if (countToWrite > 0) {
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList.subList(0, countToWrite), collectionCode,
fileNameforCSVGen, writer);
currentSize += countToWrite;
logger.info("currentSize = " + currentSize + ", countToWrite = " + countToWrite);
}
// clear contents from tweetsList buffer
tweetsList.clear();
if (currentSize >= exportLimit) {
break; // we are done
}
// Otherwise add the tweet to fresh buffer and continue
tweetsList.add(tweet);
}
}
}
// Flush whatever remains in the buffer after the loop, still honoring exportLimit.
int countToWrite = Math.min(tweetsList.size(), exportLimit - currentSize);
if (countToWrite > 0) {
// Reached only when the loop never opened the writer (e.g. no items triggered the
// lazy-open above); header is then derived from the first buffered tweet.
if (0 == currentSize && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweetsList.get(0));
}
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList.subList(0, countToWrite), collectionCode,
fileNameforCSVGen, writer);
tweetsList.clear();
}
} catch (Exception e) {
logger.error("Exception in generateClassifiedList2CSV_100K_BasedOnTweetCountFiltered", e);
return null;
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException e) {
// NOTE(review): returning null from inside finally discards a CSV that may already
// have been fully written — consider logging only; verify callers' expectations.
logger.error("Exception in generateClassifiedList2CSV_100K_BasedOnTweetCountFiltered", e);
return null;
}
}
}
// Compress generated file and send the compressed file link
FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileNameforCSVGen);
fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
+ FILE_SEPARATOR + compressor.zip();
FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforCSVGen);
logger.info("Deleted raw file post compression: " + fileNameforCSVGen);
return fileName;
}
/**
 * Generates a CSV file of tweet IDs from a list of human-labeled documents, applying the
 * user-supplied filter constraints and optionally capping the row count at the configured
 * DEFAULT_TWEETID_VOLUME_LIMIT, then zips the CSV and returns its download URL.
 *
 * @param collectionCode code of the persisted collection being exported
 * @param queryList filter constraints applied to each tweet (null/empty = no filtering)
 * @param downloadLimited when TRUE, caps the export at DEFAULT_TWEETID_VOLUME_LIMIT rows
 * @param labeledItems the human-labeled documents to export
 * @param userName requesting user; its MD5 hash is embedded in the generated file name
 * @return UI wrapper map with the zipped file's download URL ("fileName") and row count
 *         ("count"), or null if the user-name hash could not be generated
 */
public Map<String, Object> generateClassifiedList2TweetIdsCSVFiltered(String collectionCode, JsonQueryList queryList,
Boolean downloadLimited, HumanLabeledDocumentList labeledItems, String userName) {
ICsvMapWriter writer = null;
String fileName = null;
String fileNameforCSVGen = null;
// Build a per-user file name; abort the export if the user-name hash cannot be computed.
try {
fileNameforCSVGen = collectionCode + HUMAN_TAGGED_FILE_PREFIX + "tweetIds-" + MD5Hash.getMD5Hash(userName) + CSV_EXTENSION;
fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL)
+ FILE_SEPARATOR + collectionCode + FILE_SEPARATOR + fileNameforCSVGen;
} catch (Exception e) {
logger.error("Exception while generating MD5Hash for user: " + userName);
return null;
}
String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
+ collectionCode;
// Number of rows already written to the CSV file.
int totalCount = 0;
// In-memory buffer, flushed to the CSV in batches of at most LIST_BUFFER_SIZE.
List<ClassifiedTweet> tweetsList = new ArrayList<ClassifiedTweet>(LIST_BUFFER_SIZE);
try {
ReadWriteCSV<CellProcessor> csv = new ReadWriteCSV<CellProcessor>(collectionCode);
String[] runningHeader = null;
// Remove any stale zip left over from a previous export with the same name.
FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforCSVGen + ZIP_EXTENSION);
logger.info(collectionCode + ": Deleteing file : " + fileNameforCSVGen + ZIP_EXTENSION);
// Added by koushik - first build the FilterQueryMatcher
FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
if (queryList != null)
tweetFilter.queryList.setConstraints(queryList);
tweetFilter.buildMatcherArray();
for (HumanLabeledDocumentDTO dto : labeledItems.getItems()) {
ClassifiedTweet tweet = new ClassifiedTweet();
tweet.toClassifiedTweetFromLabeledDoc(dto, collectionCode);
// Lazily derive the CSV header from the first tweet and open the writer.
// NOTE(review): the header/writer is opened via writeClassifiedTweetsCSV while the
// data rows below go through writeClassifiedTweetIDsCSV — verify this is intentional.
if (0 == totalCount && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet);
writer = csv.writeClassifiedTweetsCSV(runningHeader, tweetsList, collectionCode, fileNameforCSVGen, writer);
}
// Stop once the configured tweet-ID volume limit is reached (limited downloads only).
if (downloadLimited
&& totalCount >= Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))) {
tweetsList.clear();
break;
}
if (tweet != null && satisfiesFilter(queryList, tweetFilter, tweet)) {
if (tweetsList.size() < LIST_BUFFER_SIZE
&& tweetsList.size() < Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))) {
// Buffer has room: just accumulate the tweet.
tweetsList.add(tweet);
} else {
// Buffer full: flush it, writing at most the remaining budget when limited.
int countToWrite;
if (downloadLimited) {
countToWrite = Math.min(
tweetsList.size(),
Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))
- totalCount);
} else {
countToWrite = tweetsList.size();
}
if (countToWrite > 0) {
writer = csv.writeClassifiedTweetIDsCSV(runningHeader, writer, tweetsList.subList(0, countToWrite),
collectionCode, fileNameforCSVGen);
totalCount += countToWrite;
}
tweetsList.clear();
tweetsList.add(tweet);
}
}
} // end for
// Flush whatever remains in the buffer after the loop, still honoring the limit.
int countToWrite = tweetsList.size();
if (downloadLimited) {
countToWrite = Math.min(
tweetsList.size(),
Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT))
- totalCount);
}
if (countToWrite > 0 && !tweetsList.isEmpty()) {
// NOTE(review): this fallback derives the header from ClassifiedTweetIDCSVHeader,
// whereas the in-loop path above uses ClassifiedTweetCSVHeader — verify which is right.
if (0 == totalCount && runningHeader == null && writer == null) {
runningHeader = csv.setClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetIDCSVHeader,
ReadWriteCSV.FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweetsList.get(0));
}
writer = csv.writeClassifiedTweetIDsCSV(runningHeader, writer, tweetsList.subList(0, countToWrite), collectionCode,
fileNameforCSVGen);
totalCount += countToWrite;
tweetsList.clear();
}
} catch (Exception e) {
logger.error("Exception in generateClassifiedList2TweetIdsCSVFiltered");
} finally {
if (writer != null) {
try {
writer.close();
} catch (IOException ex) {
logger.error(collectionCode + ": IOException for csv file write ");
}
}
}
tweetsList.clear();
// Compress the generated CSV, expose its download URL, and delete the raw file.
FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileNameforCSVGen);
fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
+ FILE_SEPARATOR + compressor.zip();
FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforCSVGen);
logger.info("Deleted raw file post compression: " + fileNameforCSVGen);
return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Generates a JSON (or JSON-object array) file of fully classified tweets from a list of
 * human-labeled documents, applying the user-supplied filter constraints and honoring both
 * the caller-supplied exportLimit and the configured TWEETS_EXPORT_LIMIT cap, then zips the
 * file and returns its download URL.
 *
 * @param collectionCode code of the persisted collection to export
 * @param exportLimit    caller-supplied maximum number of tweets to export
 * @param queryList      filter constraints applied to each tweet (null/empty = no filtering)
 * @param jsonType       output flavor; JSON_OBJECT wraps entries in a JSON array
 * @param labeledItems   the human-labeled documents to export
 * @param userName       requesting user; its MD5 hash is embedded in the file name
 * @return download URL of the zipped JSON file, or null if the user-name hash failed
 */
public String generateClassifiedList2JSON_100K_BasedOnTweetCountFiltered(String collectionCode, int exportLimit,
        JsonQueryList queryList, DownloadJsonType jsonType, HumanLabeledDocumentList labeledItems, String userName) {
    String fileName = null;
    String fileNameforGen = null;
    String extension = DownloadJsonType.getSuffixString(jsonType);
    if (null == extension) {
        extension = DownloadJsonType.defaultSuffix();
    }
    // If everything ok, then finally generate the fileName
    try {
        fileNameforGen = collectionCode + HUMAN_TAGGED_FILE_PREFIX + MD5Hash.getMD5Hash(userName) + extension;
        fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL)
                + FILE_SEPARATOR + collectionCode + FILE_SEPARATOR + fileNameforGen;
    } catch (Exception e) {
        logger.error("Exception while generating MD5Hash for user: " + userName);
        return null;
    }
    BufferedWriter beanWriter = null;
    String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode;
    boolean jsonObjectClosed = false;
    try {
        FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforGen + ZIP_EXTENSION);
        logger.info("Deleted existing file: " + folderLocation + FILE_SEPARATOR + fileNameforGen + ZIP_EXTENSION);
        // Parse the configured hard export cap once, instead of on every iteration.
        final int tweetsExportLimit = Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.TWEETS_EXPORT_LIMIT));
        StringBuffer outputFile = new StringBuffer().append(folderLocation).append(FILE_SEPARATOR).append(fileNameforGen);
        beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
        if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
            beanWriter.write("[");
        }
        // First build the FilterQueryMatcher
        FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
        if (queryList != null) {
            tweetFilter.queryList.setConstraints(queryList);
        }
        tweetFilter.buildMatcherArray();
        long currentSize = 0;
        for (HumanLabeledDocumentDTO dto : labeledItems.getItems()) {
            // Stop once either the caller-supplied or the configured cap is reached.
            // (The original carried an 'isDone' flag whose follow-up block was unreachable
            // dead code; the finally block below closes the JSON array and the writer.)
            if (currentSize >= exportLimit || currentSize >= tweetsExportLimit) {
                break;
            }
            ClassifiedTweet tweet = new ClassifiedTweet();
            tweet.toClassifiedTweetFromLabeledDoc(dto, collectionCode);
            // Apply filter on tweet
            if (satisfiesFilter(queryList, tweetFilter, tweet)) {
                // JSON_OBJECT entries need a separator before every element but the first.
                if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && currentSize > 0) {
                    beanWriter.write(", ");
                }
                beanWriter.write(tweet.toJsonString());
                beanWriter.newLine();
                ++currentSize;
            }
        } // end for
    } catch (Exception ex) {
        logger.error(collectionCode + ": exception while generating filtered JSON for human-labeled items", ex);
    } finally {
        if (beanWriter != null) {
            try {
                // Close the JSON array before closing the writer.
                if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
                    beanWriter.write("]");
                    beanWriter.newLine();
                    jsonObjectClosed = true;
                }
                beanWriter.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException for JSON file write ");
            }
        }
    }
    // Zip the generated file, expose its download URL, and delete the raw file.
    FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileNameforGen);
    fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
            + FILE_SEPARATOR + compressor.zip();
    FileSystemOperations.deleteFile(folderLocation + FILE_SEPARATOR + fileNameforGen);
    logger.info("Deleted raw file post compression: " + fileNameforGen);
    return fileName;
}
/**
 * Generates a JSON (or JSON-object array) file of classified tweet IDs from a list of
 * human-labeled documents, applying the user-supplied filter constraints, then zips the
 * file and returns its download URL.
 *
 * @param collectionCode  code of the persisted collection to export
 * @param downloadLimited when TRUE, caps the export at DEFAULT_TWEETID_VOLUME_LIMIT entries;
 *                        null is treated as unlimited (previously auto-unboxing threw an NPE)
 * @param queryList       filter constraints applied to each tweet (null/empty = no filtering)
 * @param jsonType        output flavor; JSON_OBJECT wraps entries in a JSON array
 * @param labeledItems    the human-labeled documents to export
 * @param userName        requesting user; its MD5 hash is embedded in the file name
 * @return UI wrapper map with the zipped file's download URL ("fileName") and count ("count"),
 *         or null if the user-name hash could not be generated
 */
public Map<String, Object> generateClassifiedList2TweetIdsJSONFiltered(String collectionCode, Boolean downloadLimited,
        JsonQueryList queryList, DownloadJsonType jsonType, HumanLabeledDocumentList labeledItems, String userName) {
    String fileName = null;
    String fileNameforGen = null;
    String extension = DownloadJsonType.getSuffixString(jsonType);
    if (null == extension) {
        extension = DownloadJsonType.defaultSuffix();
    }
    boolean jsonObjectClosed = false;
    BufferedWriter beanWriter = null;
    String folderLocation = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode;
    // If everything ok, then finally generate the fileName
    try {
        fileNameforGen = collectionCode + HUMAN_TAGGED_FILE_PREFIX + "tweetIds-" + MD5Hash.getMD5Hash(userName) + extension;
        fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL)
                + FILE_SEPARATOR + collectionCode + FILE_SEPARATOR + fileNameforGen;
    } catch (Exception e) {
        logger.error("Error while generating MD5Hash for the user: " + userName);
        return null;
    }
    String fileToDelete = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode + FILE_SEPARATOR + fileNameforGen;
    // Guard against a null Boolean (auto-unboxing the flag directly would throw an NPE).
    final boolean limited = Boolean.TRUE.equals(downloadLimited);
    int totalCount = 0;
    try {
        logger.info("Deleteing file : " + fileToDelete + ZIP_EXTENSION);
        FileSystemOperations.deleteFile(fileToDelete + ZIP_EXTENSION); // delete any stale file with the same name
        // Parse the configured volume limit once, instead of on every iteration.
        final int volumeLimit = Integer.parseInt(PersisterConfigurator.getInstance().getProperty(
                PersisterConfigurationProperty.DEFAULT_TWEETID_VOLUME_LIMIT));
        StringBuffer outputFile = new StringBuffer().append(folderLocation).append(FILE_SEPARATOR).append(fileNameforGen);
        beanWriter = new BufferedWriter(new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE);
        if (DownloadJsonType.JSON_OBJECT.equals(jsonType)) {
            beanWriter.write("[");
        }
        // First build the FilterQueryMatcher
        FilterQueryMatcher tweetFilter = new FilterQueryMatcher();
        if (queryList != null) {
            tweetFilter.queryList.setConstraints(queryList);
        }
        tweetFilter.buildMatcherArray();
        for (HumanLabeledDocumentDTO dto : labeledItems.getItems()) {
            // Stop AT the cap: the previous "> limit" check let one extra entry through,
            // and that entry was also written without a separator (its "totalCount < limit"
            // separator condition was false), corrupting JSON_OBJECT output.
            if (limited && totalCount >= volumeLimit) {
                break;
            }
            ClassifiedTweet tweet = new ClassifiedTweet();
            tweet.toClassifiedTweetFromLabeledDoc(dto, collectionCode);
            // Apply filter on tweet
            if (satisfiesFilter(queryList, tweetFilter, tweet)) {
                // JSON_OBJECT entries need a separator before every element but the first.
                if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && totalCount > 0) {
                    beanWriter.write(", ");
                }
                beanWriter.write(createJsonClassifiedTweetIDString(tweet));
                beanWriter.newLine();
                ++totalCount;
            }
        } // end for
        if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
            beanWriter.write("]");
            jsonObjectClosed = true;
        }
        beanWriter.flush();
        beanWriter.close();
    } catch (Exception ex) {
        logger.error(collectionCode + ": IO Exception for file read");
    } finally {
        if (beanWriter != null) {
            try {
                // Close the JSON array if an exception prevented the normal close above.
                if (DownloadJsonType.JSON_OBJECT.equals(jsonType) && !jsonObjectClosed) {
                    beanWriter.write("]");
                    jsonObjectClosed = true;
                }
                beanWriter.close();
            } catch (IOException ex) {
                logger.error(collectionCode + ": IOException for JSON file write ");
            }
        }
    }
    // Zip the generated file, expose its download URL, and delete the raw file.
    FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileNameforGen);
    fileName = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL) + collectionCode
            + FILE_SEPARATOR + compressor.zip();
    FileSystemOperations.deleteFile(fileToDelete); // delete the raw file post compression
    logger.info("Deleted raw file post compression: " + fileToDelete);
    return ResultStatus.getUIWrapper("fileName", fileName, "count", totalCount);
}
/**
 * Returns a file URL which contains Facebook post data for a particular collection.
 * The feeds are serialized as a JSON array, zipped, and the raw file is deleted.
 *
 * @param collectionCode code of the collection whose Facebook feed is exported
 * @param fbFeeds        the Facebook data feed entries to serialize
 * @return UI wrapper containing the download URL of the zipped JSON file (null on IO failure)
 */
public Map<String, Object> generateFileForFacebookDataFeed(String collectionCode, List<FacebookDataFeed> fbFeeds) {
    String folderLocation = PersisterConfigurator
            .getInstance()
            .getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH)
            + collectionCode;
    String extension = DownloadJsonType.JSON_OBJECT.getSuffix();
    String downloadFileURL = null;
    // delete the file if already created
    String fileName = collectionCode;
    String fileToDelete = folderLocation + FILE_SEPARATOR + fileName;
    FileSystemOperations.deleteFile(fileToDelete + ZIP_EXTENSION);
    // create new file
    StringBuffer outputFile = new StringBuffer().append(folderLocation)
            .append(FILE_SEPARATOR)
            .append(fileName)
            .append(extension);
    try {
        // try-with-resources guarantees the writer is closed even when a write fails;
        // the previous code leaked the writer on any mid-write IOException.
        try (BufferedWriter beanWriter = new BufferedWriter(
                new FileWriterWithEncoding(outputFile.toString(), "UTF-8"), BUFFER_SIZE)) {
            beanWriter.write("[");
            beanWriter.newLine();
            boolean isFirstFeed = true;
            for (FacebookDataFeed facebookDataFeed : fbFeeds) {
                // Comma-separate every element after the first to keep the array valid JSON.
                if (isFirstFeed) {
                    isFirstFeed = false;
                } else {
                    beanWriter.write(",");
                }
                beanWriter.write(facebookDataFeed.getFeed().toString());
                beanWriter.newLine();
            }
            beanWriter.write("]");
            beanWriter.flush();
        }
        // Compressing generated file (after the writer is closed, so all bytes are on disk).
        FileCompressor compressor = new FileCompressor(folderLocation, folderLocation, fileName + extension);
        downloadFileURL = PersisterConfigurator.getInstance()
                .getProperty(PersisterConfigurationProperty.PERSISTER_DOWNLOAD_URL)
                + collectionCode + FILE_SEPARATOR + compressor.zip();
        FileSystemOperations.deleteFile(outputFile.toString());
    } catch (IOException e) {
        logger.error(collectionCode + ": IO Exception for Facebook Posts File Generation", e);
    }
    return ResultStatus.getUIWrapper(collectionCode, "File Generated", downloadFileURL, true);
}
}