/**
* This class provides an implementation to create a CSV file containing tweets with variable number of classifiers.
*
* @author Imran
* @param <CellProcessors>
*/
package qa.qcri.aidr.io;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.ws.rs.client.Client;
import javax.ws.rs.client.ClientBuilder;
import javax.ws.rs.client.WebTarget;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.Response;
import org.apache.commons.io.output.FileWriterWithEncoding;
import org.apache.log4j.Logger;
import org.glassfish.jersey.jackson.JacksonFeature;
import org.supercsv.cellprocessor.Optional;
import org.supercsv.cellprocessor.constraint.NotNull;
import org.supercsv.cellprocessor.ift.CellProcessor;
import org.supercsv.encoder.DefaultCsvEncoder;
import org.supercsv.exception.SuperCsvCellProcessorException;
import org.supercsv.io.CsvBeanWriter;
import org.supercsv.io.CsvMapWriter;
import org.supercsv.io.ICsvBeanWriter;
import org.supercsv.io.ICsvMapWriter;
import org.supercsv.prefs.CsvPreference;
import qa.qcri.aidr.common.filter.NominalLabel;
import qa.qcri.aidr.utils.ClassifiedTweet;
import qa.qcri.aidr.utils.PersisterConfigurationProperty;
import qa.qcri.aidr.utils.PersisterConfigurator;
import qa.qcri.aidr.utils.Tweet;
public class ReadWriteCSV<CellProcessors> {
private static Logger logger = Logger.getLogger(ReadWriteCSV.class);
private String collectionCode = null;
public static final String[] ClassifiedTweetCSVHeader = new String[]{"tweetID", "message","userID", "userName", "userURL", "createdAt", "tweetURL", "crisisName"};
public static final String[] ClassifiedTweetIDCSVHeader = new String[]{"tweetID", "crisisName"};
public static final int FIXED_CLASSIFIED_TWEET_HEADER_SIZE = ClassifiedTweetCSVHeader.length;
public static final int FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE = ClassifiedTweetIDCSVHeader.length;
public static final int VARIABLE_HEADER_SIZE = 7; // number of variable header elements per classifier
private static final int DEFAULT_CLASSIFIER_COUNT = 1;
private static int countWritten = 0;
public ReadWriteCSV(String collectionCode) {
this.collectionCode = collectionCode;
}
public ReadWriteCSV() {
this(null);
}
private static CellProcessor[] getProcessors4TweetIDSCCSV() {
final CellProcessor[] processors = new CellProcessor[]{
//new UniqueHashCode(), // tweetID (must be unique)
new NotNull() // tweetID (must be unique)
};
return processors;
}
private static CellProcessor[] getCollectorTweetsProcessors() {
final CellProcessor[] processors = new CellProcessor[]{
//new UniqueHashCode(), // tweetID (must be unique)
//new NotNull(), // tweetID (must be unique)
new Optional(), // data shows that sometimes tweetID CAN be null!
new Optional(), // message
//new FmtDate("dd/MM/yyyy"), // birthDate
new Optional(), // userID
new Optional(), // userName
//new Optional(new FmtBool("Y", "N")), // isRT
new Optional(), // userURL
new Optional(), // createdAt
new Optional() // tweet permanent URL
};
return processors;
}
private static CellProcessor[] getClassifiedTweetVariableProcessors(final int count) {
CellProcessor[] processors = new CellProcessor[count];
for (int i = 0;i < count;i++) {
processors[i] = new Optional();
}
return processors;
}
private static CellProcessor[] getProcessors4ClassifiedCCSV() {
final CellProcessor[] processors = new CellProcessor[]{
//new UniqueHashCode(), // tweetID (must be unique)
//new NotNull(), // tweetID (must be unique)
new Optional(), // tweetID - data shows that sometimes tweetID CAN be null!
new Optional(), // message
new Optional(), // userID
new Optional(), // userName
new Optional(), // userURL
new Optional(), // createdAt
new NotNull(), // crisis name
new Optional(), // attribute name
new Optional(), // attribute code
new Optional(), // label name
new Optional(), // label description
new Optional(), // label code
new Optional(), // confidence
new Optional() // humanLabeled
};
return processors;
}
private static CellProcessor[] getProcessors4ClassifiedTweetIDSCCSV() {
final CellProcessor[] processors = new CellProcessor[]{
//new UniqueHashCode(), // tweetID (must be unique)
new NotNull(), // tweetID (must be unique): sometimes CAN be null!
new NotNull(), // crisis name
new Optional(), // attribute name
new Optional(), // attribute code
new Optional(), // label name
new Optional(), // label description
new Optional(), // label code
new Optional(), // confidence
new Optional(), // humanLabeled
};
return processors;
}
public ICsvBeanWriter getCSVBeanWriter(String fileToWrite) {
try {
return new CsvBeanWriter(new FileWriter(fileToWrite, true),
new CsvPreference.Builder(CsvPreference.EXCEL_PREFERENCE)
.useEncoder(new DefaultCsvEncoder())
.build() );
} catch (IOException e) {
// TODO Auto-generated catch block
logger.error("Error in creating CSV Bean writer!");
logger.error("Exception",e);
}
return null;
}
public ICsvMapWriter getCSVMapWriter(String fileToWrite) {
try {
return new CsvMapWriter(new FileWriterWithEncoding(fileToWrite,"UTF-8", true),
new CsvPreference.Builder(CsvPreference.EXCEL_PREFERENCE)
.useEncoder(new DefaultCsvEncoder())
.build() );
} catch (IOException e) {
logger.error("Error in creating CSV Bean writer!"+e);
}
return null;
}
public ICsvBeanWriter writeCollectorTweetIDSCSV(ICsvBeanWriter beanWriter, List<Tweet> tweetsList, String collectionDIR, String fileName) {
try {
// the header elements are used to map the bean values to each column (names must match)
//final String[] header = new String[]{"tweetID", "message","userID", "userName", "userURL", "createdAt", "tweetURL"};
//final CellProcessor[] processors = getProcessors();
// koushik: shouldn't we be writing only the tweetIDs?
final String[] header = new String[]{"tweetID"};
final CellProcessor[] processors = getProcessors4TweetIDSCCSV();
String persisterDIR = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH);
//fileName = StringUtils.substringBefore(fileName, ".json"); //removing .json extension
String fileToWrite = persisterDIR + collectionDIR + "/" + fileName;
logger.info(collectionDIR + ": Writing CSV file : " + fileToWrite);
if (null == beanWriter) {
beanWriter = getCSVBeanWriter(fileToWrite);
// write the header
beanWriter.writeHeader(header);
}
for (final Tweet tweet : tweetsList) {
try {
if (tweet.getTweetID() != null) {
beanWriter.write(tweet, header, processors);
}
} catch (SuperCsvCellProcessorException e) {
logger.error(collectionDIR + ": SuperCSV error");
}
}
} catch (IOException ex) {
logger.error(collectionDIR + ": IO Exception occured");
}
//return fileName+".csv";
return beanWriter;
}
public ICsvMapWriter writeClassifiedTweetIDsCSV(String[] runningHeader, ICsvMapWriter mapWriter, final List<ClassifiedTweet> tweetsList, String collectionDIR, String fileName) {
// the header elements are used to map the bean values to each column (names must match)
String[] header = ClassifiedTweetIDCSVHeader;
//String[] runningHeader = null;
try {
if (null == mapWriter) {
String persisterDIR = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH);
String fileToWrite = persisterDIR + collectionDIR + "/" + fileName;
logger.info(collectionDIR + ": Writing CSV file : " + fileToWrite);
mapWriter = getCSVMapWriter(fileToWrite);
// First write the header
if (runningHeader != null) mapWriter.writeHeader(runningHeader);
countWritten = 0;
}
} catch (Exception ex) {
logger.error(collectionDIR + ": Exception occured when creating a mapWriter instance");
logger.error("Exception",ex);
return null;
}
// Now write to CSV file using CsvMapWriter
for (final ClassifiedTweet tweet : tweetsList) {
try {
//logger.info("Current header length :: Actual number of cells needed: " + runningHeader.length + "::" + getClassifedTweetHeaderSize(FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweet.getNominalLabels().size()));
if (runningHeader.length < getClassifedTweetHeaderSize(FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweet.getNominalLabels().size())) {
// reallocate header
runningHeader = resetClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetIDCSVHeader, ReadWriteCSV.FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweet.getNominalLabels().size());
logger.info("Reallocated running header. After reallocation, Current header length :: Actual number of cells needed: "
+ runningHeader.length + "::" + getClassifedTweetHeaderSize(FIXED_CLASSIFIED_TWEET_ID_HEADER_SIZE, tweet.getNominalLabels().size()));
}
final Map<String, Object> tweetToWrite = createClassifiedTweetIDCsvMap(runningHeader, tweet);
final CellProcessor[] processors = getClassifiedTweetVariableProcessors(runningHeader.length);
mapWriter.write(tweetToWrite, runningHeader, processors);
++countWritten;
} catch (SuperCsvCellProcessorException e) {
logger.error(collectionDIR + ": SuperCSV error. Offending tweet: " + tweet.getTweetID());
} catch (IOException e) {
logger.error(collectionDIR + "IOException in writing tweet: " + tweet.getTweetID());
}
}
return mapWriter;
}
public ICsvMapWriter writeClassifiedTweetIDsOnlyCSV(String[] runningHeader, ICsvMapWriter mapWriter, final List<ClassifiedTweet> tweetsList, String collectionDIR, String fileName) {
// the header elements are used to map the bean values to each column (names must match)
//String[] runningHeader = null;
try {
if (null == mapWriter) {
String persisterDIR = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH);
String fileToWrite = persisterDIR + collectionDIR + "/" + fileName;
logger.info(collectionDIR + ": Writing CSV file : " + fileToWrite);
mapWriter = getCSVMapWriter(fileToWrite);
// First write the header
if (runningHeader != null) mapWriter.writeHeader(runningHeader);
countWritten = 0;
}
} catch (Exception ex) {
logger.error(collectionDIR + ": Exception occured when creating a mapWriter instance");
logger.error("Exception",ex);
return null;
}
// Now write to CSV file using CsvMapWriter
for (final ClassifiedTweet tweet : tweetsList) {
try {
final Map<String, Object> tweetToWrite = new HashMap<String, Object>();
tweetToWrite.put(runningHeader[0], tweet.getTweetID());
final CellProcessor[] processors = getClassifiedTweetVariableProcessors(runningHeader.length);
mapWriter.write(tweetToWrite, runningHeader, processors);
++countWritten;
} catch (SuperCsvCellProcessorException e) {
logger.error(collectionDIR + ": SuperCSV error. Offending tweet: " + tweet.getTweetID());
} catch (IOException e) {
logger.error(collectionDIR + "IOException in writing tweet: " + tweet.getTweetID());
}
}
return mapWriter;
}
public ICsvBeanWriter writeCollectorTweetsCSV(List<Tweet> tweetsList, String collectionDIR, String fileName, ICsvBeanWriter beanWriter) {
try {
final String[] header = new String[]{"tweetID", "message","userID", "userName", "userURL", "createdAt", "tweetURL"};
final CellProcessor[] processors = getCollectorTweetsProcessors();
if(null == beanWriter){
String persisterDIR = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH);
//fileName = StringUtils.substringBefore(fileName, ".json"); //removing .json extension
String fileToWrite = persisterDIR + collectionDIR + "/" + fileName;
logger.info(collectionDIR + ": Writing CSV file : " + fileToWrite);
beanWriter = getCSVBeanWriter(fileToWrite);
beanWriter.writeHeader(header);
}
for (final Tweet tweet : tweetsList) {
try {
beanWriter.write(tweet, header, processors);
} catch (SuperCsvCellProcessorException e) {
logger.error(collectionDIR + ": SuperCSV error");
}
}
} catch (IOException ex) {
logger.error(collectionDIR + ": IO Exception occured");
}
return beanWriter;
}
public ICsvMapWriter writeClassifiedTweetsCSV(String[] runningHeader, List<ClassifiedTweet> tweetsList, String collectionDIR, String fileName, ICsvMapWriter mapWriter) {
String[] header = ClassifiedTweetCSVHeader;
try {
if (null == mapWriter) {
String persisterDIR = PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.DEFAULT_PERSISTER_FILE_PATH);
String fileToWrite = persisterDIR + collectionDIR + "/" + fileName;
logger.info(collectionDIR + ": Writing CSV file : " + fileToWrite);
mapWriter = getCSVMapWriter(fileToWrite);
// First write the header
if (runningHeader != null) mapWriter.writeHeader(runningHeader);
countWritten = 0;
}
} catch (Exception ex) {
logger.error(collectionDIR + ": Exception occured when creating a mapWriter instance");
logger.error("Exception",ex);
return null;
}
// Now write to CSV file using CsvMapWriter
logger.info("Received length of tweets List to write = " + tweetsList.size());
for (final ClassifiedTweet tweet : tweetsList) {
try {
//logger.info("Current header length :: Actual number of cells needed: " + runningHeader.length + "::" + getClassifedTweetHeaderSize(FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet.getNominalLabels().size()));
if (runningHeader.length < getClassifedTweetHeaderSize(FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet.getNominalLabels().size())) {
// reallocate header
runningHeader = resetClassifiedTweetHeader(ReadWriteCSV.ClassifiedTweetCSVHeader, ReadWriteCSV.FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet.getNominalLabels().size());
logger.info("Reallocated running header. After reallocation, Current header length :: Actual number of cells needed: "
+ runningHeader.length + "::" + getClassifedTweetHeaderSize(FIXED_CLASSIFIED_TWEET_HEADER_SIZE, tweet.getNominalLabels().size()));
}
final Map<String, Object> tweetToWrite = createClassifiedTweetCsvMap(runningHeader, tweet);
final CellProcessor[] processors = getClassifiedTweetVariableProcessors(runningHeader.length);
//logger.info("Going to write: " + tweetToWrite);
mapWriter.write(tweetToWrite, runningHeader, processors);
++countWritten;
} catch (SuperCsvCellProcessorException e) {
logger.error(collectionDIR + ": SuperCSV error. Offending tweet: " + tweet.getTweetID());
logger.error("Exception",e);
} catch (IOException e) {
logger.error(collectionDIR + "IOException in writing tweet: " + tweet.getTweetID());
}
}
logger.info("Actual number of tweets written so far: " + countWritten);
return mapWriter;
}
public String[] resetClassifiedTweetHeader(String[] header, int fixedHeaderSize, int numberOfClassifiers) {
String[] fullHeader = new String[getClassifedTweetHeaderSize(fixedHeaderSize, numberOfClassifiers)];
for (int i = 0;i < header.length;i++) {
fullHeader[i] = header[i];
}
int endPoint = header.length;
for (int i = 0;i < numberOfClassifiers;i++) {
fullHeader[endPoint] = new String("attributeName_" + (i+1));
fullHeader[endPoint+1] = new String("attributeCode_" + (i+1));
fullHeader[endPoint+2] = new String("labelName_" + (i+1));
fullHeader[endPoint + 3] = new String("labelDescription_" + (i+1));
fullHeader[endPoint + 4] = new String("labelCode_" + (i+1));
fullHeader[endPoint+5] = new String("confidence_" + (i+1));
fullHeader[endPoint+6] = new String("humanLabeled_" + (i+1));
endPoint += VARIABLE_HEADER_SIZE;
}
logger.info("Number of classifiers = " + numberOfClassifiers + ", headerSize = " + fullHeader.length);
return fullHeader;
}
public String[] setClassifiedTweetHeader(String[] header, int fixedHeaderSize, ClassifiedTweet tweet) {
int numberOfClassifiers = 0;
Map<String, Integer> classifierCount = getClassifierCountForCrisis(this.collectionCode);
if (classifierCount.containsKey("count") && classifierCount.get("count") == -1) {
// estimate based on current 'tweet'
numberOfClassifiers = getClassiferCountFromTweet(tweet);
logger.info("Estimated classifier count based on first tweet = " + numberOfClassifiers);
} else {
// set as per obtained value
numberOfClassifiers = classifierCount.get("count");
logger.info("Number of classifier count based on tagger-API data = " + numberOfClassifiers);
}
String[] fullHeader = new String[getClassifedTweetHeaderSize(fixedHeaderSize, numberOfClassifiers)];
for (int i = 0;i < header.length;i++) {
fullHeader[i] = header[i];
}
int endPoint = header.length;
for (int i = 0;i < numberOfClassifiers;i++) {
fullHeader[endPoint] = new String("attributeName_" + (i+1));
fullHeader[endPoint+1] = new String("attributeCode_" + (i+1));
fullHeader[endPoint+2] = new String("labelName_" + (i+1));
fullHeader[endPoint + 3] = new String("labelDescription_" + (i+1));
fullHeader[endPoint + 4] = new String("labelCode_" + (i+1));
fullHeader[endPoint+5] = new String("confidence_" + (i+1));
fullHeader[endPoint+6] = new String("humanLabeled_" + (i+1));
endPoint += VARIABLE_HEADER_SIZE;
}
logger.info("Number of classifiers = " + numberOfClassifiers + ", headerSize = " + fullHeader.length);
return fullHeader;
}
private Map<String, Integer> getClassifierCountForCrisis(String collectionCode) {
Map<String, Integer> jsonResponse = null;
Client client = ClientBuilder.newBuilder().register(JacksonFeature.class).build();
try {
/**
* Rest call to Tagger
*/
WebTarget webResource = client.target(PersisterConfigurator.getInstance().getProperty(PersisterConfigurationProperty.TAGGER_MAIN_URL) + "/crisis/attributes/count/" + collectionCode);
Response clientResponse = webResource.request(MediaType.APPLICATION_JSON).get();
jsonResponse = clientResponse.readEntity(Map.class);
logger.info("Tagger API returned: " + jsonResponse);
if (jsonResponse.containsKey("count") && jsonResponse.get("count") != null) {
if (jsonResponse.get("count") > 0) {
return jsonResponse;
} else {
jsonResponse.put("count", DEFAULT_CLASSIFIER_COUNT);
}
} else {
jsonResponse = new HashMap<String, Integer>();
jsonResponse.put("count", -1);
}
} catch (Exception e) {
logger.info("Unable to get classifiers count from Tagger API, will try based on first read tweet, jsonResponse = " + jsonResponse);
jsonResponse = new HashMap<String, Integer>();
jsonResponse.put("count", -1);
}
return jsonResponse;
}
public int getClassifedTweetHeaderSize(int fixedHeaderSize, ClassifiedTweet tweet) {
int numberOfClassifiers = 0;
if (tweet.getNominalLabels() != null) {
numberOfClassifiers = tweet.getNominalLabels().size();
logger.info("From nominal_labels size = " + tweet.getNominalLabels().size());
} else {
numberOfClassifiers = DEFAULT_CLASSIFIER_COUNT;
logger.info("From default value = " + DEFAULT_CLASSIFIER_COUNT);
}
return (fixedHeaderSize + numberOfClassifiers * VARIABLE_HEADER_SIZE); // number of nominal_label elements
}
public int getClassifedTweetHeaderSize(int fixedHeaderSize, int numberOfClassifiers) {
return (fixedHeaderSize + numberOfClassifiers * VARIABLE_HEADER_SIZE); // number of nominal_label elements
}
public int getClassiferCountFromTweet(ClassifiedTweet tweet) {
int numberOfClassifiers = 0;
if (tweet.getNominalLabels() != null) {
numberOfClassifiers = tweet.getNominalLabels().size();
logger.info("From nominal_labels size = " + tweet.getNominalLabels().size());
} else {
numberOfClassifiers = DEFAULT_CLASSIFIER_COUNT;
logger.info("From default value = " + DEFAULT_CLASSIFIER_COUNT);
}
return numberOfClassifiers; // number of nominal_labels
}
private Map<String, Object> createClassifiedTweetCsvMap(String[] header, ClassifiedTweet tweet) {
Map<String, Object> tweetToWrite = new HashMap<String, Object>();
int i = 0;
tweetToWrite.put(header[i], tweet.getTweetID());
tweetToWrite.put(header[i+1], tweet.getMessage());
tweetToWrite.put(header[i+2], tweet.getUserID());
tweetToWrite.put(header[i+3], tweet.getUserName());
tweetToWrite.put(header[i+4], tweet.getUserURL());
tweetToWrite.put(header[i+5], tweet.getCreatedAt());
tweetToWrite.put(header[i+6], tweet.getTweetURL());
tweetToWrite.put(header[i+7], tweet.getCrisisName());
i = i + ClassifiedTweetCSVHeader.length;
if (tweet.getNominalLabels() != null) {
//logger.info("[createClassifiedTweetCsvMap] tweet toString :" + tweet.toString());
//logger.info("[createClassifiedTweetCsvMap] tweet getNominalLabels size :" + tweet.getNominalLabels().size());
tweetToWrite = writeVariableAttributeData(header, i, tweetToWrite, tweet);
}
return tweetToWrite;
}
private Map<String, Object> writeVariableAttributeData(final String[] header, final int startIndex, Map<String, Object> tweetToWrite, final ClassifiedTweet tweet) {
int i = startIndex;
//logger.info("[writeVariableAttributeData] tweet getNominalLabels size :" + tweet.getNominalLabels().size());
//logger.info("[writeVariableAttributeData] startIndex :" + i);
for (int j = 0;j < tweet.getNominalLabels().size();j++) {
try{
NominalLabel nLabel = tweet.getNominalLabels().get(j);
if (nLabel != null) {
//logger.info("[writeVariableAttributeData] nLabel attribute_name :" + nLabel.attribute_name);
tweetToWrite.put(header[i], nLabel.attribute_name);
tweetToWrite.put(header[i+1], nLabel.attribute_code);
tweetToWrite.put(header[i+2], nLabel.label_name);
tweetToWrite.put(header[i+3], nLabel.label_description);
tweetToWrite.put(header[i+4], nLabel.label_code);
tweetToWrite.put(header[i+5], nLabel.confidence);
tweetToWrite.put(header[i+6], nLabel.from_human);
i += VARIABLE_HEADER_SIZE;
}
}
catch(Exception e){
logger.error("[writeVariableAttributeData] excpetion : " + e.getMessage() + " - " + e.getStackTrace());
}
}
return tweetToWrite;
}
private Map<String, Object> createClassifiedTweetIDCsvMap(String[] header, ClassifiedTweet tweet) {
Map<String, Object> tweetToWrite = new HashMap<String, Object>();
int i = 0;
tweetToWrite.put(header[i], tweet.getTweetID());
tweetToWrite.put(header[i+1], tweet.getCrisisName());
i = ClassifiedTweetIDCSVHeader.length;
if (tweet.getNominalLabels() != null) {
tweetToWrite = writeVariableAttributeData(header, i, tweetToWrite, tweet);
}
return tweetToWrite;
}
}