package com.maalaang.omtwitter.tools;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.InvalidXMLException;
import com.maalaang.omtwitter.corpus.TwitterCorpusStat;
import com.maalaang.omtwitter.io.LogSystemStream;
import com.maalaang.omtwitter.io.OMTwitterCorpusFile;
import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader;
import com.maalaang.omtwitter.io.OMTwitterCorpusFileWriter;
import com.maalaang.omtwitter.model.OMTweet;
import com.maalaang.omtwitter.uima.pipeline.OMTwitterFixedFlowPipeline;
/**
 * Command-line tool that builds a sentiment-labelled Twitter corpus.
 *
 * <p>Driven by a properties file, it runs three steps in order (see {@link #run()}):
 * <ol>
 *   <li>run a UIMA pipeline over the raw "search" corpus and write a sentiment corpus,</li>
 *   <li>run the same pipeline over the raw "sample" corpus,</li>
 *   <li>down-sample both results so that positive, negative and neutral tweets are balanced,
 *       writing the first result to the final corpus file and appending the second to it,
 *       then log the polarity frequencies of the final file.</li>
 * </ol>
 */
public class ConstructTwitterSentimentCorpus {

    /** Slots of the three polarity classes in the working arrays of {@link #balanceSentiment}. */
    private static final int POSITIVE = 0;
    private static final int NEGATIVE = 1;
    private static final int NEUTRAL = 2;
    private static final int CLASS_COUNT = 3;

    /** Component names shared by both corpus pipelines. */
    private static final String READER = "TwitterCorpusReader";
    private static final String SWN_ANNOTATOR = "SentiWordNetAnnotator";
    private static final String TSC_ANNOTATOR = "TwitterSentimentScoreAnnotator";
    private static final String CONSUMER = "TwitterSentimentCorpusWriteConsumer";

    private final Properties prop;
    private final Logger logger;

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the path of a UTF-8 encoded properties file
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: ConstructTwitterSentimentCorpus <properties-file>");
            return;
        }
        try {
            Properties prop = new Properties();
            FileInputStream in = new FileInputStream(args[0]);
            try {
                prop.load(new InputStreamReader(in, "UTF-8"));
            } finally {
                // Close the underlying stream even if loading fails.
                in.close();
            }
            LogSystemStream.redirectErrToLog(Level.ERROR);
            ConstructTwitterSentimentCorpus con = new ConstructTwitterSentimentCorpus(prop);
            con.run();
        } catch (Exception e) {
            // NOTE(review): stderr is presumably routed to the log by redirectErrToLog above,
            // so the stack trace is kept on stderr rather than sent to a logger directly.
            e.printStackTrace();
        }
    }

    /**
     * @param prop configuration holding the input/output file names and filter settings
     */
    public ConstructTwitterSentimentCorpus(Properties prop) {
        this.prop = prop;
        this.logger = Logger.getLogger(getClass());
    }

    /**
     * Builds the search corpus, the sample corpus, and finally the balanced corpus.
     *
     * @throws Exception if any of the three steps fails
     */
    public void run() throws Exception {
        constructSentiCorpusFromSearchCorpus();
        constructSentiCorpusFromSampleCorpus();
        constructBalancedSentiCorpus();
    }

    /** Runs the sentiment pipeline over the raw search corpus ({@code *.search.*} properties). */
    private void constructSentiCorpusFromSearchCorpus() throws InvalidXMLException, IOException, ResourceConfigurationException, ResourceInitializationException {
        runSentiCorpusPipeline("search", "ID AUTHOR DATE QUERY TEXT", "ConstructSentiCorpusFromSearchCorpus.xml");
    }

    /** Runs the sentiment pipeline over the raw sample corpus ({@code *.sample.*} properties). */
    private void constructSentiCorpusFromSampleCorpus() throws InvalidXMLException, IOException, ResourceConfigurationException, ResourceInitializationException {
        runSentiCorpusPipeline("sample", "ID AUTHOR DATE TEXT", "constructSentiCorpusFromSampleCorpus.xml");
    }

    /**
     * Configures and runs the reader / annotators / write-consumer pipeline for one raw corpus.
     *
     * @param source       property-key segment selecting the corpus, {@code "search"} or {@code "sample"}
     * @param readerFields value of the reader's {@code fields} parameter
     * @param pipelineFile second argument handed to {@code pipeline.run}
     *                     (presumably a descriptor file name - confirm against the pipeline class)
     * @throws IllegalArgumentException if a numeric filter property is missing or malformed
     */
    private void runSentiCorpusPipeline(String source, String readerFields, String pipelineFile) throws InvalidXMLException, IOException, ResourceConfigurationException, ResourceInitializationException {
        String keyPrefix = "senti.corpus." + source + ".";
        OMTwitterFixedFlowPipeline pipeline = new OMTwitterFixedFlowPipeline();

        pipeline.setReader(READER, "com/maalaang/omtwitter/uima/reader/uima-twitter-corpus-reader.xml");
        pipeline.setReaderParameter(READER, "twitterCorpusFile", prop.getProperty("raw.corpus." + source + ".file"));
        pipeline.setReaderParameter(READER, "fields", readerFields);
        pipeline.setReaderParameter(READER, "fieldsDelimiter", "\\t");

        pipeline.addAnnotator("StanfordPosAnnotator", "com/maalaang/omtwitter/uima/annotator/uima-stanford-pos-annotator.xml");
        pipeline.addAnnotator("SnowballStemAnnotator", "com/maalaang/omtwitter/uima/annotator/uima-snowball-stem-annotator.xml");

        pipeline.addAnnotator(SWN_ANNOTATOR, "com/maalaang/omtwitter/uima/annotator/uima-sentiment-score-annotator.xml");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "sentiScoreDicObjectFile", "resource/generated/sentiwordnet/SentiWordNet_3.0.0_20100908.stem.average.dic.object");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "maxWindowSize", 5);
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "useStemToFindDic", Boolean.TRUE);
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "usePosToFindDic", Boolean.TRUE);
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "posTagset", "BROWN_CORPUS");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "annotationTypeName", "com.maalaang.omtwitter.uima.type.SentiWordNetAnnotation");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "featureNameId", "id");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "featureNamePositiveScore", "positiveScore");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "featureNameNegativeScore", "negativeScore");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "featureNameSubjectiveScore", "subjectiveScore");
        pipeline.setAnnotatorParameter(SWN_ANNOTATOR, "featureNameObjectiveScore", "objectiveScore");

        pipeline.addAnnotator(TSC_ANNOTATOR, "com/maalaang/omtwitter/uima/annotator/uima-twitter-sentiment-score-annotator.xml");
        pipeline.setAnnotatorParameter(TSC_ANNOTATOR, "sentiScoreDicObjectFile", "resource/generated/senti_corpus/mobile_devices_20120426.tweet.senti.smiley.removed.merged.neutral.added.dic.object");

        pipeline.addConsumer(CONSUMER, "com/maalaang/omtwitter/uima/consumer/uima-twitter-sentiment-corpus-write-consumer.xml");
        pipeline.setConsumerParameter(CONSUMER, "corpusFile", prop.getProperty("senti.corpus.file." + source));
        pipeline.setConsumerParameter(CONSUMER, "corpusFields", prop.getProperty(keyPrefix + "fields"));
        pipeline.setConsumerParameter(CONSUMER, "corpusFieldsDelim", prop.getProperty(keyPrefix + "fields.delim"));
        pipeline.setConsumerParameter(CONSUMER, "stopwordSetFile", prop.getProperty("stopword.set.file"));
        pipeline.setConsumerParameter(CONSUMER, "filterUserNameWindowSize", intProperty(keyPrefix + "filter.user.name.window.size"));
        pipeline.setConsumerParameter(CONSUMER, "filterUserNamePostLimit", intProperty(keyPrefix + "filter.user.name.post.limit"));
        pipeline.setConsumerParameter(CONSUMER, "filterStopwordThreshold", intProperty(keyPrefix + "filter.stopword.threshold"));
        pipeline.setConsumerParameter(CONSUMER, "filterCosineSimilarityWindowSize", intProperty(keyPrefix + "cosine.similarity.window.size"));
        pipeline.setConsumerParameter(CONSUMER, "filterCosineSimilarityThreshold", floatProperty(keyPrefix + "cosine.similarity.threshold"));
        pipeline.setConsumerParameter(CONSUMER, "subjectivityScoreWindowSize", intProperty(keyPrefix + "subjectivity.score.window.size"));
        pipeline.setConsumerParameter(CONSUMER, "swnSubjectivityFactor", floatProperty(keyPrefix + "swn.subjectivity.factor"));
        pipeline.setConsumerParameter(CONSUMER, "tscSubjectivityFactor", floatProperty(keyPrefix + "tsc.subjectivity.factor"));
        pipeline.setConsumerParameter(CONSUMER, "swnSubjectivityScoreWindowStart", floatProperty(keyPrefix + "swn.subjectivity.score.window.start"));
        pipeline.setConsumerParameter(CONSUMER, "tscSubjectivityScoreWindowStart", floatProperty(keyPrefix + "tsc.subjectivity.score.window.start"));

        pipeline.run(true, pipelineFile);
    }

    /**
     * Returns the trimmed value of a property that must be present.
     *
     * @throws IllegalArgumentException naming the key if the property is absent
     */
    private String requiredProperty(String key) {
        String value = prop.getProperty(key);
        if (value == null) {
            throw new IllegalArgumentException("Missing required property: " + key);
        }
        return value.trim();
    }

    /** Reads a required property as an {@code int}. */
    private int intProperty(String key) {
        return Integer.parseInt(requiredProperty(key));
    }

    /** Reads a required property as a {@code float}. */
    private float floatProperty(String key) {
        return Float.parseFloat(requiredProperty(key));
    }

    /** Resolves the whitespace-separated field names stored under {@code key} to field ids. */
    private int[] fieldIds(String key) {
        return OMTwitterCorpusFile.fieldNameToId(prop.getProperty(key), "\\s+");
    }

    /**
     * Balances the search and sample sentiment corpora into the final corpus file
     * ({@code senti.corpus.file}) and logs the resulting polarity frequencies.
     * The search corpus overwrites the target; the sample corpus is appended to it.
     */
    private void constructBalancedSentiCorpus() throws IOException {
        String targetFile = prop.getProperty("senti.corpus.file");
        String targetDelim = prop.getProperty("senti.corpus.fields.delim");

        balanceSentiment(prop.getProperty("senti.corpus.file.search"), prop.getProperty("senti.corpus.search.fields.delim"), fieldIds("senti.corpus.search.fields"),
                targetFile, targetDelim, fieldIds("senti.corpus.fields"), false);
        balanceSentiment(prop.getProperty("senti.corpus.file.sample"), prop.getProperty("senti.corpus.sample.fields.delim"), fieldIds("senti.corpus.sample.fields"),
                targetFile, targetDelim, fieldIds("senti.corpus.fields"), true);
        printCorpusStat(targetFile, targetDelim, fieldIds("senti.corpus.fields"));
    }

    /**
     * Copies tweets from {@code file1} to {@code file2}, randomly dropping tweets so that
     * the number of positive tweets equals the number of negative tweets and the number of
     * neutral tweets equals their sum.
     *
     * <p>Tweets whose polarity is none of positive, negative or neutral are not written.
     *
     * @param file1       input corpus file
     * @param fieldDelim1 field delimiter of the input corpus
     * @param fields1     field ids of the input corpus
     * @param file2       output corpus file
     * @param fieldDelim2 field delimiter of the output corpus
     * @param fields2     field ids of the output corpus
     * @param append      whether to append to {@code file2} instead of overwriting it
     * @throws IOException if reading or writing a corpus file fails
     */
    private void balanceSentiment(String file1, String fieldDelim1, int[] fields1, String file2, String fieldDelim2, int[] fields2, boolean append) throws IOException {
        Map<Integer,Integer> sentiFreq = TwitterCorpusStat.sentimentFreq(file1, fieldDelim1, fields1);
        int[] freq = new int[CLASS_COUNT];
        freq[POSITIVE] = frequencyOf(sentiFreq, OMTweet.POLARITY_POSITIVE);
        freq[NEGATIVE] = frequencyOf(sentiFreq, OMTweet.POLARITY_NEGATIVE);
        freq[NEUTRAL] = frequencyOf(sentiFreq, OMTweet.POLARITY_NEUTRAL);

        // indices[c] == null means "keep every tweet of class c"; otherwise it is the sorted
        // list of per-class ordinals (0-based, in file order) that are kept.
        int[][] indices = new int[CLASS_COUNT][];

        // If even the balanced subjective set (2 * min) would outnumber the neutral tweets,
        // each polarity has to give up sbjDiff more tweets. Round up so that the subjective
        // total can never exceed the neutral count by one when the surplus is odd.
        int balancedSubjective = Math.min(freq[POSITIVE], freq[NEGATIVE]) * 2;
        int sbjDiff = 0;
        if (balancedSubjective > freq[NEUTRAL]) {
            sbjDiff = (balancedSubjective - freq[NEUTRAL] + 1) / 2;
        }

        if (freq[POSITIVE] != freq[NEGATIVE] || sbjDiff > 0) {
            // senti1 is the larger polarity class, senti2 the smaller (or equal) one.
            int senti1 = POSITIVE;
            int senti2 = NEGATIVE;
            if (freq[POSITIVE] < freq[NEGATIVE]) {
                senti1 = NEGATIVE;
                senti2 = POSITIVE;
            }
            int target = freq[senti2] - sbjDiff;
            indices[senti1] = randomIndices(freq[senti1], target, System.currentTimeMillis());
            freq[senti1] = target;
            if (sbjDiff > 0) {
                indices[senti2] = randomIndices(freq[senti2], target, System.currentTimeMillis());
                freq[senti2] = target;
            }
        }

        int sbj = freq[POSITIVE] + freq[NEGATIVE];
        if (sbj < freq[NEUTRAL]) {
            indices[NEUTRAL] = randomIndices(freq[NEUTRAL], sbj, System.currentTimeMillis());
            freq[NEUTRAL] = sbj;
        } else if (sbj > freq[NEUTRAL]) {
            // Cannot happen given the sbjDiff computation above; kept as an invariant check.
            throw new IllegalStateException("subjective tweets (" + sbj + ") exceed neutral tweets (" + freq[NEUTRAL] + ")");
        }

        int[] idx = new int[CLASS_COUNT];     // ordinal of the current tweet within its class
        int[] cursor = new int[CLASS_COUNT];  // position in indices[c] of the next ordinal to keep
        int skipped = 0;

        OMTwitterCorpusFileReader reader = new OMTwitterCorpusFileReader(file1, fieldDelim1, fields1);
        try {
            OMTwitterCorpusFileWriter writer = new OMTwitterCorpusFileWriter(file2, fieldDelim2, fields2, append);
            try {
                while (reader.hasNext()) {
                    OMTweet tweet = reader.next();
                    int senti;
                    switch (tweet.getPolarity()) {
                    case OMTweet.POLARITY_POSITIVE:
                        senti = POSITIVE;
                        break;
                    case OMTweet.POLARITY_NEGATIVE:
                        senti = NEGATIVE;
                        break;
                    case OMTweet.POLARITY_NEUTRAL:
                        senti = NEUTRAL;
                        break;
                    default:
                        // Not one of the three balanced classes: do not let it be counted
                        // against whichever class the previous tweet happened to have.
                        skipped++;
                        continue;
                    }
                    if (indices[senti] == null) {
                        writer.write(tweet);
                    } else if (cursor[senti] < indices[senti].length && indices[senti][cursor[senti]] == idx[senti]) {
                        writer.write(tweet);
                        cursor[senti]++;
                    }
                    idx[senti]++;
                }
            } finally {
                writer.close();
            }
        } finally {
            reader.close();
        }

        if (skipped > 0) {
            logger.warn("Skipped " + skipped + " tweets with a polarity other than positive/negative/neutral in " + file1);
        }
    }

    /** Returns the count stored for {@code polarity}, or 0 when the map has no entry for it. */
    private static int frequencyOf(Map<Integer,Integer> freq, int polarity) {
        Integer count = freq.get(Integer.valueOf(polarity));
        return count == null ? 0 : count.intValue();
    }

    /**
     * Picks {@code size2} distinct values from {@code 0 .. size1-1} uniformly at random.
     *
     * @param size1 size of the range to draw from
     * @param size2 number of values to draw; must satisfy {@code 0 <= size2 < size1}
     * @param seed  seed of the random generator
     * @return the drawn values in ascending order
     * @throws IllegalArgumentException if {@code size2} is negative or not smaller than {@code size1}
     */
    private int[] randomIndices(int size1, int size2, long seed) {
        if (size2 < 0 || size1 <= size2) {
            throw new IllegalArgumentException("cannot choose " + size2 + " of " + size1 + " indices");
        }
        List<Integer> indices = new ArrayList<Integer>(size1);
        for (int i = 0; i < size1; i++) {
            indices.add(Integer.valueOf(i));
        }
        Collections.shuffle(indices, new Random(seed));

        // The consumer walks the corpus in file order, so the kept ordinals must be ascending.
        List<Integer> picked = indices.subList(0, size2);
        Collections.sort(picked);

        int[] chosen = new int[size2];
        for (int i = 0; i < size2; i++) {
            chosen[i] = picked.get(i).intValue();
        }
        return chosen;
    }

    /**
     * Logs, at INFO level, the number of tweets per polarity found in a corpus file.
     *
     * @param corpusFile corpus file to analyse
     * @param fieldDelim field delimiter of the corpus
     * @param fields     field ids of the corpus
     */
    private void printCorpusStat(String corpusFile, String fieldDelim, int[] fields) throws UnsupportedEncodingException, FileNotFoundException {
        final String rule = "+-------------------------------------------------------------+";
        Map<Integer,Integer> freq = TwitterCorpusStat.sentimentFreq(corpusFile, fieldDelim, fields);
        logger.info(rule);
        logger.info(OMTweet.POLARITY_STR_POSITIVE + " tweets: " + freq.get(OMTweet.POLARITY_POSITIVE));
        logger.info(OMTweet.POLARITY_STR_NEGATIVE + " tweets: " + freq.get(OMTweet.POLARITY_NEGATIVE));
        logger.info(OMTweet.POLARITY_STR_NEUTRAL + " tweets: " + freq.get(OMTweet.POLARITY_NEUTRAL));
        logger.info(rule);
        logger.info(OMTweet.POLARITY_STR_SUBJECTIVE + " tweets: " + freq.get(OMTweet.POLARITY_SUBJECTIVE));
        logger.info(OMTweet.POLARITY_STR_OBJECTIVE + " tweets: " + freq.get(OMTweet.POLARITY_OBJECTIVE));
        logger.info(rule);
        logger.info(OMTweet.POLARITY_STR_NOT_SPECIFIED + " tweets: " + freq.get(OMTweet.POLARITY_NOT_SPECIFIED));
        logger.info(rule);
    }
}