package com.maalaang.omtwitter.tools;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectOutputStream;
import java.io.OutputStreamWriter;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import org.apache.log4j.Logger;
import com.maalaang.omtwitter.io.OMTwitterCorpusFile;
import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader;
import com.maalaang.omtwitter.io.OMTwitterReader;
import com.maalaang.omtwitter.model.OMTweet;
import com.maalaang.omtwitter.resource.TwitterSentiCorpusDictionary;
import com.maalaang.omtwitter.text.OMTweetToken;
import com.maalaang.omtwitter.text.OMTweetTokenizer;
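/**
 * Builds a sentiment dictionary from a polarity-annotated Twitter corpus.
 * Each token receives a positive and a negative score derived from how much
 * more often it occurs in tweets of that polarity than in the rest of the
 * corpus; scores are normalized by the corpus-wide maxima and written both as
 * a tab-separated text file and as a serialized
 * {@link TwitterSentiCorpusDictionary} object.
 *
 * <p>An illustrative properties file (the values shown are hypothetical; only
 * the key names are fixed by {@link #main(String[])}):
 * <pre>
 * tsc.file=corpus/tweets.tsv
 * senti.corpus.fields=id user polarity text
 * senti.corpus.fields.delim=\t
 * tsc.dic.text=dic/tsc-dic.txt
 * tsc.dic.object=dic/tsc-dic.obj
 * </pre>
 */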
public class BuildTwitterSentiCorpusDictionary {
    private Logger logger = null;
    // Indices into countArray (and, for POS/NEG, into maxScoreArray).
    private final static int INDEX_POS = 0;
    private final static int INDEX_NEG = 1;
    private final static int INDEX_NEU = 2;
    private final static int INDEX_ALL = 3;
    // Sort orders accepted by getFreqComparator().
    public final static int SORT_BY_POS_SCORE = 0;
    public final static int SORT_BY_NEG_SCORE = 1;
    public final static int SORT_BY_LEXICAL = 2;
    private OMTweetTokenizer tweetTokenizer = null;

    /**
     * Builds the dictionary text file and then its serialized object form.
     *
     * @param args args[0] is the path to a UTF-8 properties file supplying the
     *             keys tsc.file, senti.corpus.fields, senti.corpus.fields.delim,
     *             tsc.dic.text, and tsc.dic.object
     */
    public static void main(String[] args) {
        try {
            BuildTwitterSentiCorpusDictionary builder = new BuildTwitterSentiCorpusDictionary();
            Properties prop = new Properties();
            prop.load(new InputStreamReader(new FileInputStream(args[0]), "UTF-8"));
            builder.buildDicFile(prop.getProperty("tsc.file"),
                    OMTwitterCorpusFile.fieldNameToId(prop.getProperty("senti.corpus.fields"), "\\s+"),
                    prop.getProperty("senti.corpus.fields.delim"),
                    prop.getProperty("tsc.dic.text"));
            builder.createObjectFile(prop.getProperty("tsc.dic.text"), prop.getProperty("tsc.dic.object"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public BuildTwitterSentiCorpusDictionary() {
        logger = Logger.getLogger(getClass());
        tweetTokenizer = new OMTweetTokenizer();
    }

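    /**
     * Tokenizes every tweet from the reader and accumulates per-token frequency
     * counts by polarity, while tallying corpus-wide occurrence totals into
     * countArray.
     */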
    private Map<String,TokenFreq> buildFreqMap(OMTwitterReader reader, int[] countArray) {
        HashMap<String,TokenFreq> map = new HashMap<String, TokenFreq>();
        while (reader.hasNext()) {
            OMTweet tweet = reader.next();
            int polarity = tweet.getPolarity();
            OMTweetToken[] tokenList = tweetTokenizer.tokenize(tweet.getText());
            for (OMTweetToken tok : tokenList) {
                // Frequencies are keyed on the normalized token text.
                String key = tok.getNormalizedText();
                TokenFreq tokenFreq = map.get(key);
                if (tokenFreq == null) {
                    tokenFreq = new TokenFreq();
                    map.put(key, tokenFreq);
                }
                // Count the token under the polarity of the tweet it came from.
                switch (polarity) {
                case OMTweet.POLARITY_POSITIVE:
                    tokenFreq.pos++;
                    countArray[INDEX_POS]++;
                    break;
                case OMTweet.POLARITY_NEGATIVE:
                    tokenFreq.neg++;
                    countArray[INDEX_NEG]++;
                    break;
                case OMTweet.POLARITY_NEUTRAL:
                    tokenFreq.neu++;
                    countArray[INDEX_NEU]++;
                    break;
                case OMTweet.POLARITY_NOT_SPECIFIED:
                    throw new IllegalStateException("corpus tweet has no polarity label");
                }
                countArray[INDEX_ALL]++;
            }
        }
        return map;
    }

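    /**
     * Reads the annotated corpus, computes positive and negative scores for every
     * token, and writes the dictionary as a lexically sorted UTF-8 text file.
     */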
    public void buildDicFile(String file, int[] fields, String fieldDelimiter, String out) throws IOException {
        OMTwitterCorpusFileReader reader = new OMTwitterCorpusFileReader(file, fieldDelimiter, fields);
        try {
            int[] countArray = new int[4];
            double[] maxScoreArray = new double[2];
            Map<String,TokenFreq> map = buildFreqMap(reader, countArray);
            // Raw scores are later normalized by these corpus-wide maxima.
            findMaxScore(map, countArray, maxScoreArray);
            writeDicFile(out, map, SORT_BY_LEXICAL, countArray, maxScoreArray);
        } finally {
            reader.close();
        }
    }

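    /**
     * Loads the dictionary text file produced by buildDicFile() and serializes
     * it as a TwitterSentiCorpusDictionary for faster loading at runtime.
     */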
    public void createObjectFile(String in, String out) throws IOException {
        logger.info("load Twitter Sentiment Corpus dictionary - " + in);
        TwitterSentiCorpusDictionary dic = new TwitterSentiCorpusDictionary();
        dic.load(in);
        logger.info("loaded");
        logger.info("write object file - " + out);
        ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(out));
        try {
            oos.writeObject(dic);
        } finally {
            oos.close();
        }
        logger.info("done");
    }

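    /**
     * Writes one line per token: the token, its normalized positive score, and
     * its normalized negative score, tab-separated with four decimal places.
     */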
    private void writeDicFile(String out, Map<String,TokenFreq> map, int sortFlag, int[] countArray, double[] maxScoreArray) throws IOException {
        // A TreeSet with the requested comparator yields entries in sorted order.
        TreeSet<Entry<String, TokenFreq>> sortedSet = new TreeSet<Map.Entry<String,TokenFreq>>(getFreqComparator(sortFlag, countArray));
        sortedSet.addAll(map.entrySet());
        BufferedWriter fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8"));
        try {
            for (Entry<String, TokenFreq> e : sortedSet) {
                TokenFreq f = e.getValue();
                fw.write(e.getKey());
                fw.write('\t');
                // Each score is normalized by the corresponding corpus-wide maximum.
                fw.write(String.format(Locale.ENGLISH, "%.4f", getPosScore(f.pos, f.neg, f.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]) / maxScoreArray[INDEX_POS]));
                fw.write('\t');
                fw.write(String.format(Locale.ENGLISH, "%.4f", getNegScore(f.pos, f.neg, f.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]) / maxScoreArray[INDEX_NEG]));
                fw.write('\n');
            }
        } finally {
            fw.close();
        }
    }

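    /**
     * Scans all tokens and records the maximum positive and negative scores in
     * maxScoreArray, so writeDicFile() can normalize every score into [0, 1].
     */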
    private void findMaxScore(Map<String,TokenFreq> map, int[] countArray, double[] maxScoreArray) {
        double maxPosScore = 0.0;
        double maxNegScore = 0.0;
        Set<Entry<String,TokenFreq>> set = map.entrySet();
        for (Entry<String,TokenFreq> e : set) {
            TokenFreq f = e.getValue();
            double posScore = getPosScore(f.pos, f.neg, f.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]);
            double negScore = getNegScore(f.pos, f.neg, f.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]);
            if (maxPosScore < posScore) {
                maxPosScore = posScore;
            }
            if (maxNegScore < negScore) {
                maxNegScore = negScore;
            }
        }
        maxScoreArray[INDEX_POS] = maxPosScore;
        maxScoreArray[INDEX_NEG] = maxNegScore;
    }

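    /**
     * Returns a comparator for one of the SORT_BY_* flags. Score-based orders are
     * descending; ties (and SORT_BY_LEXICAL) fall back to comparing the token
     * text, which keeps the comparator consistent for use in a TreeSet.
     */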
    private Comparator<Map.Entry<String,TokenFreq>> getFreqComparator(int sortByFlag, final int[] countArray) {
        switch (sortByFlag) {
        case SORT_BY_POS_SCORE:
            return new Comparator<Map.Entry<String,TokenFreq>>() {
                public int compare(Entry<String, TokenFreq> o1, Entry<String, TokenFreq> o2) {
                    TokenFreq f1 = o1.getValue();
                    TokenFreq f2 = o2.getValue();
                    double score1 = getPosScore(f1.pos, f1.neg, f1.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]);
                    double score2 = getPosScore(f2.pos, f2.neg, f2.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]);
                    if (score1 > score2) {
                        return -1;
                    } else if (score1 < score2) {
                        return 1;
                    } else {
                        return o1.getKey().compareTo(o2.getKey());
                    }
                }
            };
        case SORT_BY_NEG_SCORE:
            return new Comparator<Map.Entry<String,TokenFreq>>() {
                public int compare(Entry<String, TokenFreq> o1, Entry<String, TokenFreq> o2) {
                    TokenFreq f1 = o1.getValue();
                    TokenFreq f2 = o2.getValue();
                    double score1 = getNegScore(f1.pos, f1.neg, f1.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]);
                    double score2 = getNegScore(f2.pos, f2.neg, f2.neu, countArray[INDEX_POS], countArray[INDEX_NEG], countArray[INDEX_NEU]);
                    if (score1 > score2) {
                        return -1;
                    } else if (score1 < score2) {
                        return 1;
                    } else {
                        return o1.getKey().compareTo(o2.getKey());
                    }
                }
            };
        case SORT_BY_LEXICAL:
            return new Comparator<Map.Entry<String,TokenFreq>>() {
                public int compare(Entry<String, TokenFreq> o1, Entry<String, TokenFreq> o2) {
                    return o1.getKey().compareTo(o2.getKey());
                }
            };
        default:
            // Fail fast rather than returning null, which would NPE inside TreeSet.
            throw new IllegalArgumentException("unknown sort flag: " + sortByFlag);
        }
    }

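    /**
     * Positive score: the token's relative frequency among positive occurrences,
     * multiplied by the ratio of all non-positive occurrences to the token's own
     * non-positive occurrences. The additive constant 10 in the denominator
     * damps the ratio for tokens rarely seen outside positive tweets.
     */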
    public static double getPosScore(int posCnt, int negCnt, int neuCnt, int posTotalCnt, int negTotalCnt, int neuTotalCnt) {
        return ((double)posCnt / (double)posTotalCnt) * ((negTotalCnt + neuTotalCnt) / (double)(negCnt + neuCnt + 10));
    }

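    /**
     * Negative score: the mirror image of getPosScore(), favoring tokens that are
     * frequent among negative occurrences and rare elsewhere, with the same
     * additive damping constant of 10.
     */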
    public static double getNegScore(int posCnt, int negCnt, int neuCnt, int posTotalCnt, int negTotalCnt, int neuTotalCnt) {
        return ((double)negCnt / (double)negTotalCnt) * ((posTotalCnt + neuTotalCnt) / (double)(posCnt + neuCnt + 10));
    }

    /** Per-token occurrence counts in positive, negative, and neutral tweets. */
    private static class TokenFreq {
        public int pos = 0;
        public int neg = 0;
        public int neu = 0;
    }
}