package com.maalaang.omtwitter.uima.consumer; import java.io.BufferedWriter; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.LinkedList; import java.util.Set; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.collection.CasConsumer_ImplBase; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceProcessException; import org.apache.uima.util.Level; import org.apache.uima.util.Logger; import com.maalaang.omtwitter.io.CollectionTextReader; import com.maalaang.omtwitter.io.OMTwitterCorpusFileReader; import com.maalaang.omtwitter.io.OMTwitterCorpusFileWriter; import com.maalaang.omtwitter.model.OMTweet; import com.maalaang.omtwitter.model.OMTweet_Impl; import com.maalaang.omtwitter.text.EmoticonProcessor; import com.maalaang.omtwitter.text.FilterCosineSimilarity; import com.maalaang.omtwitter.text.FilterStopword; import com.maalaang.omtwitter.text.FilterUserName; import com.maalaang.omtwitter.text.TweetFilterPipeline; import com.maalaang.omtwitter.uima.type.SentiWordNetAnnotation; import com.maalaang.omtwitter.uima.type.TweetAnnotation; import com.maalaang.omtwitter.uima.type.TwitterSentiCorpusAnnotation; public class TwitterSentimentCorpusWriteConsumer extends CasConsumer_ImplBase { private final static String PARAM_CORPUS_FILE = "corpusFile"; private static final String PARAM_CORPUS_FIELDS = "corpusFields"; private static final String PARAM_CORPUS_FIELDS_DELIM = "corpusFieldsDelim"; private static final String PARAM_STOPWORD_SET_FILE = "stopwordSetFile"; private static final String PARAM_FILTER_USER_NAME_WINDOW_SIZE = "filterUserNameWindowSize"; private static final String PARAM_FILTER_USER_NAME_POST_LIMIT = "filterUserNamePostLimit"; private static final String PARAM_FILTER_STOPWORD_THRESHOLD = "filterStopwordThreshold"; private static final String PARAM_FILTER_COSINE_SIMILARITY_WINDOW_SIZE = "filterCosineSimilarityWindowSize"; private static final String PARAM_FILTER_COSINE_SIMILARITY_THRESHOLD = "filterCosineSimilarityThreshold"; private static final String PARAM_SUBJECTIVITY_SCORE_WINDOW_SIZE = "subjectivityScoreWindowSize"; private static final String PARAM_SWN_SUBJECTIVITY_FACTOR = "swnSubjectivityFactor"; private static final String PARAM_TSC_SUBJECTIVITY_FACTOR= "tscSubjectivityFactor"; private static final String PARAM_SWN_SUBJECTIVITY_SCORE_WINDOW_START= "swnSubjectivityScoreWindowStart"; private static final String PARAM_TSC_SUBJECTIVITY_SCORE_WINDOW_START= "tscSubjectivityScoreWindowStart"; private SimpleDateFormat dateFormat = null; private OMTwitterCorpusFileWriter corpusWriter = null; private BufferedWriter bw = null; private EmoticonProcessor emoticonProcessor = null; private Logger logger = null; private TweetFilterPipeline filterPipe = null; private Set<String> stopwords = null; private int sbjScoreWindowSize = 0; private LinkedList<Double> swnSbjScoreWindow = null; private LinkedList<Double> tscSbjScoreWindow = null; private double swnSbjScoreWindowSum = 0.0; private double tscSbjScoreWindowSum = 0.0; private double swnSbjScoreWindowStart = 0.0; private double tscSbjScoreWindowStart = 0.0; private double swnSubjectivityFactor = 0.0; private double tscSubjectivityFactor = 0.0; public void initialize() throws ResourceInitializationException { super.initialize(); emoticonProcessor = new EmoticonProcessor(); logger = getLogger(); String fieldsNameStr = (String) getConfigParameterValue(PARAM_CORPUS_FIELDS); String fieldsDelim = (String) getConfigParameterValue(PARAM_CORPUS_FIELDS_DELIM); String[] fieldNames = fieldsNameStr.split("\\s+"); int[] fields = new int[fieldNames.length]; for (int i = 0; i < fieldNames.length; i++) { fields[i] = OMTwitterCorpusFileReader.fieldNameToId(fieldNames[i]); } try { corpusWriter = new OMTwitterCorpusFileWriter((String) getConfigParameterValue(PARAM_CORPUS_FILE), fieldsDelim, fields); stopwords = CollectionTextReader.readSetString((String) getConfigParameterValue(PARAM_STOPWORD_SET_FILE)); } catch (Exception e) { logger.log(Level.SEVERE, e.getMessage()); throw new ResourceInitializationException(e); } dateFormat = new SimpleDateFormat(OMTweet.DATE_FORMAT); filterPipe = new TweetFilterPipeline(); filterPipe.add(new FilterUserName((Integer) getConfigParameterValue(PARAM_FILTER_USER_NAME_WINDOW_SIZE), (Integer) getConfigParameterValue(PARAM_FILTER_USER_NAME_POST_LIMIT))); filterPipe.add(new FilterStopword(stopwords, (Integer) getConfigParameterValue(PARAM_FILTER_STOPWORD_THRESHOLD))); filterPipe.add(new FilterCosineSimilarity((Integer) getConfigParameterValue(PARAM_FILTER_COSINE_SIMILARITY_WINDOW_SIZE), (Float) getConfigParameterValue(PARAM_FILTER_COSINE_SIMILARITY_THRESHOLD))); filterPipe.initialize(); swnSbjScoreWindow = new LinkedList<Double>(); tscSbjScoreWindow = new LinkedList<Double>(); sbjScoreWindowSize = (Integer) getConfigParameterValue(PARAM_SUBJECTIVITY_SCORE_WINDOW_SIZE); swnSubjectivityFactor = (Float) getConfigParameterValue(PARAM_SWN_SUBJECTIVITY_FACTOR); tscSubjectivityFactor = (Float) getConfigParameterValue(PARAM_TSC_SUBJECTIVITY_FACTOR); swnSbjScoreWindowStart = (Float) getConfigParameterValue(PARAM_SWN_SUBJECTIVITY_SCORE_WINDOW_START) * swnSubjectivityFactor; tscSbjScoreWindowStart = (Float) getConfigParameterValue(PARAM_TSC_SUBJECTIVITY_SCORE_WINDOW_START) * tscSubjectivityFactor; } public void processCas(CAS aCAS) throws ResourceProcessException { try { JCas jcas = aCAS.getJCas(); TweetAnnotation tweetAnn = (TweetAnnotation) jcas.getAnnotationIndex(TweetAnnotation.type).iterator().next(); // calculate subjectivity scores double swnSbjScoreSum = 0.0; AnnotationIndex<Annotation> swnAnnIndex = jcas.getAnnotationIndex(SentiWordNetAnnotation.type); FSIterator<Annotation> swnAnnIter = swnAnnIndex.iterator(); while (swnAnnIter.hasNext()) { SentiWordNetAnnotation swnAnn = (SentiWordNetAnnotation) swnAnnIter.next(); swnSbjScoreSum += swnAnn.getPositiveScore() + swnAnn.getNegativeScore(); } int swnAnnSize = swnAnnIndex.size(); double swnSbjScore = swnAnnSize > 0 ? swnSbjScoreSum / swnAnnSize : -1.0; double tscSbjScoreSum = 0.0; AnnotationIndex<Annotation> tscAnnIndex = jcas.getAnnotationIndex(TwitterSentiCorpusAnnotation.type); FSIterator<Annotation> tscAnnIter = tscAnnIndex.iterator(); while (tscAnnIter.hasNext()) { TwitterSentiCorpusAnnotation tscAnn = (TwitterSentiCorpusAnnotation) tscAnnIter.next(); tscSbjScoreSum += tscAnn.getPositiveScore() + tscAnn.getNegativeScore(); } int tscAnnSize = tscAnnIndex.size(); double tscSbjScore = tscAnnSize > 0 ? tscSbjScoreSum / tscAnnSize : -1.0; logger.log(Level.FINE, "swnSbjScore=" + swnSbjScore + ", tscSbjScore=" + tscSbjScore); // do filtering Date date = null; try { date = dateFormat.parse(tweetAnn.getDate()); } catch (Exception e) { logger.log(Level.WARNING, "failed to parse a date - " + e.getMessage()); } OMTweet tweet = new OMTweet_Impl(tweetAnn.getId(), tweetAnn.getAuthor(), date, tweetAnn.getCoveredText(), tweetAnn.getQuery()); if (!filterPipe.check(tweet)) { maintainSbjScoreWindow(swnSbjScore, tscSbjScore); if (logger.isLoggable(Level.FINE)) { logger.log(Level.FINE, "[SKIP] filtered out - " + tweet); } return; } // write positive & negative tweets judged by smiley and frowny emoticons emoticonProcessor.updatePolarityByEmoticon(tweet); switch (tweet.getPolarity()) { case OMTweet.POLARITY_POSITIVE: case OMTweet.POLARITY_NEGATIVE: emoticonProcessor.removeEmoticon(tweet); corpusWriter.write(tweet); maintainSbjScoreWindow(swnSbjScore, tscSbjScore); if (logger.isLoggable(Level.FINE)) { logger.log(Level.FINE, "[" + tweet.getPolarityString() + "] write tweet - " + tweet); } return; } // write neutral tweets filtered based on subjectivity scores compared with the scores of the windows if (swnSbjScore > 0.0 && tscSbjScore > 0.0) { if (swnSbjScoreWindow.size() < sbjScoreWindowSize) { if (swnSbjScore < swnSbjScoreWindowStart && tscSbjScore < tscSbjScoreWindowStart) { tweet.setPolarity(OMTweet.POLARITY_NEUTRAL); corpusWriter.write(tweet); } } else if (swnSbjScore < swnSubjectivityFactor * swnSbjScoreWindowSum / swnSbjScoreWindow.size() && tscSbjScore < tscSubjectivityFactor * tscSbjScoreWindowSum / tscSbjScoreWindow.size()) { tweet.setPolarity(OMTweet.POLARITY_NEUTRAL); corpusWriter.write(tweet); } maintainSbjScoreWindow(swnSbjScore, tscSbjScore); } } catch (Exception e) { logger.log(Level.SEVERE, e.getMessage()); throw new ResourceProcessException(e); } } public void destroy() { filterPipe.close(); try { bw.close(); } catch (IOException e) { logger.log(Level.SEVERE, e.getMessage()); } super.destroy(); } /** * Maintain the windows for the scores of past tweets * @param swnSbjScore */ private void maintainSbjScoreWindow(double swnSbjScore, double tscSbjScore) { if (swnSbjScore > 0.0) { swnSbjScoreWindow.add(swnSbjScore); swnSbjScoreWindowSum += swnSbjScore; if (swnSbjScoreWindow.size() > sbjScoreWindowSize) { swnSbjScoreWindowSum -= swnSbjScoreWindow.remove(); } } if (tscSbjScore > 0.0) { tscSbjScoreWindow.add(tscSbjScore); tscSbjScoreWindowSum += tscSbjScore; if (tscSbjScoreWindow.size() > sbjScoreWindowSize) { tscSbjScoreWindowSum -= tscSbjScoreWindow.remove(); } } } }