package udacity.storm;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.testing.TestWordSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.HashMap;
import java.util.Map;

import com.google.common.base.Preconditions;

/**
 * A bolt that forwards the fields of an analysed tweet and appends two emoticon columns
 * ("matchedEmoticonScore" and "matchedEmoticon") to the outgoing tuple.
 *
 * <p>Emoticon scoring is currently switched off in {@link #execute(Tuple)}: the two emoticon
 * columns are always emitted as {@code 0} and {@code ""}. The scoring logic itself lives in
 * {@link #getScoreIfEmoticonPresent(String)} and can be called independently.
 */
public class InfoBolt extends BaseRichBolt {

  // To output tuples from this bolt to the next stage bolts, if any
  private OutputCollector collector;

  // --- Regex building blocks for ASCII emoticons (":)", ":(", "<3", ...) -----------------

  private static final String SPACE_EXCEPTIONS = "\\n\\r";
  public static final String SPACE_CHAR_CLASS =
      "\\p{C}\\p{Z}&&[^" + SPACE_EXCEPTIONS + "\\p{Cs}]";
  public static final String SPACE_REGEX = "[" + SPACE_CHAR_CLASS + "]";
  public static final String PUNCTUATION_CHAR_CLASS = "\\p{P}\\p{M}\\p{S}" + SPACE_EXCEPTIONS;
  public static final String PUNCTUATION_REGEX = "[" + PUNCTUATION_CHAR_CLASS + "]";
  private static final String EMOTICON_DELIMITER = SPACE_REGEX + "|" + PUNCTUATION_REGEX;

  public static final Pattern SMILEY_REGEX_PATTERN = Pattern.compile(":[)DdpP]|:[ -]\\)|<3");
  public static final Pattern FROWNY_REGEX_PATTERN = Pattern.compile(":[(<]|:[ -]\\(");

  // One or more smileys/frownies delimited by start/end of input, whitespace or punctuation.
  public static final Pattern EMOTICON_REGEX_PATTERN =
      Pattern.compile(
          "(?<=^|" + EMOTICON_DELIMITER + ")("
              + SMILEY_REGEX_PATTERN.pattern() + "|" + FROWNY_REGEX_PATTERN.pattern()
              + ")+(?=$|" + EMOTICON_DELIMITER + ")");

  // NOTE(review): java.util.regex reads a pattern code point by code point, so the adjacent
  // escapes DBFF and DC00 below presumably fuse into a single supplementary code point and the
  // class then covers a far wider range than "surrogate halves" - verify before relying on it.
  // getScoreIfEmoticonPresent() deliberately does not depend on these two patterns; they are
  // kept unchanged because they are public constants.
  public static final Pattern EMOJI_REGEX = Pattern.compile("([\uD83C-\uDBFF\uDC00-\uDFFF])+");
  public static final Pattern EMOTICON_REGEX = Pattern.compile("[\uF301-\uF618]+");

  private static final Charset UTF_8 = Charset.forName("UTF-8");

  // --- Sentiment tables -------------------------------------------------------------------
  // Each entry is a hexadecimal Unicode code point, written either bare ("1f601") or with a
  // "U+" prefix ("U+270C"), in mixed letter case. Scores assigned: happy=5 ... unhappy=1.

  ArrayList<String> happy = new ArrayList<String>(Arrays.asList(
      "1f601", "1f602", "1f603", "1f604", "1f605", "1f606", "1f609", "1f60A", "1f60B", "1f60D",
      "1f618", "1f61A", "1f61C", "1f61D", "1f624", "1f632", "1f638", "1f639", "1f63A", "1f63B",
      "1f63D", "1f647", "1f64B", "1f64C", "1f64F", "U+270C", "U+2728", "U+2764", "U+263A",
      "U+2665", "U+3297", "1f31F", "1f44F", "1f48B", "1f48F", "1f491", "1f492", "1f493", "1f495",
      "1f496", "1f497", "1f498", "1f499", "1f49A", "1f49B", "1f49C", "1f49D", "1f49D", "1f49F",
      "1f4AA", "1f600", "1f607", "1f608", "1f60E", "1f617", "1f619", "1f61B", "1f31E"));

  ArrayList<String> mediumHappy = new ArrayList<String>(Arrays.asList(
      "1f60C", "1f60F", "1f633", "1f63C", "1f646", "U+2B50", "1f44D", "1f44C"));

  ArrayList<String> neutral = new ArrayList<String>(Arrays.asList(
      "1f614", "1f623", "U+2753", "U+2754", "1f610", "1f611", "1f62E", "1f636"));

  ArrayList<String> mediumUnhappy = new ArrayList<String>(Arrays.asList(
      "1f612", "1f613", "1f616", "1f61E", "1f625", "1f628", "1f62A", "1f62B", "1f637", "1f635",
      "1f63E", "U+26A0", "1f44E", "1f4A4", "1f615", "1f61F", "1f62F", "1f634"));

  ArrayList<String> unhappy = new ArrayList<String>(Arrays.asList(
      "1f620", "1f621", "1f622", "1f629", "1f62D", "1f630", "1f631", "1f63F", "1f640", "1f645",
      "1f64D", "1f64E", "U+274C", "U+274E", "1f494", "1f626", "1f627", "1f62C"));

  // Code point -> score ("1".."5"), parsed once from the tables above so that lookups are
  // independent of letter case and of the optional "U+" prefix. Must be declared after the
  // lists because field initialisers run in declaration order. Later mutations of the lists
  // are not reflected here.
  private final Map<Integer, String> scoreByCodePoint = buildScoreTable();

  /**
   * Stores the collector used to emit, ack and report errors.
   *
   * @param map the configuration passed in by Storm (unused)
   * @param topologyContext the topology context passed in by Storm (unused)
   * @param outputCollector collector kept for use in {@link #execute(Tuple)}
   */
  @Override
  public void prepare(
      Map map,
      TopologyContext topologyContext,
      OutputCollector outputCollector) {
    // save the collector for emitting tuples
    collector = outputCollector;
  }

  /**
   * Copies the tweet fields of the incoming tuple to a new tuple, adding placeholder emoticon
   * columns ({@code 0} and {@code ""}), and acknowledges the input.
   *
   * <p>Any exception raised while reading or emitting is reported through the collector and
   * the tuple is dropped; the input is acked in every case so that it is not left pending.
   *
   * @param tuple input carrying the fields "original-tweet", "tweet-word", "noun", "verb",
   *     "object", "county_id", "url" and the integer field "sentiment"
   */
  @Override
  public void execute(Tuple tuple) {
    try {
      String originalTweet = tuple.getStringByField("original-tweet");
      String word = tuple.getStringByField("tweet-word");
      String noun = tuple.getStringByField("noun");
      String verb = tuple.getStringByField("verb");
      String object = tuple.getStringByField("object");
      String countyId = tuple.getStringByField("county_id");
      String url = tuple.getStringByField("url");
      int sentiment = tuple.getIntegerByField("sentiment");

      // Emoticon scoring is disabled: the score and emoticon columns carry placeholders.
      // To enable it, call getScoreIfEmoticonPresent(originalTweet) and emit
      // Integer.parseInt(result.get("score")) and result.get("emoticon") instead.
      collector.emit(
          new Values(originalTweet, word, noun, verb, object, countyId, url, 0, "", sentiment));
    } catch (Exception e) {
      // Surface the failure to Storm instead of only printing a stack trace.
      collector.reportError(e);
    } finally {
      // BaseRichBolt does not ack automatically; every input must be acked or failed.
      collector.ack(tuple);
    }
  }

  /**
   * Finds the first character of {@code input} that appears in one of the sentiment tables
   * and returns it together with its score.
   *
   * <p>The input is scanned by Unicode code point, so emoji embedded anywhere in a longer text
   * are found, and both supplementary characters (surrogate pairs) and single-char symbols are
   * handled.
   *
   * @param input text to inspect; {@code null} is treated like text without emoticons
   * @return a new map with exactly two keys: "emoticon" (the matched character, or "" when
   *     none was found) and "score" ("5" happy, "4" medium happy, "3" neutral, "2" medium
   *     unhappy, "1" unhappy, or "0" when none was found)
   */
  public Map<String, String> getScoreIfEmoticonPresent(String input) {
    String matchedEmoticon = "";
    String matchedScore = "0";

    if (input != null) {
      int index = 0;
      while (index < input.length()) {
        int codePoint = input.codePointAt(index);
        String score = scoreByCodePoint.get(Integer.valueOf(codePoint));
        if (score != null) {
          matchedEmoticon = new String(Character.toChars(codePoint));
          matchedScore = score;
          break;
        }
        // Advance by one or two chars depending on whether this was a surrogate pair.
        index += Character.charCount(codePoint);
      }
    }

    Map<String, String> emoticonAndScore = new HashMap<String, String>();
    emoticonAndScore.put("emoticon", matchedEmoticon);
    emoticonAndScore.put("score", matchedScore);
    return emoticonAndScore;
  }

  /** Builds the code point -> score table from the five sentiment lists. */
  private Map<Integer, String> buildScoreTable() {
    Map<Integer, String> table = new HashMap<Integer, String>();
    // Same precedence as a happy -> unhappy if/else chain: the first list to claim a code
    // point keeps it.
    addScores(table, happy, "5");
    addScores(table, mediumHappy, "4");
    addScores(table, neutral, "3");
    addScores(table, mediumUnhappy, "2");
    addScores(table, unhappy, "1");
    return table;
  }

  /** Registers {@code score} for every code in {@code codes} that has no score yet. */
  private static void addScores(Map<Integer, String> table, Iterable<String> codes, String score) {
    for (String code : codes) {
      Integer codePoint = Integer.valueOf(parseCodePoint(code));
      if (!table.containsKey(codePoint)) {
        table.put(codePoint, score);
      }
    }
  }

  /** Parses a table entry such as "1f60A" or "U+270C" into its numeric code point. */
  private static int parseCodePoint(String code) {
    String hex = code.startsWith("U+") ? code.substring(2) : code;
    return Integer.parseInt(hex, 16);
  }

  /**
   * Declares the schema of the emitted tuple: the seven tweet fields read in
   * {@link #execute(Tuple)}, followed by "matchedEmoticonScore", "matchedEmoticon" and
   * "sentiment".
   *
   * @param outputFieldsDeclarer declarer supplied by Storm
   */
  @Override
  public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    outputFieldsDeclarer.declare(
        new Fields(
            "original-tweet",
            "tweet-word",
            "noun",
            "verb",
            "object",
            "county_id",
            "url",
            "matchedEmoticonScore",
            "matchedEmoticon",
            "sentiment"));
  }
}