package udacity.storm;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.testing.TestWordSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import udacity.storm.tools.CountiesLookup;

/**
 * A bolt that parses the tweet into words.
 *
 * <p>Input tuples must carry a string field {@code "tweet"} and an integer
 * field {@code "sentiment"}. The {@code "tweet"} value is a packed payload of
 * the form {@code <text>DELIMITER<latitude>,<longitude>DELIMITER<url>}.
 *
 * <p>For every well-formed input tuple the bolt emits one tuple with the
 * fields declared in {@link #declareOutputFields(OutputFieldsDeclarer)}.
 * Malformed payloads are logged and skipped instead of throwing out of
 * {@link #execute(Tuple)}.
 */
public class ParseTweetBolt extends BaseRichBolt {

    /** Separates the three sections (text, coordinates, url) of the packed "tweet" field. */
    private static final Pattern FIELD_DELIMITER = Pattern.compile("DELIMITER");

    /** Separates latitude from longitude inside the coordinates section. */
    private static final Pattern COORDINATE_DELIMITER = Pattern.compile(",");

    /** Delimiters used to split the tweet text into tokens. */
    private static final Pattern TOKEN_DELIMITERS = Pattern.compile("[ .,?!]+");

    /**
     * Tokens that are dropped from the filtered word string. Matching is exact
     * and case-sensitive. Held in a hash set so each lookup is constant time;
     * duplicates in the literal list below are harmless.
     */
    private static final HashSet<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
            "rt", "to", "me", "la", "on", "that", "que",
            "followers", "watch", "know", "not", "have", "like", "I'm", "new", "good", "do",
            "more", "es", "te", "followers", "Followers", "las", "you", "and", "de", "my", "is",
            "en", "una", "in", "for", "this", "go", "en", "all", "no", "don't", "up", "are",
            "http", "http:", "https", "https:", "http://", "https://", "with", "just", "your",
            "para", "want", "your", "you're", "really", "video", "it's", "when", "they", "their", "much",
            "would", "what", "them", "todo", "FOLLOW", "retweet", "RETWEET", "even", "right", "like",
            "bien", "Like", "will", "Will", "pero", "Pero", "can't", "were", "Can't", "Were", "TWITTER",
            "make", "take", "This", "from", "about", "como", "esta", "follows", "followed"));

    // To output tuples from this bolt to the count bolt
    OutputCollector collector;

    // Scratch buffer reused across execute() calls to build the filtered word string.
    StringBuilder result;

    //MaxentTagger tagger ;

    // Resolves a (latitude, longitude) pair to a county code.
    CountiesLookup clookup;

    /**
     * Stores the collector and creates the per-task helpers.
     *
     * @param map             the topology configuration (unused)
     * @param topologyContext the task context (unused)
     * @param outputCollector the collector used to emit and ack tuples
     */
    @Override
    public void prepare(
            Map map,
            TopologyContext topologyContext,
            OutputCollector outputCollector) {
        // save the output collector for emitting tuples
        collector = outputCollector;
        //tagger = new MaxentTagger("models/english-caseless-left3words-distsim.tagger");
        clookup = new CountiesLookup();
        result = new StringBuilder();
    }

    /**
     * Unpacks the "tweet" payload, resolves the county, filters skip words and
     * emits one output tuple. The input tuple is acked on every non-throwing
     * path because {@code BaseRichBolt} does not ack automatically.
     *
     * @param tuple input tuple with a "tweet" string field and a "sentiment" integer field
     */
    @Override
    public void execute(Tuple tuple) {
        // The original read the URL via positional index 0 and the rest via the
        // "tweet" field name; its own comment describes 'tweet' as the 1st
        // column, so the payload is read once by name and split once.
        String payload = tuple.getStringByField("tweet");
        if (payload == null) {
            skipMalformed(tuple, "missing 'tweet' value");
            return;
        }

        String[] parts = FIELD_DELIMITER.split(payload);
        if (parts.length < 3) {
            skipMalformed(tuple, "expected 3 DELIMITER-separated sections but got " + parts.length);
            return;
        }

        String tweet = parts[0];
        String url = parts[2];

        String[] coordinates = COORDINATE_DELIMITER.split(parts[1]);
        if (coordinates.length < 2) {
            skipMalformed(tuple, "coordinates are not 'latitude,longitude': " + parts[1]);
            return;
        }

        double latitude;
        double longitude;
        try {
            latitude = Double.parseDouble(coordinates[0]);
            longitude = Double.parseDouble(coordinates[1]);
        } catch (NumberFormatException e) {
            skipMalformed(tuple, "coordinates are not numeric: " + parts[1]);
            return;
        }

        String county_id = clookup.getCountyCodeByGeo(latitude, longitude);
        int sentiment = tuple.getIntegerByField("sentiment");

        String[] posTweet = PartOfSpeechTagger(tweet);
        //String [] posTweet = new String[]{"i" ,"got" ,"ebola"};

        // The tagger may return null when it cannot extract enough parts of
        // speech (see the disabled implementation); such tweets are not emitted.
        if (posTweet != null) {
            // Arrays.toString prints the contents; concatenating the array
            // directly would only print its identity hash.
            System.out.print("\tParseTweetBolt\tDEBUG:" + Arrays.toString(posTweet)
                    + ", URL: " + url + "\n");

            // NOTE(review): kept words are appended with no separator, exactly
            // as before, so the emitted "tweet-word" value is one concatenated
            // string. Confirm against downstream consumers before changing.
            result.setLength(0);
            for (String token : TOKEN_DELIMITERS.split(tweet)) {
                if (!SKIP_WORDS.contains(token)) {
                    result.append(token);
                }
            }

            collector.emit(new Values(
                    tweet, result.toString(),
                    posTweet[0], posTweet[1], posTweet[2],
                    county_id, url, sentiment));
        }

        collector.ack(tuple);
    }

    /**
     * Logs why a tuple is being dropped and acks it so that it is not left
     * pending or replayed.
     */
    private void skipMalformed(Tuple tuple, String reason) {
        System.out.print("\tParseTweetBolt\tDEBUG: skipping malformed tuple, " + reason + "\n");
        collector.ack(tuple);
    }

    /**
     * Extracts a (noun, verb, object) triple from a sentence.
     *
     * <p>The tagger-backed implementation is currently disabled, so this always
     * returns three empty strings. The disabled code is kept below; when
     * re-enabled it returns {@code null} if fewer than two parts are found.
     *
     * @param sentence the text to analyse (currently unused)
     * @return a three-element array {@code {noun, verb, object}}
     */
    public String[] PartOfSpeechTagger(String sentence) {
        // Named 'parts' so it does not shadow the 'result' StringBuilder field.
        String[] parts = new String[3];
        parts[0] = "";
        parts[1] = "";
        parts[2] = "";
        /*
        String tagged = tagger.tagString(sentence);
        System.out.println("\tParseTweetBolt\tDEBUG: tagged sentence is " + tagged);
        int seen = 0;
        if(tagged.contains("_NN ")) {
            parts[0] = tagged.substring(0, tagged.indexOf("_NN"));
            parts[0] = parts[0].substring(Math.max(0,parts[0].lastIndexOf(" ")));
            System.out.println("\tParseTweetBolt\tDEBUG: subject is " + parts[0]);
            seen++;
        }
        if(tagged.contains("_VB ")) {
            parts[1] = tagged.substring(0, tagged.indexOf("_VB"));
            parts[1] = parts[1].substring(Math.max(0,parts[1].lastIndexOf(" ")));
            System.out.println("\tParseTweetBolt\tDEBUG: verb is " + parts[1]);
            seen++;
        }
        if(tagged.contains("_NNP ")) {
            parts[2] = tagged.substring(0, tagged.indexOf("_NNP"));
            parts[2] = parts[2].substring(Math.max(0,parts[2].lastIndexOf(" ")));
            System.out.println("\tParseTweetBolt\tDEBUG: verb is " + parts[2]);
            seen++;
        }
        if(seen > 1 ) return parts;
        else return null;
        */
        return parts;
    }

    /**
     * Declares the eight-field schema of the tuples emitted by this bolt.
     *
     * @param declarer the declarer that receives the output field names
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // tell storm the schema of the output tuple for this bolt:
        // the original tweet text, the filtered words, the (noun, verb, object)
        // triple, the county id, the url and the sentiment
        declarer.declare(new Fields(
                "original-tweet", "tweet-word", "noun", "verb", "object",
                "county_id", "url", "sentiment"));
    }
}