package udacity.storm;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.testing.TestWordSpout;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import backtype.storm.utils.Utils;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * A bolt that splits each incoming tweet into tokens and emits the hashtags
 * it finds.
 *
 * <p>Input: the first column of the incoming tuple is read as the tweet text.
 * Output: one single-column tuple (field {@code "tweet-word"}) per token that
 * is longer than three characters, is not in the skip-word list, and starts
 * with {@code '#'}.
 */
public class ParseTweetBolt extends BaseRichBolt
{
    /** Token separator: one or more spaces or basic punctuation characters. */
    private static final Pattern DELIMITERS = Pattern.compile("[ .,?!]+");

    /**
     * Tokens that are never emitted (matched case-sensitively).
     *
     * <p>Held in a hash set built once for the class so the per-token lookup
     * does not allocate or scan linearly. Note that no entry currently starts
     * with {@code '#'}, so with the hashtag-only filter in place this list
     * does not exclude anything today; it is kept so that adding
     * {@code '#'}-prefixed entries takes effect without code changes.
     */
    private static final Set<String> SKIP_WORDS = new HashSet<String>(Arrays.asList(
        "rt", "to", "me", "la", "on", "that", "que",
        "followers", "watch", "know", "not", "have", "like", "I'm", "new", "good", "do",
        "more", "es", "te", "followers", "Followers", "las", "you", "and", "de", "my", "is",
        "en", "una", "in", "for", "this", "go", "en", "all", "no", "don't", "up", "are",
        "http", "http:", "https", "https:", "http://", "https://", "with", "just", "your",
        "para", "want", "your", "you're", "really", "video", "it's", "when", "they", "their", "much",
        "would", "what", "them", "todo", "FOLLOW", "retweet", "RETWEET", "even", "right", "like",
        "bien", "Like", "will", "Will", "pero", "Pero", "can't", "were", "Can't", "Were", "TWITTER",
        "make", "take", "This", "from", "about", "como", "esta", "follows", "followed"));

    // Collector used to emit tuples from this bolt; assigned in prepare().
    OutputCollector collector;

    /**
     * Stores the output collector so that {@link #execute(Tuple)} can emit.
     *
     * @param map             topology configuration (unused here)
     * @param topologyContext context of this task (unused here)
     * @param outputCollector collector used for emitting and acking tuples
     */
    @Override
    public void prepare(
        Map map,
        TopologyContext topologyContext,
        OutputCollector outputCollector)
    {
        collector = outputCollector;
    }

    /**
     * Tokenizes the tweet in column 0 and emits every qualifying hashtag,
     * then acks the input tuple.
     *
     * @param tuple incoming tuple whose first column holds the tweet text
     */
    @Override
    public void execute(Tuple tuple)
    {
        String tweet = tuple.getString(0);

        // A null column would otherwise throw from split(); treat it as a
        // tweet with no tokens.
        if (tweet != null) {
            for (String token : DELIMITERS.split(tweet)) {
                if (isEmittable(token)) {
                    collector.emit(new Values(token));
                }
            }
        }

        // BaseRichBolt does not ack automatically. Without this, a spout that
        // tracks its tuples would see every tweet time out.
        collector.ack(tuple);
    }

    /**
     * Decides whether a token should be emitted: it must be a hashtag, longer
     * than three characters (the leading '#' counts), and not a skip word.
     */
    private static boolean isEmittable(String token)
    {
        return token.length() > 3
            && token.startsWith("#")
            && !SKIP_WORDS.contains(token);
    }

    /**
     * Declares the output schema of this bolt: a single column named
     * {@code "tweet-word"}.
     *
     * @param declarer declarer used to register the output fields
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer)
    {
        declarer.declare(new Fields("tweet-word"));
    }
}