package storm.applications.bolt; import backtype.storm.tuple.Fields; import backtype.storm.tuple.Tuple; import backtype.storm.tuple.Values; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.mutable.MutableInt; import storm.applications.constants.SpamFilterConstants.*; /** * * @author Maycon Viana Bordin <mayconbordin@gmail.com> */ public class TokenizerBolt extends AbstractBolt { private static final String splitregex = "\\W"; private static final Pattern wordregex = Pattern.compile("\\w+"); @Override public Map<String, Fields> getDefaultStreamFields() { Map<String, Fields> streams = new HashMap<>(); streams.put(Stream.TRAINING, new Fields(Field.WORD, Field.COUNT, Field.IS_SPAM)); streams.put(Stream.TRAINING_SUM, new Fields(Field.SPAM_TOTAL, Field.HAM_TOTAL)); streams.put(Stream.ANALYSIS, new Fields(Field.ID, Field.WORD, Field.NUM_WORDS)); return streams; } @Override public void execute(Tuple input) { String content = input.getStringByField(Field.MESSAGE); if (input.getSourceComponent().equals(Component.TRAINING_SPOUT)) { boolean isSpam = input.getBooleanByField(Field.IS_SPAM); Map<String, MutableInt> words = tokenize(content); int spamTotal = 0, hamTotal = 0; for (Map.Entry<String, MutableInt> entry : words.entrySet()) { String word = entry.getKey(); int count = entry.getValue().toInteger(); if (isSpam) { spamTotal += count; } else { hamTotal += count; } collector.emit(Stream.TRAINING, input, new Values(word, count, isSpam)); } collector.emit(Stream.TRAINING_SUM, input, new Values(spamTotal, hamTotal)); } else if (input.getSourceComponent().equals(Component.ANALYSIS_SPOUT)) { String id = input.getStringByField(Field.ID); Map<String, MutableInt> words = tokenize(content); for (Map.Entry<String, MutableInt> entry : words.entrySet()) { collector.emit(Stream.ANALYSIS, input, new Values(id, entry.getKey(), words.size())); } } collector.ack(input); } private Map<String, MutableInt> tokenize(String content) { String[] tokens = content.split(splitregex); Map<String, MutableInt> words = new HashMap<>(); for (String token : tokens) { String word = token.toLowerCase(); Matcher m = wordregex.matcher(word); if (m.matches()) { MutableInt count = words.get(word); if (count == null) { words.put(word, new MutableInt()); } else { count.increment(); } } } return words; } }