package storm.applications.bolt; import backtype.storm.tuple.Fields; import backtype.storm.tuple.Tuple; import backtype.storm.tuple.Values; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.KryoException; import com.esotericsoftware.kryo.io.Input; import java.io.FileInputStream; import java.io.FileNotFoundException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import storm.applications.constants.SpamFilterConstants.Conf; import static storm.applications.constants.SpamFilterConstants.DEFAULT_WORDMAP; import storm.applications.constants.SpamFilterConstants.Field; import storm.applications.constants.SpamFilterConstants.Stream; import storm.applications.model.spam.Word; import storm.applications.model.spam.WordMap; /** * * @author Maycon Viana Bordin <mayconbordin@gmail.com> */ public class WordProbabilityBolt extends AbstractBolt { private static final Logger LOG = LoggerFactory.getLogger(WordProbabilityBolt.class); private static Kryo kryoInstance; private WordMap words; @Override public Fields getDefaultFields() { return new Fields(Field.ID, Field.WORD, Field.NUM_WORDS); } @Override public void initialize() { String wordMapFile = config.getString(Conf.WORD_PROB_WORDMAP, null); boolean useDefault = config.getBoolean(Conf.WORD_PROB_WORDMAP_USE_DEFAULT, true); if (wordMapFile != null) { words = loadWordMap(wordMapFile); } if (words == null) { if (useDefault) { words = loadDefaultWordMap(); } else { words = new WordMap(); } } } @Override public void execute(Tuple input) { if (input.getSourceStreamId().equals(Stream.TRAINING)) { String word = input.getStringByField(Field.WORD); int count = input.getIntegerByField(Field.COUNT); boolean isSpam = input.getBooleanByField(Field.IS_SPAM); Word w = words.get(word); if (w == null) { w = new Word(word); words.put(word, w); } if (isSpam) { w.countBad(count); } else { w.countGood(count); } } else if (input.getSourceStreamId().equals(Stream.TRAINING_SUM)) { int spamCount = input.getIntegerByField(Field.SPAM_TOTAL); int hamCount = input.getIntegerByField(Field.HAM_TOTAL); words.incSpamTotal(spamCount); words.incHamTotal(hamCount); for (Word word : words.values()) { word.calcProbs(words.getSpamTotal(), words.getHamTotal()); } } else if (input.getSourceStreamId().equals(Stream.ANALYSIS)) { String id = input.getStringByField(Field.ID); String word = input.getStringByField(Field.WORD); int numWords = input.getIntegerByField(Field.NUM_WORDS); Word w = words.get(word); if (w == null) { w = new Word(word); w.setPSpam(0.4f); } collector.emit(input, new Values(id, w, numWords)); } collector.ack(input); } private static Kryo getKryoInstance() { if (kryoInstance == null) { kryoInstance = new Kryo(); kryoInstance.register(Word.class, new Word.WordSerializer()); kryoInstance.register(WordMap.class, new WordMap.WordMapSerializer()); } return kryoInstance; } private static WordMap loadDefaultWordMap() { try { Input input = new Input(WordProbabilityBolt.class.getResourceAsStream(DEFAULT_WORDMAP)); WordMap object = getKryoInstance().readObject(input, WordMap.class); input.close(); return object; } catch(KryoException ex) { LOG.error("Unable to deserialize the wordmap object", ex); } return null; } private static WordMap loadWordMap(String path) { try { Input input = new Input(new FileInputStream(path)); WordMap object = getKryoInstance().readObject(input, WordMap.class); input.close(); return object; } catch(FileNotFoundException ex) { LOG.error("The file path was not found", ex); } catch(KryoException ex) { LOG.error("Unable to deserialize the wordmap object", ex); } return null; } }