package storm.applications.bolt; import backtype.storm.tuple.Fields; import backtype.storm.tuple.Tuple; import backtype.storm.tuple.Values; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import storm.applications.constants.SpamFilterConstants.Conf; import storm.applications.constants.SpamFilterConstants.Field; import storm.applications.model.spam.Word; /** * * @author Maycon Viana Bordin <mayconbordin@gmail.com> */ public class BayesRuleBolt extends AbstractBolt { private double spamProbability; private Map<String, AnalysisSummary> analysisSummary; @Override public Fields getDefaultFields() { return new Fields(Field.ID, Field.SPAM_PROB, Field.IS_SPAM); } @Override public void initialize() { spamProbability = config.getDouble(Conf.BAYES_RULE_SPAM_PROB, 0.9d); analysisSummary = new HashMap<>(); } @Override public void execute(Tuple input) { String id = input.getStringByField(Field.ID); Word word = (Word) input.getValueByField(Field.WORD); int numWords = input.getIntegerByField(Field.NUM_WORDS); AnalysisSummary summary = analysisSummary.get(id); if (summary == null) { summary = new AnalysisSummary(); analysisSummary.put(id, summary); } summary.uniqueWords++; updateSummary(summary, word); if (summary.uniqueWords >= numWords) { // calculate bayes float pspam = bayes(summary); collector.emit(new Values(id, pspam, (pspam > spamProbability))); analysisSummary.remove(id); } collector.ack(input); } private float bayes(AnalysisSummary summary) { // Apply Bayes' rule (via Graham) float pposproduct = 1.0f; float pnegproduct = 1.0f; // For every word, multiply Spam probabilities ("Pspam") together // (As well as 1 - Pspam) for (Word w : summary) { pposproduct *= w.getPSpam(); pnegproduct *= (1.0f - w.getPSpam()); } // Apply formula return pposproduct / (pposproduct + pnegproduct); } private void updateSummary(AnalysisSummary summary, Word word) { int limit = 15; // If this list is empty, then add this word in! if (summary.isEmpty()) { summary.add(word); } // Otherwise, add it in sorted order by interesting level else { for (int j = 0; j < summary.size(); j++) { // For every word in the list already Word nw = summary.get(j); // If it's the same word, don't bother if (word.getWord().equals(nw.getWord())) { break; // If it's more interesting stick it in the list } else if (word.interesting() > nw.interesting()) { summary.add(j, word); break; } // If we get to the end, just tack it on there else if (j == summary.size()-1) { summary.add(word); } } } // If the list is bigger than the limit, delete entries // at the end (the more "interesting" ones are at the // start of the list while (summary.size() > limit) summary.remove(summary.size()-1); } private static class AnalysisSummary extends ArrayList<Word> { public int uniqueWords = 0; } }