package storm.applications.model.spam; import com.esotericsoftware.kryo.Kryo; import com.esotericsoftware.kryo.Serializer; import com.esotericsoftware.kryo.io.Input; import com.esotericsoftware.kryo.io.Output; import java.io.Serializable; public class Word implements Serializable { private static final long serialVersionUID = 1667802979041340740L; private String word; // The String itself private int countBad; // The total times it appears in "bad" messages private int countGood; // The total times it appears in "good" messages private float rBad; // bad count / total bad words private float rGood; // good count / total good words private float pSpam; // probability this word is Spam public Word() { } // Create a word, initialize all vars to 0 public Word(String s) { word = s; countBad = 0; countGood = 0; rBad = 0.0f; rGood = 0.0f; pSpam = 0.0f; } private Word(String word, int countBad, int countGood, float rBad, float rGood, float pSpam) { this.word = word; this.countBad = countBad; this.countGood = countGood; this.rBad = rBad; this.rGood = rGood; this.pSpam = pSpam; } // Increment bad counter public void countBad() { countBad++; } // Increment good counter public void countGood() { countGood++; } public void countBad(int increment) { countBad += increment; } // Increment good counter public void countGood(int increment) { countGood += increment; } public void calcProbs(long badTotal, long goodTotal) { calcBadProb(badTotal); calcGoodProb(goodTotal); finalizeProb(); } // Computer how often this word is bad public void calcBadProb(long total) { if (total > 0) rBad = countBad / (float) total; } // Computer how often this word is good public void calcGoodProb(long total) { if (total > 0) rGood = 2*countGood / (float) total; // multiply 2 to help fight against false positives (via Graham) } // Implement bayes rules to computer how likely this word is "spam" public void finalizeProb() { if (rGood + rBad > 0) pSpam = rBad / (rBad + rGood); if (pSpam < 0.01f) pSpam = 0.01f; else if (pSpam > 0.99f) pSpam = 0.99f; } // The "interesting" rating for a word is // How different from 0.5 it is public float interesting() { return Math.abs(0.5f - pSpam); } // Some getters and setters public float getPGood() { return rGood; } public float getPBad() { return rBad; } public float getPSpam() { return pSpam; } public void setPSpam(float f) { pSpam = f; } public String getWord() { return word; } public static class WordSerializer extends Serializer<Word> { @Override public void write (Kryo kryo, Output output, Word object) { output.writeString(object.word); output.writeInt(object.countBad); output.writeInt(object.countGood); output.writeFloat(object.rBad); output.writeFloat(object.rGood); output.writeFloat(object.pSpam); } @Override public Word read (Kryo kryo, Input input, Class<Word> type) { return new Word(input.readString(), input.readInt(), input.readInt(), input.readFloat(), input.readFloat(), input.readFloat()); } } @Override public String toString() { return "Word{" + "word=" + word + ", countBad=" + countBad + ", countGood=" + countGood + ", rBad=" + rBad + ", rGood=" + rGood + ", pSpam=" + pSpam + '}'; } }