package email; import java.io.*; import java.util.*; import edu.cmu.minorthird.text.*; import edu.cmu.minorthird.util.*; import edu.cmu.minorthird.classify.*; import edu.cmu.minorthird.classify.algorithms.linear.*; import java.util.regex.*; import java.math.*; import edu.cmu.minorthird.util.LineProcessingUtil; //just for comparison with paper results import edu.cmu.minorthird.util.gui.ViewerFrame; import edu.cmu.minorthird.classify.experiments.Expt; import edu.cmu.minorthird.classify.ClassifierLearner; import edu.cmu.minorthird.classify.experiments.Tester; import edu.cmu.minorthird.classify.experiments.Evaluation; import edu.cmu.minorthird.classify.*; import edu.cmu.minorthird.classify.algorithms.svm.*; import edu.cmu.minorthird.classify.algorithms.trees.*; /** * Classify an incoming email message as having or not a Signature File. * * It follows the description in "Learning to Extract Signature and Reply Lines from Email", * V.R.Carvalho and W.W.Cohen, CEAS (Conference of Email and Anti-Span), 2004 * * Uses the last 10 last lines of email message. * * @author Vitor R. Carvalho - May 2004 */ public class SigFileDetector { private BinaryClassifier classifier; // serialization stuff static public final long serialVersionUID = 1; public final int CURRENT_VERSION_NUMBER = 1; //--------------------- Constructors ----------------------------------------------------- public SigFileDetector() { try { //File file = new File("models/sigDetectionAdaBoostModel"); File file = new File("apps/email/models/VPsigDetectionModel"); //File file = new File("apps/email/models/AB99sigDetectionModel"); classifier = (BinaryClassifier)IOUtil.loadSerialized(file); } catch (Exception e) { e.printStackTrace(); } } public SigFileDetector(File file) { try { classifier = (BinaryClassifier)IOUtil.loadSerialized(file); } catch (Exception e) { e.printStackTrace(); } } //--------------------- Methods ----------------------------------------------------- /** * in case you decide you need a better model - * make sure to change the instance representation accordingly */ public void setClassifier(String newClassifier) { try { File file = new File(newClassifier); classifier = (BinaryClassifier)IOUtil.loadSerialized(file); } catch (Exception e) { e.printStackTrace(); } } /** * Detects if there is a sig in the email message. * @param email message as String * @return boolean - true, the msg has a sig. False, otherwise. */ public boolean hasSig(String wholeMessage){ // detect binary features on message and // returns a single instance representing this message SigDetectorByLine byLine = new SigDetectorByLine(); Instance instance = byLine.getInstance(wholeMessage); //apply the classifier to the instance boolean decision = (classifier.score(instance)<0)? false:true; return decision; } //--------------------- Inner Class ----------------------------------------------------- /** * Inner class to help extracting the features * from last line of message. */ public class SigDetectorByLine { //window for search of features - usually last tail_lines lines private final int tail_lines = 10; private int firstSearchLine = 0; private int fromLine = 0; private int lastSearchLine; private MutableInstance instance; public SigDetectorByLine(){ firstSearchLine = 0; fromLine = 0; instance = new MutableInstance(); } private MutableInstance getInstance(String wholeMessage) { clear(); String[] strArray = preProcessMailMessage(wholeMessage); MutableInstance inst = processMailFile(strArray); return inst; } //reset parameters, in case a the same object is called to several messages private void clear() { firstSearchLine = 0; fromLine = 0; instance = new MutableInstance(); } /** * Detects the presence of any of the features in the last tail_lines * lines of mail file * * @param the String[] with the message lines * @return the instance (set of features) representing the message * * OBS: DON'T change any of these feature functions - in case you do, * you'll need a new classifier trained on the new feature set. * */ private MutableInstance processMailFile(String[] arrayOfLines) { int ind; double temp = 0; if(lastSearchLine<=firstSearchLine){ throw new IllegalStateException("ERROR parsing message"); } for(int i=lastSearchLine; i>=firstSearchLine; i--){ ind = lastSearchLine - i;//line index, for instance, the 3rd to last line //list of features if(LineProcessingUtil.lineMatcher("^[\\s]*---*[\\s]*$", arrayOfLines[i])){ instance.addBinary( new Feature("sigMarker"+ind)); } if(LineProcessingUtil.lineMatcher("^[\\s|\\t]?[\\s|\\t]?---?[\\s|\\t]*$", arrayOfLines[i])){ instance.addBinary( new Feature("sigBeginMarker"+ind)); } if(LineProcessingUtil.lineMatcher("^[\\s|\\t]*([\\*]|#|[\\+]|[\\^]|-|[\\~]|[\\&]|[////]|[\\$]|_|[\\!]|[\\/]|[\\%]|[\\:]|[\\=]){10,}[\\s]*$", arrayOfLines[i])){ instance.addBinary( new Feature("otherMarkers"+ind)); } if(LineProcessingUtil.lineMatcher("Dept\\.|University|Corp\\.|Corporations?|College|Ave\\.|Laboratory|[D|d]isclaimer|Division|Professor|Laboratories|Institutes?|Services|Engineering|Director|Sciences?|Address|Manager|Fax|Office|Mobile|Phone|Street|St\\.|Avenue", arrayOfLines[i])){ instance.addBinary( new Feature("specWords"+ind)); } if(LineProcessingUtil.lineMatcher("[^(\\<|\\>)][\\w|\\+|\\.|\\_|\\-]+\\@[\\w|\\-|\\_|\\.]+\\.[a-zA-z]{2,5}[^(\\<|\\>)]", arrayOfLines[i])){ instance.addBinary( new Feature("emailA"+ind)); } if(LineProcessingUtil.lineMatcher("[^(\\<|\\>)][(\\w|\\+|\\_|\\-)]+\\@[(\\w|\\-|\\_)]+[\\.][a-zA-z]{2,5}", arrayOfLines[i])){ instance.addBinary( new Feature("emailB"+ind)); } if(LineProcessingUtil.lineMatcher("[\\s|\\t](http\\:\\/\\/)*(www|web|w3)*(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)*[\\w]+", arrayOfLines[i])){ instance.addBinary( new Feature("url"+ind)); } if(LineProcessingUtil.lineMatcher("(\\-?\\d)*\\d\\d\\s?\\-?\\s?\\d\\d\\d\\d", arrayOfLines[i])){ instance.addBinary( new Feature("phone"+ind)); } if(LineProcessingUtil.lineMatcher("[A-Z][a-z]+\\s\\s?[A-Z][\\.]?\\s\\s?[A-Z][a-z]+", arrayOfLines[i])){ instance.addBinary( new Feature("completeName"+ind)); } if(LineProcessingUtil.lineMatcher("\"$", arrayOfLines[i])){ instance.addBinary( new Feature("endQuote"+ind)); } if(LineProcessingUtil.lineMatcher("^[\\w|\\-]+\\:", arrayOfLines[i])){ instance.addBinary( new Feature("header"+ind)); } //vitor if (LineProcessingUtil.lineMatcher("^[\\s|\\t]*$",arrayOfLines[i])){ instance.addBinary( new Feature("BlankL"+ind)); } else{ instance.addBinary( new Feature("elseBlankL"+ind)); } int ddd = LineProcessingUtil.indentNumber(arrayOfLines[i]); if(ddd==1){instance.addBinary( new Feature("indentUni"+ind));} else if(ddd==2){instance.addBinary( new Feature("indentBi"+ind));} else if(ddd>=3){instance.addBinary( new Feature("indentTri"+ind));} temp = LineProcessingUtil.punctuationPercentage(arrayOfLines[i]); if (temp>0.20) instance.addBinary( new Feature("punctPerc20"+ind)); else instance.addBinary( new Feature("punctPerc0"+ind)); if (temp>0.50) instance.addBinary( new Feature("punctPerc50"+ind)); if (temp>0.75) instance.addBinary( new Feature("punctPerc75"+ind)); if (temp>0.90) instance.addBinary( new Feature("punctPerc90"+ind)); //2 lines starting with same punctuation symbol if((i>0)&&(LineProcessingUtil.startWithSameInitialPunctCharacters(arrayOfLines[i], arrayOfLines[i-1]))){ instance.addBinary( new Feature("prevsicline"+ind)); } //extract proper name from FROM field and match it if (fromLine > 0){ if(SigFilePredictor.detectFromName(arrayOfLines[fromLine], arrayOfLines[i])){ instance.addBinary( new Feature("fromL"+ind)); } } } return instance; } /** * Splits the message in lines and calculates the last line and the * first line to be searched. Also detects the "From:" line and excludes header. * * @param incoming message as a String * @return String[] with message lines. */ private String[] preProcessMailMessage(String wholeMessage) { //spliting in lines - see in the readFile method that we insert \n //note that, if last lines of file are blank lines(not even spaces), they are //not split in new elements of the array arrayOfLines //however, if a line has \\s spaces, then all lines before it are split into the array! String[] arrayOfLines = wholeMessage.split("\n"); int arraylength = arrayOfLines.length; //calculating the lastSearchLine //Let's disregard the blank lines at the end of file int numberOfBlankLines = 0; int temp1 = arraylength -1; while(LineProcessingUtil.lineMatcher("^[\\s]*$", arrayOfLines[temp1])){ numberOfBlankLines++; temp1--; } lastSearchLine = arraylength - 1 - numberOfBlankLines; //calculating the firstSearchLine //TBD: need an intelligent way to exclude header of email- a </body> tag would be the proper solution //for the moment, find the "From:" line and use the next line as the end of header //Even better than the From: solution: any line that starts with "[\\w]+:" int tempfromLine = 0; int endOfHeaderLine = 0; int i = 0; while(i<=lastSearchLine){ if(LineProcessingUtil.lineMatcher("^\\s?\\s?From\\:", arrayOfLines[i])){tempfromLine = i; endOfHeaderLine = tempfromLine + 1;} if((tempfromLine>0)&&(LineProcessingUtil.lineMatcher("^\\s?\\s?[a-zA-Z][a-z|A-Z|\\-|\\_]+\\:", arrayOfLines[i]))){ endOfHeaderLine = i + 1; } else if(tempfromLine>0) {break;} i++; } fromLine = tempfromLine; firstSearchLine = lastSearchLine - tail_lines + 1; if (firstSearchLine < endOfHeaderLine) {firstSearchLine = endOfHeaderLine;} if ((lastSearchLine<= firstSearchLine)||(lastSearchLine - firstSearchLine > (tail_lines +1))){ lastSearchLine = arraylength - 1; firstSearchLine = lastSearchLine - tail_lines +1; if (firstSearchLine<0) firstSearchLine =0; } return arrayOfLines; } } private void createSigModel(TextLabels labels) throws IOException{ //SigDetectorByLine byLine = new SigDetectorByLine(); SigDetectorByLine byLine = new SigDetectorByLine(); edu.cmu.minorthird.text.TextBase textBase = labels.getTextBase(); Dataset dataset = new BasicDataset(); for (edu.cmu.minorthird.text.Span.Looper it = textBase.documentSpanIterator(); it.hasNext();) { ClassLabel myLabel = new ClassLabel(); edu.cmu.minorthird.text.Span span = it.nextSpan(); String spanString = span.asString(); //parse the message into a set of features MutableInstance myInst = byLine.getInstance(spanString); //get the labels if (labels.hasType(span, "sig")){ myLabel = ClassLabel.binaryLabel(+1); } else{ myLabel = ClassLabel.binaryLabel(-1); } //build the dataset Example ex = new Example((Instance)myInst, myLabel); dataset.add(ex); } System.out.println("dataset size = " +dataset.size()); //just to compare with paper performance //ClassifierLearner learner2 = new AdaBoost();new BatchVersion(new VotedPerceptron(), 5); //Splitter splitter = Expt.toSplitter("k5"); //Evaluation eval = Tester.evaluate(learner2, dataset, splitter); //ViewerFrame frame = new ViewerFrame("numeric demo", eval.toGUI()); //train and save the model String modelName = "mysigDetectionModel"; System.out.println("training the Model..."); ClassifierLearner learner = new BatchVersion(new VotedPerceptron(), 15); //ClassifierLearner learner = new AdaBoost(); Classifier cl = new DatasetClassifierTeacher(dataset).train(learner); System.out.println("saving model in file..."+modelName); IOUtil.saveSerialized((Serializable)cl, new File(modelName)); return; } //--------------------- Main method / Test routine ----------------------------------------------------- static public void main(String[] args) { try { //Usage check if (args.length < 1) { usage(); return; } //parsing inputs boolean create = false; String opt = args[0]; if ((opt.startsWith("-create"))||(opt.startsWith("create"))) { create = true; } if(create){ //create model mode TextLabels labels = FancyLoader.loadTextLabels(args[1]);//parse the .bsh file SigFileDetector det = new SigFileDetector(); det.createSigModel(labels); } else{ //detect mode SigFileDetector det = new SigFileDetector(); for(int i=0; i< args.length; i++){ String wholeMessage = LineProcessingUtil.readFile(args[i]); boolean isSig = det.hasSig(wholeMessage); if(isSig){ System.out.println(args[i]+" has Signature"); } else{ System.out.println(args[i]+" has NOT Signature"); } } } } catch (Exception e) { e.printStackTrace(); } } private static void usage(){ System.out.println("Usage: SigFileDetector filename1 filename2 ..."); System.out.println(" OR..."); System.out.println("SigFileDetector -create yourfile.bsh"); /* *in .env file, annotations follow: * * *addToType filename1 0 -1 sig *addToType filename2 0 -1 sig *addToType filename3 0 -1 notsig * */ } }