package email;
import org.apache.log4j.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.ArrayList;
import java.util.List;
import java.io.*;
import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.util.*;
import edu.cmu.minorthird.text.*;
import edu.cmu.minorthird.classify.algorithms.trees.*;
import edu.cmu.minorthird.classify.algorithms.linear.*;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.util.LineProcessingUtil;
//just for comparison with paper results
import edu.cmu.minorthird.util.gui.ViewerFrame;
import edu.cmu.minorthird.classify.experiments.Expt;
import edu.cmu.minorthird.classify.ClassifierLearner;
import edu.cmu.minorthird.classify.experiments.Tester;
import edu.cmu.minorthird.classify.experiments.Evaluation;
import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.classify.algorithms.svm.*;
import edu.cmu.minorthird.classify.algorithms.trees.*;
/**
*
* Extracts the signature file lines email messages *
*
* It follows the description in "Learning to Extract Signature and Reply
* Lines from Email",
* V.R.Carvalho and W.W.Cohen, CEAS (Conference of Email and Anti-Span), 2004
* *
* @author vitor|AT|cs.cmu.edu
* May 2004
*
* OBS: this implementation assumes the incoming message has a sig file.
*
*/
public class SigFilePredictor
{
private final static String tag = "sig";
private BinaryClassifier model;
private static Logger log = Logger.getLogger(SigFilePredictor.class);
// serialization stuff
static public final long serialVersionUID = 1;
public final int CURRENT_VERSION_NUMBER = 1;
//--------------------- Constructors -----------------------------------------------------
public SigFilePredictor(){
try{
//File file = new File("/afs/cs.cmu.edu/user/vitor/VPsigPredictionModel");
File file = new File("apps/email/models/VPsigPredictionModel");
//File file = new File("apps/email/models/sigModel");
model = (BinaryClassifier)IOUtil.loadSerialized(file);
}
catch (Exception e){
e.printStackTrace();
}
}
public SigFilePredictor(File file){
try{
model = (BinaryClassifier)IOUtil.loadSerialized(file);
}
catch (Exception e){
e.printStackTrace();
}
}
//--------------------- Methods -----------------------------------------------------
/**
* in case you decide you need a better model on the fly -
* make sure to change the instance representation accordingly
*
* @param string with the filename of the model(classifier)
*/
public void setClassifier(String newClassifier)
{
try{
File file = new File(newClassifier);
model = (BinaryClassifier)IOUtil.loadSerialized(file);
}
catch (Exception e){
e.printStackTrace();
}
}
/**
* Predicts the sig file lines in the email message.
* @param email message as String
* @return ArrayList with instances (set of features)
*/
public ArrayList Predict(String wholeMessage){
WindowRepresentation windowRep = new WindowRepresentation(wholeMessage);
MutableInstance[] ins = windowRep.getInstances();
ArrayList finalList = windowRep.ClassifyInstances(model, tag);
return finalList;
}
/**
* Detects if there is a sig in the email message AND
* predicts (extracts) the signature lines
*.
* @param email message as String
* @return ArrayList with instances (set of features)g
*/
public ArrayList DetectAndPredict (String wholeMessage){
SigFileDetector det = new SigFileDetector();
boolean hasSig = det.hasSig(wholeMessage);
if(!hasSig) {return null;}
ArrayList temp = Predict(wholeMessage);
return temp;
}
//--------------------- Inner Class -----------------------------------------------------
/**
* Inner class to represent the message as a sequence of
* features - using window features (neighbor lines)
*
*/
public static class WindowRepresentation
{
private String wholeMessage;
private String[] arrayOfLines;
private MutableInstance[] instanceArray;
private int[] firstCharIndex;
private final int tail_lines = 10;
private final int Th = 0;
public WindowRepresentation(String message){
wholeMessage = message;
String[] temp_arrayOfLines = LineProcessingUtil.getMessageLines(message);
createArrayOfLines(temp_arrayOfLines);
}
public void createArrayOfLines(String[] temp_arrayOfLines){
arrayOfLines = temp_arrayOfLines;
int arraysize = temp_arrayOfLines.length;
instanceArray = new MutableInstance[arraysize];
firstCharIndex = new int[arraysize];
firstCharIndex[0] = 0;
for (int i=1;i<arraysize;i++){
firstCharIndex[i] = firstCharIndex[i-1] + arrayOfLines[i-1].length() + 1;
}
}
public MutableInstance[] getInstances()
{
MutableInstance[] inst = processMailFile(arrayOfLines);
return inst;
}
public int[] getFirstCharIndex()
{
return firstCharIndex;
}
public String getWholeMessage()
{
return wholeMessage;
}
public String[] getArrayOfLines(){
return arrayOfLines;
}
public ArrayList ClassifyInstances(BinaryClassifier model, String tag)
{
ArrayList bemlocal = new ArrayList();
int charBegin;
for(int i=0; i<instanceArray.length;i++){
boolean decision = (model.score(instanceArray[i])<Th)? false:true;
if(decision){
//System.out.println(" POSITIVE = " +i);
//System.out.println(instanceArray[i].toString());
//System.out.println(arrayOfLines[i]);
log.debug(arrayOfLines[i]);
charBegin = wholeMessage.indexOf(arrayOfLines[i], firstCharIndex[i]-1);
if(charBegin<0) charBegin = firstCharIndex[i]; //just in case
bemlocal.add(new CharAnnotation(charBegin,arrayOfLines[i].length()+1, tag));
}
}
log.debug("\n\n");
return bemlocal;
}
/**
* @param email message as a String[] of lines
* @return same msg represented as an MutableInstance[]
*
* DON'T change any of these feature functions - in case you do,
* you'll need a new classifier (model) trained on the new feature set.
*
*/
private MutableInstance[] processMailFile(String[] arrayOfLines)
{
int fromLine = findFromLine(arrayOfLines);
int size = arrayOfLines.length;
for(int i=0; i<size; i++){
instanceArray[i]=new MutableInstance();
//list of features:
//check first line feature
if(i==0){instanceArray[i].addBinary(new Feature("firstL"));}
if(i==1){instanceArray[i].addBinary(new Feature("secondL"));}
//check last line feature
if(i==size-1){instanceArray[i].addBinary(new Feature("lastL"));}
if(i==size-2){instanceArray[i].addBinary(new Feature("lastbutoneL"));}
if(i==size-3){instanceArray[i].addBinary(new Feature("lastbutbutoneL"));}
//header feature
if (LineProcessingUtil.lineMatcher("^\\s?\\s?[\\w|\\-]+\\:", arrayOfLines[i]))
{
if(!LineProcessingUtil.lineMatcher("^\\s?\\s?(http|HTTP|Phone|PHONE|phone|email|EMAIL|Internet|INTERNET|internet)+\\:", arrayOfLines[i])){
instanceArray[i].addBinary(new Feature("header"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeheader"));
}
}
//blank line features
if(i>1){
if(LineProcessingUtil.lineMatcher("^[\\s|\\t]*$", arrayOfLines[i-2])){
instanceArray[i].addBinary( new Feature("prevprevblankL"));
if((size - i)<tail_lines+2) instanceArray[i].addBinary( new Feature("closeprevprevblankL"));
}
}
if(i>0){
if(LineProcessingUtil.lineMatcher("^[\\s|\\t]*$", arrayOfLines[i-1])){
instanceArray[i].addBinary( new Feature("prevblankL"));
if((size - i)<tail_lines+1) instanceArray[i].addBinary( new Feature("closeprevblankL"));
}
}
if(LineProcessingUtil.lineMatcher("^[\\s|\\t]*$", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("blankL"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeblankL"));
}
else{
instanceArray[i].addBinary( new Feature("notblankL"));
}
if(i< size -1){
if(LineProcessingUtil.lineMatcher("^[\\s|\\t]*$", arrayOfLines[i+1])){
instanceArray[i].addBinary( new Feature("nextblankL"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextblankL"));
}
}
if(i< size -2){
if(LineProcessingUtil.lineMatcher("^[\\s|\\t]*$", arrayOfLines[i+2])){
instanceArray[i].addBinary( new Feature("nextnextblankL"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextnextblankL"));
}
}
//sig marker feature
if(i>0){
if(LineProcessingUtil.lineMatcher("^[\\s]*---*[\\s]*$", arrayOfLines[i-1])){
instanceArray[i].addBinary( new Feature("prevsigMarker"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevsigMarker"));
}
}
if(LineProcessingUtil.lineMatcher("^[\\s]*---*[\\s]*$", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("sigMarker"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closesigMarker"));
}
if(i< size -1){
if(LineProcessingUtil.lineMatcher("^[\\s]*---*[\\s]*$", arrayOfLines[i+1]))
instanceArray[i].addBinary( new Feature("nextsigMarker"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextsigMarker"));
}
//trueSigMarker - post-addition
if(i>3){
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i-4])){
instanceArray[i].addBinary( new Feature("prevprevprevprevtruesigMarker"));
if((size - i)<tail_lines+4) instanceArray[i].addBinary( new Feature("prevprevprevprevclosetruesigMarker"));
}
}
if(i>2){
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i-3])){
instanceArray[i].addBinary( new Feature("prevprevprevtruesigMarker"));
if((size - i)<tail_lines+3) instanceArray[i].addBinary( new Feature("prevprevprevclosetruesigMarker"));
}
}
if(i>1){
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i-2])){
instanceArray[i].addBinary( new Feature("prevprevtruesigMarker"));
if((size - i)<tail_lines+2) instanceArray[i].addBinary( new Feature("prevprevclosetruesigMarker"));
}
}
if(i>0) {
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i-1])){
instanceArray[i].addBinary( new Feature("prevtruesigMarker"));
if((size - i)<tail_lines+1) instanceArray[i].addBinary( new Feature("prevclosetruesigMarker"));
}
}
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("truesigMarker"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closetruesigMarker"));
}
if(i< size -1) {
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i+1])){
instanceArray[i].addBinary( new Feature("nexttruesigMarker"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("nextclosetruesigMarker"));
}
}
if(i< size -2) {
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i+2])){
instanceArray[i].addBinary( new Feature("nextnexttruesigMarker"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("nextnextclosetruesigMarker"));
}
}
if(i< size -3) {
if (LineProcessingUtil.lineMatcher("^[\\s]?[\\s]?---?[\\s]*$", arrayOfLines[i+3])){
instanceArray[i].addBinary( new Feature("nextnextnexttruesigMarker"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("nextnextnextclosetruesigMarker"));
}
}
//other markers features
if(i>0){
if(LineProcessingUtil.lineMatcher("^[\\s]*([\\*]|#|[\\+]|[\\^]|-|[\\~]|[\\&]|[////]|[\\$]|_|[\\!]|[\\/]|[\\%]|[\\:]|[\\=]){10,}[\\s]*$", arrayOfLines[i-1])){
instanceArray[i].addBinary( new Feature("prevotherMarkers"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevotherMarkers"));
}
}
if(LineProcessingUtil.lineMatcher("^[\\s]*([\\*]|#|[\\+]|[\\^]|-|[\\~]|[\\&]|[////]|[\\$]|_|[\\!]|[\\/]|[\\%]|[\\:]|[\\=]){10,}[\\s]*$", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("otherMarkers"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeotherMarkers"));
}
if(i< size -1){
if(LineProcessingUtil.lineMatcher("^[\\s]*([\\*]|#|[\\+]|[\\^]|-|[\\~]|[\\&]|[////]|[\\$]|_|[\\!]|[\\/]|[\\%]|[\\:]|[\\=]){10,}[\\s]*$", arrayOfLines[i+1])){
instanceArray[i].addBinary( new Feature("nextotherMarkers"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextotherMarkers"));
}
}
//special works feature
if(i>0){
if(LineProcessingUtil.lineMatcher("Dept\\.|University|Corp\\.|Corporations?|College|Ave\\.|Laboratory|[D|d]isclaimer|Division|Professor|Laboratories|Institutes?|Services|Engineering|Director|Sciences?|Address|Fax|Office|Mobile|Phone|Manager|Street|St\\.|Avenue", arrayOfLines[i-1])){
instanceArray[i].addBinary( new Feature("prevspecWords"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevspecWords"));
}
}
if(LineProcessingUtil.lineMatcher("Dept\\.|University|Corp\\.|Corporations?|College|Ave\\.|Laboratory|[D|d]isclaimer|Division|Professor|Laboratories|Institutes?|Services|Engineering|Director|Sciences?|Address|Fax|Office|Mobile|Phone|Manager|Street|St\\.|Avenue", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("specWords"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closespecWords"));
}
if(i< size -1){
if(LineProcessingUtil.lineMatcher("Dept\\.|University|Corp\\.|Corporations?|College|Ave\\.|Laboratory|[D|d]isclaimer|Division|Professor|Laboratories|Institutes?|Services|Engineering|Director|Sciences?|Address|Fax|Office|Mobile|Phone|Manager|Street|St\\.|Avenue", arrayOfLines[i+1])){
instanceArray[i].addBinary( new Feature("nextspecWords"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextspecWords"));
}
}
//email feature
if(i>0){
if(LineProcessingUtil.lineMatcher("[^(\\<|\\>)][\\w|\\+|\\.|\\_|\\-]+\\@[\\w|\\-|\\_|\\.]+\\.[a-zA-z]{2,5}[^(\\<|\\>)]", arrayOfLines[i-1])){
instanceArray[i].addBinary( new Feature("prevemail"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevemail"));
}
}
if(LineProcessingUtil.lineMatcher("[^(\\<|\\>)][\\w|\\+|\\.|\\_|\\-]+\\@[\\w|\\-|\\_|\\.]+\\.[a-zA-z]{2,5}[^(\\<|\\>)]", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("email"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeemail"));
}
if(i< size -1){
if(LineProcessingUtil.lineMatcher("[^(\\<|\\>)][\\w|\\+|\\.|\\_|\\-]+\\@[\\w|\\-|\\_|\\.]+\\.[a-zA-z]{2,5}[^(\\<|\\>)]", arrayOfLines[i+1])){
instanceArray[i].addBinary( new Feature("nextemail"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextemail"));
}
}
if(LineProcessingUtil.lineMatcher("[^(\\<|\\>)][(\\w|\\+|\\_|\\-)]+\\@[(\\w|\\-|\\_)]+[\\.][a-zA-z]{2,5}", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("emailB"));//short emails
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeemailB"));
}
//URL feature
if(i>0){
if(LineProcessingUtil.lineMatcher("[\\s](http\\:\\/\\/)*(www|web|w3)*(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)*[\\w]+", arrayOfLines[i-1])){
instanceArray[i].addBinary( new Feature("prevurl"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevurl"));
}
}
if(LineProcessingUtil.lineMatcher("[\\s](http\\:\\/\\/)*(www|web|w3)*(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)*[\\w]+", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("url"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeurl"));
}
if(i< size -1){
if(LineProcessingUtil.lineMatcher("[\\s](http\\:\\/\\/)*(www|web|w3)*(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)\\.(\\w[\\w|\\-]+)*[\\w]+", arrayOfLines[i+1])){
instanceArray[i].addBinary( new Feature("nexturl"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("nextprevurl"));
}
}
//phone
if(i>0){
if(LineProcessingUtil.lineMatcher("(\\-?\\d)*\\d\\d\\s?\\-?\\s?\\d\\d\\d\\d", arrayOfLines[i-1]))
instanceArray[i].addBinary( new Feature("prevphone"));
}
if(LineProcessingUtil.lineMatcher("(\\-?\\d)*\\d\\d\\s?\\-?\\s?\\d\\d\\d\\d", arrayOfLines[i]))
instanceArray[i].addBinary( new Feature("phone"));
if(i< size -1){
if(LineProcessingUtil.lineMatcher("(\\-?\\d)*\\d\\d\\s?\\-?\\s?\\d\\d\\d\\d", arrayOfLines[i+1]))
instanceArray[i].addBinary( new Feature("nextphone"));
}
//names like Vitor R. Carvalho or John F. Kennedy
if(LineProcessingUtil.lineMatcher("[A-Z][a-z]+\\s\\s?[A-Z][\\.]?\\s\\s?[A-Z][a-z]+", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("namepat"));
if((size - i)<tail_lines) instanceArray[i].addBinary(new Feature("closenamepat"));
}
//end-of-line quotes
if(LineProcessingUtil.lineMatcher("\"$", arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("endQuote"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeendQuote"));
}
//FROM line feature
if (fromLine > 0){
if(SigFilePredictor.detectFromName(arrayOfLines[fromLine], arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("fromL"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closefromL"));
}
if(i< size -1){
if(SigFilePredictor.detectFromName(arrayOfLines[fromLine], arrayOfLines[i])){
instanceArray[i].addBinary( new Feature("nextfromL"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextfromL"));
}
}
}
//reply symbol
if(i>0){
if(LineProcessingUtil.lineMatcher("^\\>.*", arrayOfLines[i-1]))
instanceArray[i].addBinary( new Feature("prevreplySymbol"));
}
if(LineProcessingUtil.lineMatcher("^\\>.*", arrayOfLines[i]))
instanceArray[i].addBinary( new Feature("replySymbol"));
if(i< size -1){
if(LineProcessingUtil.lineMatcher("^\\>.*", arrayOfLines[i+1]))
instanceArray[i].addBinary( new Feature("nextreplySymbol"));
}
//other reply symbol
if(i>0){
if(LineProcessingUtil.lineMatcher("^[\\=|\\:|\\#|\\:|\\-|\\+|\\&|\\%|\\}]\\s*\\w+.*", arrayOfLines[i-1]))
instanceArray[i].addBinary(new Feature("prevotherreplySymbol"));
}
if(LineProcessingUtil.lineMatcher("^[\\=|\\:|\\#|\\:|\\-|\\+|\\&|\\%|\\}]\\s*\\w+.*", arrayOfLines[i]))
instanceArray[i].addBinary(new Feature("otherreplySymbol"));
if(i< size -1){
if(LineProcessingUtil.lineMatcher("^[\\=|\\:|\\#|\\:|\\-|\\+|\\&|\\%|\\}]\\s*\\w+.*", arrayOfLines[i+1]))
instanceArray[i].addBinary(new Feature("nextotherreplySymbol"));
}
//punct starting and followed by ">"
if(i>0){
if(LineProcessingUtil.lineMatcher("^\\p{Punct}{1,2}\\>.*", arrayOfLines[i-1]))
instanceArray[i].addBinary(new Feature("prevpunct"));
}
if(LineProcessingUtil.lineMatcher("^\\p{Punct}{1,2}\\>.*", arrayOfLines[i]))
instanceArray[i].addBinary(new Feature("punct"));
if(i< size -1){
if(LineProcessingUtil.lineMatcher("^\\p{Punct}{1,2}\\>.*", arrayOfLines[i+1]))
instanceArray[i].addBinary(new Feature("nextpunct"));
}
//writes and wrote features
if(i>0){
if(LineProcessingUtil.lineMatcher(" writes:$", arrayOfLines[i-1]))
instanceArray[i].addBinary(new Feature("prevwrites"));
if(LineProcessingUtil.lineMatcher(" wrote:$", arrayOfLines[i-1]))
instanceArray[i].addBinary(new Feature("prevwrote"));
}
if(LineProcessingUtil.lineMatcher(" writes:$", arrayOfLines[i]))
instanceArray[i].addBinary(new Feature("writes"));
if(LineProcessingUtil.lineMatcher(" wrote:$", arrayOfLines[i]))
instanceArray[i].addBinary(new Feature("wrote"));
if(i< size -1){
if(LineProcessingUtil.lineMatcher(" writes:$", arrayOfLines[i+1]))
instanceArray[i].addBinary(new Feature("nextwrites"));
if(LineProcessingUtil.lineMatcher(" wrote:$", arrayOfLines[i+1]))
instanceArray[i].addBinary(new Feature("nextwrote"));
}
//same initial punct characters
if((i>0)&&(LineProcessingUtil.startWithSameInitialPunctCharacters(arrayOfLines[i], arrayOfLines[i-1]))){
instanceArray[i].addBinary(new Feature("prevsicline"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevsicline"));
}
if((i<size-1)&&(LineProcessingUtil.startWithSameInitialPunctCharacters(arrayOfLines[i], arrayOfLines[i+1]))){
instanceArray[i].addBinary( new Feature("nextsicline"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextsicline"));
}
//number of leading tabs
int ddd = LineProcessingUtil.indentNumber(arrayOfLines[i]);
if(ddd==1){instanceArray[i].addBinary( new Feature("indentUni"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeindentUni"));}
if(ddd==2){instanceArray[i].addBinary( new Feature("indentBi"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeindentBi"));}
if(ddd>=3){instanceArray[i].addBinary( new Feature("indentTri"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeindentTri"));}
if(i>0){
ddd = LineProcessingUtil.indentNumber(arrayOfLines[i-1]);
if(ddd==1){instanceArray[i].addBinary( new Feature("previndentUni"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevindentUni"));}
if(ddd==2){instanceArray[i].addBinary( new Feature("previndentBi"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevindentBi"));}
if(ddd>=3){instanceArray[i].addBinary( new Feature("previndentTri"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closeprevindentTri"));}
}
if(i<size-1){
ddd = LineProcessingUtil.indentNumber(arrayOfLines[i+1]);
if(ddd==1){instanceArray[i].addBinary( new Feature("nextindentUni"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextindentUni"));}
if(ddd==2){instanceArray[i].addBinary( new Feature("nextindentBi"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextindentBi"));}
if(ddd>=3){instanceArray[i].addBinary( new Feature("nextindentTri"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closenextindentTri"));}
}
//punctuation percentage
double temp = LineProcessingUtil.punctuationPercentage(arrayOfLines[i]);
if (temp>0.20){
instanceArray[i].addBinary( new Feature("punctPerc20"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc20"));
}
else {
instanceArray[i].addBinary( new Feature("punctPerc0"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc20"));
}
if (temp>0.50){
instanceArray[i].addBinary( new Feature("punctPerc50"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc50"));
}
if (temp>0.75){
instanceArray[i].addBinary( new Feature("punctPerc75"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc75"));
}
if (temp>0.90){
instanceArray[i].addBinary( new Feature("punctPerc90"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc90"));
}
if (i>0){
temp = LineProcessingUtil.punctuationPercentage(arrayOfLines[i-1]);
if (temp>0.20){
instanceArray[i].addBinary( new Feature("punctPerc20prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc20prev"));
}
if (temp>0.50){
instanceArray[i].addBinary( new Feature("punctPerc50prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc50prev"));
}
if (temp>0.75){
instanceArray[i].addBinary( new Feature("punctPerc75prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc75prev"));
}
if (temp>0.90){
instanceArray[i].addBinary( new Feature("punctPerc90prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc90prev"));
}
}
if (i<size-1){
temp = LineProcessingUtil.punctuationPercentage(arrayOfLines[i+1]);
if (temp>0.20){
instanceArray[i].addBinary( new Feature("punctPerc20next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc20next"));
}
if (temp>0.50){
instanceArray[i].addBinary( new Feature("punctPerc50next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc50next"));
}
if (temp>0.75){
instanceArray[i].addBinary( new Feature("punctPerc75next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc75next"));
}
if (temp>0.90){
instanceArray[i].addBinary( new Feature("punctPerc90next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closepunctPerc90next"));
}
}
//word characters percentage
temp = LineProcessingUtil.wordCharactersPercentage(arrayOfLines[i]);
if (temp<0.10){
instanceArray[i].addBinary( new Feature("charPerc10"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc10"));
}
if (temp<0.30){
instanceArray[i].addBinary( new Feature("charPerc30"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc30"));
}
if (temp<0.60){
instanceArray[i].addBinary( new Feature("charPerc60"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc60"));
}
if (temp<0.90){
instanceArray[i].addBinary( new Feature("charPerc90"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc90"));
}
if (i>0){
temp = LineProcessingUtil.wordCharactersPercentage(arrayOfLines[i-1]);
if (temp<0.10){
instanceArray[i].addBinary( new Feature("charPerc10prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc10prev"));
}
if (temp<0.30){
instanceArray[i].addBinary( new Feature("charPerc30prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc30prev"));
}
if (temp<0.60){
instanceArray[i].addBinary( new Feature("charPerc60prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc60prev"));
}
if (temp<0.90){
instanceArray[i].addBinary( new Feature("charPerc90prev"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc90prev"));
}
}
if (i<size-1){
temp = LineProcessingUtil.wordCharactersPercentage(arrayOfLines[i+1]);
if (temp<0.10){
instanceArray[i].addBinary( new Feature("charPerc10next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc10next"));
}
if (temp<0.30){
instanceArray[i].addBinary( new Feature("charPerc30next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc30next"));
}
if (temp<0.60){
instanceArray[i].addBinary( new Feature("charPerc60next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc60next"));
}
if (temp<0.90){
instanceArray[i].addBinary( new Feature("charPerc90next"));
if((size - i)<tail_lines) instanceArray[i].addBinary( new Feature("closecharPerc90next"));
}
}
}//end of for
return instanceArray;
}
//find the line where the from: information is
private int findFromLine(String[] arrayOfLines)
{
int fromLine = -1;
int size = arrayOfLines.length;
for (int i = 0; i<size-1; i++){
if(LineProcessingUtil.lineMatcher("^\\s?\\s?\\p{Punct}?[F|f][R|r][O|o][M|m]\\:", arrayOfLines[i])){
fromLine = i;
break;
}
}
return fromLine;
}
}
//--------------------- Main method / Test routines / Support methods -----------------------------------------------------
/**
* From Line feature function: extracts a "name" from the fromLine of
* an email message and attempts to match any of its components with
* the words in the target line
* In other words, if a piece of the sender's name is detected in this line, it
* returns true. False, otherwise.
*
* @param fromLine in String format
* @param testLine in String format
* @return true, if any part of the sender's name is found.
*
*/
public static boolean detectFromName(String tmp, String testLine)
{
String inputStr = tmp;
//try first pattern name first (Vitor R. Carvalho)
String patternStr = "([A-Z][a-z]+\\s\\s?[A-Z]?[\\.]?\\s\\s?([A-Z][a-z]+))";
// Compile and use regular expression
Pattern mypattern = Pattern.compile(patternStr);
Matcher matcher = mypattern.matcher(inputStr);
boolean matchFound = matcher.find();
if (matchFound)
{
int groupsize = matcher.groupCount() + 1;
String[] groupStr = new String[groupsize];
for (int j = 0; j <= matcher.groupCount(); j++)
{
groupStr[j] = matcher.group(j);
if (LineProcessingUtil.lineMatcher(groupStr[j], testLine))
{
return true;
}
}
}
else
{
//try another string pattern (Vitor Carvalho)
patternStr = "([A-Z][a-z]+\\s\\s?([A-Z][a-z]+))";
// Compile and use regular expression
Pattern myPattern = Pattern.compile(patternStr);
Matcher matcher2 = myPattern.matcher(inputStr);
boolean newMatchFound = matcher2.find();
if (newMatchFound)
{
int groupsize = matcher.groupCount() + 1;
String[] groupStr = new String[groupsize];
for (int j = 0; j <= matcher2.groupCount(); j++)
{
groupStr[j] = matcher2.group(j);
if (LineProcessingUtil.lineMatcher(groupStr[j], testLine))
{
return true;
}
}
}
}
//in case nothing was found
return false;
}
static public void createModel(String[] args, String linetag) throws IOException{
String modelName = linetag+"Model";
Dataset dataset = new BasicDataset();
//starts from position 1
//args is teh file array, but the first position is not used here
for(int j=1; j< args.length; j++){
//parse the message
String message = LineProcessingUtil.readFile(args[j]);
String[] strOfLines = LineProcessingUtil.getMessageLines(message);
ClassLabel[] linelabel = new ClassLabel[strOfLines.length];
for(int i=0; i<strOfLines.length; i++){
if(strOfLines[i].startsWith("#sig# ")){
strOfLines[i] = strOfLines[i].substring(6);
if(linetag.compareTo("sig")==0) linelabel[i] = ClassLabel.binaryLabel(+1);
else linelabel[i] = ClassLabel.binaryLabel(-1);
}
else if(strOfLines[i].startsWith("#reply#")){
strOfLines[i] = strOfLines[i].substring(7);
if(linetag.compareTo("reply")==0) linelabel[i] = ClassLabel.binaryLabel(+1);
else linelabel[i] = ClassLabel.binaryLabel(-1);
}
else{
linelabel[i] = ClassLabel.binaryLabel(-1);
}
}
WindowRepresentation windowRep = new SigFilePredictor.WindowRepresentation(message);
windowRep.createArrayOfLines(strOfLines);//to exclude #sig# from feature extraction
MutableInstance[] ins = windowRep.getInstances();
for(int i=0; i<strOfLines.length; i++){
Example example = new Example((Instance)ins[i], linelabel[i]);
dataset.add(example);
}
}
System.out.println("dataset size = " +dataset.size());
//just to compare with paper performance
//ClassifierLearner learner2 = new AdaBoost();
//new BatchVersion(new VotedPerceptron(), 15);
//Splitter splitter = Expt.toSplitter("k5");
//Evaluation eval = Tester.evaluate(learner2, dataset, splitter);
//ViewerFrame frame = new ViewerFrame("numeric demo", eval.toGUI());
System.out.println("training the Model...");
ClassifierLearner learner = new BatchVersion(new VotedPerceptron(), 15);
Classifier cl = new DatasetClassifierTeacher(dataset).train(learner);
System.out.println("saving model in file..."+modelName);
IOUtil.saveSerialized((Serializable)cl, new File(modelName));
return;
}
static public void main(String[] args)
{
try {
//Usage check
if (args.length < 1)
{
usage();
return;
}
//parsing inputs
boolean create = false;
String opt = args[0];
if ((opt.startsWith("-create"))||(opt.startsWith("create"))) {
create = true;
}
if(create){ //creates a model
SigFilePredictor.createModel(args, "sig");
}
else{ //prediction mode
System.out.println("For details, set the verbosity level in config/log4j.properties\n");
//SigFilePredictor pred = new SigFilePredictor(new File("/afs/cs.cmu.edu/user/vitor/VPsigPredictionModel");
SigFilePredictor pred = new SigFilePredictor();
for(int i=0; i< args.length; i++){
System.out.println(args[i]);
String wholeMessage = LineProcessingUtil.readFile(args[i]);
ArrayList onelist = pred.Predict(wholeMessage);
// System.out.print(onelist.toString());
}
}
} catch (Exception e) {
usage();
e.printStackTrace();
}
}
private static void usage()
{
System.out.println("usage: SigFilePredictor filename1 filename2 ...");
System.out.println("OR");
System.out.println("usage: SigFilePredictor -create filename1 filename2 ...");
System.out.println("PS: to create, use \"Signature and Reply Dataset\" annotation stile as in www.cs.cmu.edu/~vitor/codeAndData.html");
}
}