package email;
import edu.cmu.minorthird.text.CharAnnotation;
import edu.cmu.minorthird.text.StringAnnotator;
import edu.cmu.minorthird.util.*;
import java.io.*;
import edu.cmu.minorthird.classify.Example;
import edu.cmu.minorthird.classify.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
/*
* Extracts the reply lines from email messages.
*
* It follows the description in "Learning to Extract Signature and Reply
* Lines from Email",
* V.R.Carvalho and W.W.Cohen, CEAS (Conference on Email and Anti-Spam), 2004
*
* @author Vitor R. Carvalho
* May 2004
*/
/**
 * Marks the lines of an email message that a serialized binary classifier
 * scores as reply lines.
 *
 * The classifier is loaded in the constructor. A load failure does not abort
 * construction; it is logged and remembered, and reported as an
 * IllegalStateException the first time a message line has to be classified.
 */
public class ReplyToAnnotator extends StringAnnotator
{
    /** Model file used by the no-argument constructor. */
    private static final String DEFAULT_MODEL_PATH = "apps/email/models/VPreplyModel";//vp15

    private BinaryClassifier model;
    /** Why the model could not be loaded, or null if loading succeeded. */
    private transient Exception modelLoadFailure;
    /** Decision threshold: a line is a reply line unless its score is below this value. */
    private static final int Th = 0;
    private static Logger log = Logger.getLogger(ReplyToAnnotator.class);

    // serialization stuff
    static private final long serialVersionUID = 1;
    private final int CURRENT_VERSION_NUMBER = 1;

    //--------------------- Constructors -----------------------------------------------------

    /**
     * Creates an annotator backed by the model stored at
     * "apps/email/models/VPreplyModel".
     */
    public ReplyToAnnotator(){
        this(new File(DEFAULT_MODEL_PATH));
    }

    /**
     * Creates an annotator backed by the serialized classifier in the given file.
     * Loading problems are logged rather than thrown; see the class comment.
     *
     * @param file file holding the serialized BinaryClassifier
     */
    public ReplyToAnnotator(File file){
        try{
            model = (BinaryClassifier)IOUtil.loadSerialized(file);
        }
        catch (Exception e){
            // Keep construction best-effort, but remember the cause so the later
            // failure can explain itself instead of surfacing as a bare NullPointerException.
            modelLoadFailure = e;
            log.error("could not load reply model from " + file, e);
        }
    }

    //--------------------- Methods -----------------------------------------------------

    /**
     * Produces one "reply" annotation per line classified as a reply line.
     *
     * @param spanString the message text
     * @return the annotations, or null when no line was classified as a reply line
     */
    protected CharAnnotation[] annotateString(String spanString)
    {
        ArrayList list = this.Predict(spanString);
        if(list.isEmpty()){
            return null;
        }
        return (CharAnnotation[])list.toArray(new CharAnnotation[list.size()]);
    }

    /**
     * Returns the lines of the message that were NOT classified as reply lines,
     * each followed by "\n".
     *
     * @param doc the message text
     * @return the non-reply lines, concatenated
     */
    public String deleteReplyLinesFromMsg(String doc){
        SigFilePredictor.WindowRepresentation windowRep = new SigFilePredictor.WindowRepresentation(doc);
        StringBuffer notreplybuff = new StringBuffer();
        ClassifyInstances(windowRep, "reply", notreplybuff, null);
        return notreplybuff.toString();
    }

    /**
     * Returns the lines of the message that were classified as reply lines,
     * each followed by "\n".
     *
     * @param doc the message text
     * @return the reply lines, concatenated
     */
    public String getMsgReplyLines(String doc){
        SigFilePredictor.WindowRepresentation windowRep = new SigFilePredictor.WindowRepresentation(doc);
        StringBuffer replybuff = new StringBuffer();
        ClassifyInstances(windowRep, "reply", null, replybuff);
        return replybuff.toString();
    }

    /**
     * Explanations are not supported; a fixed placeholder string is returned.
     */
    public String explainAnnotation(edu.cmu.minorthird.text.TextLabels labels, edu.cmu.minorthird.text.Span documentSpan)
    {
        return "reply-to extraction - not implemented yet!";
    }

    /**
     * Classifies every line of the message as reply or non-reply.
     *
     * @param msg email message in string representation
     * @return an ArrayList holding one CharAnnotation per reply line
     */
    private ArrayList Predict(String msg)
    {
        SigFilePredictor.WindowRepresentation windowRep = new SigFilePredictor.WindowRepresentation(msg);
        return ClassifyInstances(windowRep, "reply", null, null);
    }

    /**
     * Scores each instance of the window representation with the model and
     * sorts the corresponding lines into reply and non-reply lines.
     *
     * @param windowRep window representation of the message (inner class of SigFilePredictor)
     * @param tag annotation type given to reply lines ("reply")
     * @param notreply if non-null, receives every non-reply line followed by "\n"
     * @param reply if non-null, receives every reply line followed by "\n"
     * @return an ArrayList holding one CharAnnotation per reply line
     * @throws IllegalStateException if there are lines to classify but the model was never loaded
     */
    private ArrayList ClassifyInstances(SigFilePredictor.WindowRepresentation windowRep, String tag, StringBuffer notreply, StringBuffer reply)
    {
        ArrayList bemlocal = new ArrayList();
        MutableInstance[] ins = windowRep.getInstances();
        int[] firstCharIndex = windowRep.getFirstCharIndex();
        String[] arrayOfLines = windowRep.getArrayOfLines();
        String wholeMessage = windowRep.getWholeMessage();

        if(model == null && ins.length > 0){
            throw new IllegalStateException("reply model is not loaded; cannot classify message lines", modelLoadFailure);
        }

        for(int i=0; i<ins.length; i++){
            // Written as a negated "<" on purpose: a NaN score counts as a reply line,
            // exactly as the original ternary did.
            boolean decision = !(model.score(ins[i]) < Th);
            if(decision){
                String line = arrayOfLines[i];
                if(reply != null){
                    reply.append(line).append("\n");
                }
                log.debug(line);

                // Locate the line in the message, starting one character before its recorded offset.
                int charBegin = wholeMessage.indexOf(line, firstCharIndex[i]-1);
                if(charBegin < 0){
                    charBegin = firstCharIndex[i]; //just in case
                }

                // The span covers the line plus the character that follows it
                // (presumably the line terminator). Only take that extra character when
                // it exists, so the last line of a message that does not end with a
                // terminator yields a span that stays inside the message.
                int spanLength = line.length();
                if(charBegin + spanLength < wholeMessage.length()){
                    spanLength++;
                }
                bemlocal.add(new CharAnnotation(charBegin, spanLength, tag));
            }
            else{
                if(notreply != null){
                    notreply.append(arrayOfLines[i]).append("\n");
                }
            }
        }
        log.debug("\n\n");
        return bemlocal;
    }

    //--------------------- main method/testing -----------------------------------------------------

    /**
     * Command-line entry point, for testing purposes.
     *
     * With a first argument starting with "-create" or "create", delegates to
     * SigFilePredictor.createModel. Otherwise treats every argument as a file
     * name and prints the reply lines found in each file.
     *
     * @param args command-line arguments, see usage()
     */
    public static void main(String[] args)
    {
        try {
            //Usage check
            if (args.length < 1)
            {
                usage();
                return;
            }

            //parsing inputs
            String opt = args[0];
            boolean create = opt.startsWith("-create") || opt.startsWith("create");

            if(create){ //creates a model
                SigFilePredictor.createModel(args, "reply");
            }
            else{ //prediction mode
                System.out.println("For details, set the verbosity level in config/log4j.properties\n");
                ReplyToAnnotator repto = new ReplyToAnnotator();
                for(int i=0; i< args.length; i++){
                    String message = LineProcessingUtil.readFile(args[i]);
                    // Each message is classified once; the reply lines are all that is printed.
                    String replyLines = repto.getMsgReplyLines(message);
                    System.out.println("\n######### Reply Lines of "+args[i]+" #######");
                    System.out.print(replyLines);
                }
            }
        } catch (Exception e) {
            usage();
            e.printStackTrace();
        }
    }

    /** Prints the command-line usage message to standard output. */
    private static void usage()
    {
        System.out.println("usage: ReplyToAnnotator filename1 filename2 ...");
        System.out.println("OR");
        System.out.println("usage: ReplyToAnnotator -create filename1 filename2 ...");
        System.out.println("to create, use \"Signature and Reply Dataset\" annotation stile as in www.cs.cmu.edu/~vitor/codeAndData.html");
    }
}