package email; /** * This class is a first attemp to classify email messages into "speech acts". * * It follows the description in * "Learning to Classify Email into "Speech Acts"", * V.R.Carvalho, W.W.Cohen, T. M. Mitchell ; EMNLP 2004 * * To use it, please: * 1- add minorthird/apps/email/class to your CLASSPATH * 2- from minorthird/apps/email directory, compile it using "ant build" * 3- from minorhthird/ directory, run it using "java -Xmx500m email.SpeechAct directoryName" * * The output will be something like: * C:\minorthird>java -Xmx500m email.SpeechAct dummy * textbase size = 8 * msgId_28975_fIRMID_N03F2_1997_09_15_00_39_57 (_____ _DLV_ _PROP_ _____ _____ ___________ _DLVCMT__ _____) * msgId_547_fIRMID_N03F2_1997_08_26_15_24_57 (_REQ_ _DLV_ _PROP_ _____ _____ _REQAMDPROP _________ _MEET_) * msgId_11137_fIRMID_N03F2_1997_09_04_23_56_57 (_REQ_ _DLV_ ______ _____ _____ _REQAMDPROP _DLVCMT_ ______) * * * Reminder: it currently uses all words inside the text file (bag of lower case * words model). If you want to use only the words surrounded by a <body> tag, * go to the main method, and please change the following lines: * * comment: * for (Span.Looper it = textBase.documentSpanIterator(); it.hasNext();){ * Span span = (Span)it.next(); * and uncomment: * for (Iterator it = labels.instanceIterator("body"); it.hasNext();) { * Span span = (Span)it.nextSpan(); * * * @author Vitor R. Carvalho * Created Jun 15, 2004 * */ import java.io.File; import java.io.IOException; import org.apache.log4j.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.ArrayList; import java.util.List; import java.util.*; import java.io.*; import edu.cmu.minorthird.classify.*; import edu.cmu.minorthird.util.*; import edu.cmu.minorthird.ui.CommandLineUtil; import edu.cmu.minorthird.ui.Recommended; import edu.cmu.minorthird.text.*; import edu.cmu.minorthird.text.gui.*; import edu.cmu.minorthird.classify.algorithms.trees.*; import edu.cmu.minorthird.classify.algorithms.linear.*; import edu.cmu.minorthird.classify.Example; import edu.cmu.minorthird.classify.*; import edu.cmu.minorthird.util.LineProcessingUtil; //just for comparison with paper results import edu.cmu.minorthird.util.gui.ViewerFrame; import edu.cmu.minorthird.classify.experiments.Expt; import edu.cmu.minorthird.classify.ClassifierLearner; import edu.cmu.minorthird.classify.experiments.Tester; import edu.cmu.minorthird.classify.experiments.Evaluation; import edu.cmu.minorthird.classify.*; import edu.cmu.minorthird.classify.algorithms.svm.*; import edu.cmu.minorthird.classify.algorithms.trees.*; import edu.cmu.minorthird.text.learn.SpanFeatureExtractor; public class SpeechAct { private BinaryClassifier req_model; private BinaryClassifier dlv_model; private BinaryClassifier cmt_model; private BinaryClassifier prop_model; private BinaryClassifier amd_model; private BinaryClassifier reqamdprop_model; private BinaryClassifier dlvcmt_model; private Classifier meet_model; private Classifier ddata_model; private static Logger log = Logger.getLogger(SpeechAct.class); // serialization stuff static public final long serialVersionUID = 1; public final int CURRENT_VERSION_NUMBER = 1; private SpanFeatureExtractor fe = edu.cmu.minorthird.text.learn.SampleFE.BAG_OF_LC_WORDS; public SpeechAct() { try { //all models below are based on LC_BOW only. File reqfile = new File("apps/email/models/Req_Model");//DT req_model = (BinaryClassifier) IOUtil.loadSerialized(reqfile); File dlvfile = new File("apps/email/models/Dlv_Model");//VP,batch15 dlv_model = (BinaryClassifier) IOUtil.loadSerialized(dlvfile); File propfile = new File("apps/email/models/Prop_Model");//VP,batch15 prop_model = (BinaryClassifier) IOUtil.loadSerialized(propfile); File cmtfile = new File("apps/email/models/Cmt_Model");//VP,batch15 cmt_model = (BinaryClassifier) IOUtil.loadSerialized(cmtfile); File amdfile = new File("apps/email/models/Amd_Model");//VP,batch15 amd_model = (BinaryClassifier) IOUtil.loadSerialized(amdfile); File reqamdpropfile = new File("apps/email/models/ReqAmdProp_Model");//DT reqamdprop_model = (BinaryClassifier) IOUtil.loadSerialized(reqamdpropfile); File dlvcmtfile = new File("apps/email/models/DlvCmt_Model");//VP,batch15 dlvcmt_model = (BinaryClassifier) IOUtil.loadSerialized(dlvcmtfile); File meetfile = new File("apps/email/models/meet_Model");//maxent meet_model = (Classifier) IOUtil.loadSerialized(meetfile); File ddatafile = new File("apps/email/models/ddata_Model");//vp15 ddata_model = (Classifier) IOUtil.loadSerialized(ddatafile); } catch (Exception e) { e.printStackTrace(); } } private ClassLabel classification(BinaryClassifier model, Instance mi) { return model.classification(mi); } private boolean bclassify(BinaryClassifier model, Instance mi) { double Th = 0; return (model.score(mi)>Th)? true:false; } private boolean bclassify(Classifier model, Instance mi) { return model.classification(mi).isPositive(); } //------------------------------------------ public Instance getInstance(String str){ TextLabels tl = new BasicTextLabels(str); return fe.extractInstance(tl, tl.getTextBase().documentSpan("nullId")); } public boolean isMeeting(String str) { return bclassify(meet_model, getInstance(str)); } public boolean isDData(String str) { return bclassify(ddata_model, getInstance(str)); } public boolean isRequest(String str) { return bclassify(req_model, getInstance(str)); } public boolean isDelivery(String str) { return bclassify(dlv_model, getInstance(str)); } public boolean isCommitment(String str) { return bclassify(cmt_model, getInstance(str)); } public boolean isProposal(String str) { return bclassify(prop_model, getInstance(str)); } public boolean isDirective(String str) { return bclassify(reqamdprop_model, getInstance(str)); } public boolean isCommissive(String str) { return bclassify(dlvcmt_model, getInstance(str)); } //------------------------------------------ TextLabels readBsh(File dir, File envfile) throws Exception{ System.out.println("reading data files..."); TextLabels lala = TextBaseLoader.loadDirOfTaggedFiles(dir); TextBase basevitor = lala.getTextBase(); TextLabelsLoader labelLoaderVitor = new TextLabelsLoader(); System.out.println("reading env file"); labelLoaderVitor.importOps((MutableTextLabels)lala, basevitor, envfile); return lala; } //just for fun private void createModel(String[] args) throws IOException{ String mytag = args[1]; String modelName = mytag+"Model"; Dataset dataset = new BasicDataset(); TextLabels labels; try { //I hope you have labeled data, otherwise... //labels = Family.readBsh(new File("dummy/"), new File("dummy.env")); //labels = Family.readBsh(new File("C:/m3test/total/data/"), new File("C:/m3test/total/env/all"+mytag+".env")); labels = readBsh(new File("C:/m3test/total/data/"), new File("C:/m3test/total/env/all"+mytag+".env")); TextBase base = labels.getTextBase(); dataset = CommandLineUtil.toDataset(labels, fe, null, mytag); //ClassifierLearner learner = new BatchVersion(new VotedPerceptron(), 15); ClassifierLearner learner = new Recommended.DecisionTreeLearner(); //if you want to do crosssvalidation tests //Splitter split = Expt.toSplitter("k5"); //Evaluation eval = Tester.evaluate(learner, dataset, split); //ViewerFrame frame = new ViewerFrame("numeric demo", eval.toGUI()); //eval.summarize(); System.out.println("training the Model..."); Classifier cl = new DatasetClassifierTeacher(dataset).train(learner); System.out.println("saving model in file..." + modelName); IOUtil.saveSerialized((Serializable) cl, new File(modelName)); } catch (Exception e) { e.printStackTrace(); } return; } public static void main(String[] args) { //Usage check try { if ((args.length < 1)|| (args.length>3)) { usage(); return; } boolean create = false; String opt = args[0]; if ((opt.startsWith("-create")) || (opt.startsWith("create"))) { SpeechAct sa = new SpeechAct(); sa.createModel(args); } else { File dir; String outputFileName = ""; StringBuffer reqBuf = new StringBuffer("Requests or Proposals\n"); boolean sophie = false; //list only the requests in a directory - Sophie's request //to Sophie - usage: SpeechAct -reqSophie tmpdirectory outputFileName if ((opt.startsWith("-sophie")) || (opt.startsWith("sophie"))) { //to Sophie - usage: SpeechAct -reqSophie tmpdirectory outputFileName sophie = true; dir = new File(args[1]);//directory name outputFileName = args[2];//output file name } else{ dir = new File(args[0]); } SpeechAct sa = new SpeechAct(); MutableTextLabels labels = TextBaseLoader.loadDirOfTaggedFiles(dir); TextBase textBase = labels.getTextBase(); System.out.println("textbase size = " + textBase.size()); //TextBaseEditor.edit(labels, new File("moomoomoo")); for (Span.Looper it = textBase.documentSpanIterator(); it.hasNext();){ Span span = (Span)it.next(); //for (Iterator it = labels.instanceIterator("body"); it.hasNext();) { //Span span = (Span)it.nextSpan(); MutableInstance ins = (MutableInstance)sa.fe.extractInstance(labels, span); boolean reqbool = sa.bclassify(sa.req_model, ins); boolean dlvbool = sa.bclassify(sa.dlv_model, ins); boolean propbool = sa.bclassify(sa.prop_model, ins); boolean cmtbool = sa.bclassify(sa.cmt_model, ins); boolean amdbool = sa.bclassify(sa.amd_model, ins); boolean reqamdpropbool = sa.bclassify(sa.reqamdprop_model, ins); boolean dlvcmtbool = sa.bclassify(sa.dlvcmt_model, ins); boolean meetbool = sa.bclassify(sa.meet_model, ins); boolean ddatabool = sa.bclassify(sa.ddata_model, ins); if((sophie)&&(reqamdpropbool)){ reqBuf.append(span.getDocumentId()+"\n"); } String reqs = reqbool? "_REQ_":"_____"; String dlvs = dlvbool? "_DLV_":"_____"; String props = propbool? "_PROP_":"______"; String cmts = cmtbool? "_CMT_":"_____"; String amds = amdbool? "_AMD_":"_____"; String reqamdprops = reqamdpropbool? "_REQAMDPROP":"___________"; String dlvcmts = dlvcmtbool? "_DLVCMT__":"_________"; String meets = meetbool? "__MEET___":"_________"; String ddatas = ddatabool? "__dDATA___":"_________"; System.out.print(span.getDocumentId()+" ("+reqs+" "+dlvs+" "+props+" "+cmts+" "+amds+" "+reqamdprops+" "+dlvcmts+" "+meets+" "+ddatas+")\n"); // String spanString = span.asString(); } //kludge if(sophie) LineProcessingUtil.writeToOutputFile(outputFileName, reqBuf); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); usage(); } } private static void usage() { System.out.println("usage: SpeechAct directoryName"); System.out.println("\n\n OR, if you have labeled data and want to create a model\n"); System.out.println("usage: SpeechAct -create VerbAct"); System.out.println("VerbAct = Req, Dlv, Cmt, Prop, Amd, ReqAmdProp or DlvCmt"); } }