import edu.cmu.minorthird.util.*; import edu.cmu.minorthird.util.gui.*; import edu.cmu.minorthird.text.*; import edu.cmu.minorthird.text.gui.*; import edu.cmu.minorthird.text.mixup.*; import edu.cmu.minorthird.text.learn.*; import edu.cmu.minorthird.classify.*; import edu.cmu.minorthird.classify.algorithms.linear.*; import edu.cmu.minorthird.classify.algorithms.trees.*; import edu.cmu.minorthird.classify.algorithms.svm.*; import edu.cmu.minorthird.classify.experiments.*; import edu.cmu.minorthird.classify.sequential.*; import java.util.*; import java.io.*; public class PersonNameTagger extends AbstractAnnotator { private Annotator learnedAnnotator; private MixupProgram featureProgram; public PersonNameTagger(String learnedAnnotatorFile) throws IOException,Mixup.ParseException { learnedAnnotator = (Annotator)IOUtil.loadSerialized(new File(learnedAnnotatorFile)); featureProgram = new MixupProgram(new File("nameFeatures.mixup")); } public void doAnnotate(MonotonicTextLabels labels) { MixupInterpreter interp = new MixupInterpreter(featureProgram); interp.eval(labels); learnedAnnotator.annotate( labels ); } public String explainAnnotation(TextLabels labels,Span span) { return "just because"; } public static void main(String[] args) { try { PersonNameTagger tagger = new PersonNameTagger(args[0]); TextBaseLoader baseLoader = new TextBaseLoader(TextBaseLoader.DOC_PER_FILE, TextBaseLoader.FILE_NAME); TextBase base = baseLoader.load(new File(args[1])); MonotonicTextLabels labels = new BasicTextLabels( base ); tagger.annotate( labels ); saveType(labels, "predicted_name", new File(args[2])); } catch (Exception e) { e.printStackTrace(); System.out.println("usage: annotatorFile mailDirectory tags"); } } private static void saveType(TextLabels labels, String type, File file) throws FileNotFoundException { PrintStream out = new PrintStream(new FileOutputStream(file)); for (Span.Looper j=labels.instanceIterator(type); j.hasNext(); ) { Span s = j.nextSpan(); if (s.size()>0) { int lo = s.getTextToken(0).getLo(); int hi = s.getTextToken(s.size()-1).getHi(); out.println("addToType "+s.getDocumentId()+" "+lo+" "+(hi-lo)+" "+type); } } out.close(); } }