import edu.cmu.minorthird.util.*;
import edu.cmu.minorthird.util.gui.*;
import edu.cmu.minorthird.text.*;
import edu.cmu.minorthird.text.gui.*;
import edu.cmu.minorthird.text.mixup.*;
import edu.cmu.minorthird.text.learn.*;
import edu.cmu.minorthird.text.learn.experiments.*;
import edu.cmu.minorthird.classify.experiments.*;
import edu.cmu.minorthird.classify.*;
import edu.cmu.minorthird.classify.sequential.*;
import java.util.*;
import java.io.*;
/**
 * Command-line driver for training, testing and evaluating a sequence-based
 * annotator for a single span type (by default "name").
 *
 * Supported commands (selected with <code>-do</code>): <code>train</code>,
 * <code>test</code>, <code>expt</code> and <code>printWords</code>.
 * The true spans are read from the span type <code>"true_" + classToLearn</code>;
 * a trained annotator is constructed with the type
 * <code>"predicted_" + classToLearn</code>.
 */
public class Main
{
    /** File used by "train" and "test" when no -save option is given. */
    static private final String DEFAULT_ANNOTATOR_FILE = "out.ann";

    static private int featureWindow=5,classWindow=3,epochs=6;

    /** Experiment settings; copied onto the Evaluation by the non-interactive path of doExpt. */
    static private Map propertyMap = new HashMap();

    /** Span type to learn; overridden by the -class option. */
    static private String classToLearn = "name";

    /**
     * Reduction passed to prepareSequenceData by doExpt and buildAnnotator.
     * Swap with the commented-out line below to try the alternative reduction.
     */
    static private Extraction2TaggingReduction reduction = new InsideOutsideReduction();
    //static private Extraction2TaggingReduction reduction = new BeginContinueEndUniqueReduction();

    /** Stem of the mixup file ("STEM.mixup") evaluated on the labels; overridden by -mixup. */
    static private String mixupFileStem="nameFeatures_v2";

    /**
     * Parses the command line, loads the labels, evaluates the mixup program on
     * them and dispatches to the requested command.
     *
     * @param args command-line options; each option is followed by exactly one value
     * @throws IllegalArgumentException if an option is unknown or is missing its
     *         value (the usage text is printed first). Failures after option
     *         parsing are reported via a stack trace plus the usage text and are
     *         not rethrown.
     */
    public static void main(String[] args)
    {
        String command = null;
        String learnerName = "new CollinsPerceptronLearner("+classWindow+","+epochs+")";
        String labelsKey = null;
        String saveFileName = null;
        String splitterName = "r50";
        String show = null;

        int pos = 0;
        try {
            while (pos<args.length) {
                String opt = args[pos++];
                if (opt.equals("-do")) command = optionValue(args, pos++, opt);
                else if (opt.startsWith("-lea")) learnerName = optionValue(args, pos++, opt);
                else if (opt.startsWith("-labels")) labelsKey = optionValue(args, pos++, opt);
                else if (opt.startsWith("-save")) saveFileName = optionValue(args, pos++, opt);
                else if (opt.startsWith("-split")) splitterName = optionValue(args, pos++, opt);
                else if (opt.startsWith("-mix")) mixupFileStem = optionValue(args, pos++, opt);
                else if (opt.startsWith("-show")) show = optionValue(args, pos++, opt);
                else if (opt.startsWith("-class")) classToLearn = optionValue(args, pos++, opt);
                else throw new IllegalArgumentException("illegal option "+opt);
            }
        } catch (IllegalArgumentException ex) {
            // Show the usage for bad command lines, but still let the error propagate.
            printUsage();
            throw ex;
        }

        propertyMap.put("featureWindow",Integer.toString(featureWindow));
        propertyMap.put("classWindow",Integer.toString(classWindow));
        propertyMap.put("epochs",Integer.toString(epochs));
        propertyMap.put("learner",learnerName);
        propertyMap.put("labels",labelsKey);
        propertyMap.put("splitter",splitterName);
        propertyMap.put("mixup",mixupFileStem);

        System.out.println("class: '"+classToLearn+"'");
        try {
            // Validate before announcing the load, so we never print "loading labels null".
            if (labelsKey==null) throw new IllegalArgumentException("-labels LABELS must be specified");
            System.out.println("loading labels "+labelsKey);
            MutableTextLabels labels = (MutableTextLabels)FancyLoader.loadTextLabels(labelsKey);

            System.out.println("loading "+mixupFileStem+".mixup...");
            MixupProgram program = new MixupProgram(new File(mixupFileStem+".mixup"));
            MixupInterpreter interp = new MixupInterpreter(program);
            interp.eval(labels);

            if ("train".equals(command)) {
                SequenceClassifierLearner learner = SequenceAnnotatorExpt.toSeqLearner(learnerName);
                if (saveFileName==null) saveFileName = DEFAULT_ANNOTATOR_FILE;
                buildAnnotator(labels, learner, saveFileName);
            } else if ("test".equals(command)) {
                if (saveFileName==null) saveFileName = DEFAULT_ANNOTATOR_FILE;
                testAnnotator(labels, saveFileName);
            } else if ("expt".equals(command)) {
                SequenceClassifierLearner learner = SequenceAnnotatorExpt.toSeqLearner(learnerName);
                Splitter splitter = Expt.toSplitter(splitterName);
                // saveFileName may be null here, in which case doExpt saves nothing.
                doExpt(labels, splitter, learner, saveFileName, "all".equals(show));
            } else if ("printWords".equals(command)) {
                printAllWords(labels);
            } else {
                throw new IllegalArgumentException("unknown command '"+command+"'");
            }
        } catch (Exception ex) {
            ex.printStackTrace();
            printUsage();
        }
    }

    /**
     * Returns the value that follows an option on the command line.
     *
     * @param args the full command line
     * @param pos index at which the option's value is expected
     * @param opt the option being parsed, used only for the error message
     * @return <code>args[pos]</code>
     * @throws IllegalArgumentException if the command line ends before the value
     */
    private static String optionValue(String[] args, int pos, String opt)
    {
        if (pos>=args.length) {
            throw new IllegalArgumentException("option "+opt+" requires a value");
        }
        return args[pos];
    }

    /** Prints the command-line usage summary to standard output. */
    private static void printUsage()
    {
        System.out.println(
            "usage: -do train -labels KEY -mixup FILESTEM -learner LEARNER -save FILE");
        System.out.println(
            " -do test -labels KEY -mixup FILESTEM -save FILE");
        System.out.println(
            " -do expt -labels KEY -mixup FILESTEM -learner LEARNER -splitter SPLIT -save FILE [-show all]");
        System.out.println(
            " -do printWords -labels KEY -mixup FILESTEM");
        System.out.println(
            " also: -class name|date changes class to learn");
        System.out.println(
            " also: -mixup mixupFileStem");
    }

    /**
     * Does a train/test experiment and prints the summary statistics of the
     * resulting evaluation.
     *
     * @param labels labels holding the <code>"true_" + classToLearn</code> spans
     * @param splitter splitter handed to the tester / cross-validation
     * @param learner sequence learner to evaluate
     * @param outputFile file to save the evaluation to, or null to skip saving
     * @param explore if true, run through CrossValidatedSequenceDataset and open a
     *        ViewerFrame on it; otherwise evaluate with Tester and attach the
     *        recorded experiment properties to the evaluation
     */
    public static void doExpt(
        MutableTextLabels labels,
        Splitter splitter,
        SequenceClassifierLearner learner,
        String outputFile,
        boolean explore)
    {
        try {
            String trueType = "true_"+classToLearn;
            System.out.println("teacher uses '"+trueType+"'");
            //TextBaseEditor.edit(labels, null);
            SpanFeatureExtractor fe = fe(labels);
            // Use the class-level reduction so that switching it actually takes effect.
            SequenceDataset sequenceDataset =
                SequenceAnnotatorLearner.prepareSequenceData(
                    labels,trueType,null,fe,classWindow,reduction);
            //ViewerFrame fd = new ViewerFrame("Name Learning Result",sequenceDataset.toGUI());
            DatasetIndex index = new DatasetIndex(sequenceDataset);
            System.out.println("Dataset: examples "+sequenceDataset.size()
                               +" features: "+index.numberOfFeatures()
                               +" avg features/examples: "+index.averageFeaturesPerExample());

            Evaluation evaluation = null;
            if (!explore) {
                evaluation = Tester.evaluate(learner,sequenceDataset,splitter);
                for (Iterator i=propertyMap.entrySet().iterator(); i.hasNext(); ) {
                    Map.Entry entry = (Map.Entry)i.next();
                    evaluation.setProperty( (String)entry.getKey(), (String)entry.getValue() );
                }
            } else {
                // NOTE(review): the experiment properties are not attached on this path,
                // matching the original behavior -- confirm whether that is intended.
                CrossValidatedSequenceDataset cvd =
                    new CrossValidatedSequenceDataset( learner, sequenceDataset, splitter );
                // Constructed for its effect of showing the result; the reference is not needed.
                new ViewerFrame("Name Learning Result",cvd.toGUI());
                evaluation = cvd.getEvaluation();
            }

            String[] tags = evaluation.summaryStatisticNames();
            double[] stats = evaluation.summaryStatistics();
            for (int i=0; i<stats.length; i++) {
                System.out.println(tags[i]+": "+stats[i]);
            }
            if (outputFile!=null) {
                evaluation.save(new File(outputFile));
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Builds the feature extractor used for both training and experiments, and
     * records its settings ("properties", "useEqOnNonAnchors") in the property map.
     *
     * @param labels labels whose token properties become the extractor's
     *        token-property features
     * @return a NameFE configured with the feature window, the token properties,
     *         useEqOnNonAnchors=true and the mixup file stem as required annotation
     */
    public static SpanFeatureExtractor fe(TextLabels labels)
    {
        boolean useEq = true;
        Set props = labels.getTokenProperties();
        System.out.println("props: "+props);

        NameFE fe = new NameFE();
        fe.setWindowSize(featureWindow);
        fe.setTokenPropertyFeatures( props );
        fe.setUseEqOnNonAnchors(useEq);
        fe.setRequiredAnnotation(mixupFileStem);

        propertyMap.put("properties", props.toString() );
        propertyMap.put("useEqOnNonAnchors", Boolean.toString(useEq) );
        return fe;
    }

    /**
     * Trains an annotator on the <code>"true_" + classToLearn</code> spans and
     * saves it in serialized form.
     *
     * @param labels labels holding the training spans
     * @param learner sequence learner to train
     * @param outputFile file the serialized annotator is written to
     */
    public static void buildAnnotator(MutableTextLabels labels,SequenceClassifierLearner learner,String outputFile)
    {
        try {
            SpanFeatureExtractor fe = fe(labels);
            // Use the class-level reduction so that switching it actually takes effect.
            SequenceDataset sequenceDataset =
                SequenceAnnotatorLearner.prepareSequenceData(
                    labels,"true_"+classToLearn,null,fe,classWindow,reduction);
            //ViewerFrame fd = new ViewerFrame("Name Learning Result",sequenceDataset.toGUI());
            SequenceClassifier sequenceClassifier =
                new DatasetSequenceClassifierTeacher(sequenceDataset).train(learner);
            Annotator annotator =
                new SequenceAnnotatorLearner.SequenceAnnotator(sequenceClassifier,fe,"predicted_"+classToLearn);
            IOUtil.saveSerialized((Serializable)annotator,new File(outputFile));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Loads a serialized annotator, applies it to the labels and opens the
     * result in a TextBaseEditor backed by the file "myCorrections.env".
     *
     * @param labels labels to annotate
     * @param inFile file containing the serialized annotator
     */
    public static void testAnnotator(MutableTextLabels labels,String inFile)
    {
        try {
            Annotator annotator = (Annotator)IOUtil.loadSerialized(new File(inFile));
            annotator.annotate(labels);
            TextBaseEditor.edit(labels,new File("myCorrections.env"));
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Use this to help find all name and non-name words in a text.
     * Prints one line per token of every document: the class being learned if
     * the token carries the "inTrueName" property (defined here by a mixup
     * statement over the <code>"true_" + classToLearn</code> spans), otherwise
     * "word", followed by the token's value.
     *
     * @param labels labels to inspect; the "inTrueName" token property is added to them
     */
    public static void printAllWords(MutableTextLabels labels)
    {
        try {
            MixupProgram prog = new MixupProgram(new String[]{
                "defTokenProp inTrueName:t =top: ... [@true_"+classToLearn+"] ..."});
            MixupInterpreter interp = new MixupInterpreter(prog);
            interp.eval(labels);
            for (Span.Looper i=labels.getTextBase().documentSpanIterator(); i.hasNext(); ) {
                Span doc = i.nextSpan();
                for (int j=0; j<doc.size(); j++) {
                    Token token = doc.getToken(j);
                    String tag = (labels.getProperty(token,"inTrueName")!=null) ? classToLearn : "word";
                    System.out.println(tag + " " +token.getValue());
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}