/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Wei Li <a href="mailto:weili@cs.umass.edu">weili@cs.umass.edu</a>
*/
package cc.mallet.share.weili.ner.enron;
import junit.framework.*;
import java.util.Iterator;
import java.util.Random;
import java.util.regex.*;
import java.io.*;
import cc.mallet.fst.*;
import cc.mallet.optimize.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.pipe.tsf.*;
import cc.mallet.share.upenn.ner.*;
import cc.mallet.types.*;
import cc.mallet.util.*;
/**
 * Command-line driver that trains a CRF named-entity tagger on the Enron data set.
 *
 * <p>Usage: {@code TUI (FeatureInduction|NoFeatureInduction) <crf-output-file> [viterbi-output-prefix]}
 *
 * <ul>
 *   <li>{@code args[0]} selects the training mode. Only {@code NoFeatureInduction} is
 *       implemented; {@code FeatureInduction} is rejected with an
 *       {@link IllegalStateException}.</li>
 *   <li>{@code args[1]} is the file the trained CRF is written to.</li>
 *   <li>{@code args[2]} (optional) is the filename prefix handed to {@code ViterbiWriter};
 *       when it is omitted no Viterbi output is produced.</li>
 * </ul>
 *
 * The data and lexicon directories are hard-coded in {@link #main(String[])}.
 */
public class TUI
{
	// Character classes used to assemble the spelling-feature regular expressions.
	private static final String CAPS = "[\\p{Lu}]";
	private static final String LOW = "[\\p{Ll}]";
	private static final String CAPSNUM = "[\\p{Lu}\\p{Nd}]";
	private static final String ALPHA = "[\\p{Lu}\\p{Ll}]";
	private static final String ALPHANUM = "[\\p{Lu}\\p{Ll}\\p{Nd}]";
	private static final String PUNT = "[,\\.;:?!()]";
	private static final String QUOTE = "[\"`']";

	// Recognised values of the first command-line argument.
	private static final String MODE_FEATURE_INDUCTION = "FeatureInduction";
	private static final String MODE_NO_FEATURE_INDUCTION = "NoFeatureInduction";

	private static final String USAGE =
		"Usage: TUI (FeatureInduction|NoFeatureInduction) <crf-output-file> [viterbi-output-prefix]";

	/**
	 * Builds the feature pipeline, loads the Enron messages, trains a CRF on an 80/20
	 * train/test split (fixed random seed 1) and writes the trained CRF to {@code args[1]}.
	 *
	 * <p>The command line is validated before any lexicon or data file is touched, so a
	 * malformed invocation fails immediately instead of after loading and training.
	 *
	 * @param args see the class comment for the expected arguments
	 * @throws IOException declared for the file-based pipeline and model output
	 * @throws IllegalStateException if the {@code FeatureInduction} mode is requested
	 */
	public static void main(String[] args) throws IOException {
		// ---- Command-line validation (fail fast, before the expensive loading below) ----
		if (args.length == 0) {
			System.err.println(USAGE);
			System.exit(1);
		}
		String mode = args[0];
		if (mode.equals(MODE_FEATURE_INDUCTION)) {
			// The feature-induction trainer call was disabled in this driver; for reference it was:
			//   crf.trainWithFeatureInduction (ilists[0], null, ilists[1], eval, 99999,
			//       10, 60, 500, 0.5, false, new double[] {.1, .2, .5, .7});
			throw new IllegalStateException ("Feature induction not yet supported.");
		}
		if (!mode.equals(MODE_NO_FEATURE_INDUCTION)) {
			System.err.println("Feature induction or not? Give me a choice.");
			System.exit(1);
		}
		if (args.length < 2) {
			System.err.println(USAGE);
			System.exit(1);
		}
		File crfOutputFile = new File(args[1]);
		// Optional third argument; null means "do not write Viterbi output".
		String viterbiPrefix = args.length > 2 ? args[2] : null;

		// ---- Hard-coded resource locations ----
		String datadir = "/usr/can/tmp3/weili/NER/Enron/data";
		String conlllexdir = "/usr/col/tmp1/weili/Resource/conllDict/";
		String idflexdir = "/usr/col/tmp1/weili/Resource/idfDict/";
		String placelexdir = "/usr/col/tmp1/weili/Resource/places";

		// ---- Lexicon-membership feature pipes ----
		Pipe conllLexiconsPipe = new SerialPipes (new Pipe[] {
			new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOPER")),
			new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOLOC")),
			new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOORG")),
			new TrieLexiconMembership (new File(conlllexdir + "conll/CONLLTWOMISC")),
		});
		Pipe googleLexiconsPipe = new SerialPipes (new Pipe[] {
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGSOCCER")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGGOVT")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNGO")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGMILITARY")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGCOMPANY")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGBANK")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGTRADE")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGNEWS")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGOPERATINGSYSTEM")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGPOLITICALPARTY")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGTRAVEL")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGBASEBALLTEAMAUGF")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGCARMODEL")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGCARCOMPANY")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGENGLISHCOUNTYAUG")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGUNIVERSITY")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCNATIONALITYAUGF")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCDISEASEAUG")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCTIME")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCAWARDS")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCMOVIESAUGF")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCPOLITICALPARTY")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCRELIGION")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCGOVT")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCWAR")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCCURRENCY")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/LOC")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/PERFL")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/MISCF")),
			new TrieLexiconMembership (new File(conlllexdir + "googlesets/ORGFRAWEDITEDSORTED")),
		});
		Pipe fixedLexiconsPipe = new SerialPipes (new Pipe[] {
			new LexiconMembership ("FIRSTHIGHEST", new File(conlllexdir + "personname/ssdi.prfirsthighest"), true),
			new LexiconMembership ("FIRSTHIGH", new File(conlllexdir + "personname/ssdi.prfirsthigh"), true),
			new LexiconMembership ("FIRSTMED", new File(conlllexdir + "personname/ssdi.prfirstmed"), true),
			new LexiconMembership ("FIRSTLOW", new File(conlllexdir + "personname/ssdi.prfirstlow"), true),
			new LexiconMembership ("LASTHIGHEST", new File(conlllexdir + "personname/ssdi.prlasthighest"), true),
			new LexiconMembership ("LASTHIGH", new File(conlllexdir + "personname/ssdi.prlasthigh"), true),
			new LexiconMembership ("LASTMED", new File(conlllexdir + "personname/ssdi.prlastmed"), true),
			new LexiconMembership ("LASTLOW", new File(conlllexdir + "personname/ssdi.prlastlow"), true),
			new LexiconMembership ("HONORIFIC", new File(conlllexdir + "personname/honorifics"), true),
			new LexiconMembership ("NAMESUFFIX", new File(conlllexdir + "personname/namesuffixes"), true),
			new LexiconMembership ("NAMEPARTICLE", new File(conlllexdir + "personname/name-particles"), true),
			new LexiconMembership ("DAY", new File(conlllexdir + "days"), true),
			new LexiconMembership ("MONTH", new File(conlllexdir + "months"), true),
			new LexiconMembership ("PLACESUFFIX", new File(conlllexdir + "place-suffixes"), true),
			new TrieLexiconMembership ("COUNTRY", new File(conlllexdir + "countries"), true),
			new TrieLexiconMembership ("COUNTRYCAPITAL", new File(conlllexdir + "country-capitals"), true),
			new TrieLexiconMembership ("USSTATE", new File(conlllexdir + "US-states"), true),
			new TrieLexiconMembership ("COMPANYNAME", new File(conlllexdir + "company-names"), true),
			new TrieLexiconMembership ("COMPANYSUFFIX", new File(conlllexdir + "company-suffixes"), true),
			new TrieLexiconMembership ("CONTINENT", new File(conlllexdir + "continents"), true),
			new LexiconMembership ("STOPWORD", new File(conlllexdir + "stopwords"), true),
			new TrieLexiconMembership (new File(conlllexdir + "biz.yahoo/COMPANYNAME.ABBREV")),
			new TrieLexiconMembership (new File(conlllexdir + "utexas/UNIVERSITIES")),
		});
		Pipe idfLexiconsPipe = new SerialPipes (new Pipe[] {
			new TrieLexiconMembership ("IDF_DES", new File(idflexdir + "designator.data"), true),
			new TrieLexiconMembership ("IDF_FIR", new File(idflexdir + "firstnames.data"), true),
			new TrieLexiconMembership ("IDF_LOC", new File(idflexdir + "locations.data"), true),
			new TrieLexiconMembership ("IDF_NAT", new File(idflexdir + "nations.data"), true),
			new TrieLexiconMembership ("IDF_ABB", new File(idflexdir + "non-final-abbrevs.data"), true),
			new TrieLexiconMembership ("IDF_ORG", new File(idflexdir + "organization.data"), true),
			new TrieLexiconMembership ("IDF_PER", new File(idflexdir + "person.data"), true),
		});

		// Spelling/orthography features. Currently NOT part of the active pipeline
		// (see the commented-out entry below); kept so it can be re-enabled easily.
		Pipe spellingFeaturesPipe = new SerialPipes (new Pipe[] {
			new RegexMatches ("INITCAP", Pattern.compile (CAPS+".*")),
			new RegexMatches ("CAPITALIZED", Pattern.compile (CAPS+LOW+"*")),
			new RegexMatches ("ALLCAPS", Pattern.compile (CAPS+"+")),
			new RegexMatches ("MIXEDCAPS", Pattern.compile ("[A-Z][a-z]+[A-Z][A-Za-z]*")),
			new RegexMatches ("CONTAINSDIGITS", Pattern.compile (".*[0-9].*")),
			new RegexMatches ("ALLDIGITS", Pattern.compile ("[0-9]+")),
			new RegexMatches ("NUMERICAL", Pattern.compile ("[-0-9]+[\\.,]+[0-9\\.,]+")),
			new RegexMatches ("MULTIDOTS", Pattern.compile ("\\.\\.+")),
			new RegexMatches ("ENDSINDOT", Pattern.compile ("[^\\.]+.*\\.")),
			new RegexMatches ("CONTAINSDASH", Pattern.compile (ALPHANUM+"+-"+ALPHANUM+"*")),
			new RegexMatches ("ACRO", Pattern.compile ("[A-Z][A-Z\\.]*\\.[A-Z\\.]*")),
			new RegexMatches ("LONELYINITIAL", Pattern.compile (CAPS+"\\.")),
			new RegexMatches ("SINGLECHAR", Pattern.compile (ALPHA)),
			new RegexMatches ("CAPLETTER", Pattern.compile ("[A-Z]")),
			new RegexMatches ("PUNC", Pattern.compile (PUNT)),
			new RegexMatches ("QUOTE", Pattern.compile (QUOTE)),
		});

		// ---- Full instance pipeline: message -> tokens -> features -> feature vectors ----
		SerialPipes p = new SerialPipes (new Pipe[] {
			new EnronMessage2TokenSequence (),
			//original
			//new TokenText("W="),
			//spellingFeaturesPipe,
			new NEPipes(new File(placelexdir)),
			conllLexiconsPipe,
			googleLexiconsPipe,
			fixedLexiconsPipe,
			idfLexiconsPipe,
			new OffsetConjunctions (new int[][]{{-1},{1}}),
			new PrintTokenSequenceFeatures(),
			new TokenSequence2FeatureVectorSequence (true, true)
		});

		// ---- Load the data and split it 80% train / 20% test with a fixed seed ----
		InstanceList ilist = new InstanceList (p);
		ilist.addThruPipe (new FileIterator (datadir, FileIterator.STARTING_DIRECTORIES));
		Random r = new Random (1);
		InstanceList[] ilists = ilist.split (r, new double[] {0.8, 0.2});
		InstanceList trainingData = ilists[0];
		InstanceList testingData = ilists[1];

		Alphabet targets = p.getTargetAlphabet();
		System.out.print ("State labels:");
		for (int i = 0; i < targets.size(); i++) {
			System.out.print (" " + targets.lookupObject(i));
		}
		System.out.println ("");
		System.out.println ("Number of features = "+p.getDataAlphabet().size());

		// ---- Build the CRF and its trainer ----
		CRF crf = new CRF (p, null);
		crf.addStatesForThreeQuarterLabelsConnectedAsIn (trainingData);
		CRFTrainerByLabelLikelihood crft = new CRFTrainerByLabelLikelihood (crf);
		crft.setGaussianPriorVariance (100.0);
		// Every state gets IMPOSSIBLE_WEIGHT as its initial weight, then "O" is reset to 0.0,
		// leaving "O" as the only state with a non-impossible initial weight.
		for (int i = 0; i < crf.numStates(); i++) {
			crf.getState(i).setInitialWeight (Transducer.IMPOSSIBLE_WEIGHT);
		}
		crf.getState("O").setInitialWeight(0.0);

		System.out.println("Training on "+trainingData.size()+" training instances.");
		MultiSegmentationEvaluator eval =
			new MultiSegmentationEvaluator (new InstanceList[] {trainingData, testingData},
				new String[] {"train", "test"},
				new String[] {"B-DATE", "B-TIME", "B-LOCATION", "B-PERSON",
					"B-ORGANIZATION", "B-ACRONYM", "B-PHONE", "B-MONEY", "B-PERCENT"},
				new String[] {"I-DATE", "I-TIME", "I-LOCATION", "I-PERSON",
					"I-ORGANIZATION", "I-ACRONYM", "I-PHONE", "I-MONEY", "I-PERCENT"});

		// A ViterbiWriter only produces output when it is asked to evaluate; merely
		// constructing one (as this driver previously did) writes nothing. Build it once
		// and invoke it explicitly inside the loop.
		ViterbiWriter viterbiWriter = (viterbiPrefix == null)
			? null
			: new ViterbiWriter (viterbiPrefix, trainingData, "train", testingData, "test");

		// ---- Training (mode is known to be NoFeatureInduction at this point) ----
		crft.train (trainingData, 5, new double[] {.1, .2, .5, .7});
		// Keep training until trainIncremental returns true, evaluating after every call.
		while (!crft.trainIncremental(trainingData)) {
			eval.evaluate(crft);
			if (viterbiWriter != null && crft.getIteration() % 5 == 0) {
				viterbiWriter.evaluate(crft);
			}
		}

		crf.write(crfOutputFile);
	}
}