package edu.stanford.nlp.pipeline;
import java.util.Set;
import junit.framework.TestCase;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import edu.stanford.nlp.util.Sets;
/** This test checks whether our trained POS tagger and parser models are using the identical POS tag set
* for the various languages that we support. It's a good idea if they are.
*
* @author Christopher Manning
*/
public class TaggerParserPosTagCompatibilityITest extends TestCase {

  /**
   * Checks that every listed model reports the same POS tag set as the first lexicalized parser.
   * The first entry of {@code lexParsers} is loaded once and its tag set is the reference that
   * all other models are compared against, in this order: taggers, remaining lexicalized
   * parsers, shift-reduce parsers, neural dependency parsers.
   *
   * @param lexParsers    Paths of lexicalized parser models; must contain at least one entry,
   *                      and the first one supplies the reference tag set
   * @param maxentTaggers Paths of maxent POS tagger models (may be empty)
   * @param srParsers     Paths of shift-reduce parser models (may be empty)
   * @param nnDepParsers  Paths of neural dependency parser models (may be empty)
   */
  private static void testTagSet4(String[] lexParsers,
                                  String[] maxentTaggers,
                                  String[] srParsers,
                                  String[] nnDepParsers) {
    // Fail with a readable message rather than an ArrayIndexOutOfBoundsException.
    assertTrue("At least one lexicalized parser is needed to supply the reference tag set",
        lexParsers.length > 0);

    String referenceName = lexParsers[0];
    LexicalizedParser referenceParser = LexicalizedParser.loadModel(referenceName);
    Set<String> referenceTags = referenceParser.getLexicon().tagSet(
        referenceParser.treebankLanguagePack().getBasicCategoryFunction());

    for (String name : maxentTaggers) {
      MaxentTagger tagger = new MaxentTagger(name);
      assertSameTagSet(referenceName, referenceTags, name, tagger.tagSet());
    }

    // Start at index 1: the reference model is already loaded, and comparing it with a second
    // copy of itself would only cost another model load.
    for (int i = 1; i < lexParsers.length; i++) {
      String name = lexParsers[i];
      LexicalizedParser parser = LexicalizedParser.loadModel(name);
      // As before, the basic category function of the reference parser's language pack is
      // used for every lexicalized parser, not the function of the parser being checked.
      Set<String> parserTags = parser.getLexicon().tagSet(
          referenceParser.treebankLanguagePack().getBasicCategoryFunction());
      assertSameTagSet(referenceName, referenceTags, name, parserTags);
    }

    for (String name : srParsers) {
      ShiftReduceParser parser = ShiftReduceParser.loadModel(name);
      assertSameTagSet(referenceName, referenceTags, name, parser.tagSet());
    }

    for (String name : nnDepParsers) {
      DependencyParser parser = DependencyParser.loadFromModelFile(name);
      assertSameTagSet(referenceName, referenceTags, name, parser.getPosSet());
    }
  }

  /**
   * Asserts that {@code tags} equals {@code referenceTags}. On failure the message names both
   * models and lists the set difference in each direction.
   *
   * @param referenceName Name of the model that supplied {@code referenceTags}
   * @param referenceTags The expected tag set
   * @param name          Name of the model being checked
   * @param tags          The tag set of the model being checked; computed once by the caller
   */
  private static void assertSameTagSet(String referenceName, Set<String> referenceTags,
                                       String name, Set<String> tags) {
    assertEquals(referenceName + " vs. " + name + " tag set mismatch:\n" +
            "left - right: " + Sets.diff(referenceTags, tags) +
            "; right - left: " + Sets.diff(tags, referenceTags) + "\n",
        referenceTags, tags);
  }


  private static final String[] englishTaggers = {
    "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger",
    "edu/stanford/nlp/models/pos-tagger/english-bidirectional/english-bidirectional-distsim.tagger",
    "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger",
  };

  private static final String[] englishParsers = {
    "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
    "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz",
    "edu/stanford/nlp/models/lexparser/englishRNN.ser.gz",
    "edu/stanford/nlp/models/lexparser/englishFactored.ser.gz",
  };

  private static final String[] englishSrParsers = {
    "edu/stanford/nlp/models/srparser/englishSR.beam.ser.gz",
    "edu/stanford/nlp/models/srparser/englishSR.ser.gz",
  };

  private static final String[] englishNnParsers = {
    "edu/stanford/nlp/models/parser/nndep/english_SD.gz",
    "edu/stanford/nlp/models/parser/nndep/english_UD.gz"
  };

  /** Compares the tag sets of the English taggers and parsers listed above. */
  public void testEnglishTagSet() {
    testTagSet4(englishParsers, englishTaggers, englishSrParsers, englishNnParsers);
  }


  private static final String[] germanTaggers = {
    "edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger",
    "edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger",
    // "edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger", // No longer supported; always worse than hgc
    "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger"
  };

  private static final String[] germanParsers = {
    "edu/stanford/nlp/models/lexparser/germanPCFG.ser.gz",
    "edu/stanford/nlp/models/lexparser/germanFactored.ser.gz",
  };

  private static final String[] germanSrParsers = {
    "edu/stanford/nlp/models/srparser/germanSR.ser.gz",
  };

  private static final String[] germanNnParsers = {
    // This one uses UD tag set not fine-grained tags!
    // "edu/stanford/nlp/models/parser/nndep/UD_German.gz",
  };

  /** Compares the tag sets of the German taggers and parsers listed above. */
  public void testGermanTagSet() {
    testTagSet4(germanParsers, germanTaggers, germanSrParsers, germanNnParsers);
  }


  private static final String[] chineseTaggers = {
    "edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger",
  };

  private static final String[] chineseParsers = {
    // Can't compare Xinhua ones because they have a smaller tag set than the full CTB v6+
    // "edu/stanford/nlp/models/lexparser/xinhuaPCFG.ser.gz",
    "edu/stanford/nlp/models/lexparser/chineseFactored.ser.gz",
    "edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz",
    // "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz",
    // "edu/stanford/nlp/models/lexparser/xinhuaFactored.ser.gz",
  };

  private static final String[] chineseSrParsers = {
    "edu/stanford/nlp/models/srparser/chineseSR.ser.gz",
  };

  private static final String[] chineseNnParsers = {
    // this one doesn't quite work because Factored has URL tag but UD_Chinese doesn't (not quite sure why...).
    // "edu/stanford/nlp/models/parser/nndep/UD_Chinese.gz"
  };

  /** Compares the tag sets of the Chinese taggers and parsers listed above. */
  public void testChineseTagSet() {
    testTagSet4(chineseParsers, chineseTaggers, chineseSrParsers, chineseNnParsers);
  }


  private static final String[] spanishTaggers = {
    "edu/stanford/nlp/models/pos-tagger/spanish/spanish.tagger",
    "edu/stanford/nlp/models/pos-tagger/spanish/spanish-distsim.tagger",
  };

  private static final String[] spanishParsers = {
    "edu/stanford/nlp/models/lexparser/spanishPCFG.ser.gz",
  };

  private static final String[] spanishSrParsers = {
    // todo [cdm 2014]: For some reason the SR parsers don't have the same tag set, missing 6 tags....
    // "edu/stanford/nlp/models/srparser/spanishSR.ser.gz",
    // "edu/stanford/nlp/models/srparser/spanishSR.beam.ser.gz",
  };

  private static final String[] spanishNnParsers = {
  };

  /** Compares the tag sets of the Spanish taggers and parsers listed above. */
  public void testSpanishTagSet() {
    testTagSet4(spanishParsers, spanishTaggers, spanishSrParsers, spanishNnParsers);
  }


  private static final String[] frenchTaggers = {
    "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
  };

  private static final String[] frenchParsers = {
    "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
  };

  private static final String[] frenchSrParsers = {
    // todo [cdm 2016]: For some reason the SR parsers don't have the same tag set. Investigate.
    // "edu/stanford/nlp/models/srparser/frenchSR.beam.ser.gz",
    // "edu/stanford/nlp/models/srparser/frenchSR.ser.gz",
  };

  private static final String[] frenchNnParsers = {
  };

  /** Compares the tag sets of the French taggers and parsers listed above. */
  public void testFrenchTagSet() {
    testTagSet4(frenchParsers, frenchTaggers, frenchSrParsers, frenchNnParsers);
  }

  // todo: Add Arabic sometime
  // todo: Add nndep parsers sometime

}