// Copyright 2013 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package marmot.test.morph;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import junit.framework.AssertionFailedError;
import org.junit.Test;
import marmot.core.Model;
import marmot.core.Options;
import marmot.core.PerceptronTrainer;
import marmot.core.Sequence;
import marmot.core.Tagger;
import marmot.core.Token;
import marmot.morph.MorphEvaluator;
import marmot.morph.MorphModel;
import marmot.morph.MorphOptions;
import marmot.morph.MorphResult;
import marmot.morph.Sentence;
import marmot.morph.Word;
import marmot.morph.io.SentenceReader;
import marmot.util.Copy;
import marmot.util.FileUtils;
import marmot.util.StringUtils.Mode;
public class PipelineTest {
/**
 * Builds a {@code res:///} locator for a resource located in the package of
 * this test class.
 *
 * @param name file name of the resource
 * @return {@code res:///<package path>/<name>}, where the package path is the
 *         package name with dots replaced by slashes
 */
private String getResourceFile(String name) {
    String packagePath = getClass().getPackage().getName().replace('.', '/');
    return "res:///" + packagePath + "/" + name;
}
/**
 * Reads sentences from {@code filename} using a {@link SentenceReader}.
 *
 * @param filename location passed unchanged to the {@link SentenceReader}
 * @param number   maximum number of sentences to return; a negative value
 *                 means "no limit"
 * @return the sentences in reading order; empty when {@code number} is 0
 */
public static List<Sequence> getSentences(String filename, int number) {
    List<Sequence> list = new LinkedList<Sequence>();
    // The limit used to be checked only after the first add, so a limit of
    // zero still produced one sentence. Nothing needs to be read in that case.
    if (number == 0) {
        return list;
    }
    for (Sequence sentence : new SentenceReader(filename)) {
        list.add(sentence);
        if (number > 0 && list.size() >= number) {
            break;
        }
    }
    return list;
}
/**
 * Creates the hard-coded toy training corpus: the two sentences
 * "das ist ein Test ." and "die Rüben sind kalt .".
 * Each {@link Word} is built from three strings (presumably form, tag and
 * morphological features; confirm against {@code Word}).
 *
 * @return a new mutable list holding the two sentences in that order
 */
public List<Sequence> getTrainSentences() {
    List<Sequence> corpus = new LinkedList<Sequence>();

    List<Word> firstTokens = new LinkedList<Word>();
    firstTokens.add(new Word("das", "A", "c=N|n=S"));
    firstTokens.add(new Word("ist", "V", "n=S"));
    firstTokens.add(new Word("ein", "A", "c=N|n=S"));
    firstTokens.add(new Word("Test", "N", "c=N|n=S"));
    firstTokens.add(new Word(".", ".", "_"));
    corpus.add(new Sentence(firstTokens));

    List<Word> secondTokens = new LinkedList<Word>();
    secondTokens.add(new Word("die", "A", "c=N|n=P"));
    secondTokens.add(new Word("Rüben", "N", "c=N|n=P"));
    secondTokens.add(new Word("sind", "V", "n=P"));
    secondTokens.add(new Word("kalt", "J", "c=N|n=P"));
    secondTokens.add(new Word(".", ".", "_"));
    corpus.add(new Sentence(secondTokens));

    return corpus;
}
/**
 * Creates the hard-coded toy test corpus: the single sentence
 * "das ist mein Test .".
 *
 * @return an immutable one-element list containing that sentence
 */
public List<Sequence> getTestSentences() {
    List<Word> words = new LinkedList<Word>();
    words.add(new Word("das", "A", "c=N|n=S"));
    words.add(new Word("ist", "V", "n=S"));
    words.add(new Word("mein", "A", "c=N|n=S"));
    words.add(new Word("Test", "N", "c=N|n=S"));
    words.add(new Word(".", ".", "_"));

    // Typing the local as Sequence lets singletonList infer List<Sequence>.
    Sequence sentence = new Sentence(words);
    return Collections.singletonList(sentence);
}
/**
 * Creates a {@link MorphModel} and initializes it with the given options and
 * sentences.
 *
 * @param sentences sentences handed to {@code MorphModel.init}
 * @param options   options handed to {@code MorphModel.init}
 * @return the initialized model
 */
public Model getModel(Collection<Sequence> sentences, MorphOptions options) {
    MorphModel morphModel = new MorphModel();
    morphModel.init(options, sentences);
    return morphModel;
}
/**
 * Runs the toy pipeline three times on one options object that is mutated
 * between runs: first unpruned, then pruned with {@code TAG_MORPH} set to
 * "false", and finally with the candidate list {@code [4, 2, 1.5, 1.25]}.
 */
@Test
public void toyPosTest() {
    final MorphOptions opts = new MorphOptions();
    opts.setProperty(Options.SEED, "42");
    opts.setProperty(MorphOptions.NUM_ITERATIONS, "10");
    opts.setProperty(MorphOptions.VECTOR_SIZE, "1024");
    opts.setProperty(MorphOptions.CANDIDATES_PER_STATE, "[9, 9, 9]");

    // Run 1: pruning off.
    opts.setProperty(MorphOptions.PRUNE, "false");
    toyTestWithOptions(opts);

    // Run 2: pruning on, TAG_MORPH off.
    opts.setProperty(MorphOptions.PRUNE, "true");
    opts.setProperty(MorphOptions.TAG_MORPH, "false");
    toyTestWithOptions(opts);

    // Run 3: same as run 2 with a different candidate list.
    opts.setProperty(MorphOptions.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    toyTestWithOptions(opts);
}
/**
 * Runs the toy pipeline three times on one options object that is mutated
 * between runs: first unpruned, then pruned, and finally pruned with the
 * candidate list {@code [4, 2, 1.5, 1.25]}.
 */
@Test
public void toyTest() {
    final MorphOptions opts = new MorphOptions();
    opts.setProperty(Options.SEED, "42");
    opts.setProperty(MorphOptions.NUM_ITERATIONS, "10");
    opts.setProperty(MorphOptions.VECTOR_SIZE, "1024");
    opts.setProperty(MorphOptions.CANDIDATES_PER_STATE, "[9, 9, 9]");

    // Run 1: pruning off.
    opts.setProperty(MorphOptions.PRUNE, "false");
    toyTestWithOptions(opts);

    // Run 2: pruning on.
    opts.setProperty(MorphOptions.PRUNE, "true");
    toyTestWithOptions(opts);

    // Run 3: pruning on with a different candidate list.
    opts.setProperty(MorphOptions.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    toyTestWithOptions(opts);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with
 * {@code USE_HASH_FEATURE_TABLE} set to "false"; requires at least 98.51 %
 * train and 54.10 % test accuracy.
 */
@Test
public void realTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(MorphOptions.USE_HASH_FEATURE_TABLE, "false");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 98.51, 54.10);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with
 * {@code MAX_AFFIX_LENGTH} 5 and the feature template list
 * {@code form,rare,infix,context,sig,bigrams}; requires at least 99 % train
 * and 51.50 % test accuracy.
 */
@Test
public void realInfixTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(MorphOptions.MAX_AFFIX_LENGTH, "5");
    opts.setProperty(MorphOptions.FEATURE_TEMPLATES, "form,rare,infix,context,sig,bigrams");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 99.0, 51.50);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with
 * {@code FLOAT_TYPE_DICT} pointing at the {@code svd_small.txt} resource;
 * requires at least 98.89 % train and 54.52 % test accuracy.
 */
@Test
public void realFloatTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(MorphOptions.FLOAT_TYPE_DICT, getResourceFile("svd_small.txt"));

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 98.89, 54.52);
}
/**
 * Builds the options for a run with {@code OPTIMIZE_NUM_ITERATIONS} enabled.
 *
 * NOTE(review): the call that would train and assert is commented out below,
 * so this test currently only configures options and verifies nothing.
 */
@Test
public void realOptimizerTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.OPTIMIZE_NUM_ITERATIONS, "true");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    // Disabled assertion, kept for reference:
    // realOptimizerTestWithOptions(opts, 97.96, 54.18);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with
 * {@code USE_HASH_VECTOR} set to "false" and a {@code VECTOR_SIZE} of 1;
 * requires at least 98.51 % train and 54.10 % test accuracy.
 */
@Test
public void realNonHashTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "1");
    opts.setProperty(MorphOptions.USE_HASH_VECTOR, "false");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, "0.1");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 98.51, 54.10);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with
 * {@code FORM_NORMALIZATION} set to {@code Mode.lower}; requires at least
 * 99.11 % train and 55.38 % test accuracy.
 */
@Test
public void realNormalizeFormTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(MorphOptions.FORM_NORMALIZATION, Mode.lower.toString());

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 99.11, 55.38);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with
 * {@code SPECIAL_SIGNATURE} set to "true"; requires at least 98.45 % train
 * and 53.96 % test accuracy.
 */
@Test
public void realSpecialSignatureTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(MorphOptions.SPECIAL_SIGNATURE, "true");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 98.45, 53.96);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with file
 * descriptors that carry no {@code morph-index} and with {@code TAG_MORPH}
 * set to "false"; requires at least 99.66 % train and 79.14 % test accuracy.
 */
@Test
public void realPosTest() {
    final String columns = "form-index=1,tag-index=4,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));
    opts.setProperty(MorphOptions.TAG_MORPH, "false");

    realTestWithOptions(opts, 99.66, 79.14);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} using
 * {@link PerceptronTrainer} as {@code TRAINER}, {@code ORDER} 1, pruning
 * off and {@code TAG_MORPH} "false"; requires at least 98.84 % train and
 * 77.49 % test accuracy.
 */
@Test
public void realPerceptronPosTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.PRUNE, "false");
    opts.setProperty(Options.ORDER, "1");
    opts.setProperty(Options.TRAINER, PerceptronTrainer.class.getCanonicalName());
    opts.setProperty(MorphOptions.TAG_MORPH, "false");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 98.84, 77.49);
}
/**
 * Trains on {@code trn.txt} and evaluates on {@code tst.txt} with
 * {@code ORACLE} set to "true"; requires at least 99.94 % train and
 * 53.39 % test accuracy.
 */
@Test
public void realOracleTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(Options.ORACLE, "true");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.txt"));

    realTestWithOptions(opts, 99.94, 53.39);
}
/**
 * Trains on {@code trn.fst.txt} and evaluates on {@code tst.fst.txt}, whose
 * file descriptors additionally specify {@code token-feature-index=7};
 * requires at least 99.33 % train and 70.10 % test accuracy.
 */
@Test
public void realFstTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,token-feature-index=7,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.fst.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.fst.txt"));

    realTestWithOptions(opts, 99.33, 70.10);
}
/**
 * Trains on {@code trn.fst.txt} and evaluates on {@code tst.fst.txt} with
 * {@code USE_DEFAULT_FEATURES} set to "false"; requires at least 60.84 %
 * train and 50.13 % test accuracy.
 */
@Test
public void realFstNoDefaultFeaturesTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,token-feature-index=7,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.fst.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.fst.txt"));
    opts.setProperty(MorphOptions.USE_DEFAULT_FEATURES, "false");

    realTestWithOptions(opts, 60.84, 50.13);
}
/**
 * Trains on {@code trn.aramorph.txt} and evaluates on
 * {@code tst.aramorph.txt} with {@code INTERNAL_ANALYZER} set to "ar";
 * requires at least 100.00 % train and 66.26 % test accuracy.
 */
@Test
public void realAramorphBaselineTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(MorphOptions.INTERNAL_ANALYZER, "ar");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.aramorph.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.aramorph.txt"));

    realTestWithOptions(opts, 100.00, 66.26);
}
/**
 * Trains on {@code trn.fst.txt} and evaluates on {@code tst.fst.txt} with
 * {@code MAX_TRANSITION_FEATURE_LEVEL} set to "0"; requires at least
 * 99.06 % train and 69.46 % test accuracy.
 */
@Test
public void realFstMaxLevelTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,token-feature-index=7,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(Options.MAX_TRANSITION_FEATURE_LEVEL, "0");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.fst.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.fst.txt"));

    realTestWithOptions(opts, 99.06, 69.46);
}
/**
 * Trains on {@code trn.fst.txt} and evaluates on {@code tst.fst.txt} with
 * {@code ORACLE} set to "true"; requires at least 99.83 % train and
 * 70.94 % test accuracy.
 */
@Test
public void realOracleFstTest() {
    final String columns = "form-index=1,tag-index=4,morph-index=6,token-feature-index=7,";
    final MorphOptions opts = new MorphOptions();

    opts.setProperty(Options.SEED, "42");
    opts.setProperty(Options.NUM_ITERATIONS, "10");
    opts.setProperty(Options.VECTOR_SIZE, "10000000");
    opts.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
    opts.setProperty(Options.PRUNE, "true");
    opts.setProperty(Options.ORDER, "3");
    opts.setProperty(Options.PENALTY, ".1");
    opts.setProperty(Options.ORACLE, "true");

    opts.setProperty(MorphOptions.TRAIN_FILE, columns + getResourceFile("trn.fst.txt"));
    opts.setProperty(MorphOptions.TEST_FILE, columns + getResourceFile("tst.fst.txt"));

    realTestWithOptions(opts, 99.83, 70.94);
}
/**
 * Trains and evaluates on the hard-coded toy corpora, requiring 100 %
 * accuracy on both and imposing no lemma threshold.
 * Sets {@code SHAPE} to "false" on the passed-in options, so the caller's
 * object is modified.
 *
 * @param options options to train with; mutated as described above
 */
public void toyTestWithOptions(MorphOptions options) {
    options.setProperty(MorphOptions.SHAPE, "false");

    List<Sequence> trainCorpus = getTrainSentences();
    List<Sequence> testCorpus = getTestSentences();

    // Called directly from this method: testWithOptions inspects the call stack.
    testWithOptions(options, trainCorpus, testCorpus, 100.0, 100.0, 0.0, 0.0);
}
/**
 * Loads at most 100 sentences from each of the configured train and test
 * files and runs {@code testOptimizerWithOptions} on them.
 *
 * @param options   options providing the train and test file locations
 * @param train_acc minimum accuracy (percent) required on the train sentences
 * @param test_acc  minimum accuracy (percent) required on the test sentences
 */
public void realOptimizerTestWithOptions(final MorphOptions options,
        double train_acc, double test_acc) {
    List<Sequence> trainCorpus = getSentences(options.getTrainFile(), 100);
    List<Sequence> testCorpus = getSentences(options.getTestFile(), 100);

    // Called directly from this method: the callee inspects the call stack.
    testOptimizerWithOptions(options, trainCorpus, testCorpus, train_acc, test_acc);
}
/**
 * Convenience overload that applies no lemma accuracy requirement (both
 * lemma thresholds are 0.0).
 *
 * @param options   options providing the train and test file locations
 * @param train_acc minimum accuracy (percent) required on the train sentences
 * @param test_acc  minimum accuracy (percent) required on the test sentences
 */
public static void realTestWithOptions(final MorphOptions options,
        double train_acc, double test_acc) {
    final double lemmaTrainThreshold = 0.0;
    final double lemmaTestThreshold = 0.0;
    realTestWithOptions(options, train_acc, test_acc, lemmaTrainThreshold,
            lemmaTestThreshold);
}
/**
 * Loads at most 100 sentences from each of the configured train and test
 * files and runs {@code testWithOptions} on them.
 *
 * @param options         options providing the train and test file locations
 * @param train_acc       minimum accuracy (percent) on the train sentences
 * @param test_acc        minimum accuracy (percent) on the test sentences
 * @param lemma_train_acc minimum lemma accuracy (percent) on the train sentences
 * @param lemma_test_acc  minimum lemma accuracy (percent) on the test sentences
 */
public static void realTestWithOptions(final MorphOptions options,
        double train_acc, double test_acc, double lemma_train_acc,
        double lemma_test_acc) {
    List<Sequence> trainCorpus = getSentences(options.getTrainFile(), 100);
    List<Sequence> testCorpus = getSentences(options.getTestFile(), 100);

    testWithOptions(options, trainCorpus, testCorpus, train_acc, test_acc,
            lemma_train_acc, lemma_test_acc);
}
/**
 * Trains a tagger on {@code train_sentences}, asserts its accuracy on the
 * train and test sentences, then saves it to a temporary file, loads it back
 * and asserts the reloaded tagger's accuracy on the test sentences.
 *
 * @param options               options passed to {@code MorphModel.train}
 * @param train_sentences       sentences to train on and to check train accuracy with
 * @param test_sentences        sentences to check test accuracy with
 * @param train_threshold       minimum accuracy (percent) on the train sentences
 * @param test_threshold        minimum accuracy (percent) on the test sentences
 * @param train_lemma_threshold minimum lemma accuracy (percent) on the train sentences
 * @param test_lemma_threshold  minimum lemma accuracy (percent) on the test sentences
 * @throws RuntimeException wrapping the {@link IOException} if the temporary
 *                          file cannot be created, written or read
 */
public static void testWithOptions(MorphOptions options,
        List<Sequence> train_sentences, List<Sequence> test_sentences,
        double train_threshold, double test_threshold,
        double train_lemma_threshold, double test_lemma_threshold) {
    // Label the run with the first frame below the "...WithOptions" helper
    // chain. A fixed stack index is wrong when the call passes through both
    // realTestWithOptions overloads, where it names the helper instead of
    // the test method.
    String caller = "None";
    boolean insideHelpers = false;
    for (StackTraceElement frame : Thread.currentThread().getStackTrace()) {
        String method = frame.getMethodName();
        if (method.endsWith("WithOptions")) {
            insideHelpers = true;
        } else if (insideHelpers) {
            caller = method;
            break;
        }
    }

    Tagger tagger = MorphModel.train(options, train_sentences, null);
    assertModelPerformanceOnTestset(caller + " Train", tagger,
            train_sentences, train_threshold, train_lemma_threshold);
    assertModelPerformanceOnTestset(caller + " Test ", tagger,
            test_sentences, test_threshold, test_lemma_threshold);

    try {
        File tempfile = File.createTempFile("tagger", ".marmot");
        tempfile.deleteOnExit();
        FileUtils.saveToFile(tagger, tempfile);
        Tagger loaded_tagger = FileUtils.loadFromFile(tempfile);
        // The reloaded tagger must meet the same lemma threshold as the
        // in-memory one; previously the lemma check was dropped here.
        assertModelPerformanceOnTestset(caller + " Test (reload) ",
                loaded_tagger, test_sentences, test_threshold,
                test_lemma_threshold);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
/**
 * Trains a tagger with {@code MorphModel.trainOptimal}, asserts its accuracy
 * on the train and test sentences, then repeats the test-set assertion on a
 * copy produced by {@code Copy.clone}.
 *
 * @param options         options passed to {@code MorphModel.trainOptimal}
 * @param train_sentences sentences to train on and to check train accuracy with
 * @param test_sentences  sentences to check test accuracy with; must not be null
 * @param train_threshold minimum accuracy (percent) on the train sentences
 * @param test_threshold  minimum accuracy (percent) on the test sentences
 */
public static void testOptimizerWithOptions(MorphOptions options,
        List<Sequence> train_sentences, List<Sequence> test_sentences,
        double train_threshold, double test_threshold) {
    // The frame three levels up is used as the label for this run.
    StackTraceElement[] frames = Thread.currentThread().getStackTrace();
    String caller = (frames.length > 3) ? frames[3].getMethodName() : "None";

    assert test_sentences != null;

    Tagger tagger = MorphModel.trainOptimal(options, train_sentences, null);
    assertModelPerformanceOnTestset(caller + " Train", tagger,
            train_sentences, train_threshold);
    assertModelPerformanceOnTestset(caller + " Test ", tagger,
            test_sentences, test_threshold);

    Tagger copy = Copy.clone(tagger);
    assertModelPerformanceOnTestset(caller + " Test (reload) ", copy,
            test_sentences, test_threshold);
}
/**
 * Convenience overload that applies no lemma accuracy requirement (lemma
 * threshold 0.0).
 *
 * @param name      label for the run
 * @param tagger    tagger to evaluate
 * @param sentences sentences to evaluate on
 * @param threshold minimum accuracy (percent)
 */
public static void assertModelPerformanceOnTestset(String name,
        Tagger tagger, List<Sequence> sentences, double threshold) {
    final double lemmaThreshold = 0.0;
    assertModelPerformanceOnTestset(name, tagger, sentences, threshold,
            lemmaThreshold);
}
/**
 * Evaluates {@code tagger} on {@code sentences} and fails if the accuracy or
 * the lemma accuracy falls below its threshold (with a 1e-5 tolerance).
 * Accuracies are percentages computed from the token, {@code morph_errors}
 * and {@code lemma_errors} counts accumulated in a {@link MorphResult}. The
 * result summary is printed to {@code System.err}.
 *
 * @param name            label for the run, included in failure messages
 * @param tagger          tagger to evaluate; its model must be a {@link MorphModel}
 * @param sentences       sentences to evaluate on; each must be a {@link Sentence}
 *                        made of {@link Word} tokens
 * @param threshold       minimum accuracy (percent)
 * @param lemma_threshold minimum lemma accuracy (percent)
 * @throws AssertionFailedError if either accuracy is below its threshold
 */
public static void assertModelPerformanceOnTestset(String name,
        Tagger tagger, List<Sequence> sentences, double threshold,
        double lemma_threshold) {
    MorphResult result = new MorphResult(tagger.getModel(),
            tagger.getNumLevels());
    MorphModel model = (MorphModel) tagger.getModel();
    for (Sequence sentence : sentences) {
        // Index every word against the model before evaluating the sentence.
        for (Token token : sentence) {
            Word word = (Word) token;
            model.addIndexes(word, false);
        }
        result.increment(MorphEvaluator.eval(tagger, (Sentence) sentence));
    }
    double accuracy = (result.num_tokens - result.morph_errors) * 100.
            / result.num_tokens;
    double lemma_accuracy = (result.num_tokens - result.lemma_errors)
            * 100. / result.num_tokens;
    System.err.println(result.toString());

    // Prefix failures with the run label so they can be attributed to a run.
    String label = (name == null) ? "" : name.trim() + ": ";
    if (accuracy - threshold < -1e-5) {
        throw new AssertionFailedError(label + "accuracy " + accuracy
                + " < " + threshold);
    }
    if (lemma_accuracy - lemma_threshold < -1e-5) {
        // Report the lemma figures here; the message used to repeat the
        // tag accuracy and threshold, hiding the real cause.
        throw new AssertionFailedError(label + "lemma accuracy "
                + lemma_accuracy + " < " + lemma_threshold);
    }
}
}