// Copyright 2015 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.
package lemming.test.lemma.ranker;
import java.util.List;
import lemming.lemma.LemmaInstance;
import lemming.lemma.LemmaOptions;
import lemming.lemma.LemmaResult;
import lemming.lemma.Lemmatizer;
import lemming.lemma.ranker.RankerTrainer;
import lemming.lemma.ranker.RankerTrainer.RankerTrainerOptions;
import lemming.test.lemma.toutanova.SimpleTrainerTest;
import marmot.test.morph.PipelineTest;
import org.junit.Test;
/**
 * JUnit tests for {@link RankerTrainer}: a serialization check and several
 * small training/evaluation runs with different option settings.
 *
 * <p>The helpers {@code testIfLemmatizerIsSerializable} and
 * {@code runModerateTest} are not defined in this class; presumably they are
 * inherited from {@link SimpleTrainerTest} (verify there for their exact
 * contract).
 */
public class RankerTrainerTest extends SimpleTrainerTest {

    /**
     * Passes a freshly constructed {@link RankerTrainer} to the
     * serialization helper.
     */
    @Test
    public void isSerializable() {
        testIfLemmatizerIsSerializable(new RankerTrainer());
    }

    /**
     * Configures a {@link RankerTrainer} (no perceptron, quadratic penalty
     * 1.0, shape lexicon on, zero edit-tree steps, no morph features, hashed
     * feature table) and hands it to {@code runModerateTest}.
     *
     * <p>The comment blocks below are accuracy logs recorded from earlier
     * runs with various configurations; they are kept for reference only and
     * are not checked by this test.
     */
    @Test
    public void smallTest() {
        RankerTrainer trainer = new RankerTrainer();
        trainer.getOptions().setOption(RankerTrainerOptions.USE_PERCEPTRON, false).setOption(RankerTrainerOptions.QUADRATIC_PENALTY, 1.0);

        // Disabled configurations that depend on external unigram/aspell files.
        //trainer.getOptions().setOption(RerankerTrainerOptions.UNIGRAM_FILE, "min-count=5,/mounts/data/proj/marmot/lemmatizer/data/de/unigrams.txt");
        //trainer.getOptions().setOption(RerankerTrainerOptions.UNIGRAM_FILE, Arrays.asList("min-count=1,/mounts/data/proj/marmot/lemmatizer/data/de/aspell.txt", "min-count=5,/mounts/data/proj/marmot/lemmatizer/data/de/unigrams.txt"));
        //trainer.getOptions().setOption(RerankerTrainerOptions.UNIGRAM_FILE, Arrays.asList("min-count=1,/mounts/data/proj/marmot/lemmatizer/data/de/aspell.txt"));
        trainer.getOptions().setOption(RankerTrainerOptions.USE_SHAPE_LEXICON, true);
        trainer.getOptions().setOption(RankerTrainerOptions.NUM_EDIT_TREE_STEPS, 0);
        // trainer.getOptions().setOption(RerankerTrainerOptions.UNIGRAM_FILE, Arrays.asList("min-count=5,/mounts/data/proj/marmot/lemmatizer/data/de/unigrams.txt"));
        // trainer.getOptions().setOption(RerankerTrainerOptions.ASPELL_PATH, Aspell.ASPELL_PATH);
        // trainer.getOptions().setOption(RerankerTrainerOptions.ASPELL_LANG, "de");

        // INFORMATION: 9501 / 9504 = 99,9684 (OOV: 0 / 0 = NaN)
        // Apr 22, 2015 3:39:51 PM marmot.lemma.Result logAccuracy
        // INFORMATION: 74304 / 76704 = 96,8711 (OOV: 24266 / 26505 = 91,5525)
        // Apr 22, 2015 3:40:02 PM marmot.lemma.Result logAccuracy
        // INFORMATION: 73640 / 76704 = 96,0054 (OOV: 23714 / 26505 = 89,4699)
        trainer.getOptions().setOption(LemmaOptions.USE_MORPH, false);
        trainer.getOptions().setOption(RankerTrainerOptions.USE_HASH_FEATURE_TABLE, true);

        // INFORMATION: 9501 / 9504 = 99,9684 (Type: 3809 / 3812 = 99,9213)
        // Apr 22, 2015 10:25:52 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 74272 / 76704 = 96,8294 (Type: 18713 / 20389 = 91,7799)
        // Apr 22, 2015 10:25:59 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 73591 / 76704 = 95,9415 (Type: 14530 / 16640 = 87,3197)

        // INFORMATION: 9501 / 9504 = 99,9684 (Type: 3809 / 3812 = 99,9213)
        // Apr 22, 2015 9:46:37 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 74105 / 76704 = 96,6116 (Type: 18580 / 20389 = 91,1276)
        // Apr 22, 2015 9:46:42 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 73375 / 76704 = 95,6599 (Type: 14385 / 16640 = 86,4483)

        // ASPELL (explicit), Wiki
        // Apr 22, 2015 9:44:05 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 9501 / 9504 = 99,9684 (Type: 3809 / 3812 = 99,9213)
        // Apr 22, 2015 9:44:13 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 74325 / 76704 = 96,8985 (Type: 18737 / 20389 = 91,8976)
        // Apr 22, 2015 9:44:19 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 73640 / 76704 = 96,0054 (Type: 14551 / 16640 = 87,4459)

        // ASPELL
        // Apr 22, 2015 9:35:46 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 9501 / 9504 = 99,9684 (Type: 3809 / 3812 = 99,9213)
        // Apr 22, 2015 9:36:09 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 74053 / 76704 = 96,5439 (Type: 18548 / 20389 = 90,9706)
        // Apr 22, 2015 9:36:15 AM marmot.lemma.Result logAccuracy
        // INFORMATION: 73336 / 76704 = 95,6091 (Type: 14367 / 16640 = 86,3401)
        runModerateTest(trainer, 1., 1., true);

        // MORPH 88.09
        // POS 87.40
        // MORPH 88.06 78.32
        // POS 87.39 86.91
        // 97.11
    }

    /**
     * Trains on the bundled MarMoT training resource using only the form and
     * lemma columns, with both the morph and POS options switched off, then
     * evaluates on the bundled test resource and logs the accuracy.
     *
     * <p>No accuracy threshold is checked here; the result is only logged.
     */
    @Test
    public void smallNoMorphMarmotTest() {
        String trainfile = "form-index=1,lemma-index=2,res:///marmot/test/morph/trn.txt";
        String devfile = "form-index=1,lemma-index=2,res:///marmot/test/morph/tst.txt";

        List<LemmaInstance> instances = LemmaInstance.getInstances(PipelineTest.getSentences(trainfile, 100));
        // Debug output: number of training instances that were read.
        System.err.println(instances.size());

        RankerTrainer trainer = new RankerTrainer();
        trainer.getOptions().setOption(RankerTrainerOptions.USE_PERCEPTRON, false).setOption(RankerTrainerOptions.QUADRATIC_PENALTY, 0.0);
        trainer.getOptions().setOption(LemmaOptions.USE_MORPH, false);
        trainer.getOptions().setOption(LemmaOptions.USE_POS, false);

        Lemmatizer lemmatizer = trainer.train(instances, null);

        instances = LemmaInstance.getInstances(PipelineTest.getSentences(devfile, 100));
        LemmaResult result = LemmaResult.test(lemmatizer, instances);
        result.logAccuracy();
    }

    /**
     * Trains on the bundled MarMoT training resource including the tag and
     * morph columns (morph option enabled), evaluates on the bundled test
     * resource, logs the accuracy and requires a token accuracy strictly
     * above 97.94.
     *
     * @throws AssertionError if the token accuracy is not greater than 97.94
     */
    @Test
    public void smallMarmotTest() {
        RankerTrainer trainer = new RankerTrainer();
        String trainfile = "form-index=1,lemma-index=2,tag-index=4,morph-index=6,res:///marmot/test/morph/trn.txt";
        String devfile = "form-index=1,lemma-index=2,tag-index=4,morph-index=6,res:///marmot/test/morph/tst.txt";

        trainer.getOptions().setOption(RankerTrainerOptions.USE_PERCEPTRON, false).setOption(RankerTrainerOptions.QUADRATIC_PENALTY, 0.0);
        trainer.getOptions().setOption(LemmaOptions.USE_MORPH, true);

        List<LemmaInstance> instances = LemmaInstance.getInstances(PipelineTest.getSentences(trainfile, 100));
        Lemmatizer lemmatizer = trainer.train(instances, null);

        instances = LemmaInstance.getInstances(PipelineTest.getSentences(devfile, 100));
        LemmaResult result = LemmaResult.test(lemmatizer, instances);
        result.logAccuracy();

        // A plain `assert` is skipped when the JVM runs without -ea, which
        // would let this test pass without checking anything. Throw
        // explicitly so the threshold is always enforced. The negated form
        // also fails on NaN, exactly like the original condition.
        double tokenAccuracy = result.getTokenAccuracy();
        if (!(tokenAccuracy > 97.94)) {
            throw new AssertionError("Token accuracy " + tokenAccuracy + " is not above the expected 97.94");
        }
    }
}