// Copyright 2015 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package lemming.test.lemma.toutanova;

import java.util.List;

import lemming.lemma.BackupLemmatizerTrainer;
import lemming.lemma.LemmaInstance;
import lemming.lemma.LemmaOptions;
import lemming.lemma.Lemmatizer;
import lemming.lemma.LemmatizerGeneratorTrainer;
import lemming.lemma.LemmatizerTrainer;
import lemming.lemma.SimpleLemmatizerTrainer;
import lemming.lemma.BackupLemmatizerTrainer.BackupLemmatizerTrainerOptions;
import lemming.lemma.SimpleLemmatizerTrainer.SimpleLemmatizerTrainerOptions;
import lemming.lemma.toutanova.EditTreeAlignerTrainer;
import lemming.lemma.toutanova.FirstOrderDecoder;
import lemming.lemma.toutanova.ToutanovaTrainer;
import lemming.lemma.toutanova.ToutanovaTrainer.ToutanovaOptions;
import marmot.morph.io.SentenceReader;

import org.junit.Test;

/**
 * JUnit tests for {@link ToutanovaTrainer}, run both on its own and as the
 * second trainer of a {@link BackupLemmatizerTrainer}.
 *
 * <p>The checking helpers used below ({@code testIfLemmatizerIsSerializable},
 * {@code getResourceFile}, {@code getCopyInstances}, {@code assertAccuracy},
 * {@code runSmallTest}, {@code runModerateTest}) are inherited from the
 * superclass hierarchy and are not defined in this file.
 */
public class ToutanovaTrainerTest extends SimpleTrainerTest {

	/**
	 * Runs the inherited serialization check against a default
	 * {@link SimpleLemmatizerTrainer} and a default {@link ToutanovaTrainer}.
	 */
	@Test
	public void isSerializable() {
		SimpleLemmatizerTrainer simple = new SimpleLemmatizerTrainer();
		testIfLemmatizerIsSerializable(simple);

		ToutanovaTrainer toutanova = new ToutanovaTrainer();
		testIfLemmatizerIsSerializable(toutanova);
	}

	/**
	 * Trains a default {@link ToutanovaTrainer} on the "copy" view of
	 * {@code trn_sml.tsv} and checks the accuracy on the "copy" view of
	 * {@code dev_sml.tsv} against the expected value 99.1935.
	 */
	@Test
	public void copyTest() {
		LemmatizerTrainer toutanova = new ToutanovaTrainer();

		// Reader option prefix selecting the form and lemma columns.
		String columnSpec = "form-index=4,lemma-index=5,";
		String trainSpec = columnSpec + getResourceFile("trn_sml.tsv");
		String devSpec = columnSpec + getResourceFile("dev_sml.tsv");

		List<LemmaInstance> rawTrain = LemmaInstance.getInstances(new SentenceReader(trainSpec));
		List<LemmaInstance> copyTrain = getCopyInstances(rawTrain);
		Lemmatizer model = toutanova.train(copyTrain, null);

		List<LemmaInstance> rawDev = LemmaInstance.getInstances(new SentenceReader(devSpec));
		assertAccuracy(model, getCopyInstances(rawDev), 99.1935);
	}

	/**
	 * Runs the inherited moderate-size test on a default
	 * {@link BackupLemmatizerTrainer} with the first trainer's backup option
	 * disabled and the backup trainer's seed fixed to 10.
	 */
	@Test
	public void moderateZeroOrderAlignerPosTest() {
		BackupLemmatizerTrainer trainer = new BackupLemmatizerTrainer();

		LemmaOptions backupOptions = trainer.getOptions();
		backupOptions.setOption(
				BackupLemmatizerTrainerOptions.TRAINER_PREF + SimpleLemmatizerTrainerOptions.USE_BACKUP,
				false);
		backupOptions.setOption(
				BackupLemmatizerTrainerOptions.BACKUP_PREF + ToutanovaOptions.SEED,
				10L);

		// Result log kept from the original author (columns are not labelled
		// in this file; presumably accuracies and run times - confirm).
		//
		// 90.75 88.46 HA
		// 93.90 90.72 SA
		// 83.57 81.03 ETA
		//
		// 99.84 92.85 SA
		// 99.84 91.14 ETA seed=5, shuffle in builder
		// 99.89 90.23 ETA seed=4, shuffle in builder
		// 99.84 90.57 ETA seed=3, shuffle in builder
		// 99.84 90.43 25s
		// 99.84 90.41 21s
		//       92.73 101s
		//       94.54 313s
		//       94.93 814s

		runModerateTest(trainer, 1., 92.72);
	}

	/**
	 * Runs the inherited small test on a backup trainer whose second stage is
	 * a {@link ToutanovaTrainer} without POS features and without averaging.
	 */
	@Test
	public void smallTest() {
		LemmatizerGeneratorTrainer first = newAbstainingSimpleTrainer();
		ToutanovaTrainer second = newFirstOrderToutanovaTrainer(false, false);

		LemmatizerTrainer combined = new BackupLemmatizerTrainer(first, second);
		runSmallTest(combined, 1., 79.56);
	}

	/**
	 * Runs the inherited small test on a backup trainer whose second stage is
	 * a {@link ToutanovaTrainer} with POS features and averaging enabled.
	 */
	@Test
	public void smallPosTest() {
		LemmatizerGeneratorTrainer first = newAbstainingSimpleTrainer();
		ToutanovaTrainer second = newFirstOrderToutanovaTrainer(true, true);

		LemmatizerTrainer combined = new BackupLemmatizerTrainer(first, second);
		runSmallTest(combined, 1., 84.88);
	}

	/**
	 * Builds the {@link SimpleLemmatizerTrainer} shared by the small tests:
	 * unseen handling off, backup off, POS on, abstain-if-ambiguous on.
	 */
	private static LemmatizerGeneratorTrainer newAbstainingSimpleTrainer() {
		LemmatizerGeneratorTrainer simple = new SimpleLemmatizerTrainer();
		simple.getOptions().setOption(SimpleLemmatizerTrainerOptions.HANDLE_UNSEEN, false);
		simple.getOptions().setOption(SimpleLemmatizerTrainerOptions.USE_BACKUP, false);
		simple.getOptions().setOption(LemmaOptions.USE_POS, true);
		simple.getOptions().setOption(SimpleLemmatizerTrainerOptions.ABSTAIN_IF_AMBIGIOUS, true);
		return simple;
	}

	/**
	 * Builds the {@link ToutanovaTrainer} shared by the small tests: 10
	 * iterations, alphabet filter 1 and {@link FirstOrderDecoder} as decoder.
	 *
	 * @param usePos    value stored under {@code ToutanovaOptions.USE_POS}
	 * @param averaging value stored under {@code ToutanovaOptions.AVERAGING}
	 */
	private static ToutanovaTrainer newFirstOrderToutanovaTrainer(boolean usePos, boolean averaging) {
		ToutanovaTrainer toutanova = new ToutanovaTrainer();
		toutanova.getOptions().setOption(ToutanovaOptions.NUM_ITERATIONS, 10);
		toutanova.getOptions().setOption(ToutanovaOptions.USE_POS, usePos);
		toutanova.getOptions().setOption(ToutanovaOptions.FILTER_ALPHABET, 1);
		toutanova.getOptions().setOption(ToutanovaOptions.AVERAGING, averaging);
		toutanova.getOptions().setOption(ToutanovaOptions.DECODER, FirstOrderDecoder.class);
		return toutanova;
	}

}