// Copyright 2015 Thomas Müller
// This file is part of MarMoT, which is licensed under GPLv3.

package marmot.test.morph;

import java.util.LinkedList;
import java.util.List;

import marmot.core.Options;
import marmot.core.Sequence;
import marmot.morph.MorphOptions;
import marmot.morph.io.SentenceReader;

import org.junit.Test;

/**
 * Tests that train and evaluate a model on the bundled {@code trn.txt} /
 * {@code tst.txt} resources via {@link PipelineTest#testWithOptions}.
 *
 * <p>{@link #smallTest()} enables morphological tagging and lemmatization
 * together, while {@link #test()} disables both.
 */
public class JointTaggerLemmatizer {

	/** Column layout and resource location of the training corpus. */
	private static final String TRAIN_FILE_SPEC =
			"form-index=1,lemma-index=2,tag-index=4,morph-index=6,res:///marmot/test/morph/trn.txt";

	/** Column layout and resource location of the evaluation corpus. */
	private static final String TEST_FILE_SPEC =
			"form-index=1,lemma-index=2,tag-index=4,morph-index=6,res:///marmot/test/morph/tst.txt";

	/** Maximum number of training sentences handed to the trainer by both tests. */
	private static final int TRAIN_SENTENCE_LIMIT = 1000;

	/**
	 * Runs the pipeline with tagging, morphological tagging and lemmatization
	 * all switched on and a penalty of {@code 1.0}.
	 *
	 * <p>The four trailing arguments passed to {@code testWithOptions} are all
	 * {@code 1.}; NOTE(review): these look like minimum accuracy thresholds
	 * (the commented-out call carries tighter values) -- confirm against
	 * {@code PipelineTest}.
	 */
	@Test
	public void smallTest() {
		MorphOptions options = newSearchOptions("1.0");

		options.setProperty(MorphOptions.TAG_MORPH, "true");
		options.setProperty(MorphOptions.LEMMATIZE, "true");
		options.setProperty(MorphOptions.GOLD_LEMMA, "false");
		options.setProperty(MorphOptions.LEMMA_PRETRAINING, "true");
		options.setProperty(MorphOptions.MARGINALIZE_LEMMAS, "false");
		options.setProperty(MorphOptions.LEMMA_TAG_DEPENDENT, "true");
		options.setProperty(MorphOptions.USE_HASH_FEATURE_TABLE, "true");
		options.setProperty(MorphOptions.LEMMA_LEMMING_GENERATOR, "2");
		options.setProperty(MorphOptions.RESTRICT_POS_TAGS_TO_SEEN_COMBINATIONS, "true");
		//options.setProperty(MorphOptions.LEMMAS_IGNORE_FEATURES, "case=*|case=nom|case=acc|case=dat|case=gen");
		//options.setProperty(MorphOptions.LEMMA_USE_MORPH, "false");

		applyRunOptions(options);

		List<Sequence> trainSentences = getSentences(options.getTrainFile(), TRAIN_SENTENCE_LIMIT);
		List<Sequence> testSentences = getSentences(options.getTestFile(), -1);

		// Results recorded by the original author. NOTE(review): the trailing
		// numbers on the "all" lines are presumably the penalty value and,
		// where present, the training sentence limit -- confirm before
		// relying on them.
		//
		// Joint
		// all   : 10304 / 18939 = 54,4063% 0.0
		// lemma : 16691 / 18939 = 88,1303%
		// all   : 9160 / 18939 = 48,3658% 1.0
		// lemma : 16536 / 18939 = 87,3119%
		// all   : 10301 / 18939 = 54,3904% 0.01
		// lemma : 16637 / 18939 = 87,8452%
		// all   : 14281 / 18939 = 75,4052% 0.0 1000
		// lemma : 18237 / 18939 = 96,2934%
		// all   : 14300 / 18939 = 75,5056% 0.01 1000
		// lemma : 18234 / 18939 = 96,2775%
		// all   : 14397 / 18939 = 76,0177% 0.1
		// lemma : 18217 / 18939 = 96,1878%
		//
		// Pipeline
		// all   : 9988 / 18939 = 52,7377% 0.0
		// all   : 7946 / 18939 = 41,9558% 1.0
		// all   : 9949 / 18939 = 52,5318% 0.01
		// all   : 14613 / 18939 = 77,1582% 0.0 1000
		// all   : 14628 / 18939 = 77,2374% 0.01 1000

		PipelineTest.testWithOptions(options, trainSentences, testSentences, 1., 1., 1., 1.);
		//PipelineTest.testWithOptions(options, train_sentences, test_sentences, 99.56, 53.13, 100., 87.66);
	}

	/**
	 * Runs the pipeline with morphological tagging and lemmatization switched
	 * off and a penalty of {@code 0.0}, passing {@code 98.53, 75.22, 99.88,
	 * 96.23} as the four trailing arguments of {@code testWithOptions}.
	 */
	@Test
	public void test() {
		MorphOptions options = newSearchOptions("0.0");

		options.setProperty(MorphOptions.TAG_MORPH, "false");
		options.setProperty(MorphOptions.LEMMATIZE, "false");

		applyRunOptions(options);

		List<Sequence> trainSentences = getSentences(options.getTrainFile(), TRAIN_SENTENCE_LIMIT);
		List<Sequence> testSentences = getSentences(options.getTestFile(), -1);

		PipelineTest.testWithOptions(options, trainSentences, testSentences, 98.53, 75.22, 99.88, 96.23);
	}

	/**
	 * Creates a fresh {@link MorphOptions} carrying the settings both tests
	 * share, followed by the given penalty. Properties are set in the same
	 * order the tests originally set them inline.
	 *
	 * @param penalty value stored under {@link Options#PENALTY}
	 * @return the newly created, partially configured options
	 */
	private MorphOptions newSearchOptions(String penalty) {
		MorphOptions options = new MorphOptions();
		options.setProperty(Options.VERBOSE, "true");
		options.setProperty(Options.SEED, "42");
		options.setProperty(Options.VECTOR_SIZE, "10000000");
		options.setProperty(Options.CANDIDATES_PER_STATE, "[4, 2, 1.5, 1.25]");
		options.setProperty(Options.PRUNE, "true");
		options.setProperty(Options.ORDER, "1");
		options.setProperty(Options.PENALTY, penalty);
		return options;
	}

	/**
	 * Sets the iteration count and the train/test corpus specifications that
	 * both tests apply after their test-specific properties.
	 *
	 * @param options options object to update in place
	 */
	private void applyRunOptions(MorphOptions options) {
		options.setProperty(Options.NUM_ITERATIONS, "10");
		options.setProperty(MorphOptions.TRAIN_FILE, TRAIN_FILE_SPEC);
		options.setProperty(MorphOptions.TEST_FILE, TEST_FILE_SPEC);
	}

	/**
	 * Reads every sequence produced by a {@link SentenceReader} over the given
	 * file specification and optionally truncates the result.
	 *
	 * @param trainFile file specification handed to {@link SentenceReader};
	 *                  despite the name it is also used for the test corpus
	 * @param limit     maximum number of sequences to return; a negative
	 *                  value means no limit
	 * @return all sequences when {@code limit} is negative or not exceeded,
	 *         otherwise a {@link List#subList(int, int) sub-list view} of the
	 *         first {@code limit} sequences
	 */
	private List<Sequence> getSentences(String trainFile, int limit) {
		List<Sequence> sentences = new LinkedList<>();
		for (Sequence sentence : new SentenceReader(trainFile)) {
			sentences.add(sentence);
		}

		boolean unlimited = limit < 0;
		if (unlimited || sentences.size() <= limit) {
			return sentences;
		}
		return sentences.subList(0, limit);
	}

}