/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package opennlp.tools.lemmatizer;
import java.io.File;
import java.io.IOException;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import opennlp.tools.util.InsufficientTrainingDataException;
import opennlp.tools.util.MockInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
/**
 * This is the test class for {@link LemmatizerME}.
 * <p>
 * A proper testing and evaluation of the lemmatizer is only possible with a
 * large corpus which contains a huge amount of test sentences.
 * <p>
 * The scope of this test is to make sure that the lemmatizer code can be
 * executed. This test cannot detect mistakes which lead to incorrect feature
 * generation or other mistakes which decrease the lemmatization performance
 * of the lemmatizer.
 * <p>
 * In this test the {@link LemmatizerME} is trained with a small amount of
 * training sentences and then the computed model is used to predict the
 * lemmas of a sentence taken from the training sentences.
 */
public class LemmatizerMETest {

  /** Path of the corpus used to train the lemmatizer exercised by {@link #testLemmasAsArray()}. */
  private static final String TRAINING_CORPUS = "opennlp/tools/lemmatizer/trial.old.tsv";

  /** Path of the corpus for which training is expected to be rejected. */
  private static final String INSUFFICIENT_CORPUS =
      "opennlp/tools/lemmatizer/trial.old-insufficient.tsv";

  /** Tokens of the sentence that is lemmatized by the test. */
  private static final String[] TOKENS = { "Rockwell", "said", "the", "agreement", "calls", "for",
      "it", "to", "supply", "200", "additional", "so-called", "shipsets", "for",
      "the", "planes", "." };

  /** Part-of-speech tags, one per entry of {@link #TOKENS}. */
  private static final String[] POSTAGS = { "NNP", "VBD", "DT", "NN", "VBZ", "IN", "PRP", "TO",
      "VB", "CD", "JJ", "JJ", "NNS", "IN", "DT", "NNS", "." };

  /** Lemmas the trained model is expected to produce, one per entry of {@link #TOKENS}. */
  private static final String[] EXPECTED_LEMMAS = { "rockwell", "say", "the", "agreement", "call",
      "for", "it", "to", "supply", "200", "additional", "so-called", "shipset", "for",
      "the", "plane", "." };

  /** Lemmatizer under test; rebuilt before every test method by {@link #startup()}. */
  private LemmatizerME lemmatizer;

  /**
   * Trains a lemmatizer model from the given corpus using the settings shared by all
   * tests in this class (language "en", 100 iterations, cutoff 5, default factory).
   * <p>
   * The sample stream is opened in a try-with-resources statement so that it is closed
   * whether training succeeds or throws.
   *
   * @param corpusPath path of the training file handed to {@link MockInputStreamFactory}
   * @return the model returned by {@link LemmatizerME#train}
   * @throws IOException if reading the corpus or training the model fails
   */
  private static LemmatizerModel trainModel(String corpusPath) throws IOException {
    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
    params.put(TrainingParameters.CUTOFF_PARAM, 5);

    try (ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
        new PlainTextByLineStream(
            new MockInputStreamFactory(new File(corpusPath)), "UTF-8"))) {
      return LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory());
    }
  }

  /**
   * Trains a fresh model from {@link #TRAINING_CORPUS} and wraps it in the
   * {@link LemmatizerME} used by the tests.
   *
   * @throws IOException if the corpus cannot be read or training fails
   */
  @Before
  public void startup() throws IOException {
    this.lemmatizer = new LemmatizerME(trainModel(TRAINING_CORPUS));
  }

  /**
   * Lemmatizes {@link #TOKENS} with {@link #POSTAGS} and checks that the result equals
   * {@link #EXPECTED_LEMMAS} element by element.
   */
  @Test
  public void testLemmasAsArray() throws Exception {
    String[] lemmas = lemmatizer.lemmatize(TOKENS, POSTAGS);
    Assert.assertArrayEquals(EXPECTED_LEMMAS, lemmas);
  }

  /**
   * Training on {@link #INSUFFICIENT_CORPUS} must fail with an
   * {@link InsufficientTrainingDataException}.
   */
  @Test(expected = InsufficientTrainingDataException.class)
  public void testInsufficientData() throws IOException {
    trainModel(INSUFFICIENT_CORPUS);
  }
}