LemmatizerMETest.java example

Explorer
opennlp-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.lemmatizer;

import java.io.File;
import java.io.IOException;

import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import opennlp.tools.util.InsufficientTrainingDataException;
import opennlp.tools.util.MockInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * This is the test class for {@link LemmatizerME}.
 * <p>
 * A proper testing and evaluation of the name finder is only possible with a
 * large corpus which contains a huge amount of test sentences.
 * <p>
 * The scope of this test is to make sure that the name finder code can be
 * executed. This test can not detect mistakes which lead to incorrect feature
 * generation or other mistakes which decrease the tagging performance of the
 * name finder.
 * <p>
 * In this test the {@link LemmatizerME} is trained with a small amount of
 * training sentences and then the computed model is used to predict sentences
 * from the training sentences.
 */
public class LemmatizerMETest {

  private LemmatizerME lemmatizer;

  private static String[] tokens = { "Rockwell", "said", "the", "agreement", "calls", "for",
      "it", "to", "supply", "200", "additional", "so-called", "shipsets", "for",
      "the", "planes", "." };

  private static String[] postags = { "NNP", "VBD", "DT", "NN", "VBZ", "IN", "PRP", "TO", "VB",
      "CD", "JJ", "JJ", "NNS", "IN", "DT", "NNS", "." };

  private static String[] expect = { "rockwell", "say", "the", "agreement", "call", "for",
      "it", "to", "supply", "200", "additional", "so-called", "shipset", "for",
      "the", "plane", "." };

  @Before
  public void startup() throws IOException {
    // train the lemmatizer

    ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
          new File("opennlp/tools/lemmatizer/trial.old.tsv")), "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
    params.put(TrainingParameters.CUTOFF_PARAM, 5);

    LemmatizerModel lemmatizerModel = LemmatizerME.train("en", sampleStream,
        params, new LemmatizerFactory());

    this.lemmatizer = new LemmatizerME(lemmatizerModel);
  }

  @Test
  public void testLemmasAsArray() throws Exception {

    String[] lemmas = lemmatizer.lemmatize(tokens, postags);

    Assert.assertArrayEquals(expect, lemmas);
  }
  
  @Test(expected = InsufficientTrainingDataException.class)
  public void testInsufficientData() throws IOException {
 
    ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
            new File("opennlp/tools/lemmatizer/trial.old-insufficient.tsv")),
                "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
    params.put(TrainingParameters.CUTOFF_PARAM, 5);

    LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory());

  }

}