// NameFinderMETest.java example
//
// Explorer
// opennlp-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.namefind;

import java.io.File;
import java.util.Collections;

import org.junit.Assert;
import org.junit.Test;

import opennlp.tools.ml.model.SequenceClassificationModel;
import opennlp.tools.util.MockInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;

/**
 * This is the test class for {@link NameFinderME}.
 * <p>
 * A proper testing and evaluation of the name finder
 * is only possible  with a large corpus which contains
 * a huge amount of test sentences.
 * <p>
 * The scope of this test is to make sure that the name finder
 * code can be executed. This test can not detect
 * mistakes which lead to incorrect feature generation
 * or other mistakes which decrease the tagging
 * performance of the name finder.
 * <p>
 * In this test the {@link NameFinderME} is trained with
 * a small amount of training sentences and then the
 * computed model is used to predict sentences from the
 * training sentences.
 */
public class NameFinderMETest {

  private static final String TYPE_OVERRIDE = "aType";
  private static final String DEFAULT = "default";

  @Test
  public void testNameFinder() throws Exception {

    // train the name finder
    String encoding = "ISO-8859-1";

    ObjectStream<NameSample> sampleStream =
        new NameSampleDataStream(
            new PlainTextByLineStream(new MockInputStreamFactory(
              new File("opennlp/tools/namefind/AnnotatedSentences.txt")), encoding));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
        params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

    TokenNameFinder nameFinder = new NameFinderME(nameFinderModel);

    // now test if it can detect the sample sentences

    String[] sentence = {"Alisa",
        "appreciated",
        "the",
        "hint",
        "and",
        "enjoyed",
        "a",
        "delicious",
        "traditional",
        "meal."};

    Span[] names = nameFinder.find(sentence);

    Assert.assertEquals(1, names.length);
    Assert.assertEquals(new Span(0, 1, DEFAULT), names[0]);

    sentence = new String[] {
        "Hi",
        "Mike",
        ",",
        "it's",
        "Stefanie",
        "Schmidt",
        "."
    };

    names = nameFinder.find(sentence);

    Assert.assertEquals(2, names.length);
    Assert.assertEquals(new Span(1, 2, DEFAULT), names[0]);
    Assert.assertEquals(new Span(4, 6, DEFAULT), names[1]);
  }

  /**
   * Train NamefinderME using AnnotatedSentencesWithTypes.txt with "person"
   * nameType and try the model in a sample text.
   */
  @Test
  public void testNameFinderWithTypes() throws Exception {

    // train the name finder
    String encoding = "ISO-8859-1";

    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
          new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), encoding));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
        params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

    NameFinderME nameFinder = new NameFinderME(nameFinderModel);

    // now test if it can detect the sample sentences

    String[] sentence2 = new String[] { "Hi", "Mike", ",", "it's", "Stefanie",
        "Schmidt", "." };

    Span[] names2 = nameFinder.find(sentence2);

    Assert.assertEquals(2, names2.length);
    Assert.assertEquals(new Span(1, 2, "person"), names2[0]);
    Assert.assertEquals(new Span(4, 6, "person"), names2[1]);
    Assert.assertEquals("person", names2[0].getType());
    Assert.assertEquals("person", names2[1].getType());

    String[] sentence = { "Alisa", "appreciated", "the", "hint", "and",
        "enjoyed", "a", "delicious", "traditional", "meal." };

    Span[] names = nameFinder.find(sentence);

    Assert.assertEquals(1, names.length);
    Assert.assertEquals(new Span(0, 1, "person"), names[0]);
    Assert.assertTrue(hasOtherAsOutcome(nameFinderModel));
  }

  /**
   * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it.
   * This is related to the issue OPENNLP-9
   */
  @Test
  public void testOnlyWithNames() throws Exception {

    // train the name finder
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
            new PlainTextByLineStream(new MockInputStreamFactory(
              new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
            params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

    NameFinderME nameFinder = new NameFinderME(nameFinderModel);

    // now test if it can detect the sample sentences

    String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " +
            "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+");

    Span[] names1 = nameFinder.find(sentence);

    Assert.assertEquals(new Span(0, 2, DEFAULT), names1[0]);
    Assert.assertEquals(new Span(2, 4, DEFAULT), names1[1]);
    Assert.assertEquals(new Span(4, 6, DEFAULT), names1[2]);
    Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel));
  }

  @Test
  public void testOnlyWithNamesTypeOverride() throws Exception {

    // train the name finder
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
          new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", TYPE_OVERRIDE, sampleStream,
        params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

    NameFinderME nameFinder = new NameFinderME(nameFinderModel);

    // now test if it can detect the sample sentences

    String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " +
        "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+");

    Span[] names1 = nameFinder.find(sentence);

    Assert.assertEquals(new Span(0, 2, TYPE_OVERRIDE), names1[0]);
    Assert.assertEquals(new Span(2, 4, TYPE_OVERRIDE), names1[1]);
    Assert.assertEquals(new Span(4, 6, TYPE_OVERRIDE), names1[2]);
    Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel));
  }

  /**
   * Train NamefinderME using OnlyWithNamesWithTypes.train.
   * The goal is to check if the model validator accepts it.
   * This is related to the issue OPENNLP-9
   */
  @Test
  public void testOnlyWithNamesWithTypes() throws Exception {

    // train the name finder
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
          new File("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")), "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
        params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

    NameFinderME nameFinder = new NameFinderME(nameFinderModel);

    // now test if it can detect the sample sentences

    String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " +
        "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+");

    Span[] names1 = nameFinder.find(sentence);

    Assert.assertEquals(new Span(0, 2, "person"), names1[0]);
    Assert.assertEquals(new Span(2, 4, "person"), names1[1]);
    Assert.assertEquals(new Span(4, 6, "person"), names1[2]);
    Assert.assertEquals("person", names1[2].getType());
    Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel));
  }

  /**
   * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it.
   * This is related to the issue OPENNLP-9
   */
  @Test
  public void testOnlyWithEntitiesWithTypes() throws Exception {

    // train the name finder
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
          new File("opennlp/tools/namefind/OnlyWithEntitiesWithTypes.train")), "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
        params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

    NameFinderME nameFinder = new NameFinderME(nameFinderModel);

    // now test if it can detect the sample sentences

    String[] sentence = "NATO United States Barack Obama".split("\\s+");

    Span[] names1 = nameFinder.find(sentence);

    Assert.assertEquals(new Span(0, 1, "organization"), names1[0]); // NATO
    Assert.assertEquals(new Span(1, 3, "location"), names1[1]); // United States
    Assert.assertEquals("person", names1[2].getType());
    Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel));
  }

  private boolean hasOtherAsOutcome(TokenNameFinderModel nameFinderModel) {
    SequenceClassificationModel<String> model = nameFinderModel.getNameFinderSequenceModel();
    String[] outcomes = model.getOutcomes();
    for (String outcome : outcomes) {
      if (outcome.equals(NameFinderME.OTHER)) {
        return true;
      }
    }
    return false;
  }

  @Test
  public void testDropOverlappingSpans() {
    Span[] spans = new Span[] {new Span(1, 10), new Span(1,11), new Span(1,11), new Span(5, 15)};
    Span[] remainingSpan = NameFinderME.dropOverlappingSpans(spans);
    Assert.assertEquals(new Span(1, 11), remainingSpan[0]);
  }

  /**
   * Train NamefinderME using voa1.train with several
   * nameTypes and try the model in a sample text.
   */
  @Test
  public void testNameFinderWithMultipleTypes() throws Exception {

    // train the name finder
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
        new PlainTextByLineStream(new MockInputStreamFactory(
          new File("opennlp/tools/namefind/voa1.train")), "UTF-8"));

    TrainingParameters params = new TrainingParameters();
    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
    params.put(TrainingParameters.CUTOFF_PARAM, 1);

    TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
        params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

    NameFinderME nameFinder = new NameFinderME(nameFinderModel);

    // now test if it can detect the sample sentences

    String[] sentence = new String[] { "U", ".", "S", ".", "President",
        "Barack", "Obama", "has", "arrived", "in", "South", "Korea", ",",
        "where", "he", "is", "expected", "to", "show", "solidarity", "with",
        "the", "country", "'", "s", "president", "in", "demanding", "North",
        "Korea", "move", "toward", "ending", "its", "nuclear", "weapons",
        "programs", "." };

    Span[] names1 = nameFinder.find(sentence);

    Assert.assertEquals(new Span(0, 4, "location"), names1[0]);
    Assert.assertEquals(new Span(5, 7, "person"), names1[1]);
    Assert.assertEquals(new Span(10, 12, "location"), names1[2]);
    Assert.assertEquals(new Span(28, 30, "location"), names1[3]);
    Assert.assertEquals("location", names1[0].getType());
    Assert.assertEquals("person", names1[1].getType());
    Assert.assertEquals("location", names1[2].getType());
    Assert.assertEquals("location", names1[3].getType());

    sentence = new String[] { "Scott", "Snyder", "is", "the", "director", "of",
        "the", "Center", "for", "U", ".", "S", ".", "Korea", "Policy", "." };

    Span[] names2 = nameFinder.find(sentence);

    Assert.assertEquals(2, names2.length);
    Assert.assertEquals(new Span(0, 2, "person"), names2[0]);
    Assert.assertEquals(new Span(7, 15, "organization"), names2[1]);
    Assert.assertEquals("person", names2[0].getType());
    Assert.assertEquals("organization", names2[1].getType());
  }

}