/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.namefind; import java.io.File; import java.util.Collections; import org.junit.Assert; import org.junit.Test; import opennlp.tools.ml.model.SequenceClassificationModel; import opennlp.tools.util.MockInputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.PlainTextByLineStream; import opennlp.tools.util.Span; import opennlp.tools.util.TrainingParameters; /** * This is the test class for {@link NameFinderME}. * <p> * A proper testing and evaluation of the name finder * is only possible with a large corpus which contains * a huge amount of test sentences. * <p> * The scope of this test is to make sure that the name finder * code can be executed. This test can not detect * mistakes which lead to incorrect feature generation * or other mistakes which decrease the tagging * performance of the name finder. * <p> * In this test the {@link NameFinderME} is trained with * a small amount of training sentences and then the * computed model is used to predict sentences from the * training sentences. */ public class NameFinderMETest { private static final String TYPE_OVERRIDE = "aType"; private static final String DEFAULT = "default"; @Test public void testNameFinder() throws Exception { // train the name finder String encoding = "ISO-8859-1"; ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/AnnotatedSentences.txt")), encoding)); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); TokenNameFinder nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = {"Alisa", "appreciated", "the", "hint", "and", "enjoyed", "a", "delicious", "traditional", "meal."}; Span[] names = nameFinder.find(sentence); Assert.assertEquals(1, names.length); Assert.assertEquals(new Span(0, 1, DEFAULT), names[0]); sentence = new String[] { "Hi", "Mike", ",", "it's", "Stefanie", "Schmidt", "." }; names = nameFinder.find(sentence); Assert.assertEquals(2, names.length); Assert.assertEquals(new Span(1, 2, DEFAULT), names[0]); Assert.assertEquals(new Span(4, 6, DEFAULT), names[1]); } /** * Train NamefinderME using AnnotatedSentencesWithTypes.txt with "person" * nameType and try the model in a sample text. */ @Test public void testNameFinderWithTypes() throws Exception { // train the name finder String encoding = "ISO-8859-1"; ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), encoding)); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence2 = new String[] { "Hi", "Mike", ",", "it's", "Stefanie", "Schmidt", "." }; Span[] names2 = nameFinder.find(sentence2); Assert.assertEquals(2, names2.length); Assert.assertEquals(new Span(1, 2, "person"), names2[0]); Assert.assertEquals(new Span(4, 6, "person"), names2[1]); Assert.assertEquals("person", names2[0].getType()); Assert.assertEquals("person", names2[1].getType()); String[] sentence = { "Alisa", "appreciated", "the", "hint", "and", "enjoyed", "a", "delicious", "traditional", "meal." }; Span[] names = nameFinder.find(sentence); Assert.assertEquals(1, names.length); Assert.assertEquals(new Span(0, 1, "person"), names[0]); Assert.assertTrue(hasOtherAsOutcome(nameFinderModel)); } /** * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithNames() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " + "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 2, DEFAULT), names1[0]); Assert.assertEquals(new Span(2, 4, DEFAULT), names1[1]); Assert.assertEquals(new Span(4, 6, DEFAULT), names1[2]); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); } @Test public void testOnlyWithNamesTypeOverride() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("en", TYPE_OVERRIDE, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " + "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 2, TYPE_OVERRIDE), names1[0]); Assert.assertEquals(new Span(2, 4, TYPE_OVERRIDE), names1[1]); Assert.assertEquals(new Span(4, 6, TYPE_OVERRIDE), names1[2]); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); } /** * Train NamefinderME using OnlyWithNamesWithTypes.train. * The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithNamesWithTypes() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = ("Neil Abercrombie Anibal Acevedo-Vila Gary Ackerman " + "Robert Aderholt Daniel Akaka Todd Akin Lamar Alexander Rodney Alexander").split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 2, "person"), names1[0]); Assert.assertEquals(new Span(2, 4, "person"), names1[1]); Assert.assertEquals(new Span(4, 6, "person"), names1[2]); Assert.assertEquals("person", names1[2].getType()); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); } /** * Train NamefinderME using OnlyWithNames.train. The goal is to check if the model validator accepts it. * This is related to the issue OPENNLP-9 */ @Test public void testOnlyWithEntitiesWithTypes() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/OnlyWithEntitiesWithTypes.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT"); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = "NATO United States Barack Obama".split("\\s+"); Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 1, "organization"), names1[0]); // NATO Assert.assertEquals(new Span(1, 3, "location"), names1[1]); // United States Assert.assertEquals("person", names1[2].getType()); Assert.assertTrue(!hasOtherAsOutcome(nameFinderModel)); } private boolean hasOtherAsOutcome(TokenNameFinderModel nameFinderModel) { SequenceClassificationModel<String> model = nameFinderModel.getNameFinderSequenceModel(); String[] outcomes = model.getOutcomes(); for (String outcome : outcomes) { if (outcome.equals(NameFinderME.OTHER)) { return true; } } return false; } @Test public void testDropOverlappingSpans() { Span[] spans = new Span[] {new Span(1, 10), new Span(1,11), new Span(1,11), new Span(5, 15)}; Span[] remainingSpan = NameFinderME.dropOverlappingSpans(spans); Assert.assertEquals(new Span(1, 11), remainingSpan[0]); } /** * Train NamefinderME using voa1.train with several * nameTypes and try the model in a sample text. */ @Test public void testNameFinderWithMultipleTypes() throws Exception { // train the name finder ObjectStream<NameSample> sampleStream = new NameSampleDataStream( new PlainTextByLineStream(new MockInputStreamFactory( new File("opennlp/tools/namefind/voa1.train")), "UTF-8")); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 70); params.put(TrainingParameters.CUTOFF_PARAM, 1); TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream, params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec())); NameFinderME nameFinder = new NameFinderME(nameFinderModel); // now test if it can detect the sample sentences String[] sentence = new String[] { "U", ".", "S", ".", "President", "Barack", "Obama", "has", "arrived", "in", "South", "Korea", ",", "where", "he", "is", "expected", "to", "show", "solidarity", "with", "the", "country", "'", "s", "president", "in", "demanding", "North", "Korea", "move", "toward", "ending", "its", "nuclear", "weapons", "programs", "." }; Span[] names1 = nameFinder.find(sentence); Assert.assertEquals(new Span(0, 4, "location"), names1[0]); Assert.assertEquals(new Span(5, 7, "person"), names1[1]); Assert.assertEquals(new Span(10, 12, "location"), names1[2]); Assert.assertEquals(new Span(28, 30, "location"), names1[3]); Assert.assertEquals("location", names1[0].getType()); Assert.assertEquals("person", names1[1].getType()); Assert.assertEquals("location", names1[2].getType()); Assert.assertEquals("location", names1[3].getType()); sentence = new String[] { "Scott", "Snyder", "is", "the", "director", "of", "the", "Center", "for", "U", ".", "S", ".", "Korea", "Policy", "." }; Span[] names2 = nameFinder.find(sentence); Assert.assertEquals(2, names2.length); Assert.assertEquals(new Span(0, 2, "person"), names2[0]); Assert.assertEquals(new Span(7, 15, "organization"), names2[1]); Assert.assertEquals("person", names2[0].getType()); Assert.assertEquals("organization", names2[1].getType()); } }