/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.opennlp;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
import opennlp.tools.util.featuregen.AggregatedFeatureGenerator;
import opennlp.tools.util.featuregen.PreviousMapFeatureGenerator;
import opennlp.tools.util.featuregen.TokenClassFeatureGenerator;
import opennlp.tools.util.featuregen.TokenFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;

import org.junit.Test;

import com.tamingtext.TamingTextTestJ4;
import com.tamingtext.util.MemoryStatus;

/**
 * Examples and tests that exercise the OpenNLP name finder: finding names with
 * one or several models, resolving conflicting annotations, training custom
 * models and comparing the memory footprint of pooled and non-pooled models.
 *
 * <p>The {@code //<start id=.../>}, {@code //<end id=.../>} and
 * {@code //<co id=.../>} markers delimit and annotate code listings; keep them
 * attached to the statements they describe.
 */
public class NameFinderTest extends TamingTextTestJ4 {

  //<start id="ne-display1"/>
  /**
   * Prints every name, followed by its type, to standard output.
   *
   * @param names  token-based spans of the names
   * @param tokens the tokens that the spans index into
   */
  private void displayNames(Span[] names, String[] tokens) {
    for (int si = 0; si < names.length; si++) { //<co id="co.opennlp.name.eachname"/>
      StringBuilder cb = new StringBuilder();
      for (int ti = names[si].getStart(); ti < names[si].getEnd(); ti++) {
        cb.append(tokens[ti]).append(" "); //<co id="co.opennlp.name.eachtoken"/>
      }
      System.out.println(cb.substring(0, cb.length() - 1)); //<co id="co.opennlp.name.extra"/>
      System.out.println("\ttype: " + names[si].getType());
    }
  }
  /*<calloutlist>
  <callout arearefs="co.opennlp.name.eachname"><para>Iterate over each name.</para></callout>
  <callout arearefs="co.opennlp.name.eachtoken"><para>Iterate over each token in the name.</para></callout>
  <callout arearefs="co.opennlp.name.extra"><para>Remove the extra space at the end of the name and print.</para></callout>
  </calloutlist>*/
  //<end id="ne-display1"/>

  //<start id="ne-remove-conflicts"/>
  /**
   * Sorts the annotations and removes, in place, the less probable annotation
   * of every pair whose spans are identical or intersect.
   *
   * @param allAnnotations the annotations to filter; sorted and modified in place
   */
  private void removeConflicts(List<Annotation> allAnnotations) {
    if (allAnnotations.size() < 2) return; //<co id="co.opennlp.name.earlyreturn"/>
    Collections.sort(allAnnotations); //<co id="co.opennlp.name.sort"/>
    List<Annotation> stack = new ArrayList<Annotation>(); //<co id="co.opennlp.name.stack"/>
    stack.add(allAnnotations.get(0));
    for (int ai = 1; ai < allAnnotations.size(); ai++) { //<co id="co.opennlp.name.eachname2"/>
      Annotation curr = allAnnotations.get(ai);
      boolean deleteCurr = false;
      for (int ki = stack.size() - 1; ki >= 0; ki--) { //<co id="co.opennlp.name.eachstack"/>
        Annotation prev = stack.get(ki);
        if (prev.getSpan().equals(curr.getSpan())) { //<co id="co.opennlp.name.isequal"/>
          if (prev.getProb() > curr.getProb()) {
            deleteCurr = true;
            break;
          } else {
            // prev sits before curr in the list, so curr shifts down by one.
            allAnnotations.remove(stack.remove(ki));
            ai--; //<co id="co.opennlp.name.change4delete"/>
          }
        } else if (prev.getSpan().intersects(curr.getSpan())) { //<co id="co.opennlp.name.iscrossing"/>
          if (prev.getProb() > curr.getProb()) {
            deleteCurr = true;
            break;
          } else {
            allAnnotations.remove(stack.remove(ki));
            ai--; //<co id="co.opennlp.name.change4delete2"/>
          }
        } else if (prev.getSpan().contains(curr.getSpan())) { //<co id="co.opennlp.name.issubsumed"/>
          // NOTE(review): if Span.intersects() is also true for contained spans,
          // this branch is never reached and Span.crosses() may have been
          // intended above -- confirm against the OpenNLP Span documentation.
          break;
        } else { //<co id="co.opennlp.name.ispast"/>
          stack.remove(ki);
        }
      }
      if (deleteCurr) {
        allAnnotations.remove(ai);
        ai--; //<co id="co.opennlp.name.change4delete3"/>
      } else {
        stack.add(curr);
      }
    }
  }
  /*
  <calloutlist>
  <callout arearefs="co.opennlp.name.earlyreturn"><para>Exit early if there will be no conflicts.</para></callout>
  <callout arearefs="co.opennlp.name.sort"><para>Sort the names based on their span's start index ascending then end index descending.</para></callout>
  <callout arearefs="co.opennlp.name.stack"><para>Initialize a stack to keep track of previous names.</para></callout>
  <callout arearefs="co.opennlp.name.eachname2"><para>Iterate over each name.</para></callout>
  <callout arearefs="co.opennlp.name.eachstack"><para>Iterate over each item in the stack.</para></callout>
  <callout arearefs="co.opennlp.name.isequal"><para>Test if a name span is identical to another name span, and if so remove the less probable one.</para></callout>
  <callout arearefs="co.opennlp.name.change4delete co.opennlp.name.change4delete2 co.opennlp.name.change4delete3"><para>Update index of name after deletion to negate ai++ at end of for-loop.</para></callout>
  <callout arearefs="co.opennlp.name.iscrossing"><para>Test if a name span is overlapping another name span, and if so remove the less probable one.</para></callout>
  <callout arearefs="co.opennlp.name.issubsumed"><para>Test if a name span is subsumed by another name span, and if so exit loop.</para></callout>
  <callout arearefs="co.opennlp.name.ispast"><para>Test if a name span is past another name span, and if so remove previous name from the stack.</para></callout>
  </calloutlist>
  */
  //<end id="ne-remove-conflicts"/>

  /** Checks that intersecting annotations are resolved in favor of the more probable one. */
  @Test
  public void testRemoveConflicts() {
    List<Annotation> annotations = new ArrayList<Annotation>();
    annotations.add(new Annotation("person", new Span(1, 5), 0.75));
    annotations.add(new Annotation("person", new Span(7, 10), 0.95));
    annotations.add(new Annotation("location", new Span(11, 15), 0.85));
    removeConflicts(annotations);
    assertEquals(3, annotations.size());

    annotations.add(new Annotation("location", new Span(2, 7), 0.85));
    removeConflicts(annotations);
    assertEquals(3, annotations.size());
    assertEquals(2, annotations.get(0).getSpan().getStart());

    annotations.clear();
    annotations.add(new Annotation("person", new Span(1, 5), 0.75));
    annotations.add(new Annotation("person", new Span(7, 10), 0.95));
    annotations.add(new Annotation("location", new Span(11, 15), 0.85));
    annotations.add(new Annotation("person", new Span(3, 8), 0.85));
    removeConflicts(annotations);
    assertEquals(2, annotations.size());
    assertEquals(7, annotations.get(0).getSpan().getStart());
  }

  /**
   * Runs the person, location and date name finders over two sentences and
   * resolves conflicts between their results.
   *
   * @throws IOException if a model file cannot be read
   */
  public void multiModel() throws IOException {
    File modelDir = getModelDir();
    //<start id="ne-multi"/>
    String[] sentences = {
      "Former first lady Nancy Reagan was taken to a " +
        "suburban Los Angeles " +
        "hospital \"as a precaution\" Sunday after a fall at " +
        "her home, an " +
        "aide said. ",
      "The 86-year-old Reagan will remain overnight for " +
        "observation at a hospital in Santa Monica, California, " +
        "said Joanne " +
        "Drake, chief of staff for the Reagan Foundation."};
    String[] names = {"person", "location", "date"};
    NameFinderME[] finders = new NameFinderME[names.length];
    for (int mi = 0; mi < names.length; mi++) { //<co id="co.opennlp.name.1"/>
      FileInputStream modelIn = new FileInputStream(
          new File(modelDir, "en-ner-" + names[mi] + ".bin"));
      try {
        finders[mi] = new NameFinderME(new TokenNameFinderModel(modelIn));
      } finally {
        modelIn.close(); // not referenced after the model has been constructed
      }
    }

    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="co.opennlp.name.2"/>
    for (int si = 0; si < sentences.length; si++) { //<co id="co.opennlp.name.3"/>
      List<Annotation> allAnnotations = new ArrayList<Annotation>();
      String[] tokens = tokenizer.tokenize(sentences[si]);//<co id="co.opennlp.name.4"/>
      for (int fi = 0; fi < finders.length; fi++) { //<co id="co.opennlp.name.5"/>
        Span[] spans = finders[fi].find(tokens); //<co id="co.opennlp.name.6"/>
        double[] probs = finders[fi].probs(spans); //<co id="co.opennlp.name.7"/>
        for (int ni = 0; ni < spans.length; ni++) {
          allAnnotations.add( //<co id="co.opennlp.name.8"/>
              new Annotation(names[fi], spans[ni], probs[ni])
          );
        }
      }
      removeConflicts(allAnnotations); //<co id="co.opennlp.name.9"/>
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.1">
      <para>Initialize a new model for identifying people, locations, and dates based on the
      binary compressed model in the files "en-ner-person.bin", "en-ner-location.bin", "en-ner-date.bin".
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.2">
      <para>Obtain a reference to a tokenizer to split the sentence into individual words and symbols.
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.3">
      <para>Iterate over each sentence.</para>
    </callout>
    <callout arearefs="co.opennlp.name.4">
      <para>Split the sentence into an array of tokens.</para>
    </callout>
    <callout arearefs="co.opennlp.name.5">
      <para>Iterate over each of the name finders (person, location, date).</para>
    </callout>
    <callout arearefs="co.opennlp.name.6">
      <para>Identify the names in the sentence and return token-based offsets to these names.</para>
    </callout>
    <callout arearefs="co.opennlp.name.7">
      <para>Get the probabilities associated with the associated matches.</para>
    </callout>
    <callout arearefs="co.opennlp.name.8">
      <para>Collect each of the identified names from each of the name finders.</para></callout>
    <callout arearefs="co.opennlp.name.9">
      <para>Resolve any cases of overlapping names in favor of the more probable name.</para></callout>
    </calloutlist>*/
    //<end id="ne-multi"/>
  }

  /**
   * Trains a model from a single sentence tagged with two name types, saves it
   * and checks the types of the names it finds.
   *
   * @throws IOException if the model cannot be trained or written
   */
  @Test
  public void testMultiNameSamples() throws IOException {
    File destDir = new File("target");
    //<start id="ne-namesample-type"/>
    String taggedSent =
      "<START:person> Britney Spears <END> was reunited " +
      "with her sons <START:date> Saturday <END> ";

    ObjectStream<NameSample> nss = new NameSampleDataStream(
        new PlainTextByLineStream(new StringReader(taggedSent)));

    TokenNameFinderModel model = NameFinderME.train(
        "en",
        "default" ,
        nss,
        (AdaptiveFeatureGenerator) null,
        Collections.<String,Object>emptyMap(),
        70 , 1 );

    File outFile = new File(destDir,"multi-custom.bin");
    FileOutputStream outFileStream = new FileOutputStream(outFile);
    try {
      model.serialize(outFileStream);
    } finally {
      outFileStream.close();
    }

    NameFinderME nameFinder = new NameFinderME(model);

    String[] tokens =
      (" Britney Spears was reunited with her sons Saturday .")
      .split("\\s+");
    Span[] names = nameFinder.find(tokens);
    displayNames(names, tokens);
    //<end id="ne-namesample-type"/>
    assertEquals("person", names[0].getType());
    assertEquals("date", names[1].getType());
  }

  /**
   * Dumps memory usage before and after loading the regular (non-pooled) models.
   *
   * @throws IOException if a model file cannot be read
   */
  @Test
  public void testMemoryUsageNonPooled() throws IOException {
    File modelDir = getModelDir();
    MemoryStatus memStatus = new MemoryStatus();
    memStatus.dumpMemory("before non-pooled model load");
    //String[] names = {"person"};
    //String[] names = {"date","location","money","organization","percentage","person","time"};
    String[] names = {"person","location","date"};
    NameFinderME[] finders = new NameFinderME[names.length];
    for (int mi = 0; mi < names.length; mi++) {
      FileInputStream modelIn = new FileInputStream(
          new File(modelDir, "en-ner-" + names[mi] + ".bin"));
      try {
        finders[mi] = new NameFinderME(new TokenNameFinderModel(modelIn));
      } finally {
        modelIn.close();
      }
    }
    memStatus.dumpMemory("after non-pooled model load of " + Arrays.toString(names));
    // ----------before non-pooled model load----------
    // Code Cache            511.88 KBytes
    // Par Eden Space          6.32 MBytes
    // Par Survivor Space      0.00 Bytes
    // CMS Old Gen             0.00 Bytes
    // CMS Perm Gen            5.88 MBytes
    // Total                  12.70 MBytes
    // ---------------------------------
    // ----------after non-pooled model load of person, money, date----------
    // Code Cache            622.19 KBytes
    // Par Eden Space          4.29 MBytes
    // Par Survivor Space      3.19 MBytes
    // CMS Old Gen           142.21 MBytes
    // CMS Perm Gen            6.22 MBytes
    // Total                 156.51 MBytes
    // ---------------------------------
  }

  /**
   * Dumps memory usage before and after loading the string-pooling models.
   *
   * @throws IOException if a model file cannot be read
   */
  @Test
  public void testMemoryUsagePooled() throws IOException {
    File modelDir = getModelDir();
    MemoryStatus memStatus = new MemoryStatus();
    memStatus.dumpMemory("before pooled model load");
    //String[] names = {"person"};
    //String[] names = {"date","location","money","organization","percentage","person","time"};
    //<start id="ne-pool"/>
    String[] names = {"person","location","date"};
    NameFinderME[] finders = new NameFinderME[names.length];
    for (int mi = 0; mi < names.length; mi++) { //<co id="co.opennlp.name.init4"/>
      FileInputStream modelIn = new FileInputStream(
          new File(modelDir, "en-ner-" + names[mi] + ".bin"));
      try {
        finders[mi] = new NameFinderME(
            new PooledTokenNameFinderModel(modelIn)); //<co id="co.opennlp.name.pool"/>
      } finally {
        modelIn.close();
      }
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.init4"><para>Initialize name finders for identifying people, locations, and dates</para></callout>
    <callout arearefs="co.opennlp.name.pool"><para>Use the string-pooling model to reduce memory footprint.</para></callout>
    </calloutlist>*/
    //<end id="ne-pool"/>
    memStatus.dumpMemory("after pooled model load of " + Arrays.toString(names));
    // ----------before pooled model load----------
    // Code Cache            514.13 KBytes
    // Par Eden Space          6.18 MBytes
    // Par Survivor Space      0.00 Bytes
    // CMS Old Gen             0.00 Bytes
    // CMS Perm Gen            5.88 MBytes
    // Total                  12.57 MBytes
    // ---------------------------------
    // ----------after pooled model load----------
    // Code Cache            626.75 KBytes
    // Par Eden Space          7.16 MBytes
    // Par Survivor Space      2.06 MBytes
    // CMS Old Gen            61.59 MBytes
    // CMS Perm Gen           32.95 MBytes
    // Total                 104.37 MBytes
    // ---------------------------------
  }

  /**
   * Trains a person model from "person.train" with the default feature
   * generator and saves it to "target/person-custom.bin".
   *
   * @throws IOException if the training data cannot be read or the model written
   */
  @Test
  public void trainNameFinder() throws IOException {
    File baseDir = new File("src/test/resources");
    File destDir = new File("target");

    //<start id="ne-train"/>
    File inFile = new File(baseDir,"person.train");
    NameSampleDataStream nss = new NameSampleDataStream( //<co id="co.opennlp.name.initnamestream"/>
      new PlainTextByLineStream(
        new java.io.FileReader(inFile)));

    int iterations = 100;
    int cutoff = 5;
    TokenNameFinderModel model;
    try {
      model = NameFinderME.train( //<co id="co.opennlp.name.train"/>
          "en", // language
          "person", // type
          nss,
          (AdaptiveFeatureGenerator) null,
          Collections.<String,Object>emptyMap(),
          iterations,
          cutoff);
    } finally {
      nss.close(); // release the training file
    }

    File outFile = new File(destDir, "person-custom.bin");
    FileOutputStream outFileStream = new FileOutputStream(outFile);
    try {
      model.serialize(outFileStream); //<co id="co.opennlp.name.persist3"/>
    } finally {
      outFileStream.close();
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.initnamestream"><para>Create a stream of name samples based on annotated data in the "person.train" file.</para></callout>
    <callout arearefs="co.opennlp.name.train"><para>Train the model.</para></callout>
    <callout arearefs="co.opennlp.name.persist3"><para>Save the model to a file.</para></callout>
    </calloutlist>*/
    //<end id="ne-train"/>
  }

  /**
   * Trains a person model with a custom aggregated feature generator, saves it
   * to "target/person-custom2.bin" and loads it back into a name finder.
   *
   * @throws IOException if the training data or the model file cannot be read or written
   */
  @Test
  @SuppressWarnings("unused")
  public void trainNameFinderWithCustomFeatures() throws IOException {
    File baseDir = new File("src/test/resources");
    File destDir = new File("target");

    //<start id="ne-features"/>
    AggregatedFeatureGenerator featureGenerators =
      new AggregatedFeatureGenerator( //<co id="co.opennlp.name.createfeat"/>
        new WindowFeatureGenerator(
          new TokenFeatureGenerator(), 2, 2), //<co id="co.opennlp.name.tokenfeat"/>
        new WindowFeatureGenerator(
          new TokenClassFeatureGenerator(), 2, 2), //<co id="co.opennlp.name.tokenclassfeat"/>
        new PreviousMapFeatureGenerator() //<co id="co.opennlp.name.prevfeat"/>
      );
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.createfeat"><para>Creates an aggregated feature generator containing the 3 generators defined below.</para></callout>
    <callout arearefs="co.opennlp.name.tokenfeat"><para>Creates a feature generator corresponding to the tokens in a 5-token window (2 to the left, and 2 to the right).</para></callout>
    <callout arearefs="co.opennlp.name.tokenclassfeat"><para>Creates a feature generator corresponding to the token classes of the tokens in a 5-token window (2 to the left, and 2 to the right).</para></callout>
    <callout arearefs="co.opennlp.name.prevfeat"><para>Creates a feature generator which specifies how this token was previously tagged.</para></callout>
    </calloutlist>*/
    //<end id="ne-features"/>

    //<start id="ne-features-train"/>
    File inFile = new File(baseDir,"person.train");
    NameSampleDataStream nss = new NameSampleDataStream( //<co id="co.opennlp.name.initfeat"/>
      new PlainTextByLineStream(
        new java.io.FileReader(inFile)));

    int iterations = 100;
    int cutoff = 5;
    TokenNameFinderModel model;
    try {
      model = NameFinderME.train( //<co id="co.opennlp.name.train2"/>
          "en", // language
          "person", // type
          nss,
          featureGenerators,
          Collections.<String,Object>emptyMap(),
          iterations,
          cutoff);
    } finally {
      nss.close(); // release the training file
    }

    File outFile = new File(destDir,"person-custom2.bin");
    FileOutputStream outFileStream = new FileOutputStream(outFile);
    try {
      model.serialize(outFileStream); //<co id="co.opennlp.name.persist2"/>
    } finally {
      outFileStream.close(); // close before the file is read back below
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.initfeat"><para>Create the sample stream.</para></callout>
    <callout arearefs="co.opennlp.name.train2"><para>Train the model with a custom feature generator.</para></callout>
    <callout arearefs="co.opennlp.name.persist2"><para>Save the model to a file.</para></callout>
    </calloutlist>*/
    //<end id="ne-features-train"/>

    //<start id="ne-features-test"/>
    NameFinderME finder;
    FileInputStream modelIn = new FileInputStream(
        new File(destDir, "person-custom2.bin"));
    try {
      finder = new NameFinderME(
          new TokenNameFinderModel(modelIn),
          featureGenerators,
          NameFinderME.DEFAULT_BEAM_SIZE);
    } finally {
      modelIn.close();
    }
    //<end id="ne-features-test"/>
  }

  /**
   * Finds person names in two sentences and prints them, first from the tokens
   * and then from character offsets, and finally computes name probabilities.
   *
   * @throws IOException if the person model cannot be read
   */
  @SuppressWarnings("unused")
  @Test
  public void test() throws IOException {

    //<start id="ne-setup"/>
    String[] sentences = {
      "Former first lady Nancy Reagan was taken to a " +
        "suburban Los Angeles " +
        "hospital \"as a precaution\" Sunday after a " +
        "fall at her home, an " +
        "aide said. ",
      "The 86-year-old Reagan will remain overnight for " +
        "observation at a hospital in Santa Monica, California, " +
        "said Joanne " +
        "Drake, chief of staff for the Reagan Foundation."};
    NameFinderME finder;
    FileInputStream modelIn = new FileInputStream(getPersonModel());
    try {
      finder = new NameFinderME( //<co id="co.opennlp.name.initmodel"/>
          new TokenNameFinderModel(modelIn)
      );
    } finally {
      modelIn.close();
    }

    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="co.opennlp.name.inittokenizer2"/>

    for (int si = 0; si < sentences.length; si++) {
      String[] tokens = tokenizer.tokenize(sentences[si]); //<co id="co.opennlp.name.tokenize2"/>
      Span[] names = finder.find(tokens); //<co id="co.opennlp.name.findnames3"/>
      displayNames(names, tokens);
    }

    finder.clearAdaptiveData(); //<co id="co.opennlp.name.clear"/>
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.initmodel">
      <para>Initialize a new model for identifying people names based on the
      binary compressed model in the file "en-ner-person.bin".</para>
    </callout>
    <callout arearefs="co.opennlp.name.inittokenizer2">
      <para>Initialize a tokenizer to split the sentence into individual words and symbols.</para>
    </callout>
    <callout arearefs="co.opennlp.name.tokenize2">
      <para>Split the sentence into an array of tokens.</para>
    </callout>
    <callout arearefs="co.opennlp.name.findnames3">
      <para>Identify the names in the sentence and return token-based offsets to these names.</para>
    </callout>
    <callout arearefs="co.opennlp.name.clear">
      <para>Clear data structures that store which words have been seen
      previously in the document and whether these words were considered part of a person's name.</para>
    </callout>
    </calloutlist>*/
    //<end id="ne-setup"/>

    //<start id="ne-display2"/>
    for (int si = 0; si < sentences.length; si++) { //<co id="co.opennlp.name.eachsent2"/>
      Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); //<co id="co.opennlp.name.tokenizepos"/>
      String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); //<co id="co.opennlp.name.convert2strings"/>
      Span[] names = finder.find(tokens); //<co id="co.opennlp.name.findnames4"/>

      for (int ni = 0; ni < names.length; ni++) {
        Span startSpan = tokenSpans[names[ni].getStart()]; //<co id="co.opennlp.name.computestart"/>
        int nameStart = startSpan.getStart();

        Span endSpan = tokenSpans[names[ni].getEnd() - 1]; //<co id="co.opennlp.name.computeend"/>
        int nameEnd = endSpan.getEnd();

        String name = sentences[si].substring(nameStart, nameEnd); //<co id="co.opennlp.name.namestring"/>
        System.out.println(name);
      }
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.eachsent2">
      <para>Iterate over each sentence.</para>
    </callout>
    <callout arearefs="co.opennlp.name.tokenizepos">
      <para>Split the sentence into an array of tokens and return the
      character offsets (spans) of those tokens.</para>
    </callout>
    <callout arearefs="co.opennlp.name.convert2strings">
      <para>Convert the token spans into the token strings they cover.</para>
    </callout>
    <callout arearefs="co.opennlp.name.findnames4">
      <para>
      Identify the names in the sentence and return token-based offsets to these names.
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.computestart">
      <para>
      Compute the start character index of the name.
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.computeend">
      <para>
      Compute the end character index (last character +1) of the name.
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.namestring">
      <para>
      Compute the string which represents the name.
      </para>
    </callout>
    </calloutlist>*/
    //<end id="ne-display2"/>

    //<start id="ne-prob"/>
    for (int si = 0; si < sentences.length; si++) {//<co id="co.opennlp.name.eachsent3"/>
      String[] tokens = tokenizer.tokenize(sentences[si]); //<co id="co.opennlp.name.tokenize3"/>
      Span[] names = finder.find(tokens); //<co id="co.opennlp.name.findnames1"/>
      double[] spanProbs = finder.probs(names); //<co id="co.opennlp.name.probs"/>
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.eachsent3"><para>Iterate over each sentence.</para></callout>
    <callout arearefs="co.opennlp.name.tokenize3"><para>Split the sentence into an array of tokens.</para></callout>
    <callout arearefs="co.opennlp.name.findnames1"><para>Identify the names in the sentence and return token-based offsets to these names.</para></callout>
    <callout arearefs="co.opennlp.name.probs"><para>Return the probability associated with each name.</para></callout>
    </calloutlist>*/
    //<end id="ne-prob"/>
  }
}

/**
 * A typed name span together with the probability assigned to it.
 *
 * <p>Ordering is by span, then by probability, then by type.
 */
class Annotation implements Comparable<Annotation> {
  private final Span span;
  private final String type;
  private final double prob;

  /**
   * @param type the name type, for example "person"
   * @param span the span covered by the name
   * @param prob the probability associated with the name
   */
  public Annotation(String type, Span span, double prob) {
    this.span = span;
    this.type = type;
    this.prob = prob;
  }

  /** @return the span covered by the name */
  public Span getSpan() {
    return span;
  }

  /** @return the name type */
  public String getType() {
    return type;
  }

  /** @return the probability associated with the name */
  public double getProb() {
    return prob;
  }

  /**
   * Compares by span first, then by probability, then by type.
   *
   * @param a the annotation to compare against
   * @return a negative, zero or positive value as this annotation sorts
   *         before, equal to or after {@code a}
   */
  public int compareTo(Annotation a) {
    int c = span.compareTo(a.span);
    if (c == 0) {
      c = Double.compare(prob, a.prob);
      if (c == 0) {
        c = type.compareTo(a.type);
      }
    }
    return c;
  }

  /** @return the type, span and probability separated by single spaces */
  @Override
  public String toString() {
    return type + " " + span + " " + prob;
  }
}