NameFinderTest.java example

Explorer

book-master
- src
  - main
    - java
      - com
        tamingtext
        classifier
        bayes
        BayesUpdateRequestProcessor.java
        BayesUpdateRequestProcessorFactory.java
        ClassifyDocument.java
        ExtractTrainingData.java
        maxent
        CategoryDataStream.java
        NameFinderFeatureGenerator.java
        TestMaxent.java
        TrainMaxent.java
        mlt
        CategoryHits.java
        MoreLikeThisCategorizer.java
        TestMoreLikeThis.java
        TrainMoreLikeThis.java
        fuzzy
        LevenshteinDistance.java
        MovieMatcher.java
        OverlapMeasures.java
        SpellCorrector.java
        TrieNode.java
        TypeAheadResponseWriter.java
        opennlp
        PooledGenericModelReader.java
        PooledGenericModelSerializer.java
        PooledTokenNameFinderModel.java
        qa
        AnswerTypeClassifier.java
        AnswerTypeContextGenerator.java
        AnswerTypeEventStream.java
        ChunkParser.java
        PassageRankingComponent.java
        QAParams.java
        QuestionQParser.java
        QuestionQParserPlugin.java
        WexWikiContentSource.java
        WikipediaIndexer.java
        WikipediaWexIndexer.java
        tagging
        LuceneCategoryExtractor.java
        LuceneTagExtractor.java
        tagrecommender
        CountStackOverflowTags.java
        ExtractStackOverflowData.java
        MoreLikeThisRequest.java
        StackOverflowParser.java
        StackOverflowPost.java
        StackOverflowStream.java
        StackOverflowTagTransformer.java
        TagRecommenderClient.java
        TestStackOverflowTagger.java
        Util.java
        texttamer
        solr
        NameFilter.java
        NameFilterFactory.java
        SentenceTokenizer.java
        SentenceTokenizerFactory.java
        util
        Constants.java
        FileUtil.java
        MemoryStatus.java
        NameFinderFactory.java
        OpenNLPUtil.java
        SentenceDetectorFactory.java
        SplitInput.java
        StringUtil.java
        TamingTextDriver.java
  - test
    - java
      - com
        tamingtext
        TTTestCaseJ4.java
        TamingTextTestJ4.java
        carrot2
        Carrot2ExampleTest.java
        classifier
        bayes
        BayesUpdateRequestProcessorTest.java
        ExtractTrainingDataTest.java
        mlt
        MoreLikeThisQueryTest.java
        frankenstein
        Frankenstein.java
        fuzzy
        LevenshteinDistanceTest.java
        OverlapMeasuresTest.java
        TrieNodeTest.java
        mahout
        VectorExamplesTest.java
        opennlp
        AnswerTypeTest.java
        ChunkParserTest.java
        NameFinderTest.java
        POSTaggerTest.java
        ParserTest.java
        qa
        MockQuestionQParserPlugin.java
        PassageRankingComponentTest.java
        QATest.java
        sentences
        SentenceDetectionTest.java
        snowball
        SnowballStemmerTest.java
        solr
        SolrJTest.java
        texttamer
        solr
        NameFilterTest.java
        SentenceTokenizerTest.java
        tika
        TikaTest.java
        util
        StringUtilTest.java

/*
 * Copyright 2008-2011 Grant Ingersoll, Thomas Morton and Drew Farris
 *
 *    Licensed under the Apache License, Version 2.0 (the "License");
 *    you may not use this file except in compliance with the License.
 *    You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing, software
 *    distributed under the License is distributed on an "AS IS" BASIS,
 *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *    See the License for the specific language governing permissions and
 *    limitations under the License.
 * -------------------
 * To purchase or learn more about Taming Text, by Grant Ingersoll, Thomas Morton and Drew Farris, visit
 * http://www.manning.com/ingersoll
 */

package com.tamingtext.opennlp;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.namefind.NameSampleDataStream;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
import opennlp.tools.util.featuregen.AggregatedFeatureGenerator;
import opennlp.tools.util.featuregen.PreviousMapFeatureGenerator;
import opennlp.tools.util.featuregen.TokenClassFeatureGenerator;
import opennlp.tools.util.featuregen.TokenFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;

import org.junit.Test;

import com.tamingtext.TamingTextTestJ4;
import com.tamingtext.util.MemoryStatus;

public class NameFinderTest extends TamingTextTestJ4 {


//<start id="ne-display1"/>
  /**
   * Prints each detected name, followed by its entity type, to standard output.
   *
   * @param names  token-based spans of the detected names; the start and end
   *               of each span index into {@code tokens}
   * @param tokens the tokens of the sentence in which the names were found
   */
  private void displayNames(Span[] names, String[] tokens) {
    for (int si = 0; si < names.length; si++) { //<co id="co.opennlp.name.eachname"/>
      StringBuilder cb = new StringBuilder();
      for (int ti = names[si].getStart(); ti < names[si].getEnd(); ti++) {
        cb.append(tokens[ti]).append(" "); //<co id="co.opennlp.name.eachtoken"/>
      }
      // An empty span (start == end) appends nothing, so there is no trailing
      // space to drop; clamp the length instead of passing -1 to substring.
      int nameLength = Math.max(0, cb.length() - 1);
      System.out.println(cb.substring(0, nameLength)); //<co id="co.opennlp.name.extra"/>
      System.out.println("\ttype: " + names[si].getType());
    }
  }
/*<calloutlist>
<callout arearefs="co.opennlp.name.eachname"><para>Iterate over each name.</para></callout>
<callout arearefs="co.opennlp.name.eachtoken"><para>Iterate over each token in the name.</para></callout>
<callout arearefs="co.opennlp.name.extra"><para>Remove the extra space at the end of the name and print.</para></callout>
</calloutlist>*/
//<end id="ne-display1"/>

  //private Span[] mergeSpans(Span[][] spans) {
  //  return null;
  //}

  //<start id="ne-remove-conflicts"/>
  /**
   * Removes conflicting annotations from {@code allAnnotations} in place,
   * keeping the more probable annotation whenever two spans conflict.
   * <p>
   * The list is sorted using the natural ordering of {@link Annotation} and
   * then scanned once while a stack of previously kept annotations is
   * maintained. Entries are removed from the list while it is being indexed,
   * which is why {@code ai} is decremented after every removal.
   * <p>
   * NOTE(review): the {@code contains} branch is only reachable if
   * {@code Span.intersects} returns false for a span that fully contains
   * another one. If {@code intersects} also covers containment, nested names
   * are resolved by the overlap branch instead -- confirm whether
   * {@code Span.crosses} was intended there.
   *
   * @param allAnnotations annotations to filter; sorted and modified in place
   */
  private void removeConflicts(List<Annotation> allAnnotations) {
    if (allAnnotations.size() < 2) return; //<co id="co.opennlp.name.earlyreturn"/>
    java.util.Collections.sort(allAnnotations); //<co id="co.opennlp.name.sort"/>
    List<Annotation> stack = new ArrayList<Annotation>(); //<co id="co.opennlp.name.stack"/>
    stack.add(allAnnotations.get(0));
    for (int ai = 1; ai < allAnnotations.size(); ai++) { //<co id="co.opennlp.name.eachname2"/>
      Annotation curr = (Annotation) allAnnotations.get(ai);
      boolean deleteCurr = false;
      // Walk the stack from the most recently kept annotation downwards.
      for (int ki = stack.size() - 1; ki >= 0; ki--) { //<co id="co.opennlp.name.eachstack"/>
        Annotation prev = (Annotation) stack.get(ki);
        if (prev.getSpan().equals(curr.getSpan())) { //<co id="co.opennlp.name.isequal"/>
          if (prev.getProb() > curr.getProb()) {
            // The earlier annotation is strictly more probable: drop curr.
            deleteCurr = true;
            break;
          } else {
            // Drop the earlier annotation from both the stack and the list;
            // curr shifts one position to the left, so ai follows it.
            allAnnotations.remove(stack.remove(ki));
            ai--;  //<co id="co.opennlp.name.change4delete"/> 
          }
        } else if (prev.getSpan().intersects(curr.getSpan())) { //<co id="co.opennlp.name.iscrossing"/>
          if (prev.getProb() > curr.getProb()) {
            deleteCurr = true;
            break;
          } else {
            // Same bookkeeping as for identical spans.
            allAnnotations.remove(stack.remove(ki));
            ai--;  //<co id="co.opennlp.name.change4delete2"/> 
          }
        } else if (prev.getSpan().contains(curr.getSpan())) { //<co id="co.opennlp.name.issubsumed"/>
          // curr is nested inside prev: keep both and stop scanning the stack.
          break;
        } else { //<co id="co.opennlp.name.ispast"/>
          // No relation between the spans: prev is no longer needed on the
          // stack, but it stays in the result list.
          stack.remove(ki);
        }
      }
      if (deleteCurr) {
        // Remove curr by index; ai-- cancels the loop's ai++ so the element
        // that moved into this slot is examined next.
        allAnnotations.remove(ai);
        ai--; //<co id="co.opennlp.name.change4delete3"/> 
        deleteCurr = false;
      } else {
        stack.add(curr);
      }
    }
  }

  /*
  <calloutlist>
  <callout arearefs="co.opennlp.name.earlyreturn"><para>Exit early if there will be no conflicts.</para></callout>
  <callout arearefs="co.opennlp.name.sort"><para>Sort the names based on their span's start index ascending then end index descending.</para></callout>
  <callout arearefs="co.opennlp.name.stack"><para>Initialize a stack to keep track of previous names.</para></callout>
  <callout arearefs="co.opennlp.name.eachname2"><para>Iterate over each name.</para></callout>
  <callout arearefs="co.opennlp.name.eachstack"><para>Iterate over each item in the stack.</para></callout>
  <callout arearefs="co.opennlp.name.isequal"><para>Test if a name span is identical to another name span, and if so remove the less probable one.</para></callout>
  <callout arearefs="co.opennlp.name.change4delete co.opennlp.name.change4delete2 co.opennlp.name.change4delete3"><para>Update index of name after deletion to negate ai++ at end of for-loop.</para></callout>
  <callout arearefs="co.opennlp.name.iscrossing"><para>Test if a name span is over-lapping another name span, and if so remove the less probable one.</para></callout>
  <callout arearefs="co.opennlp.name.issubsumed"><para>Test if a name span is subsumed by another name span, and if so exit loop.</para></callout>
  <callout arearefs="co.opennlp.name.ispast"><para>Test if a name span is past another name span, and if so remove previous name from the stack.</para></callout>
  </calloutlist>
  */
  //<end id="ne-remove-conflicts"/>
  /**
   * Checks {@code removeConflicts} against three scenarios: no overlap, one
   * overlapping span that wins, and a chain of overlaps.
   */
  @Test
  public void testRemoveConflicts() {
    List<Annotation> annotations = new ArrayList<Annotation>();

    // Three disjoint spans: nothing conflicts, so nothing is removed.
    annotations.add(new Annotation("person", new Span(1, 5), 0.75));
    annotations.add(new Annotation("person", new Span(7, 10), 0.95));
    annotations.add(new Annotation("location", new Span(11, 15), 0.85));
    removeConflicts(annotations);
    assertEquals(3, annotations.size());

    // (2,7) overlaps (1,5) and is more probable, so it replaces (1,5).
    annotations.add(new Annotation("location", new Span(2, 7), 0.85));
    removeConflicts(annotations);
    assertEquals(3, annotations.size());
    assertEquals(2, annotations.get(0).getSpan().getStart());

    // (3,8) beats (1,5) but then loses to (7,10), leaving two annotations.
    annotations.clear();
    annotations.add(new Annotation("person", new Span(1, 5), 0.75));
    annotations.add(new Annotation("person", new Span(7, 10), 0.95));
    annotations.add(new Annotation("location", new Span(11, 15), 0.85));
    annotations.add(new Annotation("person", new Span(3, 8), 0.85));
    removeConflicts(annotations);
    assertEquals(2, annotations.size());
    assertEquals(7, annotations.get(0).getSpan().getStart());
  }

  /**
   * Runs the person, location and date name finders over two sample
   * sentences, collects every name found together with its probability, and
   * resolves conflicting names with {@code removeConflicts}.
   * <p>
   * This method carries no {@code @Test} annotation. It needs the
   * {@code en-ner-*.bin} model files in the directory returned by
   * {@code getModelDir()}.
   *
   * @throws IOException if a model file cannot be opened or read
   */
  public void multiModel() throws IOException {

    File modelDir = getModelDir();
    //<start id="ne-multi"/>    
    String[] sentences = {
      "Former first lady Nancy Reagan was taken to a " +
              "suburban Los Angeles " +
      "hospital \"as a precaution\" Sunday after a fall at " +
              "her home, an " +
      "aide said. ",
      "The 86-year-old Reagan will remain overnight for " +
      "observation at a hospital in Santa Monica, California, " +
              "said Joanne " +
      "Drake, chief of staff for the Reagan Foundation."};
    String[] names = {"person", "location", "date"};
    // Size the finder array from the model names so the two cannot drift apart.
    NameFinderME[] finders = new NameFinderME[names.length];
    for (int mi = 0; mi < names.length; mi++) {  //<co id="co.opennlp.name.1"/>
      FileInputStream modelIn = new FileInputStream(
          new File(modelDir, "en-ner-" + names[mi] + ".bin"));
      try {
        finders[mi] = new NameFinderME(new TokenNameFinderModel(modelIn));
      } finally {
        // The caller owns the model stream; release the file handle.
        modelIn.close();
      }
    }

    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="co.opennlp.name.2"/>
    for (int si = 0; si < sentences.length; si++) { //<co id="co.opennlp.name.3"/>
      List<Annotation> allAnnotations = new ArrayList<Annotation>();
      String[] tokens = tokenizer.tokenize(sentences[si]);//<co id="co.opennlp.name.4"/>
      for (int fi = 0; fi < finders.length; fi++) { //<co id="co.opennlp.name.5"/>
        Span[] spans = finders[fi].find(tokens); //<co id="co.opennlp.name.6"/>
        double[] probs = finders[fi].probs(spans); //<co id="co.opennlp.name.7"/>
        for (int ni = 0; ni < spans.length; ni++) {
          allAnnotations.add( //<co id="co.opennlp.name.8"/>
              new Annotation(names[fi], spans[ni], probs[ni])
          ); 
        }
      }
      removeConflicts(allAnnotations); //<co id="co.opennlp.name.9"/>
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.1">
      <para>Initialize a new model for identifying people, locations, and dates
        based on the binary compressed model in the files "en-ner-person.bin",
        "en-ner-location.bin", "en-ner-date.bin".
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.2">
      <para>Obtain a reference to a tokenizer to split the sentence into 
        individual words and symbols.
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.3">
      <para>Iterate over each sentence.</para>
    </callout>
    <callout arearefs="co.opennlp.name.4">
      <para>Split the sentence into an array of tokens.</para>
    </callout>
    <callout arearefs="co.opennlp.name.5">
      <para>Iterate over each of the name finders (person, location, date).</para>
    </callout>
    <callout arearefs="co.opennlp.name.6">
      <para>Identify the names in the sentence and return token-based offsets
         to these names.</para>
    </callout>
    <callout arearefs="co.opennlp.name.7">
      <para>Get the probabilities associated with the associated matches.</para>
    </callout>
    <callout arearefs="co.opennlp.name.8">
      <para>Collect each of the identified names from each of the name 
        finders.</para></callout>
    <callout arearefs="co.opennlp.name.9">
      <para>Resolve any cases of overlapping names in favor of the more 
        probable name.</para></callout>
    </calloutlist>*/
    //<end id="ne-multi"/>

  }

  /**
   * Trains a model from a single sentence tagged with two entity types
   * (person and date), writes it to {@code target/multi-custom.bin}, and
   * checks that the trained finder reports both types on the same sentence.
   *
   * @throws IOException if training data cannot be read or the model file
   *         cannot be written
   */
  @Test
  public void testMultiNameSamples() throws IOException {
    File destDir = new File("target");
    
    //<start id="ne-namesample-type"/>
    String taggedSent = 
      "<START:person> Britney Spears <END> was reunited " +
      "with her sons <START:date> Saturday <END> ";
    ObjectStream<NameSample> nss = new NameSampleDataStream(
        new PlainTextByLineStream(new StringReader(taggedSent)));
    TokenNameFinderModel model = NameFinderME.train( 
        "en", 
        "default" ,
        nss, 
        (AdaptiveFeatureGenerator) null,
        Collections.<String,Object>emptyMap(),
        70 , 1 );
    
    File outFile = new File(destDir,"multi-custom.bin");
    FileOutputStream outFileStream = new FileOutputStream(outFile);
    try {
      model.serialize(outFileStream); 
    } finally {
      // Close the file even if serialization fails so the handle is not leaked.
      outFileStream.close();
    }
    
    NameFinderME nameFinder = new NameFinderME(model);
    
    String[] tokens = 
        (" Britney Spears was reunited with her sons Saturday .")
        .split("\\s+");
    Span[] names = nameFinder.find(tokens);
    displayNames(names, tokens);
    //<end id="ne-namesample-type"/>
    
    assertEquals("person", names[0].getType());
    assertEquals("date", names[1].getType());
  }

  /**
   * Calls {@code MemoryStatus.dumpMemory} before and after loading the person,
   * location and date models with the standard {@link TokenNameFinderModel}.
   * The method makes no assertions.
   *
   * @throws IOException if a model file cannot be opened or read
   */
  @Test
  public void testMemoryUsageNonPooled() throws IOException {
    File modelDir = getModelDir();
    MemoryStatus memStatus = new MemoryStatus();
    memStatus.dumpMemory("before non-pooled model load");
    //String[] names = {"person"};
    //String[] names = {"date","location","money","organization","percentage","person","time"};
    String[] names = {"person","location","date"};
    NameFinderME[] finders = new NameFinderME[names.length];
    for (int mi = 0; mi < names.length; mi++) {
      FileInputStream modelIn = new FileInputStream(
          new File(modelDir, "en-ner-" + names[mi] + ".bin"));
      try {
        finders[mi] = new NameFinderME(new TokenNameFinderModel(modelIn));
      } finally {
        // The caller owns the model stream; release the file handle.
        modelIn.close();
      }
    }
    memStatus.dumpMemory("after non-pooled model load of " + Arrays.toString(names));
    
    //    ----------before non-pooled model load----------
    //    Code Cache 511.88 KBytes
    //    Par Eden Space 6.32 MBytes
    //    Par Survivor Space 0.00 Bytes
    //    CMS Old Gen 0.00 Bytes
    //    CMS Perm Gen 5.88 MBytes
    //    Total 12.70 MBytes
    //    ---------------------------------
    //    ----------after non-pooled model load of person, money, date----------
    //    Code Cache 622.19 KBytes
    //    Par Eden Space 4.29 MBytes
    //    Par Survivor Space 3.19 MBytes
    //    CMS Old Gen 142.21 MBytes
    //    CMS Perm Gen 6.22 MBytes
    //    Total 156.51 MBytes
    //    ---------------------------------
  }
  
  /**
   * Calls {@code MemoryStatus.dumpMemory} before and after loading the person,
   * location and date models with {@code PooledTokenNameFinderModel}. The
   * method makes no assertions.
   *
   * @throws IOException if a model file cannot be opened or read
   */
  @Test
  public void testMemoryUsagePooled() throws IOException {
    File modelDir = getModelDir();
    MemoryStatus memStatus = new MemoryStatus();
    memStatus.dumpMemory("before pooled model load");
    //String[] names = {"person"};
    //String[] names = {"date","location","money","organization","percentage","person","time"};
    //<start id="ne-pool"/>
    String[] names = {"person","location","date"};
    NameFinderME[] finders = new NameFinderME[names.length];
    for (int mi = 0; mi < names.length; mi++) { //<co id="co.opennlp.name.init4"/>
      FileInputStream modelIn = new FileInputStream(
          new File(modelDir, "en-ner-"
                  + names[mi] + ".bin"));
      try {
        finders[mi] = new NameFinderME(
          new PooledTokenNameFinderModel(modelIn)); //<co id="co.opennlp.name.pool"/>
      } finally {
        // NOTE(review): assumes the pooled model reads the whole stream in its
        // constructor, as TokenNameFinderModel does -- confirm.
        modelIn.close();
      }
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.init4"><para>Initialize name finders for identifying people, locations, and dates</para></callout>
    <callout arearefs="co.opennlp.name.pool"><para>Use the string-pooling model to reduce memory footprint.</para></callout>
    </calloutlist>*/
    //<end id="ne-pool"/>
    memStatus.dumpMemory("after pooled model load of " + Arrays.toString(names));
    
    //    ----------before pooled model load----------
    //    Code Cache 514.13 KBytes
    //    Par Eden Space 6.18 MBytes
    //    Par Survivor Space 0.00 Bytes
    //    CMS Old Gen 0.00 Bytes
    //    CMS Perm Gen 5.88 MBytes
    //    Total 12.57 MBytes
    //    ---------------------------------
    //    ----------after pooled model load----------
    //    Code Cache 626.75 KBytes
    //    Par Eden Space 7.16 MBytes
    //    Par Survivor Space 2.06 MBytes
    //    CMS Old Gen 61.59 MBytes
    //    CMS Perm Gen 32.95 MBytes
    //    Total 104.37 MBytes
    //    ---------------------------------
  }

  /**
   * Trains a person name finder from {@code src/test/resources/person.train}
   * using the default feature generation and writes the resulting model to
   * {@code target/person-custom.bin}.
   *
   * @throws IOException if the training file cannot be read or the model file
   *         cannot be written
   */
  @Test
  public void trainNameFinder() throws IOException {
    File baseDir = new File("src/test/resources");
    File destDir = new File("target");
    //<start id="ne-train"/>
    File inFile = new File(baseDir,"person.train");
    java.io.FileReader trainingReader = new java.io.FileReader(inFile);
    TokenNameFinderModel model;
    try {
      NameSampleDataStream nss = new NameSampleDataStream( //<co id="co.opennlp.name.initnamestream"/>
        new PlainTextByLineStream(trainingReader));

      int iterations = 100;
      int cutoff = 5;
      model = NameFinderME.train( //<co id="co.opennlp.name.train"/>
          "en", // language
          "person", // type
          nss, 
          (AdaptiveFeatureGenerator) null,
          Collections.<String,Object>emptyMap(),
          iterations,
          cutoff);
    } finally {
      // Training has consumed the samples; release the training file.
      trainingReader.close();
    }
    
    File outFile = new File(destDir, "person-custom.bin");
    FileOutputStream outFileStream = new FileOutputStream(outFile);
    try {
      model.serialize(outFileStream); //<co id="co.opennlp.name.persist3"/>
    } finally {
      // Close the file even if serialization fails so the handle is not leaked.
      outFileStream.close();
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.initnamestream"><para>Create a stream of name samples based on annotated data in the "person.train" file.</para></callout>
    <callout arearefs="co.opennlp.name.train"><para>Train the model.</para></callout>
    <callout arearefs="co.opennlp.name.persist3"><para>Save the model to a file.</para></callout>
    </calloutlist>*/

    //<end id="ne-train"/>
  }

  /**
   * Trains a person name finder from {@code src/test/resources/person.train}
   * with an explicitly assembled feature generator, writes the model to
   * {@code target/person-custom2.bin}, and loads it back with the same
   * feature generator.
   *
   * @throws IOException if the training file or model file cannot be read or
   *         written
   */
  @Test
  @SuppressWarnings("unused")
  public void trainNameFinderWithCustomFeatures() throws IOException {
    File baseDir = new File("src/test/resources");
    File destDir = new File("target");
    
    //<start id="ne-features"/>    
    AggregatedFeatureGenerator featureGenerators = 
      new AggregatedFeatureGenerator( //<co id="co.opennlp.name.createfeat"/>
        new WindowFeatureGenerator(
          new TokenFeatureGenerator(), 2, 2), //<co id="co.opennlp.name.tokenfeat"/>
        new WindowFeatureGenerator(
          new TokenClassFeatureGenerator(), 2, 2), //<co id="co.opennlp.name.tokenclassfeat"/>
        new PreviousMapFeatureGenerator() //<co id="co.opennlp.name.prevfeat"/>
      );  
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.createfeat"><para>Creates an aggregated feature generator containing the 3 generators defined below.</para></callout>
    <callout arearefs="co.opennlp.name.tokenfeat"><para>Creates a feature generator corresponding to the tokens in a 5-token window (2 to the left, and 2 to the right).</para></callout>
    <callout arearefs="co.opennlp.name.tokenclassfeat"><para>Creates a feature generator corresponding to the token classes of the tokens in a 5-token window (2 to the left, and 2 to the right).</para></callout>
    <callout arearefs="co.opennlp.name.prevfeat"><para>Creates a feature generator which specifies how this token was previously tagged.</para></callout>     
    </calloutlist>*/
    //<end id="ne-features"/>

    //<start id="ne-features-train"/>
    File inFile = new File(baseDir,"person.train");
    java.io.FileReader trainingReader = new java.io.FileReader(inFile);
    TokenNameFinderModel model;
    try {
      NameSampleDataStream nss = new NameSampleDataStream( //<co id="co.opennlp.name.initfeat"/>
        new PlainTextByLineStream(trainingReader));

      int iterations = 100;
      int cutoff = 5;
      model = NameFinderME.train( //<co id="co.opennlp.name.train2"/>
          "en", // language
          "person", // type
          nss, 
          featureGenerators, 
          Collections.<String,Object>emptyMap(),
          iterations, 
          cutoff);
    } finally {
      // Training has consumed the samples; release the training file.
      trainingReader.close();
    }

    File outFile = new File(destDir,"person-custom2.bin");
    FileOutputStream outFileStream = new FileOutputStream(outFile);
    try {
      model.serialize(outFileStream); //<co id="co.opennlp.name.persist2"/>
    } finally {
      // Close before the file is read back below.
      outFileStream.close();
    }
    /*<calloutlist>
   <callout arearefs="co.opennlp.name.initfeat"><para>Create the sample stream.</para></callout>
   <callout arearefs="co.opennlp.name.train2"><para>Train the model with a custom feature generator.</para></callout>
   <callout arearefs="co.opennlp.name.persist2"><para>Save the model to a file.</para></callout>
   </calloutlist>*/
    //<end id="ne-features-train"/>
    
    //<start id="ne-features-test"/>
    FileInputStream modelIn = new FileInputStream(
        new File(destDir, "person-custom2.bin"));
    try {
      NameFinderME finder = new NameFinderME(
          new TokenNameFinderModel(modelIn),
          featureGenerators, NameFinderME.DEFAULT_BEAM_SIZE);
    } finally {
      // The caller owns the model stream; release the file handle.
      modelIn.close();
    }
    //<end id="ne-features-test"/>
  }

  /**
   * Walks through basic use of the person name finder on two sample
   * sentences: printing names from tokens, recovering names as substrings of
   * the original sentence via character offsets, and retrieving the
   * probability of each name.
   *
   * @throws IOException if the person model cannot be opened or read
   */
  @SuppressWarnings("unused")
  @Test
  public void test() throws IOException {
    
    //<start id="ne-setup"/>
    String[] sentences = {
      "Former first lady Nancy Reagan was taken to a " +
              "suburban Los Angeles " +
      "hospital \"as a precaution\" Sunday after a " +
              "fall at her home, an " +
      "aide said. ",
      
      "The 86-year-old Reagan will remain overnight for " +
      "observation at a hospital in Santa Monica, California, " +
              "said Joanne " +
      "Drake, chief of staff for the Reagan Foundation."};
    
    FileInputStream modelIn = new FileInputStream(getPersonModel());
    NameFinderME finder;
    try {
      finder = new NameFinderME(  //<co id="co.opennlp.name.initmodel"/>
        new TokenNameFinderModel(modelIn)
      );
    } finally {
      // The caller owns the model stream; release the file handle.
      modelIn.close();
    }
    
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="co.opennlp.name.inittokenizer2"/>
    
    for (int si = 0; si < sentences.length; si++) {
      String[] tokens = tokenizer.tokenize(sentences[si]); //<co id="co.opennlp.name.tokenize2"/>
      Span[] names = finder.find(tokens); //<co id="co.opennlp.name.findnames3"/>
      displayNames(names, tokens);
    }
    
    finder.clearAdaptiveData(); //<co id="co.opennlp.name.clear"/>
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.initmodel">
      <para>Initialize a new model for identifying people names based on the 
        binary compressed model in the file "en-ner-person.bin".</para>
    </callout>
    <callout arearefs="co.opennlp.name.inittokenizer2">
      <para>Initialize a tokenizer to split the sentence into individual words 
        and symbols.</para>
    </callout>
    <callout arearefs="co.opennlp.name.tokenize2">
      <para>Split the sentence into an array of tokens.</para>
    </callout>
    <callout arearefs="co.opennlp.name.findnames3">
      <para>Identify the names in the sentence and return token-based offsets
      to these names.</para>
    </callout>
    <callout arearefs="co.opennlp.name.clear">
      <para>Clear data structures that store which words have been seen 
      previously in the document and whether these words were considered part 
      of a person's name.</para>
    </callout>    
    </calloutlist>*/
    //<end id="ne-setup"/>

    //<start id="ne-display2"/>
    for (int si = 0; si < sentences.length; si++) { //<co id="co.opennlp.name.eachsent2"/>
      Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); //<co id="co.opennlp.name.tokenizepos"/>
      String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); //<co id="co.opennlp.name.convert2strings"/>
      Span[] names = finder.find(tokens); //<co id="co.opennlp.name.findnames4"/>

      for (int ni = 0; ni < names.length; ni++) {
        Span startSpan = tokenSpans[names[ni].getStart()]; //<co id="co.opennlp.name.computestart"/>
        int nameStart  = startSpan.getStart(); 
        
        Span endSpan   = tokenSpans[names[ni].getEnd() - 1]; //<co id="co.opennlp.name.computeend"/>
        int nameEnd    = endSpan.getEnd();
        
        String name = sentences[si].substring(nameStart, nameEnd); //<co id="co.opennlp.name.namestring"/>
        System.out.println(name);
      }
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.eachsent2">
      <para>Iterate over each sentence.</para>
    </callout>
    <callout arearefs="co.opennlp.name.tokenizepos">
      <para>Split the sentence into an array of tokens and return the 
        character offsets (spans) of those tokens.</para>
    </callout>
    <callout arearefs="co.opennlp.name.findnames4">
      <para>
      Identify the names in the sentence and return token-based offsets to these names.
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.computestart">
      <para>
      Compute the start character index of the name.
      </para>
    </callout>    
    <callout arearefs="co.opennlp.name.computeend">
      <para>
      Compute the end character index (last character +1) of the name.
      </para>
    </callout>
    <callout arearefs="co.opennlp.name.namestring">
      <para>
      Compute the string which represents the name.
      </para>
    </callout>
    </calloutlist>*/
    //<end id="ne-display2"/>
    //<start id="ne-prob"/>
    for (int si = 0; si < sentences.length; si++) {//<co id="co.opennlp.name.eachsent3"/>
      String[] tokens = tokenizer.tokenize(sentences[si]); //<co id="co.opennlp.name.tokenize3"/>
      Span[] names = finder.find(tokens); //<co id="co.opennlp.name.findnames1"/>
      double[] spanProbs = finder.probs(names); //<co id="co.opennlp.name.probs"/>
    }
    /*<calloutlist>
    <callout arearefs="co.opennlp.name.eachsent3"><para>Iterate over each sentence.</para></callout>
    <callout arearefs="co.opennlp.name.tokenize3"><para>Split the sentence into an array of tokens.</para></callout>
    <callout arearefs="co.opennlp.name.findnames1"><para>Identify the names in the sentence and return token-based offsets to these names.</para></callout>
    <callout arearefs="co.opennlp.name.probs"><para>Return the probability associated with each name.</para></callout>
    </calloutlist>*/
    //<end id="ne-prob"/>
  }
}

/**
 * A typed name annotation: the token span of a name, its entity type, and the
 * probability assigned to it.
 * <p>
 * All fields are assigned once in the constructor. The natural ordering
 * compares the span first, then the probability in ascending order, then the
 * type. {@code equals} is not overridden, so this ordering is not consistent
 * with {@code equals}.
 */
class Annotation implements Comparable<Annotation> {
  private final Span span;
  private final String type;
  private final double prob;

  /**
   * @param type entity type of the name, for example {@code "person"}
   * @param span span covered by the name
   * @param prob probability assigned to the name
   */
  public Annotation(String type, Span span, double prob) {
    this.span = span;
    this.type = type;
    this.prob = prob;
  }

  /** @return the span covered by the name */
  public Span getSpan() {
    return span;
  }

  /** @return the entity type of the name */
  public String getType() {
    return type;
  }

  /** @return the probability assigned to the name */
  public double getProb() {
    return prob;
  }

  /**
   * Orders annotations by span, then by probability (ascending), then by
   * type.
   *
   * @param a the annotation to compare with
   * @return a negative, zero, or positive value as this annotation sorts
   *         before, equal to, or after {@code a}
   */
  @Override
  public int compareTo(Annotation a) {
    int c = span.compareTo(a.span);
    if (c != 0) {
      return c;
    }
    c = Double.compare(prob, a.prob);
    if (c != 0) {
      return c;
    }
    return type.compareTo(a.type);
  }

  /** @return the type, span and probability separated by single spaces */
  @Override
  public String toString() {
    return type + " " + span + " " + prob;
  }
}