PipelineITest.java example

Explorer
CoreNLP-master
package edu.stanford.nlp.pipeline;

import java.util.List;

import org.junit.Assert;
import junit.framework.TestCase;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;
import java.util.function.Function;
import edu.stanford.nlp.util.Iterables;
import edu.stanford.nlp.util.StringUtils;

public class PipelineITest extends TestCase {

  public void testPipeline() throws Exception {
    // create pipeline
    AnnotationPipeline pipeline = new AnnotationPipeline();
    pipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
    pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
    pipeline.addAnnotator(new POSTaggerAnnotator(false));
    pipeline.addAnnotator(new MorphaAnnotator(false));
    pipeline.addAnnotator(new NERCombinerAnnotator(false));
    pipeline.addAnnotator(new ParserAnnotator(false, -1));
    //pipeline.addAnnotator(new CorefAnnotator(null, null, null, false));
    //pipeline.addAnnotator(new SRLAnnotator(false));

    // create annotation with text
    String text = "Dan Ramage is working for\nMicrosoft. He's in Seattle! \n";
    Annotation document = new Annotation(text);
    Assert.assertEquals(text, document.toString());
    Assert.assertEquals(text, document.get(CoreAnnotations.TextAnnotation.class));

    // annotate text with pipeline
    pipeline.annotate(document);

    // demonstrate typical usage
    for (CoreMap sentence: document.get(CoreAnnotations.SentencesAnnotation.class)) {

      // get the tree for the sentence
      Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);

      // get the tokens for the sentence and iterate over them
      for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) {

        // get token attributes
        String tokenText = token.get(CoreAnnotations.TextAnnotation.class);
        String tokenPOS = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        String tokenLemma = token.get(CoreAnnotations.LemmaAnnotation.class);
        String tokenNE = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

        // text, pos, lemma and name entity tag should be defined
        Assert.assertNotNull(tokenText);
        Assert.assertNotNull(tokenPOS);
        Assert.assertNotNull(tokenLemma);
        Assert.assertNotNull(tokenNE);
      }
      // tree should be defined
      Assert.assertNotNull(tree);
    }

    // get tokens
    List<CoreLabel> tokens = document.get(CoreAnnotations.TokensAnnotation.class);
    String tokensText = "Dan Ramage is working for Microsoft . He 's in Seattle !";
    Assert.assertNotNull(tokens);
    Assert.assertEquals(12, tokens.size());
    Assert.assertEquals(tokensText, join(tokens));
    Assert.assertEquals(0, (int)tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    Assert.assertEquals(3, (int)tokens.get(0).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    Assert.assertEquals("NNP", tokens.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class));
    Assert.assertEquals("VBZ", tokens.get(2).get(CoreAnnotations.PartOfSpeechAnnotation.class));
    Assert.assertEquals(".", tokens.get(11).get(CoreAnnotations.PartOfSpeechAnnotation.class));
    Assert.assertEquals("Ramage", tokens.get(1).get(CoreAnnotations.LemmaAnnotation.class));
    Assert.assertEquals("be", tokens.get(2).get(CoreAnnotations.LemmaAnnotation.class));
    Assert.assertEquals("PERSON", tokens.get(0).get(CoreAnnotations.NamedEntityTagAnnotation.class));
    Assert.assertEquals("PERSON", tokens.get(1).get(CoreAnnotations.NamedEntityTagAnnotation.class));
    Assert.assertEquals("LOCATION", tokens.get(10).get(CoreAnnotations.NamedEntityTagAnnotation.class));

    // get sentences
    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
    Assert.assertNotNull(sentences);
    Assert.assertEquals(2, sentences.size());

    // sentence 1
    String text1 = "Dan Ramage is working for\nMicrosoft.";
    CoreMap sentence1 = sentences.get(0);
    Assert.assertEquals(text1, sentence1.toString());
    Assert.assertEquals(text1, sentence1.get(CoreAnnotations.TextAnnotation.class));
    Assert.assertEquals(0, (int)sentence1.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    Assert.assertEquals(36, (int)sentence1.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    Assert.assertEquals(0, (int)sentence1.get(CoreAnnotations.TokenBeginAnnotation.class));
    Assert.assertEquals(7, (int)sentence1.get(CoreAnnotations.TokenEndAnnotation.class));

    // sentence 1 tree
    Tree tree1 = Tree.valueOf("(ROOT (S (NP (NNP Dan) (NNP Ramage)) (VP (VBZ is) " +
        "(VP (VBG working) (PP (IN for) (NP (NNP Microsoft))))) (. .)))");
    Assert.assertEquals(tree1, sentence1.get(TreeCoreAnnotations.TreeAnnotation.class));

    // sentence 1 tokens
    String tokenText1 = "Dan Ramage is working for Microsoft .";
    List<CoreLabel> tokens1 = sentence1.get(CoreAnnotations.TokensAnnotation.class);
    Assert.assertNotNull(tokens1);
    Assert.assertEquals(7, tokens1.size());
    Assert.assertEquals(tokenText1, join(tokens1));
    Assert.assertEquals(4, (int)tokens1.get(1).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    Assert.assertEquals(10, (int)tokens1.get(1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    Assert.assertEquals("IN", tokens1.get(4).get(CoreAnnotations.PartOfSpeechAnnotation.class));
    Assert.assertEquals("NNP", tokens1.get(5).get(CoreAnnotations.PartOfSpeechAnnotation.class));
    Assert.assertEquals("work", tokens1.get(3).get(CoreAnnotations.LemmaAnnotation.class));
    Assert.assertEquals(".", tokens1.get(6).get(CoreAnnotations.LemmaAnnotation.class));
    Assert.assertEquals("ORGANIZATION", tokens1.get(5).get(CoreAnnotations.NamedEntityTagAnnotation.class));

    // sentence 2
    String text2 = "He's in Seattle!";
    CoreMap sentence2 = sentences.get(1);
    Assert.assertEquals(text2, sentence2.toString());
    Assert.assertEquals(text2, sentence2.get(CoreAnnotations.TextAnnotation.class));
    Assert.assertEquals(37, (int)sentence2.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    Assert.assertEquals(53, (int)sentence2.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    Assert.assertEquals(7, (int)sentence2.get(CoreAnnotations.TokenBeginAnnotation.class));
    Assert.assertEquals(12, (int)sentence2.get(CoreAnnotations.TokenEndAnnotation.class));

    // sentence 2 tree (note error on Seattle, caused by part of speech tagger)
    Tree tree2 = Tree.valueOf("(ROOT (S (NP (PRP He)) (VP (VBZ 's) (PP (IN in) " +
        "(NP (NNP Seattle)))) (. !)))");
    Assert.assertEquals(tree2, sentence2.get(TreeCoreAnnotations.TreeAnnotation.class));

    // sentence 2 tokens
    String tokenText2 = "He 's in Seattle !";
    List<CoreLabel> tokens2 = sentence2.get(CoreAnnotations.TokensAnnotation.class);
    Assert.assertNotNull(tokens2);
    Assert.assertEquals(5, tokens2.size());
    Assert.assertEquals(tokenText2, join(tokens2));
    Assert.assertEquals(39, (int)tokens2.get(1).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
    Assert.assertEquals(41, (int)tokens2.get(1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    Assert.assertEquals("VBZ", tokens2.get(1).get(CoreAnnotations.PartOfSpeechAnnotation.class));
    Assert.assertEquals("be", tokens2.get(1).get(CoreAnnotations.LemmaAnnotation.class));
    Assert.assertEquals("LOCATION", tokens2.get(3).get(CoreAnnotations.NamedEntityTagAnnotation.class));
  }

  private static String join(List<CoreLabel> tokens) {
    return StringUtils.join(Iterables.transform(tokens, new Function<CoreLabel, String>() {
      public String apply(CoreLabel token) {
        return token.get(CoreAnnotations.TextAnnotation.class);
      }
    }));
  }
}