// MorphaAnnotatorITest.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.pipeline;

import junit.framework.TestCase;

import java.util.ArrayList;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.CoreMap;

/**
 * Tests a short pipeline starting from raw text and finishing with
 * the MorphaAnnotator.  Checks the output to make sure it is as
 * expected.
 * <br>
 * Two scenarios are covered: a full pipeline that begins with raw text
 * ({@link #testMorphaAnnotator()}), and a pipeline containing only the
 * MorphaAnnotator that is fed hand-built, already tagged tokens
 * ({@link #testSentencesAnnotation()}).  Both are checked against the
 * same expected lemmas.
 *
 * @author Heeyoung Lee
 * @author John Bauer
 */
public class MorphaAnnotatorITest extends TestCase {

  /** Raw input text used as the document text in both tests. */
  private static final String text = "I saw him ordering them to saw. Jack 's father has n't played\ngolf since 20 years ago . I 'm going to the\nbookstore to return a book Jack and his friends bought me .";

  /** Expected lemma for each token, in token order. */
  private static final String[] answer =
  {"I", "see", "he", "order", "they", "to", "saw", ".", "Jack", "'s",
   "father", "have", "not", "play", "golf", "since", "20", "year", "ago",
   ".", "I", "be", "go", "to", "the", "bookstore", "to", "return", "a",
   "book", "Jack", "and", "he", "friend", "buy", "I", "."};

  /** Surface forms of the hand-built tokens; parallel to {@link #tokenizedTags}. */
  private static final String[] tokenizedText =
  {"I", "saw", "him", "ordering", "them", "to", "saw", ".", "Jack", "'s",
   "father", "has", "n't", "played", "golf", "since", "20", "years", "ago",
   ".", "I", "'m", "going", "to", "the", "bookstore", "to", "return", "a",
   "book", "Jack", "and", "his", "friends", "bought", "me", "."};

  /** Tags assigned to the hand-built tokens; parallel to {@link #tokenizedText}. */
  private static final String[] tokenizedTags =
  {"PRP", "VBD", "PRP", "VBG", "PRP", "TO", "NN", ".", "NNP", "POS",
   "NN", "VBZ", "RB", "VBN", "NN", "IN", "CD", "NNS", "RB",
   ".", "PRP", "VBP", "VBG", "TO", "DT", "NN", "TO", "VB", "DT",
   "NN", "NNP", "CC", "PRP$", "NNS", "VBD", "PRP", "."};

  /** Tokenizer, sentence splitter, POS tagger and MorphaAnnotator; built once and shared. */
  private static AnnotationPipeline fullPipeline;

  /** MorphaAnnotator only; built once and shared. */
  private static AnnotationPipeline shortPipeline;

  /**
   * Builds the two shared pipelines the first time it is called and
   * leaves them untouched on later calls.  Construction is guarded by
   * the class lock because the pipelines are static.
   */
  @Override
  public void setUp() throws Exception {
    synchronized(MorphaAnnotatorITest.class) {
      if (fullPipeline == null) {
        fullPipeline = new AnnotationPipeline();
        fullPipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
        fullPipeline.addAnnotator(new WordsToSentencesAnnotator(false));
        fullPipeline.addAnnotator(new POSTaggerAnnotator(false));
        fullPipeline.addAnnotator(new MorphaAnnotator(false));
      }

      if (shortPipeline == null) {
        shortPipeline = new AnnotationPipeline();
        shortPipeline.addAnnotator(new MorphaAnnotator(false));
      }
    }
  }

  /**
   * Asserts that {@code words} has exactly one token per expected lemma
   * and that each token's LemmaAnnotation equals the matching entry of
   * {@link #answer}.
   *
   * @param words the annotated tokens to verify
   */
  private static void checkResult(List<CoreLabel> words) {
    // JUnit expects (expected, actual); keeping that order makes the
    // failure message report the two values the right way round.
    assertEquals("number of tokens", answer.length, words.size());

    for (int i = 0; i < answer.length; i++) {
      CoreLabel word = words.get(i);
      String lemma = word.get(CoreAnnotations.LemmaAnnotation.class);
      assertEquals("lemma of token " + i + " ('" + tokenizedText[i] + "')",
                   answer[i], lemma);
    }
  }

  /**
   * Runs the full pipeline on the raw text and checks the lemmas found
   * on the document-level token list.
   */
  public void testMorphaAnnotator() throws Exception {
    Annotation document = new Annotation(text);
    fullPipeline.annotate(document);
    checkResult(document.get(CoreAnnotations.TokensAnnotation.class));
  }

  /**
   * Creates one CoreLabel per entry of {@link #tokenizedText}, setting
   * its word, its TextAnnotation and the tag from {@link #tokenizedTags}.
   *
   * @return the newly built tokens, in order
   * @throws AssertionError if the two fixture arrays differ in length
   */
  private static List<CoreLabel> getTestWords() {
    if (tokenizedText.length != tokenizedTags.length) {
      throw new AssertionError("tokenizedText and tokenizedTags " +
                               "must be of the same length");
    }
    List<CoreLabel> words = new ArrayList<CoreLabel>();
    for (int i = 0; i < tokenizedText.length; ++i) {
      CoreLabel word = new CoreLabel();
      word.setWord(tokenizedText[i]);
      word.set(CoreAnnotations.TextAnnotation.class, tokenizedText[i]);
      word.setTag(tokenizedTags[i]);
      words.add(word);
    }
    return words;
  }

  /**
   * Puts the hand-built tokens into a single sentence, attaches that
   * sentence to a document through SentencesAnnotation, and runs the
   * MorphaAnnotator-only pipeline.  The check is made on the original
   * token list, so the test expects the lemmas to have been set on
   * those same CoreLabel objects.
   */
  public void testSentencesAnnotation() throws Exception {
    List<CoreLabel> words = getTestWords();

    CoreMap sentence = new ArrayCoreMap();
    sentence.set(CoreAnnotations.TokensAnnotation.class, words);
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    sentences.add(sentence);
    Annotation document = new Annotation(text);
    document.set(CoreAnnotations.SentencesAnnotation.class, sentences);

    shortPipeline.annotate(document);
    checkResult(words);
  }
}