// ChineseAnnotationPipelineITest.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.pipeline;

import junit.framework.TestCase;

import java.util.List;

import edu.stanford.nlp.ling.SegmenterCoreAnnotations.CharactersAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ChineseCharAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ChineseSegAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;

/**
 * Integration test that runs the pipeline configured by
 * {@code StanfordCoreNLP-chinese.properties} over short Chinese queries
 * and checks the resulting tokens, NER tags, per-character segmentation
 * marks, sentence splits and character offsets.
 */
public class ChineseAnnotationPipelineITest extends TestCase {
  /**
   * Pipeline shared by every test in this class.  It is static because
   * JUnit 3 creates a fresh TestCase instance for each test method; an
   * instance field would always be null in {@link #setUp()} and the
   * pipeline would be rebuilt for every test.
   */
  static AnnotationPipeline pipeline = null;

  /**
   * Builds the shared pipeline the first time any test runs.  The
   * class-level lock guards the null check and the assignment together.
   */
  @Override
  public void setUp() throws Exception {
    synchronized(ChineseAnnotationPipelineITest.class) {
      if (pipeline == null) {
        // This is loaded from the Chinese models jar file.  Editing
        // it directly in the source tree and hoping to see changes
        // will be a very frustrating experience.
        pipeline = new StanfordCoreNLP("StanfordCoreNLP-chinese.properties");
      }
    }
  }

  /**
   * Annotates a single-sentence query and checks the words, the NER
   * tags, and the character-level annotations.
   */
  public void testFullPipeline() {
    String query = "你马上回来北京吗？";
    String[] expectedWords = {"你", "马上", "回来", "北京", "吗", "？"};
    String[] expectedCharacters = {"你","马","上","回","来",
                                   "北","京","吗","？"};
    // One flag per character, compared below against ChineseSegAnnotation
    // ("1" for true, "0" for false).  In this fixture the flag is true
    // exactly for the characters that start a word in expectedWords.
    boolean[] expectedSegs = {true, true, false, true, false,
                              true, false, true, true};
    String[] expectedNer = {"O", "O", "O", "GPE", "O", "O"};

    // Sanity checks on the fixture itself: the parallel arrays must line up.
    assertEquals(expectedCharacters.length, expectedSegs.length);
    assertEquals(expectedWords.length, expectedNer.length);

    // pipeline is expected to have tokenization, segmentation and ner
    Annotation ann = new Annotation(query);
    pipeline.annotate(ann);

    List<CoreMap> sentences = ann.get(SentencesAnnotation.class);
    assertFalse("annotation has no SentencesAnnotation", sentences == null);
    assertEquals(1, sentences.size());

    List<CoreLabel> tokens = sentences.get(0).get(TokensAnnotation.class);
    assertEquals(expectedWords.length, tokens.size());
    for (int i = 0; i < expectedWords.length; ++i) {
      assertEquals(expectedWords[i], tokens.get(i).word());
      assertEquals(expectedNer[i], tokens.get(i).ner());
    }

    List<CoreLabel> characters = ann.get(CharactersAnnotation.class);
    assertEquals(expectedCharacters.length, characters.size());
    for (int i = 0; i < expectedCharacters.length; ++i) {
      CoreLabel word = characters.get(i);
      assertEquals(expectedCharacters[i],
                   word.get(ChineseCharAnnotation.class));
      assertEquals(expectedSegs[i] ? "1" : "0",
                   word.get(ChineseSegAnnotation.class));
    }
  }

  /**
   * Annotates a two-sentence query and checks that it is split into two
   * sentences whose tokens carry the expected words and begin/end
   * positions.
   */
  public void testTwoSentences() {
    String query = "你马上回来北京吗？我要回去美国。";
    Annotation ann = new Annotation(query);
    pipeline.annotate(ann);

    String[][] expectedWords = { {"你", "马上", "回来", "北京", "吗", "？"},
                                 {"我", "要", "回去", "美国", "。"} };
    // expectedPositions[i] holds the token boundaries of sentence i:
    // token j is expected to span [positions[j], positions[j+1]), so each
    // row has one more entry than the matching row of expectedWords.
    int[][] expectedPositions = { {0, 1, 3, 5, 7, 8, 9},
                                  {9, 10, 11, 13, 15, 16} };
    assertEquals(expectedWords.length, expectedPositions.length);

    List<CoreMap> sentences = ann.get(SentencesAnnotation.class);
    assertFalse("annotation has no SentencesAnnotation", sentences == null);
    assertEquals(expectedWords.length, sentences.size());

    for (int i = 0; i < expectedWords.length; ++i) {
      assertEquals(expectedWords[i].length + 1, expectedPositions[i].length);

      List<CoreLabel> tokens = sentences.get(i).get(TokensAnnotation.class);
      assertEquals(expectedWords[i].length, tokens.size());
      // Iterate over every token of sentence i.  The bound must be the
      // length of this sentence's row, not the number of sentences.
      for (int j = 0; j < expectedWords[i].length; ++j) {
        assertEquals(expectedWords[i][j], tokens.get(j).word());
        assertEquals(expectedPositions[i][j], tokens.get(j).beginPosition());
        assertEquals(expectedPositions[i][j+1], tokens.get(j).endPosition());
      }
    }
  }

}