ChineseSegmenterRegressionITest.java example

Explorer
Stanford-NLP-master
- CoreNLP-master
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.ling.CoreAnnotations.*;
import edu.stanford.nlp.util.*;

import java.util.*;
import junit.framework.TestCase;


public class ChineseSegmenterRegressionITest  extends TestCase {

  public StanfordCoreNLP pipeline;

  // strings to test on
  public ArrayList<String> inputStrings = new ArrayList<String>();

  // expected token lists
  ArrayList<ArrayList<String>> expectedTokenLists = new ArrayList<ArrayList<String>>();

  public void setUp() {

    // first set up Chinese pipeline
    Properties props = StringUtils.argsToProperties("-props", "StanfordCoreNLP-chinese.properties");
    props.setProperty("annotators", "tokenize,ssplit");
    pipeline = new StanfordCoreNLP(props);

    // build list of sample input strings
    inputStrings = new ArrayList<>();
    expectedTokenLists= new ArrayList<>();

    // example 1
    inputStrings.add("巴拉克·奥巴马是美国总统。他在2008年当选");
    ArrayList exampleOneTokenList = new ArrayList<>(
            Arrays.asList(
                    new String[]{"巴拉克·奥巴马","是","美国","总统","。","他","在","2008年","当选"}));
    expectedTokenLists.add(exampleOneTokenList);

    // example 2
    inputStrings.add("声明全文如下:\n" +
            "    \n" +
            "   \n" +
            "中国政府欢迎乌克兰销毁其境内全部核武器的决定,\n" +
            "对乌克兰议会于11月16日批准乌克兰作为无核武器国\n" +
            "家加入《不扩散核武器条约》表示赞赏。");
    ArrayList exampleTwoTokenList =
            new ArrayList<>(
                    Arrays.asList(new String[]{"声明","全","文","如下",":","中国","政府","欢迎","乌克兰","销毁","其","境内",
                            "全部","核武器","的","决定",",","对","乌克兰","议会","于","11月","16日","批准","乌克兰","作为",
                            "无核武器","国家","加入","《","不扩散","核武器","条约","》","表示","赞赏","。"}));
    expectedTokenLists.add(exampleTwoTokenList);

    // example 3
    inputStrings.add("协定规定,自协定签署之日起一年后,缔约四国之间\n" +
            "实现澜沧江-湄公河商船通航,缔约任何一方的船舶均可\n" +
            "按照协定的规定在中国的思茅港和老挝的琅勃拉邦港之间\n" +
            "自由航行。");
    ArrayList exampleThreeTokenList =
            new ArrayList<>(
                    Arrays.asList(new String[]{"协定","规定",",","自","协定","签署","之日起","一","年","后",",","缔约","四",
                            "国","之间","实现","澜沧江","-","湄公河","商船","通航",",","缔约","任何","一","方","的","船舶",
                            "均","可","按照","协定","的","规定","在","中国","的","思茅","港","和","老挝","的","琅勃拉邦","港",
                            "之间","自由","航行","。"}));
    expectedTokenLists.add(exampleThreeTokenList);
  }

  public void testChineseSegmentation() {
    int exampleCount = 0;
    for (String inputString : inputStrings) {
      Annotation ann = new Annotation(inputString);
      pipeline.annotate(ann);
      ArrayList<String> foundTokens = new ArrayList<String>();
      for (CoreMap sentence : ann.get(SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          foundTokens.add(token.word());
        }
      }
      // check if the token lists are identical
      //System.err.println(foundTokens.toString());
      //System.err.println(expectedTokenLists.get(exampleCount));
      assertEquals(expectedTokenLists.get(exampleCount), foundTokens);
      exampleCount++;
    }
  }

}