package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.*;
import edu.stanford.nlp.ling.CoreAnnotations.*;
import edu.stanford.nlp.util.*;
import java.util.*;
import junit.framework.TestCase;

/**
 * Regression test for Chinese word segmentation in the CoreNLP pipeline.
 *
 * <p>Runs a tokenize+ssplit pipeline configured from
 * {@code StanfordCoreNLP-chinese.properties} over a fixed set of Chinese
 * input strings and checks that the produced token sequence exactly matches
 * a golden token list for each input.  Requires the Chinese segmenter models
 * to be on the classpath.
 */
public class ChineseSegmenterRegressionITest extends TestCase {

  /** Shared pipeline under test; built once per test in {@link #setUp()}. */
  public StanfordCoreNLP pipeline;

  /** Sample input strings to segment. */
  public ArrayList<String> inputStrings = new ArrayList<String>();

  /** Golden token lists, parallel to {@link #inputStrings}. */
  ArrayList<ArrayList<String>> expectedTokenLists = new ArrayList<ArrayList<String>>();

  /** Adds one (input, expected tokens) fixture pair, keeping the two lists parallel. */
  private void addExample(String input, String... expectedTokens) {
    inputStrings.add(input);
    expectedTokenLists.add(new ArrayList<>(Arrays.asList(expectedTokens)));
  }

  @Override
  public void setUp() {
    // Set up the Chinese pipeline; segmentation happens inside the tokenize annotator,
    // so tokenize + ssplit is all this regression test needs.
    Properties props = StringUtils.argsToProperties("-props", "StanfordCoreNLP-chinese.properties");
    props.setProperty("annotators", "tokenize,ssplit");
    pipeline = new StanfordCoreNLP(props);

    // Reset fixtures so repeated setUp() calls don't accumulate duplicates.
    inputStrings = new ArrayList<>();
    expectedTokenLists = new ArrayList<>();

    // example 1: named entity with interpunct, date token
    addExample("巴拉克·奥巴马是美国总统。他在2008年当选",
        "巴拉克·奥巴马", "是", "美国", "总统", "。", "他", "在", "2008年", "当选");

    // example 2: multi-line input with embedded newlines and full-width punctuation
    addExample("声明全文如下:\n" +
            " \n" +
            " \n" +
            "中国政府欢迎乌克兰销毁其境内全部核武器的决定,\n" +
            "对乌克兰议会于11月16日批准乌克兰作为无核武器国\n" +
            "家加入《不扩散核武器条约》表示赞赏。",
        "声明", "全", "文", "如下", ":", "中国", "政府", "欢迎", "乌克兰", "销毁", "其", "境内",
        "全部", "核武器", "的", "决定", ",", "对", "乌克兰", "议会", "于", "11月", "16日", "批准", "乌克兰", "作为",
        "无核武器", "国家", "加入", "《", "不扩散", "核武器", "条约", "》", "表示", "赞赏", "。");

    // example 3: place names split across line breaks, hyphenated river name
    addExample("协定规定,自协定签署之日起一年后,缔约四国之间\n" +
            "实现澜沧江-湄公河商船通航,缔约任何一方的船舶均可\n" +
            "按照协定的规定在中国的思茅港和老挝的琅勃拉邦港之间\n" +
            "自由航行。",
        "协定", "规定", ",", "自", "协定", "签署", "之日起", "一", "年", "后", ",", "缔约", "四",
        "国", "之间", "实现", "澜沧江", "-", "湄公河", "商船", "通航", ",", "缔约", "任何", "一", "方", "的", "船舶",
        "均", "可", "按照", "协定", "的", "规定", "在", "中国", "的", "思茅", "港", "和", "老挝", "的", "琅勃拉邦", "港",
        "之间", "自由", "航行", "。");
  }

  /**
   * Annotates each sample input, flattens the tokens across all sentences,
   * and asserts the token list matches the golden list for that example.
   */
  public void testChineseSegmentation() {
    int exampleCount = 0;
    for (String inputString : inputStrings) {
      Annotation ann = new Annotation(inputString);
      pipeline.annotate(ann);
      // Collect tokens from every sentence in annotation order.
      ArrayList<String> foundTokens = new ArrayList<String>();
      for (CoreMap sentence : ann.get(SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
          foundTokens.add(token.word());
        }
      }
      // Check that the found token list is identical to the expected one.
      assertEquals(expectedTokenLists.get(exampleCount), foundTokens);
      exampleCount++;
    }
  }
}