package edu.stanford.nlp.pipeline;
import junit.framework.TestCase;
import java.util.List;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations.CharactersAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ChineseCharAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.ChineseSegAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
/**
 * Integration test that runs a {@link StanfordCoreNLP} pipeline configured
 * from {@code StanfordCoreNLP-chinese.properties} over short Chinese queries
 * and checks the resulting sentence split, tokens, NER tags, character-level
 * segmentation marks and character offsets.
 */
public class ChineseAnnotationPipelineITest extends TestCase {

  /**
   * Pipeline under test. Static so that the lazy initialisation in
   * {@link #setUp()} really does build it only once: JUnit 3 creates a fresh
   * test-case instance for every test method, so an instance field would
   * always be {@code null} on entry to {@code setUp()}.
   */
  static AnnotationPipeline pipeline = null;

  /**
   * Builds the shared pipeline on first use. The class-level lock guards the
   * null check and the assignment of the static field.
   */
  @Override
  public void setUp() throws Exception {
    synchronized (ChineseAnnotationPipelineITest.class) {
      if (pipeline == null) {
        // This is loaded from the Chinese models jar file.  Editing
        // it directly in the source tree and hoping to see changes
        // will be a very frustrating experience.
        pipeline = new StanfordCoreNLP("StanfordCoreNLP-chinese.properties");
      }
    }
  }

  /**
   * Annotates a single-sentence query and verifies the words, the NER tag of
   * every token, and the per-character annotations stored on the document.
   */
  public void testFullPipeline() {
    String query = "你马上回来北京吗?";
    String[] expectedWords = {"你", "马上", "回来", "北京", "吗", "?"};
    String[] expectedCharacters = {"你", "马", "上", "回", "来",
                                   "北", "京", "吗", "?"};
    // Compared against ChineseSegAnnotation as "1" (true) / "0" (false).
    // In this data the true entries line up with the first character of
    // each expected word.
    boolean[] expectedSegs = {true, true, false, true, false,
                              true, false, true, true};
    String[] expectedNer = {"O", "O", "O", "GPE", "O", "O"};

    // Guard against the expectation tables themselves drifting apart.
    assertEquals(expectedCharacters.length, expectedSegs.length);
    assertEquals(expectedWords.length, expectedNer.length);

    // pipeline is expected to have tokenization, segmentation and ner
    Annotation ann = new Annotation(query);
    pipeline.annotate(ann);

    List<CoreMap> sentences = ann.get(SentencesAnnotation.class);
    assertNotNull(sentences);
    assertEquals(1, sentences.size());

    List<CoreLabel> tokens = sentences.get(0).get(TokensAnnotation.class);
    assertNotNull(tokens);
    assertEquals(expectedWords.length, tokens.size());
    for (int i = 0; i < expectedWords.length; ++i) {
      CoreLabel token = tokens.get(i);
      assertEquals(expectedWords[i], token.word());
      assertEquals(expectedNer[i], token.ner());
    }

    List<CoreLabel> characters = ann.get(CharactersAnnotation.class);
    assertNotNull(characters);
    assertEquals(expectedCharacters.length, characters.size());
    for (int i = 0; i < expectedCharacters.length; ++i) {
      CoreLabel character = characters.get(i);
      assertEquals(expectedCharacters[i],
                   character.get(ChineseCharAnnotation.class));
      assertEquals(expectedSegs[i] ? "1" : "0",
                   character.get(ChineseSegAnnotation.class));
    }
  }

  /**
   * Annotates a two-sentence query and verifies, for every token of every
   * sentence, its word and its begin/end character positions.
   */
  public void testTwoSentences() {
    String query = "你马上回来北京吗?我要回去美国。";
    String[][] expectedWords = { {"你", "马上", "回来", "北京", "吗", "?"},
                                 {"我", "要", "回去", "美国", "。"} };
    // Token boundaries: token j of sentence i is expected to span
    // [expectedPositions[i][j], expectedPositions[i][j + 1]).  The second
    // row continues from 9, i.e. the offsets are relative to the whole
    // query rather than to the start of each sentence.
    int[][] expectedPositions = { {0, 1, 3, 5, 7, 8, 9},
                                  {9, 10, 11, 13, 15, 16} };
    assertEquals(expectedWords.length, expectedPositions.length);

    Annotation ann = new Annotation(query);
    pipeline.annotate(ann);

    List<CoreMap> sentences = ann.get(SentencesAnnotation.class);
    assertNotNull(sentences);
    assertEquals(expectedWords.length, sentences.size());

    for (int i = 0; i < expectedWords.length; ++i) {
      // One boundary more than there are tokens.
      assertEquals(expectedWords[i].length + 1, expectedPositions[i].length);

      List<CoreLabel> tokens = sentences.get(i).get(TokensAnnotation.class);
      assertNotNull(tokens);
      assertEquals(expectedWords[i].length, tokens.size());

      // Iterate over the tokens of sentence i.  The bound must be the
      // length of the inner array; using expectedWords.length (the number
      // of sentences) would silently check only the first two tokens.
      for (int j = 0; j < expectedWords[i].length; ++j) {
        CoreLabel token = tokens.get(j);
        assertEquals(expectedWords[i][j], token.word());
        assertEquals(expectedPositions[i][j], token.beginPosition());
        assertEquals(expectedPositions[i][j + 1], token.endPosition());
      }
    }
  }
}