package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import org.junit.Test;
import java.io.*;
import java.util.*;
import static org.junit.Assert.*;
/**
 * Integration test that annotates a short Chinese text with the pipeline configured by
 * {@code StanfordCoreNLP-chinese.properties}, round-trips the resulting {@link Annotation}
 * through {@link ProtobufAnnotationSerializer}, and checks that the deserialized document
 * matches the original.
 */
public class ChineseSerializationITest {

  /**
   * Serializes an annotated Chinese document to an in-memory buffer, reads it back, and
   * verifies that
   * <ul>
   *   <li>the per-character labels stored under
   *       {@link SegmenterCoreAnnotations.CharactersAnnotation} are equal, and</li>
   *   <li>the JSON rendering produced by {@link JSONOutputter#jsonPrint} is identical.</li>
   * </ul>
   *
   * @throws RuntimeException wrapping any checked exception raised while building the
   *         pipeline, serializing, deserializing, or printing JSON
   */
  @Test
  public void testChineseSerialization() {
    try {
      AnnotationSerializer serializer = new ProtobufAnnotationSerializer();

      // write Chinese doc
      String sampleChineseDocument = "巴拉克·奥巴马是美国总统。他在2008年当选";
      Properties chineseProperties = StringUtils.argsToProperties("-props",
          "StanfordCoreNLP-chinese.properties");
      Annotation doc = new StanfordCoreNLP(chineseProperties).process(sampleChineseDocument);
      ByteArrayOutputStream ks = new ByteArrayOutputStream();
      serializer.write(doc, ks).close();

      // read the serialized bytes back; try-with-resources closes the stream even if read() throws
      Annotation readDoc;
      try (InputStream kis = new ByteArrayInputStream(ks.toByteArray())) {
        Pair<Annotation, InputStream> pair = serializer.read(kis);
        pair.second.close();
        readDoc = pair.first;
      }

      // check characters are equal
      // The second list must come from readDoc: comparing doc against itself would pass
      // trivially and verify nothing about the round trip.
      List<CoreLabel> docChars = doc.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
      List<CoreLabel> readDocChars =
          readDoc.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
      assertNotNull("original document has no CharactersAnnotation", docChars);
      assertNotNull("deserialized document has no CharactersAnnotation", readDocChars);
      assertEquals(docChars.size(), readDocChars.size());
      for (int currChar = 0; currChar < docChars.size(); currChar++) {
        assertEquals("character label mismatch at index " + currChar,
            docChars.get(currChar), readDocChars.get(currChar));
      }

      // check that sentences are equal
      // NOTE(review): this comparison was already disabled in the original test; the reason
      // is not recorded in this file. Left disabled here -- confirm before re-enabling.
      /*int sentenceCount = 0;
      while (sentenceCount < doc.get(CoreAnnotations.SentencesAnnotation.class).size()) {
        assertEquals(doc.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceCount),
            readDoc.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceCount));
        sentenceCount++;
      }*/

      // check JSON output is same
      String docJSON = JSONOutputter.jsonPrint(doc);
      String readDocJSON = JSONOutputter.jsonPrint(readDoc);
      assertEquals(docJSON, readDocJSON);
    } catch (Exception e) {
      // Assertion failures are Errors and pass through untouched; only genuine exceptions
      // are wrapped, with the cause preserved for the test report.
      throw new RuntimeException("Chinese serialization round trip failed", e);
    }
  }
}