package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import org.junit.Test;

import java.io.*;
import java.util.*;

import static org.junit.Assert.*;

/**
 * Integration test that round-trips an annotated Chinese document through
 * {@link ProtobufAnnotationSerializer} and checks that the deserialized copy
 * matches the original.
 */
public class ChineseSerializationITest {

  /**
   * Annotates a short Chinese text with the pipeline configured by
   * {@code StanfordCoreNLP-chinese.properties}, serializes the result to an
   * in-memory buffer, reads it back, and then verifies that
   * <ul>
   *   <li>the per-character labels of the original and the deserialized
   *       document are equal, and</li>
   *   <li>both documents produce the same JSON output.</li>
   * </ul>
   * Any checked exception raised along the way is rethrown wrapped in a
   * {@link RuntimeException} so that the test is reported as an error.
   */
  @Test
  public void testChineseSerialization() {
    try {
      AnnotationSerializer serializer = new ProtobufAnnotationSerializer();

      // write Chinese doc
      String sampleChineseDocument = "巴拉克·奥巴马是美国总统。他在2008年当选";
      Properties chineseProperties =
          StringUtils.argsToProperties("-props", "StanfordCoreNLP-chinese.properties");
      Annotation doc = new StanfordCoreNLP(chineseProperties).process(sampleChineseDocument);
      ByteArrayOutputStream ks = new ByteArrayOutputStream();
      serializer.write(doc, ks).close();

      // read
      InputStream kis = new ByteArrayInputStream(ks.toByteArray());
      Pair<Annotation, InputStream> pair = serializer.read(kis);
      pair.second.close();
      Annotation readDoc = pair.first;
      kis.close();

      // check characters are equal
      List<CoreLabel> docChars = doc.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
      // Must come from readDoc: fetching from doc again would compare the
      // original document with itself and make this check vacuous.
      List<CoreLabel> readDocChars =
          readDoc.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
      assertNotNull("original document has no characters annotation", docChars);
      assertNotNull("deserialized document has no characters annotation", readDocChars);
      assertEquals("character count differs after round trip",
          docChars.size(), readDocChars.size());
      for (int i = 0; i < docChars.size(); i++) {
        assertEquals("character " + i + " differs after round trip",
            docChars.get(i), readDocChars.get(i));
      }

      // check that sentences are equal
      // NOTE(review): this check is disabled in the original source; the reason
      // is not recorded there. Left commented out to preserve behavior.
      /*int sentenceCount = 0;
      while (sentenceCount < doc.get(CoreAnnotations.SentencesAnnotation.class).size()) {
        assertEquals(doc.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceCount),
            readDoc.get(CoreAnnotations.SentencesAnnotation.class).get(sentenceCount));
        sentenceCount++;
      }*/

      // check JSON output is same
      String docJSON = JSONOutputter.jsonPrint(doc);
      String readDocJSON = JSONOutputter.jsonPrint(readDoc);
      assertEquals(docJSON, readDocJSON);
    } catch (Exception e) {
      // Assertion failures are Errors and pass through untouched; only
      // exceptions from the pipeline/serializer are wrapped here.
      throw new RuntimeException(e);
    }
  }
}