package edu.stanford.nlp.simple; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.pipeline.*; import edu.stanford.nlp.util.Lazy; import java.io.IOException; import java.util.List; import java.util.Properties; /** * A sentence running with the Chinese models. * * @author <a href="mailto:angeli@cs.stanford.edu">Gabor Angeli</a> */ public class ChineseDocument extends Document { /** * The default {@link ChineseSegmenterAnnotator} implementation */ private static final Lazy<Annotator> chineseSegmenter = Lazy.of(() -> new ChineseSegmenterAnnotator("segment", new Properties() {{ setProperty("segment.model", "edu/stanford/nlp/models/segmenter/chinese/ctb.gz"); setProperty("segment.sighanCorporaDict", "edu/stanford/nlp/models/segmenter/chinese"); setProperty("segment.serDictionary", "edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz"); setProperty("segment.sighanPostProcessing", "true"); }})); /** * The empty {@link java.util.Properties} object, for use with creating default annotators. */ static final Properties EMPTY_PROPS = new Properties() {{ try { load(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties")); } catch (IOException e) { throw new RuntimeIOException(e); } setProperty("language", "chinese"); setProperty("annotators", ""); setProperty("parse.binaryTrees", "true"); }}; /** * Create a new document from the passed in text. * @param text The text of the document. */ public ChineseDocument(String text) { super(ChineseDocument.EMPTY_PROPS, text); } /** * Convert a CoreNLP Annotation object to a Document. * @param ann The CoreNLP Annotation object. */ @SuppressWarnings("Convert2streamapi") public ChineseDocument(Annotation ann) { super(ChineseDocument.EMPTY_PROPS, ann); } /** * Create a Document object from a read Protocol Buffer. * @see edu.stanford.nlp.simple.Document#serialize() * @param proto The protocol buffer representing this document. */ public ChineseDocument(CoreNLPProtos.Document proto) { super(ChineseDocument.EMPTY_PROPS, proto); } /** * Create a new chinese document from the passed in text and the given properties. * @param text The text of the document. */ protected ChineseDocument(Properties props, String text) { super(props, text); } /** {@inheritDoc} */ @Override public List<Sentence> sentences(Properties props) { return this.sentences(props, chineseSegmenter.get()); } /** <<<<<<< HEAD * No lemma annotator for Chinese -- set the lemma to be the word. * * @see Document#runLemma(Properties) */ @Override protected Document runLemma(Properties props) { return mockLemma(props); } /** * No sentiment analysis implemented for Chinese. * * @see Document#runSentiment(Properties) */ @Override protected Document runSentiment(Properties props) { throw new IllegalArgumentException("Sentiment analysis is not implemented for Chinese"); } /** * The Neural Dependency Parser doesn't support Chinese yet, so back off to running the * constituency parser instead. */ @Override // TODO(danqi; from Gabor): remove this method when we have a trained NNDep model Document runDepparse(Properties props) { return runParse(props); } }