// ChineseDocument.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.simple;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.util.Lazy;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Properties;

/**
 * A document running with the Chinese models.
 *
 * @author <a href="mailto:angeli@cs.stanford.edu">Gabor Angeli</a>
 */
public class ChineseDocument extends Document {

  /**
   * The default {@link ChineseSegmenterAnnotator} implementation.
   * Wrapped in a {@link Lazy} so the annotator is only constructed when it is first requested.
   */
  private static final Lazy<Annotator> chineseSegmenter =
      Lazy.of(() -> new ChineseSegmenterAnnotator("segment", segmenterProperties()));

  /**
   * The default {@link java.util.Properties} object for Chinese documents, for use with creating
   * default annotators. It holds the contents of {@code StanfordCoreNLP-chinese.properties},
   * with the language set to Chinese, the annotator list cleared, and binary parse trees enabled.
   */
  static final Properties EMPTY_PROPS = defaultProperties();

  /**
   * Build the properties handed to the default Chinese segmenter.
   *
   * @return A fresh properties object holding the {@code segment.*} settings.
   */
  private static Properties segmenterProperties() {
    Properties props = new Properties();
    props.setProperty("segment.model", "edu/stanford/nlp/models/segmenter/chinese/ctb.gz");
    props.setProperty("segment.sighanCorporaDict", "edu/stanford/nlp/models/segmenter/chinese");
    props.setProperty("segment.serDictionary", "edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz");
    props.setProperty("segment.sighanPostProcessing", "true");
    return props;
  }

  /**
   * Load the Chinese pipeline properties and apply the overrides used by the simple API.
   *
   * @return The populated properties object.
   * @throws RuntimeIOException If the properties file cannot be read (or its stream cannot be closed).
   */
  private static Properties defaultProperties() {
    Properties props = new Properties();
    // Properties.load() leaves the stream open, so close it explicitly via try-with-resources.
    try (InputStream in = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties")) {
      props.load(in);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    // These are set after loading so that they override anything read from the file.
    props.setProperty("language", "chinese");
    props.setProperty("annotators", "");
    props.setProperty("parse.binaryTrees", "true");
    return props;
  }

  /**
   * Create a new document from the passed in text.
   * @param text The text of the document.
   */
  public ChineseDocument(String text) {
    super(ChineseDocument.EMPTY_PROPS, text);
  }

  /**
   * Convert a CoreNLP Annotation object to a Document.
   * @param ann The CoreNLP Annotation object.
   */
  public ChineseDocument(Annotation ann) {
    super(ChineseDocument.EMPTY_PROPS, ann);
  }


  /**
   * Create a Document object from a read Protocol Buffer.
   * @see edu.stanford.nlp.simple.Document#serialize()
   * @param proto The protocol buffer representing this document.
   */
  public ChineseDocument(CoreNLPProtos.Document proto) {
    super(ChineseDocument.EMPTY_PROPS, proto);
  }

  /**
   * Create a new Chinese document from the passed in text and the given properties.
   * @param props The properties to pass to the superclass constructor, in place of {@link #EMPTY_PROPS}.
   * @param text The text of the document.
   */
  protected ChineseDocument(Properties props, String text) {
    super(props, text);
  }


  /**
   * {@inheritDoc}
   *
   * This implementation delegates to the two-argument overload, passing the default Chinese segmenter.
   */
  @Override
  public List<Sentence> sentences(Properties props) {
    return this.sentences(props, chineseSegmenter.get());
  }


  /**
   * No lemma annotator for Chinese -- set the lemma to be the word.
   *
   * @see Document#runLemma(Properties)
   */
  @Override
  protected Document runLemma(Properties props) {
    return mockLemma(props);
  }


  /**
   * No sentiment analysis implemented for Chinese.
   *
   * @throws IllegalArgumentException Always, as sentiment analysis is not implemented for Chinese.
   * @see Document#runSentiment(Properties)
   */
  @Override
  protected Document runSentiment(Properties props) {
    throw new IllegalArgumentException("Sentiment analysis is not implemented for Chinese");
  }

  /**
   * The Neural Dependency Parser doesn't support Chinese yet, so back off to running the
   * constituency parser instead.
   */
  @Override  // TODO(danqi; from Gabor): remove this method when we have a trained NNDep model
  Document runDepparse(Properties props) {
    return runParse(props);
  }
}