// Document.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.simple;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.naturalli.NaturalLogicAnnotations;
import edu.stanford.nlp.naturalli.OperatorSpec;
import edu.stanford.nlp.naturalli.Polarity;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.*;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.ref.SoftReference;
import java.util.*;
import java.util.function.Function;
import java.util.function.Supplier;

import static edu.stanford.nlp.simple.Sentence.SINGLE_SENTENCE_DOCUMENT;
import static edu.stanford.nlp.pipeline.Annotator.*;

/**
 * A representation of a Document. Most blobs of raw text should become documents.
 *
 * @author Gabor Angeli
 */
@SuppressWarnings("unused")
public class Document {

  /**
   * The baseline {@link java.util.Properties} object, for use with creating default annotators.
   * Despite its name it is not empty: it carries the default settings listed below and an
   * empty {@code annotators} list.
   * Methods in this class compare against this instance by identity
   * ({@code props == EMPTY_PROPS}) to decide whether the shared default annotators can be used.
   */
  static final Properties EMPTY_PROPS = PropertiesUtils.asProperties(
      "language", "english",
      "annotators", "",
      "tokenize.class", "PTBTokenizer",
      "tokenize.language", "en",
      "parse.binaryTrees", "true",
      "mention.type", "dep",
      "coref.mode", "statistical",  // Use the new coref
      "coref.md.type", "dep"
  );


  /**
   * The caseless {@link java.util.Properties} object.
   * It shares the language, tokenizer and binarized-parse settings of {@link #EMPTY_PROPS},
   * and additionally points the POS tagger, parser and NER at the caseless model files
   * listed below (the NER entry is a comma-separated list of three models).
   *
   * @see Document#caseless()
   * @see Sentence#caseless()
   */
  static final Properties CASELESS_PROPS = PropertiesUtils.asProperties(
        "language", "english",
        "annotators", "",
        "tokenize.class", "PTBTokenizer",
        "tokenize.language", "en",
        "parse.binaryTrees", "true",
        "pos.model", "edu/stanford/nlp/models/pos-tagger/wsj-0-18-caseless-left3words-distsim.tagger",
        "parse.model", "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz",
        "ner.model", "edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz," +
                             "edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz," +
                             "edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz");


  /**
   * The backend to use for constructing {@link edu.stanford.nlp.pipeline.Annotator}s.
   * Starts out as a plain {@link AnnotatorImplementations}; it is a mutable static, so
   * replacing it affects every annotator that is created afterwards.
   */
  private static AnnotatorImplementations backend = new AnnotatorImplementations();

  /**
   * The default {@link edu.stanford.nlp.pipeline.TokenizerAnnotator} implementation.
   * Built eagerly, in this field initializer, from the backend assigned just above; being
   * {@code final}, it is not rebuilt if the backend is swapped later.
   */
  private static final Annotator defaultTokenize = backend.tokenizer(EMPTY_PROPS);
  /**
   * The default {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator} implementation.
   * Built eagerly in the same way as the default tokenizer.
   */
  private static final Annotator defaultSSplit = backend.wordToSentences(EMPTY_PROPS);
  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.POSTaggerAnnotator}.
   * The tagger is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultPOS = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.posTagger(EMPTY_PROPS);
      return cached;
    }
  };
  /**
   * The default {@link edu.stanford.nlp.pipeline.MorphaAnnotator} implementation.
   * Unlike the other default suppliers in this class, this one does not memoize: every
   * {@code get()} asks the current backend for a morpha annotator again.
   */
  private static final Supplier<Annotator> defaultLemma = () -> backend.morpha(EMPTY_PROPS, false);

  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.NERCombinerAnnotator}.
   * The annotator is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultNER = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.ner(EMPTY_PROPS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.RegexNERAnnotator}.
   * The annotator is requested from the current backend (under the
   * {@link Annotator#STANFORD_REGEXNER} name) on the first {@code get()}; the same instance
   * is handed back on every later call.
   */
  private static Supplier<Annotator> defaultRegexner = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.tokensRegexNER(EMPTY_PROPS, Annotator.STANFORD_REGEXNER);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.ParserAnnotator}.
   * The parser is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultParse = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.parse(EMPTY_PROPS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.DependencyParseAnnotator}.
   * The parser is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultDepparse = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.dependencies(EMPTY_PROPS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotator}.
   * The annotator is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultNatlog = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.natlog(EMPTY_PROPS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link EntityMentionsAnnotator}.
   * The annotator is requested from the current backend (under the
   * {@link Annotator#STANFORD_ENTITY_MENTIONS} name) on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultEntityMentions = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.entityMentions(EMPTY_PROPS, Annotator.STANFORD_ENTITY_MENTIONS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link KBPAnnotator}.
   * The annotator is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultKBP = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.kbp(EMPTY_PROPS);
      return cached;
    }
  };


  /**
   * Lazily-built default {@link edu.stanford.nlp.naturalli.OpenIE} annotator.
   * The annotator is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultOpenie = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.openie(EMPTY_PROPS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.MentionAnnotator}.
   * The annotator is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultMention = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.mention(EMPTY_PROPS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.CorefAnnotator}.
   * The annotator is requested from the current backend on the first {@code get()}; the same
   * instance is handed back on every later call.
   */
  private static Supplier<Annotator> defaultCoref = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.coref(EMPTY_PROPS);
      return cached;
    }
  };

  /**
   * Lazily-built default {@link edu.stanford.nlp.pipeline.SentimentAnnotator}.
   * The annotator is requested from the current backend (under the
   * {@link Annotator#STANFORD_SENTIMENT} name) on the first {@code get()}; the same instance
   * is handed back on every later call.
   */
  private static Supplier<Annotator> defaultSentiment = new Supplier<Annotator>() {
    /** The memoized annotator; {@code null} until it has been requested. */
    private Annotator cached;

    @Override
    public synchronized Annotator get() {
      if (cached != null) {
        return cached;
      }
      cached = backend.sentiment(EMPTY_PROPS, Annotator.STANFORD_SENTIMENT);
      return cached;
    }
  };


  /**
   * Cache the most recently used custom annotators.
   * This is the shared {@link AnnotatorPool#SINGLETON} instance, not a pool private to
   * this class.
   */
  private static final AnnotatorPool customAnnotators = AnnotatorPool.SINGLETON;


  /**
   * Register a custom annotator with the shared pool and hand back a supplier that looks it
   * up by name. The factory is wrapped with {@code Lazy.cache} before it is registered, and
   * the returned supplier defers the pool lookup until {@code get()} is called.
   * The method is synchronized (on this class) to avoid race conditions when loading the annotators.
   *
   * @param name The name of the annotator.
   * @param props The properties used to create the annotator, if we need to create it.
   * @param annotator The actual function used to make the annotator, if needed.
   *
   * @return A supplier that fetches the annotator registered under the given name.
   */
  private static synchronized Supplier<Annotator> getOrCreate(String name, Properties props, Supplier<Annotator> annotator) {
    Lazy<Annotator> lazyAnnotator = Lazy.cache(annotator);
    customAnnotators.register(name, props, lazyAnnotator);
    return () -> customAnnotators.get(name);
  }

  /**
   * The protocol buffer (builder) representing this document.
   * Several methods in this class synchronize on this object before reading or mutating it.
   */
  protected final CoreNLPProtos.Document.Builder impl;

  /** The list of sentences associated with this document; {@code null} until they have been computed or supplied. */
  protected List<Sentence> sentences = null;

  /** A serializer to assist in serializing and deserializing from Protocol buffers */
  protected final ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(false );

  /**
   * THIS IS NONSTANDARD.
   * An indicator of whether we have run the OpenIE annotator.
   * Unlike most other annotators, it's quite common for a sentence to not have any extracted triples,
   * and therefore it's hard to determine whether we should rerun the annotator based solely on the saved
   * annotation.
   * At the same time, the proto file should not have this flag in it.
   * So, here it is.
   */
  private boolean haveRunOpenie = false;

  /**
   * THIS IS NONSTANDARD.
   * An indicator of whether we have run the KBP annotator.
   * As with OpenIE, it's quite common for a sentence to not have any extracted triples,
   * and therefore it's hard to determine whether we should rerun the annotator based solely on the saved
   * annotation.
   * At the same time, the proto file should not have this flag in it.
   * So, here it is.
   */
  private boolean haveRunKBP = false;

  /**
   * The default properties to use for annotating things (e.g., coref for the document level).
   * Starts as {@link #EMPTY_PROPS}.
   */
  private Properties defaultProps = EMPTY_PROPS;


  /**
   * Set the backend implementations for our CoreNLP pipeline.
   * For example, to a {@link ServerAnnotatorImplementations}.
   * This replaces the static backend shared by all documents; the argument is stored as-is,
   * without a null check.
   *
   * @param backend The backend to use from now on for annotating
   *                documents.
   */
  public static void setBackend(AnnotatorImplementations backend) {
    Document.backend = backend;
  }


  /**
   * Use the CoreNLP Server ({@link StanfordCoreNLPServer}) for the
   * heavyweight backend annotation job.
   * Replaces the static backend with a new {@link ServerAnnotatorImplementations}
   * for the given host and port, without credentials.
   *
   * @param host The hostname of the server.
   * @param port The port the server is running on.
   */
  public static void useServer(String host, int port) {
    backend = new ServerAnnotatorImplementations(host, port);
  }


  /**
   * Use the CoreNLP Server ({@link StanfordCoreNLPServer}) for the
   * heavyweight backend annotation job, authenticating with the given
   * credentials.
   *
   * @param host The hostname of the server.
   * @param port The port the server is running on.
   * @param apiKey The API key to use as the username for authentication
   * @param apiSecret The API secret to use as the password for authentication
   * @param lazy Only run the annotations that are required at this time. If this is
   *             false, we will also run a bunch of standard annotations, to cut down on
   *             expected number of round-trips.
   */
  public static void useServer(String host, int port,
                               String apiKey, String apiSecret,
                               boolean lazy) {
    backend = new ServerAnnotatorImplementations(host, port, apiKey, apiSecret, lazy);
  }


  /**
   * Use the CoreNLP Server with credentials, inferring the port from the host string:
   * 80 when the host starts with {@code "http://"}, 443 otherwise.
   *
   * @param host The hostname of the server.
   * @param apiKey The API key to use as the username for authentication.
   * @param apiSecret The API secret to use as the password for authentication.
   * @param lazy Only run the annotations that are required at this time.
   *
   * @see Document#useServer(String, int, String, String, boolean)
   */
  public static void useServer(String host,
                               String apiKey, String apiSecret,
                               boolean lazy) {
    final int port;
    if (host.startsWith("http://")) {
      port = 80;
    } else {
      port = 443;
    }
    useServer(host, port, apiKey, apiSecret, lazy);
  }

  /**
   * Use the CoreNLP Server with credentials, inferring the port from the host string
   * (80 when the host starts with {@code "http://"}, 443 otherwise) and with lazy
   * annotation switched on.
   *
   * @param host The hostname of the server.
   * @param apiKey The API key to use as the username for authentication.
   * @param apiSecret The API secret to use as the password for authentication.
   *
   * @see Document#useServer(String, int, String, String, boolean)
   */
  public static void useServer(String host,
                               String apiKey, String apiSecret) {
    final boolean lazy = true;
    final int port = host.startsWith("http://") ? 80 : 443;
    useServer(host, port, apiKey, apiSecret, lazy);
  }


  /*
   * A static block that'll automatically fault in the CoreNLP server, if the appropriate environment
   * variables are set.
   * These are:
   *
   * <ul>
   *     <li>CORENLP_HOST</li> -- this is already sufficient to trigger creating a server
   *     <li>CORENLP_PORT</li> -- parsed as an integer; if unset, 80 for an "http://" host and 443 otherwise
   *     <li>CORENLP_KEY</li>
   *     <li>CORENLP_SECRET</li>
   *     <li>CORENLP_LAZY</li> -- forwarded as the "lazy" flag of useServer (true when unset)
   * </ul>
   *
   * Credentials are only used when both the key and the secret are present; otherwise the
   * unauthenticated useServer(host, port) is called and CORENLP_LAZY is not forwarded.
   * A CORENLP_PORT that is not a valid integer makes Integer.parseInt throw from this initializer.
   */
  static {
    final String host    = System.getenv("CORENLP_HOST");
    final String portStr = System.getenv("CORENLP_PORT");
    final String key     = System.getenv("CORENLP_KEY");
    final String secret  = System.getenv("CORENLP_SECRET");
    final String lazystr = System.getenv("CORENLP_LAZY");
    if (host != null) {
      // An explicit port wins; otherwise infer it from the scheme prefix of the host
      final int port;
      if (portStr != null) {
        port = Integer.parseInt(portStr);
      } else {
        port = host.startsWith("http://") ? 80 : 443;
      }
      final boolean lazy = lazystr == null || Boolean.parseBoolean(lazystr);
      if (key == null || secret == null) {
        useServer(host, port);
      } else {
        useServer(host, port, key, secret, lazy);
      }
    }
  }


  /**
   * Create a new document from the passed in text and the given properties.
   * The properties become this document's defaults (used, e.g., by {@link #coref()} and
   * handed to the sentences created for this document).
   *
   * @param props The default properties for this document. If null, {@link #EMPTY_PROPS} is used.
   * @param text The text of the document.
   */
  public Document(Properties props, String text) {
    // Remember the caller's properties; without this the argument would be ignored entirely.
    this.defaultProps = props != null ? props : EMPTY_PROPS;
    this.impl = CoreNLPProtos.Document.newBuilder().setText(text);
  }


  /**
   * Create a new document from the passed in text, using {@link #EMPTY_PROPS}.
   * @param text The text of the document.
   * @see Document#Document(Properties, String)
   */
  public Document(String text) {
    this(EMPTY_PROPS, text);
  }

  /**
   * Convert a CoreNLP Annotation object to a Document.
   * The given properties become this document's defaults and are passed on to every
   * {@link Sentence} created here. If the annotation carries no sentence list, the sentence
   * list is left unset so that it is computed on demand by {@link #sentences()}.
   *
   * @param props The default properties for this document. If null, {@link #EMPTY_PROPS} is
   *              recorded as the default (the value is still passed unchanged to the annotator pool).
   * @param ann The CoreNLP Annotation object.
   */
  @SuppressWarnings("Convert2streamapi")
  public Document(Properties props, Annotation ann) {
    // Record the properties first, so the sentences built below see them rather than EMPTY_PROPS
    this.defaultProps = props != null ? props : EMPTY_PROPS;
    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations());  // cache the annotator pool
    this.impl = new ProtobufAnnotationSerializer(false).toProtoBuilder(ann);
    List<CoreMap> annotatedSentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
    if (annotatedSentences != null) {
      this.sentences = new ArrayList<>(annotatedSentences.size());
      for (CoreMap sentence : annotatedSentences) {
        this.sentences.add(new Sentence(this, this.serializer.toProtoBuilder(sentence), sentence.get(CoreAnnotations.TextAnnotation.class), defaultProps));
      }
    }
  }


  /**
   * Convert a CoreNLP Annotation object to a Document, using {@link #EMPTY_PROPS}.
   * @param ann The CoreNLP Annotation object.
   * @see Document#Document(Properties, Annotation)
   */
  public Document(Annotation ann) {
    this(Document.EMPTY_PROPS, ann);
  }

  /**
   * Create a Document object from a read Protocol Buffer.
   * The given properties become this document's defaults and are passed on to every
   * {@link Sentence} created here. When the proto contains no sentences, the sentence list is
   * left unset so that it is computed on demand by {@link #sentences()}.
   *
   * @see edu.stanford.nlp.simple.Document#serialize()
   * @param props The default properties for this document. If null, {@link #EMPTY_PROPS} is
   *              recorded as the default (the value is still passed unchanged to the annotator pool).
   * @param proto The protocol buffer representing this document.
   */
  @SuppressWarnings("Convert2streamapi")
  public Document(Properties props, CoreNLPProtos.Document proto) {
    // Record the properties first, so the sentences built below see them rather than EMPTY_PROPS
    this.defaultProps = props != null ? props : EMPTY_PROPS;
    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations());  // cache the annotator pool
    this.impl = proto.toBuilder();
    if (proto.getSentenceCount() > 0) {
      this.sentences = new ArrayList<>(proto.getSentenceCount());
      for (CoreNLPProtos.Sentence sentence : proto.getSentenceList()) {
        this.sentences.add(new Sentence(this, sentence.toBuilder(), defaultProps));
      }
    }
  }


  /**
   * Create a Document object from a read Protocol Buffer, using {@link #EMPTY_PROPS}.
   * @param proto The protocol buffer representing this document.
   * @see Document#Document(Properties, CoreNLPProtos.Document)
   */
  public Document(CoreNLPProtos.Document proto) {
    this(Document.EMPTY_PROPS, proto);
  }


  /**
   * Make this document caseless. That is, from now on, run the caseless models
   * on the document by default rather than the standard CoreNLP models.
   * Concretely, this swaps the document's default properties for {@link #CASELESS_PROPS}.
   *
   * @return This same document, but with the default properties swapped out.
   */
  public Document caseless() {
    this.defaultProps = CASELESS_PROPS;
    return this;
  }

  /**
   * Make this document case sensitive.
   * A document is case sensitive by default; this only has an effect if you have previously
   * called {@link Document#caseless()}.
   * Concretely, this resets the document's default properties to {@link #EMPTY_PROPS}.
   *
   * @return This same document, but with the default properties swapped out.
   */
  public Document cased() {
    this.defaultProps = EMPTY_PROPS;
    return this;
  }

  /**
   * Serialize this Document as a Protocol Buffer.
   * This can be deserialized with the constructor {@link Document#Document(edu.stanford.nlp.pipeline.CoreNLPProtos.Document)}.
   * The sentence entries of the underlying builder are rebuilt from the current
   * {@link #sentences()} on every call.
   *
   * @return The document as represented by a Protocol Buffer.
   */
  public CoreNLPProtos.Document serialize() {
    synchronized (impl) {
      // Resolve the sentences BEFORE clearing: on first use sentences() itself appends each
      // sentence to impl, so clearing first would leave every sentence in the proto twice.
      List<Sentence> currentSentences = sentences();
      // Serialize sentences
      this.impl.clearSentence();
      for (Sentence sent : currentSentences) {
        this.impl.addSentence(sent.serialize());
      }
      // Serialize document
      return impl.build();
    }
  }

  /**
   * Write this document to an output stream.
   * Internally, this stores the document as a protocol buffer, and saves that buffer to the output stream
   * in delimited form, so several documents can be written to the same stream one after another.
   * The stream is flushed, but not closed, after writing.
   *
   * @param out The output stream to write to. The stream is not closed after the method returns.
   * @throws IOException Thrown from the underlying write() implementation.
   *
   * @see Document#deserialize(InputStream)
   */
  public void serialize(OutputStream out) throws IOException {
    serialize().writeDelimitedTo(out);
    out.flush();
  }

  /**
   * Read a document from an input stream.
   * This does not close the input stream.
   *
   * @param in The input stream to deserialize from.
   * @return The next document encoded in the input stream.
   * @throws java.io.EOFException If the stream is already exhausted, i.e. there is no further document to read.
   * @throws IOException Thrown by the underlying parse() implementation.
   *
   * @see Document#serialize(java.io.OutputStream)
   */
  public static Document deserialize(InputStream in) throws IOException {
    CoreNLPProtos.Document proto = CoreNLPProtos.Document.parseDelimitedFrom(in);
    if (proto == null) {
      // The delimited parser reports a cleanly exhausted stream as null; surface that as an
      // I/O condition instead of a NullPointerException from inside the constructor.
      throw new java.io.EOFException("No further document in the input stream");
    }
    return new Document(proto);
  }

  /**
   * <p>
   *  Write this annotation as a JSON string.
   *  Optionally, you can also specify a number of operations to call on the document before
   *  dumping it to JSON.
   *  This allows the user to ensure that certain annotations have been computed before the document
   *  is dumped.
   *  For example:
   * </p>
   *
   * <pre>{@code
   *   String json = new Document("Lucy in the sky with diamonds").json(Sentence::parse, Sentence::ner);
   * }</pre>
   *
   * <p>
   *   will create a JSON dump of the document, ensuring that at least the parse tree and ner tags are populated.
   * </p>
   *
   * <p>
   *   The functions are invoked on the first sentence only (presumably the {@link Sentence}
   *   accessors annotate at the document level -- TODO confirm). If the document has no
   *   sentences, the functions are skipped.
   * </p>
   *
   * @param functions The (possibly empty) list of annotations to populate on the document before dumping it
   *                  to JSON.
   * @return The JSON String for this document.
   * @throws RuntimeIOException If the underlying outputter fails with an {@link IOException}.
   */
  @SafeVarargs
  public final String json(Function<Sentence, Object>... functions) {
    // An empty document has no sentence(0); skip the warm-up rather than fail on the index
    if (functions.length > 0 && !this.sentences().isEmpty()) {
      for (Function<Sentence, Object> f : functions) {
        f.apply(this.sentence(0));
      }
    }
    try {
      return new JSONOutputter().print(this.asAnnotation());
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Like the {@link Document#json(Function...)} function, but with minified JSON more suitable
   * for sending over the wire.
   * The functions are invoked on the first sentence only; if the document has no sentences,
   * they are skipped.
   *
   * @param functions The (possibly empty) list of annotations to populate on the document before dumping it
   *                  to JSON.
   * @return The JSON String for this document, without unnecessary whitespace.
   * @throws RuntimeIOException If the underlying outputter fails with an {@link IOException}.
   */
  @SafeVarargs
  public final String jsonMinified(Function<Sentence, Object>... functions) {
    // An empty document has no sentence(0); skip the warm-up rather than fail on the index
    if (functions.length > 0 && !this.sentences().isEmpty()) {
      for (Function<Sentence, Object> f : functions) {
        f.apply(this.sentence(0));
      }
    }
    try {
      AnnotationOutputter.Options options = new AnnotationOutputter.Options();
      options.pretty = false;  // minified output
      return new JSONOutputter().print(this.asAnnotation(), options);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * <p>
   *  Write this annotation as an XML string.
   *  Optionally, you can also specify a number of operations to call on the document before
   *  dumping it to XML.
   *  This allows the user to ensure that certain annotations have been computed before the document
   *  is dumped.
   *  For example:
   * </p>
   *
   * <pre>{@code
   *   String xml = new Document("Lucy in the sky with diamonds").xml(Sentence::parse, Sentence::ner);
   * }</pre>
   *
   * <p>
   *   will create an XML dump of the document, ensuring that at least the parse tree and ner tags are populated.
   * </p>
   *
   * <p>
   *   The functions are invoked on the first sentence only (presumably the {@link Sentence}
   *   accessors annotate at the document level -- TODO confirm). If the document has no
   *   sentences, the functions are skipped.
   * </p>
   *
   * @param functions The (possibly empty) list of annotations to populate on the document before dumping it
   *                  to XML.
   * @return The XML String for this document.
   * @throws RuntimeIOException If the underlying outputter fails with an {@link IOException}.
   */
  @SafeVarargs
  public final String xml(Function<Sentence, Object>... functions) {
    // An empty document has no sentence(0); skip the warm-up rather than fail on the index
    if (functions.length > 0 && !this.sentences().isEmpty()) {
      for (Function<Sentence, Object> f : functions) {
        f.apply(this.sentence(0));
      }
    }
    try {
      return new XMLOutputter().print(this.asAnnotation());
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Like the {@link Document#xml(Function...)} function, but with minified XML more suitable
   * for sending over the wire.
   * The functions are invoked on the first sentence only; if the document has no sentences,
   * they are skipped.
   *
   * @param functions The (possibly empty) list of annotations to populate on the document before dumping it
   *                  to XML.
   * @return The XML String for this document, without unnecessary whitespace.
   * @throws RuntimeIOException If the underlying outputter fails with an {@link IOException}.
   */
  @SafeVarargs
  public final String xmlMinified(Function<Sentence, Object>... functions) {
    // An empty document has no sentence(0); skip the warm-up rather than fail on the index
    if (functions.length > 0 && !this.sentences().isEmpty()) {
      for (Function<Sentence, Object> f : functions) {
        f.apply(this.sentence(0));
      }
    }
    try {
      AnnotationOutputter.Options options = new AnnotationOutputter.Options();
      options.pretty = false;  // minified output
      // NOTE(review): the other dump methods call asAnnotation() without the flag -- confirm the difference is intended
      return new XMLOutputter().print(this.asAnnotation(false), options);
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
  }

  /**
   * Get the sentences in this document, as a list.
   * The tokenizer is chosen here: the shared default tokenizer when {@code props} is the
   * {@link #EMPTY_PROPS} instance itself, otherwise a pooled tokenizer built from {@code props}.
   *
   * @param props The properties to use for the tokenizer and the
   *              {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator}.
   * @return A list of Sentence objects representing the sentences in the document.
   */
  public List<Sentence> sentences(Properties props) {
    final Annotator tokenizer;
    if (props == EMPTY_PROPS) {
      tokenizer = defaultTokenize;
    } else {
      tokenizer = getOrCreate(Annotator.STANFORD_TOKENIZE, props, () -> backend.tokenizer(props)).get();
    }
    return this.sentences(props, tokenizer);
  }

  /**
   * Get the sentences in this document, as a list, tokenizing with the supplied annotator.
   * If the sentences have already been computed they are returned as-is; otherwise the text
   * is tokenized and sentence-split, the document id (if the annotators produced one) is
   * recorded, and each resulting sentence is both cached and appended to the underlying proto.
   *
   * @param props The properties to use in the {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator}.
   * @param tokenizer The annotator used to tokenize the document text.
   * @return A list of Sentence objects representing the sentences in the document.
   */
  protected List<Sentence> sentences(Properties props, Annotator tokenizer) {
    // Already computed (or forced): nothing to do
    if (sentences != null) {
      return sentences;
    }

    final Annotator splitter;
    if (props == EMPTY_PROPS) {
      splitter = defaultSSplit;
    } else {
      splitter = getOrCreate(STANFORD_SSPLIT, props, () -> backend.wordToSentences(props)).get();
    }

    // Tokenize, then split into sentences
    Annotation annotation = new Annotation(this.impl.getText());
    tokenizer.annotate(annotation);
    splitter.annotate(annotation);

    // Carry over the document id, if one was set
    if (annotation.containsKey(CoreAnnotations.DocIDAnnotation.class)) {
      impl.setDocID(annotation.get(CoreAnnotations.DocIDAnnotation.class));
    }

    // Wrap every split sentence and mirror it into the proto
    List<CoreMap> splitSentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    this.sentences = new ArrayList<>(splitSentences.size());
    for (CoreMap raw : splitSentences) {
      Sentence wrapped = new Sentence(this, this.serializer.toProtoBuilder(raw), raw.get(CoreAnnotations.TextAnnotation.class), defaultProps);
      this.sentences.add(wrapped);
      this.impl.addSentence(wrapped.serialize());
    }
    return sentences;
  }

  /**
   * Get the sentences in this document, as a list, using {@link #EMPTY_PROPS}.
   * @return A list of Sentence objects representing the sentences in the document.
   * @see Document#sentences(java.util.Properties)
   */
  public List<Sentence> sentences() {
    return sentences(EMPTY_PROPS);
  }

  /**
   * Get a single sentence of this document by position.
   * @param sentenceIndex The index of the sentence in the list returned by {@link #sentences(java.util.Properties)}.
   * @param props The properties to use when the sentences still have to be computed.
   * @return The sentence at the given index.
   * @see Document#sentences(java.util.Properties)
   */
  public Sentence sentence(int sentenceIndex, Properties props) {
    return sentences(props).get(sentenceIndex);
  }

  /**
   * Get a single sentence of this document by position, using the default properties.
   * @param sentenceIndex The index of the sentence in the list returned by {@link #sentences()}.
   * @return The sentence at the given index.
   * @see Document#sentences(java.util.Properties)
   */
  public Sentence sentence(int sentenceIndex) {
    return sentences().get(sentenceIndex);
  }

  /**
   * Get the raw text of the document, as input by, e.g., {@link Document#Document(String)}.
   * The text is read from the underlying proto builder while holding its lock.
   *
   * @return The text stored in the underlying proto.
   */
  public String text() {
    synchronized (impl) {
      return impl.getText();
    }
  }

  /**
   * Returns the coref chains in the document. This is a map from coref cluster IDs, to the coref chain
   * with that ID.
   * If the underlying proto holds no chains yet, the prerequisites (lemma, NER, and either the
   * dependency or the constituency parse, depending on the configured mention detection type)
   * are run first, followed by the mention and coref annotators; the resulting chains are then
   * stored in the proto. The returned map is rebuilt from the proto on every call.
   *
   * @param props The properties to use for the prerequisite, mention and coref annotators.
   * @return A map from chain ID to coref chain.
   */
  public Map<Integer, CorefChain> coref(Properties props) {
    synchronized (this.impl) {
      if (impl.getCorefChainCount() == 0) {
        // Run prerequisites
        this.runLemma(props).runNER(props);
        if (CorefProperties.mdType(props) == CorefProperties.MentionDetectionType.DEPENDENCY) {
          this.runDepparse(props);
        } else {
          this.runParse(props);
        }
        // Pick the annotators: the shared defaults for the sentinel properties, pooled ones otherwise
        final boolean useDefaults = props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT;
        final Supplier<Annotator> mentionAnnotator =
            useDefaults ? defaultMention : getOrCreate(STANFORD_MENTION, props, () -> backend.mention(props));
        final Supplier<Annotator> corefAnnotator =
            useDefaults ? defaultCoref : getOrCreate(STANFORD_COREF, props, () -> backend.coref(props));
        // Run mention detection, then coref, over the same annotation
        Annotation annotation = asAnnotation(true);
        mentionAnnotator.get().annotate(annotation);
        corefAnnotator.get().annotate(annotation);
        // Convert to proto
        synchronized (serializer) {
          for (CorefChain found : annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
            impl.addCorefChain(serializer.toProto(found));
          }
        }
      }
      // Rebuild the result from whatever the proto now holds
      Map<Integer, CorefChain> chainsById = Generics.newHashMap();
      for (CoreNLPProtos.CorefChain stored : impl.getCorefChainList()) {
        chainsById.put(stored.getChainID(), fromProto(stored));
      }
      return chainsById;
    }
  }

  /**
   * Returns the coref chains in the document, using this document's default properties.
   * @return A map from chain ID to coref chain.
   * @see Document#coref(java.util.Properties)
   */
  public Map<Integer, CorefChain> coref() {
    return coref(defaultProps);
  }

  /**
   * Returns the document id of the document, if one was found.
   *
   * @return The document id stored in the underlying proto, or an empty optional if none is set.
   */
  public Optional<String> docid() {
    synchronized (impl) {
      return impl.hasDocID() ? Optional.of(impl.getDocID()) : Optional.<String>empty();
    }
  }

  /**
   * Sets the document id of the document, returning this.
   *
   * @param docid The document id to store in the underlying proto.
   * @return This same document, to allow chaining.
   */
  public Document setDocid(String docid) {
    synchronized (impl) {
      this.impl.setDocID(docid);
    }
    return this;
  }


  /**
   * <p>
   *   Bypass the tokenizer and sentence splitter -- axiomatically set the sentences for this document.
   *   This is a VERY dangerous method to call if you don't know what you're doing.
   *   The primary use case is for forcing single-sentence documents, where most of the fields in the document
   *   do not matter.
   * </p>
   *
   * <p>
   *   The given list is stored as-is (not copied), and the sentence entries of the underlying
   *   proto are replaced with the serialized form of these sentences.
   * </p>
   *
   * @param sentences The sentences to force for the sentence list of this document.
   */
  void forceSentences(List<Sentence> sentences) {
    this.sentences = sentences;
    synchronized (impl) {
      this.impl.clearSentence();
      sentences.forEach(forced -> this.impl.addSentence(forced.serialize()));
    }
  }



  //
  // Begin helpers
  //

  /**
   * Run the part-of-speech tagger over the whole document, unless the first token of the
   * first sentence already carries a tag.
   *
   * @param props The properties selecting the tagger; the shared default is used for
   *              {@code EMPTY_PROPS} and {@code SINGLE_SENTENCE_DOCUMENT}.
   * @return This same document.
   */
  Document runPOS(Properties props) {
    // Cached result
    boolean alreadyTagged = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawToken(0).hasPos();
    if (alreadyTagged) {
      return this;
    }
    // Prerequisites
    sentences();
    // Run annotator
    final Supplier<Annotator> tagger;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      tagger = defaultPOS;
    } else {
      tagger = getOrCreate(STANFORD_POS, props, () -> backend.posTagger(props));
    }
    Annotation annotation = asAnnotation(false);
    tagger.get().annotate(annotation);
    // Copy the tags back onto our sentences, position by position
    List<CoreMap> tagged = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (int idx = 0; idx < sentences.size(); ++idx) {
      List<CoreLabel> tokens = tagged.get(idx).get(CoreAnnotations.TokensAnnotation.class);
      sentences.get(idx).updateTokens(tokens, (pair) -> pair.first.setPos(pair.second), CoreLabel::tag);
    }
    return this;
  }

  /**
   * Run the lemmatizer over the whole document (after POS tagging), unless the first token
   * of the first sentence already carries a lemma.
   *
   * @param props The properties selecting the annotators; the shared defaults are used for
   *              {@code EMPTY_PROPS} and {@code SINGLE_SENTENCE_DOCUMENT}.
   * @return This same document.
   */
  Document runLemma(Properties props) {
    // Cached result
    boolean alreadyLemmatized = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawToken(0).hasLemma();
    if (alreadyLemmatized) {
      return this;
    }
    // Prerequisites
    runPOS(props);
    // Run annotator
    final Supplier<Annotator> lemmatizer;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      lemmatizer = defaultLemma;
    } else {
      lemmatizer = getOrCreate(STANFORD_LEMMA, props, () -> backend.morpha(props, false));
    }
    Annotation annotation = asAnnotation(true);
    lemmatizer.get().annotate(annotation);
    // Copy the lemmas back onto our sentences, position by position
    List<CoreMap> lemmatized = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (int idx = 0; idx < sentences.size(); ++idx) {
      List<CoreLabel> tokens = lemmatized.get(idx).get(CoreAnnotations.TokensAnnotation.class);
      sentences.get(idx).updateTokens(tokens, (pair) -> pair.first.setLemma(pair.second), CoreLabel::lemma);
    }
    return this;
  }

  /**
   * Fill in a stand-in lemma for every token by copying the token's word, without running a
   * lemmatizer. Does nothing if the first token of the first sentence already carries a lemma.
   * POS tagging is still run as a prerequisite.
   *
   * @param props The properties passed on to the POS tagging step.
   * @return This same document.
   */
  Document mockLemma(Properties props) {
    // Cached result
    boolean alreadyLemmatized = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawToken(0).hasLemma();
    if (alreadyLemmatized) {
      return this;
    }
    // Prerequisites
    runPOS(props);
    // Mock lemma with word
    Annotation annotation = asAnnotation(true);
    List<CoreMap> annotated = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (int idx = 0; idx < sentences.size(); ++idx) {
      List<CoreLabel> tokens = annotated.get(idx).get(CoreAnnotations.TokensAnnotation.class);
      sentences.get(idx).updateTokens(tokens, (pair) -> pair.first.setLemma(pair.second), CoreLabel::word);
    }
    return this;
  }

  /**
   * Run named entity recognition over the whole document (after POS tagging), unless the
   * first token of the first sentence already carries an NER tag.
   *
   * @param props The properties selecting the annotators; the shared defaults are used for
   *              {@code EMPTY_PROPS} and {@code SINGLE_SENTENCE_DOCUMENT}.
   * @return This same document.
   */
  Document runNER(Properties props) {
    // Cached result
    boolean alreadyTagged = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawToken(0).hasNer();
    if (alreadyTagged) {
      return this;
    }
    // Run prerequisites
    runPOS(props);
    // Run annotator
    final Supplier<Annotator> recognizer;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      recognizer = defaultNER;
    } else {
      recognizer = getOrCreate(STANFORD_NER, props, () -> backend.ner(props));
    }
    Annotation annotation = asAnnotation(true);
    recognizer.get().annotate(annotation);
    // Copy the NER tags back onto our sentences, position by position
    List<CoreMap> recognized = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (int idx = 0; idx < sentences.size(); ++idx) {
      List<CoreLabel> tokens = recognized.get(idx).get(CoreAnnotations.TokensAnnotation.class);
      sentences.get(idx).updateTokens(tokens, (pair) -> pair.first.setNer(pair.second), CoreLabel::ner);
    }
    return this;
  }

  /**
   * Run the regex-based NER annotator over the whole document (after regular NER), writing
   * its output into the tokens' NER field. There is no cached-result check here: the
   * annotator is run on every call.
   *
   * @param props The properties selecting the annotators; the shared defaults are used for
   *              {@code EMPTY_PROPS} and {@code SINGLE_SENTENCE_DOCUMENT}.
   * @return This same document.
   */
  Document runRegexner(Properties props) {
    // Run prerequisites
    runNER(props);
    // Run annotator
    final Supplier<Annotator> regexner;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      regexner = defaultRegexner;
    } else {
      regexner = getOrCreate(STANFORD_REGEXNER, props, () -> backend.tokensRegexNER(props, STANFORD_REGEXNER));
    }
    Annotation annotation = asAnnotation(true);
    regexner.get().annotate(annotation);
    // Copy the NER tags back onto our sentences, position by position
    List<CoreMap> recognized = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (int idx = 0; idx < sentences.size(); ++idx) {
      List<CoreLabel> tokens = recognized.get(idx).get(CoreAnnotations.TokensAnnotation.class);
      sentences.get(idx).updateTokens(tokens, (pair) -> pair.first.setNer(pair.second), CoreLabel::ner);
    }
    return this;
  }

  /**
   * Run the constituency parser over the whole document, unless the first sentence already
   * has a parse tree. The parse tree, the binarized tree (when present) and the three
   * dependency graphs produced by the parser are copied back onto every sentence.
   *
   * @param props The properties selecting the parser; the shared default is used for
   *              {@code EMPTY_PROPS} and {@code SINGLE_SENTENCE_DOCUMENT}.
   * @return This same document.
   */
  Document runParse(Properties props) {
    // Cached result
    boolean alreadyParsed = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawSentence().hasParseTree();
    if (alreadyParsed) {
      return this;
    }
    // Pick the parser
    final Supplier<Annotator> parserSupplier;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      parserSupplier = defaultParse;
    } else {
      parserSupplier = getOrCreate(STANFORD_PARSE, props, () -> backend.parse(props));
    }
    final Annotator parser = parserSupplier.get();
    // Run the POS tagger if we are (or may be) using the shift reduce parser;
    // otherwise it is enough to make sure the sentences exist
    final boolean needsTags = parser.requires().contains(CoreAnnotations.PartOfSpeechAnnotation.class)
        || System.getenv("CORENLP_HOST") != null;
    if (needsTags) {
      runPOS(props);
    } else {
      sentences();
    }
    // Run annotator
    Annotation annotation = asAnnotation(needsTags);
    parser.annotate(annotation);
    // Update data
    List<CoreMap> parsed = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    synchronized (serializer) {
      for (int idx = 0; idx < sentences.size(); ++idx) {
        CoreMap source = parsed.get(idx);
        Sentence target = sentences.get(idx);
        Tree tree = source.get(TreeCoreAnnotations.TreeAnnotation.class);
        Tree binaryTree = source.get(TreeCoreAnnotations.BinarizedTreeAnnotation.class);
        target.updateParse(serializer.toProto(tree),
                           binaryTree == null ? null : serializer.toProto(binaryTree));
        target.updateDependencies(
            ProtobufAnnotationSerializer.toProto(source.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)),
            ProtobufAnnotationSerializer.toProto(source.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)),
            ProtobufAnnotationSerializer.toProto(source.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)));
      }
    }
    return this;
  }

  /**
   * Run the dependency parser over this document and store the basic / enhanced / enhanced++
   * dependency graphs on every sentence. The POS step is invoked first as a prerequisite.
   * Does nothing if the first sentence already has basic dependencies.
   *
   * @param props The properties to run the annotator with. The shared default annotator is used when this is
   *              (by identity) {@link #EMPTY_PROPS} or {@link Sentence#SINGLE_SENTENCE_DOCUMENT}; otherwise an
   *              annotator is looked up or created for these properties.
   *
   * @return This same document.
   */
  Document runDepparse(Properties props) {
    final boolean alreadyParsed = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawSentence().hasBasicDependencies();
    if (alreadyParsed) {
      return this;
    }

    // Prerequisite: part-of-speech tags
    runPOS(props);

    // Choose the annotator: the shared default for the stock property objects, a props-specific one otherwise
    final Supplier<Annotator> dependencyParser;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      dependencyParser = defaultDepparse;
    } else {
      dependencyParser = getOrCreate(STANFORD_DEPENDENCIES, props, () -> backend.dependencies(props));
    }
    final Annotation annotated = asAnnotation(true);
    dependencyParser.get().annotate(annotated);

    // Copy the dependency graphs back into the sentences
    synchronized (serializer) {
      final List<CoreMap> parsedSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
      for (int idx = 0; idx < sentences.size(); ++idx) {
        final CoreMap parsed = parsedSentences.get(idx);
        sentences.get(idx).updateDependencies(
            ProtobufAnnotationSerializer.toProto(parsed.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)),
            ProtobufAnnotationSerializer.toProto(parsed.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)),
            ProtobufAnnotationSerializer.toProto(parsed.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)));
      }
    }
    return this;
  }

  /**
   * Run the natural logic annotator over this document and store the polarity and operator
   * annotations on the tokens of every sentence. The lemma and dependency parse steps are invoked
   * first as prerequisites.
   * Does nothing if the first token of the first sentence already has a polarity.
   *
   * @param props The properties to run the annotator with. The shared default annotator is used when this is
   *              (by identity) {@link #EMPTY_PROPS} or {@link Sentence#SINGLE_SENTENCE_DOCUMENT}; otherwise an
   *              annotator is looked up or created for these properties.
   *
   * @return This same document.
   */
  Document runNatlog(Properties props) {
    final boolean alreadyAnnotated = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawToken(0).hasPolarity();
    if (alreadyAnnotated) {
      return this;
    }

    // Prerequisites: lemmas, then dependencies
    runLemma(props);
    runDepparse(props);

    // Choose the annotator: the shared default for the stock property objects, a props-specific one otherwise
    final Supplier<Annotator> naturalLogic;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      naturalLogic = defaultNatlog;
    } else {
      naturalLogic = getOrCreate(STANFORD_NATLOG, props, () -> backend.natlog(props));
    }
    final Annotation annotated = asAnnotation(true);
    naturalLogic.get().annotate(annotated);

    // Copy polarity first, then operator, from each annotated token back into the sentences
    synchronized (serializer) {
      final List<CoreMap> annotatedSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
      for (int idx = 0; idx < sentences.size(); ++idx) {
        final List<CoreLabel> tokens = annotatedSentences.get(idx).get(CoreAnnotations.TokensAnnotation.class);
        final Sentence target = sentences.get(idx);
        target.updateTokens(
            tokens,
            (Pair<CoreNLPProtos.Token.Builder, Polarity> pair) -> pair.first().setPolarity(ProtobufAnnotationSerializer.toProto(pair.second())),
            token -> token.get(NaturalLogicAnnotations.PolarityAnnotation.class));
        target.updateTokens(
            tokens,
            (Pair<CoreNLPProtos.Token.Builder, OperatorSpec> pair) -> pair.first().setOperator(ProtobufAnnotationSerializer.toProto(pair.second())),
            token -> token.get(NaturalLogicAnnotations.OperatorAnnotation.class));
      }
    }
    return this;
  }

  /**
   * Run the OpenIE annotator over this document and store the extracted relation triples on
   * every sentence. The natural logic step is invoked first as a prerequisite.
   * Does nothing if the {@code haveRunOpenie} flag is already set; the flag is set once the
   * triples have been stored.
   *
   * @param props The properties to run the annotator with. The shared default annotator is used when this is
   *              (by identity) {@link #EMPTY_PROPS} or {@link Sentence#SINGLE_SENTENCE_DOCUMENT}; otherwise an
   *              annotator is looked up or created for these properties.
   *
   * @return This same document.
   */
  Document runOpenie(Properties props) {
    if (haveRunOpenie) {
      return this;
    }

    // Prerequisite: natural logic annotations
    runNatlog(props);

    // Choose the annotator: the shared default for the stock property objects, a props-specific one otherwise
    final Supplier<Annotator> extractor;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      extractor = defaultOpenie;
    } else {
      extractor = getOrCreate(STANFORD_OPENIE, props, () -> backend.openie(props));
    }
    final Annotation annotated = asAnnotation(true);
    extractor.get().annotate(annotated);

    // Copy the triples of each annotated sentence back into the sentences
    synchronized (serializer) {
      final List<CoreMap> annotatedSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
      for (int idx = 0; idx < sentences.size(); ++idx) {
        final Collection<RelationTriple> extractions =
            annotatedSentences.get(idx).get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
        sentences.get(idx).updateOpenIE(extractions.stream().map(ProtobufAnnotationSerializer::toProto));
      }
    }

    // Remember that this step is done, so later calls return immediately
    haveRunOpenie = true;
    return this;
  }


  /**
   * Run the KBP annotator over this document and store the resulting relation triples on every
   * sentence. Coref is invoked first as a prerequisite, then the entity mentions annotator and the
   * KBP annotator are applied, in that order, to the same Annotation.
   * Does nothing if the {@code haveRunKBP} flag is already set; the flag is set once the triples
   * have been stored.
   *
   * @param props The properties to run the annotators with. The shared default annotators are used when this is
   *              (by identity) {@link #EMPTY_PROPS} or {@link Sentence#SINGLE_SENTENCE_DOCUMENT}; otherwise
   *              annotators are looked up or created for these properties.
   *
   * @return This same document.
   */
  Document runKBP(Properties props) {
    if (haveRunKBP) {
      return this;
    }
    final boolean stockProps = props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT;

    // Prerequisites: coref, then entity mentions
    coref(props);
    final Supplier<Annotator> mentionAnnotator;
    if (stockProps) {
      mentionAnnotator = defaultEntityMentions;
    } else {
      mentionAnnotator = getOrCreate(STANFORD_ENTITY_MENTIONS, props, () -> backend.entityMentions(props, STANFORD_ENTITY_MENTIONS));
    }
    final Annotation annotated = asAnnotation(true);
    mentionAnnotator.get().annotate(annotated);

    // The KBP annotator itself, applied to the same Annotation
    final Supplier<Annotator> kbpAnnotator;
    if (stockProps) {
      kbpAnnotator = defaultKBP;
    } else {
      kbpAnnotator = getOrCreate(STANFORD_KBP, props, () -> backend.kbp(props));
    }
    kbpAnnotator.get().annotate(annotated);

    // Copy the triples of each annotated sentence back into the sentences
    synchronized (serializer) {
      final List<CoreMap> annotatedSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
      for (int idx = 0; idx < sentences.size(); ++idx) {
        final Collection<RelationTriple> relations =
            annotatedSentences.get(idx).get(CoreAnnotations.KBPTriplesAnnotation.class);
        sentences.get(idx).updateKBP(relations.stream().map(ProtobufAnnotationSerializer::toProto));
      }
    }

    // Remember that this step is done, so later calls return immediately
    haveRunKBP = true;
    return this;
  }


  /**
   * Run the sentiment annotator over this document and store the sentiment class on every sentence.
   * The parse step is invoked first as a prerequisite.
   * Does nothing if the first sentence already has a sentiment.
   *
   * @param props The properties to run the annotator with. The shared default annotator is used when this is
   *              (by identity) {@link #EMPTY_PROPS} or {@link Sentence#SINGLE_SENTENCE_DOCUMENT}; otherwise an
   *              annotator is looked up or created for these properties.
   *
   * @return This same document.
   *
   * @throws IllegalStateException If, after parsing, the first sentence has no binarized parse tree.
   */
  Document runSentiment(Properties props) {
    final boolean alreadyScored = this.sentences != null
        && !this.sentences.isEmpty()
        && this.sentences.get(0).rawSentence().hasSentiment();
    if (alreadyScored) {
      return this;
    }

    // Prerequisite: a parse, which must have yielded a binarized tree
    runParse(props);
    final boolean missingBinarizedTree = this.sentences != null
        && !this.sentences.isEmpty()
        && !this.sentences.get(0).rawSentence().hasBinarizedParseTree();
    if (missingBinarizedTree) {
      throw new IllegalStateException("No binarized parse tree (perhaps it's not supported in this language?)");
    }

    // Fetch the Annotation view first, then choose and run the annotator
    final Annotation annotated = asAnnotation(true);
    final Supplier<Annotator> sentimentAnnotator;
    if (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) {
      sentimentAnnotator = defaultSentiment;
    } else {
      sentimentAnnotator = getOrCreate(STANFORD_SENTIMENT, props, () -> backend.sentiment(props, STANFORD_SENTIMENT));
    }
    sentimentAnnotator.get().annotate(annotated);

    // Copy the sentiment class of each annotated sentence back into the sentences
    synchronized (serializer) {
      final List<CoreMap> annotatedSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
      for (int idx = 0; idx < sentences.size(); ++idx) {
        final String label = annotatedSentences.get(idx).get(SentimentCoreAnnotations.SentimentClass.class);
        sentences.get(idx).updateSentiment(label);
      }
    }
    return this;
  }

  /**
   * View this Document as an {@link Annotation}, always rebuilding it rather than reusing a
   * previously cached copy.
   * Be aware that the Annotation only carries the fields which have already been computed on this
   * document.
   *
   * <p>For that reason, this method is generally NOT recommended.</p>
   *
   * @return An Annotation built from the current state of this document.
   */
  public Annotation asAnnotation() {
    final boolean allowCached = false;
    return asAnnotation(allowCached);
  }


  /**
   * A cached version of this document as an Annotation.
   * This will get garbage collected when necessary, since it is only held through a
   * {@link SoftReference}.
   * Starts out as null (nothing cached yet) and is replaced on every call to
   * {@link #asAnnotation(boolean)}.
   */
  private SoftReference<Annotation> cachedAnnotation = null;

  /**
   * View this Document as an {@link Annotation}, optionally reusing the cached copy.
   * Be aware that the Annotation only carries the fields which have already been computed on this
   * document.
   *
   * <p>For that reason, this method is generally NOT recommended.</p>
   *
   * <p>Whichever Annotation is returned, cached or freshly built, becomes the new cached copy.</p>
   *
   * @param cache If true, allow retrieving this object from the cache. A fresh Annotation is still built
   *              when nothing is cached or the cached copy has been garbage collected.
   *
   * @return The Annotation view of this document.
   */
  Annotation asAnnotation(boolean cache) {
    Annotation result = null;
    // Only consult the cache when the caller allows it
    if (cache && cachedAnnotation != null) {
      result = cachedAnnotation.get();
    }
    // Cache miss (or cache bypassed): rebuild from the serialized form of this document
    if (result == null) {
      result = serializer.fromProto(serialize());
    }
    cachedAnnotation = new SoftReference<>(result);
    return result;
  }


  /**
   * Rebuild a {@link CorefChain} from its protocol buffer form.
   * The proto does not carry the text of each mention span, so the span text is reconstructed from
   * the words of this document's sentences; that dependence on the document is why this is a
   * private instance method.
   *
   * @see edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer#fromProto(edu.stanford.nlp.pipeline.CoreNLPProtos.CorefChain, edu.stanford.nlp.pipeline.Annotation)
   *
   * @param proto The serialized representation of the coref chain, missing information on its mention span string.
   *
   * @return A coreference chain.
   */
  private CorefChain fromProto(CoreNLPProtos.CorefChain proto) {
    final int chainId = proto.getChainID();
    final Map<IntPair, Set<CorefChain.CorefMention>> mentionsByPosition = new HashMap<>();
    CorefChain.CorefMention representative = null;

    final int mentionCount = proto.getMentionCount();
    for (int m = 0; m < mentionCount; ++m) {
      final CoreNLPProtos.CorefChain.CorefMention serialized = proto.getMention(m);
      final int sentenceIndex = serialized.getSentenceIndex();
      final int beginIndex = serialized.getBeginIndex();
      final int endIndex = serialized.getEndIndex();
      final int headIndex = serialized.getHeadIndex();

      // Reconstruct the span text: the words in [beginIndex, endIndex), separated by single spaces
      final Sentence sentence = sentence(sentenceIndex);
      final StringJoiner spanText = new StringJoiner(" ");
      for (int k = beginIndex; k < endIndex; ++k) {
        spanText.add(sentence.word(k));
      }

      // Build the mention; token and sentence indices are passed as the proto's value plus one
      final CorefChain.CorefMention mention = new CorefChain.CorefMention(
          Dictionaries.MentionType.valueOf(serialized.getMentionType()),
          Dictionaries.Number.valueOf(serialized.getNumber()),
          Dictionaries.Gender.valueOf(serialized.getGender()),
          Dictionaries.Animacy.valueOf(serialized.getAnimacy()),
          beginIndex + 1,
          endIndex + 1,
          headIndex + 1,
          chainId,
          serialized.getMentionID(),
          sentenceIndex + 1,
          new IntTuple(new int[]{ sentenceIndex + 1, serialized.getPosition() }),
          spanText.toString());

      // Register the mention under its (sentence, head) position.
      // NOTE(review): the key subtracts one while the mention itself adds one to the same
      // proto indices -- confirm this offset is intended.
      final IntPair position = new IntPair(sentenceIndex - 1, headIndex - 1);
      mentionsByPosition.computeIfAbsent(position, unused -> new HashSet<>()).add(mention);

      // The proto names the representative mention by its index in the mention list
      if (proto.hasRepresentative() && m == proto.getRepresentative()) {
        representative = mention;
      }
    }
    return new CorefChain(chainId, mentionsByPosition, representative);
  }


  /**
   * Two documents are equal when their underlying protocol buffer builders produce equal
   * messages and their sentence lists are equal.
   * The text is compared first as a cheap early exit before the builders are built.
   *
   * <p>The sentence list is compared null-safely: other methods in this class guard against
   * {@code sentences} being null, so it must not be dereferenced unconditionally here.</p>
   *
   * @param o The object to compare against.
   *
   * @return True if {@code o} is a Document equal to this one.
   */
  @SuppressWarnings("SimplifiableIfStatement")
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof Document)) {
      return false;
    }
    Document document = (Document) o;
    // Cheap check first: differing text means the documents cannot be equal
    if (impl.hasText() && !impl.getText().equals(document.impl.getText())) {
      return false;
    }
    // Fully qualified so the name cannot clash with anything from the wildcard imports
    return impl.build().equals(document.impl.build())
        && java.util.Objects.equals(sentences, document.sentences);
  }

  /**
   * Hash on the document text when it is set; otherwise hash on the built protocol buffer message.
   *
   * @return The hash code of this document.
   */
  @Override
  public int hashCode() {
    return impl.hasText()
        ? impl.getText().hashCode()
        : impl.build().hashCode();
  }

  /**
   * A document prints as the text stored in its underlying protocol buffer builder.
   *
   * @return The text of this document.
   */
  @Override
  public String toString() {
    final String text = impl.getText();
    return text;
  }


}