package edu.stanford.nlp.simple;
import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.naturalli.NaturalLogicAnnotations;
import edu.stanford.nlp.naturalli.OperatorSpec;
import edu.stanford.nlp.naturalli.Polarity;
import edu.stanford.nlp.pipeline.*;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.util.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.ref.SoftReference;
import java.util.*;
import java.util.function.Function;
import java.util.function.Supplier;
import static edu.stanford.nlp.simple.Sentence.SINGLE_SENTENCE_DOCUMENT;
import static edu.stanford.nlp.pipeline.Annotator.*;
/**
* A representation of a Document. Most blobs of raw text should become documents.
*
* @author Gabor Angeli
*/
@SuppressWarnings("unused")
public class Document {
/**
* The empty {@link java.util.Properties} object, for use with creating default annotators.
*/
static final Properties EMPTY_PROPS = PropertiesUtils.asProperties(
"language", "english",
"annotators", "",
"tokenize.class", "PTBTokenizer",
"tokenize.language", "en",
"parse.binaryTrees", "true",
"mention.type", "dep",
"coref.mode", "statistical", // Use the new coref
"coref.md.type", "dep"
);
/**
* The caseless {@link java.util.Properties} object.
*
* @see Document#caseless()
* @see Sentence#caseless()
*/
static final Properties CASELESS_PROPS = PropertiesUtils.asProperties(
"language", "english",
"annotators", "",
"tokenize.class", "PTBTokenizer",
"tokenize.language", "en",
"parse.binaryTrees", "true",
"pos.model", "edu/stanford/nlp/models/pos-tagger/wsj-0-18-caseless-left3words-distsim.tagger",
"parse.model", "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz",
"ner.model", "edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz," +
"edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz," +
"edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz");
/**
* The backend to use for constructing {@link edu.stanford.nlp.pipeline.Annotator}s.
*/
private static AnnotatorImplementations backend = new AnnotatorImplementations();
/**
* The default {@link edu.stanford.nlp.pipeline.TokenizerAnnotator} implementation
*/
private static final Annotator defaultTokenize = backend.tokenizer(EMPTY_PROPS);
/**
* The default {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator} implementation
*/
private static final Annotator defaultSSplit = backend.wordToSentences(EMPTY_PROPS);
/**
* The default {@link edu.stanford.nlp.pipeline.POSTaggerAnnotator} implementation
*/
private static Supplier<Annotator> defaultPOS = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.posTagger(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.pipeline.MorphaAnnotator} implementation
*/
private static final Supplier<Annotator> defaultLemma = () -> backend.morpha(EMPTY_PROPS, false);
/**
* The default {@link edu.stanford.nlp.pipeline.NERCombinerAnnotator} implementation
*/
private static Supplier<Annotator> defaultNER = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.ner(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.pipeline.RegexNERAnnotator} implementation
*/
private static Supplier<Annotator> defaultRegexner = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.tokensRegexNER(EMPTY_PROPS, Annotator.STANFORD_REGEXNER);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.pipeline.ParserAnnotator} implementation
*/
private static Supplier<Annotator> defaultParse = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.parse(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.pipeline.DependencyParseAnnotator} implementation
*/
private static Supplier<Annotator> defaultDepparse = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.dependencies(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.naturalli.NaturalLogicAnnotator} implementation
*/
private static Supplier<Annotator> defaultNatlog = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.natlog(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link EntityMentionsAnnotator} implementation
*/
private static Supplier<Annotator> defaultEntityMentions = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.entityMentions(EMPTY_PROPS, Annotator.STANFORD_ENTITY_MENTIONS);
}
return impl;
}
};
/**
* The default {@link KBPAnnotator} implementation
*/
private static Supplier<Annotator> defaultKBP = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.kbp(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.naturalli.OpenIE} implementation
*/
private static Supplier<Annotator> defaultOpenie = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.openie(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.pipeline.MentionAnnotator} implementation
*/
private static Supplier<Annotator> defaultMention = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.mention(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.pipeline.CorefAnnotator} implementation
*/
private static Supplier<Annotator> defaultCoref = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.coref(EMPTY_PROPS);
}
return impl;
}
};
/**
* The default {@link edu.stanford.nlp.pipeline.SentimentAnnotator} implementation
*/
private static Supplier<Annotator> defaultSentiment = new Supplier<Annotator>() {
Annotator impl = null;
@Override
public synchronized Annotator get() {
if (impl == null) {
impl = backend.sentiment(EMPTY_PROPS, Annotator.STANFORD_SENTIMENT);
}
return impl;
}
};
/**
* Cache the most recently used custom annotators.
*/
private static final AnnotatorPool customAnnotators = AnnotatorPool.SINGLETON;
/**
* Either get a custom annotator which was recently defined, or create it if it has never been defined.
* This method is synchronized to avoid race conditions when loading the annotators.
*
* @param name The name of the annotator.
* @param props The properties used to create the annotator, if we need to create it.
* @param annotator The actual function used to make the annotator, if needed.
*
* @return An annotator as specified by the given name and properties.
*/
private synchronized static Supplier<Annotator> getOrCreate(String name, Properties props, Supplier<Annotator> annotator) {
customAnnotators.register(name, props, Lazy.cache(annotator));
return () -> customAnnotators.get(name);
}
/** The protocol buffer representing this document */
protected final CoreNLPProtos.Document.Builder impl;
/** The list of sentences associated with this document */
protected List<Sentence> sentences = null;
/** A serializer to assist in serializing and deserializing from Protocol buffers */
protected final ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer(false );
/**
* THIS IS NONSTANDARD.
* An indicator of whether we have run the OpenIE annotator.
* Unlike most other annotators, it's quite common for a sentence to not have any extracted triples,
* and therefore it's hard to determine whether we should rerun the annotator based solely on the saved
* annotation.
* At the same time, the proto file should not have this flag in it.
* So, here it is.
*/
private boolean haveRunOpenie = false;
/**
* THIS IS NONSTANDARD.
* An indicator of whether we have run the KBP annotator.
* Unlike most other annotators, it's quite common for a sentence to not have any extracted triples,
* and therefore it's hard to determine whether we should rerun the annotator based solely on the saved
* annotation.
* At the same time, the proto file should not have this flag in it.
* So, here it is.
*/
private boolean haveRunKBP = false;
/** The default properties to use for annotating things (e.g., coref for the document level) */
private Properties defaultProps = EMPTY_PROPS;
/**
* Set the backend implementations for our CoreNLP pipeline.
* For example, to a {@link ServerAnnotatorImplementations}.
*
* @param backend The backend to use from now on for annotating
* documents.
*/
public static void setBackend(AnnotatorImplementations backend) {
Document.backend = backend;
}
/**
* Use the CoreNLP Server ({@link StanfordCoreNLPServer}) for the
* heavyweight backend annotation job.
*
* @param host The hostname of the server.
* @param port The port the server is running on.
*/
public static void useServer(String host, int port) {
backend = new ServerAnnotatorImplementations(host, port);
}
/**
* Use the CoreNLP Server ({@link StanfordCoreNLPServer}) for the
* heavyweight backend annotation job, authenticating with the given
* credentials.
*
* @param host The hostname of the server.
* @param port The port the server is running on.
* @param apiKey The api key to use as the username for authentication
* @param apiSecret The api secrete to use as the password for authentication
* @param lazy Only run the annotations that are required at this time. If this is
* false, we will also run a bunch of standard annotations, to cut down on
* expected number of round-trips.
*/
public static void useServer(String host, int port,
String apiKey, String apiSecret,
boolean lazy) {
backend = new ServerAnnotatorImplementations(host, port, apiKey, apiSecret, lazy);
}
/** @see Document#useServer(String, int, String, String, boolean) */
public static void useServer(String host,
String apiKey, String apiSecret,
boolean lazy) {
useServer(host, host.startsWith("http://") ? 80 : 443, apiKey, apiSecret, lazy);
}
/** @see Document#useServer(String, int, String, String, boolean) */
public static void useServer(String host,
String apiKey, String apiSecret) {
useServer(host, host.startsWith("http://") ? 80 : 443, apiKey, apiSecret, true);
}
/*
* A static block that'll automatically fault in the CoreNLP server, if the appropriate environment
* variables are set.
* These are:
*
* <ul>
* <li>CORENLP_HOST</li> -- this is already sufficient to trigger creating a server
* <li>CORENLP_PORT</li>
* <li>CORENLP_KEY</li>
* <li>CORENLP_SECRET</li>
* <li>CORENLP_LAZY</li> (if true, do as much annotation on a single round-trip as possible)
* </ul>
*/
static {
String host = System.getenv("CORENLP_HOST");
String portStr = System.getenv("CORENLP_PORT");
String key = System.getenv("CORENLP_KEY");
String secret = System.getenv("CORENLP_SECRET");
String lazystr = System.getenv("CORENLP_LAZY");
if (host != null) {
int port = 443;
if (portStr == null) {
if (host.startsWith("http://")) {
port = 80;
}
} else {
port = Integer.parseInt(portStr);
}
boolean lazy = true;
if (lazystr != null) {
lazy = Boolean.parseBoolean(lazystr);
}
if (key != null && secret != null) {
useServer(host, port, key, secret, lazy);
} else {
useServer(host, port);
}
}
}
/**
* Create a new document from the passed in text and the given properties.
* @param text The text of the document.
*/
public Document(Properties props, String text) {
this.impl = CoreNLPProtos.Document.newBuilder().setText(text);
}
/**
* Create a new document from the passed in text.
* @param text The text of the document.
*/
public Document(String text) {
this(EMPTY_PROPS, text);
}
/**
* Convert a CoreNLP Annotation object to a Document.
* @param ann The CoreNLP Annotation object.
*/
@SuppressWarnings("Convert2streamapi")
public Document(Properties props, Annotation ann) {
StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
this.impl = new ProtobufAnnotationSerializer(false).toProtoBuilder(ann);
List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
this.sentences = new ArrayList<>(sentences.size());
for (CoreMap sentence : sentences) {
this.sentences.add(new Sentence(this, this.serializer.toProtoBuilder(sentence), sentence.get(CoreAnnotations.TextAnnotation.class), defaultProps));
}
}
/** @see Document#Document(Properties, Annotation) */
public Document(Annotation ann) {
this(Document.EMPTY_PROPS, ann);
}
/**
* Create a Document object from a read Protocol Buffer.
* @see edu.stanford.nlp.simple.Document#serialize()
* @param proto The protocol buffer representing this document.
*/
@SuppressWarnings("Convert2streamapi")
public Document(Properties props, CoreNLPProtos.Document proto) {
StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
this.impl = proto.toBuilder();
if (proto.getSentenceCount() > 0) {
this.sentences = new ArrayList<>(proto.getSentenceCount());
for (CoreNLPProtos.Sentence sentence : proto.getSentenceList()) {
this.sentences.add(new Sentence(this, sentence.toBuilder(), defaultProps));
}
}
}
/** @see Document#Document(Properties, CoreNLPProtos.Document) */
public Document(CoreNLPProtos.Document proto) {
this(Document.EMPTY_PROPS, proto);
}
/**
* Make this document caseless. That is, from now on, run the caseless models
* on the document by default rather than the standard CoreNLP models.
*
* @return This same document, but with the default properties swapped out.
*/
public Document caseless() {
this.defaultProps = CASELESS_PROPS;
return this;
}
/**
* Make this document case sensitive.
* A document is case sensitive by default; this only has an effect if you have previously
* called {@link Sentence#caseless()}.
*
* @return This same document, but with the default properties swapped out.
*/
public Document cased() {
this.defaultProps = EMPTY_PROPS;
return this;
}
/**
* Serialize this Document as a Protocol Buffer.
* This can be deserialized with the constructor {@link Document#Document(edu.stanford.nlp.pipeline.CoreNLPProtos.Document)}.
*
* @return The document as represented by a Protocol Buffer.
*/
public CoreNLPProtos.Document serialize() {
synchronized (impl) {
// Serialize sentences
this.impl.clearSentence();
for (Sentence sent : sentences()) {
this.impl.addSentence(sent.serialize());
}
// Serialize document
return impl.build();
}
}
/**
* Write this document to an output stream.
* Internally, this stores the document as a protocol buffer, and saves that buffer to the output stream.
* This method does not close the stream after writing.
*
* @param out The output stream to write to. The stream is not closed after the method returns.
* @throws IOException Thrown from the underlying write() implementation.
*
* @see Document#deserialize(InputStream)
*/
public void serialize(OutputStream out) throws IOException {
serialize().writeDelimitedTo(out);
out.flush();
}
/**
* Read a document from an input stream.
* This does not close the input stream.
*
* @param in The input stream to deserialize from.
* @return The next document encoded in the input stream.
* @throws IOException Thrown by the underlying parse() implementation.
*
* @see Document#serialize(java.io.OutputStream)
*/
public static Document deserialize(InputStream in) throws IOException {
return new Document(CoreNLPProtos.Document.parseDelimitedFrom(in));
}
/**
* <p>
* Write this annotation as a JSON string.
* Optionally, you can also specify a number of operations to call on the document before
* dumping it to JSON.
* This allows the user to ensure that certain annotations have been computed before the document
* is dumped.
* For example:
* </p>
*
* <pre>{@code
* String json = new Document("Lucy in the sky with diamonds").json(Sentence::parse, Sentence::ner);
* }</pre>
*
* <p>
* will create a JSON dump of the document, ensuring that at least the parse tree and ner tags are populated.
* </p>
*
* @param functions The (possibly empty) list of annotations to populate on the document before dumping it
* to JSON.
* @return The JSON String for this document.
*/
@SafeVarargs
public final String json(Function<Sentence, Object>... functions) {
for (Function<Sentence, Object> f : functions) {
f.apply(this.sentence(0));
}
try {
return new JSONOutputter().print(this.asAnnotation());
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
/**
* Like the {@link Document@json(Function...)} function, but with minified JSON more suitable
* for sending over the wire.
*
* @param functions The (possibly empty) list of annotations to populate on the document before dumping it
* to JSON.
* @return The JSON String for this document, without unnecessary whitespace.
*
*/
@SafeVarargs
public final String jsonMinified(Function<Sentence, Object>... functions) {
for (Function<Sentence, Object> f : functions) {
f.apply(this.sentence(0));
}
try {
AnnotationOutputter.Options options = new AnnotationOutputter.Options();
options.pretty = false;
return new JSONOutputter().print(this.asAnnotation(), options);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
/**
* <p>
* Write this annotation as an XML string.
* Optionally, you can also specify a number of operations to call on the document before
* dumping it to XML.
* This allows the user to ensure that certain annotations have been computed before the document
* is dumped.
* For example:
* </p>
*
* <pre>{@code
* String xml = new Document("Lucy in the sky with diamonds").xml(Document::parse, Document::ner);
* }</pre>
*
* <p>
* will create a XML dump of the document, ensuring that at least the parse tree and ner tags are populated.
* </p>
*
* @param functions The (possibly empty) list of annotations to populate on the document before dumping it
* to XML.
* @return The XML String for this document.
*/
@SafeVarargs
public final String xml(Function<Sentence, Object>... functions) {
for (Function<Sentence, Object> f : functions) {
f.apply(this.sentence(0));
}
try {
return new XMLOutputter().print(this.asAnnotation());
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
/**
* Like the {@link Document@xml(Function...)} function, but with minified XML more suitable
* for sending over the wire.
*
* @param functions The (possibly empty) list of annotations to populate on the document before dumping it
* to XML.
* @return The XML String for this document, without unecessary whitespace.
*
*/
@SafeVarargs
public final String xmlMinified(Function<Sentence, Object>... functions) {
for (Function<Sentence, Object> f : functions) {
f.apply(this.sentence(0));
}
try {
AnnotationOutputter.Options options = new AnnotationOutputter.Options();
options.pretty = false;
return new XMLOutputter().print(this.asAnnotation(false), options);
} catch (IOException e) {
throw new RuntimeIOException(e);
}
}
/**
* Get the sentences in this document, as a list.
* @param props The properties to use in the {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator}.
* @return A list of Sentence objects representing the sentences in the document.
*/
public List<Sentence> sentences(Properties props) {
return this.sentences(props,
props == EMPTY_PROPS ? defaultTokenize : getOrCreate(Annotator.STANFORD_TOKENIZE, props, () -> backend.tokenizer(props)).get());
}
/**
* Get the sentences in this document, as a list.
* @param props The properties to use in the {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator}.
* @return A list of Sentence objects representing the sentences in the document.
*/
protected List<Sentence> sentences(Properties props, Annotator tokenizer) {
if (sentences == null) {
Annotator ssplit = props == EMPTY_PROPS ? defaultSSplit : getOrCreate(STANFORD_SSPLIT, props, () -> backend.wordToSentences(props)).get();
// Annotate
Annotation ann = new Annotation(this.impl.getText());
tokenizer.annotate(ann);
ssplit.annotate(ann);
// Grok results
// (docid)
if (ann.containsKey(CoreAnnotations.DocIDAnnotation.class)) {
impl.setDocID(ann.get(CoreAnnotations.DocIDAnnotation.class));
}
// (sentences)
List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
this.sentences = new ArrayList<>(sentences.size());
for (CoreMap sentence : sentences) {
//Sentence sent = new Sentence(this, sentence);
Sentence sent = new Sentence(this, this.serializer.toProtoBuilder(sentence), sentence.get(CoreAnnotations.TextAnnotation.class), defaultProps);
this.sentences.add(sent);
this.impl.addSentence(sent.serialize());
}
}
return sentences;
}
/** @see Document#sentences(java.util.Properties) */
public List<Sentence> sentences() {
return sentences(EMPTY_PROPS);
}
/** @see Document#sentences(java.util.Properties) */
public Sentence sentence(int sentenceIndex, Properties props) {
return sentences(props).get(sentenceIndex);
}
/** @see Document#sentences(java.util.Properties) */
public Sentence sentence(int sentenceIndex) {
return sentences().get(sentenceIndex);
}
/** Get the raw text of the document, as input by, e.g., {@link Document#Document(String)}. */
public String text() {
synchronized (impl) {
return impl.getText();
}
}
/**
* Returns the coref chains in the document. This is a map from coref cluster IDs, to the coref chain
* with that ID.
* @param props The properties to use in the {@link edu.stanford.nlp.pipeline.DeterministicCorefAnnotator}.
*/
public Map<Integer, CorefChain> coref(Properties props) {
synchronized (this.impl) {
if (impl.getCorefChainCount() == 0) {
// Run prerequisites
this.runLemma(props).runNER(props);
if (CorefProperties.mdType(props) != CorefProperties.MentionDetectionType.DEPENDENCY) {
this.runParse(props);
} else {
this.runDepparse(props);
}
// Run mention
Supplier<Annotator> mention = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultMention : getOrCreate(STANFORD_MENTION, props, () -> backend.mention(props));
// Run coref
Supplier<Annotator> coref = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultCoref : getOrCreate(STANFORD_COREF, props, () -> backend.coref(props));
Annotation ann = asAnnotation(true);
mention.get().annotate(ann);
coref.get().annotate(ann);
// Convert to proto
synchronized (serializer) {
for (CorefChain chain : ann.get(CorefCoreAnnotations.CorefChainAnnotation.class).values()) {
impl.addCorefChain(serializer.toProto(chain));
}
}
}
Map<Integer, CorefChain> corefs = Generics.newHashMap();
for (CoreNLPProtos.CorefChain chain : impl.getCorefChainList()) {
corefs.put(chain.getChainID(), fromProto(chain));
}
return corefs;
}
}
/** @see Document#coref(java.util.Properties) */
public Map<Integer, CorefChain> coref() {
return coref(defaultProps);
}
/** Returns the document id of the document, if one was found */
public Optional<String> docid() {
synchronized (impl) {
if (impl.hasDocID()) {
return Optional.of(impl.getDocID());
} else {
return Optional.empty();
}
}
}
/** Sets the document id of the document, returning this. */
public Document setDocid(String docid) {
synchronized (impl) {
this.impl.setDocID(docid);
}
return this;
}
/**
* <p>
* Bypass the tokenizer and sentence splitter -- axiomatically set the sentences for this document.
* This is a VERY dangerous method to call if you don't know what you're doing.
* The primary use case is for forcing single-sentence documents, where most of the fields in the document
* do not matter.
* </p>
*
* @param sentences The sentences to force for the sentence list of this document.
*/
void forceSentences(List<Sentence> sentences) {
this.sentences = sentences;
synchronized (impl) {
this.impl.clearSentence();
for (Sentence sent : sentences) {
this.impl.addSentence(sent.serialize());
}
}
}
//
// Begin helpers
//
Document runPOS(Properties props) {
// Cached result
if (this.sentences != null && this.sentences.size() > 0 && this.sentences.get(0).rawToken(0).hasPos()) {
return this;
}
// Prerequisites
sentences();
// Run annotator
Supplier<Annotator> pos = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultPOS : getOrCreate(STANFORD_POS, props, () -> backend.posTagger(props));
Annotation ann = asAnnotation(false);
pos.get().annotate(ann);
// Update data
for (int i = 0; i < sentences.size(); ++i) {
sentences.get(i).updateTokens(ann.get(CoreAnnotations.SentencesAnnotation.class).get(i).get(CoreAnnotations.TokensAnnotation.class), (pair) -> pair.first.setPos(pair.second), CoreLabel::tag);
}
return this;
}
Document runLemma(Properties props) {
// Cached result
if (this.sentences != null && this.sentences.size() > 0 && this.sentences.get(0).rawToken(0).hasLemma()) {
return this;
}
// Prerequisites
runPOS(props);
// Run annotator
Supplier<Annotator> lemma = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultLemma : getOrCreate(STANFORD_LEMMA, props, () -> backend.morpha(props, false));
Annotation ann = asAnnotation(true);
lemma.get().annotate(ann);
// Update data
for (int i = 0; i < sentences.size(); ++i) {
sentences.get(i).updateTokens(ann.get(CoreAnnotations.SentencesAnnotation.class).get(i).get(CoreAnnotations.TokensAnnotation.class), (pair) -> pair.first.setLemma(pair.second), CoreLabel::lemma);
}
return this;
}
Document mockLemma(Properties props) {
// Cached result
if (this.sentences != null && this.sentences.size() > 0 && this.sentences.get(0).rawToken(0).hasLemma()) {
return this;
}
// Prerequisites
runPOS(props);
// Mock lemma with word
Annotation ann = asAnnotation(true);
for (int i = 0; i < sentences.size(); ++i) {
sentences.get(i).updateTokens(ann.get(CoreAnnotations.SentencesAnnotation.class).get(i).get(CoreAnnotations.TokensAnnotation.class), (pair) -> pair.first.setLemma(pair.second), CoreLabel::word);
}
return this;
}
Document runNER(Properties props) {
if (this.sentences != null && this.sentences.size() > 0 && this.sentences.get(0).rawToken(0).hasNer()) {
return this;
}
// Run prerequisites
runPOS(props);
// Run annotator
Supplier<Annotator> ner = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultNER : getOrCreate(STANFORD_NER, props, () -> backend.ner(props));
Annotation ann = asAnnotation(true);
ner.get().annotate(ann);
// Update data
for (int i = 0; i < sentences.size(); ++i) {
sentences.get(i).updateTokens(ann.get(CoreAnnotations.SentencesAnnotation.class).get(i).get(CoreAnnotations.TokensAnnotation.class), (pair) -> pair.first.setNer(pair.second), CoreLabel::ner);
}
return this;
}
Document runRegexner(Properties props) {
// Run prerequisites
runNER(props);
// Run annotator
Supplier<Annotator> ner = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultRegexner : getOrCreate(STANFORD_REGEXNER, props, () -> backend.tokensRegexNER(props, STANFORD_REGEXNER));
Annotation ann = asAnnotation(true);
ner.get().annotate(ann);
// Update data
for (int i = 0; i < sentences.size(); ++i) {
sentences.get(i).updateTokens(ann.get(CoreAnnotations.SentencesAnnotation.class).get(i).get(CoreAnnotations.TokensAnnotation.class), (pair) -> pair.first.setNer(pair.second), CoreLabel::ner);
}
return this;
}
Document runParse(Properties props) {
if (this.sentences != null && this.sentences.size() > 0 && this.sentences.get(0).rawSentence().hasParseTree()) {
return this;
}
// Run annotator
boolean cacheAnnotation = false;
Annotator parse = ((props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultParse : getOrCreate(STANFORD_PARSE, props, () -> backend.parse(props))).get();
if (parse.requires().contains(CoreAnnotations.PartOfSpeechAnnotation.class) || System.getenv("CORENLP_HOST") != null) {
// Run the POS tagger if we are (or may be) using the shift reduce parser
runPOS(props);
cacheAnnotation = true;
} else {
sentences();
}
Annotation ann = asAnnotation(cacheAnnotation);
parse.annotate(ann);
// Update data
synchronized (serializer) {
for (int i = 0; i < sentences.size(); ++i) {
CoreMap sentence = ann.get(CoreAnnotations.SentencesAnnotation.class).get(i);
Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
Tree binaryTree = sentence.get(TreeCoreAnnotations.BinarizedTreeAnnotation.class);
sentences.get(i).updateParse(serializer.toProto(tree),
binaryTree == null ? null : serializer.toProto(binaryTree));
sentences.get(i).updateDependencies(
ProtobufAnnotationSerializer.toProto(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)),
ProtobufAnnotationSerializer.toProto(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)),
ProtobufAnnotationSerializer.toProto(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)));
}
}
return this;
}
Document runDepparse(Properties props) {
if (this.sentences != null && this.sentences.size() > 0 &&
this.sentences.get(0).rawSentence().hasBasicDependencies()) {
return this;
}
// Run prerequisites
runPOS(props);
// Run annotator
Supplier<Annotator> depparse = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultDepparse : getOrCreate(STANFORD_DEPENDENCIES, props, () -> backend.dependencies(props));
Annotation ann = asAnnotation(true);
depparse.get().annotate(ann);
// Update data
synchronized (serializer) {
for (int i = 0; i < sentences.size(); ++i) {
CoreMap sentence = ann.get(CoreAnnotations.SentencesAnnotation.class).get(i);
sentences.get(i).updateDependencies(
ProtobufAnnotationSerializer.toProto(sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class)),
ProtobufAnnotationSerializer.toProto(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class)),
ProtobufAnnotationSerializer.toProto(sentence.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class)));
}
}
return this;
}
Document runNatlog(Properties props) {
if (this.sentences != null && this.sentences.size() > 0 && this.sentences.get(0).rawToken(0).hasPolarity()) {
return this;
}
// Run prerequisites
runLemma(props);
runDepparse(props);
// Run annotator
Supplier<Annotator> natlog = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultNatlog : getOrCreate(STANFORD_NATLOG, props, () -> backend.natlog(props));
Annotation ann = asAnnotation(true);
natlog.get().annotate(ann);
// Update data
synchronized (serializer) {
for (int i = 0; i < sentences.size(); ++i) {
sentences.get(i).updateTokens(ann.get(CoreAnnotations.SentencesAnnotation.class).get(i).get(CoreAnnotations.TokensAnnotation.class), (Pair<CoreNLPProtos.Token.Builder, Polarity> pair) -> pair.first().setPolarity(ProtobufAnnotationSerializer.toProto(pair.second())), x -> x.get(NaturalLogicAnnotations.PolarityAnnotation.class));
sentences.get(i).updateTokens(ann.get(CoreAnnotations.SentencesAnnotation.class).get(i).get(CoreAnnotations.TokensAnnotation.class), (Pair<CoreNLPProtos.Token.Builder, OperatorSpec> pair) -> pair.first().setOperator(ProtobufAnnotationSerializer.toProto(pair.second())), x -> x.get(NaturalLogicAnnotations.OperatorAnnotation.class));
}
}
return this;
}
Document runOpenie(Properties props) {
if (haveRunOpenie) {
return this;
}
// Run prerequisites
runNatlog(props);
// Run annotator
Supplier<Annotator> openie = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultOpenie : getOrCreate(STANFORD_OPENIE, props, () -> backend.openie(props));
Annotation ann = asAnnotation(true);
openie.get().annotate(ann);
// Update data
synchronized (serializer) {
for (int i = 0; i < sentences.size(); ++i) {
CoreMap sentence = ann.get(CoreAnnotations.SentencesAnnotation.class).get(i);
Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
sentences.get(i).updateOpenIE(triples.stream().map(ProtobufAnnotationSerializer::toProto));
}
}
// Return
haveRunOpenie = true;
return this;
}
Document runKBP(Properties props) {
if (haveRunKBP) {
return this;
}
// Run prerequisites
coref(props);
Supplier<Annotator> entityMention = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultEntityMentions : getOrCreate(STANFORD_ENTITY_MENTIONS, props, () -> backend.entityMentions(props, STANFORD_ENTITY_MENTIONS));
Annotation ann = asAnnotation(true);
entityMention.get().annotate(ann);
// Run annotator
Supplier<Annotator> kbp = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultKBP : getOrCreate(STANFORD_KBP, props, () -> backend.kbp(props));
kbp.get().annotate(ann);
// Update data
synchronized (serializer) {
for (int i = 0; i < sentences.size(); ++i) {
CoreMap sentence = ann.get(CoreAnnotations.SentencesAnnotation.class).get(i);
Collection<RelationTriple> triples = sentence.get(CoreAnnotations.KBPTriplesAnnotation.class);
sentences.get(i).updateKBP(triples.stream().map(ProtobufAnnotationSerializer::toProto));
}
}
// Return
haveRunKBP = true;
return this;
}
Document runSentiment(Properties props) {
if (this.sentences != null && this.sentences.size() > 0 && this.sentences.get(0).rawSentence().hasSentiment()) {
return this;
}
// Run prerequisites
runParse(props);
if (this.sentences != null && this.sentences.size() > 0 && !this.sentences.get(0).rawSentence().hasBinarizedParseTree()) {
throw new IllegalStateException("No binarized parse tree (perhaps it's not supported in this language?)");
}
// Run annotator
Annotation ann = asAnnotation(true);
Supplier<Annotator> sentiment = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultSentiment : getOrCreate(STANFORD_SENTIMENT, props, () -> backend.sentiment(props, STANFORD_SENTIMENT));
sentiment.get().annotate(ann);
// Update data
synchronized (serializer) {
for (int i = 0; i < sentences.size(); ++i) {
CoreMap sentence = ann.get(CoreAnnotations.SentencesAnnotation.class).get(i);
String sentimentClass = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
sentences.get(i).updateSentiment(sentimentClass);
}
}
// Return
return this;
}
/**
* Return this Document as an Annotation object.
* Note that, importantly, only the fields which have already been called will be populated in
* the Annotation!
*
* <p>Therefore, this method is generally NOT recommended.</p>
*/
public Annotation asAnnotation() {
return asAnnotation(false);
}
/**
* A cached version of this document as an Annotation.
* This will get garbage collected when necessary.
*/
private SoftReference<Annotation> cachedAnnotation = null;
/**
* Return this Document as an Annotation object.
* Note that, importantly, only the fields which have already been called will be populated in
* the Annotation!
*
* <p>Therefore, this method is generally NOT recommended.</p>
*
* @param cache If true, allow retrieving this object from the cache.
*/
Annotation asAnnotation(boolean cache) {
Annotation ann = cachedAnnotation == null ? null : cachedAnnotation.get();
if (!cache || ann == null) {
ann = serializer.fromProto(serialize());
}
cachedAnnotation = new SoftReference<>(ann);
return ann;
}
/**
* Read a CorefChain from its serialized representation.
* This is private due to the need for an additional partial document. Also, why on Earth are you trying to use
* this on its own anyways?
*
* @see edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer#fromProto(edu.stanford.nlp.pipeline.CoreNLPProtos.CorefChain, edu.stanford.nlp.pipeline.Annotation)
*
* @param proto The serialized representation of the coref chain, missing information on its mention span string.
*
* @return A coreference chain.
*/
private CorefChain fromProto(CoreNLPProtos.CorefChain proto) {
// Get chain ID
int cid = proto.getChainID();
// Get mentions
Map<IntPair, Set<CorefChain.CorefMention>> mentions = new HashMap<>();
CorefChain.CorefMention representative = null;
for (int i = 0; i < proto.getMentionCount(); ++i) {
CoreNLPProtos.CorefChain.CorefMention mentionProto = proto.getMention(i);
// Create mention
StringBuilder mentionSpan = new StringBuilder();
Sentence sentence = sentence(mentionProto.getSentenceIndex());
for (int k = mentionProto.getBeginIndex(); k < mentionProto.getEndIndex(); ++k) {
mentionSpan.append(' ').append(sentence.word(k));
}
// Set the coref cluster id for the token
CorefChain.CorefMention mention = new CorefChain.CorefMention(
Dictionaries.MentionType.valueOf(mentionProto.getMentionType()),
Dictionaries.Number.valueOf(mentionProto.getNumber()),
Dictionaries.Gender.valueOf(mentionProto.getGender()),
Dictionaries.Animacy.valueOf(mentionProto.getAnimacy()),
mentionProto.getBeginIndex() + 1,
mentionProto.getEndIndex() + 1,
mentionProto.getHeadIndex() + 1,
cid,
mentionProto.getMentionID(),
mentionProto.getSentenceIndex() + 1,
new IntTuple(new int[]{ mentionProto.getSentenceIndex() + 1, mentionProto.getPosition() }),
mentionSpan.substring(mentionSpan.length() > 0 ? 1 : 0));
// Register mention
IntPair key = new IntPair(mentionProto.getSentenceIndex() - 1, mentionProto.getHeadIndex() - 1);
if (!mentions.containsKey(key)) { mentions.put(key, new HashSet<>()); }
mentions.get(key).add(mention);
// Check for representative
if (proto.hasRepresentative() && i == proto.getRepresentative()) {
representative = mention;
}
}
// Return
return new CorefChain(cid, mentions, representative);
}
@SuppressWarnings("SimplifiableIfStatement")
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Document)) return false;
Document document = (Document) o;
if (impl.hasText() && !impl.getText().equals(document.impl.getText())) {
return false;
}
return impl.build().equals(document.impl.build()) && sentences.equals(document.sentences);
}
@Override
public int hashCode() {
if (impl.hasText()) {
return impl.getText().hashCode();
} else {
return impl.build().hashCode();
}
}
@Override
public String toString() {
return impl.getText();
}
}