package edu.stanford.nlp.simple;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.naturalli.OperatorSpec;
import edu.stanford.nlp.naturalli.Polarity;
import edu.stanford.nlp.naturalli.SentenceFragment;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.*;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.function.BiFunction;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* A representation of a single Sentence.
* Although it is possible to create a sentence directly from text, it is advisable to
* create a document instead and operate on the document directly.
*
* @author Gabor Angeli
*/
@SuppressWarnings({"UnusedDeclaration", "WeakerAccess"})
public class Sentence {
/**
 * A Properties object for creating a document from a single sentence. Used in the constructor {@link Sentence#Sentence(String)}.
 * Sets "ssplit.isOneSentence" so the whole input text is treated as exactly one sentence.
 */
static Properties SINGLE_SENTENCE_DOCUMENT = PropertiesUtils.asProperties(
    "language", "english",
    "ssplit.isOneSentence", "true",
    "tokenize.class", "PTBTokenizer",
    "tokenize.language", "en",
    "mention.type", "dep",
    "coref.mode", "statistical", // Use the new coref
    "coref.md.type", "dep"
);

/**
 * A Properties object for creating a document from a single tokenized sentence.
 * Differs from {@link Sentence#SINGLE_SENTENCE_DOCUMENT} only in the tokenizer settings:
 * a whitespace tokenizer is used so the caller's pre-tokenized text is preserved verbatim.
 */
private static Properties SINGLE_SENTENCE_TOKENIZED_DOCUMENT = PropertiesUtils.asProperties(
    "language", "english",
    "ssplit.isOneSentence", "true",
    "tokenize.class", "WhitespaceTokenizer",
    "tokenize.language", "en",
    // NOTE(review): "tokenize.whitespace" may be redundant with "tokenize.class" above -- confirm and drop one.
    "tokenize.whitespace", "true",
    "mention.type", "dep",
    "coref.mode", "statistical", // Use the new coref
    "coref.md.type", "dep"
);

/**
 * The protobuf representation of a Sentence.
 * Note that this does not necessarily have up to date token information;
 * the authoritative token state lives in {@link Sentence#tokensBuilders} and is
 * merged back into this builder by {@link Sentence#serialize()}.
 */
private final CoreNLPProtos.Sentence.Builder impl;
/** The protobuf representation of the tokens of a sentence. This has up-to-date information on the tokens. */
private final List<CoreNLPProtos.Token.Builder> tokensBuilders;
/** The document this sentence is derived from. */
public final Document document;
/** The default properties to use for annotators when no explicit properties are supplied by the caller. */
private final Properties defaultProps;
/** The function to use to create a new document. This is used for the cased() and caseless() functions. */
private final BiFunction<Properties, String, Document> docFn;
/**
 * Create a new sentence, using the specified properties as the default properties.
 *
 * @param doc The document to link this sentence to.
 * @param props The properties to use for tokenizing the sentence.
 */
protected Sentence(Document doc, Properties props) {
  // Set document
  this.document = doc;
  // Set sentence: ensure the document treats its whole text as a single sentence.
  // NOTE(review): Properties.containsKey() does not consult the defaults table, so a
  // default-only "ssplit.isOneSentence" would still take the else branch -- confirm intended.
  if (props.containsKey("ssplit.isOneSentence")) {
    this.impl = this.document.sentence(0, props).impl;
  } else {
    Properties modProps = new Properties(props);  // 'props' becomes the defaults of the overlay
    modProps.setProperty("ssplit.isOneSentence", "true");
    this.impl = this.document.sentence(0, modProps).impl;
  }
  // Set tokens: alias the document's token builders so edits stay in sync both ways
  this.tokensBuilders = document.sentence(0).tokensBuilders;
  // Asserts: this sentence and the document's first sentence must share state (same objects)
  assert (this.document.sentence(0).impl == this.impl);
  assert (this.document.sentence(0).tokensBuilders == this.tokensBuilders);
  // Set the default properties.
  // Identity comparison is intentional: only our own tokenized-sentence preset is swapped out.
  if (props == SINGLE_SENTENCE_TOKENIZED_DOCUMENT) {
    this.defaultProps = SINGLE_SENTENCE_DOCUMENT; // no longer care about tokenization
  } else {
    this.defaultProps = props;
  }
  this.docFn = Document::new;
}

/**
 * Create a new sentence from some text, and some properties.
 *
 * @param text The text of the sentence.
 * @param props The properties to use for the annotators.
 */
public Sentence(String text, Properties props) {
  this(new Document(props, text), props);
}

/**
 * Create a new sentence from the given text, assuming the entire text is just one sentence.
 *
 * @param text The text of the sentence.
 */
public Sentence(String text) {
  this(text, SINGLE_SENTENCE_DOCUMENT);
}
/** The actual implementation of a tokenized sentence constructor */
protected Sentence(Function<String, Document> doc, List<String> tokens, Properties props) {
this(doc.apply(StringUtils.join(tokens.stream().map(x -> x.replace(' ', 'ߝ' /* some random character */)), " ")), props);
// Clean up whitespace
for (int i = 0; i < impl.getTokenCount(); ++i) {
this.impl.getTokenBuilder(i).setWord(this.impl.getTokenBuilder(i).getWord().replace('ߝ', ' '));
this.impl.getTokenBuilder(i).setValue(this.impl.getTokenBuilder(i).getValue().replace('ߝ', ' '));
this.tokensBuilders.get(i).setWord(this.tokensBuilders.get(i).getWord().replace('ߝ', ' '));
this.tokensBuilders.get(i).setValue(this.tokensBuilders.get(i).getValue().replace('ߝ', ' '));
}
}
/**
 * Create a new sentence from the given tokenized text, assuming the entire text is just one sentence.
 * WARNING: This method may in rare cases (mostly when tokens themselves have whitespace in them)
 * produce strange results; it's a bit of a hack around the default tokenizer.
 *
 * @param tokens The text of the sentence.
 */
public Sentence(List<String> tokens) {
  this(Document::new, tokens, SINGLE_SENTENCE_TOKENIZED_DOCUMENT);
}

/**
 * Create a sentence from a saved protocol buffer.
 *
 * @param docFn The factory used to create the wrapping document (allows Document subclasses).
 * @param proto The serialized sentence to restore.
 * @param props The default properties to use for annotators on this sentence.
 */
protected Sentence(BiFunction<Properties, String, Document> docFn, CoreNLPProtos.Sentence proto, Properties props) {
  this.impl = proto.toBuilder();
  // Set tokens: one live (mutable) builder per serialized token
  tokensBuilders = new ArrayList<>(this.impl.getTokenCount());
  for (int i = 0; i < this.impl.getTokenCount(); ++i) {
    tokensBuilders.add(this.impl.getToken(i).toBuilder());
  }
  // Initialize document: wrap a document around this single sentence
  this.document = docFn.apply(props, proto.getText());
  this.document.forceSentences(Collections.singletonList(this));
  // Asserts: this sentence and the document's first sentence must share state
  assert (this.document.sentence(0).impl == this.impl);
  assert (this.document.sentence(0).tokensBuilders == this.tokensBuilders);
  // Set default props
  this.defaultProps = props;
  this.docFn = docFn;
}

/**
 * Create a sentence from a saved protocol buffer, with the default document factory and properties.
 */
public Sentence(CoreNLPProtos.Sentence proto) {
  this(Document::new, proto, SINGLE_SENTENCE_DOCUMENT);
}
/**
 * Helper for creating a sentence from a document at a given index.
 * Shares (does not copy) the builders of the sentence already registered on the document.
 */
protected Sentence(Document doc, int sentenceIndex) {
  this.document = doc;
  this.impl = doc.sentence(sentenceIndex).impl;
  // Set tokens: alias the existing sentence's token builders
  this.tokensBuilders = doc.sentence(sentenceIndex).tokensBuilders;
  // Asserts: verify the aliasing above
  assert (this.document.sentence(sentenceIndex).impl == this.impl);
  assert (this.document.sentence(sentenceIndex).tokensBuilders == this.tokensBuilders);
  // Set default props
  this.defaultProps = Document.EMPTY_PROPS;
  this.docFn = doc.sentence(sentenceIndex).docFn;
}

/**
 * The canonical constructor of a sentence from a {@link edu.stanford.nlp.simple.Document}.
 *
 * @param doc The document to link this sentence to.
 * @param proto The sentence implementation to use for this sentence.
 * @param defaultProps The default properties to use when annotating this sentence.
 */
protected Sentence(Document doc, CoreNLPProtos.Sentence.Builder proto, Properties defaultProps) {
  this.document = doc;
  this.impl = proto;
  this.defaultProps = defaultProps;
  // Set tokens
  // This is the _only_ place we are allowed to construct tokens builders
  tokensBuilders = new ArrayList<>(this.impl.getTokenCount());
  for (int i = 0; i < this.impl.getTokenCount(); ++i) {
    tokensBuilders.add(this.impl.getToken(i).toBuilder());
  }
  // Reflectively construct documents of the same runtime class as 'doc'
  this.docFn = (props, text) -> MetaClass.create(doc.getClass().getName()).createInstance(props, text);
}

/**
 * Also sets the the text of the sentence. Used by {@link Document} internally.
 *
 * @param doc The document to link this sentence to.
 * @param proto The sentence implementation to use for this sentence.
 * @param text The text for the sentence
 * @param defaultProps The default properties to use when annotating this sentence.
 */
Sentence(Document doc, CoreNLPProtos.Sentence.Builder proto, String text, Properties defaultProps) {
  this(doc, proto, defaultProps);
  this.impl.setText(text);
}
/**
 * Helper for creating a sentence from a document and a CoreMap representation.
 * NOTE(review): the 'sentence' parameter is never read here; callers (e.g. {@link Sentence#Sentence(CoreMap)})
 * are expected to have already folded it into 'doc' as the document's only sentence -- confirm intended.
 */
protected Sentence(Document doc, CoreMap sentence) {
  this.document = doc;
  assert ! doc.sentences().isEmpty();
  this.impl = doc.sentence(0).impl;
  this.tokensBuilders = doc.sentence(0).tokensBuilders;
  this.defaultProps = Document.EMPTY_PROPS;
  this.docFn = (props, text) -> MetaClass.create(doc.getClass().getName()).createInstance(props, text);
}

/**
 * Convert a CoreMap into a simple Sentence object.
 * Note that this is a copy operation -- the implementing CoreMap will not be updated, and all of its
 * contents are copied over to the protocol buffer format backing the {@link Sentence} object.
 *
 * @param sentence The CoreMap representation of the sentence.
 */
public Sentence(CoreMap sentence) {
  // Double-brace initialization: an anonymous Annotation subclass holding this one sentence.
  this(new Document(new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class)) {{
    set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
    if (sentence.containsKey(CoreAnnotations.DocIDAnnotation.class)) {
      set(CoreAnnotations.DocIDAnnotation.class, sentence.get(CoreAnnotations.DocIDAnnotation.class));
    }
  }}), sentence);
}

/**
 * <p>
 * Convert a sentence fragment (i.e., entailed sentence) into a simple sentence object.
 * Like {@link Sentence#Sentence(CoreMap)}, this copies the information in the fragment into the underlying
 * protobuf backed format.
 * </p>
 *
 * @param sentence The sentence fragment to convert.
 */
public Sentence(SentenceFragment sentence) {
  this(new ArrayCoreMap(32) {{
    set(CoreAnnotations.TokensAnnotation.class, sentence.words);
    set(CoreAnnotations.TextAnnotation.class, StringUtils.join(sentence.words.stream().map(CoreLabel::originalText), " "));
    if (sentence.words.isEmpty()) {
      set(CoreAnnotations.TokenBeginAnnotation.class, 0);
      set(CoreAnnotations.TokenEndAnnotation.class, 0);
    } else {
      // NOTE(review): IndexAnnotation is conventionally 1-indexed in CoreNLP, so these
      // token offsets inherit that convention from the fragment -- confirm against callers.
      set(CoreAnnotations.TokenBeginAnnotation.class, sentence.words.get(0).get(CoreAnnotations.IndexAnnotation.class));
      set(CoreAnnotations.TokenEndAnnotation.class, sentence.words.get(sentence.words.size() - 1).get(CoreAnnotations.IndexAnnotation.class) + 1);
    }
    // The fragment's dependency tree stands in for all three dependency representations
    set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, sentence.parseTree);
    set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, sentence.parseTree);
    set(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class, sentence.parseTree);
  }});
}
/**
 * Make this sentence caseless. That is, from now on, run the caseless models
 * on the sentence by default rather than the standard CoreNLP models.
 * This is a copy operation: a new Sentence is rebuilt from the serialized proto.
 *
 * @return A new sentence with the default properties swapped out.
 */
public Sentence caseless() {
  return new Sentence(this.docFn, impl.build(), Document.CASELESS_PROPS);
}

/**
 * Make this sentence case sensitive.
 * A sentence is case sensitive by default; this only has an effect if you have previously
 * called {@link Sentence#caseless()}.
 * This is a copy operation: a new Sentence is rebuilt from the serialized proto.
 *
 * @return A new sentence with the default properties swapped out.
 */
public Sentence cased() {
  return new Sentence(this.docFn, impl.build(), Document.EMPTY_PROPS);
}
/**
 * Serialize the given sentence (but not the associated document!) into a Protocol Buffer.
 * The token list on the underlying proto is refreshed from the live token builders first,
 * so the result reflects any token-level updates made since construction.
 *
 * @return The Protocol Buffer representing this sentence.
 */
public CoreNLPProtos.Sentence serialize() {
  synchronized (impl) {  // impl doubles as the lock guarding all protobuf state
    impl.clearToken();
    for (int i = 0; i < tokensBuilders.size(); ++i) {
      impl.addToken(tokensBuilders.get(i).build());
    }
    return impl.build();
  }
}
/**
 * Write this sentence to an output stream.
 * Internally, this stores the sentence as a protocol buffer, and saves that buffer to the output stream.
 * This method does not close the stream after writing.
 *
 * @param out The output stream to write to. The stream is not closed after the method returns.
 * @throws IOException Thrown from the underlying write() implementation.
 */
public void serialize(OutputStream out) throws IOException {
  // Length-delimited, so multiple sentences can share one stream
  serialize().writeDelimitedTo(out);
  out.flush();
}

/**
 * Read a sentence from an input stream.
 * This does not close the input stream.
 *
 * @param in The input stream to deserialize from.
 * @return The next sentence encoded in the input stream.
 * @throws IOException Thrown by the underlying parse() implementation.
 *
 * @see Document#serialize(java.io.OutputStream)
 */
public static Sentence deserialize(InputStream in) throws IOException {
  return new Sentence(CoreNLPProtos.Sentence.parseDelimitedFrom(in));
}

/**
 * Return a class that can perform common algorithms on this sentence.
 */
public SentenceAlgorithms algorithms() {
  return new SentenceAlgorithms(this);
}

/** The raw text of the sentence, as input by, e.g., {@link Sentence#Sentence(String)}. */
public String text() {
  synchronized (impl) {
    return impl.getText();
  }
}
//
// SET AXIOMATICALLY
// (These fields are set when the sentence is created, not by any annotator.)
//

/** The index of the sentence within the document. */
public int sentenceIndex() {
  synchronized (impl) {
    return impl.getSentenceIndex();
  }
}

/** The token offset of the start of this sentence within the document. */
public int sentenceTokenOffsetBegin() {
  synchronized (impl) {
    return impl.getTokenOffsetBegin();
  }
}

/** The token offset of the end of this sentence within the document. */
public int sentenceTokenOffsetEnd() {
  synchronized (impl) {
    return impl.getTokenOffsetEnd();
  }
}
//
// SET BY TOKENIZER
// (All accessors below return lazy views over tokensBuilders: elements are
//  computed on access, so later token edits are visible through the list.)
//

/** The words of the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#word()}. */
public List<String> words() {
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getWord);
  }
}

/** The word at the given index of the sentence. @see Sentence#words() */
public String word(int index) {
  return words().get(index);
}

/** The original (unprocessed) words of the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#originalText()}. */
public List<String> originalTexts() {
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getOriginalText);
  }
}

/** The original word at the given index. @see Sentence#originalTexts() */
public String originalText(int index) {
  return originalTexts().get(index);
}

/** The character offset of each token in the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#beginPosition()}. */
public List<Integer> characterOffsetBegin() {
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getBeginChar);
  }
}

/** The character offset of the given index in the sentence. @see Sentence#characterOffsetBegin(). */
public int characterOffsetBegin(int index) {
  return characterOffsetBegin().get(index);
}

/** The end character offset of each token in the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#endPosition()}. */
public List<Integer> characterOffsetEnd() {
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getEndChar);
  }
}

/** The end character offset of the given index in the sentence. @see Sentence#characterOffsetEnd(). */
public int characterOffsetEnd(int index) {
  return characterOffsetEnd().get(index);
}

/** The whitespace before each token in the sentence. This will match {@link #after()} of the previous token. */
public List<String> before() {
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getBefore);
  }
}

/** The whitespace before this token in the sentence. This will match {@link #after()} of the previous token. */
public String before(int index) {
  return before().get(index);
}

/** The whitespace after each token in the sentence. This will match {@link #before()} of the next token. */
public List<String> after() {
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getAfter);
  }
}

/** The whitespace after this token in the sentence. This will match {@link #before()} of the next token. */
public String after(int index) {
  return after().get(index);
}
/**
 * The tokens in this sentence. Each {@link Token} is a lightweight view: just a helper
 * delegating back to the indexed accessors on this class.
 */
public List<Token> tokens() {
  List<Token> result = new ArrayList<>(this.length());
  int i = 0;
  while (i < length()) {
    result.add(new Token(this, i));
    i += 1;
  }
  return result;
}
//
// SET BY ANNOTATORS
// (Each accessor first runs the relevant annotator on the document, lazily and at most once.)
//

/**
 * The part of speech tags of the sentence.
 *
 * @param props The properties to use for the {@link edu.stanford.nlp.pipeline.POSTaggerAnnotator}.
 * @return A list of part of speech tags, one for each token in the sentence.
 */
public List<String> posTags(Properties props) {
  document.runPOS(props);
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getPos);
  }
}

/** @see Sentence#posTags(java.util.Properties) */
public List<String> posTags() {
  return posTags(this.defaultProps);
}

/** @see Sentence#posTags(java.util.Properties) */
public String posTag(int index) {
  return posTags().get(index);
}

/**
 * The lemmas of the sentence.
 *
 * @param props The properties to use for the {@link edu.stanford.nlp.pipeline.MorphaAnnotator}.
 * @return A list of lemmatized words, one for each token in the sentence.
 */
public List<String> lemmas(Properties props) {
  document.runLemma(props);
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getLemma);
  }
}

/** @see Sentence#lemmas(java.util.Properties) */
public List<String> lemmas() {
  return lemmas(this.defaultProps);
}

/** @see Sentence#lemmas(java.util.Properties) */
public String lemma(int index) {
  return lemmas().get(index);
}

/**
 * The named entity tags of the sentence.
 *
 * @param props The properties to use for the {@link edu.stanford.nlp.pipeline.NERCombinerAnnotator}.
 * @return A list of named entity tags, one for each token in the sentence.
 */
public List<String> nerTags(Properties props) {
  document.runNER(props);
  synchronized (impl) {
    return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getNer);
  }
}

/** @see Sentence#nerTags(java.util.Properties) */
public List<String> nerTags() {
  return nerTags(this.defaultProps);
}
/**
 * Run RegexNER over this sentence. Note that this is an in place operation, and simply
 * updates the NER tags.
 * Therefore, every time this function is called, it re-runs the annotator!
 *
 * @param mappingFile The regexner mapping file.
 * @param ignorecase If true, run a caseless match on the regexner file.
 */
public void regexner(String mappingFile, boolean ignorecase) {
  Properties props = new Properties();
  // Copy the default properties. stringPropertyNames() (unlike keySet()) also walks the
  // Properties defaults chain and only yields keys with String values, so this cannot
  // drop inherited defaults or NPE on a non-String entry.
  for (String key : this.defaultProps.stringPropertyNames()) {
    props.setProperty(key, this.defaultProps.getProperty(key));
  }
  props.setProperty(Annotator.STANFORD_REGEXNER + ".mapping", mappingFile);
  props.setProperty(Annotator.STANFORD_REGEXNER + ".ignorecase", Boolean.toString(ignorecase));
  this.document.runRegexner(props);
}
/** The named entity tag of the token at the given index. @see Sentence#nerTags(java.util.Properties) */
public String nerTag(int index) {
  return nerTags().get(index);
}
/**
 * Get all mentions of the given NER tag, as a list of surface forms.
 * Contiguous runs of tokens carrying the tag are joined with single spaces; note that two
 * adjacent entities with the same tag cannot be distinguished (IO-style span detection).
 *
 * @param nerTag The ner tag to search for, case sensitive.
 * @return A list of surface forms of the entities of this tag. This is using the {@link Sentence#word(int)} function.
 */
public List<String> mentions(String nerTag) {
  List<String> spans = new ArrayList<>();
  StringBuilder current = new StringBuilder();
  String prevTag = "O";
  for (int i = 0; i < length(); ++i) {
    String tag = nerTag(i);
    if (tag.equals(nerTag)) {
      // inside (or starting) a span of the requested tag
      current.append(word(i)).append(' ');
    } else if (prevTag.equals(nerTag)) {
      // just stepped off the end of a span: flush it
      if (current.length() > 0) {
        spans.add(current.toString().trim());
      }
      current.setLength(0);
    }
    prevTag = tag;
  }
  // Flush a span that runs to the end of the sentence
  if (current.length() > 0) {
    spans.add(current.toString().trim());
  }
  return spans;
}
/**
 * Get all mentions of any NER tag, as a list of surface forms.
 * A new span starts whenever the tag changes to a different non-"O" tag, so abutting
 * entities with different tags are split correctly.
 *
 * @return A list of surface forms of the entities in this sentence. This is using the {@link Sentence#word(int)} function.
 */
public List<String> mentions() {
  List<String> spans = new ArrayList<>();
  StringBuilder current = new StringBuilder();
  String prevTag = "O";
  for (int i = 0; i < length(); ++i) {
    String tag = nerTag(i);
    if (!tag.equals("O") && !prevTag.equals(tag)) {
      // a new span begins; flush any span it abuts
      if (current.length() > 0) {
        spans.add(current.toString().trim());
      }
      current.setLength(0);
      current.append(word(i)).append(' ');
    } else if (!tag.equals("O")) {
      // continuing the current span (prevTag == tag here)
      current.append(word(i)).append(' ');
    } else if (!prevTag.equals("O")) {
      // stepped off the end of a span: flush it
      if (current.length() > 0) {
        spans.add(current.toString().trim());
      }
      current.setLength(0);
    }
    prevTag = tag;
  }
  // Flush a span that runs to the end of the sentence
  if (current.length() > 0) {
    spans.add(current.toString().trim());
  }
  return spans;
}
/**
 * Returns the constituency parse of this sentence.
 *
 * @param props The properties to use in the parser annotator.
 * @return A parse tree object.
 */
public Tree parse(Properties props) {
  document.runParse(props);
  // The document's serializer is not thread-safe; guard proto-to-Tree conversion
  synchronized (document.serializer) {
    return document.serializer.fromProto(impl.getParseTree());
  }
}

/** @see Sentence#parse(java.util.Properties) */
public Tree parse() {
  return parse(this.defaultProps);
}

/**
 * An internal helper to get the dependency tree proto of the given type.
 * Only BASIC, ENHANCED and ENHANCED_PLUS_PLUS are supported; any other mode throws.
 */
private CoreNLPProtos.DependencyGraph dependencies(SemanticGraphFactory.Mode mode) {
  switch (mode) {
    case BASIC:
      return impl.getBasicDependencies();
    case ENHANCED:
      return impl.getEnhancedDependencies();
    case ENHANCED_PLUS_PLUS:
      return impl.getEnhancedPlusPlusDependencies();
    default:
      throw new IllegalArgumentException("Unsupported dependency type: " + mode);
  }
}
/**
 * Returns the governor of the given index, according to the passed dependency type.
 * The root has index -1.
 *
 * @param props The properties to use in the parser annotator.
 * @param index The index of the dependent word ZERO INDEXED. That is, the first word of the sentence
 *              is index 0, not 1 as it would be in the {@link edu.stanford.nlp.semgraph.SemanticGraph} framework.
 * @param mode The type of dependency to use (e.g., basic, collapsed, collapsed cc processed).
 * @return The index of the governor, if one exists. A value of -1 indicates the root node.
 */
public Optional<Integer> governor(Properties props, int index, SemanticGraphFactory.Mode mode) {
  document.runDepparse(props);
  // Proto edges are 1-indexed; convert to this API's 0-indexed convention
  for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) {
    if (edge.getTarget() - 1 == index) {
      return Optional.of(edge.getSource() - 1);
    }
  }
  // NOTE(review): roots are always read from the basic dependencies regardless of 'mode' -- confirm intended.
  for (int root : impl.getBasicDependencies().getRootList()) {
    if (index == root - 1) { return Optional.of(-1); }
  }
  return Optional.empty();
}

/** @see Sentence#governor(java.util.Properties, int, SemanticGraphFactory.Mode) */
public Optional<Integer> governor(Properties props, int index) {
  return governor(props, index, SemanticGraphFactory.Mode.ENHANCED);
}

/** @see Sentence#governor(java.util.Properties, int, SemanticGraphFactory.Mode) */
public Optional<Integer> governor(int index, SemanticGraphFactory.Mode mode) {
  return governor(this.defaultProps, index, mode);
}

/** @see Sentence#governor(java.util.Properties, int) */
public Optional<Integer> governor(int index) {
  return governor(this.defaultProps, index);
}
/**
 * Returns the governors of a sentence, according to the passed dependency type.
 * The resulting list is of the same size as the original sentence, with each element being either
 * the governor (index), or empty if the node has no known governor.
 * The root has index -1.
 *
 * @param props The properties to use in the parser annotator.
 * @param mode The type of dependency to use (e.g., basic, collapsed, collapsed cc processed).
 * @return A list of the (optional) governors of each token in the sentence.
 */
public List<Optional<Integer>> governors(Properties props, SemanticGraphFactory.Mode mode) {
  document.runDepparse(props);
  // Start with all-empty; fill in from the graph's edges (proto indices are 1-based)
  List<Optional<Integer>> governors = new ArrayList<>(this.length());
  for (int i = 0; i < this.length(); ++i) { governors.add(Optional.empty()); }
  for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) {
    governors.set(edge.getTarget() - 1, Optional.of(edge.getSource() - 1));
  }
  // NOTE(review): roots are always read from the basic dependencies regardless of 'mode' -- confirm intended.
  for (int root : impl.getBasicDependencies().getRootList()) {
    governors.set(root - 1, Optional.of(-1));
  }
  return governors;
}

/** @see Sentence#governors(java.util.Properties, SemanticGraphFactory.Mode) */
public List<Optional<Integer>> governors(Properties props) {
  return governors(props, SemanticGraphFactory.Mode.ENHANCED);
}

/** @see Sentence#governors(java.util.Properties, SemanticGraphFactory.Mode) */
public List<Optional<Integer>> governors(SemanticGraphFactory.Mode mode) {
  return governors(this.defaultProps, mode);
}

/** @see Sentence#governors(java.util.Properties, SemanticGraphFactory.Mode) */
public List<Optional<Integer>> governors() {
  return governors(this.defaultProps, SemanticGraphFactory.Mode.ENHANCED);
}
/**
 * Returns the incoming dependency label to a particular index, according to the given dependency mode.
 *
 * @param props The properties to use in the parser annotator.
 * @param index The index of the dependent word ZERO INDEXED. That is, the first word of the sentence
 *              is index 0, not 1 as it would be in the {@link edu.stanford.nlp.semgraph.SemanticGraph} framework.
 * @param mode The type of dependency to use (e.g., basic, collapsed, collapsed cc processed).
 * @return The incoming dependency label, if it exists. Root nodes get the label "root".
 */
public Optional<String> incomingDependencyLabel(Properties props, int index, SemanticGraphFactory.Mode mode) {
  document.runDepparse(props);
  // Proto edges are 1-indexed; convert to this API's 0-indexed convention
  for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) {
    if (edge.getTarget() - 1 == index) {
      return Optional.of(edge.getDep());
    }
  }
  // NOTE(review): roots are always read from the basic dependencies regardless of 'mode' -- confirm intended.
  for (int root : impl.getBasicDependencies().getRootList()) {
    if (index == root - 1) { return Optional.of("root"); }
  }
  return Optional.empty();
}

/** @see Sentence#incomingDependencyLabel(java.util.Properties, int, SemanticGraphFactory.Mode) */
public Optional<String> incomingDependencyLabel(Properties props, int index) {
  return incomingDependencyLabel(props, index, SemanticGraphFactory.Mode.ENHANCED);
}

/** @see Sentence#incomingDependencyLabel(java.util.Properties, int, SemanticGraphFactory.Mode) */
public Optional<String> incomingDependencyLabel(int index, SemanticGraphFactory.Mode mode) {
  return incomingDependencyLabel(this.defaultProps, index, mode);
}

/** @see Sentence#incomingDependencyLabel(java.util.Properties, int) */
public Optional<String> incomingDependencyLabel(int index) {
  return incomingDependencyLabel(this.defaultProps, index);
}

/**
 * The incoming dependency label of every token, as a list aligned with the sentence.
 * Tokens with no incoming edge are empty; roots get "root".
 *
 * @see Sentence#incomingDependencyLabel(java.util.Properties, int)
 */
public List<Optional<String>> incomingDependencyLabels(Properties props, SemanticGraphFactory.Mode mode) {
  document.runDepparse(props);
  List<Optional<String>> labels = new ArrayList<>(this.length());
  for (int i = 0; i < this.length(); ++i) { labels.add(Optional.empty()); }
  for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) {
    labels.set(edge.getTarget() - 1, Optional.of(edge.getDep()));
  }
  // NOTE(review): roots are always read from the basic dependencies regardless of 'mode' -- confirm intended.
  for (int root : impl.getBasicDependencies().getRootList()) {
    labels.set(root - 1, Optional.of("root"));
  }
  return labels;
}

/** @see Sentence#incomingDependencyLabels(java.util.Properties, SemanticGraphFactory.Mode) */
public List<Optional<String>> incomingDependencyLabels(SemanticGraphFactory.Mode mode) {
  return incomingDependencyLabels(this.defaultProps, mode);
}

/** @see Sentence#incomingDependencyLabels(java.util.Properties, SemanticGraphFactory.Mode) */
public List<Optional<String>> incomingDependencyLabels(Properties props) {
  return incomingDependencyLabels(props, SemanticGraphFactory.Mode.ENHANCED);
}

/** @see Sentence#incomingDependencyLabels(java.util.Properties, SemanticGraphFactory.Mode) */
public List<Optional<String>> incomingDependencyLabels() {
  return incomingDependencyLabels(this.defaultProps, SemanticGraphFactory.Mode.ENHANCED);
}
/**
 * Returns the dependency graph of the sentence, as a raw {@link SemanticGraph} object.
 * Note that this method is slower than you may expect, as it has to convert the underlying protocol
 * buffer back into a list of CoreLabels with which to populate the {@link SemanticGraph}.
 *
 * @param props The properties to use for running the dependency parser annotator.
 * @param mode The type of graph to return (e.g., basic, collapsed, etc).
 *
 * @return The dependency graph of the sentence.
 */
public SemanticGraph dependencyGraph(Properties props, SemanticGraphFactory.Mode mode) {
  document.runDepparse(props);
  return ProtobufAnnotationSerializer.fromProto(dependencies(mode), asCoreLabels(), document.docid().orElse(null));
}

/** @see Sentence#dependencyGraph(Properties, SemanticGraphFactory.Mode) */
public SemanticGraph dependencyGraph(Properties props) {
  return dependencyGraph(props, SemanticGraphFactory.Mode.ENHANCED);
}

/** @see Sentence#dependencyGraph(Properties, SemanticGraphFactory.Mode) */
public SemanticGraph dependencyGraph() {
  return dependencyGraph(this.defaultProps, SemanticGraphFactory.Mode.ENHANCED);
}

/** @see Sentence#dependencyGraph(Properties, SemanticGraphFactory.Mode) */
public SemanticGraph dependencyGraph(SemanticGraphFactory.Mode mode) {
  return dependencyGraph(this.defaultProps, mode);
}

/** The length of the sentence, in tokens. */
public int length() {
  return impl.getTokenCount();
}
/**
 * Get a list of the (possible) Natural Logic operators on each node of the sentence.
 * At each index, the list contains an operator spec if that index is the head word of an operator in the
 * sentence.
 *
 * @param props The properties to pass to the natural logic annotator.
 * @return A list of Optionals, where each element corresponds to a token in the sentence, and the optional is nonempty
 *         if that index is an operator.
 */
public List<Optional<OperatorSpec>> operators(Properties props) {
  document.runNatlog(props);
  synchronized (impl) {
    // Lazy view: each element is converted from the token proto on access
    return lazyList(tokensBuilders, x -> x.hasOperator() ? Optional.of(ProtobufAnnotationSerializer.fromProto(x.getOperator())) : Optional.empty());
  }
}

/** @see Sentence#operators(Properties) */
public List<Optional<OperatorSpec>> operators() {
  return operators(this.defaultProps);
}

/** The operator (if any) headed at the given token index. @see Sentence#operators(Properties) */
public Optional<OperatorSpec> operatorAt(Properties props, int i) {
  return operators(props).get(i);
}

/** The operator (if any) headed at the given token index. @see Sentence#operators(Properties) */
public Optional<OperatorSpec> operatorAt(int i) {
  return operators(this.defaultProps).get(i);
}

/**
 * Returns the list of non-empty Natural Logic operator specifications.
 * This amounts to the actual list of operators in the sentence.
 * Note that the spans of the operators can be retrieved with
 * {@link OperatorSpec#quantifierBegin} and
 * {@link OperatorSpec#quantifierEnd}.
 *
 * @param props The properties to use for the natlog annotator.
 * @return A list of operators in the sentence.
 */
public List<OperatorSpec> operatorsNonempty(Properties props) {
  return operators(props).stream().filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList());
}

/** @see Sentence#operatorsNonempty(Properties) */
public List<OperatorSpec> operatorsNonempty() {
  return operatorsNonempty(this.defaultProps);
}
/**
* The Natural Logic notion of polarity for each token in a sentence.
* @param props The properties to use for the natural logic annotator.
* @return A list of Polarity objects, one for each token of the sentence.
*/
public List<Polarity> natlogPolarities(Properties props) {
document.runNatlog(props);
synchronized (impl) {
return lazyList(tokensBuilders, x -> ProtobufAnnotationSerializer.fromProto(x.getPolarity()));
}
}
/** Polarities computed with the default properties. @see Sentence#natlogPolarities(Properties) */
public List<Polarity> natlogPolarities() {
  return natlogPolarities(defaultProps);
}
/**
 * Get the Natural Logic polarity for a single token in the sentence.
 *
 * @param props The properties to use for the natural logic annotator.
 * @param index The index of the token whose polarity is returned.
 * @return The {@link Polarity} of the token at {@code index}.
 */
public Polarity natlogPolarity(Properties props, int index) {
  document.runNatlog(props);
  synchronized (impl) {
    CoreNLPProtos.Token.Builder token = tokensBuilders.get(index);
    return ProtobufAnnotationSerializer.fromProto(token.getPolarity());
  }
}
/** Polarity of one token, computed with the default properties. @see Sentence#natlogPolarity(Properties, int) */
public Polarity natlogPolarity(int index) {
  return natlogPolarity(defaultProps, index);
}
/**
 * Get the OpenIE triples associated with this sentence.
 * Note that this function may be slower than you would expect, as it has to
 * convert the underlying Protobuf representation back into {@link CoreLabel}s.
 *
 * @param props The properties to use for the OpenIE annotator.
 * @return A collection of {@link RelationTriple} objects representing the OpenIE triples in the sentence.
 */
public Collection<RelationTriple> openieTriples(Properties props) {
  document.runOpenie(props);
  synchronized (impl) {
    // NOTE(review): the returned token list was previously bound to an unused local.
    // The call is kept because asCoreLabels() goes through asCoreMap(), which forces
    // document.asAnnotation(true) — presumably populating state that fromProto
    // relies on; confirm before removing the call entirely.
    asCoreLabels();
    Annotation doc = document.asAnnotation();
    return impl.getOpenieTripleList().stream()
        .map(x -> ProtobufAnnotationSerializer.fromProto(x, doc, this.sentenceIndex()))
        .collect(Collectors.toList());
  }
}
/** OpenIE triples computed with the default properties. @see Sentence#openieTriples(Properties) */
public Collection<RelationTriple> openieTriples() {
return openieTriples(this.defaultProps);
}
/**
 * Get a list of Open IE triples as flat (subject, relation, object, confidence) quadruples.
 * This is substantially faster than returning {@link RelationTriple} objects, as it doesn't
 * require converting the underlying representation into {@link CoreLabel}s; but, it also contains
 * significantly less information about the sentence.
 *
 * @return The flat quadruples; triples missing a subject, relation, or object are skipped,
 *         and a missing confidence defaults to 1.0.
 *
 * @see Sentence#openieTriples(Properties)
 */
public Collection<Quadruple<String, String, String, Double>> openie() {
document.runOpenie(this.defaultProps);
return impl.getOpenieTripleList().stream()
.filter(proto -> proto.hasSubject() && proto.hasRelation() && proto.hasObject())
.map(proto -> Quadruple.makeQuadruple(proto.getSubject(), proto.getRelation(), proto.getObject(),
proto.hasConfidence() ? proto.getConfidence() : 1.0))
.collect(Collectors.toList());
}
/**
 * Get the KBP triples associated with this sentence.
 * Note that this function may be slower than you would expect, as it has to
 * convert the underlying Protobuf representation back into {@link CoreLabel}s.
 *
 * @param props The properties to use for the KBP annotator.
 * @return A collection of {@link RelationTriple} objects representing the KBP triples in the sentence.
 */
public Collection<RelationTriple> kbpTriples(Properties props) {
  document.runKBP(props);
  synchronized (impl) {
    // NOTE(review): the returned token list was previously bound to an unused local.
    // The call is kept because asCoreLabels() goes through asCoreMap(), which forces
    // document.asAnnotation(true) — presumably populating state that fromProto
    // relies on; confirm before removing the call entirely.
    asCoreLabels();
    Annotation doc = document.asAnnotation();
    return impl.getKbpTripleList().stream()
        .map(x -> ProtobufAnnotationSerializer.fromProto(x, doc, this.sentenceIndex()))
        .collect(Collectors.toList());
  }
}
/** KBP triples computed with the default properties. @see Sentence#kbpTriples(Properties) */
public Collection<RelationTriple> kbpTriples() {
return kbpTriples(this.defaultProps);
}
/**
 * Get a list of KBP triples as flat (subject, relation, object, confidence) quadruples.
 * This is substantially faster than returning {@link RelationTriple} objects, as it doesn't
 * require converting the underlying representation into {@link CoreLabel}s; but, it also contains
 * significantly less information about the sentence.
 *
 * @return The flat quadruples; triples missing a subject, relation, or object are skipped,
 *         and a missing confidence defaults to 1.0.
 *
 * @see Sentence#kbpTriples(Properties)
 */
public Collection<Quadruple<String, String, String, Double>> kbp() {
document.runKBP(this.defaultProps);
return impl.getKbpTripleList().stream()
.filter(proto -> proto.hasSubject() && proto.hasRelation() && proto.hasObject())
.map(proto -> Quadruple.makeQuadruple(proto.getSubject(), proto.getRelation(), proto.getObject(),
proto.hasConfidence() ? proto.getConfidence() : 1.0))
.collect(Collectors.toList());
}
/**
 * The sentiment of this sentence (e.g., positive / negative), computed with
 * the default annotator properties.
 *
 * @return The {@link SentimentClass} of this sentence, as an enum value.
 */
public SentimentClass sentiment() {
  return sentiment(defaultProps);
}
/**
 * The sentiment of this sentence (e.g., positive / negative).
 *
 * @param props The properties to pass to the sentiment classifier.
 *
 * @return The {@link SentimentClass} of this sentence, as an enum value.
 * @throws IllegalStateException If the annotator produced a sentiment label not
 *         recognized by this mapping.
 */
public SentimentClass sentiment(Properties props) {
  document.runSentiment(props);
  // Lowercase with Locale.ROOT: the default locale's case rules (e.g. the
  // Turkish dotless i) could map "POSITIVE" to a string that matches no case
  // label below, spuriously throwing IllegalStateException.
  switch (impl.getSentiment().toLowerCase(Locale.ROOT)) {
    case "very positive":
      return SentimentClass.VERY_POSITIVE;
    case "positive":
      return SentimentClass.POSITIVE;
    case "negative":
      return SentimentClass.NEGATIVE;
    case "very negative":
      return SentimentClass.VERY_NEGATIVE;
    case "neutral":
      return SentimentClass.NEUTRAL;
    default:
      throw new IllegalStateException("Unknown sentiment class: " + impl.getSentiment());
  }
}
/**
 * Get the coreference chains restricted to just this sentence.
 * Note that this method is actually fairly computationally expensive to call, as it constructs
 * and prunes the coreference data structure for the entire document.
 *
 * @return A coreference chain map, but containing only mentions from this sentence.
 */
public Map<Integer, CorefChain> coref() {
  // Mentions use 1-based sentence numbering (hence the +1).
  int sentNum = this.sentenceIndex() + 1;
  Map<Integer, CorefChain> chains = document.coref();
  Iterator<Map.Entry<Integer, CorefChain>> chainIter = chains.entrySet().iterator();
  while (chainIter.hasNext()) {
    CorefChain chain = chainIter.next().getValue();
    // Copy the mention list first: deleteMention mutates the chain while we scan it.
    for (CorefChain.CorefMention mention : new ArrayList<>(chain.getMentionsInTextualOrder())) {
      if (mention.sentNum != sentNum) {
        chain.deleteMention(mention);
      }
    }
    // Drop chains left with no mentions in this sentence.
    if (chain.getMentionsInTextualOrder().isEmpty()) {
      chainIter.remove();
    }
  }
  return chains;
}
//
// Helpers for CoreNLP interoperability
//
/**
 * Returns this sentence as a CoreNLP CoreMap object.
 * Note that, importantly, only the fields which have already been called will be populated in
 * the CoreMap!
 *
 * Therefore, this method is generally NOT recommended.
 *
 * @param functions A list of functions to call before populating the CoreMap.
 * For example, you can specify mySentence::posTags, and then posTags will
 * be populated.
 * @return The CoreMap for this sentence, extracted from the document annotation.
 */
@SuppressWarnings("TypeParameterExplicitlyExtendsObject")
@SafeVarargs
public final CoreMap asCoreMap(Function<Sentence,Object>... functions) {
// Run each requested annotation for its side effect; return values are discarded.
for (Function<Sentence, Object> function : functions) {
function.apply(this);
}
// Re-derive the full document annotation and select this sentence by index.
return this.document.asAnnotation(true).get(CoreAnnotations.SentencesAnnotation.class).get(this.sentenceIndex());
}
/**
 * Returns this sentence as a list of CoreLabels representing the sentence.
 * Note that, importantly, only the fields which have already been called will be populated in
 * the CoreMap!
 *
 * Therefore, this method is generally NOT recommended.
 *
 * @param functions A list of functions to call before populating the CoreMap.
 * For example, you can specify mySentence::posTags, and then posTags will
 * be populated.
 * @return The tokens of this sentence, as CoreLabels.
 */
@SuppressWarnings("TypeParameterExplicitlyExtendsObject")
@SafeVarargs
public final List<CoreLabel> asCoreLabels(Function<Sentence,Object>... functions) {
  // Run each requested annotation for its side effect; return values are discarded.
  for (Function<Sentence, Object> annotate : functions) {
    annotate.apply(this);
  }
  CoreMap sentenceMap = asCoreMap();
  return sentenceMap.get(CoreAnnotations.TokensAnnotation.class);
}
//
// HELPERS FROM DOCUMENT
//
/**
 * A helper to get the raw Protobuf builder backing token {@code i}.
 * Primarily useful for cache checks.
 *
 * @param i The index of the token to retrieve.
 * @return The Protobuf builder for that token.
 */
public CoreNLPProtos.Token.Builder rawToken(int i) {
  return this.tokensBuilders.get(i);
}
/**
 * Get the backing protocol buffer for this sentence.
 *
 * @return The raw backing protocol buffer builder for this sentence.
 */
public CoreNLPProtos.Sentence.Builder rawSentence() {
  return impl;
}
/**
 * Update each token in the sentence with the given information.
 *
 * @param tokens The CoreNLP tokens returned by the {@link edu.stanford.nlp.pipeline.Annotator}.
 * @param setter The function to set a Protobuf builder with the given field value.
 * @param getter The function to read the field from a {@link CoreLabel}.
 * @param <E> The type of the field being copied from the CoreLabels into the protos.
 */
protected <E> void updateTokens(List<CoreLabel> tokens,
                                Consumer<Pair<CoreNLPProtos.Token.Builder, E>> setter,
                                Function<CoreLabel, E> getter) {
  synchronized (impl) {
    for (int index = 0, n = tokens.size(); index < n; index++) {
      E field = getter.apply(tokens.get(index));
      // Null means the annotator produced no value; leave the proto untouched.
      if (field != null) {
        setter.accept(Pair.makePair(tokensBuilders.get(index), field));
      }
    }
  }
}
/**
 * Update the parse tree for this sentence.
 *
 * @param parse The constituency parse tree to store.
 * @param binary The binarized parse tree to store; may be null, in which case it is skipped.
 */
protected void updateParse(
    CoreNLPProtos.ParseTree parse,
    CoreNLPProtos.ParseTree binary) {
  synchronized (impl) {
    impl.setParseTree(parse);
    if (binary == null) {
      return;  // no binarized tree produced
    }
    impl.setBinarizedParseTree(binary);
  }
}
/**
 * Update the dependency graphs of the sentence.
 *
 * @param basic The basic dependencies to store.
 * @param enhanced The enhanced dependencies to store.
 * @param enhancedPlusPlus The enhanced++ dependencies to store.
 */
protected void updateDependencies(CoreNLPProtos.DependencyGraph basic,
                                  CoreNLPProtos.DependencyGraph enhanced,
                                  CoreNLPProtos.DependencyGraph enhancedPlusPlus) {
  synchronized (impl) {
    impl.setBasicDependencies(basic);
    impl.setEnhancedDependencies(enhanced);
    impl.setEnhancedPlusPlusDependencies(enhancedPlusPlus);
  }
}
/**
 * Update the Open IE relation triples for this sentence.
 *
 * @param triples The stream of relation triples to append to the sentence.
 */
protected void updateOpenIE(Stream<CoreNLPProtos.RelationTriple> triples) {
  synchronized (impl) {
    triples.forEach(triple -> impl.addOpenieTriple(triple));
  }
}
/**
 * Update the KBP relation triples for this sentence.
 * (The previous comment said "Open IE", but this method appends to the KBP triple list.)
 *
 * @param triples The stream of relation triples to append to the sentence.
 */
protected void updateKBP(Stream<CoreNLPProtos.RelationTriple> triples) {
synchronized (this.impl) {
triples.forEach(this.impl::addKbpTriple);
}
}
/**
 * Update the sentiment label for this sentence.
 *
 * @param sentiment The sentiment label of the sentence, as produced by the annotator.
 */
protected void updateSentiment(String sentiment) {
  synchronized (impl) {
    impl.setSentiment(sentiment);
  }
}
/**
 * {@inheritDoc}
 *
 * Two sentences are equal iff their backing protos (sentence and every token)
 * build to equal messages. Cheap checks (text, token count) run first to avoid
 * the relatively expensive proto builds in the common unequal case.
 */
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (!(o instanceof Sentence)) return false;
Sentence sentence = (Sentence) o;
// Short circuit for fast equals check
if (impl.hasText() && !impl.getText().equals(sentence.impl.getText())) {
return false;
}
if (this.tokensBuilders.size() != sentence.tokensBuilders.size()) {
return false;
}
// Check the implementation of the sentence
// (build() materializes the proto; this is the authoritative comparison)
if (!impl.build().equals(sentence.impl.build())) {
return false;
}
// Check each token
for (int i = 0, sz = tokensBuilders.size(); i < sz; ++i) {
if (!tokensBuilders.get(i).build().equals(sentence.tokensBuilders.get(i).build())) {
return false;
}
}
return true;
}
/**
 * {@inheritDoc}
 *
 * Consistent with equals(): equal sentences have equal built protos, hence equal
 * text and token counts, so the cheap text-based hash agrees for equal objects.
 */
@Override
public int hashCode() {
if (this.impl.hasText()) {
return this.impl.getText().hashCode() * 31 + this.tokensBuilders.size();
} else {
// No text available; fall back to hashing the (built) proto.
return impl.build().hashCode();
}
}
/** {@inheritDoc} Returns the raw text of the sentence. */
@Override
public String toString() {
return impl.getText();
}
/**
 * The original text covered by a token span, including each token's trailing whitespace.
 *
 * @param start - inclusive token index
 * @param end - exclusive token index
 * @return - the text for the provided token span.
 */
public String substring(int start, int end) {
  return asCoreLabels().subList(start, end).stream()
      .map(token -> token.word() + token.after())
      .collect(Collectors.joining());
}
/**
 * A lazy, read-only view over the token builders: {@code fn} is re-applied on
 * every {@code get}, so results are never cached and the view reflects any
 * later mutation of the underlying builders.
 *
 * @param tokens The token builders to view.
 * @param fn The function mapping a token builder to an element of the view.
 * @param <E> The element type of the resulting list.
 * @return An unmodifiable-by-default {@link AbstractList} view of the tokens.
 */
private static <E> List<E> lazyList(final List<CoreNLPProtos.Token.Builder> tokens, final Function<CoreNLPProtos.Token.Builder,E> fn) {
return new AbstractList<E>() {
@Override
public E get(int index) {
return fn.apply(tokens.get(index));
}
@Override
public int size() {
return tokens.size();
}
};
}
/** Returns the sentence id of the sentence, if one was found. */
public Optional<String> sentenceid() {
  synchronized (impl) {
    return impl.hasSentenceID()
        ? Optional.of(impl.getSentenceID())
        : Optional.empty();
  }
}
/**
 * Apply a TokensRegex pattern to the sentence.
 *
 * @param pattern The TokensRegex pattern to match against.
 * @return True if the pattern matches the entire sentence.
 */
public boolean matches(TokenSequencePattern pattern) {
return pattern.getMatcher(asCoreLabels()).matches();
}
/**
 * Apply a TokensRegex pattern (given as a string) to the sentence.
 *
 * @param pattern The TokensRegex pattern to match against.
 * @return True if the tokensregex pattern matches the entire sentence.
 */
public boolean matches(String pattern) {
  TokenSequencePattern compiled = TokenSequencePattern.compile(pattern);
  return matches(compiled);
}
/**
 * Apply a TokensRegex pattern to the sentence, collecting one result per match.
 *
 * @param pattern The TokensRegex pattern to match against.
 * @param fn The action to apply to each match.
 * @param <T> The type produced by {@code fn} for each match.
 * @return The list of matches, after being run through the function.
 */
public <T> List<T> find(TokenSequencePattern pattern, Function<TokenSequenceMatcher, T> fn) {
  List<T> results = new ArrayList<>();
  TokenSequenceMatcher matcher = pattern.matcher(asCoreLabels());
  while (matcher.find()) {
    results.add(fn.apply(matcher));
  }
  return results;
}
/**
 * Apply a TokensRegex pattern (given as a string) to the sentence.
 *
 * @param pattern The TokensRegex pattern to match against, compiled on each call.
 * @param fn The action to apply to each match.
 * @return The list of matches, after being run through the function.
 * @see Sentence#find(TokenSequencePattern, Function)
 */
public <T> List<T> find(String pattern, Function<TokenSequenceMatcher, T> fn) {
return find(TokenSequencePattern.compile(pattern), fn);
}
/**
 * Apply a semgrex pattern to this sentence's dependency graph, collecting one
 * result per matching node.
 *
 * @param pattern The Semgrex pattern to match against.
 * @param fn The action to apply to each match.
 * @param <T> The type produced by {@code fn} for each match.
 * @return The list of matches, after being run through the function.
 */
public <T> List<T> semgrex(SemgrexPattern pattern, Function<SemgrexMatcher, T> fn) {
  List<T> results = new ArrayList<>();
  SemgrexMatcher matcher = pattern.matcher(dependencyGraph());
  while (matcher.findNextMatchingNode()) {
    results.add(fn.apply(matcher));
  }
  return results;
}
/**
 * Apply a semgrex pattern (given as a string) to this sentence's dependency graph.
 *
 * @param pattern The Semgrex pattern to match against, compiled on each call.
 * @param fn The action to apply to each match.
 * @param <T> The type produced by {@code fn} for each match.
 * @return The list of matches, after being run through the function.
 */
public <T> List<T> semgrex(String pattern, Function<SemgrexMatcher, T> fn) {
  SemgrexPattern compiled = SemgrexPattern.compile(pattern);
  return semgrex(compiled, fn);
}
}