package edu.stanford.nlp.simple; import edu.stanford.nlp.coref.data.CorefChain; import edu.stanford.nlp.ie.util.RelationTriple; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; import edu.stanford.nlp.naturalli.OperatorSpec; import edu.stanford.nlp.naturalli.Polarity; import edu.stanford.nlp.naturalli.SentenceFragment; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.Annotator; import edu.stanford.nlp.pipeline.CoreNLPProtos; import edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; import edu.stanford.nlp.semgraph.SemanticGraphFactory; import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher; import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.util.*; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.*; import java.util.function.BiFunction; import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; /** * A representation of a single Sentence. * Although it is possible to create a sentence directly from text, it is advisable to * create a document instead and operate on the document directly. * * @author Gabor Angeli */ @SuppressWarnings({"UnusedDeclaration", "WeakerAccess"}) public class Sentence { /** A Properties object for creating a document from a single sentence. 
Used in the constructor {@link Sentence#Sentence(String)} */
  static Properties SINGLE_SENTENCE_DOCUMENT = PropertiesUtils.asProperties(
      "language", "english",
      "ssplit.isOneSentence", "true",
      "tokenize.class", "PTBTokenizer",
      "tokenize.language", "en",
      "mention.type", "dep",
      "coref.mode", "statistical",  // Use the new coref
      "coref.md.type", "dep"
  );

  /**
   * A Properties object for creating a document from a single tokenized sentence.
   * Passed by {@link Sentence#Sentence(List)}. The {@code (Document, Properties)} constructor
   * compares its argument against this instance by identity and, on a match, stores
   * {@link #SINGLE_SENTENCE_DOCUMENT} as the default properties instead.
   */
  private static Properties SINGLE_SENTENCE_TOKENIZED_DOCUMENT = PropertiesUtils.asProperties(
      "language", "english",
      "ssplit.isOneSentence", "true",
      "tokenize.class", "WhitespaceTokenizer",
      "tokenize.language", "en",
      "tokenize.whitespace", "true",
      "mention.type", "dep",
      "coref.mode", "statistical",  // Use the new coref
      "coref.md.type", "dep"
  );  // redundant?

  /**
   * The protobuf representation of a Sentence.
   * Note that this does not necessarily have up to date token information:
   * {@link #serialize()} replaces its token list with the contents of
   * {@code tokensBuilders} before building.
   * Accessors in this class synchronize on this object when reading it.
   */
  private final CoreNLPProtos.Sentence.Builder impl;

  /** The protobuf representation of the tokens of a sentence. This has up-to-date information on the tokens */
  private final List<CoreNLPProtos.Token.Builder> tokensBuilders;

  /** The document this sentence is derived from */
  public final Document document;

  /** The default properties to use for annotators when a method is called without explicit properties. */
  private final Properties defaultProps;

  /**
   * The function to use to create a new document from (properties, text).
   * This is used for the cased() and caseless() functions.
   */
  private final BiFunction<Properties, String, Document> docFn;

  /**
   * Create a new sentence, using the specified properties as the default properties.
   * @param doc The document to link this sentence to.
   * @param props The properties to use for tokenizing the sentence.
*/ protected Sentence(Document doc, Properties props) { // Set document this.document = doc; // Set sentence if (props.containsKey("ssplit.isOneSentence")) { this.impl = this.document.sentence(0, props).impl; } else { Properties modProps = new Properties(props); modProps.setProperty("ssplit.isOneSentence", "true"); this.impl = this.document.sentence(0, modProps).impl; } // Set tokens this.tokensBuilders = document.sentence(0).tokensBuilders; // Asserts assert (this.document.sentence(0).impl == this.impl); assert (this.document.sentence(0).tokensBuilders == this.tokensBuilders); // Set the default properties if (props == SINGLE_SENTENCE_TOKENIZED_DOCUMENT) { this.defaultProps = SINGLE_SENTENCE_DOCUMENT; // no longer care about tokenization } else { this.defaultProps = props; } this.docFn = Document::new; } /** * Create a new sentence from some text, and some properties. * @param text The text of the sentence. * @param props The properties to use for the annotators. */ public Sentence(String text, Properties props) { this(new Document(props, text), props); } /** * Create a new sentence from the given text, assuming the entire text is just one sentence. * @param text The text of the sentence. 
*/ public Sentence(String text) { this(text, SINGLE_SENTENCE_DOCUMENT); } /** The actual implementation of a tokenized sentence constructor */ protected Sentence(Function<String, Document> doc, List<String> tokens, Properties props) { this(doc.apply(StringUtils.join(tokens.stream().map(x -> x.replace(' ', 'ߝ' /* some random character */)), " ")), props); // Clean up whitespace for (int i = 0; i < impl.getTokenCount(); ++i) { this.impl.getTokenBuilder(i).setWord(this.impl.getTokenBuilder(i).getWord().replace('ߝ', ' ')); this.impl.getTokenBuilder(i).setValue(this.impl.getTokenBuilder(i).getValue().replace('ߝ', ' ')); this.tokensBuilders.get(i).setWord(this.tokensBuilders.get(i).getWord().replace('ߝ', ' ')); this.tokensBuilders.get(i).setValue(this.tokensBuilders.get(i).getValue().replace('ߝ', ' ')); } } /** * Create a new sentence from the given tokenized text, assuming the entire text is just one sentence. * WARNING: This method may in rare cases (mostly when tokens themselves have whitespace in them) * produce strange results; it's a bit of a hack around the default tokenizer. * * @param tokens The text of the sentence. */ public Sentence(List<String> tokens) { this(Document::new, tokens, SINGLE_SENTENCE_TOKENIZED_DOCUMENT); } /** * Create a sentence from a saved protocol buffer. 
 */
  protected Sentence(BiFunction<Properties, String, Document> docFn, CoreNLPProtos.Sentence proto, Properties props) {
    // Work on a builder copy of the saved message.
    this.impl = proto.toBuilder();
    // Set tokens: one builder per token of the message.
    tokensBuilders = new ArrayList<>(this.impl.getTokenCount());
    for (int i = 0; i < this.impl.getTokenCount(); ++i) {
      tokensBuilders.add(this.impl.getToken(i).toBuilder());
    }
    // Initialize document: a new document is created from the sentence text, and this object
    // is registered as its only sentence.
    this.document = docFn.apply(props, proto.getText());
    this.document.forceSentences(Collections.singletonList(this));
    // Asserts: the document's first sentence must be backed by the builders created above.
    assert (this.document.sentence(0).impl == this.impl);
    assert (this.document.sentence(0).tokensBuilders == this.tokensBuilders);
    // Set default props
    this.defaultProps = props;
    this.docFn = docFn;
  }

  /**
   * Create a sentence from a saved protocol buffer.
   * Uses the {@link Document} constructor as the document factory and
   * {@code SINGLE_SENTENCE_DOCUMENT} as the default properties.
   *
   * @param proto The saved sentence.
   */
  public Sentence(CoreNLPProtos.Sentence proto) {
    this(Document::new, proto, SINGLE_SENTENCE_DOCUMENT);
  }

  /**
   * Helper for creating a sentence from a document at a given index.
   * The new object shares the protobuf builder, the token builders and the document factory
   * of the sentence already stored in the document at that index; its default properties are
   * {@code Document.EMPTY_PROPS}.
   *
   * @param doc The document to link this sentence to.
   * @param sentenceIndex The index of the sentence within {@code doc}.
   */
  protected Sentence(Document doc, int sentenceIndex) {
    this.document = doc;
    this.impl = doc.sentence(sentenceIndex).impl;
    // Set tokens
    this.tokensBuilders = doc.sentence(sentenceIndex).tokensBuilders;
    // Asserts
    assert (this.document.sentence(sentenceIndex).impl == this.impl);
    assert (this.document.sentence(sentenceIndex).tokensBuilders == this.tokensBuilders);
    // Set default props
    this.defaultProps = Document.EMPTY_PROPS;
    this.docFn = doc.sentence(sentenceIndex).docFn;
  }

  /**
   * The canonical constructor of a sentence from a {@link edu.stanford.nlp.simple.Document}.
   * @param doc The document to link this sentence to.
   * @param proto The sentence implementation to use for this sentence.
 */
  protected Sentence(Document doc, CoreNLPProtos.Sentence.Builder proto, Properties defaultProps) {
    this.document = doc;
    // The given builder is used directly (not copied) as the backing implementation.
    this.impl = proto;
    this.defaultProps = defaultProps;
    // Set tokens
    // This is the _only_ place we are allowed to construct tokens builders
    tokensBuilders = new ArrayList<>(this.impl.getTokenCount());
    for (int i = 0; i < this.impl.getTokenCount(); ++i) {
      tokensBuilders.add(this.impl.getToken(i).toBuilder());
    }
    // New documents are created reflectively from the runtime class name of doc,
    // passing (props, text) to MetaClass#createInstance.
    this.docFn = (props, text) -> MetaClass.create(doc.getClass().getName()).createInstance(props, text);
  }

  /**
   * Also sets the text of the sentence. Used by {@link Document} internally
   *
   * @param doc The document to link this sentence to.
   * @param proto The sentence implementation to use for this sentence.
   * @param text The text for the sentence
   * @param defaultProps The default properties to use when annotating this sentence.
   */
  Sentence(Document doc, CoreNLPProtos.Sentence.Builder proto, String text, Properties defaultProps) {
    this(doc, proto, defaultProps);
    this.impl.setText(text);
  }

  /**
   * Helper for creating a sentence from a document and a CoreMap representation.
   * The new object shares the builders of the document's first sentence and uses
   * {@code Document.EMPTY_PROPS} as its default properties.
   *
   * @param doc The document to link this sentence to; asserted to contain at least one sentence.
   * @param sentence Not read by this constructor.
   */
  protected Sentence(Document doc, CoreMap sentence) {
    this.document = doc;
    assert ! doc.sentences().isEmpty();
    this.impl = doc.sentence(0).impl;
    this.tokensBuilders = doc.sentence(0).tokensBuilders;
    this.defaultProps = Document.EMPTY_PROPS;
    // New documents are created reflectively from the runtime class name of doc.
    this.docFn = (props, text) -> MetaClass.create(doc.getClass().getName()).createInstance(props, text);
  }

  /**
   * Convert a CoreMap into a simple Sentence object.
   * Note that this is a copy operation -- the implementing CoreMap will not be updated, and all of its
   * contents are copied over to the protocol buffer format backing the {@link Sentence} object.
   *
   * @param sentence The CoreMap representation of the sentence.
 */
  public Sentence(CoreMap sentence) {
    // Wrap the CoreMap in a one-sentence Annotation: the annotation text is the sentence text,
    // its sentence list contains only this CoreMap, and the document id is carried over when present.
    this(new Document(new Annotation(sentence.get(CoreAnnotations.TextAnnotation.class)) {{
      set(CoreAnnotations.SentencesAnnotation.class, Collections.singletonList(sentence));
      if (sentence.containsKey(CoreAnnotations.DocIDAnnotation.class)) {
        set(CoreAnnotations.DocIDAnnotation.class, sentence.get(CoreAnnotations.DocIDAnnotation.class));
      }
    }}), sentence);
  }

  /**
   * <p>
   * Convert a sentence fragment (i.e., entailed sentence) into a simple sentence object.
   * Like {@link Sentence#Sentence(CoreMap)}, this copies the information in the fragment into the underlying
   * protobuf backed format.
   * </p>
   *
   * <p>
   * The fragment's single parse tree is stored as the basic, enhanced and enhanced++
   * dependency graph alike; the text is the original text of the words joined by single spaces.
   * </p>
   *
   * @param sentence The sentence fragment to convert.
   */
  public Sentence(SentenceFragment sentence) {
    this(new ArrayCoreMap(32) {{
      set(CoreAnnotations.TokensAnnotation.class, sentence.words);
      set(CoreAnnotations.TextAnnotation.class, StringUtils.join(sentence.words.stream().map(CoreLabel::originalText), " "));
      if (sentence.words.isEmpty()) {
        // No tokens: use an empty [0, 0) token span.
        set(CoreAnnotations.TokenBeginAnnotation.class, 0);
        set(CoreAnnotations.TokenEndAnnotation.class, 0);
      } else {
        // Token span runs from the index annotation of the first word to one past that of the last word.
        set(CoreAnnotations.TokenBeginAnnotation.class, sentence.words.get(0).get(CoreAnnotations.IndexAnnotation.class));
        set(CoreAnnotations.TokenEndAnnotation.class, sentence.words.get(sentence.words.size() - 1).get(CoreAnnotations.IndexAnnotation.class) + 1);
      }
      set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, sentence.parseTree);
      set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, sentence.parseTree);
      set(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class, sentence.parseTree);
    }});
  }

  /**
   * Make this sentence caseless. That is, from now on, run the caseless models
   * on the sentence by default rather than the standard CoreNLP models.
   * This object is not modified: the current protobuf state is built and wrapped in a new
   * sentence whose default properties are {@code Document.CASELESS_PROPS}.
   *
   * @return A new sentence with the default properties swapped out.
   */
  public Sentence caseless() {
    return new Sentence(this.docFn, impl.build(), Document.CASELESS_PROPS);
  }

  /**
   * Make this sentence case sensitive.
* A sentence is case sensitive by default; this only has an effect if you have previously * called {@link Sentence#caseless()}. * * @return A new sentence with the default properties swapped out. */ public Sentence cased() { return new Sentence(this.docFn, impl.build(), Document.EMPTY_PROPS); } /** * Serialize the given sentence (but not the associated document!) into a Protocol Buffer. * @return The Protocol Buffer representing this sentence. */ public CoreNLPProtos.Sentence serialize() { synchronized (impl) { this.impl.clearToken(); for (CoreNLPProtos.Token.Builder token : this.tokensBuilders) { this.impl.addToken(token.build()); } return impl.build(); } } /** * Write this sentence to an output stream. * Internally, this stores the sentence as a protocol buffer, and saves that buffer to the output stream. * This method does not close the stream after writing. * * @param out The output stream to write to. The stream is not closed after the method returns. * @throws IOException Thrown from the underlying write() implementation. */ public void serialize(OutputStream out) throws IOException { serialize().writeDelimitedTo(out); out.flush(); } /** * Read a sentence from an input stream. * This does not close the input stream. * * @param in The input stream to deserialize from. * @return The next sentence encoded in the input stream. * @throws IOException Thrown by the underlying parse() implementation. * * @see Document#serialize(java.io.OutputStream) */ public static Sentence deserialize(InputStream in) throws IOException { return new Sentence(CoreNLPProtos.Sentence.parseDelimitedFrom(in)); } /** * Return a class that can perform common algorithms on this sentence. */ public SentenceAlgorithms algorithms() { return new SentenceAlgorithms(this); } /** The raw text of the sentence, as input by, e.g., {@link Sentence#Sentence(String)}. 
 */
  public String text() {
    synchronized (impl) {
      return impl.getText();
    }
  }

  //
  // SET AXIOMATICALLY
  //

  /** The index of the sentence within the document. */
  public int sentenceIndex() {
    synchronized (impl) {
      return impl.getSentenceIndex();
    }
  }

  /** The token offset of the sentence within the document. */
  public int sentenceTokenOffsetBegin() {
    synchronized (impl) {
      return impl.getTokenOffsetBegin();
    }
  }

  /** The token offset of the end of this sentence within the document. */
  public int sentenceTokenOffsetEnd() {
    synchronized (impl) {
      return impl.getTokenOffsetEnd();
    }
  }

  //
  // SET BY TOKENIZER
  //

  /** The words of the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#word()}. */
  public List<String> words() {
    synchronized (impl) {
      return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getWord);
    }
  }

  /** The word at the given (zero-based) index of the sentence. @see Sentence#words() */
  public String word(int index) {
    return words().get(index);
  }

  /** The original (unprocessed) words of the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#originalText()}. */
  public List<String> originalTexts() {
    synchronized (impl) {
      return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getOriginalText);
    }
  }

  /** The original word at the given (zero-based) index. @see Sentence#originalTexts() */
  public String originalText(int index) {
    return originalTexts().get(index);
  }

  /** The character offset of each token in the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#beginPosition()}. */
  public List<Integer> characterOffsetBegin() {
    synchronized (impl) {
      return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getBeginChar);
    }
  }

  /** The begin character offset of the token at the given index in the sentence. @see Sentence#characterOffsetBegin(). */
  public int characterOffsetBegin(int index) {
    return characterOffsetBegin().get(index);
  }

  /** The end character offset of each token in the sentence, as per {@link edu.stanford.nlp.ling.CoreLabel#endPosition()}.
*/ public List<Integer> characterOffsetEnd() { synchronized (impl) { return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getEndChar); } } /** The end character offset of the given index in the sentence. @see Sentence#characterOffsetEnd(). */ public int characterOffsetEnd(int index) { return characterOffsetEnd().get(index); } /** The whitespace before each token in the sentence. This will match {@link #after()} of the previous token. */ public List<String> before() { synchronized (impl) { return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getBefore); } } /** The whitespace before this token in the sentence. This will match {@link #after()} of the previous token. */ public String before(int index) { return before().get(index); } /** The whitespace after each token in the sentence. This will match {@link #before()} of the next token. */ public List<String> after() { synchronized (impl) { return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getAfter); } } /** The whitespace after this token in the sentence. This will match {@link #before()} of the next token. */ public String after(int index) { return after().get(index); } /** The tokens in this sentence. Each token class is just a helper for the methods in this class. */ public List<Token> tokens() { ArrayList<Token> tokens = new ArrayList<>(this.length()); for (int i = 0; i < length(); ++i) { tokens.add(new Token(this, i)); } return tokens; } // // SET BY ANNOTATORS // /** * The part of speech tags of the sentence. * @param props The properties to use for the {@link edu.stanford.nlp.pipeline.POSTaggerAnnotator}. * @return A list of part of speech tags, one for each token in the sentence. 
 */
  public List<String> posTags(Properties props) {
    // Ask the document to run POS tagging with these properties before reading the tags.
    document.runPOS(props);
    synchronized (impl) {
      return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getPos);
    }
  }

  /** Same as {@link Sentence#posTags(java.util.Properties)}, using this sentence's default properties. */
  public List<String> posTags() {
    return posTags(this.defaultProps);
  }

  /** The part of speech tag of the token at the given index. @see Sentence#posTags(java.util.Properties) */
  public String posTag(int index) {
    return posTags().get(index);
  }

  /**
   * The lemmas of the sentence.
   * @param props The properties to use for the {@link edu.stanford.nlp.pipeline.MorphaAnnotator}.
   * @return A list of lemmatized words, one for each token in the sentence.
   */
  public List<String> lemmas(Properties props) {
    // Ask the document to run lemmatization with these properties before reading the lemmas.
    document.runLemma(props);
    synchronized (impl) {
      return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getLemma);
    }
  }

  /** Same as {@link Sentence#lemmas(java.util.Properties)}, using this sentence's default properties. */
  public List<String> lemmas() {
    return lemmas(this.defaultProps);
  }

  /** The lemma of the token at the given index. @see Sentence#lemmas(java.util.Properties) */
  public String lemma(int index) {
    return lemmas().get(index);
  }

  /**
   * The named entity tags of the sentence.
   * @param props The properties to use for the {@link edu.stanford.nlp.pipeline.NERCombinerAnnotator}.
   * @return A list of named entity tags, one for each token in the sentence.
   */
  public List<String> nerTags(Properties props) {
    // Ask the document to run NER with these properties before reading the tags.
    document.runNER(props);
    synchronized (impl) {
      return lazyList(tokensBuilders, CoreNLPProtos.Token.Builder::getNer);
    }
  }

  /** Same as {@link Sentence#nerTags(java.util.Properties)}, using this sentence's default properties. */
  public List<String> nerTags() {
    return nerTags(this.defaultProps);
  }

  /**
   * Run RegexNER over this sentence. Note that this is an in place operation, and simply
   * updates the NER tags.
   * Therefore, every time this function is called, it re-runs the annotator!
   *
   * @param mappingFile The regexner mapping file.
   * @param ignorecase If true, run a caseless match on the regexner file.
* */ public void regexner(String mappingFile, boolean ignorecase) { Properties props = new Properties(); for (Object prop : this.defaultProps.keySet()) { props.setProperty(prop.toString(), this.defaultProps.getProperty(prop.toString())); } props.setProperty(Annotator.STANFORD_REGEXNER + ".mapping", mappingFile); props.setProperty(Annotator.STANFORD_REGEXNER + ".ignorecase", Boolean.toString(ignorecase)); this.document.runRegexner(props); } /** @see Sentence#nerTags(java.util.Properties) */ public String nerTag(int index) { return nerTags().get(index); } /** * Get all mentions of the given NER tag, as a list of surface forms. * @param nerTag The ner tag to search for, case sensitive. * @return A list of surface forms of the entities of this tag. This is using the {@link Sentence#word(int)} function. */ public List<String> mentions(String nerTag) { List<String> mentionsOfTag = new ArrayList<>(); StringBuilder lastMention = new StringBuilder(); String lastTag = "O"; for (int i = 0; i < length(); ++i) { String ner = nerTag(i); if (ner.equals(nerTag) && !lastTag.equals(nerTag)) { // case: beginning of span lastMention.append(word(i)).append(' '); } else if (ner.equals(nerTag) && lastTag.equals(nerTag)) { // case: in span lastMention.append(word(i)).append(' '); } else if (!ner.equals(nerTag) && lastTag.equals(nerTag)) { // case: end of span if (lastMention.length() > 0) { mentionsOfTag.add(lastMention.toString().trim()); } lastMention.setLength(0); } lastTag = ner; } if (lastMention.length() > 0) { mentionsOfTag.add(lastMention.toString().trim()); } return mentionsOfTag; } /** * Get all mentions of any NER tag, as a list of surface forms. * @return A list of surface forms of the entities in this sentence. This is using the {@link Sentence#word(int)} function. 
*/ public List<String> mentions() { List<String> mentionsOfTag = new ArrayList<>(); StringBuilder lastMention = new StringBuilder(); String lastTag = "O"; for (int i = 0; i < length(); ++i) { String ner = nerTag(i); if (!ner.equals("O") && !lastTag.equals(ner)) { // case: beginning of span if (lastMention.length() > 0) { mentionsOfTag.add(lastMention.toString().trim()); } lastMention.setLength(0); lastMention.append(word(i)).append(' '); } else if (!ner.equals("O") && lastTag.equals(ner)) { // case: in span lastMention.append(word(i)).append(' '); } else if (ner.equals("O") && !lastTag.equals("O")) { // case: end of span if (lastMention.length() > 0) { mentionsOfTag.add(lastMention.toString().trim()); } lastMention.setLength(0); } lastTag = ner; } if (lastMention.length() > 0) { mentionsOfTag.add(lastMention.toString().trim()); } return mentionsOfTag; } /** * Returns the constituency parse of this sentence. * * @param props The properties to use in the parser annotator. * @return A parse tree object. */ public Tree parse(Properties props) { document.runParse(props); synchronized (document.serializer) { return document.serializer.fromProto(impl.getParseTree()); } } /** @see Sentence#parse(java.util.Properties) */ public Tree parse() { return parse(this.defaultProps); } /** An internal helper to get the dependency tree of the given type. */ private CoreNLPProtos.DependencyGraph dependencies(SemanticGraphFactory.Mode mode) { switch (mode) { case BASIC: return impl.getBasicDependencies(); case ENHANCED: return impl.getEnhancedDependencies(); case ENHANCED_PLUS_PLUS: return impl.getEnhancedPlusPlusDependencies(); default: throw new IllegalArgumentException("Unsupported dependency type: " + mode); } } /** * Returns the governor of the given index, according to the passed dependency type. * The root has index -1. * * @param props The properties to use in the parser annotator. * @param index The index of the dependent word ZERO INDEXED. 
That is, the first word of the sentence * is index 0, not 1 as it would be in the {@link edu.stanford.nlp.semgraph.SemanticGraph} framework. * @param mode The type of dependency to use (e.g., basic, collapsed, collapsed cc processed). * @return The index of the governor, if one exists. A value of -1 indicates the root node. */ public Optional<Integer> governor(Properties props, int index, SemanticGraphFactory.Mode mode) { document.runDepparse(props); for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) { if (edge.getTarget() - 1 == index) { return Optional.of(edge.getSource() - 1); } } for (int root : impl.getBasicDependencies().getRootList()) { if (index == root - 1) { return Optional.of(-1); } } return Optional.empty(); } /** @see Sentence#governor(java.util.Properties, int, SemanticGraphFactory.Mode) */ public Optional<Integer> governor(Properties props, int index) { return governor(props, index, SemanticGraphFactory.Mode.ENHANCED); } /** @see Sentence#governor(java.util.Properties, int, SemanticGraphFactory.Mode) */ public Optional<Integer> governor(int index, SemanticGraphFactory.Mode mode) { return governor(this.defaultProps, index, mode); } /** @see Sentence#governor(java.util.Properties, int) */ public Optional<Integer> governor(int index) { return governor(this.defaultProps, index); } /** * Returns the governors of a sentence, according to the passed dependency type. * The resulting list is of the same size as the original sentence, with each element being either * the governor (index), or empty if the node has no known governor. * The root has index -1. * * @param props The properties to use in the parser annotator. * @param mode The type of dependency to use (e.g., basic, collapsed, collapsed cc processed). * @return A list of the (optional) governors of each token in the sentence. 
*/ public List<Optional<Integer>> governors(Properties props, SemanticGraphFactory.Mode mode) { document.runDepparse(props); List<Optional<Integer>> governors = new ArrayList<>(this.length()); for (int i = 0; i < this.length(); ++i) { governors.add(Optional.empty()); } for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) { governors.set(edge.getTarget() - 1, Optional.of(edge.getSource() - 1)); } for (int root : impl.getBasicDependencies().getRootList()) { governors.set(root - 1, Optional.of(-1)); } return governors; } /** @see Sentence#governors(java.util.Properties, SemanticGraphFactory.Mode) */ public List<Optional<Integer>> governors(Properties props) { return governors(props, SemanticGraphFactory.Mode.ENHANCED); } /** @see Sentence#governors(java.util.Properties, SemanticGraphFactory.Mode) */ public List<Optional<Integer>> governors(SemanticGraphFactory.Mode mode) { return governors(this.defaultProps, mode); } /** @see Sentence#governors(java.util.Properties, SemanticGraphFactory.Mode) */ public List<Optional<Integer>> governors() { return governors(this.defaultProps, SemanticGraphFactory.Mode.ENHANCED); } /** * Returns the incoming dependency label to a particular index, according to the Basic Dependencies. * * @param props The properties to use in the parser annotator. * @param index The index of the dependent word ZERO INDEXED. That is, the first word of the sentence * is index 0, not 1 as it would be in the {@link edu.stanford.nlp.semgraph.SemanticGraph} framework. * @param mode The type of dependency to use (e.g., basic, collapsed, collapsed cc processed). * @return The incoming dependency label, if it exists. 
*/ public Optional<String> incomingDependencyLabel(Properties props, int index, SemanticGraphFactory.Mode mode) { document.runDepparse(props); for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) { if (edge.getTarget() - 1 == index) { return Optional.of(edge.getDep()); } } for (int root : impl.getBasicDependencies().getRootList()) { if (index == root - 1) { return Optional.of("root"); } } return Optional.empty(); } /** @see Sentence#incomingDependencyLabel(java.util.Properties, int, SemanticGraphFactory.Mode) */ public Optional<String> incomingDependencyLabel(Properties props, int index) { return incomingDependencyLabel(props, index, SemanticGraphFactory.Mode.ENHANCED); } /** @see Sentence#incomingDependencyLabel(java.util.Properties, int, SemanticGraphFactory.Mode) */ public Optional<String> incomingDependencyLabel(int index, SemanticGraphFactory.Mode mode) { return incomingDependencyLabel(this.defaultProps, index, mode); } /** @see Sentence#incomingDependencyLabel(java.util.Properties, int) */ public Optional<String> incomingDependencyLabel(int index) { return incomingDependencyLabel(this.defaultProps, index); } /** @see Sentence#incomingDependencyLabel(java.util.Properties, int) */ public List<Optional<String>> incomingDependencyLabels(Properties props, SemanticGraphFactory.Mode mode) { document.runDepparse(props); List<Optional<String>> labels = new ArrayList<>(this.length()); for (int i = 0; i < this.length(); ++i) { labels.add(Optional.empty()); } for (CoreNLPProtos.DependencyGraph.Edge edge : dependencies(mode).getEdgeList()) { labels.set(edge.getTarget() - 1, Optional.of(edge.getDep())); } for (int root : impl.getBasicDependencies().getRootList()) { labels.set(root - 1, Optional.of("root")); } return labels; } /** @see Sentence#incomingDependencyLabels(java.util.Properties, SemanticGraphFactory.Mode) */ public List<Optional<String>> incomingDependencyLabels(SemanticGraphFactory.Mode mode) { return 
incomingDependencyLabels(this.defaultProps, mode); } /** @see Sentence#incomingDependencyLabels(java.util.Properties, SemanticGraphFactory.Mode) */ public List<Optional<String>> incomingDependencyLabels(Properties props) { return incomingDependencyLabels(props, SemanticGraphFactory.Mode.ENHANCED); } /** @see Sentence#incomingDependencyLabels(java.util.Properties, SemanticGraphFactory.Mode) */ public List<Optional<String>> incomingDependencyLabels() { return incomingDependencyLabels(this.defaultProps, SemanticGraphFactory.Mode.ENHANCED); } /** * Returns the dependency graph of the sentence, as a raw {@link SemanticGraph} object. * Note that this method is slower than you may expect, as it has to convert the underlying protocol * buffer back into a list of CoreLabels with which to populate the {@link SemanticGraph}. * * @param props The properties to use for running the dependency parser annotator. * @param mode The type of graph to return (e.g., basic, collapsed, etc). * * @return The dependency graph of the sentence. 
*/ public SemanticGraph dependencyGraph(Properties props, SemanticGraphFactory.Mode mode) { document.runDepparse(props); return ProtobufAnnotationSerializer.fromProto(dependencies(mode), asCoreLabels(), document.docid().orElse(null)); } /** @see Sentence#dependencyGraph(Properties, SemanticGraphFactory.Mode) */ public SemanticGraph dependencyGraph(Properties props) { return dependencyGraph(props, SemanticGraphFactory.Mode.ENHANCED); } /** @see Sentence#dependencyGraph(Properties, SemanticGraphFactory.Mode) */ public SemanticGraph dependencyGraph() { return dependencyGraph(this.defaultProps, SemanticGraphFactory.Mode.ENHANCED); } /** @see Sentence#dependencyGraph(Properties, SemanticGraphFactory.Mode) */ public SemanticGraph dependencyGraph(SemanticGraphFactory.Mode mode) { return dependencyGraph(this.defaultProps, mode); } /** The length of the sentence, in tokens */ public int length() { return impl.getTokenCount(); } /** * Get a list of the (possible) Natural Logic operators on each node of the sentence. * At each index, the list contains an operator spec if that index is the head word of an operator in the * sentence. * * @param props The properties to pass to the natural logic annotator. * @return A list of Optionals, where each element corresponds to a token in the sentence, and the optional is nonempty * if that index is an operator. */ public List<Optional<OperatorSpec>> operators(Properties props) { document.runNatlog(props); synchronized (impl) { return lazyList(tokensBuilders, x -> x.hasOperator() ? 
Optional.of(ProtobufAnnotationSerializer.fromProto(x.getOperator())) : Optional.empty()); } } /** @see Sentence#operators(Properties) */ public List<Optional<OperatorSpec>> operators() { return operators(this.defaultProps); } /** @see Sentence#operators(Properties) */ public Optional<OperatorSpec> operatorAt(Properties props, int i) { return operators(props).get(i); } /** @see Sentence#operators(Properties) */ public Optional<OperatorSpec> operatorAt(int i) { return operators(this.defaultProps).get(i); } /** * Returns the list of non-empty Natural Logic operator specifications. * This amounts to the actual list of operators in the sentence. * Note that the spans of the operators can be retrieved with * {@link OperatorSpec#quantifierBegin} and * {@link OperatorSpec#quantifierEnd}. * * @param props The properties to use for the natlog annotator. * @return A list of operators in the sentence. */ public List<OperatorSpec> operatorsNonempty(Properties props) { return operators(props).stream().filter(Optional::isPresent).map(Optional::get).collect(Collectors.toList()); } /** @see Sentence#operatorsNonempty(Properties) */ public List<OperatorSpec> operatorsNonempty() { return operatorsNonempty(this.defaultProps); } /** * The Natural Logic notion of polarity for each token in a sentence. * @param props The properties to use for the natural logic annotator. * @return A list of Polarity objects, one for each token of the sentence. */ public List<Polarity> natlogPolarities(Properties props) { document.runNatlog(props); synchronized (impl) { return lazyList(tokensBuilders, x -> ProtobufAnnotationSerializer.fromProto(x.getPolarity())); } } /** @see Sentence#natlogPolarities(Properties) */ public List<Polarity> natlogPolarities() { return natlogPolarities(this.defaultProps); } /** * Get the polarity (the Natural Logic notion of polarity) for a given token in the sentence. * @param props The properties to use for the natural logic annotator. 
* @param index The index to return the polarity of. * @return A list of Polarity objects, one for each token of the sentence. */ public Polarity natlogPolarity(Properties props, int index) { document.runNatlog(props); synchronized (impl) { return ProtobufAnnotationSerializer.fromProto(tokensBuilders.get(index).getPolarity()); } } /** @see Sentence#natlogPolarity(Properties, int) */ public Polarity natlogPolarity(int index) { return natlogPolarity(this.defaultProps, index); } /** * Get the OpenIE triples associated with this sentence. * Note that this function may be slower than you would expect, as it has to * convert the underlying Protobuf representation back into {@link CoreLabel}s. * * @param props The properties to use for the OpenIE annotator. * @return A collection of {@link RelationTriple} objects representing the OpenIE triples in the sentence. */ public Collection<RelationTriple> openieTriples(Properties props) { document.runOpenie(props); synchronized (impl) { List<CoreLabel> tokens = asCoreLabels(); Annotation doc = document.asAnnotation(); return impl.getOpenieTripleList().stream().map(x -> ProtobufAnnotationSerializer.fromProto(x, doc, this.sentenceIndex())).collect(Collectors.toList()); } } /** @see Sentence@openieTriples(Properties) */ public Collection<RelationTriple> openieTriples() { return openieTriples(this.defaultProps); } /** * Get a list of Open IE triples as flat (subject, relation, object, confidence) quadruples. * This is substantially faster than returning {@link RelationTriple} objects, as it doesn't * require converting the underlying representation into {@link CoreLabel}s; but, it also contains * significantly less information about the sentence. 
* * @see Sentence@openieTriples(Properties) */ public Collection<Quadruple<String, String, String, Double>> openie() { document.runOpenie(this.defaultProps); return impl.getOpenieTripleList().stream() .filter(proto -> proto.hasSubject() && proto.hasRelation() && proto.hasObject()) .map(proto -> Quadruple.makeQuadruple(proto.getSubject(), proto.getRelation(), proto.getObject(), proto.hasConfidence() ? proto.getConfidence() : 1.0)) .collect(Collectors.toList()); } /** * Get the KBP triples associated with this sentence. * Note that this function may be slower than you would expect, as it has to * convert the underlying Protobuf representation back into {@link CoreLabel}s. * * @param props The properties to use for the KBP annotator. * @return A collection of {@link RelationTriple} objects representing the KBP triples in the sentence. */ public Collection<RelationTriple> kbpTriples(Properties props) { document.runKBP(props); synchronized (impl) { List<CoreLabel> tokens = asCoreLabels(); Annotation doc = document.asAnnotation(); return impl.getKbpTripleList().stream().map(x -> ProtobufAnnotationSerializer.fromProto(x, doc, this.sentenceIndex())).collect(Collectors.toList()); } } /** @see Sentence@kbpTriples(Properties) */ public Collection<RelationTriple> kbpTriples() { return kbpTriples(this.defaultProps); } /** * Get a list of KBP triples as flat (subject, relation, object, confidence) quadruples. * This is substantially faster than returning {@link RelationTriple} objects, as it doesn't * require converting the underlying representation into {@link CoreLabel}s; but, it also contains * significantly less information about the sentence. 
* * @see Sentence@kbpTriples(Properties) */ public Collection<Quadruple<String, String, String, Double>> kbp() { document.runKBP(this.defaultProps); return impl.getKbpTripleList().stream() .filter(proto -> proto.hasSubject() && proto.hasRelation() && proto.hasObject()) .map(proto -> Quadruple.makeQuadruple(proto.getSubject(), proto.getRelation(), proto.getObject(), proto.hasConfidence() ? proto.getConfidence() : 1.0)) .collect(Collectors.toList()); } /** * The sentiment of this sentence (e.g., positive / negative). * * @return The {@link SentimentClass} of this sentence, as an enum value. */ public SentimentClass sentiment() { return sentiment(this.defaultProps); } /** * The sentiment of this sentence (e.g., positive / negative). * * @param props The properties to pass to the sentiment classifier. * * @return The {@link SentimentClass} of this sentence, as an enum value. */ public SentimentClass sentiment(Properties props) { document.runSentiment(props); switch (impl.getSentiment().toLowerCase()) { case "very positive": return SentimentClass.VERY_POSITIVE; case "positive": return SentimentClass.POSITIVE; case "negative": return SentimentClass.NEGATIVE; case "very negative": return SentimentClass.VERY_NEGATIVE; case "neutral": return SentimentClass.NEUTRAL; default: throw new IllegalStateException("Unknown sentiment class: " + impl.getSentiment()); } } /** * Get the coreference chain for just this sentence. * Note that this method is actually fairly computationally expensive to call, as it constructs and prunes * the coreference data structure for the entire document. 
* * @return A coreference chain, but only for this sentence */ public Map<Integer, CorefChain> coref() { // Get the raw coref structure Map<Integer, CorefChain> allCorefs = document.coref(); // Delete coreference chains not in this sentence Set<Integer> toDeleteEntirely = new HashSet<>(); for (Map.Entry<Integer, CorefChain> integerCorefChainEntry : allCorefs.entrySet()) { CorefChain chain = integerCorefChainEntry.getValue(); List<CorefChain.CorefMention> mentions = new ArrayList<>(chain.getMentionsInTextualOrder()); mentions.stream().filter(m -> m.sentNum != this.sentenceIndex() + 1).forEach(chain::deleteMention); if (chain.getMentionsInTextualOrder().isEmpty()) { toDeleteEntirely.add(integerCorefChainEntry.getKey()); } } // Clean up dangling empty chains toDeleteEntirely.forEach(allCorefs::remove); // Return return allCorefs; } // // Helpers for CoreNLP interoperability // /** * Returns this sentence as a CoreNLP CoreMap object. * Note that, importantly, only the fields which have already been called will be populated in * the CoreMap! * * Therefore, this method is generally NOT recommended. * * @param functions A list of functions to call before populating the CoreMap. * For example, you can specify mySentence::posTags, and then posTags will * be populated. */ @SuppressWarnings("TypeParameterExplicitlyExtendsObject") @SafeVarargs public final CoreMap asCoreMap(Function<Sentence,Object>... functions) { for (Function<Sentence, Object> function : functions) { function.apply(this); } return this.document.asAnnotation(true).get(CoreAnnotations.SentencesAnnotation.class).get(this.sentenceIndex()); } /** * Returns this sentence as a list of CoreLabels representing the sentence. * Note that, importantly, only the fields which have already been called will be populated in * the CoreMap! * * Therefore, this method is generally NOT recommended. * * @param functions A list of functions to call before populating the CoreMap. 
* For example, you can specify mySentence::posTags, and then posTags will * be populated. */ @SuppressWarnings("TypeParameterExplicitlyExtendsObject") @SafeVarargs public final List<CoreLabel> asCoreLabels(Function<Sentence,Object>... functions) { for (Function<Sentence, Object> function : functions) { function.apply(this); } return asCoreMap().get(CoreAnnotations.TokensAnnotation.class); } // // HELPERS FROM DOCUMENT // /** * A helper to get the raw Protobuf builder for a given token. * Primarily useful for cache checks. * @param i The index of the token to retrieve. * @return A Protobuf builder for that token. */ public CoreNLPProtos.Token.Builder rawToken(int i) { return tokensBuilders.get(i); } /** * Get the backing protocol buffer for this sentence. * @return The raw backing protocol buffer builder for this sentence. */ public CoreNLPProtos.Sentence.Builder rawSentence() { return this.impl; } /** * Update each token in the sentence with the given information. * @param tokens The CoreNLP tokens returned by the {@link edu.stanford.nlp.pipeline.Annotator}. * @param setter The function to set a Protobuf object with the given field. * @param getter The function to get the given field from the {@link CoreLabel}. * @param <E> The type of the given field we are setting in the protocol buffer and reading from the {@link CoreLabel}. */ protected <E> void updateTokens(List<CoreLabel> tokens, Consumer<Pair<CoreNLPProtos.Token.Builder, E>> setter, Function<CoreLabel, E> getter) { synchronized (this.impl) { for (int i = 0; i < tokens.size(); ++i) { E value = getter.apply(tokens.get(i)); if (value != null) { setter.accept(Pair.makePair(tokensBuilders.get(i), value)); } } } } /** * Update the parse tree for this sentence. * @param parse The parse tree to update. * @param binary The binary parse tree to update. 
*/ protected void updateParse( CoreNLPProtos.ParseTree parse, CoreNLPProtos.ParseTree binary) { synchronized (this.impl) { this.impl.setParseTree(parse); if (binary != null) { this.impl.setBinarizedParseTree(binary); } } } /** * Update the dependencies of the sentence. * * @param basic The basic dependencies to update. * @param enhanced The enhanced dependencies to update. * @param enhancedPlusPlus The enhanced plus plus dependencies to update. */ protected void updateDependencies(CoreNLPProtos.DependencyGraph basic, CoreNLPProtos.DependencyGraph enhanced, CoreNLPProtos.DependencyGraph enhancedPlusPlus) { synchronized (this.impl) { this.impl.setBasicDependencies(basic); this.impl.setEnhancedDependencies(enhanced); this.impl.setEnhancedPlusPlusDependencies(enhancedPlusPlus); } } /** * Update the Open IE relation triples for this sentence. * * @param triples The stream of relation triples to add to the sentence. */ protected void updateOpenIE(Stream<CoreNLPProtos.RelationTriple> triples) { synchronized (this.impl) { triples.forEach(this.impl::addOpenieTriple); } } /** * Update the Open IE relation triples for this sentence. * * @param triples The stream of relation triples to add to the sentence. */ protected void updateKBP(Stream<CoreNLPProtos.RelationTriple> triples) { synchronized (this.impl) { triples.forEach(this.impl::addKbpTriple); } } /** * Update the Sentiment class for this sentence. * * @param sentiment The sentiment of the sentence. 
*/ protected void updateSentiment(String sentiment) { synchronized (this.impl) { this.impl.setSentiment(sentiment); } } /** {@inheritDoc} */ @Override public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof Sentence)) return false; Sentence sentence = (Sentence) o; // Short circuit for fast equals check if (impl.hasText() && !impl.getText().equals(sentence.impl.getText())) { return false; } if (this.tokensBuilders.size() != sentence.tokensBuilders.size()) { return false; } // Check the implementation of the sentence if (!impl.build().equals(sentence.impl.build())) { return false; } // Check each token for (int i = 0, sz = tokensBuilders.size(); i < sz; ++i) { if (!tokensBuilders.get(i).build().equals(sentence.tokensBuilders.get(i).build())) { return false; } } return true; } /** {@inheritDoc} */ @Override public int hashCode() { if (this.impl.hasText()) { return this.impl.getText().hashCode() * 31 + this.tokensBuilders.size(); } else { return impl.build().hashCode(); } } /** {@inheritDoc} */ @Override public String toString() { return impl.getText(); } /** * @param start - inclusive * @param end - exclusive * @return - the text for the provided token span. 
*/ public String substring(int start, int end) { StringBuilder sb = new StringBuilder(); for(CoreLabel word : asCoreLabels().subList(start, end)) { sb.append(word.word()); sb.append(word.after()); } return sb.toString(); } private static <E> List<E> lazyList(final List<CoreNLPProtos.Token.Builder> tokens, final Function<CoreNLPProtos.Token.Builder,E> fn) { return new AbstractList<E>() { @Override public E get(int index) { return fn.apply(tokens.get(index)); } @Override public int size() { return tokens.size(); } }; } /** Returns the sentence id of the sentence, if one was found */ public Optional<String> sentenceid() { synchronized (impl) { if (impl.hasSentenceID()) { return Optional.of(impl.getSentenceID()); } else { return Optional.empty(); } } } /** * Apply a TokensRegex pattern to the sentence. * * @param pattern The TokensRegex pattern to match against. * @return the matcher. */ public boolean matches(TokenSequencePattern pattern) { return pattern.getMatcher(asCoreLabels()).matches(); } /** * Apply a TokensRegex pattern to the sentence. * * @param pattern The TokensRegex pattern to match against. * @return True if the tokensregex pattern matches. */ public boolean matches(String pattern) { return matches(TokenSequencePattern.compile(pattern)); } /** * Apply a TokensRegex pattern to the sentence. * * @param pattern The TokensRegex pattern to match against. * @param fn The action to do on each match. * @return the list of matches, after run through the function. */ public <T> List<T> find(TokenSequencePattern pattern, Function<TokenSequenceMatcher, T> fn) { TokenSequenceMatcher matcher = pattern.matcher(asCoreLabels()); List<T> lst = new ArrayList<>(); while(matcher.find()) { lst.add(fn.apply(matcher)); } return lst; } public <T> List<T> find(String pattern, Function<TokenSequenceMatcher, T> fn) { return find(TokenSequencePattern.compile(pattern), fn); } /** * Apply a semgrex pattern to the sentence * @param pattern The Semgrex pattern to match against. 
* @param fn The action to do on each match.
   * @return the list of matches, after run through the function.
   */
  public <T> List<T> semgrex(SemgrexPattern pattern, Function<SemgrexMatcher, T> fn) {
    List<T> results = new ArrayList<>();
    // The pattern is matched against this sentence's default dependency graph.
    for (SemgrexMatcher matcher = pattern.matcher(dependencyGraph()); matcher.findNextMatchingNode(); ) {
      results.add(fn.apply(matcher));
    }
    return results;
  }

  /**
   * Compile the given semgrex pattern and apply it to the sentence.
   * @param pattern The Semgrex pattern to match against.
   * @param fn The action to do on each match.
   * @return the list of matches, after run through the function.
   */
  public <T> List<T> semgrex(String pattern, Function<SemgrexMatcher, T> fn) {
    SemgrexPattern compiled = SemgrexPattern.compile(pattern);
    return semgrex(compiled, fn);
  }

}