package edu.stanford.nlp.simple;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import java.util.*;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* <p>
* A set of common utility algorithms for working with sentences (e.g., finding the head of a span).
* These are not intended to be perfect, or even the canonical version of these algorithms.
* They should only be trusted for prototyping, and more careful attention should be paid in cases
* where the performance of the task is important or the domain is unusual.
* </p>
*
* <p>
* For developers: this class is intended to be where <i>domain independent</i> and
* <i>broadly useful</i> functions on a sentence would go, rather than polluting the {@link Sentence}
* class itself.
* </p>
*
* @author Gabor Angeli
*/
public class SentenceAlgorithms {
/**
 * The underlying {@link Sentence} that every algorithm in this class operates on.
 * Assigned once in the constructor and never replaced.
 */
public final Sentence sentence;
/**
 * Create a new algorithms object, based off of a sentence.
 * The sentence is stored by reference; it is not copied or validated here.
 *
 * @param impl The sentence the algorithms in this object should run over.
 *
 * @see Sentence#algorithms()
 */
public SentenceAlgorithms(Sentence impl) {
this.sentence = impl;
}
/**
 * Returns a collection of keyphrases, defined as relevant noun phrases and verbs in the sentence.
 * Each token of the sentence is consumed at most once.
 *
 * <p>
 * What counts as a keyphrase is in general quite subjective -- this method is just one possible interpretation
 * (in particular, Gabor's interpretation).
 * Please don't rely on this method to produce exactly your interpretation of what a keyphrase is.
 * </p>
 *
 * <p>
 * Implementation note: this is implemented roughly as a finite state automaton over coarse POS tags
 * (the upper-cased first character of the tag) and lemmas. At each token the set of acceptable next
 * tags / lemmas is updated from the token just consumed. A few tags are rewritten before the transition:
 * </p>
 * <ul>
 *   <li>{@code 'B'}: a verb whose lemma is "be" (never starts or extends a span);</li>
 *   <li>{@code 'X'}: a proper noun (tag starts with {@code NNP});</li>
 *   <li>{@code 'Z'}: a possessive marker (tag starts with {@code POS});</li>
 *   <li>{@code 'G'}: a common noun whose surface form ends in "ing";</li>
 *   <li>{@code 'I'}: used internally for the preposition "of".</li>
 * </ul>
 *
 * <p>
 * The token "%" never starts a span, regardless of whether the automaton was idle or has just
 * closed the previous span on that token.
 * </p>
 *
 * @return A list of spans in the sentence, where each one corresponds to a keyphrase. Span ends are
 *         used as exclusive token offsets (see {@link #keyphrases(Function)}).
 *
 * @author Gabor Angeli
 */
public List<Span> keyphraseSpans() {
  // The output
  List<Span> spans = new ArrayList<>();
  // The marker for where the current span began, or -1 if we are not in a span
  int spanBegin = -1;
  // The expected next states
  final Set<Character> expectNextTag = new HashSet<>();
  final Set<String> expectNextLemma = new HashSet<>();
  // A special marker for when we look ahead and only accept the last word if
  // the word after it is ok (e.g., "of" or "'s" must be followed by something valid).
  boolean inLookahead = false;

  // The transition matrix, over coarse POS tags: given the tag just consumed, reset what may follow.
  Consumer<Character> updateExpectation = coarseTag -> {
    expectNextTag.clear();
    expectNextLemma.clear();
    switch (coarseTag) {
      case 'N':
        expectNextTag.add('N');
        expectNextTag.add('X');
        expectNextLemma.add("of");
        expectNextLemma.add("'s");
        break;
      case 'G':
        expectNextTag.add('N');  // 'water freezing' is fishy, but 'freezing water' is ok.
        break;
      case 'X':
        expectNextTag.add('X');
        break;
      case 'J':
      case 'I':
        expectNextTag.add('N');
        expectNextTag.add('X');
        expectNextTag.add('J');
        break;
      case 'V':
        expectNextTag.add('V');
        break;
      case 'Z':
        expectNextTag.add('J');
        expectNextTag.add('N');
        break;
      default:
        throw new IllegalStateException("Cannot update expected next token for POS tag: " + coarseTag);
    }
  };

  // Run the FSA:
  for (int i = 0; i < sentence.length(); ++i) {
    // Get some variables
    String tag = sentence.posTag(i);
    char coarseTag = Character.toUpperCase(tag.charAt(0));
    String lemma = sentence.lemma(i).toLowerCase();
    // Tweak the tag
    if (coarseTag == 'V' && lemma.equals("be")) {
      coarseTag = 'B';
    } else if (tag.startsWith("NNP")) {
      coarseTag = 'X';
    } else if (tag.startsWith("POS")) {
      coarseTag = 'Z';
    }
    // (don't collapse 'ing' nouns)
    if (coarseTag == 'N' && sentence.word(i).endsWith("ing")) {
      coarseTag = 'G';
    }
    // The same start condition is used for a fresh span and for a span restarted on this token.
    boolean validStart = canStartKeyphrase(coarseTag, sentence.word(i));

    // Transition
    if (spanBegin < 0) {
      if (validStart) {
        // Case: we were not in a span, but we hit a valid start tag.
        spanBegin = i;
        updateExpectation.accept(coarseTag);
        inLookahead = false;
      }
    } else if (expectNextTag.contains(coarseTag)) {
      // Case: we're in a span and hit a valid expected POS tag; update the transition matrix.
      updateExpectation.accept(coarseTag);
      inLookahead = false;
    } else if (expectNextLemma.contains(lemma)) {
      // Case: we're in a span and hit a valid word. Do something special.
      switch (lemma) {
        case "of":
          // These prepositions are valid to subsume into a noun phrase.
          // Update the transition matrix, and mark this as conditionally ok.
          updateExpectation.accept('I');
          inLookahead = true;
          break;
        case "'s":
          // Possessives often denote a longer compound phrase
          updateExpectation.accept('Z');
          inLookahead = true;
          break;
        default:
          throw new IllegalStateException("Unknown special lemma: " + lemma);
      }
    } else {
      // Case: We have transitioned to an 'invalid' state, and therefore the span should end.
      // If the previous token was only conditionally accepted (lookahead), drop it from the span.
      spans.add(Span.fromValues(spanBegin, inLookahead ? i - 1 : i));
      // The token that ended the span may itself start a new one.
      if (validStart) {
        spanBegin = i;
        updateExpectation.accept(coarseTag);
      } else {
        spanBegin = -1;
      }
      inLookahead = false;
    }
  }

  // Add a potential last span (again dropping a trailing lookahead token)
  if (spanBegin >= 0) {
    spans.add(Span.fromValues(spanBegin, inLookahead ? sentence.length() - 1 : sentence.length()));
  }
  return spans;
}

/**
 * Decide whether a token may open a new keyphrase span.
 *
 * @param coarseTag The (tweaked) coarse POS tag of the token.
 * @param word The surface form of the token.
 * @return True if the tag is one of N, V, J, X or G and the word is not a percent sign.
 */
private static boolean canStartKeyphrase(char coarseTag, String word) {
  if ("%".equals(word)) {
    return false;
  }
  return coarseTag == 'N' || coarseTag == 'V' || coarseTag == 'J' || coarseTag == 'X' || coarseTag == 'G';
}
/**
 * Get the keyphrases of the sentence as a list of Strings.
 * Each keyphrase is the space-joined sub-list of the selected token strings covered by one
 * span from {@link #keyphraseSpans()}.
 *
 * @param toString The function to use to convert a span to a string. The canonical case is Sentence::words.
 *                 It is applied to the sentence at most once per call, and not at all if there are no keyphrases.
 * @return A list of keyphrases, as Strings.
 *
 * @see edu.stanford.nlp.simple.SentenceAlgorithms#keyphraseSpans()
 */
public List<String> keyphrases(Function<Sentence, List<String>> toString) {
  List<Span> spans = keyphraseSpans();
  List<String> phrases = new ArrayList<>(spans.size());
  if (spans.isEmpty()) {
    return phrases;
  }
  // The token strings do not depend on the span, so compute them once rather than once per span.
  List<String> tokens = toString.apply(sentence);
  for (Span span : spans) {
    phrases.add(StringUtils.join(tokens.subList(span.start(), span.end()), " "));
  }
  return phrases;
}
/**
 * Convenience overload of {@link #keyphrases(Function)} that renders each keyphrase
 * from the surface words of the sentence.
 *
 * @return The keyphrases of the sentence, one String per keyphrase span.
 *
 * @see edu.stanford.nlp.simple.SentenceAlgorithms#keyphraseSpans()
 */
public List<String> keyphrases() {
  final Function<Sentence, List<String>> surfaceWords = Sentence::words;
  return keyphrases(surfaceWords);
}
/**
 * Find the head word of a token span by climbing the dependency parse.
 *
 * <p>
 * The search starts from the right-most token in the span that has a governor, and follows
 * governor links for as long as the governor is itself inside the span. If no token in the span
 * has a governor, the right-most token of the span is returned. A revisited token (a cycle in the
 * governor links) ends the climb and is returned.
 * </p>
 *
 * @param tokenSpan The span of tokens we are finding the head of.
 * @return The head index of the given span of tokens.
 * @throws IllegalArgumentException If the span is empty, or does not fit inside the sentence.
 */
public int headOfSpan(Span tokenSpan) {
  // Validate the argument
  if (tokenSpan.size() == 0) {
    throw new IllegalArgumentException("Cannot find head word of empty span!");
  }
  final List<Optional<Integer>> governors = sentence.governors();
  if (tokenSpan.start() >= governors.size() || tokenSpan.end() > governors.size()) {
    throw new IllegalArgumentException("Span is out of range: " + tokenSpan + "; sentence: " + sentence);
  }

  // Scan leftwards from the end of the span for the first token that has a governor
  final int rightmost = tokenSpan.end() - 1;
  int cursor = rightmost;
  Optional<Integer> parent = governors.get(cursor);
  while (!parent.isPresent()) {
    cursor -= 1;
    if (cursor < tokenSpan.start()) {
      // Nothing in this span has a governor: fall back to the right-most token.
      return rightmost;
    }
    parent = governors.get(cursor);
  }

  // Climb the tree while the governor stays inside the span
  int head = cursor;
  final Set<Integer> visited = new HashSet<>();
  while (parent.isPresent()) {
    final int parentIndex = parent.get();
    if (parentIndex < tokenSpan.start() || parentIndex >= tokenSpan.end()) {
      break;
    }
    head = parentIndex;
    if (!visited.add(head)) {
      // Already been here: the governor links loop, so stop.
      return head;
    }
    parent = governors.get(head);
  }
  return head;
}
/**
 * Return all the contiguous spans of a sentence, longest spans first and, within one length,
 * from left to right. So, for example, a sentence "a b c" would return:
 * [a b c], [a b], [b c], [a], [b], [c].
 *
 * <p>
 * Every call to {@code iterator()} on the returned iterable starts a fresh enumeration. Each span
 * is a {@link List#subList(int, int) subList} view of the list produced by the selector.
 * </p>
 *
 * @param selector The function to apply to the sentence to get one element per token.
 *                 For example, {@link Sentence#words}: <code>allSpans(Sentence::words)</code>.
 * @param maxLength The maximum length of the spans to extract; values larger than the sentence
 *                  length are clipped to it, and values of zero or less yield no spans.
 *                  The default to extract all spans is to set this to <code>sentence.length()</code>.
 * @param <E> The type of the element we are getting.
 *
 * @return A streaming iterable of spans for this sentence. Its iterators throw
 *         {@link NoSuchElementException} from {@code next()} once exhausted.
 */
public <E> Iterable<List<E>> allSpans(Function<Sentence, List<E>> selector, int maxLength) {
  return () -> new Iterator<List<E>>() {
    // Length of the spans currently being emitted; counts down to 0.
    private int length = Math.min(maxLength, sentence.length());
    // Start offset of the next span of the current length.
    private int start = 0;

    @Override
    public boolean hasNext() {
      return length > 0;
    }

    @Override
    public List<E> next() {
      // Honor the Iterator contract instead of handing out bogus empty spans.
      if (!hasNext()) {
        throw new NoSuchElementException("No more spans in sentence");
      }
      // Get the term
      List<E> rtn = selector.apply(sentence).subList(start, start + length);
      // Update the state: slide right, and drop to the next shorter length when we run off the end
      start += 1;
      if (start + length > sentence.length()) {
        length -= 1;
        start = 0;
      }
      // Return
      return rtn;
    }
  };
}
/**
 * Enumerate every span of the sentence, with no cap on the span length other than the
 * sentence length itself.
 *
 * @param selector The function producing one element per token of the sentence.
 * @param <E> The type of the element we are getting.
 * @return A streaming iterable of spans for this sentence.
 *
 * @see SentenceAlgorithms#allSpans(Function, int)
 */
public <E> Iterable<List<E>> allSpans(Function<Sentence, List<E>> selector) {
  final int unbounded = sentence.length();
  return allSpans(selector, unbounded);
}
/**
 * Enumerate every span of the sentence as lists of surface words, with no cap on the
 * span length other than the sentence length itself.
 *
 * @return A streaming iterable of word spans for this sentence.
 *
 * @see SentenceAlgorithms#allSpans(Function, int)
 */
public Iterable<List<String>> allSpans() {
  final Function<Sentence, List<String>> surfaceWords = Sentence::words;
  final int unbounded = sentence.length();
  return allSpans(surfaceWords, unbounded);
}
/**
 * Select the most common element of the given type in the given span.
 * This is useful for, e.g., finding the most likely NER span of a given span, or the most
 * likely POS tag of a given span.
 * Null entries are ignored.
 *
 * @param span The span of the sentence to find the mode element in. This must be entirely contained in the sentence.
 * @param selector The property of the sentence we are getting the mode of. For example, <code>Sentence::posTags</code>.
 *                 It is applied to the sentence once per call.
 * @param <E> The type of the element we are getting.
 * @return The most common element of the given property in the span, as chosen by
 *         {@link Counters#argmax(Counter)}.
 * @throws IllegalArgumentException If the span is not entirely contained in the sentence.
 */
public <E> E modeInSpan(Span span, Function<Sentence, List<E>> selector) {
  if (!Span.fromValues(0, sentence.length()).contains(span)) {
    throw new IllegalArgumentException("Span must be entirely contained in the sentence: " + span + " (sentence length=" + sentence.length() + ")");
  }
  // The selected property does not change between tokens, so fetch it once instead of per token.
  List<E> values = selector.apply(sentence);
  Counter<E> candidates = new ClassicCounter<>();
  for (int i : span) {
    E value = values.get(i);
    if (value != null) {
      candidates.incrementCount(value);
    }
  }
  return Counters.argmax(candidates);
}
/**
 * Run a breadth-first search over the dependency graph, treating every edge as undirected,
 * and return a shortest path (in number of edges) between two vertices.
 * This is the fallback used when the governor links of the sentence contain a cycle.
 *
 * <p>
 * Vertices are marked as visited when they are first discovered, so each backpointer is written
 * exactly once and the search stops as soon as the end vertex is discovered.
 * </p>
 *
 * @param start The start index, 0-indexed.
 * @param end The end index, 0-indexed.
 * @param selector The selector to use for the word nodes. If empty, only the arcs are returned.
 *
 * @return A path string, analogous to {@link #dependencyPathBetween(int, int)}: arcs are rendered as
 *         {@code <-rel-} when moving from a dependent up to its governor and {@code -rel->} when moving
 *         from a governor down to a dependent. An empty list is returned if either endpoint is not a
 *         vertex of the dependency graph, or if the end cannot be reached from the start.
 */
protected List<String> loopyDependencyPathBetween(int start, int end, Optional<Function<Sentence, List<String>>> selector) {
  // Index the graph vertices by 0-based token offset
  SemanticGraph graph = this.sentence.dependencyGraph();
  IndexedWord[] indexedWords = new IndexedWord[this.sentence.length()];
  for (IndexedWord vertex : graph.vertexSet()) {
    indexedWords[vertex.index() - 1] = vertex;
  }
  if (indexedWords[start] == null || indexedWords[end] == null) {
    // One of the endpoints is not part of the graph, so no path can exist.
    return Collections.emptyList();
  }

  // Set up the search
  BitSet seen = new BitSet(indexedWords.length);
  int[] backpointers = new int[indexedWords.length];
  Arrays.fill(backpointers, -1);
  Queue<IndexedWord> fringe = new ArrayDeque<>();
  fringe.add(indexedWords[start]);
  seen.set(start);

  // Run the search
  boolean reached = (start == end);
  while (!reached && !fringe.isEmpty()) {
    IndexedWord vertex = fringe.poll();
    int vertexIndex = vertex.index() - 1;
    for (SemanticGraphEdge inEdge : graph.incomingEdgeIterable(vertex)) {
      IndexedWord governor = inEdge.getGovernor();
      int govIndex = governor.index() - 1;
      if (seen.get(govIndex)) {
        continue;
      }
      seen.set(govIndex);
      backpointers[govIndex] = vertexIndex;
      if (govIndex == end) {
        reached = true;
        break;
      }
      fringe.add(governor);
    }
    if (reached) {
      break;
    }
    for (SemanticGraphEdge outEdge : graph.outgoingEdgeIterable(vertex)) {
      IndexedWord dependent = outEdge.getDependent();
      int depIndex = dependent.index() - 1;
      if (seen.get(depIndex)) {
        continue;
      }
      seen.set(depIndex);
      backpointers[depIndex] = vertexIndex;
      if (depIndex == end) {
        reached = true;
        break;
      }
      fringe.add(dependent);
    }
  }
  if (!reached) {
    // The two vertices are in disconnected parts of the graph.
    return Collections.emptyList();
  }

  // Infer the path, walking the backpointers from the end to the start (reversed at the very end)
  List<String> path = new ArrayList<>();
  Optional<List<String>> words = selector.map(x -> x.apply(sentence));
  int vertex = end;
  while (vertex != start) {
    // 1. Add the word
    if (words.isPresent()) {
      path.add(words.get().get(vertex));
    }
    // 2. Find the edge to the previous vertex on the path; emit exactly one arc per step
    int previous = backpointers[vertex];
    String arc = null;
    for (SemanticGraphEdge inEdge : graph.incomingEdgeIterable(indexedWords[vertex])) {
      if (inEdge.getGovernor().index() - 1 == previous) {
        arc = "-" + inEdge.getRelation().toString() + "->";
        break;
      }
    }
    if (arc == null) {
      for (SemanticGraphEdge outEdge : graph.outgoingEdgeIterable(indexedWords[vertex])) {
        if (outEdge.getDependent().index() - 1 == previous) {
          arc = "<-" + outEdge.getRelation().toString() + "-";
          break;
        }
      }
    }
    if (arc != null) {
      path.add(arc);
    }
    // 3. Update the node
    vertex = previous;
  }
  words.ifPresent(strings -> path.add(strings.get(start)));
  Collections.reverse(path);
  return path;
}
/**
 * Find the dependency path between two words in a sentence.
 *
 * <p>
 * The path climbs from the start word to the deepest ancestor shared with the end word, and then
 * descends to the end word. Upward arcs are rendered as {@code <-label-} and downward arcs as
 * {@code -label->}; a token without an incoming dependency label is rendered with the label "dep".
 * If the governor links contain a cycle, the computation is delegated to
 * {@link #loopyDependencyPathBetween(int, int, Optional)}.
 * </p>
 *
 * <p>
 * If the only ancestor the two words share is the artificial root (governor index -1, i.e. the
 * words hang off different roots) and a selector is given, the root is rendered as "ROOT", since
 * the selected word list has no entry for it.
 * </p>
 *
 * @param start The start word, 0-indexed.
 * @param end The end word, 0-indexed.
 * @param selector The selector for the strings between the path, if any. If left empty, these will be omitted from the list.
 *
 * @return A list encoding the dependency path between the vertices, suitable for inclusion as features.
 *         The list is empty if the two words share no ancestor at all.
 */
public List<String> dependencyPathBetween(int start, int end, Optional<Function<Sentence, List<String>>> selector) {
  // Get paths from the root of the sentence to each node
  List<Optional<Integer>> governors = sentence.governors();
  Optional<List<Integer>> startChain = chainFromRoot(start, governors);
  if (!startChain.isPresent()) {
    // Found loopiness -- revert to BFS
    return loopyDependencyPathBetween(start, end, selector);
  }
  Optional<List<Integer>> endChain = chainFromRoot(end, governors);
  if (!endChain.isPresent()) {
    // Found loopiness -- revert to BFS
    return loopyDependencyPathBetween(start, end, selector);
  }
  List<Integer> rootToStart = startChain.get();
  List<Integer> rootToEnd = endChain.get();

  // Get least common node: the last position of the longest common prefix of the two chains.
  // (Two chains that agree at some depth necessarily agree at every shallower depth as well.)
  int leastCommonNodeIndex = -1;
  int limit = Math.min(rootToStart.size(), rootToEnd.size());
  for (int i = 0; i < limit && rootToStart.get(i).equals(rootToEnd.get(i)); ++i) {
    leastCommonNodeIndex = i;
  }

  // Construct the path
  if (leastCommonNodeIndex < 0) {
    return Collections.emptyList();
  }
  List<String> path = new ArrayList<>();
  Optional<List<String>> words = selector.map(x -> x.apply(sentence));
  // (climb from the start word to just below the common node)
  for (int i = rootToStart.size() - 1; i > leastCommonNodeIndex; --i) {
    int node = rootToStart.get(i);
    if (words.isPresent()) {
      path.add(words.get().get(node));
    }
    path.add("<-" + sentence.incomingDependencyLabel(node).orElse("dep") + "-");
  }
  // (the common node itself)
  if (words.isPresent()) {
    int commonNode = rootToStart.get(leastCommonNodeIndex);
    // The artificial root has no token, so it cannot be looked up in the word list.
    path.add(commonNode < 0 ? "ROOT" : words.get().get(commonNode));
  }
  // (descend from just below the common node to the end word)
  for (int i = leastCommonNodeIndex + 1; i < rootToEnd.size(); ++i) {
    int node = rootToEnd.get(i);
    path.add("-" + sentence.incomingDependencyLabel(node).orElse("dep") + "->");
    if (words.isPresent()) {
      path.add(words.get().get(node));
    }
  }
  return path;
}

/**
 * Collect the chain of governors of a token, ordered from the top-most ancestor down to the token itself.
 * The walk stops at a token without a governor (which is itself not included in the chain), or at the
 * artificial root -1 (which is included as the first element).
 *
 * @param node The 0-indexed token to start from.
 * @param governors The governor of each token of the sentence, if any.
 * @return The chain of ancestors, or an empty Optional if the governor links form a cycle.
 */
private static Optional<List<Integer>> chainFromRoot(int node, List<Optional<Integer>> governors) {
  List<Integer> chain = new ArrayList<>();
  Set<Integer> visited = new HashSet<>();
  int ancestor = node;
  while (ancestor >= 0 && governors.get(ancestor).isPresent()) {
    if (!visited.add(ancestor)) {
      return Optional.empty();
    }
    chain.add(ancestor);
    ancestor = governors.get(ancestor).get();
  }
  if (ancestor == -1) {
    chain.add(-1);
  }
  // Collected bottom-up; callers want it top-down.
  Collections.reverse(chain);
  return Optional.of(chain);
}
/**
 * Find the dependency path between two words in a sentence, interleaving the surface words
 * of the tokens on the path with the dependency arcs.
 *
 * @param start The start word, 0-indexed.
 * @param end The end word, 0-indexed.
 * @return A list encoding the dependency path between the two words.
 *
 * @see #dependencyPathBetween(int, int, Optional)
 */
public List<String> dependencyPathBetween(int start, int end) {
  final Function<Sentence, List<String>> surfaceWords = Sentence::words;
  return dependencyPathBetween(start, end, Optional.of(surfaceWords));
}
/**
 * A funky little helper method that treats the word and lemma of every token as HTML-escaped text
 * and replaces them with their unescaped form.
 * Note that this is <b>in place</b>: both the raw protobuf tokens of the sentence and the
 * corresponding {@link CoreLabel}s in the document's annotation are overwritten.
 */
public void unescapeHTML() {
  // Pass 1: rewrite the protobuf tokens
  final int tokenCount = sentence.length();
  for (int tokenIndex = 0; tokenIndex < tokenCount; tokenIndex++) {
    CoreNLPProtos.Token.Builder rawToken = sentence.rawToken(tokenIndex);
    String plainWord = StringUtils.unescapeHtml3(rawToken.getWord());
    rawToken.setWord(plainWord);
    String plainLemma = StringUtils.unescapeHtml3(rawToken.getLemma());
    rawToken.setLemma(plainLemma);
  }
  // Pass 2: rewrite the tokens of this sentence inside the document annotation
  CoreMap sentenceMap = sentence.document.asAnnotation().get(CoreAnnotations.SentencesAnnotation.class).get(sentence.sentenceIndex());
  for (CoreLabel label : sentenceMap.get(CoreAnnotations.TokensAnnotation.class)) {
    String plainWord = StringUtils.unescapeHtml3(label.word());
    label.setWord(plainWord);
    String plainLemma = StringUtils.unescapeHtml3(label.lemma());
    label.setLemma(plainLemma);
  }
}
}