package edu.stanford.nlp.naturalli;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.PriorityQueue;
import java.util.*;
import java.util.stream.Collectors;
/**
* This class takes a {@link edu.stanford.nlp.naturalli.SentenceFragment} and converts it to a conventional
* OpenIE triple, as materialized in the {@link RelationTriple} class.
*
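* <p>
* A minimal usage sketch (illustrative only; the {@code parse} and {@code tokens} variables are
* assumed to be supplied by the caller, e.g. from an annotated sentence):
* </p>
* <pre>{@code
* RelationTripleSegmenter segmenter = new RelationTripleSegmenter();
* // Nominal extractions from the token and dependency patterns:
* List<RelationTriple> nominals = segmenter.extract(parse, tokens);
* // Segment an (already clause-like) dependency graph into a single triple, if possible:
* Optional<RelationTriple> triple = segmenter.segment(parse, Optional.empty());
* }</pre>
*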
* @author Gabor Angeli
*/
@SuppressWarnings("WeakerAccess")
public class RelationTripleSegmenter {
private final boolean allowNominalsWithoutNER;
/** A list of patterns to match relation extractions against */
public final List<SemgrexPattern> VERB_PATTERNS = Collections.unmodifiableList(new ArrayList<SemgrexPattern>() {{
// { blue cats play [quietly] with yarn,
// Jill blew kisses at Jack,
// cats are standing next to dogs }
add(SemgrexPattern.compile("{$}=verb ?>/cop|aux(pass)?/ {}=be >/.subj(pass)?/ {}=subject >/(nmod|acl|advcl):.*/=prepEdge ( {}=object ?>appos {} = appos ?>case {}=prep) ?>dobj {pos:/N.*/}=relObj"));
// { cats are cute,
// horses are grazing peacefully }
add(SemgrexPattern.compile("{$}=object >/.subj(pass)?/ {}=subject >/cop|aux(pass)?/ {}=verb ?>case {}=prep"));
// { fish like to swim }
add(SemgrexPattern.compile("{$}=verb >/.subj(pass)?/ {}=subject >xcomp ( {}=object ?>appos {}=appos )"));
// { cats have tails }
add(SemgrexPattern.compile("{$}=verb ?>/aux(pass)?/ {}=be >/.subj(pass)?/ {}=subject >/[di]obj|xcomp/ ( {}=object ?>appos {}=appos )"));
// { Tom and Jerry were fighting }
add(SemgrexPattern.compile("{$}=verb >/nsubj(pass)?/ ( {}=subject >/conj:and/=subjIgnored {}=object )"));
// { mass of iron is 55amu }
add(SemgrexPattern.compile("{pos:/NNS?/}=object >cop {}=relappend1 >/nsubj(pass)?/ ( {}=verb >/nmod:of/ ( {pos:/NNS?/}=subject >case {}=relappend0 ) )"));
}});
/**
* <p>
* A set of derivative patterns from {@link RelationTripleSegmenter#VERB_PATTERNS} that ignore the subject
* arc. This is useful primarily for creating a training set for the clause splitter which emulates the
* behavior of the relation triple segmenter component.
* </p>
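* <p>
* For example, the first verb pattern is (roughly) rewritten by dropping its
* {@code >/.subj(pass)?/ {}=subject} clause and replacing the {@code $} root restriction with
* {@code pos:/V.*/}, so that it matches a bare verb phrase rather than a full clause.
* </p>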
*/
public final List<SemgrexPattern> VP_PATTERNS = Collections.unmodifiableList(new ArrayList<SemgrexPattern>() {{
for (SemgrexPattern pattern : VERB_PATTERNS) {
String fullPattern = pattern.pattern();
String vpPattern = fullPattern
.replace(">/.subj(pass)?/ {}=subject", "") // drop the subject
.replace("$", "pos:/V.*/"); // but, force the root to be on a verb
add(SemgrexPattern.compile(vpPattern));
}
}});
/**
* A set of nominal patterns that don't require being in a coherent clause, but do require NER information.
*/
public final List<TokenSequencePattern> NOUN_TOKEN_PATTERNS = Collections.unmodifiableList(new ArrayList<TokenSequencePattern>() {{
// { NER nominal_verb NER,
// United States president Obama }
add(TokenSequencePattern.compile("(?$object [ner:/PERSON|ORGANIZATION|LOCATION+/]+ ) (?$beof_comp [ {tag:/NN.*/} & !{ner:/PERSON|ORGANIZATION|LOCATION/} ]+ ) (?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ )"));
// { NER 's nominal_verb NER,
// America 's president , Obama }
add(TokenSequencePattern.compile("(?$object [ner:/PERSON|ORGANIZATION|LOCATION+/]+ ) /'s/ (?$beof_comp [ {tag:/NN.*/} & !{ner:/PERSON|ORGANIZATION|LOCATION/} ]+ ) /,/? (?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ )"));
// { NER , NER ,,
// Obama, 28, ...,
// Obama (28) ...}
add(TokenSequencePattern.compile("(?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ ) /,/ (?$object [ner:/NUMBER|DURATION|PERSON|ORGANIZATION/]+ ) /,/"));
add(TokenSequencePattern.compile("(?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ ) /\\(/ (?$object [ner:/NUMBER|DURATION|PERSON|ORGANIZATION/]+ ) /\\)/"));
}});
/**
* A set of nominal patterns using dependencies that don't require being in a coherent clause, but do require NER information.
*/
private final List<SemgrexPattern> NOUN_DEPENDENCY_PATTERNS;
/**
* Create a new relation triple segmenter.
*
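* <p>
* A small construction sketch (illustrative only):
* </p>
* <pre>{@code
* // Nominal extractions gated on NER tags (the default used by the no-argument constructor):
* RelationTripleSegmenter strict = new RelationTripleSegmenter(false);
* // Also allow nominal extractions that are not licensed by NER tags:
* RelationTripleSegmenter permissive = new RelationTripleSegmenter(true);
* }</pre>
*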
* @param allowNominalsWithoutNER If true, extract all nominal relations and not just those which are warranted based on
* named entity tags. For most practical applications, this greatly over-produces trivial triples.
*/
public RelationTripleSegmenter(boolean allowNominalsWithoutNER) {
this.allowNominalsWithoutNER = allowNominalsWithoutNER;
NOUN_DEPENDENCY_PATTERNS = Collections.unmodifiableList(new ArrayList<SemgrexPattern>() {{
// { Durin, son of Thorin }
add(SemgrexPattern.compile("{tag:/N.*/}=subject >appos ( {}=relation >/nmod:.*/=relaux {}=object)"));
// { Thorin's son, Durin }
add(SemgrexPattern.compile("{}=relation >/nmod:.*/=relaux {}=subject >appos {}=object"));
// { Stanford's Chris Manning }
add(SemgrexPattern.compile("{tag:/N.*/}=object >/nmod:poss/=relaux ( {}=subject >case {} )"));
// { Chris Manning of Stanford,
// [There are] cats with tails }
if (allowNominalsWithoutNER) {
add(SemgrexPattern.compile("{tag:/N.*/}=subject >/nmod:(?!poss).*/=relaux {}=object"));
} else {
add(SemgrexPattern.compile("{ner:/PERSON|ORGANIZATION|LOCATION/}=subject >/nmod:(?!poss).*/=relaux {ner:/..+/}=object"));
add(SemgrexPattern.compile("{tag:/N.*/}=subject >/nmod:(in|with)/=relaux {}=object"));
}
// { President Obama }
if (allowNominalsWithoutNER) {
add(SemgrexPattern.compile("{tag:/N.*/}=subject >/amod/=arc {}=object"));
} else {
add(SemgrexPattern.compile("{ner:/PERSON|ORGANIZATION|LOCATION/}=subject >/amod|compound/=arc {ner:/..+/}=object"));
}
}});
}
/**
* @see RelationTripleSegmenter#RelationTripleSegmenter(boolean)
*/
@SuppressWarnings("UnusedDeclaration")
public RelationTripleSegmenter() {
this(false);
}
/**
* Extract the nominal patterns from this sentence.
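* For example, the first token pattern matches a phrase like "United States president Obama",
* taking "Obama" as the subject, "United States" as the object, and "president" as the relation
* (flagged as missing a leading "be" and a trailing "of").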
*
* @see RelationTripleSegmenter#NOUN_TOKEN_PATTERNS
* @see RelationTripleSegmenter#NOUN_DEPENDENCY_PATTERNS
*
* @param parse The parse tree of the sentence to annotate.
* @param tokens The tokens of the sentence to annotate.
* @return A list of {@link RelationTriple}s. Note that these do not have an associated tree with them.
*/
@SuppressWarnings("unchecked")
public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens) {
List<RelationTriple> extractions = new ArrayList<>();
Set<Triple<Span,String,Span>> alreadyExtracted = new HashSet<>();
//
// Run Token Patterns
//
for (TokenSequencePattern tokenPattern : NOUN_TOKEN_PATTERNS) {
TokenSequenceMatcher tokenMatcher = tokenPattern.matcher(tokens);
while (tokenMatcher.find()) {
boolean missingPrefixBe;
boolean missingSuffixOf = false;
// Create subject
List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
Span subjectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) subject.get(0)).index() - 1, ((CoreLabel) subject.get(subject.size() - 1)).index()));
List<CoreLabel> subjectTokens = new ArrayList<>();
for (int i : subjectSpan) {
subjectTokens.add(tokens.get(i));
}
// Create object
List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
Span objectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) object.get(0)).index() - 1, ((CoreLabel) object.get(object.size() - 1)).index()));
if (Span.overlaps(subjectSpan, objectSpan)) {
continue;
}
List<CoreLabel> objectTokens = new ArrayList<>();
for (int i : objectSpan) {
objectTokens.add(tokens.get(i));
}
// Create relation
if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
List<CoreLabel> relationTokens = new ArrayList<>();
// (add the 'be')
missingPrefixBe = true;
// (add a complement to the 'be')
List<? extends CoreMap> beofComp = tokenMatcher.groupNodes("$beof_comp");
if (beofComp != null) {
// (add the complement)
for (CoreMap token : beofComp) {
if (token instanceof CoreLabel) {
relationTokens.add((CoreLabel) token);
} else {
relationTokens.add(new CoreLabel(token));
}
}
// (add the 'of')
missingSuffixOf = true;
}
// Add extraction
String relationGloss = StringUtils.join(relationTokens.stream().map(CoreLabel::word), " ");
if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
RelationTriple extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens);
//noinspection ConstantConditions
extraction.isPrefixBe(missingPrefixBe);
extraction.isSuffixOf(missingSuffixOf);
extractions.add(extraction);
alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
}
}
}
//
// Run Semgrex Matches
//
for (SemgrexPattern semgrex : NOUN_DEPENDENCY_PATTERNS) {
SemgrexMatcher matcher = semgrex.matcher(parse);
while (matcher.find()) {
boolean missingPrefixBe = false;
boolean missingSuffixBe = false;
boolean istmod = false;
// Get relaux if applicable
String relaux = matcher.getRelnString("relaux");
String ignoredArc = relaux;
if (ignoredArc == null) {
ignoredArc = matcher.getRelnString("arc");
}
// Create subject
IndexedWord subject = matcher.getNode("subject");
List<IndexedWord> subjectTokens = new ArrayList<>();
Span subjectSpan;
if (subject.ner() != null && !"O".equals(subject.ner())) {
subjectSpan = Util.extractNER(tokens, Span.fromValues(subject.index() - 1, subject.index()));
for (int i : subjectSpan) {
subjectTokens.add(new IndexedWord(tokens.get(i)));
}
} else {
subjectTokens = getValidChunk(parse, subject, VALID_SUBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(subject));
subjectSpan = Util.tokensToSpan(subjectTokens);
}
// Create object
IndexedWord object = matcher.getNode("object");
List<IndexedWord> objectTokens = new ArrayList<>();
Span objectSpan;
if (object.ner() != null && !"O".equals(object.ner())) {
objectSpan = Util.extractNER(tokens, Span.fromValues(object.index() - 1, object.index()));
for (int i : objectSpan) {
objectTokens.add(new IndexedWord(tokens.get(i)));
}
} else {
objectTokens = getValidChunk(parse, object, VALID_OBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(object));
objectSpan = Util.tokensToSpan(objectTokens);
}
// Check that the pair is valid
if (Span.overlaps(subjectSpan, objectSpan)) {
continue; // We extracted an identity
}
if (subjectSpan.end() == objectSpan.start() - 1 &&
(tokens.get(subjectSpan.end()).word().matches("[\\.,:;\\('\"]") ||
"CC".equals(tokens.get(subjectSpan.end()).tag()))) {
continue; // We're straddling a clause
}
if (objectSpan.end() == subjectSpan.start() - 1 &&
(tokens.get(objectSpan.end()).word().matches("[\\.,:;\\('\"]") ||
"CC".equals(tokens.get(objectSpan.end()).tag()))) {
continue; // We're straddling a clause
}
// Get any prepositional edges
String expected = relaux == null ? "" : relaux.substring(relaux.indexOf(":") + 1).replace("_", " ");
IndexedWord prepWord = null;
// (these usually come from the object)
boolean prepositionIsPrefix = false;
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(object)) {
if (edge.getRelation().toString().equals("case")) {
prepWord = edge.getDependent();
}
}
// (...but sometimes from the subject)
if (prepWord == null) {
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(subject)) {
if (edge.getRelation().toString().equals("case")) {
prepositionIsPrefix = true;
prepWord = edge.getDependent();
}
}
}
List<IndexedWord> prepChunk = Collections.emptyList();
if (prepWord != null && !expected.equals("tmod")) {
Optional<List<IndexedWord>> optionalPrepChunk = getValidChunk(parse, prepWord, Collections.singleton("mwe"), Optional.empty(), true);
if (!optionalPrepChunk.isPresent()) { continue; }
prepChunk = optionalPrepChunk.get();
Collections.sort(prepChunk, (a, b) -> {
double val = a.pseudoPosition() - b.pseudoPosition();
if (val < 0) { return -1; }
if (val > 0) { return 1; }
else { return 0; }
}); // ascending sort
}
// Get the relation
if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
LinkedList<IndexedWord> relationTokens = new LinkedList<>();
IndexedWord relNode = matcher.getNode("relation");
if (relNode != null) {
// Case: we have a grounded relation span
// (add the relation)
relationTokens.add(relNode);
// (add any prepositional case markings)
if (prepositionIsPrefix) {
missingSuffixBe = true; // We're almost certainly missing a suffix 'be'
for (int i = prepChunk.size() - 1; i >=0; --i) { relationTokens.addFirst(prepChunk.get(i)); }
} else {
relationTokens.addAll(prepChunk);
}
if (expected.equalsIgnoreCase("tmod")) {
istmod = true;
}
} else {
// Case: we have a hallucinated relation span
// (mark it as missing a preceding 'be')
if (!expected.equals("poss")) {
missingPrefixBe = true;
}
// (add any prepositional case markings)
if (prepositionIsPrefix) {
for (int i = prepChunk.size() - 1; i >=0; --i) { relationTokens.addFirst(prepChunk.get(i)); }
} else {
relationTokens.addAll(prepChunk);
}
if (expected.equalsIgnoreCase("tmod")) {
istmod = true;
}
// (some fine-tuning)
if (allowNominalsWithoutNER && "of".equals(expected)) {
continue; // prohibit things like "conductor of electricity" -> "conductor; be of; electricity"
}
}
// Add extraction
String relationGloss = StringUtils.join(relationTokens.stream().map(IndexedWord::word), " ");
if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
RelationTriple extraction = new RelationTriple(
subjectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
relationTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
objectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()));
extraction.istmod(istmod);
extraction.isPrefixBe(missingPrefixBe);
extraction.isSuffixBe(missingSuffixBe);
extractions.add(extraction);
alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
}
}
}
}
}
//
// Filter downward polarity extractions
//
Iterator<RelationTriple> iter = extractions.iterator();
while (iter.hasNext()) {
RelationTriple term = iter.next();
boolean shouldRemove = true;
for (CoreLabel token : term) {
if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null ||
!token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards() ) {
shouldRemove = false;
}
}
if (shouldRemove) {
iter.remove(); // Don't extract things in downward polarity contexts.
}
}
// Return
return extractions;
}
// /**
// * A counter keeping track of how many times a given pattern has matched. This allows us to learn to iterate
// * over patterns in the optimal order; this is just an efficiency tweak (but an effective one!).
// */
// private final Counter<SemgrexPattern> VERB_PATTERN_HITS = new ClassicCounter<>();
/** A set of valid arcs denoting a subject entity we are interested in */
public final Set<String> VALID_SUBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod:poss"); add("nmod:tmod"); add("expl");
add("nsubj"); add("case");
}});
/** A set of valid arcs denoting an object entity we are interested in */
public final Set<String> VALID_OBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod"); add("nsubj"); add("nmod:*"); add("nmod:poss");
add("nmod:tmod"); add("conj:and"); add("advmod"); add("acl"); add("case");
// add("advcl"); // Born in Hawaii, Obama is a US citizen; citizen -advcl-> Born.
}});
/** A set of valid arcs denoting an adverbial modifier we are interested in */
public final Set<String> VALID_ADVERB_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("advmod"); add("conj"); add("cc"); add("conj:and"); add("conj:or");
add("auxpass"); add("compound:*");
}});
/**
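* Get the yield of the subtree rooted at the given word, if that subtree forms a valid chunk.
* Starting from {@code originalRoot}, this walks the dependency graph breadth-first, following only
* edges whose relation (or its ":*" generalization) appears in {@code validArcs}; copula-related
* arcs, the primary {@code case} marker, and the explicitly ignored arc are skipped. The chunk is
* rejected (and {@link Optional#empty()} returned) if a cycle is detected, if a {@code conj} appears
* without a {@code cc} (or vice versa), or, when {@code allowExtraArcs} is false, if any other arc
* is encountered; otherwise the collected words are returned in sentence order.
*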
* @see RelationTripleSegmenter#getValidSubjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
* @see RelationTripleSegmenter#getValidObjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
* @see RelationTripleSegmenter#getValidAdverbChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
*/
@SuppressWarnings("StatementWithEmptyBody")
protected Optional<List<IndexedWord>> getValidChunk(SemanticGraph parse, IndexedWord originalRoot,
Set<String> validArcs, Optional<String> ignoredArc,
boolean allowExtraArcs) {
PriorityQueue<IndexedWord> chunk = new FixedPrioritiesPriorityQueue<>();
Set<Double> seenIndices = new HashSet<>();
Queue<IndexedWord> fringe = new LinkedList<>();
IndexedWord root = originalRoot;
fringe.add(root);
boolean isCopula = false;
IndexedWord primaryCase = null;
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(originalRoot)) {
String shortName = edge.getRelation().getShortName();
if (shortName.equals("cop") || shortName.equals("auxpass")) {
isCopula = true;
}
if (shortName.equals("case")) {
primaryCase = edge.getDependent();
}
}
while (!fringe.isEmpty()) {
root = fringe.poll();
chunk.add(root, -root.pseudoPosition());
// Sanity check to prevent infinite loops
if (seenIndices.contains(root.pseudoPosition())) {
// TODO(gabor) Indicates a cycle in the tree!
return Optional.empty();
}
seenIndices.add(root.pseudoPosition());
// Check outgoing edges
boolean hasConj = false;
boolean hasCC = false;
for (SemanticGraphEdge edge : parse.getOutEdgesSorted(root)) {
String shortName = edge.getRelation().getShortName();
String name = edge.getRelation().toString();
if (shortName.startsWith("conj")) { hasConj = true; }
if (shortName.equals("cc")) { hasCC = true; }
//noinspection StatementWithEmptyBody
if (isCopula && (shortName.equals("cop") || shortName.contains("subj") || shortName.equals("auxpass") )) {
// noop; ignore nsubj, cop for extractions with copula
} else if (edge.getDependent() == primaryCase) {
// noop: ignore case edge
} else if (ignoredArc.isPresent() &&
(ignoredArc.get().equals(name) || (ignoredArc.get().startsWith("conj") && name.equals("cc")))) {
// noop; ignore explicitly requested noop arc, or "CC" if the noop arc is a conj:*
} else if (!validArcs.contains(edge.getRelation().getShortName()) && !validArcs.contains(edge.getRelation().getShortName().replaceAll(":.*",":*"))) {
if (!allowExtraArcs) {
return Optional.empty();
} else {
// noop: just some dangling arc
}
} else {
fringe.add(edge.getDependent());
}
}
// Ensure that we don't have a conj without a cc, or vice versa
if (Boolean.logicalXor(hasConj, hasCC)) {
return Optional.empty();
}
}
return Optional.of(chunk.toSortedList());
}
/**
* @see RelationTripleSegmenter#getValidChunk(SemanticGraph, IndexedWord, Set, Optional, boolean)
*/
protected Optional<List<IndexedWord>> getValidChunk(SemanticGraph parse, IndexedWord originalRoot,
Set<String> validArcs, Optional<String> ignoredArc) {
return getValidChunk(parse, originalRoot, validArcs, ignoredArc, false);
}
/**
* Get the yield of a given subtree, if it is a valid subject.
* Otherwise, return {@link java.util.Optional#empty()}.
* @param parse The parse tree we are extracting a subtree from.
* @param root The root of the subtree.
* @param noopArc An optional edge type to ignore in gathering the chunk.
* @return If this subtree is a valid entity, we return its yield. Otherwise, we return empty.
*/
protected Optional<List<IndexedWord>> getValidSubjectChunk(SemanticGraph parse, IndexedWord root, Optional<String> noopArc) {
return getValidChunk(parse, root, VALID_SUBJECT_ARCS, noopArc);
}
/**
* Get the yield of a given subtree, if it is a valid object.
* Otherwise, return {@link java.util.Optional#empty()}.
* @param parse The parse tree we are extracting a subtree from.
* @param root The root of the subtree.
* @param noopArc An optional edge type to ignore in gathering the chunk.
* @return If this subtree is a valid entity, we return its yield. Otherwise, we return empty.
*/
protected Optional<List<IndexedWord>> getValidObjectChunk(SemanticGraph parse, IndexedWord root, Optional<String> noopArc) {
return getValidChunk(parse, root, VALID_OBJECT_ARCS, noopArc);
}
/**
* Get the yield of a given subtree, if it is an adverb chunk.
* Otherwise, return {@link java.util.Optional#empty()}.
* @param parse The parse tree we are extracting a subtree from.
* @param root The root of the subtree.
* @param noopArc An optional edge type to ignore in gathering the chunk.
* @return If this subtree is a valid adverb, we return its yield. Otherwise, we return empty.
*/
protected Optional<List<IndexedWord>> getValidAdverbChunk(SemanticGraph parse, IndexedWord root, Optional<String> noopArc) {
return getValidChunk(parse, root, VALID_ADVERB_ARCS, noopArc);
}
/**
* <p>
* Try to segment this sentence as a relation triple.
* This sentence must already match one of a few strict patterns for a valid OpenIE extraction.
* If it does not, then no relation triple is created.
* That is, this is <b>not</b> a relation extractor; it is just a utility to segment what is already a
* (subject, relation, object) triple into these three parts.
* </p>
*
* <p>
* This method will only run the verb-centric patterns.
* </p>
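* <p>
* For example, a clause like "blue cats play with yarn" (see {@link #VERB_PATTERNS}) is expected
* to segment into roughly (blue cats; play with; yarn).
* </p>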
*
* @param parse The sentence to process, as a dependency tree.
* @param confidence An optional confidence to pass on to the relation triple.
* @param consumeAll if true, force the entire parse to be consumed by the pattern.
* @return A relation triple, if this sentence matches one of the patterns of a valid relation triple.
*/
@SuppressWarnings("UnnecessaryLabelOnContinueStatement")
private Optional<RelationTriple> segmentVerb(SemanticGraph parse,
Optional<Double> confidence,
boolean consumeAll) {
// Run pattern loop
PATTERN_LOOP: for (SemgrexPattern pattern : VERB_PATTERNS) { // For every candidate pattern...
SemgrexMatcher m = pattern.matcher(parse);
if (m.matches()) { // ... see if it matches the sentence
if ("nmod:poss".equals(m.getRelnString("prepEdge"))) {
continue PATTERN_LOOP; // nmod:poss is not a preposition!
}
int numKnownDependents = 2; // subject and object, at minimum
boolean istmod = false; // this is a tmod relation
// Object
IndexedWord object = m.getNode("appos");
if (object == null) {
object = m.getNode("object");
}
if (object != null && object.tag() != null && object.tag().startsWith("W")) {
continue; // don't extract WH arguments
}
assert object != null;
// Verb
PriorityQueue<IndexedWord> verbChunk = new FixedPrioritiesPriorityQueue<>();
IndexedWord verb = m.getNode("verb");
List<IndexedWord> adverbs = new ArrayList<>();
Optional<String> subjNoopArc = Optional.empty();
Optional<String> objNoopArc = Optional.empty();
assert verb != null;
// Case: a standard extraction with a main verb
IndexedWord relObj = m.getNode("relObj");
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(verb)) {
if ("advmod".equals(edge.getRelation().toString()) ||
"amod".equals(edge.getRelation().toString()) ||
"compound:*".equals(edge.getRelation().toString().replaceAll(":.*", ":*"))) {
// Add adverb modifiers
String tag = edge.getDependent().backingLabel().tag();
if (tag == null ||
(!tag.startsWith("W") && !edge.getDependent().backingLabel().word().equalsIgnoreCase("then"))) { // prohibit advmods like "where"
adverbs.add(edge.getDependent());
}
} else if (edge.getDependent().equals(relObj)) {
// Add additional object to the relation
Optional<List<IndexedWord>> relObjSpan = getValidChunk(parse, relObj, Collections.singleton("compound"), Optional.empty());
if (!relObjSpan.isPresent()) {
continue PATTERN_LOOP;
} else {
for (IndexedWord token : relObjSpan.get()) {
verbChunk.add(token, -token.pseudoPosition());
}
numKnownDependents += 1;
}
}
}
verbChunk.add(verb, -verb.pseudoPosition());
// Prepositions
IndexedWord prep = m.getNode("prep");
String prepEdge = m.getRelnString("prepEdge");
if (prep != null) {
// (get the preposition chunk)
Optional<List<IndexedWord>> chunk = getValidChunk(parse, prep, Collections.singleton("mwe"), Optional.empty(), true);
// (continue if no chunk found)
if (!chunk.isPresent()) {
continue PATTERN_LOOP; // Probably something like a conj w/o a cc
}
// (add the preposition)
for (IndexedWord word : chunk.get()) {
verbChunk.add(word, Integer.MIN_VALUE / 2 - word.pseudoPosition());
}
}
// (handle special prepositions)
if (prepEdge != null) {
String prepStringFromEdge = prepEdge.substring(prepEdge.indexOf(":") + 1).replace("_", " ");
if ("tmod".equals(prepStringFromEdge)) {
istmod = true;
}
}
// Auxiliary "be"
IndexedWord be = m.getNode("be");
if (be != null) { verbChunk.add(be, -be.pseudoPosition()); numKnownDependents += 1; }
// (adverbs have to be well-formed)
if (!adverbs.isEmpty()) {
Set<IndexedWord> adverbialModifiers = new HashSet<>();
for (IndexedWord adv : adverbs) {
Optional<List<IndexedWord>> adverbChunk = getValidAdverbChunk(parse, adv, Optional.empty());
if (adverbChunk.isPresent()) {
adverbialModifiers.addAll(adverbChunk.get());
} else {
continue PATTERN_LOOP; // Invalid adverbial phrase
}
numKnownDependents += 1;
}
for (IndexedWord adverbToken : adverbialModifiers) {
verbChunk.add(adverbToken, -adverbToken.pseudoPosition());
}
}
// (check for additional edges)
if (consumeAll && parse.outDegree(verb) > numKnownDependents) {
//noinspection UnnecessaryLabelOnContinueStatement
continue PATTERN_LOOP; // Too many outgoing edges; we didn't consume them all.
}
List<IndexedWord> relation = verbChunk.toSortedList();
int appendI = 0;
IndexedWord relAppend = m.getNode("relappend" + appendI);
while (relAppend != null) {
relation.add(relAppend);
appendI += 1;
relAppend = m.getNode("relappend" + appendI);
}
// Last chance to register ignored edges
if (!subjNoopArc.isPresent()) {
subjNoopArc = Optional.ofNullable(m.getRelnString("subjIgnored"));
if (!subjNoopArc.isPresent()) {
subjNoopArc = Optional.ofNullable(m.getRelnString("prepEdge")); // For some strange "there are" cases
}
}
if (!objNoopArc.isPresent()) {
objNoopArc = Optional.ofNullable(m.getRelnString("objIgnored"));
}
// Find the subject
// By default, this is just the subject node; but occasionally we want to follow a
// csubj clause to find the real subject.
IndexedWord subject = m.getNode("subject");
if (subject != null && subject.tag() != null && subject.tag().startsWith("W")) {
continue; // don't extract WH subjects
}
// Subject+Object
Optional<List<IndexedWord>> subjectSpan = getValidSubjectChunk(parse, subject, subjNoopArc);
Optional<List<IndexedWord>> objectSpan = getValidObjectChunk(parse, object, objNoopArc);
// Create relation
if (subjectSpan.isPresent() && objectSpan.isPresent() &&
CollectionUtils.intersection(new HashSet<>(subjectSpan.get()), new HashSet<>(objectSpan.get())).isEmpty()
) { // ... and has a valid subject+object
// Success! Found a valid extraction.
RelationTriple.WithTree extraction = new RelationTriple.WithTree(
subjectSpan.get().stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
relation.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
objectSpan.get().stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
parse, confidence.orElse(1.0));
extraction.istmod(istmod);
return Optional.of(extraction);
}
}
}
// Failed to match any pattern; return failure
return Optional.empty();
}
/**
* Same as {@link RelationTripleSegmenter#segmentVerb}, but with ACL clauses.
* This is a bit out of the ordinary, logic-wise, so it sits in its own function.
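* For example, a fragment like "Obama born in Hawaii" (an {@code acl} clause hanging off the
* subject) is expected to segment into roughly (Obama; born in; Hawaii).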
*/
private Optional<RelationTriple> segmentACL(SemanticGraph parse, Optional<Double> confidence, boolean consumeAll) {
IndexedWord subject = parse.getFirstRoot();
Optional<List<IndexedWord>> subjectSpan = getValidSubjectChunk(parse, subject, Optional.of("acl"));
if (subjectSpan.isPresent()) {
// found a valid subject
for (SemanticGraphEdge edgeFromSubj : parse.outgoingEdgeIterable(subject)) {
if ("acl".equals(edgeFromSubj.getRelation().toString())) {
// found a valid relation
IndexedWord relation = edgeFromSubj.getDependent();
List<IndexedWord> relationSpan = new ArrayList<>();
relationSpan.add(relation);
List<IndexedWord> objectSpan = new ArrayList<>();
List<IndexedWord> ppSpan = new ArrayList<>();
Optional<IndexedWord> pp = Optional.empty();
// Get other arguments
for (SemanticGraphEdge edgeFromRel : parse.outgoingEdgeIterable(relation)) {
String rel = edgeFromRel.getRelation().toString();
// Collect adverbs
if ("advmod".equals(rel)) {
Optional<List<IndexedWord>> advSpan = getValidAdverbChunk(parse, edgeFromRel.getDependent(), Optional.empty());
if (!advSpan.isPresent()) {
return Optional.empty(); // bad adverb span!
}
relationSpan.addAll(advSpan.get());
}
// Collect object
else if (rel.endsWith("obj")) {
if (!objectSpan.isEmpty()) {
return Optional.empty(); // duplicate objects!
}
Optional<List<IndexedWord>> maybeObjSpan = getValidObjectChunk(parse, edgeFromRel.getDependent(), Optional.empty());
if (!maybeObjSpan.isPresent()) {
return Optional.empty(); // bad object span!
}
objectSpan.addAll(maybeObjSpan.get());
}
// Collect pp
else if (rel.startsWith("nmod:")) {
if (!ppSpan.isEmpty()) {
return Optional.empty(); // duplicate prepositional phrases!
}
Optional<List<IndexedWord>> maybePPSpan = getValidObjectChunk(parse, edgeFromRel.getDependent(), Optional.of("case"));
if (!maybePPSpan.isPresent()) {
return Optional.empty(); // bad object span!
}
ppSpan.addAll(maybePPSpan.get());
// Add the actual preposition, if we can find it
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(edgeFromRel.getDependent())) {
if ("case".equals(edge.getRelation().toString())) {
pp = Optional.of(edge.getDependent());
}
}
}
else if (consumeAll) {
return Optional.empty(); // bad edge out of the relation
}
}
// Construct a triple
// (canonicalize the triple to be subject; relation; object, folding in the PP)
if (!ppSpan.isEmpty() && !objectSpan.isEmpty()) {
relationSpan.addAll(objectSpan);
objectSpan = ppSpan;
} else if (!ppSpan.isEmpty()) {
objectSpan = ppSpan;
}
// (last error checks -- shouldn't ever fire)
if (!subjectSpan.isPresent() || subjectSpan.get().isEmpty() || relationSpan.isEmpty() || objectSpan.isEmpty()) {
return Optional.empty();
}
// (sort the relation span)
Collections.sort(relationSpan, (a, b) -> {
double val = a.pseudoPosition() - b.pseudoPosition();
if (val < 0) {
return -1;
}
if (val > 0) {
return 1;
} else {
return 0;
}
});
// (add in the PP node, if it exists)
if (pp.isPresent()) {
relationSpan.add(pp.get());
}
// (success!)
RelationTriple.WithTree extraction = new RelationTriple.WithTree(
subjectSpan.get().stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
relationSpan.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
objectSpan.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
parse, confidence.orElse(1.0));
return Optional.of(extraction);
}
}
}
// Nothing found; return
return Optional.empty();
}
/**
* <p>
* Try to segment this sentence as a relation triple.
* This sentence must already match one of a few strict patterns for a valid OpenIE extraction.
* If it does not, then no relation triple is created.
* That is, this is <b>not</b> a relation extractor; it is just a utility to segment what is already a
* (subject, relation, object) triple into these three parts.
* </p>
*
* <p>
* This method will attempt to use both the verb-centric patterns and the ACL-centric patterns.
* </p>
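* <p>
* A minimal call sketch (illustrative only; {@code fragmentParse} is an assumed clause-level
* dependency graph, e.g. produced by the clause splitter):
* </p>
* <pre>{@code
* Optional<RelationTriple> triple =
*     new RelationTripleSegmenter().segment(fragmentParse, Optional.of(0.9), false);
* }</pre>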
*
* @param parse The sentence to process, as a dependency tree.
* @param confidence An optional confidence to pass on to the relation triple.
* @param consumeAll if true, force the entire parse to be consumed by the pattern.
* @return A relation triple, if this sentence matches one of the patterns of a valid relation triple.
*/
public Optional<RelationTriple> segment(SemanticGraph parse, Optional<Double> confidence, boolean consumeAll) {
// Copy and clean the tree
parse = new SemanticGraph(parse);
// Special case "there is <something>". Arguably this is a job for the clause splitter, but the <something> is
// sometimes not _really_ its own clause
IndexedWord root = parse.getFirstRoot();
if ( (root.lemma() != null && root.lemma().equalsIgnoreCase("be")) ||
(root.lemma() == null && ("is".equalsIgnoreCase(root.word()) ||
"are".equalsIgnoreCase(root.word()) ||
"were".equalsIgnoreCase(root.word()) ||
"be".equalsIgnoreCase(root.word())))) {
// Check for the "there is" construction
boolean foundThere = false;
boolean tooManyArcs = false; // an indicator for there being too much nonsense hanging off the root
Optional<SemanticGraphEdge> newRoot = Optional.empty();
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(root)) {
if (edge.getRelation().toString().equals("expl") && edge.getDependent().word().equalsIgnoreCase("there")) {
foundThere = true;
} else if (edge.getRelation().toString().equals("nsubj")) {
newRoot = Optional.of(edge);
} else {
tooManyArcs = true;
}
}
// (split off "there is")
if (foundThere && newRoot.isPresent() && !tooManyArcs) {
ClauseSplitterSearchProblem.splitToChildOfEdge(parse, newRoot.get());
}
}
// Run the patterns
Optional<RelationTriple> extraction = segmentVerb(parse, confidence, consumeAll);
if (!extraction.isPresent()) {
extraction = segmentACL(parse, confidence, consumeAll);
}
//
// Remove downward polarity extractions
//
if (extraction.isPresent()) {
boolean shouldRemove = true;
for (CoreLabel token : extraction.get()) {
if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null ||
!token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards()) {
shouldRemove = false;
}
}
if (shouldRemove) {
return Optional.empty();
}
}
// Return
return extraction;
}
/**
* Segment the given parse tree, forcing all nodes to be consumed.
* @see RelationTripleSegmenter#segment(edu.stanford.nlp.semgraph.SemanticGraph, Optional, boolean)
*/
public Optional<RelationTriple> segment(SemanticGraph parse, Optional<Double> confidence) {
return segment(parse, confidence, true);
}
}