package edu.stanford.nlp.naturalli;
import edu.stanford.nlp.ie.machinereading.structure.Span;
import edu.stanford.nlp.ie.util.RelationTriple;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.PriorityQueue;
import java.util.*;
import java.util.stream.Collectors;
/**
* This class takes a {@link edu.stanford.nlp.naturalli.SentenceFragment} and converts it to a conventional
* OpenIE triple, as materialized in the {@link RelationTriple} class.
*
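* <p>
* A minimal usage sketch (illustrative only; the {@code parse} and {@code tokens} variables are
* assumed to be supplied by the caller, e.g. from an annotated sentence):
* </p>
* <pre>{@code
* RelationTripleSegmenter segmenter = new RelationTripleSegmenter();
* // Nominal extractions from the token and dependency patterns:
* List<RelationTriple> nominals = segmenter.extract(parse, tokens);
* // Segment an (already clause-like) dependency graph into a single triple, if possible:
* Optional<RelationTriple> triple = segmenter.segment(parse, Optional.empty());
* }</pre>
*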
* @author Gabor Angeli
*/
@SuppressWarnings("WeakerAccess")
public class RelationTripleSegmenter {
private final boolean allowNominalsWithoutNER;
/** A list of patterns to match relation extractions against */
public final List<SemgrexPattern> VERB_PATTERNS = Collections.unmodifiableList(new ArrayList<SemgrexPattern>() {{
// { blue cats play [quietly] with yarn,
// Jill blew kisses at Jack,
// cats are standing next to dogs }
add(SemgrexPattern.compile("{$}=verb ?>/cop|aux(pass)?/ {}=be >/.subj(pass)?/ {}=subject >/(nmod|acl|advcl):.*/=prepEdge ( {}=object ?>appos {} = appos ?>case {}=prep) ?>dobj {pos:/N.*/}=relObj"));
// { cats are cute,
// horses are grazing peacefully }
add(SemgrexPattern.compile("{$}=object >/.subj(pass)?/ {}=subject >/cop|aux(pass)?/ {}=verb ?>case {}=prep"));
// { fish like to swim }
add(SemgrexPattern.compile("{$}=verb >/.subj(pass)?/ {}=subject >xcomp ( {}=object ?>appos {}=appos )"));
// { cats have tails }
add(SemgrexPattern.compile("{$}=verb ?>/aux(pass)?/ {}=be >/.subj(pass)?/ {}=subject >/[di]obj|xcomp/ ( {}=object ?>appos {}=appos )"));
// { Tom and Jerry were fighting }
add(SemgrexPattern.compile("{$}=verb >/nsubj(pass)?/ ( {}=subject >/conj:and/=subjIgnored {}=object )"));
// { mass of iron is 55amu }
add(SemgrexPattern.compile("{pos:/NNS?/}=object >cop {}=relappend1 >/nsubj(pass)?/ ( {}=verb >/nmod:of/ ( {pos:/NNS?/}=subject >case {}=relappend0 ) )"));
}});
/**
* <p>
* A set of derivative patterns from {@link RelationTripleSegmenter#VERB_PATTERNS} that ignore the subject
* arc. This is useful primarily for creating a training set for the clause splitter which emulates the
* behavior of the relation triple segmenter component.
* </p>
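* <p>
* For example, the first verb pattern is (roughly) rewritten by dropping its
* {@code >/.subj(pass)?/ {}=subject} clause and replacing the {@code $} root restriction with
* {@code pos:/V.*/}, so that it matches a bare verb phrase rather than a full clause.
* </p>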
*/
public final List<SemgrexPattern> VP_PATTERNS = Collections.unmodifiableList(new ArrayList<SemgrexPattern>() {{
for (SemgrexPattern pattern : VERB_PATTERNS) {
String fullPattern = pattern.pattern();
String vpPattern = fullPattern
.replace(">/.subj(pass)?/ {}=subject", "") // drop the subject
.replace("$", "pos:/V.*/"); // but, force the root to be on a verb
add(SemgrexPattern.compile(vpPattern));
}
}});
/**
* A set of nominal patterns that don't require being in a coherent clause, but do require NER information.
*/
public final List<TokenSequencePattern> NOUN_TOKEN_PATTERNS = Collections.unmodifiableList(new ArrayList<TokenSequencePattern>() {{
// { NER nominal_verb NER,
// United States president Obama }
add(TokenSequencePattern.compile("(?$object [ner:/PERSON|ORGANIZATION|LOCATION+/]+ ) (?$beof_comp [ {tag:/NN.*/} & !{ner:/PERSON|ORGANIZATION|LOCATION/} ]+ ) (?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ )"));
// { NER 's nominal_verb NER,
// America 's president , Obama }
add(TokenSequencePattern.compile("(?$object [ner:/PERSON|ORGANIZATION|LOCATION+/]+ ) /'s/ (?$beof_comp [ {tag:/NN.*/} & !{ner:/PERSON|ORGANIZATION|LOCATION/} ]+ ) /,/? (?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ )"));
// { NER , NER ,,
// Obama, 28, ...,
// Obama (28) ...}
add(TokenSequencePattern.compile("(?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ ) /,/ (?$object [ner:/NUMBER|DURATION|PERSON|ORGANIZATION/]+ ) /,/"));
add(TokenSequencePattern.compile("(?$subject [ner:/PERSON|ORGANIZATION|LOCATION/]+ ) /\\(/ (?$object [ner:/NUMBER|DURATION|PERSON|ORGANIZATION/]+ ) /\\)/"));
}});
/**
* A set of nominal patterns using dependencies that don't require being in a coherent clause, but do require NER information.
*/
private final List<SemgrexPattern> NOUN_DEPENDENCY_PATTERNS;
/**
* Create a new relation triple segmenter.
*
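* <p>
* A small construction sketch (illustrative only):
* </p>
* <pre>{@code
* // Nominal extractions gated on NER tags (the default used by the no-argument constructor):
* RelationTripleSegmenter strict = new RelationTripleSegmenter(false);
* // Also allow nominal extractions that are not licensed by NER tags:
* RelationTripleSegmenter permissive = new RelationTripleSegmenter(true);
* }</pre>
*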
* @param allowNominalsWithoutNER If true, extract all nominal relations and not just those which are warranted based on
* named entity tags. For most practical applications, this greatly over-produces trivial triples.
*/
public RelationTripleSegmenter(boolean allowNominalsWithoutNER) {
this.allowNominalsWithoutNER = allowNominalsWithoutNER;
NOUN_DEPENDENCY_PATTERNS = Collections.unmodifiableList(new ArrayList<SemgrexPattern>() {{
// { Durin, son of Thorin }
add(SemgrexPattern.compile("{tag:/N.*/}=subject >appos ( {}=relation >/nmod:.*/=relaux {}=object)"));
// { Thorin's son, Durin }
add(SemgrexPattern.compile("{}=relation >/nmod:.*/=relaux {}=subject >appos {}=object"));
// { Stanford's Chris Manning }
add(SemgrexPattern.compile("{tag:/N.*/}=object >/nmod:poss/=relaux ( {}=subject >case {} )"));
// { Chris Manning of Stanford,
// [There are] cats with tails }
if (allowNominalsWithoutNER) {
add(SemgrexPattern.compile("{tag:/N.*/}=subject >/nmod:(?!poss).*/=relaux {}=object"));
} else {
add(SemgrexPattern.compile("{ner:/PERSON|ORGANIZATION|LOCATION/}=subject >/nmod:(?!poss).*/=relaux {ner:/..+/}=object"));
add(SemgrexPattern.compile("{tag:/N.*/}=subject >/nmod:(in|with)/=relaux {}=object"));
}
// { President Obama }
if (allowNominalsWithoutNER) {
add(SemgrexPattern.compile("{tag:/N.*/}=subject >/amod/=arc {}=object"));
} else {
add(SemgrexPattern.compile("{ner:/PERSON|ORGANIZATION|LOCATION/}=subject >/amod|compound/=arc {ner:/..+/}=object"));
}
}});
}
/**
* @see RelationTripleSegmenter#RelationTripleSegmenter(boolean)
*/
@SuppressWarnings("UnusedDeclaration")
public RelationTripleSegmenter() {
this(false);
}
/**
* Extract the nominal patterns from this sentence.
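* For example, the first token pattern matches a phrase like "United States president Obama",
* taking "Obama" as the subject, "United States" as the object, and "president" as the relation
* (flagged as missing a leading "be" and a trailing "of").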
*
* @see RelationTripleSegmenter#NOUN_TOKEN_PATTERNS
* @see RelationTripleSegmenter#NOUN_DEPENDENCY_PATTERNS
*
* @param parse The parse tree of the sentence to annotate.
* @param tokens The tokens of the sentence to annotate.
* @return A list of {@link RelationTriple}s. Note that these do not have an associated tree with them.
*/
@SuppressWarnings("unchecked")
public List<RelationTriple> extract(SemanticGraph parse, List<CoreLabel> tokens) {
List<RelationTriple> extractions = new ArrayList<>();
Set<Triple<Span,String,Span>> alreadyExtracted = new HashSet<>();
//
// Run Token Patterns
//
for (TokenSequencePattern tokenPattern : NOUN_TOKEN_PATTERNS) {
TokenSequenceMatcher tokenMatcher = tokenPattern.matcher(tokens);
while (tokenMatcher.find()) {
boolean missingPrefixBe;
boolean missingSuffixOf = false;
// Create subject
List<? extends CoreMap> subject = tokenMatcher.groupNodes("$subject");
Span subjectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) subject.get(0)).index() - 1, ((CoreLabel) subject.get(subject.size() - 1)).index()));
List<CoreLabel> subjectTokens = new ArrayList<>();
for (int i : subjectSpan) {
subjectTokens.add(tokens.get(i));
}
// Create object
List<? extends CoreMap> object = tokenMatcher.groupNodes("$object");
Span objectSpan = Util.extractNER(tokens, Span.fromValues(((CoreLabel) object.get(0)).index() - 1, ((CoreLabel) object.get(object.size() - 1)).index()));
if (Span.overlaps(subjectSpan, objectSpan)) {
continue;
}
List<CoreLabel> objectTokens = new ArrayList<>();
for (int i : objectSpan) {
objectTokens.add(tokens.get(i));
}
// Create relation
if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
List<CoreLabel> relationTokens = new ArrayList<>();
// (add the 'be')
missingPrefixBe = true;
// (add a complement to the 'be')
List<? extends CoreMap> beofComp = tokenMatcher.groupNodes("$beof_comp");
if (beofComp != null) {
// (add the complement)
for (CoreMap token : beofComp) {
if (token instanceof CoreLabel) {
relationTokens.add((CoreLabel) token);
} else {
relationTokens.add(new CoreLabel(token));
}
}
// (add the 'of')
missingSuffixOf = true;
}
// Add extraction
String relationGloss = StringUtils.join(relationTokens.stream().map(CoreLabel::word), " ");
if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
RelationTriple extraction = new RelationTriple(subjectTokens, relationTokens, objectTokens);
//noinspection ConstantConditions
extraction.isPrefixBe(missingPrefixBe);
extraction.isSuffixOf(missingSuffixOf);
extractions.add(extraction);
alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
}
}
}
//
// Run Semgrex Matches
//
for (SemgrexPattern semgrex : NOUN_DEPENDENCY_PATTERNS) {
SemgrexMatcher matcher = semgrex.matcher(parse);
while (matcher.find()) {
boolean missingPrefixBe = false;
boolean missingSuffixBe = false;
boolean istmod = false;
// Get relaux if applicable
String relaux = matcher.getRelnString("relaux");
String ignoredArc = relaux;
if (ignoredArc == null) {
ignoredArc = matcher.getRelnString("arc");
}
// Create subject
IndexedWord subject = matcher.getNode("subject");
List<IndexedWord> subjectTokens = new ArrayList<>();
Span subjectSpan;
if (subject.ner() != null && !"O".equals(subject.ner())) {
subjectSpan = Util.extractNER(tokens, Span.fromValues(subject.index() - 1, subject.index()));
for (int i : subjectSpan) {
subjectTokens.add(new IndexedWord(tokens.get(i)));
}
} else {
subjectTokens = getValidChunk(parse, subject, VALID_SUBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(subject));
subjectSpan = Util.tokensToSpan(subjectTokens);
}
// Create object
IndexedWord object = matcher.getNode("object");
List<IndexedWord> objectTokens = new ArrayList<>();
Span objectSpan;
if (object.ner() != null && !"O".equals(object.ner())) {
objectSpan = Util.extractNER(tokens, Span.fromValues(object.index() - 1, object.index()));
for (int i : objectSpan) {
objectTokens.add(new IndexedWord(tokens.get(i)));
}
} else {
objectTokens = getValidChunk(parse, object, VALID_OBJECT_ARCS, Optional.ofNullable(ignoredArc), true).orElse(Collections.singletonList(object));
objectSpan = Util.tokensToSpan(objectTokens);
}
// Check that the pair is valid
if (Span.overlaps(subjectSpan, objectSpan)) {
continue; // We extracted an identity
}
if (subjectSpan.end() == objectSpan.start() - 1 &&
(tokens.get(subjectSpan.end()).word().matches("[\\.,:;\\('\"]") ||
"CC".equals(tokens.get(subjectSpan.end()).tag()))) {
continue; // We're straddling a clause
}
if (objectSpan.end() == subjectSpan.start() - 1 &&
(tokens.get(objectSpan.end()).word().matches("[\\.,:;\\('\"]") ||
"CC".equals(tokens.get(objectSpan.end()).tag()))) {
continue; // We're straddling a clause
}
// Get any prepositional edges
String expected = relaux == null ? "" : relaux.substring(relaux.indexOf(":") + 1).replace("_", " ");
IndexedWord prepWord = null;
// (these usually come from the object)
boolean prepositionIsPrefix = false;
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(object)) {
if (edge.getRelation().toString().equals("case")) {
prepWord = edge.getDependent();
}
}
// (...but sometimes from the subject)
if (prepWord == null) {
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(subject)) {
if (edge.getRelation().toString().equals("case")) {
prepositionIsPrefix = true;
prepWord = edge.getDependent();
}
}
}
List<IndexedWord> prepChunk = Collections.emptyList();
if (prepWord != null && !expected.equals("tmod")) {
Optional<List<IndexedWord>> optionalPrepChunk = getValidChunk(parse, prepWord, Collections.singleton("mwe"), Optional.empty(), true);
if (!optionalPrepChunk.isPresent()) { continue; }
prepChunk = optionalPrepChunk.get();
Collections.sort(prepChunk, (a, b) -> {
double val = a.pseudoPosition() - b.pseudoPosition();
if (val < 0) { return -1; }
if (val > 0) { return 1; }
else { return 0; }
}); // ascending sort
}
// Get the relation
if (subjectTokens.size() > 0 && objectTokens.size() > 0) {
LinkedList<IndexedWord> relationTokens = new LinkedList<>();
IndexedWord relNode = matcher.getNode("relation");
if (relNode != null) {
// Case: we have a grounded relation span
// (add the relation)
relationTokens.add(relNode);
// (add any prepositional case markings)
if (prepositionIsPrefix) {
missingSuffixBe = true; // We're almost certainly missing a suffix 'be'
for (int i = prepChunk.size() - 1; i >=0; --i) { relationTokens.addFirst(prepChunk.get(i)); }
} else {
relationTokens.addAll(prepChunk);
}
if (expected.equalsIgnoreCase("tmod")) {
istmod = true;
}
} else {
// Case: we have a hallucinated relation span
// (mark it as missing a preceding 'be')
if (!expected.equals("poss")) {
missingPrefixBe = true;
}
// (add any prepositional case markings)
if (prepositionIsPrefix) {
for (int i = prepChunk.size() - 1; i >=0; --i) { relationTokens.addFirst(prepChunk.get(i)); }
} else {
relationTokens.addAll(prepChunk);
}
if (expected.equalsIgnoreCase("tmod")) {
istmod = true;
}
// (some fine-tuning)
if (allowNominalsWithoutNER && "of".equals(expected)) {
continue; // prohibit things like "conductor of electricity" -> "conductor; be of; electricity"
}
}
// Add extraction
String relationGloss = StringUtils.join(relationTokens.stream().map(IndexedWord::word), " ");
if (!alreadyExtracted.contains(Triple.makeTriple(subjectSpan, relationGloss, objectSpan))) {
RelationTriple extraction = new RelationTriple(
subjectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
relationTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
objectTokens.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()));
extraction.istmod(istmod);
extraction.isPrefixBe(missingPrefixBe);
extraction.isSuffixBe(missingSuffixBe);
extractions.add(extraction);
alreadyExtracted.add(Triple.makeTriple(subjectSpan, relationGloss, objectSpan));
}
}
}
}
}
//
// Filter downward polarity extractions
//
Iterator<RelationTriple> iter = extractions.iterator();
while (iter.hasNext()) {
RelationTriple term = iter.next();
boolean shouldRemove = true;
for (CoreLabel token : term) {
if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null ||
!token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards() ) {
shouldRemove = false;
}
}
if (shouldRemove) {
iter.remove(); // Don't extract things in downward polarity contexts.
}
}
// Return
return extractions;
}
// /**
// * A counter keeping track of how many times a given pattern has matched. This allows us to learn to iterate
// * over patterns in the optimal order; this is just an efficiency tweak (but an effective one!).
// */
// private final Counter<SemgrexPattern> VERB_PATTERN_HITS = new ClassicCounter<>();
/** A set of valid arcs denoting a subject entity we are interested in */
public final Set<String> VALID_SUBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod:poss"); add("nmod:tmod"); add("expl");
add("nsubj"); add("case");
}});
/** A set of valid arcs denoting an object entity we are interested in */
public final Set<String> VALID_OBJECT_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("compound"); add("aux"); add("nummod"); add("nmod"); add("nsubj"); add("nmod:*"); add("nmod:poss");
add("nmod:tmod"); add("conj:and"); add("advmod"); add("acl"); add("case");
// add("advcl"); // Born in Hawaii, Obama is a US citizen; citizen -advcl-> Born.
}});
/** A set of valid arcs denoting an adverbial modifier we are interested in */
public final Set<String> VALID_ADVERB_ARCS = Collections.unmodifiableSet(new HashSet<String>(){{
add("amod"); add("advmod"); add("conj"); add("cc"); add("conj:and"); add("conj:or");
add("auxpass"); add("compound:*");
}});
/**
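* Get the yield of the subtree rooted at the given word, if that subtree forms a valid chunk.
* Starting from {@code originalRoot}, this walks the dependency graph breadth-first, following only
* edges whose relation (or its ":*" generalization) appears in {@code validArcs}; copula-related
* arcs, the primary {@code case} marker, and the explicitly ignored arc are skipped. The chunk is
* rejected (and {@link Optional#empty()} returned) if a cycle is detected, if a {@code conj} appears
* without a {@code cc} (or vice versa), or, when {@code allowExtraArcs} is false, if any other arc
* is encountered; otherwise the collected words are returned in sentence order.
*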
* @see RelationTripleSegmenter#getValidSubjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
* @see RelationTripleSegmenter#getValidObjectChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
* @see RelationTripleSegmenter#getValidAdverbChunk(edu.stanford.nlp.semgraph.SemanticGraph, edu.stanford.nlp.ling.IndexedWord, Optional)
*/
@SuppressWarnings("StatementWithEmptyBody")
protected Optional<List<IndexedWord>> getValidChunk(SemanticGraph parse, IndexedWord originalRoot,
Set<String> validArcs, Optional<String> ignoredArc,
boolean allowExtraArcs) {
PriorityQueue<IndexedWord> chunk = new FixedPrioritiesPriorityQueue<>();
Set<Double> seenIndices = new HashSet<>();
Queue<IndexedWord> fringe = new LinkedList<>();
IndexedWord root = originalRoot;
fringe.add(root);
boolean isCopula = false;
IndexedWord primaryCase = null;
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(originalRoot)) {
String shortName = edge.getRelation().getShortName();
if (shortName.equals("cop") || shortName.equals("auxpass")) {
isCopula = true;
}
if (shortName.equals("case")) {
primaryCase = edge.getDependent();
}
}
while (!fringe.isEmpty()) {
root = fringe.poll();
chunk.add(root, -root.pseudoPosition());
// Sanity check to prevent infinite loops
if (seenIndices.contains(root.pseudoPosition())) {
// TODO(gabor) Indicates a cycle in the tree!
return Optional.empty();
}
seenIndices.add(root.pseudoPosition());
// Check outgoing edges
boolean hasConj = false;
boolean hasCC = false;
for (SemanticGraphEdge edge : parse.getOutEdgesSorted(root)) {
String shortName = edge.getRelation().getShortName();
String name = edge.getRelation().toString();
if (shortName.startsWith("conj")) { hasConj = true; }
if (shortName.equals("cc")) { hasCC = true; }
//noinspection StatementWithEmptyBody
if (isCopula && (shortName.equals("cop") || shortName.contains("subj") || shortName.equals("auxpass") )) {
// noop; ignore nsubj, cop for extractions with copula
} else if (edge.getDependent() == primaryCase) {
// noop: ignore case edge
} else if (ignoredArc.isPresent() &&
(ignoredArc.get().equals(name) || (ignoredArc.get().startsWith("conj") && name.equals("cc")))) {
// noop; ignore explicitly requested noop arc, or "CC" if the noop arc is a conj:*
} else if (!validArcs.contains(edge.getRelation().getShortName()) && !validArcs.contains(edge.getRelation().getShortName().replaceAll(":.*",":*"))) {
if (!allowExtraArcs) {
return Optional.empty();
} else {
// noop: just some dangling arc
}
} else {
fringe.add(edge.getDependent());
}
}
// Ensure that we don't have a conj without a cc, or vice versa
if (Boolean.logicalXor(hasConj, hasCC)) {
return Optional.empty();
}
}
return Optional.of(chunk.toSortedList());
}
/**
* @see RelationTripleSegmenter#getValidChunk(SemanticGraph, IndexedWord, Set, Optional, boolean)
*/
protected Optional<List<IndexedWord>> getValidChunk(SemanticGraph parse, IndexedWord originalRoot,
Set<String> validArcs, Optional<String> ignoredArc) {
return getValidChunk(parse, originalRoot, validArcs, ignoredArc, false);
}
/**
* Get the yield of a given subtree, if it is a valid subject.
* Otherwise, return {@link java.util.Optional#empty()}.
* @param parse The parse tree we are extracting a subtree from.
* @param root The root of the subtree.
* @param noopArc An optional edge type to ignore in gathering the chunk.
* @return If this subtree is a valid entity, we return its yield. Otherwise, we return empty.
*/
protected Optional<List<IndexedWord>> getValidSubjectChunk(SemanticGraph parse, IndexedWord root, Optional<String> noopArc) {
return getValidChunk(parse, root, VALID_SUBJECT_ARCS, noopArc);
}
/**
* Get the yield of a given subtree, if it is a valid object.
* Otherwise, return {@link java.util.Optional#empty()}.
* @param parse The parse tree we are extracting a subtree from.
* @param root The root of the subtree.
* @param noopArc An optional edge type to ignore in gathering the chunk.
* @return If this subtree is a valid entity, we return its yield. Otherwise, we return empty.
*/
protected Optional<List<IndexedWord>> getValidObjectChunk(SemanticGraph parse, IndexedWord root, Optional<String> noopArc) {
return getValidChunk(parse, root, VALID_OBJECT_ARCS, noopArc);
}
/**
* Get the yield of a given subtree, if it is an adverb chunk.
* Otherwise, return {@link java.util.Optional#empty()}.
* @param parse The parse tree we are extracting a subtree from.
* @param root The root of the subtree.
* @param noopArc An optional edge type to ignore in gathering the chunk.
* @return If this subtree is a valid adverb, we return its yield. Otherwise, we return empty.
*/
protected Optional<List<IndexedWord>> getValidAdverbChunk(SemanticGraph parse, IndexedWord root, Optional<String> noopArc) {
return getValidChunk(parse, root, VALID_ADVERB_ARCS, noopArc);
}
/**
* <p>
* Try to segment this sentence as a relation triple.
* This sentence must already match one of a few strict patterns for a valid OpenIE extraction.
* If it does not, then no relation triple is created.
* That is, this is <b>not</b> a relation extractor; it is just a utility to segment what is already a
* (subject, relation, object) triple into these three parts.
* </p>
*
* <p>
* This method will only run the verb-centric patterns.
* </p>
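* <p>
* For example, a clause like "blue cats play with yarn" (see {@link #VERB_PATTERNS}) is expected
* to segment into roughly (blue cats; play with; yarn).
* </p>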
*
* @param parse The sentence to process, as a dependency tree.
* @param confidence An optional confidence to pass on to the relation triple.
* @param consumeAll if true, force the entire parse to be consumed by the pattern.
* @return A relation triple, if this sentence matches one of the patterns of a valid relation triple.
*/
@SuppressWarnings("UnnecessaryLabelOnContinueStatement")
private Optional<RelationTriple> segmentVerb(SemanticGraph parse,
Optional<Double> confidence,
boolean consumeAll) {
// Run pattern loop
PATTERN_LOOP: for (SemgrexPattern pattern : VERB_PATTERNS) { // For every candidate pattern...
SemgrexMatcher m = pattern.matcher(parse);
if (m.matches()) { // ... see if it matches the sentence
if ("nmod:poss".equals(m.getRelnString("prepEdge"))) {
continue PATTERN_LOOP; // nmod:poss is not a preposition!
}
int numKnownDependents = 2; // subject and object, at minimum
boolean istmod = false; // this is a tmod relation
// Object
IndexedWord object = m.getNode("appos");
if (object == null) {
object = m.getNode("object");
}
if (object != null && object.tag() != null && object.tag().startsWith("W")) {
continue; // don't extract WH arguments
}
assert object != null;
// Verb
PriorityQueue<IndexedWord> verbChunk = new FixedPrioritiesPriorityQueue<>();
IndexedWord verb = m.getNode("verb");
List<IndexedWord> adverbs = new ArrayList<>();
Optional<String> subjNoopArc = Optional.empty();
Optional<String> objNoopArc = Optional.empty();
assert verb != null;
// Case: a standard extraction with a main verb
IndexedWord relObj = m.getNode("relObj");
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(verb)) {
if ("advmod".equals(edge.getRelation().toString()) ||
"amod".equals(edge.getRelation().toString()) ||
"compound:*".equals(edge.getRelation().toString().replaceAll(":.*", ":*"))) {
// Add adverb modifiers
String tag = edge.getDependent().backingLabel().tag();
if (tag == null ||
(!tag.startsWith("W") && !edge.getDependent().backingLabel().word().equalsIgnoreCase("then"))) { // prohibit advmods like "where"
adverbs.add(edge.getDependent());
}
} else if (edge.getDependent().equals(relObj)) {
// Add additional object to the relation
Optional<List<IndexedWord>> relObjSpan = getValidChunk(parse, relObj, Collections.singleton("compound"), Optional.empty());
if (!relObjSpan.isPresent()) {
continue PATTERN_LOOP;
} else {
for (IndexedWord token : relObjSpan.get()) {
verbChunk.add(token, -token.pseudoPosition());
}
numKnownDependents += 1;
}
}
}
verbChunk.add(verb, -verb.pseudoPosition());
// Prepositions
IndexedWord prep = m.getNode("prep");
String prepEdge = m.getRelnString("prepEdge");
if (prep != null) {
// (get the preposition chunk)
Optional<List<IndexedWord>> chunk = getValidChunk(parse, prep, Collections.singleton("mwe"), Optional.empty(), true);
// (continue if no chunk found)
if (!chunk.isPresent()) {
continue PATTERN_LOOP; // Probably something like a conj w/o a cc
}
// (add the preposition)
for (IndexedWord word : chunk.get()) {
verbChunk.add(word, Integer.MIN_VALUE / 2 - word.pseudoPosition());
}
}
// (handle special prepositions)
if (prepEdge != null) {
String prepStringFromEdge = prepEdge.substring(prepEdge.indexOf(":") + 1).replace("_", " ");
if ("tmod".equals(prepStringFromEdge)) {
istmod = true;
}
}
// Auxiliary "be"
IndexedWord be = m.getNode("be");
if (be != null) { verbChunk.add(be, -be.pseudoPosition()); numKnownDependents += 1; }
// (adverbs have to be well-formed)
if (!adverbs.isEmpty()) {
Set<IndexedWord> adverbialModifiers = new HashSet<>();
for (IndexedWord adv : adverbs) {
Optional<List<IndexedWord>> adverbChunk = getValidAdverbChunk(parse, adv, Optional.empty());
if (adverbChunk.isPresent()) {
adverbialModifiers.addAll(adverbChunk.get());
} else {
continue PATTERN_LOOP; // Invalid adverbial phrase
}
numKnownDependents += 1;
}
for (IndexedWord adverbToken : adverbialModifiers) {
verbChunk.add(adverbToken, -adverbToken.pseudoPosition());
}
}
// (check for additional edges)
if (consumeAll && parse.outDegree(verb) > numKnownDependents) {
//noinspection UnnecessaryLabelOnContinueStatement
continue PATTERN_LOOP; // Too many outgoing edges; we didn't consume them all.
}
List<IndexedWord> relation = verbChunk.toSortedList();
int appendI = 0;
IndexedWord relAppend = m.getNode("relappend" + appendI);
while (relAppend != null) {
relation.add(relAppend);
appendI += 1;
relAppend = m.getNode("relappend" + appendI);
}
// Last chance to register ignored edges
if (!subjNoopArc.isPresent()) {
subjNoopArc = Optional.ofNullable(m.getRelnString("subjIgnored"));
if (!subjNoopArc.isPresent()) {
subjNoopArc = Optional.ofNullable(m.getRelnString("prepEdge")); // For some strange "there are" cases
}
}
if (!objNoopArc.isPresent()) {
objNoopArc = Optional.ofNullable(m.getRelnString("objIgnored"));
}
// Find the subject
// By default, this is just the subject node; but occasionally we want to follow a
// csubj clause to find the real subject.
IndexedWord subject = m.getNode("subject");
if (subject != null && subject.tag() != null && subject.tag().startsWith("W")) {
continue; // don't extract WH subjects
}
// Subject+Object
Optional<List<IndexedWord>> subjectSpan = getValidSubjectChunk(parse, subject, subjNoopArc);
Optional<List<IndexedWord>> objectSpan = getValidObjectChunk(parse, object, objNoopArc);
// Create relation
if (subjectSpan.isPresent() && objectSpan.isPresent() &&
CollectionUtils.intersection(new HashSet<>(subjectSpan.get()), new HashSet<>(objectSpan.get())).isEmpty()
) { // ... and has a valid subject+object
// Success! Found a valid extraction.
RelationTriple.WithTree extraction = new RelationTriple.WithTree(
subjectSpan.get().stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
relation.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
objectSpan.get().stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
parse, confidence.orElse(1.0));
extraction.istmod(istmod);
return Optional.of(extraction);
}
}
}
// Failed to match any pattern; return failure
return Optional.empty();
}
/**
* Same as {@link RelationTripleSegmenter#segmentVerb}, but with ACL clauses.
* This is a bit out of the ordinary, logic-wise, so it sits in its own function.
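* For example, a fragment like "Obama born in Hawaii" (an {@code acl} clause hanging off the
* subject) is expected to segment into roughly (Obama; born in; Hawaii).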
*/
private Optional<RelationTriple> segmentACL(SemanticGraph parse, Optional<Double> confidence, boolean consumeAll) {
IndexedWord subject = parse.getFirstRoot();
Optional<List<IndexedWord>> subjectSpan = getValidSubjectChunk(parse, subject, Optional.of("acl"));
if (subjectSpan.isPresent()) {
// found a valid subject
for (SemanticGraphEdge edgeFromSubj : parse.outgoingEdgeIterable(subject)) {
if ("acl".equals(edgeFromSubj.getRelation().toString())) {
// found a valid relation
IndexedWord relation = edgeFromSubj.getDependent();
List<IndexedWord> relationSpan = new ArrayList<>();
relationSpan.add(relation);
List<IndexedWord> objectSpan = new ArrayList<>();
List<IndexedWord> ppSpan = new ArrayList<>();
Optional<IndexedWord> pp = Optional.empty();
// Get other arguments
for (SemanticGraphEdge edgeFromRel : parse.outgoingEdgeIterable(relation)) {
String rel = edgeFromRel.getRelation().toString();
// Collect adverbs
if ("advmod".equals(rel)) {
Optional<List<IndexedWord>> advSpan = getValidAdverbChunk(parse, edgeFromRel.getDependent(), Optional.empty());
if (!advSpan.isPresent()) {
return Optional.empty(); // bad adverb span!
}
relationSpan.addAll(advSpan.get());
}
// Collect object
else if (rel.endsWith("obj")) {
if (!objectSpan.isEmpty()) {
return Optional.empty(); // duplicate objects!
}
Optional<List<IndexedWord>> maybeObjSpan = getValidObjectChunk(parse, edgeFromRel.getDependent(), Optional.empty());
if (!maybeObjSpan.isPresent()) {
return Optional.empty(); // bad object span!
}
objectSpan.addAll(maybeObjSpan.get());
}
// Collect pp
else if (rel.startsWith("nmod:")) {
if (!ppSpan.isEmpty()) {
return Optional.empty(); // duplicate prepositional phrases!
}
Optional<List<IndexedWord>> maybePPSpan = getValidObjectChunk(parse, edgeFromRel.getDependent(), Optional.of("case"));
if (!maybePPSpan.isPresent()) {
return Optional.empty(); // bad object span!
}
ppSpan.addAll(maybePPSpan.get());
// Add the actual preposition, if we can find it
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(edgeFromRel.getDependent())) {
if ("case".equals(edge.getRelation().toString())) {
pp = Optional.of(edge.getDependent());
}
}
}
else if (consumeAll) {
return Optional.empty(); // bad edge out of the relation
}
}
// Construct a triple
// (canonicalize the triple to be subject; relation; object, folding in the PP)
if (!ppSpan.isEmpty() && !objectSpan.isEmpty()) {
relationSpan.addAll(objectSpan);
objectSpan = ppSpan;
} else if (!ppSpan.isEmpty()) {
objectSpan = ppSpan;
}
// (last error checks -- shouldn't ever fire)
if (!subjectSpan.isPresent() || subjectSpan.get().isEmpty() || relationSpan.isEmpty() || objectSpan.isEmpty()) {
return Optional.empty();
}
// (sort the relation span)
Collections.sort(relationSpan, (a, b) -> {
double val = a.pseudoPosition() - b.pseudoPosition();
if (val < 0) {
return -1;
}
if (val > 0) {
return 1;
} else {
return 0;
}
});
// (add in the PP node, if it exists)
if (pp.isPresent()) {
relationSpan.add(pp.get());
}
// (success!)
RelationTriple.WithTree extraction = new RelationTriple.WithTree(
subjectSpan.get().stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
relationSpan.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
objectSpan.stream().map(IndexedWord::backingLabel).collect(Collectors.toList()),
parse, confidence.orElse(1.0));
return Optional.of(extraction);
}
}
}
// Nothing found; return
return Optional.empty();
}
/**
* <p>
* Try to segment this sentence as a relation triple.
* This sentence must already match one of a few strict patterns for a valid OpenIE extraction.
* If it does not, then no relation triple is created.
* That is, this is <b>not</b> a relation extractor; it is just a utility to segment what is already a
* (subject, relation, object) triple into these three parts.
* </p>
*
* <p>
* This method will attempt to use both the verb-centric patterns and the ACL-centric patterns.
* </p>
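* <p>
* A minimal call sketch (illustrative only; {@code fragmentParse} is an assumed clause-level
* dependency graph, e.g. produced by the clause splitter):
* </p>
* <pre>{@code
* Optional<RelationTriple> triple =
*     new RelationTripleSegmenter().segment(fragmentParse, Optional.of(0.9), false);
* }</pre>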
*
* @param parse The sentence to process, as a dependency tree.
* @param confidence An optional confidence to pass on to the relation triple.
* @param consumeAll if true, force the entire parse to be consumed by the pattern.
* @return A relation triple, if this sentence matches one of the patterns of a valid relation triple.
*/
public Optional<RelationTriple> segment(SemanticGraph parse, Optional<Double> confidence, boolean consumeAll) {
// Copy and clean the tree
parse = new SemanticGraph(parse);
// Special case "there is <something>". Arguably this is a job for the clause splitter, but the <something> is
// sometimes not _really_ its own clause
IndexedWord root = parse.getFirstRoot();
if ( (root.lemma() != null && root.lemma().equalsIgnoreCase("be")) ||
(root.lemma() == null && ("is".equalsIgnoreCase(root.word()) ||
"are".equalsIgnoreCase(root.word()) ||
"were".equalsIgnoreCase(root.word()) ||
"be".equalsIgnoreCase(root.word())))) {
// Check for the "there is" construction
boolean foundThere = false;
boolean tooManyArcs = false; // an indicator for there being too much nonsense hanging off the root
Optional<SemanticGraphEdge> newRoot = Optional.empty();
for (SemanticGraphEdge edge : parse.outgoingEdgeIterable(root)) {
if (edge.getRelation().toString().equals("expl") && edge.getDependent().word().equalsIgnoreCase("there")) {
foundThere = true;
} else if (edge.getRelation().toString().equals("nsubj")) {
newRoot = Optional.of(edge);
} else {
tooManyArcs = true;
}
}
// (split off "there is")
if (foundThere && newRoot.isPresent() && !tooManyArcs) {
ClauseSplitterSearchProblem.splitToChildOfEdge(parse, newRoot.get());
}
}
// Run the patterns
Optional<RelationTriple> extraction = segmentVerb(parse, confidence, consumeAll);
if (!extraction.isPresent()) {
extraction = segmentACL(parse, confidence, consumeAll);
}
//
// Remove downward polarity extractions
//
if (extraction.isPresent()) {
boolean shouldRemove = true;
for (CoreLabel token : extraction.get()) {
if (token.get(NaturalLogicAnnotations.PolarityAnnotation.class) == null ||
!token.get(NaturalLogicAnnotations.PolarityAnnotation.class).isDownwards()) {
shouldRemove = false;
}
}
if (shouldRemove) {
return Optional.empty();
}
}
// Return
return extraction;
}
/**
* Segment the given parse tree, forcing all nodes to be consumed.
* @see RelationTripleSegmenter#segment(edu.stanford.nlp.semgraph.SemanticGraph, Optional, boolean)
*/
public Optional<RelationTriple> segment(SemanticGraph parse, Optional<Double> confidence) {
return segment(parse, confidence, true);
}
}