MentionExtractor.java example

Explorer
CoreNLP-master
//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2011 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.dcoref;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
 * Generic mention extractor from a corpus.
 *
 * @author Jenny Finkel
 * @author Mihai Surdeanu
 * @author Karthik Raghunathan
 * @author Heeyoung Lee
 * @author Sudarshan Rangarajan
 */
public class MentionExtractor {

  private final HeadFinder headFinder;

  protected String currentDocumentID;

  protected final Dictionaries dictionaries;
  protected final Semantics semantics;

  public CorefMentionFinder mentionFinder;
  protected StanfordCoreNLP stanfordProcessor;
  protected LogisticClassifier<String, String> singletonPredictor;

  /** The maximum mention ID: for preventing duplicated mention ID assignment */
  protected int maxID = -1;

  public static final boolean VERBOSE = false;

  public MentionExtractor(Dictionaries dict, Semantics semantics) {
    this.headFinder = new SemanticHeadFinder();
    this.dictionaries = dict;
    this.semantics = semantics;
    this.mentionFinder = new RuleBasedCorefMentionFinder();  // Default
  }

  public void setMentionFinder(CorefMentionFinder mentionFinder)
  {
    this.mentionFinder = mentionFinder;
  }

  /**
   * Extracts the info relevant for coref from the next document in the corpus
   * @return List of mentions found in each sentence ordered according to the tree traversal.
   * @throws Exception
   */
  public Document nextDoc() throws Exception { return null; }

  /**
   * Reset so that we start at the beginning of the document collection
   */
  public void resetDocs() {
    maxID = -1;
    currentDocumentID = null;
  }

  public Document arrange(
      Annotation anno,
      List<List<CoreLabel>> words,
      List<Tree> trees,
      List<List<Mention>> unorderedMentions) throws Exception {
    return arrange(anno, words, trees, unorderedMentions, null, false);
  }

  protected int getHeadIndex(Tree t) {
    // The trees passed in do not have the CoordinationTransformer
    // applied, but that just means the SemanticHeadFinder results are
    // slightly worse.
    Tree ht = t.headTerminal(headFinder);
    if(ht==null) return -1;  // temporary: a key which is matched to nothing
    CoreLabel l = (CoreLabel) ht.label();
    return l.get(CoreAnnotations.IndexAnnotation.class);
  }
  private String treeToKey(Tree t) {
    int idx = getHeadIndex(t);
    String key = Integer.toString(idx) + ':' + t.toString();
    return key;
  }

  public Document arrange(
      Annotation anno,
      List<List<CoreLabel>> words,
      List<Tree> trees,
      List<List<Mention>> unorderedMentions,
      List<List<Mention>> unorderedGoldMentions,
      boolean doMergeLabels) throws Exception {
    List<List<Mention>> predictedOrderedMentionsBySentence = arrange(anno, words, trees, unorderedMentions, doMergeLabels);
    List<List<Mention>> goldOrderedMentionsBySentence = null;
//    SieveCoreferenceSystem.debugPrintMentions(System.err, "UNORDERED GOLD MENTIONS:", unorderedGoldMentions);

    if(unorderedGoldMentions != null) {
      goldOrderedMentionsBySentence = arrange(anno, words, trees, unorderedGoldMentions, doMergeLabels);
    }
//    SieveCoreferenceSystem.debugPrintMentions(System.err, "ORDERED GOLD MENTIONS:", goldOrderedMentionsBySentence);
    return new Document(anno, predictedOrderedMentionsBySentence, goldOrderedMentionsBySentence, dictionaries);
  }

  /**
   * Post-processes the extracted mentions. Here we set the Mention fields required for coref and order mentions by tree-traversal order.
   * @param words List of words in each sentence, in textual order
   * @param trees List of trees, one per sentence
   * @param unorderedMentions List of unordered, unprocessed mentions
   *                 Each mention MUST have startIndex and endIndex set!
   *                 Optionally, if scoring is desired, mentions must have mentionID and originalRef set.
   *                 All the other Mention fields are set here.
   * @return List of mentions ordered according to the tree traversal
   * @throws Exception
   */
  public List<List<Mention>> arrange(
      Annotation anno,
      List<List<CoreLabel>> words,
      List<Tree> trees,
      List<List<Mention>> unorderedMentions,
      boolean doMergeLabels) throws Exception {

    List<List<Mention>> orderedMentionsBySentence = new ArrayList<>();

    //
    // traverse all sentences and process each individual one
    //
    for (int sent = 0, sz = words.size(); sent < sz; sent ++) {
      List<CoreLabel> sentence = words.get(sent);
      Tree tree = trees.get(sent);
      List<Mention> mentions = unorderedMentions.get(sent);
      Map<String, List<Mention>> mentionsToTrees = Generics.newHashMap();

      // merge the parse tree of the entire sentence with the sentence words
      if(doMergeLabels) mergeLabels(tree, sentence);

      //
      // set the surface information and the syntactic info in each mention
      // startIndex and endIndex MUST be set before!
      //
      for (Mention mention: mentions) {
        mention.contextParseTree = tree;
        mention.sentenceWords = sentence;
        mention.originalSpan = new ArrayList<>(mention.sentenceWords.subList(mention.startIndex, mention.endIndex));
        if(!((CoreLabel) tree.label()).containsKey(CoreAnnotations.BeginIndexAnnotation.class)) tree.indexSpans(0);
        if(mention.headWord==null) {
          Tree headTree = ((RuleBasedCorefMentionFinder) mentionFinder).findSyntacticHead(mention, tree, sentence);
          mention.headWord = (CoreLabel)headTree.label();
          mention.headIndex = mention.headWord.get(CoreAnnotations.IndexAnnotation.class) - 1;
        }
        if(mention.mentionSubTree==null) {
          // mentionSubTree = highest NP that has the same head
          Tree headTree = tree.getLeaves().get(mention.headIndex);
          if (headTree == null) { throw new RuntimeException("Missing head tree for a mention!"); }
          Tree t = headTree;
          while ((t = t.parent(tree)) != null) {
            if (t.headTerminal(headFinder) == headTree && t.value().equals("NP")) {
              mention.mentionSubTree = t;
            } else if(mention.mentionSubTree != null){
              break;
            }
          }
          if (mention.mentionSubTree == null) {
            mention.mentionSubTree = headTree;
          }
        }

        List<Mention> mentionsForTree = mentionsToTrees.get(treeToKey(mention.mentionSubTree));
        if(mentionsForTree == null){
          mentionsForTree = new ArrayList<>();
          mentionsToTrees.put(treeToKey(mention.mentionSubTree), mentionsForTree);
        }
        mentionsForTree.add(mention);

        // generates all fields required for coref, such as gender, number, etc.
        mention.process(dictionaries, semantics, this, singletonPredictor);
      }

      //
      // Order all mentions in tree-traversal order
      //
      List<Mention> orderedMentions = new ArrayList<>();
      orderedMentionsBySentence.add(orderedMentions);

      // extract all mentions in tree traversal order (alternative: tree.postOrderNodeList())
      for (Tree t : tree.preOrderNodeList()) {
        List<Mention> lm = mentionsToTrees.get(treeToKey(t));
        if(lm != null){
          for(Mention m: lm){
            orderedMentions.add(m);
          }
        }
      }

      //
      // find appositions, predicate nominatives, relative pronouns in this sentence
      //
      findSyntacticRelations(tree, orderedMentions);
      assert(mentions.size() == orderedMentions.size());
    }
    return orderedMentionsBySentence;
  }

  /**
   * Sets the label of the leaf nodes of a Tree to be the CoreLabels in the given sentence.
   * The original value() of the Tree nodes is preserved, and otherwise the label of tree
   * leaves becomes the label from the List.
   */
  // todo [cdm 2015]: This clearly shouldn't be here! Maybe it's not needed at all now since parsing code does this?
  public static void mergeLabels(Tree tree, List<CoreLabel> sentence) {
    int idx = 0;
    for (Tree t : tree.getLeaves()) {
      CoreLabel cl = sentence.get(idx ++);
      String value = t.value();
      cl.set(CoreAnnotations.ValueAnnotation.class, value);
      t.setLabel(cl);
    }
    tree.indexLeaves();
  }

  private static boolean inside(int i, Mention m) {
    return i >= m.startIndex && i < m.endIndex;
  }

  /** Find syntactic relations (e.g., appositives) in a sentence */
  private void findSyntacticRelations(Tree tree, List<Mention> orderedMentions) {
    markListMemberRelation(orderedMentions);

    Set<Pair<Integer, Integer>> appos = Generics.newHashSet();
    // TODO: This apposition finding doesn't seem to be very good - what about using "appos" from dependencies?
    findAppositions(tree, appos);
    markMentionRelation(orderedMentions, appos, "APPOSITION");

    Set<Pair<Integer, Integer>> preNomi = Generics.newHashSet();
    findPredicateNominatives(tree, preNomi);
    markMentionRelation(orderedMentions, preNomi, "PREDICATE_NOMINATIVE");

    Set<Pair<Integer, Integer>> relativePronounPairs = Generics.newHashSet();
    findRelativePronouns(tree, relativePronounPairs);
    markMentionRelation(orderedMentions, relativePronounPairs, "RELATIVE_PRONOUN");
  }

  /** Find syntactic pattern in a sentence by tregex */
  private void findTreePattern(Tree tree, String tregex, Set<Pair<Integer, Integer>> foundPairs) {
    try {
      TregexPattern tgrepPattern = TregexPattern.compile(tregex);
      findTreePattern(tree, tgrepPattern, foundPairs);
    } catch (Exception e) {
      // shouldn't happen....
      throw new RuntimeException(e);
    }
  }

  private void findTreePattern(Tree tree, TregexPattern tgrepPattern, Set<Pair<Integer, Integer>> foundPairs) {
    try {
      TregexMatcher m = tgrepPattern.matcher(tree);
      while (m.find()) {
        Tree t = m.getMatch();
        Tree np1 = m.getNode("m1");
        Tree np2 = m.getNode("m2");
        Tree np3 = null;
        if(tgrepPattern.pattern().contains("m3")) np3 = m.getNode("m3");
        addFoundPair(np1, np2, t, foundPairs);
        if(np3!=null) addFoundPair(np2, np3, t, foundPairs);
      }
    } catch (Exception e) {
      // shouldn't happen....
      throw new RuntimeException(e);
    }
  }

  private void addFoundPair(Tree np1, Tree np2, Tree t,
      Set<Pair<Integer, Integer>> foundPairs) {
    Tree head1 = np1.headTerminal(headFinder);
    Tree head2 = np2.headTerminal(headFinder);
    int h1 = ((CoreMap) head1.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
    int h2 = ((CoreMap) head2.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
    Pair<Integer, Integer> p = new Pair<>(h1, h2);
    foundPairs.add(p);
  }

  private static final TregexPattern appositionPattern = TregexPattern.compile("NP=m1 < (NP=m2 $.. (/,/ $.. NP=m3))");
  private static final TregexPattern appositionPattern2 = TregexPattern.compile("NP=m1 < (NP=m2 $.. (/,/ $.. (SBAR < (WHNP < WP|WDT=m3))))");
  private static final TregexPattern appositionPattern3 = TregexPattern.compile("/^NP(?:-TMP|-ADV)?$/=m1 < (NP=m2 $- /^,$/ $-- NP=m3 !$ CC|CONJP)");
  private static final TregexPattern appositionPattern4 = TregexPattern.compile("/^NP(?:-TMP|-ADV)?$/=m1 < (PRN=m2 < (NP < /^NNS?|CD$/ $-- /^-LRB-$/ $+ /^-RRB-$/))");
  private void findAppositions(Tree tree, Set<Pair<Integer, Integer>> appos) {
    findTreePattern(tree, appositionPattern, appos);
    findTreePattern(tree, appositionPattern2, appos);
    findTreePattern(tree, appositionPattern3, appos);
    findTreePattern(tree, appositionPattern4, appos);
  }

  private static final TregexPattern predicateNominativePattern = TregexPattern.compile("S < (NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2)))");
  private static final TregexPattern predicateNominativePattern2 = TregexPattern.compile("S < (NP=m1 $.. (VP < (VP < ((/VB/ < /^(be|been|being)$/) $.. NP=m2))))");
  private void findPredicateNominatives(Tree tree, Set<Pair<Integer, Integer>> preNomi) {
    //    String predicateNominativePattern2 = "NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2))";
    findTreePattern(tree, predicateNominativePattern, preNomi);
    findTreePattern(tree, predicateNominativePattern2, preNomi);
  }

  private static final TregexPattern relativePronounPattern = TregexPattern.compile("NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))");
  private void findRelativePronouns(Tree tree, Set<Pair<Integer, Integer>> relativePronounPairs) {
    findTreePattern(tree, relativePronounPattern, relativePronounPairs);
  }

  private static void markListMemberRelation(List<Mention> orderedMentions) {
    for(Mention m1 : orderedMentions){
      for(Mention m2 : orderedMentions){
        // Mark if m2 and m1 are in list relationship
        if (m1.isListMemberOf(m2)) {
          m2.addListMember(m1);
          m1.addBelongsToList(m2);
        } else if (m2.isListMemberOf(m1)) {
          m1.addListMember(m2);
          m2.addBelongsToList(m1);
        }
      }
    }
  }

  private static void markMentionRelation(List<Mention> orderedMentions, Set<Pair<Integer, Integer>> foundPairs, String flag) {
    for(Mention m1 : orderedMentions){
      for(Mention m2 : orderedMentions){
        // Ignore if m2 and m1 are in list relationship
        if (m1.isListMemberOf(m2) || m2.isListMemberOf(m1) || m1.isMemberOfSameList(m2)) {
          SieveCoreferenceSystem.logger.finest("Not checking '" + m1 + "' and '" + m2 + "' for " + flag + ": in list relationship");
          continue;
        }
        for(Pair<Integer, Integer> foundPair: foundPairs){
          if((foundPair.first == m1.headIndex && foundPair.second == m2.headIndex)){
            switch (flag) {
              case "APPOSITION":
                m2.addApposition(m1);
                break;
              case "PREDICATE_NOMINATIVE":
                m2.addPredicateNominatives(m1);
                break;
              case "RELATIVE_PRONOUN":
                m2.addRelativePronoun(m1);
                break;
              default:
                throw new RuntimeException("check flag in markMentionRelation (dcoref/MentionExtractor.java)");
            }
          }
        }
      }
    }
  }

  /**
   * Finds the tree the matches this span exactly
   * @param tree Leaves must be indexed!
   * @param first First element in the span (first position has offset 1)
   * @param last Last element included in the span (first position has offset 1)
   */
  public static Tree findExactMatch(Tree tree, int first, int last) {
    List<Tree> leaves = tree.getLeaves();
    int thisFirst = ((CoreMap) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class);
    int thisLast = ((CoreMap) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
    if(thisFirst == first && thisLast == last) {
      return tree;
    } else {
      Tree [] kids = tree.children();
      for(Tree k: kids){
        Tree t = findExactMatch(k, first, last);
        if(t != null) return t;
      }
    }
    return null;
  }

  /** Load Stanford Processor: skip unnecessary annotator */
  protected static StanfordCoreNLP loadStanfordProcessor(Properties props) {
    boolean replicateCoNLL = Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false"));

    Properties pipelineProps = new Properties(props);
    StringBuilder annoSb = new StringBuilder("");
    if (!Constants.USE_GOLD_POS && !replicateCoNLL)  {
      annoSb.append("pos, lemma");
    } else {
      annoSb.append("lemma");
    }
    if(Constants.USE_TRUECASE) {
      annoSb.append(", truecase");
    }
    if (!Constants.USE_GOLD_NE && !replicateCoNLL)  {
      annoSb.append(", ner");
    }
    if (!Constants.USE_GOLD_PARSES && !replicateCoNLL)  {
      annoSb.append(", parse");
    }
    String annoStr = annoSb.toString();
    SieveCoreferenceSystem.logger.info("MentionExtractor ignores specified annotators, using annotators=" + annoStr);
    pipelineProps.setProperty("annotators", annoStr);
    return new StanfordCoreNLP(pipelineProps, false);
  }

  public static void initializeUtterance(List<CoreLabel> tokens) {
    for(CoreLabel l : tokens){
      if (l.get(CoreAnnotations.UtteranceAnnotation.class) == null) {
        l.set(CoreAnnotations.UtteranceAnnotation.class, 0);
      }
    }
  }

}