package edu.stanford.nlp.quoteattribution.Sieves;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.QuoteAttributionAnnotator;
import edu.stanford.nlp.quoteattribution.*;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

import java.util.*;

/**
 * Base class shared by the quote-attribution sieves.  Holds the annotated
 * document and the character/alias tables, and provides the helpers the
 * concrete sieves build on: scanning token spans for character names,
 * pronouns, and animate nouns; resolving pronoun coreference; and
 * propagating mentions to the remaining quotes of a single-speaker sentence.
 *
 * Created by mjfang on 7/8/16.
 */
public class Sieve {

  protected Annotation doc;
  // character name/alias -> the Person(s) that alias can refer to
  protected Map<String, List<Person>> characterMap;
  // token index of a pronoun -> name of the coreferent character
  protected Map<Integer, String> pronounCorefMap;
  // nouns treated as animate (potential speakers), e.g. "man", "woman"
  protected Set<String> animacySet;

  // mention types
  public static final String PRONOUN = "pronoun";
  public static final String NAME = "name";
  public static final String ANIMATE_NOUN = "animate noun";

  // root of the token trie built over all aliases; used by scanForNamesNew
  protected TokenNode rootNameNode;

  public Sieve(Annotation doc,
               Map<String, List<Person>> characterMap,
               Map<Integer, String> pronounCorefMap,
               Set<String> animacySet) {
    this.doc = doc;
    this.characterMap = characterMap;
    this.pronounCorefMap = pronounCorefMap;
    this.animacySet = animacySet;
    this.rootNameNode = createNameMatcher();
  }

  /**
   * Resolves an alias to a Person, but only when the mapping is unambiguous.
   * (Note: real disambiguation is currently not attempted.)
   *
   * @param name a character alias; may be null or unknown
   * @return the unique Person for {@code name}, or null if the alias is
   *         unknown or maps to more than one person
   */
  protected Person resolveAmbiguities(String name) {
    if (name == null) {
      return null;
    }
    List<Person> candidates = characterMap.get(name);
    if (candidates != null && candidates.size() == 1) {
      return candidates.get(0);
    }
    return null;
  }

  /**
   * Collects the set of Persons named inside any quote that lies in the same
   * paragraph as {@code quote}: walks the quote list backwards, then forwards,
   * from the given quote until the paragraph index changes.
   */
  protected Set<Person> getNamesInParagraph(CoreMap quote) {
    List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
    List<String> quoteNames = new ArrayList<>();
    int quoteParagraph = QuoteAttributionUtils.getQuoteParagraphIndex(doc, quote);
    int quoteIndex = quote.get(CoreAnnotations.QuotationIndexAnnotation.class);
    // backwards through earlier quotes in the same paragraph (includes this quote)
    for (int i = quoteIndex; i >= 0; i--) {
      CoreMap currQuote = quotes.get(i);
      int currQuoteParagraph = QuoteAttributionUtils.getQuoteParagraphIndex(doc, currQuote);
      if (currQuoteParagraph != quoteParagraph) {
        break;
      }
      quoteNames.addAll(scanForNames(new Pair<>(
          currQuote.get(CoreAnnotations.TokenBeginAnnotation.class),
          currQuote.get(CoreAnnotations.TokenEndAnnotation.class))).first);
    }
    // forwards through later quotes in the same paragraph
    for (int i = quoteIndex + 1; i < quotes.size(); i++) {
      CoreMap currQuote = quotes.get(i);
      int currQuoteParagraph = QuoteAttributionUtils.getQuoteParagraphIndex(doc, currQuote);
      if (currQuoteParagraph != quoteParagraph) {
        break;
      }
      quoteNames.addAll(scanForNames(new Pair<>(
          currQuote.get(CoreAnnotations.TokenBeginAnnotation.class),
          currQuote.get(CoreAnnotations.TokenEndAnnotation.class))).first);
    }
    Set<Person> namesInParagraph = new HashSet<>();
    for (String name : quoteNames) {
      // scanForNames only returns known aliases, but guard against a null entry anyway
      List<Person> persons = characterMap.get(name);
      if (persons != null) {
        namesInParagraph.addAll(persons);
      }
    }
    return namesInParagraph;
  }

  /**
   * Looks up the coreferent of the pronoun at token index {@code corefMapKey}
   * and resolves it to a Person.  The candidate is rejected when that person
   * is already named inside a quote of the same paragraph as {@code quote}.
   *
   * @return the resolved speaker, or null when no confident answer exists
   */
  public Person doCoreference(int corefMapKey, CoreMap quote) {
    if (pronounCorefMap == null) {
      return null;
    }
    Set<Person> quoteNames = new HashSet<>();
    if (quote != null) {
      quoteNames = getNamesInParagraph(quote);
    }
    String referent = pronounCorefMap.get(corefMapKey);
    Person candidate = resolveAmbiguities(referent);
    if (candidate != null && !quoteNames.contains(candidate)) {
      return candidate;
    }
    return null;
  }

  /** A node in the alias-matching trie: one token of a character alias. */
  private class TokenNode {
    public List<Person> personList;  // set only on the terminal node of a complete alias
    public HashMap<String, TokenNode> childNodes;
    public String token;
    public String fullName;          // the complete alias string; set on terminal nodes
    int level;                       // token position within the alias (depth in the trie)

    public TokenNode(String token, int level) {
      this.token = token;
      this.level = level;
      childNodes = new HashMap<>();
    }
  }

  /** Builds a trie over the space-separated tokens of every known alias. */
  protected TokenNode createNameMatcher() {
    TokenNode rootNode = new TokenNode("$ROOT", -1);
    for (String key : characterMap.keySet()) {
      String[] tokens = key.split(" ");
      TokenNode currNode = rootNode;
      for (int i = 0; i < tokens.length; i++) {
        String tok = tokens[i];
        if (currNode.childNodes.containsKey(tok)) {
          currNode = currNode.childNodes.get(tok);
        } else {
          TokenNode newNode = new TokenNode(tok, i);
          currNode.childNodes.put(tok, newNode);
          currNode = newNode;
        }
        if (i == tokens.length - 1) {
          // mark the terminal node of this alias
          currNode.personList = characterMap.get(key);
          currNode.fullName = key;
        }
      }
    }
    return rootNode;
  }

  /**
   * Scans the (inclusive) token range for character names using the alias
   * trie.  Greedy: e.g. "Elizabeth and Jane" returns only the full alias if
   * it exists, not "Elizabeth" and "Jane" separately; after a failed partial
   * match the current token is not re-tried against the trie root.
   *
   * @param textRun inclusive [first, second] token-index range
   * @return the matched alias strings and their inclusive token-index ranges
   */
  public Pair<ArrayList<String>, ArrayList<Pair<Integer, Integer>>> scanForNamesNew(Pair<Integer, Integer> textRun) {
    ArrayList<String> potentialNames = new ArrayList<>();
    ArrayList<Pair<Integer, Integer>> nameIndices = new ArrayList<>();
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    TokenNode pointer = rootNameNode;
    for (int index = textRun.first; index <= textRun.second; index++) {
      String tokenText = tokens.get(index).word();
      if (pointer.childNodes.containsKey(tokenText)) {
        pointer = pointer.childNodes.get(tokenText);
      } else if (!pointer.token.equals("$ROOT")) {
        // the run just ended; record it if it was a complete alias
        if (pointer.fullName != null) {
          potentialNames.add(pointer.fullName);
          nameIndices.add(new Pair<>(index - 1 - pointer.level, index - 1));
        }
        pointer = rootNameNode;
      }
    }
    // catch a match that runs up to the end of the range
    int index = textRun.second + 1;
    if (!pointer.token.equals("$ROOT") && pointer.fullName != null) {
      potentialNames.add(pointer.fullName);
      nameIndices.add(new Pair<>(index - 1 - pointer.level, index - 1));
    }
    return new Pair<>(potentialNames, nameIndices);
  }

  /**
   * Scans the (inclusive) token range for character names by collecting runs
   * of capitalized tokens (plus the particle "de") and checking each run
   * against the known aliases.  Also tries dropping the run's first token, to
   * handle a capitalized sentence-initial non-name word.
   *
   * @param textRun inclusive [first, second] token-index range
   * @return the matched alias strings and their inclusive token-index ranges
   */
  public Pair<ArrayList<String>, ArrayList<Pair<Integer, Integer>>> scanForNames(Pair<Integer, Integer> textRun) {
    ArrayList<String> potentialNames = new ArrayList<>();
    ArrayList<Pair<Integer, Integer>> nameIndices = new ArrayList<>();
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    Set<String> aliases = characterMap.keySet();
    String potentialName = "";
    Pair<Integer, Integer> potentialIndex = null;
    for (int index = textRun.first; index <= textRun.second; index++) {
      String tokenText = tokens.get(index).word();
      // TODO: make this better (String matching)
      if (!tokenText.isEmpty()
          && (Character.isUpperCase(tokenText.charAt(0)) || tokenText.equals("de"))) {
        potentialName += " " + tokenText;
        if (potentialIndex == null) {
          potentialIndex = new Pair<>(index, index);
        } else {
          potentialIndex.second = index;
        }
      } else if (potentialName.length() != 0) {
        String actual = potentialName.substring(1);  // strip the leading space
        if (aliases.contains(actual)) {
          potentialNames.add(actual);
          nameIndices.add(potentialIndex);
        } else {
          // in the event that the first word of the run is a non-name (e.g. sentence start)
          String removeFirstWord = actual.substring(actual.indexOf(" ") + 1);
          if (aliases.contains(removeFirstWord)) {
            potentialNames.add(removeFirstWord);
            nameIndices.add(new Pair<>(potentialIndex.first + 1, potentialIndex.second));
          }
        }
        potentialName = "";
        potentialIndex = null;
      }
    }
    // flush a run that ends exactly at the range boundary
    if (potentialName.length() != 0) {
      String actual = potentialName.substring(1);
      if (aliases.contains(actual)) {
        potentialNames.add(actual);
        nameIndices.add(potentialIndex);
      }
    }
    return new Pair<>(potentialNames, nameIndices);
  }

  /** Returns the token indices of "he"/"she" (case-insensitive) in the inclusive range. */
  protected ArrayList<Integer> scanForPronouns(Pair<Integer, Integer> nonQuoteRun) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    ArrayList<Integer> pronounList = new ArrayList<>();
    for (int i = nonQuoteRun.first; i <= nonQuoteRun.second; i++) {
      String word = tokens.get(i).word();
      if (word.equalsIgnoreCase("he") || word.equalsIgnoreCase("she")) {
        pronounList.add(i);
      }
    }
    return pronounList;
  }

  /** Convenience overload: scans several token runs and concatenates the results. */
  protected ArrayList<Integer> scanForPronouns(ArrayList<Pair<Integer, Integer>> nonQuoteRuns) {
    ArrayList<Integer> pronounList = new ArrayList<>();
    for (Pair<Integer, Integer> run : nonQuoteRuns) {
      pronounList.addAll(scanForPronouns(run));
    }
    return pronounList;
  }

  /** Returns the original document text covered by the inclusive token range (for mention text). */
  public String tokenRangeToString(Pair<Integer, Integer> tokenRange) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    return doc.get(CoreAnnotations.TextAnnotation.class)
        .substring(tokens.get(tokenRange.first).beginPosition(),
                   tokens.get(tokenRange.second).endPosition());
  }

  /** Returns the word of the single token at {@code token_idx}. */
  public String tokenRangeToString(int token_idx) {
    return doc.get(CoreAnnotations.TokensAnnotation.class).get(token_idx).word();
  }

  /**
   * Finds the earliest mention (pronoun, name, or animate noun) in the
   * inclusive span, preferring whichever occurs first.
   *
   * Quirk preserved from the original logic: when neither a pronoun nor a
   * name is present in the span, null is returned even if an animate noun
   * was found.
   *
   * @return the closest mention going forward, or null if none
   */
  public MentionData findClosestMentionInSpanForward(Pair<Integer, Integer> span) {
    List<Integer> pronounIndices = scanForPronouns(span);
    List<Pair<Integer, Integer>> nameIndices = scanForNamesNew(span).second;
    List<Integer> animacyIndices = scanForAnimates(span);

    int closestPronounIndex = Integer.MAX_VALUE;
    int closestAnimate = Integer.MAX_VALUE;
    Pair<Integer, Integer> closestNameIndex = new Pair<>(Integer.MAX_VALUE, 0);
    if (pronounIndices.size() > 0) {
      closestPronounIndex = pronounIndices.get(0);
    }
    if (nameIndices.size() > 0) {
      closestNameIndex = nameIndices.get(0);
    }
    if (animacyIndices.size() > 0) {
      closestAnimate = animacyIndices.get(0);
    }

    MentionData md = null;
    if (closestPronounIndex < closestNameIndex.first) {
      md = (closestAnimate < closestPronounIndex)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestPronounIndex, closestPronounIndex, tokenRangeToString(closestPronounIndex), PRONOUN);
    } else if (closestPronounIndex > closestNameIndex.first) {
      md = (closestAnimate < closestNameIndex.first)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestNameIndex.first, closestNameIndex.second, tokenRangeToString(closestNameIndex), NAME);
    }
    return md;
  }

  /**
   * Collects every mention in the span, scanning forward.  The caller's
   * {@code span} is left untouched (the original implementation aliased it
   * and advanced {@code span.first} in place).
   */
  public List<MentionData> findClosestMentionsInSpanForward(Pair<Integer, Integer> span) {
    List<MentionData> mentions = new ArrayList<>();
    // work on a copy so the caller's Pair is not mutated
    Pair<Integer, Integer> currSpan = new Pair<>(span.first, span.second);
    while (true) {
      MentionData mention = findClosestMentionInSpanForward(currSpan);
      if (mention == null) {
        return mentions;
      }
      mentions.add(mention);
      currSpan.first = mention.end + 1;
    }
  }

  /**
   * Collects every mention in the span, scanning backward.  The caller's
   * {@code span} is left untouched (the original implementation aliased it
   * and shrank {@code span.second} in place).
   */
  public List<MentionData> findClosestMentionsInSpanBackward(Pair<Integer, Integer> span) {
    List<MentionData> mentions = new ArrayList<>();
    // work on a copy so the caller's Pair is not mutated
    Pair<Integer, Integer> currSpan = new Pair<>(span.first, span.second);
    while (true) {
      MentionData mentionData = findClosestMentionInSpanBackward(currSpan);
      if (mentionData == null) {
        return mentions;
      }
      mentions.add(mentionData);
      currSpan.second = mentionData.begin - 1;
    }
  }

  /** Returns the token indices in the inclusive span whose word is in the animacy set. */
  public List<Integer> scanForAnimates(Pair<Integer, Integer> span) {
    List<Integer> animateIndices = new ArrayList<>();
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    for (int i = span.first; i <= span.second; i++) {
      if (animacySet.contains(tokens.get(i).word())) {
        animateIndices.add(i);
      }
    }
    return animateIndices;
  }

  /** A detected mention: inclusive token span, surface text, and mention type. */
  public class MentionData {
    public int begin;
    public int end;
    public String text;
    public String type;

    public MentionData(int begin, int end, String text, String type) {
      this.begin = begin;
      this.end = end;
      this.text = text;
      this.type = type;
    }
  }

  /**
   * Finds the latest mention (pronoun, name, or animate noun) in the
   * inclusive span; mirror image of {@link #findClosestMentionInSpanForward}.
   *
   * Quirk preserved from the original logic: when neither a pronoun nor a
   * name is present, null is returned even if an animate noun was found.
   *
   * @return the closest mention going backward, or null if none
   */
  public MentionData findClosestMentionInSpanBackward(Pair<Integer, Integer> span) {
    List<Integer> pronounIndices = scanForPronouns(span);
    List<Pair<Integer, Integer>> nameIndices = scanForNamesNew(span).second;
    List<Integer> animateIndices = scanForAnimates(span);

    int closestPronounIndex = Integer.MIN_VALUE;
    int closestAnimate = Integer.MIN_VALUE;
    Pair<Integer, Integer> closestNameIndex = new Pair<>(0, Integer.MIN_VALUE);
    if (pronounIndices.size() > 0) {
      closestPronounIndex = pronounIndices.get(pronounIndices.size() - 1);
    }
    if (nameIndices.size() > 0) {
      closestNameIndex = nameIndices.get(nameIndices.size() - 1);
    }
    if (animateIndices.size() > 0) {
      closestAnimate = animateIndices.get(animateIndices.size() - 1);
    }

    MentionData md = null;
    if (closestPronounIndex > closestNameIndex.second) {
      md = (closestAnimate > closestPronounIndex)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestPronounIndex, closestPronounIndex, tokenRangeToString(closestPronounIndex), PRONOUN);
    } else if (closestPronounIndex < closestNameIndex.second) {
      md = (closestAnimate > closestNameIndex.second)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestNameIndex.first, closestNameIndex.second, tokenRangeToString(closestNameIndex), NAME);
    }
    return md;
  }

  /** Internal mirror of MentionData used by {@link #oneSpeakerSentence}. */
  private class Mention {
    public int begin, end;
    public String text, type;

    public Mention(int begin, int end, String text, String type) {
      this.begin = begin;
      this.end = end;
      this.text = text;
      this.type = type;
    }
  }

  /**
   * Deterministic propagation: for each sentence containing quotes, if every
   * quote in the sentence that already carries a mention agrees on the same
   * mention text, copy that mention onto the sentence's still-unattributed
   * quotes.
   */
  public void oneSpeakerSentence(Annotation doc) {
    List<CoreLabel> toks = doc.get(CoreAnnotations.TokensAnnotation.class);
    List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
    // group quotes by the sentence(s) in which they begin and end
    Map<Integer, List<CoreMap>> quotesBySentence = new HashMap<>();
    for (CoreMap quote : quotes) {
      int quoteBeginTok = quote.get(CoreAnnotations.TokenBeginAnnotation.class);
      int sentenceBeginId = toks.get(quoteBeginTok).sentIndex();
      // NOTE(review): TokenEndAnnotation is conventionally an exclusive end, so this reads
      // the token just past the quote (and could overrun at document end) — confirm;
      // kept as in the original.
      int quoteEndTok = quote.get(CoreAnnotations.TokenEndAnnotation.class);
      int sentenceEndId = toks.get(quoteEndTok).sentIndex();
      quotesBySentence.computeIfAbsent(sentenceBeginId, sid -> new ArrayList<>()).add(quote);
      if (sentenceEndId != sentenceBeginId) {  // avoid adding the same quote twice
        quotesBySentence.computeIfAbsent(sentenceEndId, sid -> new ArrayList<>()).add(quote);
      }
    }

    for (int k : quotesBySentence.keySet()) {
      List<CoreMap> quotesInSent = quotesBySentence.get(k);
      // collect the mentions already attached to quotes in this sentence
      List<Mention> existingMentions = new ArrayList<>();
      for (CoreMap quote : quotesInSent) {
        if (quote.get(QuoteAttributionAnnotator.MentionAnnotation.class) != null) {
          existingMentions.add(new Mention(
              quote.get(QuoteAttributionAnnotator.MentionBeginAnnotation.class),
              quote.get(QuoteAttributionAnnotator.MentionEndAnnotation.class),
              quote.get(QuoteAttributionAnnotator.MentionAnnotation.class),
              quote.get(QuoteAttributionAnnotator.MentionTypeAnnotation.class)));
        }
      }
      // skip sentences where the attached mentions disagree
      boolean same = true;
      String text = null;
      for (Mention m : existingMentions) {
        if (text == null) {
          text = m.text;
        }
        if (!m.text.equalsIgnoreCase(text)) {
          same = false;
        }
      }
      if (same && text != null && existingMentions.size() > 0) {
        Mention firstM = existingMentions.get(0);
        for (CoreMap quote : quotesInSent) {
          if (quote.get(QuoteAttributionAnnotator.MentionAnnotation.class) == null) {
            quote.set(QuoteAttributionAnnotator.MentionAnnotation.class, firstM.text);
            quote.set(QuoteAttributionAnnotator.MentionBeginAnnotation.class, firstM.begin);
            quote.set(QuoteAttributionAnnotator.MentionEndAnnotation.class, firstM.end);
            quote.set(QuoteAttributionAnnotator.MentionSieveAnnotation.class, "Deterministic one speaker sentence");
            quote.set(QuoteAttributionAnnotator.MentionTypeAnnotation.class, firstM.type);
          }
        }
      }
    }
  }

  /**
   * Converts the inclusive token range to a character range and tests whether
   * {@code charIndex} falls inside it (inclusive on both ends).
   */
  public boolean rangeContainsCharIndex(Pair<Integer, Integer> tokenRange, int charIndex) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    CoreLabel startToken = tokens.get(tokenRange.first());
    CoreLabel endToken = tokens.get(tokenRange.second());
    int startTokenCharBegin = startToken.beginPosition();
    int endTokenCharEnd = endToken.endPosition();
    return (startTokenCharBegin <= charIndex && charIndex <= endTokenCharEnd);
  }

  /**
   * Converts a token's sentence-local (1-based) IndexAnnotation to its
   * document-level token offset.
   */
  public int tokenToLocation(CoreLabel token) {
    CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class)
        .get(token.get(CoreAnnotations.SentenceIndexAnnotation.class));
    return sentence.get(CoreAnnotations.TokenBeginAnnotation.class)
        + token.get(CoreAnnotations.IndexAnnotation.class) - 1;
  }

  /** Returns the paragraph index of the sentence in which {@code quote} begins. */
  protected int getQuoteParagraph(CoreMap quote) {
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    return sentences.get(quote.get(CoreAnnotations.SentenceBeginAnnotation.class))
        .get(CoreAnnotations.ParagraphIndexAnnotation.class);
  }
}