package edu.stanford.nlp.quoteattribution.Sieves;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.QuoteAttributionAnnotator;
import edu.stanford.nlp.quoteattribution.*;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

import java.util.*;

/**
 * Base class shared by the quote-attribution sieves.  Holds the annotated
 * document and the character/alias tables, and provides the helpers the
 * concrete sieves build on: scanning token spans for character names,
 * pronouns, and animate nouns; resolving pronoun coreference; and
 * propagating mentions to the remaining quotes of a single-speaker sentence.
 *
 * Created by mjfang on 7/8/16.
 */
public class Sieve {

  protected Annotation doc;
  // character name/alias -> the Person(s) that alias can refer to
  protected Map<String, List<Person>> characterMap;
  // token index of a pronoun -> name of the coreferent character
  protected Map<Integer, String> pronounCorefMap;
  // nouns treated as animate (potential speakers), e.g. "man", "woman"
  protected Set<String> animacySet;

  // mention types
  public static final String PRONOUN = "pronoun";
  public static final String NAME = "name";
  public static final String ANIMATE_NOUN = "animate noun";

  // root of the token trie built over all aliases; used by scanForNamesNew
  protected TokenNode rootNameNode;

  public Sieve(Annotation doc,
               Map<String, List<Person>> characterMap,
               Map<Integer, String> pronounCorefMap,
               Set<String> animacySet) {
    this.doc = doc;
    this.characterMap = characterMap;
    this.pronounCorefMap = pronounCorefMap;
    this.animacySet = animacySet;
    this.rootNameNode = createNameMatcher();
  }

  /**
   * Resolves an alias to a Person, but only when the mapping is unambiguous.
   * (Note: real disambiguation is currently not attempted.)
   *
   * @param name a character alias; may be null or unknown
   * @return the unique Person for {@code name}, or null if the alias is
   *         unknown or maps to more than one person
   */
  protected Person resolveAmbiguities(String name) {
    if (name == null) {
      return null;
    }
    List<Person> candidates = characterMap.get(name);
    if (candidates != null && candidates.size() == 1) {
      return candidates.get(0);
    }
    return null;
  }

  /**
   * Collects the set of Persons named inside any quote that lies in the same
   * paragraph as {@code quote}: walks the quote list backwards, then forwards,
   * from the given quote until the paragraph index changes.
   */
  protected Set<Person> getNamesInParagraph(CoreMap quote) {
    List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
    List<String> quoteNames = new ArrayList<>();
    int quoteParagraph = QuoteAttributionUtils.getQuoteParagraphIndex(doc, quote);
    int quoteIndex = quote.get(CoreAnnotations.QuotationIndexAnnotation.class);
    // backwards through earlier quotes in the same paragraph (includes this quote)
    for (int i = quoteIndex; i >= 0; i--) {
      CoreMap currQuote = quotes.get(i);
      int currQuoteParagraph = QuoteAttributionUtils.getQuoteParagraphIndex(doc, currQuote);
      if (currQuoteParagraph != quoteParagraph) {
        break;
      }
      quoteNames.addAll(scanForNames(new Pair<>(
          currQuote.get(CoreAnnotations.TokenBeginAnnotation.class),
          currQuote.get(CoreAnnotations.TokenEndAnnotation.class))).first);
    }
    // forwards through later quotes in the same paragraph
    for (int i = quoteIndex + 1; i < quotes.size(); i++) {
      CoreMap currQuote = quotes.get(i);
      int currQuoteParagraph = QuoteAttributionUtils.getQuoteParagraphIndex(doc, currQuote);
      if (currQuoteParagraph != quoteParagraph) {
        break;
      }
      quoteNames.addAll(scanForNames(new Pair<>(
          currQuote.get(CoreAnnotations.TokenBeginAnnotation.class),
          currQuote.get(CoreAnnotations.TokenEndAnnotation.class))).first);
    }
    Set<Person> namesInParagraph = new HashSet<>();
    for (String name : quoteNames) {
      // scanForNames only returns known aliases, but guard against a null entry anyway
      List<Person> persons = characterMap.get(name);
      if (persons != null) {
        namesInParagraph.addAll(persons);
      }
    }
    return namesInParagraph;
  }

  /**
   * Looks up the coreferent of the pronoun at token index {@code corefMapKey}
   * and resolves it to a Person.  The candidate is rejected when that person
   * is already named inside a quote of the same paragraph as {@code quote}.
   *
   * @return the resolved speaker, or null when no confident answer exists
   */
  public Person doCoreference(int corefMapKey, CoreMap quote) {
    if (pronounCorefMap == null) {
      return null;
    }
    Set<Person> quoteNames = new HashSet<>();
    if (quote != null) {
      quoteNames = getNamesInParagraph(quote);
    }
    String referent = pronounCorefMap.get(corefMapKey);
    Person candidate = resolveAmbiguities(referent);
    if (candidate != null && !quoteNames.contains(candidate)) {
      return candidate;
    }
    return null;
  }

  /** A node in the alias-matching trie: one token of a character alias. */
  private class TokenNode {
    public List<Person> personList;  // set only on the terminal node of a complete alias
    public HashMap<String, TokenNode> childNodes;
    public String token;
    public String fullName;          // the complete alias string; set on terminal nodes
    int level;                       // token position within the alias (depth in the trie)

    public TokenNode(String token, int level) {
      this.token = token;
      this.level = level;
      childNodes = new HashMap<>();
    }
  }

  /** Builds a trie over the space-separated tokens of every known alias. */
  protected TokenNode createNameMatcher() {
    TokenNode rootNode = new TokenNode("$ROOT", -1);
    for (String key : characterMap.keySet()) {
      String[] tokens = key.split(" ");
      TokenNode currNode = rootNode;
      for (int i = 0; i < tokens.length; i++) {
        String tok = tokens[i];
        if (currNode.childNodes.containsKey(tok)) {
          currNode = currNode.childNodes.get(tok);
        } else {
          TokenNode newNode = new TokenNode(tok, i);
          currNode.childNodes.put(tok, newNode);
          currNode = newNode;
        }
        if (i == tokens.length - 1) {
          // mark the terminal node of this alias
          currNode.personList = characterMap.get(key);
          currNode.fullName = key;
        }
      }
    }
    return rootNode;
  }

  /**
   * Scans the (inclusive) token range for character names using the alias
   * trie.  Greedy: e.g. "Elizabeth and Jane" returns only the full alias if
   * it exists, not "Elizabeth" and "Jane" separately; after a failed partial
   * match the current token is not re-tried against the trie root.
   *
   * @param textRun inclusive [first, second] token-index range
   * @return the matched alias strings and their inclusive token-index ranges
   */
  public Pair<ArrayList<String>, ArrayList<Pair<Integer, Integer>>> scanForNamesNew(Pair<Integer, Integer> textRun) {
    ArrayList<String> potentialNames = new ArrayList<>();
    ArrayList<Pair<Integer, Integer>> nameIndices = new ArrayList<>();
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    TokenNode pointer = rootNameNode;
    for (int index = textRun.first; index <= textRun.second; index++) {
      String tokenText = tokens.get(index).word();
      if (pointer.childNodes.containsKey(tokenText)) {
        pointer = pointer.childNodes.get(tokenText);
      } else if (!pointer.token.equals("$ROOT")) {
        // the run just ended; record it if it was a complete alias
        if (pointer.fullName != null) {
          potentialNames.add(pointer.fullName);
          nameIndices.add(new Pair<>(index - 1 - pointer.level, index - 1));
        }
        pointer = rootNameNode;
      }
    }
    // catch a match that runs up to the end of the range
    int index = textRun.second + 1;
    if (!pointer.token.equals("$ROOT") && pointer.fullName != null) {
      potentialNames.add(pointer.fullName);
      nameIndices.add(new Pair<>(index - 1 - pointer.level, index - 1));
    }
    return new Pair<>(potentialNames, nameIndices);
  }

  /**
   * Scans the (inclusive) token range for character names by collecting runs
   * of capitalized tokens (plus the particle "de") and checking each run
   * against the known aliases.  Also tries dropping the run's first token, to
   * handle a capitalized sentence-initial non-name word.
   *
   * @param textRun inclusive [first, second] token-index range
   * @return the matched alias strings and their inclusive token-index ranges
   */
  public Pair<ArrayList<String>, ArrayList<Pair<Integer, Integer>>> scanForNames(Pair<Integer, Integer> textRun) {
    ArrayList<String> potentialNames = new ArrayList<>();
    ArrayList<Pair<Integer, Integer>> nameIndices = new ArrayList<>();
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    Set<String> aliases = characterMap.keySet();
    String potentialName = "";
    Pair<Integer, Integer> potentialIndex = null;
    for (int index = textRun.first; index <= textRun.second; index++) {
      String tokenText = tokens.get(index).word();
      // TODO: make this better (String matching)
      if (!tokenText.isEmpty()
          && (Character.isUpperCase(tokenText.charAt(0)) || tokenText.equals("de"))) {
        potentialName += " " + tokenText;
        if (potentialIndex == null) {
          potentialIndex = new Pair<>(index, index);
        } else {
          potentialIndex.second = index;
        }
      } else if (potentialName.length() != 0) {
        String actual = potentialName.substring(1);  // strip the leading space
        if (aliases.contains(actual)) {
          potentialNames.add(actual);
          nameIndices.add(potentialIndex);
        } else {
          // in the event that the first word of the run is a non-name (e.g. sentence start)
          String removeFirstWord = actual.substring(actual.indexOf(" ") + 1);
          if (aliases.contains(removeFirstWord)) {
            potentialNames.add(removeFirstWord);
            nameIndices.add(new Pair<>(potentialIndex.first + 1, potentialIndex.second));
          }
        }
        potentialName = "";
        potentialIndex = null;
      }
    }
    // flush a run that ends exactly at the range boundary
    if (potentialName.length() != 0) {
      String actual = potentialName.substring(1);
      if (aliases.contains(actual)) {
        potentialNames.add(actual);
        nameIndices.add(potentialIndex);
      }
    }
    return new Pair<>(potentialNames, nameIndices);
  }

  /** Returns the token indices of "he"/"she" (case-insensitive) in the inclusive range. */
  protected ArrayList<Integer> scanForPronouns(Pair<Integer, Integer> nonQuoteRun) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    ArrayList<Integer> pronounList = new ArrayList<>();
    for (int i = nonQuoteRun.first; i <= nonQuoteRun.second; i++) {
      String word = tokens.get(i).word();
      if (word.equalsIgnoreCase("he") || word.equalsIgnoreCase("she")) {
        pronounList.add(i);
      }
    }
    return pronounList;
  }

  /** Convenience overload: scans several token runs and concatenates the results. */
  protected ArrayList<Integer> scanForPronouns(ArrayList<Pair<Integer, Integer>> nonQuoteRuns) {
    ArrayList<Integer> pronounList = new ArrayList<>();
    for (Pair<Integer, Integer> run : nonQuoteRuns) {
      pronounList.addAll(scanForPronouns(run));
    }
    return pronounList;
  }

  /** Returns the original document text covered by the inclusive token range (for mention text). */
  public String tokenRangeToString(Pair<Integer, Integer> tokenRange) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    return doc.get(CoreAnnotations.TextAnnotation.class)
        .substring(tokens.get(tokenRange.first).beginPosition(),
                   tokens.get(tokenRange.second).endPosition());
  }

  /** Returns the word of the single token at {@code token_idx}. */
  public String tokenRangeToString(int token_idx) {
    return doc.get(CoreAnnotations.TokensAnnotation.class).get(token_idx).word();
  }

  /**
   * Finds the earliest mention (pronoun, name, or animate noun) in the
   * inclusive span, preferring whichever occurs first.
   *
   * Quirk preserved from the original logic: when neither a pronoun nor a
   * name is present in the span, null is returned even if an animate noun
   * was found.
   *
   * @return the closest mention going forward, or null if none
   */
  public MentionData findClosestMentionInSpanForward(Pair<Integer, Integer> span) {
    List<Integer> pronounIndices = scanForPronouns(span);
    List<Pair<Integer, Integer>> nameIndices = scanForNamesNew(span).second;
    List<Integer> animacyIndices = scanForAnimates(span);

    int closestPronounIndex = Integer.MAX_VALUE;
    int closestAnimate = Integer.MAX_VALUE;
    Pair<Integer, Integer> closestNameIndex = new Pair<>(Integer.MAX_VALUE, 0);
    if (pronounIndices.size() > 0) {
      closestPronounIndex = pronounIndices.get(0);
    }
    if (nameIndices.size() > 0) {
      closestNameIndex = nameIndices.get(0);
    }
    if (animacyIndices.size() > 0) {
      closestAnimate = animacyIndices.get(0);
    }

    MentionData md = null;
    if (closestPronounIndex < closestNameIndex.first) {
      md = (closestAnimate < closestPronounIndex)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestPronounIndex, closestPronounIndex, tokenRangeToString(closestPronounIndex), PRONOUN);
    } else if (closestPronounIndex > closestNameIndex.first) {
      md = (closestAnimate < closestNameIndex.first)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestNameIndex.first, closestNameIndex.second, tokenRangeToString(closestNameIndex), NAME);
    }
    return md;
  }

  /**
   * Collects every mention in the span, scanning forward.  The caller's
   * {@code span} is left untouched (the original implementation aliased it
   * and advanced {@code span.first} in place).
   */
  public List<MentionData> findClosestMentionsInSpanForward(Pair<Integer, Integer> span) {
    List<MentionData> mentions = new ArrayList<>();
    // work on a copy so the caller's Pair is not mutated
    Pair<Integer, Integer> currSpan = new Pair<>(span.first, span.second);
    while (true) {
      MentionData mention = findClosestMentionInSpanForward(currSpan);
      if (mention == null) {
        return mentions;
      }
      mentions.add(mention);
      currSpan.first = mention.end + 1;
    }
  }

  /**
   * Collects every mention in the span, scanning backward.  The caller's
   * {@code span} is left untouched (the original implementation aliased it
   * and shrank {@code span.second} in place).
   */
  public List<MentionData> findClosestMentionsInSpanBackward(Pair<Integer, Integer> span) {
    List<MentionData> mentions = new ArrayList<>();
    // work on a copy so the caller's Pair is not mutated
    Pair<Integer, Integer> currSpan = new Pair<>(span.first, span.second);
    while (true) {
      MentionData mentionData = findClosestMentionInSpanBackward(currSpan);
      if (mentionData == null) {
        return mentions;
      }
      mentions.add(mentionData);
      currSpan.second = mentionData.begin - 1;
    }
  }

  /** Returns the token indices in the inclusive span whose word is in the animacy set. */
  public List<Integer> scanForAnimates(Pair<Integer, Integer> span) {
    List<Integer> animateIndices = new ArrayList<>();
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    for (int i = span.first; i <= span.second; i++) {
      if (animacySet.contains(tokens.get(i).word())) {
        animateIndices.add(i);
      }
    }
    return animateIndices;
  }

  /** A detected mention: inclusive token span, surface text, and mention type. */
  public class MentionData {
    public int begin;
    public int end;
    public String text;
    public String type;

    public MentionData(int begin, int end, String text, String type) {
      this.begin = begin;
      this.end = end;
      this.text = text;
      this.type = type;
    }
  }

  /**
   * Finds the latest mention (pronoun, name, or animate noun) in the
   * inclusive span; mirror image of {@link #findClosestMentionInSpanForward}.
   *
   * Quirk preserved from the original logic: when neither a pronoun nor a
   * name is present, null is returned even if an animate noun was found.
   *
   * @return the closest mention going backward, or null if none
   */
  public MentionData findClosestMentionInSpanBackward(Pair<Integer, Integer> span) {
    List<Integer> pronounIndices = scanForPronouns(span);
    List<Pair<Integer, Integer>> nameIndices = scanForNamesNew(span).second;
    List<Integer> animateIndices = scanForAnimates(span);

    int closestPronounIndex = Integer.MIN_VALUE;
    int closestAnimate = Integer.MIN_VALUE;
    Pair<Integer, Integer> closestNameIndex = new Pair<>(0, Integer.MIN_VALUE);
    if (pronounIndices.size() > 0) {
      closestPronounIndex = pronounIndices.get(pronounIndices.size() - 1);
    }
    if (nameIndices.size() > 0) {
      closestNameIndex = nameIndices.get(nameIndices.size() - 1);
    }
    if (animateIndices.size() > 0) {
      closestAnimate = animateIndices.get(animateIndices.size() - 1);
    }

    MentionData md = null;
    if (closestPronounIndex > closestNameIndex.second) {
      md = (closestAnimate > closestPronounIndex)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestPronounIndex, closestPronounIndex, tokenRangeToString(closestPronounIndex), PRONOUN);
    } else if (closestPronounIndex < closestNameIndex.second) {
      md = (closestAnimate > closestNameIndex.second)
          ? new MentionData(closestAnimate, closestAnimate, tokenRangeToString(closestAnimate), ANIMATE_NOUN)
          : new MentionData(closestNameIndex.first, closestNameIndex.second, tokenRangeToString(closestNameIndex), NAME);
    }
    return md;
  }

  /** Internal mirror of MentionData used by {@link #oneSpeakerSentence}. */
  private class Mention {
    public int begin, end;
    public String text, type;

    public Mention(int begin, int end, String text, String type) {
      this.begin = begin;
      this.end = end;
      this.text = text;
      this.type = type;
    }
  }

  /**
   * Deterministic propagation: for each sentence containing quotes, if every
   * quote in the sentence that already carries a mention agrees on the same
   * mention text, copy that mention onto the sentence's still-unattributed
   * quotes.
   */
  public void oneSpeakerSentence(Annotation doc) {
    List<CoreLabel> toks = doc.get(CoreAnnotations.TokensAnnotation.class);
    List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
    // group quotes by the sentence(s) in which they begin and end
    Map<Integer, List<CoreMap>> quotesBySentence = new HashMap<>();
    for (CoreMap quote : quotes) {
      int quoteBeginTok = quote.get(CoreAnnotations.TokenBeginAnnotation.class);
      int sentenceBeginId = toks.get(quoteBeginTok).sentIndex();
      // NOTE(review): TokenEndAnnotation is conventionally an exclusive end, so this reads
      // the token just past the quote (and could overrun at document end) — confirm;
      // kept as in the original.
      int quoteEndTok = quote.get(CoreAnnotations.TokenEndAnnotation.class);
      int sentenceEndId = toks.get(quoteEndTok).sentIndex();
      quotesBySentence.computeIfAbsent(sentenceBeginId, sid -> new ArrayList<>()).add(quote);
      if (sentenceEndId != sentenceBeginId) {  // avoid adding the same quote twice
        quotesBySentence.computeIfAbsent(sentenceEndId, sid -> new ArrayList<>()).add(quote);
      }
    }

    for (int k : quotesBySentence.keySet()) {
      List<CoreMap> quotesInSent = quotesBySentence.get(k);
      // collect the mentions already attached to quotes in this sentence
      List<Mention> existingMentions = new ArrayList<>();
      for (CoreMap quote : quotesInSent) {
        if (quote.get(QuoteAttributionAnnotator.MentionAnnotation.class) != null) {
          existingMentions.add(new Mention(
              quote.get(QuoteAttributionAnnotator.MentionBeginAnnotation.class),
              quote.get(QuoteAttributionAnnotator.MentionEndAnnotation.class),
              quote.get(QuoteAttributionAnnotator.MentionAnnotation.class),
              quote.get(QuoteAttributionAnnotator.MentionTypeAnnotation.class)));
        }
      }
      // skip sentences where the attached mentions disagree
      boolean same = true;
      String text = null;
      for (Mention m : existingMentions) {
        if (text == null) {
          text = m.text;
        }
        if (!m.text.equalsIgnoreCase(text)) {
          same = false;
        }
      }
      if (same && text != null && existingMentions.size() > 0) {
        Mention firstM = existingMentions.get(0);
        for (CoreMap quote : quotesInSent) {
          if (quote.get(QuoteAttributionAnnotator.MentionAnnotation.class) == null) {
            quote.set(QuoteAttributionAnnotator.MentionAnnotation.class, firstM.text);
            quote.set(QuoteAttributionAnnotator.MentionBeginAnnotation.class, firstM.begin);
            quote.set(QuoteAttributionAnnotator.MentionEndAnnotation.class, firstM.end);
            quote.set(QuoteAttributionAnnotator.MentionSieveAnnotation.class, "Deterministic one speaker sentence");
            quote.set(QuoteAttributionAnnotator.MentionTypeAnnotation.class, firstM.type);
          }
        }
      }
    }
  }

  /**
   * Converts the inclusive token range to a character range and tests whether
   * {@code charIndex} falls inside it (inclusive on both ends).
   */
  public boolean rangeContainsCharIndex(Pair<Integer, Integer> tokenRange, int charIndex) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    CoreLabel startToken = tokens.get(tokenRange.first());
    CoreLabel endToken = tokens.get(tokenRange.second());
    int startTokenCharBegin = startToken.beginPosition();
    int endTokenCharEnd = endToken.endPosition();
    return (startTokenCharBegin <= charIndex && charIndex <= endTokenCharEnd);
  }

  /**
   * Converts a token's sentence-local (1-based) IndexAnnotation to its
   * document-level token offset.
   */
  public int tokenToLocation(CoreLabel token) {
    CoreMap sentence = doc.get(CoreAnnotations.SentencesAnnotation.class)
        .get(token.get(CoreAnnotations.SentenceIndexAnnotation.class));
    return sentence.get(CoreAnnotations.TokenBeginAnnotation.class)
        + token.get(CoreAnnotations.IndexAnnotation.class) - 1;
  }

  /** Returns the paragraph index of the sentence in which {@code quote} begins. */
  protected int getQuoteParagraph(CoreMap quote) {
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    return sentences.get(quote.get(CoreAnnotations.SentenceBeginAnnotation.class))
        .get(CoreAnnotations.ParagraphIndexAnnotation.class);
  }
}