package edu.stanford.nlp.quoteattribution;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.nndep.DependencyParser;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

import java.io.File;
import java.util.*;

/**
 * Utility methods for quote attribution.
 *
 * @author Grace Muzny, Michael Fang
 */
public class QuoteAttributionUtils {

  // TODO: change this to take the nearest (non-quote) sentence (even if the quote is not part of it)
  public static Pair<Integer, Integer> getRemainderInSentence(Annotation doc, CoreMap quote) {
    Pair<Integer, Integer> range = getTokenRangePrecedingQuote(doc, quote);
    if (range == null) {
      range = getTokenRangeFollowingQuote(doc, quote);
    }
    return range;
  }

  public static int getQuoteParagraphIndex(Annotation doc, CoreMap quote) {
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    return sentences.get(quote.get(CoreAnnotations.SentenceBeginAnnotation.class))
        .get(CoreAnnotations.ParagraphIndexAnnotation.class);
  }

  // Adapted from WordsToSentencesAnnotator: build a single sentence CoreMap
  // spanning both prevSentence and sentence.
  private static CoreMap constructSentence(List<CoreLabel> sentenceTokens, CoreMap prevSentence, CoreMap sentence) {
    // get the sentence text from the first and last character offsets
    int begin = sentenceTokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int last = sentenceTokens.size() - 1;
    int end = sentenceTokens.get(last).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    String sentenceText = prevSentence.get(CoreAnnotations.TextAnnotation.class)
        + sentence.get(CoreAnnotations.TextAnnotation.class);

    // create a sentence annotation with text and token offsets
    Annotation newSentence = new Annotation(sentenceText);
    newSentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    newSentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    newSentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
    newSentence.set(CoreAnnotations.TokenBeginAnnotation.class,
        prevSentence.get(CoreAnnotations.TokenBeginAnnotation.class));
    newSentence.set(CoreAnnotations.TokenEndAnnotation.class,
        sentence.get(CoreAnnotations.TokenEndAnnotation.class));
    newSentence.set(CoreAnnotations.ParagraphIndexAnnotation.class,
        sentence.get(CoreAnnotations.ParagraphIndexAnnotation.class));
    newSentence.set(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class,
        getParse(newSentence));
    return newSentence;
  }

  public static class EnhancedSentenceAnnotation implements CoreAnnotation<CoreMap> {
    @Override
    public Class<CoreMap> getType() {
      return CoreMap.class;
    }
  }
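  // Illustrative helper, not part of the original class: after
  // addEnhancedSentences() below has run, callers can prefer the merged
  // two-sentence span when one was attached, falling back to the sentence itself.
  private static CoreMap enhancedOrOriginal(CoreMap sentence) {
    CoreMap enhanced = sentence.get(EnhancedSentenceAnnotation.class);
    return enhanced != null ? enhanced : sentence;
  }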
  public static void addEnhancedSentences(Annotation doc) {
    // For each pair of adjacent sentences: concatenate their tokens and see whether
    // the sentence splitter would have made a single sentence out of them. If so,
    // attach the merged span as an extra "enhanced" sentence, for the sieves that
    // use augmented sentences.
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    // create a sentence splitter that never splits on newline
    WordToSentenceProcessor<CoreLabel> wsp =
        new WordToSentenceProcessor<>(WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER);
    for (int i = 1; i < sentences.size(); i++) {
      CoreMap sentence = sentences.get(i);
      CoreMap prevSentence = sentences.get(i - 1);

      List<CoreLabel> tokensConcat = new ArrayList<>();
      tokensConcat.addAll(prevSentence.get(CoreAnnotations.TokensAnnotation.class));
      tokensConcat.addAll(sentence.get(CoreAnnotations.TokensAnnotation.class));
      List<List<CoreLabel>> sentenceTokens = wsp.process(tokensConcat);
      if (sentenceTokens.size() == 1) {
        // wsp would have put them into a single sentence --> add enhanced sentence.
        sentence.set(EnhancedSentenceAnnotation.class,
            constructSentence(sentenceTokens.get(0), prevSentence, sentence));
      }
    }
  }

  /**
   * Gets the range of tokens that precede the quote and are in the same sentence
   * as the beginning of the quote, if such tokens exist; otherwise the previous
   * sentence, if it is in the same paragraph. Also ensures that the range is at
   * least two tokens wide.
   */
  public static Pair<Integer, Integer> getTokenRangePrecedingQuote(Annotation doc, CoreMap quote) {
    List<CoreMap> docSentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    int quoteBeginTokenIndex = quote.get(CoreAnnotations.TokenBeginAnnotation.class);
    if (quoteBeginTokenIndex <= 2) {
      return null;
    }

    int quoteBeginSentenceIndex = quote.get(CoreAnnotations.SentenceBeginAnnotation.class);
    CoreMap beginSentence = docSentences.get(quoteBeginSentenceIndex);
    if (beginSentence.get(EnhancedSentenceAnnotation.class) != null) {
      beginSentence = beginSentence.get(EnhancedSentenceAnnotation.class);
    }

    int quoteIndex = quote.get(CoreAnnotations.QuotationIndexAnnotation.class);
    if (beginSentence.get(CoreAnnotations.TokenBeginAnnotation.class) < quoteBeginTokenIndex - 1) {
      // check the previous quote to make sure the boundary is okay; modify if necessary.
      if (quoteIndex > 0) {
        CoreMap prevQuote = doc.get(CoreAnnotations.QuotationsAnnotation.class).get(quoteIndex - 1);
        int prevQuoteTokenEnd = prevQuote.get(CoreAnnotations.TokenEndAnnotation.class);
        if (prevQuoteTokenEnd > beginSentence.get(CoreAnnotations.TokenBeginAnnotation.class)) {
          if (prevQuoteTokenEnd + 1 == quoteBeginTokenIndex) {
            return null;
          }
          return new Pair<>(prevQuoteTokenEnd + 1, quoteBeginTokenIndex - 1);
        }
      }
      return new Pair<>(beginSentence.get(CoreAnnotations.TokenBeginAnnotation.class),
          quoteBeginTokenIndex - 1);
    } else if (quoteBeginSentenceIndex > 0) {
      // try the previous sentence, if it is in the same paragraph.
      int currParagraph = beginSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
      CoreMap prevSentence = docSentences.get(quoteBeginSentenceIndex - 1);
      if (prevSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == currParagraph) {
        // check the previous quote boundary
        if (quoteIndex > 0) {
          CoreMap prevQuote = doc.get(CoreAnnotations.QuotationsAnnotation.class).get(quoteIndex - 1);
          int prevQuoteTokenEnd = prevQuote.get(CoreAnnotations.TokenEndAnnotation.class);
          if (prevQuoteTokenEnd > prevSentence.get(CoreAnnotations.TokenBeginAnnotation.class)) {
            if (prevQuoteTokenEnd + 1 == quoteBeginTokenIndex) {
              return null;
            }
            return new Pair<>(prevQuoteTokenEnd + 1, quoteBeginTokenIndex - 1);
          }
        }
        return new Pair<>(prevSentence.get(CoreAnnotations.TokenBeginAnnotation.class),
            quoteBeginTokenIndex - 1);
      }
    }
    return null;
  }
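  // Illustrative helper, not part of the original class: render an inclusive
  // token range, as returned by getTokenRangePrecedingQuote() above or
  // getTokenRangeFollowingQuote() below, as plain text (useful for debugging).
  private static String rangeToText(Annotation doc, Pair<Integer, Integer> range) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    StringBuilder sb = new StringBuilder();
    for (int i = range.first; i <= range.second; i++) {
      if (sb.length() > 0) {
        sb.append(' ');
      }
      sb.append(tokens.get(i).word());
    }
    return sb.toString();
  }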
  public static Pair<Integer, Integer> getTokenRangeFollowingQuote(Annotation doc, CoreMap quote) {
    List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
    List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
    int quoteEndTokenIndex = quote.get(CoreAnnotations.TokenEndAnnotation.class);
    if (quoteEndTokenIndex >= doc.get(CoreAnnotations.TokensAnnotation.class).size() - 2) {
      return null;
    }

    int quoteEndSentenceIndex = quote.get(CoreAnnotations.SentenceEndAnnotation.class);
    CoreMap endSentence = sentences.get(quoteEndSentenceIndex);
    int quoteIndex = quote.get(CoreAnnotations.QuotationIndexAnnotation.class);
    // note: the quote's TokenEndAnnotation is inclusive; a sentence's TokenEndAnnotation is exclusive
    if (quoteEndTokenIndex < endSentence.get(CoreAnnotations.TokenEndAnnotation.class) - 2) {
      // check the next quote to ensure the boundary is okay
      if (quoteIndex < quotes.size() - 1) {
        CoreMap nextQuote = quotes.get(quoteIndex + 1);
        int nextQuoteTokenBegin = nextQuote.get(CoreAnnotations.TokenBeginAnnotation.class);
        if (nextQuoteTokenBegin < endSentence.get(CoreAnnotations.TokenEndAnnotation.class) - 1) {
          if (quoteEndTokenIndex + 1 == nextQuoteTokenBegin) {
            return null;
          }
          return new Pair<>(quoteEndTokenIndex + 1, nextQuoteTokenBegin - 1);
        }
      }
      return new Pair<>(quoteEndTokenIndex + 1,
          endSentence.get(CoreAnnotations.TokenEndAnnotation.class) - 1);
    } else if (quoteEndSentenceIndex < sentences.size() - 1) {
      // try the next sentence, if it is in the same paragraph
      CoreMap nextSentence = sentences.get(quoteEndSentenceIndex + 1);
      int currParagraph = endSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class);
      if (nextSentence.get(CoreAnnotations.ParagraphIndexAnnotation.class) == currParagraph) {
        // check the next quote boundary
        if (quoteIndex < quotes.size() - 1) {
          CoreMap nextQuote = quotes.get(quoteIndex + 1);
          int nextQuoteTokenBegin = nextQuote.get(CoreAnnotations.TokenBeginAnnotation.class);
          if (nextQuoteTokenBegin < nextSentence.get(CoreAnnotations.TokenEndAnnotation.class) - 1) {
            if (quoteEndTokenIndex + 1 == nextQuoteTokenBegin) {
              return null;
            }
            return new Pair<>(quoteEndTokenIndex + 1, nextQuoteTokenBegin - 1);
          }
        }
        return new Pair<>(quoteEndTokenIndex + 1,
            nextSentence.get(CoreAnnotations.TokenEndAnnotation.class) - 1);
      }
    }
    return null;
  }

  private static CoreMap constructCoreMap(Annotation doc, Pair<Integer, Integer> run) {
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);
    // get the text from the first and last character offsets
    int begin = tokens.get(run.first).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int end = tokens.get(run.second).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    String sentenceText = doc.get(CoreAnnotations.TextAnnotation.class).substring(begin, end);
    List<CoreLabel> sentenceTokens = tokens.subList(run.first, run.second + 1);

    // create a sentence annotation with text and token offsets
    CoreMap sentence = new Annotation(sentenceText);
    sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
    sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
    sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
    return sentence;
  }

  // shared dependency parser; the model is loaded once, at class initialization
  static final DependencyParser parser =
      DependencyParser.loadFromModelFile(DependencyParser.DEFAULT_MODEL, new Properties());

  private static SemanticGraph getParse(CoreMap sentence) {
    GrammaticalStructure gs = parser.predict(sentence);
    return SemanticGraphFactory.generateEnhancedPlusPlusDependencies(gs);
  }
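  // Illustrative sketch, not part of the original class: once
  // annotateForDependencyParse() below has attached a parse of the quote-removed
  // context to a quote, a sieve might look up the subject of a speech verb in
  // that graph. The verb "said" and the "nsubj" relation here are just examples.
  private static String subjectOfSaid(CoreMap quote) {
    SemanticGraph graph =
        quote.get(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class);
    if (graph == null) {
      return null;
    }
    for (edu.stanford.nlp.semgraph.SemanticGraphEdge edge : graph.edgeIterable()) {
      if (edge.getRelation().getShortName().equals("nsubj")
          && edge.getGovernor().word().equals("said")) {
        return edge.getDependent().word();
      }
    }
    return null;
  }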
  public static void annotateForDependencyParse(Annotation doc) {
    // for each quote, dependency-parse the sentence with the quote removed (if such a range exists).
    List<CoreMap> quotes = doc.get(CoreAnnotations.QuotationsAnnotation.class);
    for (CoreMap quote : quotes) {
      Pair<Integer, Integer> range = getRemainderInSentence(doc, quote);
      if (range != null) {
        CoreMap sentenceQuoteRemoved = constructCoreMap(doc, range);
        quote.set(SemanticGraphCoreAnnotations.EnhancedPlusPlusDependenciesAnnotation.class,
            getParse(sentenceQuoteRemoved));
      }
    }
  }

  // Returns the 0-based position of the quote among the quotes whose paragraph
  // begins at the same paragraph index.
  public static int getParagraphRank(Annotation doc, CoreMap quote) {
    int quoteParaBegin = getParagraphBeginNumber(quote);
    List<CoreMap> quotesInParagraph = Generics.newArrayList();
    for (CoreMap q : doc.get(CoreAnnotations.QuotationsAnnotation.class)) {
      if (getParagraphBeginNumber(q) == quoteParaBegin) {
        quotesInParagraph.add(q);
      }
    }
    return quotesInParagraph.indexOf(quote);
  }

  public static int getParagraphBeginNumber(CoreMap quote) {
    List<CoreMap> sents = quote.get(CoreAnnotations.SentencesAnnotation.class);
    return sents.get(0).get(CoreAnnotations.ParagraphIndexAnnotation.class);
  }

  public static int getParagraphEndNumber(CoreMap quote) {
    List<CoreMap> sents = quote.get(CoreAnnotations.SentencesAnnotation.class);
    return sents.get(sents.size() - 1).get(CoreAnnotations.ParagraphIndexAnnotation.class);
  }

  public static List<CoreMap> getSentsInParagraph(Annotation doc, int paragraph) {
    List<CoreMap> sents = doc.get(CoreAnnotations.SentencesAnnotation.class);
    List<CoreMap> targets = Generics.newArrayList();
    for (CoreMap sent : sents) {
      if (sent.get(CoreAnnotations.ParagraphIndexAnnotation.class) == paragraph) {
        targets.add(sent);
      }
    }
    return targets;
  }

  public static List<CoreMap> getSentsForQuoteParagraphs(Annotation doc, CoreMap quote) {
    List<CoreMap> sents = doc.get(CoreAnnotations.SentencesAnnotation.class);
    int paragraphBegin = getParagraphBeginNumber(quote);
    int paragraphEnd = getParagraphEndNumber(quote);
    List<CoreMap> targets = Generics.newArrayList();
    for (CoreMap sent : sents) {
      int paragraph = sent.get(CoreAnnotations.ParagraphIndexAnnotation.class);
      if (paragraph >= paragraphBegin && paragraph <= paragraphEnd) {
        targets.add(sent);
      }
    }
    return targets;
  }
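  // Illustrative helper, not part of the original class: getParagraphRank()
  // above gives the 0-based position of a quote among the quotes of its
  // beginning paragraph, so rank 0 identifies the paragraph's first quote.
  private static boolean isFirstQuoteInParagraph(Annotation doc, CoreMap quote) {
    return getParagraphRank(doc, quote) == 0;
  }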
  // Reads a gendered-noun list. Each line is expected to hold a noun, a tab, and
  // space-separated counts; the noun is treated as MALE when the first count is
  // at least the second, FEMALE otherwise.
  public static Map<String, Person.Gender> readGenderedNounList(String filename) {
    Map<String, Person.Gender> genderMap = Generics.newHashMap();
    List<String> lines = IOUtils.linesFromFile(filename);
    for (String line : lines) {
      String[] nounAndStats = line.split("\\t");
      String[] stats = nounAndStats[1].split(" ");
      Person.Gender gender = (Integer.parseInt(stats[0]) >= Integer.parseInt(stats[1]))
          ? Person.Gender.MALE : Person.Gender.FEMALE;
      genderMap.put(nounAndStats[0], gender);
    }
    return genderMap;
  }

  public static Set<String> readFamilyRelations(String filename) {
    Set<String> familyRelations = Generics.newHashSet();
    List<String> lines = IOUtils.linesFromFile(filename);
    for (String line : lines) {
      if (line.trim().length() > 0) {
        familyRelations.add(line.toLowerCase().trim());
      }
    }
    return familyRelations;
  }

  public static Set<String> readAnimacyList(String filename) {
    Set<String> animacyList = Generics.newHashSet();
    List<String> lines = IOUtils.linesFromFile(filename);
    for (String line : lines) {
      // ignore capitalized entries, which are names
      if (!Character.isUpperCase(line.charAt(0))) {
        animacyList.add(line);
      }
    }
    return animacyList;
  }

  // Maps each alias (i.e. a name of a character) to the characters that bear it;
  // an alias maps to multiple characters when it is ambiguous.
  public static Map<String, List<Person>> readPersonMap(List<Person> personList) {
    Map<String, List<Person>> personMap = new HashMap<>();
    for (Person person : personList) {
      for (String alias : person.aliases) {
        personMap.computeIfAbsent(alias, k -> new ArrayList<>()).add(person);
      }
    }
    return personMap;
  }

  public static Map<String, List<Person>> readPersonMap(String fileName) {
    return readPersonMap(readCharacterList(fileName));
  }

  public static ArrayList<Person> readCharacterList(String filename) {
    ArrayList<Person> characterList = new ArrayList<>();
    // format: name;gender (M or F);aliases (everything semicolon-delimited)
    for (String line : IOUtils.readLines(new File(filename))) {
      String[] terms = line.split(";");
      if (terms.length == 2) {
        characterList.add(new Person(terms[0], terms[1], null));
      } else {
        ArrayList<String> aliases = new ArrayList<>();
        for (int l = 2; l < terms.length; l++) {
          aliases.add(terms[l]);
        }
        aliases.add(terms[0]);
        characterList.add(new Person(terms[0], terms[1], aliases));
      }
    }
    return characterList;
  }

  public static Map<Integer, String> setupCoref(String bammanFile,
                                                Map<String, List<Person>> characterMap,
                                                Annotation doc) {
    if (bammanFile != null) {
      // TODO: integrate coref
      Map<Integer, List<CoreLabel>> bammanTokens = BammanCorefReader.readTokenFile(bammanFile, doc);
      return mapBammanToCharacterMap(bammanTokens, characterMap);
    }
    return null;
  }
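  // Usage sketch, not part of the original class ("characters.txt" and
  // "book.tokens" are hypothetical paths): wire a character list into the
  // Bamman coref setup. Each character-file line looks like
  //   Elizabeth Bennet;F;Lizzy;Eliza
  // i.e. canonical name, gender (M or F), then zero or more aliases.
  private static Map<Integer, String> setupCorefFromFiles(Annotation doc) {
    Map<String, List<Person>> characterMap = readPersonMap("characters.txt");
    return setupCoref("book.tokens", characterMap, doc);
  }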
  // Returns a map from the CharacterOffsetBeginAnnotation index of each pronoun
  // token to the name of the character it refers to.
  protected static Map<Integer, String> mapBammanToCharacterMap(Map<Integer, List<CoreLabel>> bammanTokens,
                                                                Map<String, List<Person>> characterMap) {
    Map<Integer, String> indexToCharacterName = new HashMap<>();

    // first, link each Bamman character id to a name in the character map
    for (Integer characterID : bammanTokens.keySet()) {
      List<CoreLabel> tokens = bammanTokens.get(characterID);

      // collect candidate names: maximal runs of adjacent NNP tokens
      Counter<String> names = new ClassicCounter<>();
      int prevEnd = -2;
      String prevName = "";
      for (CoreLabel token : tokens) {
        if (token.tag().equals("NNP")) {
          int beginIndex = token.beginPosition();
          if (prevEnd + 1 == beginIndex) {
            // adjacent to the last NNP token: extend the current candidate
            prevName += " " + token.word();
            prevEnd = token.endPosition();
          } else {
            // not adjacent: flush the current candidate and start a new one
            if (!prevName.equals("")) {
              names.incrementCount(prevName, 1);
            }
            prevName = token.word();
            prevEnd = token.endPosition();
          }
        } else {
          if (!prevName.equals("")) {
            names.incrementCount(prevName, 1);
          }
          prevName = "";
          prevEnd = -2;
        }
      }

      // try an exact match against the character map
      boolean flag = false;
      for (String name : Counters.toSortedList(names)) {
        if (characterMap.containsKey(name)) {
          indexToCharacterName.put(characterID, name);
          flag = true;
          break;
        }
      }
      // no exact match: try a partial match
      if (!flag) {
        for (String charName : characterMap.keySet()) {
          for (String name : Counters.toSortedList(names)) {
            if (charName.contains(name)) {
              indexToCharacterName.put(characterID, charName);
              flag = true;
              System.out.println("contingency name found " + characterID);
              for (String n : Counters.toSortedList(names)) {
                System.out.print(n + "|");
              }
              System.out.println();
              break;
            }
          }
          if (flag) {
            break;
          }
        }
      }
      if (!flag) {
        System.err.println("no name found :( " + characterID);
        for (String name : Counters.toSortedList(names)) {
          System.err.print(name + "| ");
        }
        System.err.println();
      }
    }

    // then, map the begin offset of every pronoun token to its character's name
    Map<Integer, String> beginIndexToName = new HashMap<>();
    for (Integer charId : bammanTokens.keySet()) {
      if (indexToCharacterName.get(charId) == null) {
        continue;
      }
      List<CoreLabel> tokens = bammanTokens.get(charId);
      for (CoreLabel btoken : tokens) {
        if (btoken.tag().equals("PRP")) {
          beginIndexToName.put(btoken.beginPosition(), indexToCharacterName.get(charId));
        }
      }
    }
    return beginIndexToName;
  }

  // Returns true if r2 begins or ends within r1 (boundaries inclusive); note that
  // this does not detect the case where r1 lies strictly inside r2.
  public static boolean rangeContains(Pair<Integer, Integer> r1, Pair<Integer, Integer> r2) {
    return ((r1.first <= r2.first && r1.second >= r2.first)
        || (r1.first <= r2.second && r1.second >= r2.second));
  }

}