package edu.stanford.nlp.coref.data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.coref.CorefRules;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.Document.DocType;
import edu.stanford.nlp.coref.data.Dictionaries.Number;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.UtteranceAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.math.NumberMatchingRegex;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Coref document preprocessor.
* @author Heeyoung Lee
* @author Kevin Clark
*/
public class DocumentPreprocessor {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(DocumentPreprocessor.class);
/** Private constructor: the methods of this class are all static, so it is not meant to be instantiated. */
private DocumentPreprocessor() {}
/**
 * Runs the complete coref preprocessing pipeline over a document: mention
 * initialization, per-sentence mention reordering, syntactic relations, paragraph
 * and discourse (speaker) annotation, cluster initialization and mention numbering.
 * When gold mentions are present, gold clusters are extracted as well and the number
 * of gold mentions that found a predicted twin is logged on the "debug-md" channel.
 *
 * @param doc the document to fill in (modified in place)
 * @param dict dictionaries handed to mention initialization and discourse processing
 * @param singletonPredictor classifier forwarded to mention initialization
 * @param headFinder head finder forwarded to mention initialization and reordering
 * @throws Exception propagated from the initialization and reordering steps
 */
public static void preprocess(Document doc, Dictionaries dict, LogisticClassifier<String, String> singletonPredictor, HeadFinder headFinder) throws Exception {
  // Mention IDs, twin mentions, mention positions, sentNum, head positions.
  initializeMentions(doc, dict, singletonPredictor, headFinder);

  // Reorder the mentions of every sentence.
  mentionReordering(doc, headFinder);

  // Syntactic relations between mentions.
  fillSyntacticInfo(doc);

  // Discourse processing (speaker info etc).
  setParagraphAnnotation(doc);
  processDiscourse(doc, dict);

  // Cluster bookkeeping.
  initializeClusters(doc);

  // Gold clusters, only when gold mentions are available.
  final boolean goldAvailable = doc.goldMentions != null;
  if (goldAvailable) {
    extractGoldClusters(doc);
    int twinned = 0;
    for (Mention gold : doc.goldMentionsByID.values()) {
      twinned += gold.hasTwin ? 1 : 0;
    }
    final int goldTotal = doc.goldMentionsByID.size();
    Redwood.log("debug-md", "# of found gold mentions: " + twinned + " / # of gold mentions: " + goldTotal);
  }

  // Consecutive mention numbers.
  assignMentionNumbers(doc);
}
/**
 * Rebuilds {@code doc.goldCorefClusters} from the gold mentions, grouping them by
 * their {@code goldCorefClusterID}.
 *
 * @param doc document whose gold mentions are grouped; its gold cluster map is replaced
 * @throws RuntimeException if a gold mention carries the cluster ID -1
 */
public static void extractGoldClusters(Document doc){
  doc.goldCorefClusters = Generics.newHashMap();
  for (List<Mention> sentenceGolds : doc.goldMentions) {
    for (Mention gold : sentenceGolds) {
      final int clusterId = gold.goldCorefClusterID;
      if (clusterId == -1) {
        throw new RuntimeException("No gold info");
      }
      // Create the cluster the first time its ID is seen.
      CorefCluster cluster = doc.goldCorefClusters.get(clusterId);
      if (cluster != null) {
        cluster.corefMentions.add(gold);
        continue;
      }
      cluster = new CorefCluster(clusterId);
      doc.goldCorefClusters.put(clusterId, cluster);
      cluster.corefMentions.add(gold);
    }
  }
}
/**
 * Numbers the mentions consecutively from 0, following the order returned by
 * {@code CorefUtils.getSortedMentions}.
 */
private static void assignMentionNumbers(Document document) {
  int nextNumber = 0;
  for (Mention mention : CorefUtils.getSortedMentions(document)) {
    mention.mentionNum = nextNumber;
    nextNumber++;
  }
}
/**
 * Replaces the predicted-mention list of every sentence with its span-ordered
 * version (see {@code mentionReorderingBySpan}).
 *
 * @param doc document whose {@code predictedMentions} lists are replaced in place
 * @param headFinder not used by this method
 */
private static void mentionReordering(Document doc, HeadFinder headFinder) throws Exception {
  final List<List<Mention>> perSentence = doc.predictedMentions;
  final int sentenceCount = doc.annotation.get(SentencesAnnotation.class).size();
  int sentIdx = 0;
  while (sentIdx < sentenceCount) {
    perSentence.set(sentIdx, mentionReorderingBySpan(perSentence.get(sentIdx)));
    sentIdx++;
  }
}
/**
 * Returns the {@code IndexAnnotation} value of the head terminal of {@code t},
 * or -1 when the head finder yields no head terminal.
 */
protected static int getHeadIndex(Tree t, HeadFinder headFinder) {
  // The trees passed in do not have the CoordinationTransformer
  // applied, but that just means the SemanticHeadFinder results are
  // slightly worse.
  final Tree headLeaf = t.headTerminal(headFinder);
  if (headLeaf != null) {
    final CoreLabel headLabel = (CoreLabel) headLeaf.label();
    return headLabel.get(CoreAnnotations.IndexAnnotation.class);
  }
  return -1; // temporary: a key which is matched to nothing
}
/**
 * Returns the mentions of one sentence ordered by {@code Mention.appearEarlierThan}.
 * The ordering goes through a {@link TreeSet}, so of several mentions for which neither
 * appears earlier than the other only the first one added is kept.
 */
private static List<Mention> mentionReorderingBySpan(List<Mention> mentionsInSent) {
  final Comparator<Mention> bySpan = (left, right) -> {
    if (left.appearEarlierThan(right)) {
      return -1;
    }
    if (right.appearEarlierThan(left)) {
      return 1;
    }
    return 0;
  };
  final TreeSet<Mention> sorted = new TreeSet<>(bySpan);
  sorted.addAll(mentionsInSent);
  return Generics.newArrayList(sorted);
}
/**
 * Runs dependency-based syntactic relation detection over the predicted mentions
 * of every sentence of the document.
 */
private static void fillSyntacticInfo(Document doc) {
  final List<List<Mention>> perSentence = doc.predictedMentions;
  final int sentenceCount = doc.annotation.get(SentencesAnnotation.class).size();
  for (int sentIdx = 0; sentIdx < sentenceCount; sentIdx++) {
    findSyntacticRelationsFromDependency(perSentence.get(sentIdx));
  }
}
/**
 * Assigns mention IDs, marks twin mentions (strict span matching, only when gold
 * mentions exist), fills per-mention information and snapshots the mention positions.
 *
 * @throws Exception propagated from {@code fillMentionInfo}
 */
private static void initializeMentions(Document doc, Dictionaries dict, LogisticClassifier<String, String> singletonPredictor, HeadFinder headFinder) throws Exception {
  final boolean goldAvailable = doc.goldMentions != null;

  assignMentionIDs(doc);
  if (goldAvailable) {
    findTwinMentions(doc, true);
  }
  fillMentionInfo(doc, dict, singletonPredictor, headFinder);

  // allPositions retain all mentions even after postprocessing
  doc.allPositions = Generics.newHashMap(doc.positions);
}
/**
 * Hands out consecutive mention IDs starting at 0: first to all gold mentions
 * (if there are any), then to all predicted mentions, in document order.
 */
private static void assignMentionIDs(Document doc) {
  int nextId = 0;
  if (doc.goldMentions != null) {
    for (List<Mention> sentenceGolds : doc.goldMentions) {
      for (Mention gold : sentenceGolds) {
        gold.mentionID = nextId;
        nextId++;
      }
    }
  }
  for (List<Mention> sentencePredictions : doc.predictedMentions) {
    for (Mention predicted : sentencePredictions) {
      predicted.mentionID = nextId;
      nextId++;
    }
  }
}
/**
 * Marks twin mentions between gold and predicted mentions.
 *
 * @param doc document holding gold and predicted mentions
 * @param strict {@code true} to require identical mention boundaries,
 *               {@code false} to additionally allow matching by head
 */
protected static void findTwinMentions(Document doc, boolean strict){
  if (!strict) {
    findTwinMentionsRelaxed(doc);
    return;
  }
  findTwinMentionsStrict(doc);
}
/**
 * Mark twin mentions: all mention boundaries should be matched.
 * <p>
 * A predicted mention is paired with a gold mention of the same sentence only when start
 * and end index both agree. Several gold mentions may share one span; each predicted mention
 * with that span then consumes the not-yet-paired gold mention with the smallest mention ID.
 * Once every gold mention for a span has been paired, further predicted mentions with the
 * same span are left without a twin. A paired predicted mention takes over the gold
 * mention's ID, and both mentions get {@code hasTwin} set.
 *
 * @param doc document holding per-sentence gold and predicted mentions
 */
private static void findTwinMentionsStrict(Document doc){
  for (int sentNum = 0; sentNum < doc.goldMentions.size(); sentNum++) {
    List<Mention> golds = doc.goldMentions.get(sentNum);
    List<Mention> predicts = doc.predictedMentions.get(sentNum);

    // For CoNLL training there are some documents with gold mentions with the same position offsets
    // See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
    // (Packwood - Roth)
    CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
    for (Mention g : golds) {
      IntPair ip = new IntPair(g.startIndex, g.endIndex);
      if (goldMentionPositions.containsKey(ip)) {
        StringBuilder existingMentions = new StringBuilder();
        for (Mention eg : goldMentionPositions.get(ip)) {
          if (existingMentions.length() > 0) {
            existingMentions.append(",");
          }
          existingMentions.append(eg.mentionID);
        }
        Redwood.log("debug-preprocessor", "WARNING: gold mentions with the same offsets: " + ip
            + " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
      }
      //assert(!goldMentionPositions.containsKey(ip));
      goldMentionPositions.add(ip, g);
    }

    for (Mention p : predicts) {
      IntPair pos = new IntPair(p.startIndex, p.endIndex);
      if (!goldMentionPositions.containsKey(pos)) {
        continue;
      }
      Collection<Mention> cm = goldMentionPositions.get(pos);
      // Pick the unpaired gold mention with the smallest ID.
      int minId = Integer.MAX_VALUE;
      Mention g = null;
      for (Mention m : cm) {
        if (m.mentionID < minId) {
          g = m;
          minId = m.mentionID;
        }
      }
      if (g == null) {
        // No gold mention left for this span: leave the predicted mention untwinned
        // instead of dereferencing null below.
        continue;
      }
      // Consume the gold mention. Dropping the key together with the last gold mention
      // keeps containsKey() false for later predicted mentions with the same span.
      if (cm.size() == 1) {
        goldMentionPositions.remove(pos);
      } else {
        cm.remove(g);
      }
      p.mentionID = g.mentionID;
      p.hasTwin = true;
      g.hasTwin = true;
    }
  }
}
/**
 * Mark twin mentions: heads of the mentions are matched.
 * <p>
 * Works sentence by sentence in two passes. First, every predicted mention whose span
 * equals a gold span is paired with that gold mention. Then each remaining predicted
 * mention is paired with the next still-unpaired gold mention that has the same head index,
 * if any. A paired predicted mention takes over the gold mention's ID and both get
 * {@code hasTwin} set.
 */
private static void findTwinMentionsRelaxed(Document doc) {
  for (int sentIdx = 0; sentIdx < doc.goldMentions.size(); sentIdx++) {
    final List<Mention> golds = doc.goldMentions.get(sentIdx);
    final List<Mention> predicts = doc.predictedMentions.get(sentIdx);

    // Index the gold mentions by span and by head position.
    final Map<IntPair, Mention> goldBySpan = Generics.newHashMap();
    final Map<Integer, LinkedList<Mention>> goldByHead = Generics.newHashMap();
    for (Mention gold : golds) {
      goldBySpan.put(new IntPair(gold.startIndex, gold.endIndex), gold);
      goldByHead.computeIfAbsent(gold.headIndex, head -> new LinkedList<>()).add(gold);
    }

    // Pass 1: exact span matches.
    final List<Mention> unmatched = new ArrayList<>();
    for (Mention predicted : predicts) {
      final Mention gold = goldBySpan.get(new IntPair(predicted.startIndex, predicted.endIndex));
      if (gold == null) {
        unmatched.add(predicted);
        continue;
      }
      predicted.mentionID = gold.mentionID;
      predicted.hasTwin = true;
      gold.hasTwin = true;
      // NOTE(review): if an earlier predicted mention with the same span already consumed the
      // last gold mention for this head, this lookup yields null and the next line throws.
      final LinkedList<Mention> sameHead = goldByHead.get(gold.headIndex);
      sameHead.remove(gold);
      if (sameHead.isEmpty()) {
        goldByHead.remove(gold.headIndex);
      }
    }

    // Pass 2: head matches for whatever is left.
    for (Mention leftover : unmatched) {
      final LinkedList<Mention> candidates = goldByHead.get(leftover.headIndex);
      if (candidates == null) {
        continue;
      }
      final Mention gold = candidates.poll();
      leftover.mentionID = gold.mentionID;
      leftover.hasTwin = true;
      gold.hasTwin = true;
      if (goldByHead.get(gold.headIndex).isEmpty()) {
        goldByHead.remove(gold.headIndex);
      }
    }
  }
}
/**
 * Initializes per-mention bookkeeping. For every predicted mention this registers it
 * in {@code predictedMentionsByID}, {@code positions} and {@code mentionheadPositions},
 * sets its sentence number, parse tree, dependency graphs and mention subtree, and then
 * calls {@code Mention.process}. When gold mentions exist, {@code goldMentionsByID} is
 * rebuilt and each gold mention gets its sentence number.
 *
 * @throws Exception propagated from {@code Mention.process}
 */
private static void fillMentionInfo(Document doc, Dictionaries dict,
    LogisticClassifier<String, String> singletonPredictor, HeadFinder headFinder) throws Exception {
  final List<CoreMap> sentences = doc.annotation.get(SentencesAnnotation.class);

  for (int sentIdx = 0; sentIdx < doc.predictedMentions.size(); sentIdx++) {
    final CoreMap sentence = sentences.get(sentIdx);
    final List<Mention> sentenceMentions = doc.predictedMentions.get(sentIdx);

    for (int mentionIdx = 0; mentionIdx < sentenceMentions.size(); mentionIdx++) {
      final Mention mention = sentenceMentions.get(mentionIdx);
      doc.predictedMentionsByID.put(mention.mentionID, mention); // mentionsByID

      // (sentence index, index within sentence)
      final IntTuple position = new IntTuple(2);
      position.set(0, sentIdx);
      position.set(1, mentionIdx);
      doc.positions.put(mention, position); // positions
      mention.sentNum = sentIdx; // sentNum

      // (sentence index, head token index)
      final IntTuple headPosition = new IntTuple(2);
      headPosition.set(0, sentIdx);
      headPosition.set(1, mention.headIndex);
      doc.mentionheadPositions.put(headPosition, mention); // headPositions

      mention.contextParseTree = sentence.get(TreeAnnotation.class);
      // mention.sentenceWords = sentence.get(TokensAnnotation.class);
      mention.basicDependency = sentence.get(BasicDependenciesAnnotation.class);
      mention.enhancedDependency = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
      if (mention.enhancedDependency == null) {
        // Fall back to the basic dependencies when no enhanced graph is available.
        mention.enhancedDependency = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
      }

      // mentionSubTree (highest NP that has the same head) if constituency tree available
      final Tree root = mention.contextParseTree;
      if (root != null) {
        final Tree headLeaf = root.getLeaves().get(mention.headIndex);
        if (headLeaf == null) {
          throw new RuntimeException("Missing head tree for a mention!");
        }
        Tree ancestor = headLeaf.parent(root);
        while (ancestor != null) {
          if (ancestor.headTerminal(headFinder) == headLeaf && ancestor.value().equals("NP")) {
            mention.mentionSubTree = ancestor;
          } else if (mention.mentionSubTree != null) {
            break;
          }
          ancestor = ancestor.parent(root);
        }
        if (mention.mentionSubTree == null) {
          mention.mentionSubTree = headLeaf;
        }
      }

      mention.process(dict, null, singletonPredictor);
    }
  }

  if (doc.goldMentions != null) {
    doc.goldMentionsByID = Generics.newHashMap();
    int goldSentIdx = 0;
    for (List<Mention> sentenceGolds : doc.goldMentions) {
      for (Mention gold : sentenceGolds) {
        doc.goldMentionsByID.put(gold.mentionID, gold);
        gold.sentNum = goldSentIdx;
      }
      goldSentIdx++;
    }
  }
}
/**
 * Detects syntactic relations among the mentions of one sentence using the enhanced
 * dependency graph of the first mention: list membership, appositions and predicate
 * nominatives. Relative pronouns are not extracted yet (an empty pair set is passed on).
 *
 * @param orderedMentions the mentions of one sentence; nothing happens when empty
 */
private static void findSyntacticRelationsFromDependency(List<Mention> orderedMentions) {
  if (orderedMentions.isEmpty()) {
    return;
  }
  markListMemberRelation(orderedMentions);
  final SemanticGraph graph = orderedMentions.get(0).enhancedDependency;

  // apposition: (governor, dependent) as 0-based token indices
  final Set<Pair<Integer, Integer>> appositionPairs = Generics.newHashSet();
  for (SemanticGraphEdge apposEdge : graph.findAllRelns(UniversalEnglishGrammaticalRelations.APPOSITIONAL_MODIFIER)) {
    final int governorIdx = apposEdge.getSource().index() - 1;
    final int dependentIdx = apposEdge.getTarget().index() - 1;
    appositionPairs.add(Pair.makePair(governorIdx, dependentIdx));
  }
  markMentionRelation(orderedMentions, appositionPairs, "APPOSITION");

  // predicate nominatives: (subject, predicate) as 0-based token indices
  final Set<Pair<Integer, Integer>> predicateNominativePairs = Generics.newHashSet();
  for (SemanticGraphEdge copulaEdge : graph.findAllRelns(UniversalEnglishGrammaticalRelations.COPULA)) {
    final IndexedWord predicate = copulaEdge.getSource();
    IndexedWord subject = graph.getChildWithReln(predicate, UniversalEnglishGrammaticalRelations.NOMINAL_SUBJECT);
    if (subject == null) {
      subject = graph.getChildWithReln(predicate, UniversalEnglishGrammaticalRelations.CLAUSAL_SUBJECT);
      // TODO
      if (subject == null) {
        continue;
      }
    }

    // to handle relative clause: e.g., Tim who is a student,
    if (subject.tag().startsWith("W")) {
      final IndexedWord governor = graph.getParent(predicate);
      if (governor != null
          && graph.reln(governor, predicate).equals(UniversalEnglishGrammaticalRelations.RELATIVE_CLAUSE_MODIFIER)) {
        subject = governor;
      }
    }
    final int predicateIdx = predicate.index() - 1;
    final int subjectIdx = subject.index() - 1;
    predicateNominativePairs.add(Pair.makePair(subjectIdx, predicateIdx));
  }
  markMentionRelation(orderedMentions, predicateNominativePairs, "PREDICATE_NOMINATIVE");

  // relative pronouns TODO
  final Set<Pair<Integer, Integer>> relativePronounPairs = Generics.newHashSet();
  markMentionRelation(orderedMentions, relativePronounPairs, "RELATIVE_PRONOUN");
}
/**
 * Puts every predicted mention into its own singleton cluster, keyed by the mention ID,
 * and (when gold mentions exist) registers every gold mention in {@code goldMentionsByID}.
 */
private static void initializeClusters(Document doc) {
  for (List<Mention> sentencePredictions : doc.predictedMentions) {
    for (Mention predicted : sentencePredictions) {
      final int clusterId = predicted.mentionID;
      doc.corefClusters.put(clusterId, new CorefCluster(clusterId, Generics.newHashSet(Arrays.asList(predicted))));
      predicted.corefClusterID = clusterId;
    }
  }

  if (doc.goldMentions == null) {
    return;
  }
  for (List<Mention> sentenceGolds : doc.goldMentions) {
    for (Mention gold : sentenceGolds) {
      doc.goldMentionsByID.put(gold.mentionID, gold);
    }
  }
}
/**
 * Find document type: conversation or article.
 * <p>
 * The document is an article when no token has a non-zero utterance index, or when a
 * token with utterance index 0 follows one with a non-zero index; otherwise it is a
 * conversation. As a side effect {@code doc.maxUtter} is raised to the largest utterance
 * index seen before returning.
 */
private static DocType findDocType(Document doc) {
  boolean sawNonZeroUtterance = false;

  for (CoreMap sentence : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      final int utterIndex = token.get(CoreAnnotations.UtteranceAnnotation.class);
      if (utterIndex != 0) {
        sawNonZeroUtterance = true;
      } else if (sawNonZeroUtterance) {
        // Back at utterance 0 after a quoted stretch: article.
        return DocType.ARTICLE;
      }
      doc.maxUtter = Math.max(doc.maxUtter, utterIndex);
    }
  }
  // in conversation, utter index keep increasing.
  return sawNonZeroUtterance ? DocType.CONVERSATION : DocType.ARTICLE;
}
/**
 * Set paragraph index.
 * <p>
 * Tokens with character offsets get a running paragraph index that is incremented whenever
 * a token begins more than two characters after the end of the previous offset-bearing token;
 * tokens without offsets get -1. Each predicted mention then takes the paragraph of its head
 * word, and the final index is stored in {@code doc.numParagraph}.
 */
private static void setParagraphAnnotation(Document doc) {
  int paragraphIndex = 0;
  int lastTokenEnd = -10;

  for (CoreMap sentence : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      if (!token.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
        token.set(CoreAnnotations.ParagraphAnnotation.class, -1);
        continue;
      }
      final int tokenBegin = token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      if (tokenBegin > lastTokenEnd + 2) {
        paragraphIndex++;
      }
      token.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
      lastTokenEnd = token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    }
  }

  for (List<Mention> sentenceMentions : doc.predictedMentions) {
    for (Mention mention : sentenceMentions) {
      mention.paragraph = mention.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
    }
  }
  doc.numParagraph = paragraphIndex;
}
/**
* Process discourse information: utterance and speaker annotations on the tokens, the
* utterance of each predicted mention, the document type, the utterance-to-speaker map
* ({@code doc.speakers}), {@code doc.speakerInfoMap} and {@code doc.speakerPairs}.
*
* @param doc document whose tokens, mentions and speaker structures are updated in place
* @param dict dictionaries passed on to speaker finding
*/
protected static void processDiscourse(Document doc, Dictionaries dict) {
// Unless marked discourse is explicitly enabled, discard any speaker/utterance
// annotations already present on the tokens.
Boolean useMarkedDiscourse =
doc.annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
if (useMarkedDiscourse == null || !useMarkedDiscourse) {
for (CoreLabel l : doc.annotation.get(CoreAnnotations.TokensAnnotation.class)) {
l.remove(CoreAnnotations.SpeakerAnnotation.class);
l.remove(CoreAnnotations.UtteranceAnnotation.class);
}
}
setUtteranceAndSpeakerAnnotation(doc);
// markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false);
// mention utter setting
for(Mention m : doc.predictedMentionsByID.values()) {
m.utter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class);
}
doc.docType = findDocType(doc);
findSpeakers(doc, dict);
// Hard-coded to false, so none of the debug logging below runs as written.
boolean debug = false;
if(debug) {
for(CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
for(CoreLabel cl : sent.get(TokensAnnotation.class)) {
log.info(" "+cl.word()+"-"+cl.get(UtteranceAnnotation.class)+"-"+cl.get(SpeakerAnnotation.class));
}
}
for(Integer utter : doc.speakers.keySet()) {
String speakerID = doc.speakers.get(utter);
log.info("utterance: "+utter);
log.info("speakers value: " + speakerID);
log.info("mention for it: "+
( (NumberMatchingRegex.isDecimalInteger(speakerID))?
doc.predictedMentionsByID.get(Integer.parseInt(doc.speakers.get(utter)))
: "no mention for this speaker yet") );
}
log.info("AA SPEAKERS: "+ doc.speakers);
}
// build 'speakerInfo' from 'speakers'
// (one SpeakerInfo per distinct speaker string; existing entries are kept)
for(Integer utter : doc.speakers.keySet()) {
String speaker = doc.speakers.get(utter);
SpeakerInfo speakerInfo = doc.speakerInfoMap.get(speaker);
if (speakerInfo == null) {
doc.speakerInfoMap.put(speaker, speakerInfo = new SpeakerInfo(speaker));
}
}
if(debug){
log.info("BB SPEAKER INFO MAP: "+doc.speakerInfoMap);
}
// mention -> to its speakerID: m.headWord.get(SpeakerAnnotation.class)
// speakerID -> more info: speakerInfoMap.get(speakerID)
// if exists, set(mentionID, its speakerID pair): speakerPairs
// for speakerInfo with real speaker name, find corresponding mention by strict/loose matching
// speakerConversion maps a real speaker name to the ID of the first matching mention.
Map<String, Integer> speakerConversion = Generics.newHashMap();
for(String speaker : doc.speakerInfoMap.keySet()) {
SpeakerInfo speakerInfo = doc.speakerInfoMap.get(speaker);
if (speakerInfo.hasRealSpeakerName()) { // do only for real name speaker, not mention ID
boolean found = false;
// First attempt: third argument true (presumably the strict match mentioned above).
for(Mention m : doc.predictedMentionsByID.values()) {
if (CorefRules.mentionMatchesSpeaker(m, speakerInfo, true)) {
speakerConversion.put(speaker, m.mentionID);
found = true;
break;
}
}
// Second attempt, only if the first found nothing: third argument false.
if(!found) {
for(Mention m : doc.predictedMentionsByID.values()) {
if (CorefRules.mentionMatchesSpeaker(m, speakerInfo, false)) {
speakerConversion.put(speaker, m.mentionID);
break;
}
}
}
}
}
if(debug) log.info("CC speaker conversion: " + speakerConversion);
// convert real name speaker to speaker mention id
for(Integer utter : doc.speakers.keySet()) {
String speaker = doc.speakers.get(utter);
if(speakerConversion.containsKey(speaker)) {
int speakerID = speakerConversion.get(speaker);
doc.speakers.put(utter, Integer.toString(speakerID));
}
}
// Re-key the converted speakers in speakerInfoMap: mention ID string replaces the name.
for(String speaker : speakerConversion.keySet()) {
doc.speakerInfoMap.put( Integer.toString(speakerConversion.get(speaker)), doc.speakerInfoMap.get(speaker));
doc.speakerInfoMap.remove(speaker);
}
// fix SpeakerAnnotation
// (every token takes the speaker recorded for its utterance, when there is one)
for(CoreLabel cl : doc.annotation.get(TokensAnnotation.class)) {
int utter = cl.get(UtteranceAnnotation.class);
if(doc.speakers.containsKey(utter)) {
cl.set(CoreAnnotations.SpeakerAnnotation.class, doc.speakers.get(utter));
}
}
// find speakerPairs
// (a pair is recorded only when the head word's speaker string is a decimal integer,
// which is then parsed as a mention ID)
for(Mention m : doc.predictedMentionsByID.values()) {
String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
if(debug) log.info("DD: "+speaker);
if (NumberMatchingRegex.isDecimalInteger(speaker)) {
int speakerMentionID = Integer.parseInt(speaker);
doc.speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID));
}
}
if(debug) {
log.info("==========================================================================");
for(Integer utter : doc.speakers.keySet()) {
String speakerID = doc.speakers.get(utter);
log.info("utterance: "+utter);
log.info("speakers value: " + speakerID);
log.info("mention for it: "+
( (NumberMatchingRegex.isDecimalInteger(speakerID))?
doc.predictedMentionsByID.get(Integer.parseInt(doc.speakers.get(utter)))
: "no mention for this speaker yet") );
}
log.info(doc.speakers);
}
}
/**
* Walks over all document tokens and assigns each one an utterance index
* (UtteranceAnnotation). A new utterance starts when the speaker annotation changes or a
* quotation opens; when a quotation closes, the index returns to that of the text outside
* the quotation. Tokens without usable speaker information (missing, empty, or starting
* with "PER") and tokens inside a quotation get the placeholder speaker
* {@code "PER" + utterance}.
* <p>
* Side effects on the document: {@code doc.speakerInfoGiven} becomes true once a token
* carries a speaker other than "-", and {@code doc.maxUtter} tracks the highest
* utterance index assigned.
*/
private static void setUtteranceAndSpeakerAnnotation(Document doc) {
doc.speakerInfoGiven = false;
int utterance = 0;
int outsideQuoteUtterance = 0; // the utterance of outside of quotation
boolean insideQuotation = false;
List<CoreLabel> tokens = doc.annotation.get(CoreAnnotations.TokensAnnotation.class);
// Speaker of the first token (null for an empty document) is the baseline for change detection.
String preSpeaker = (tokens.size() > 0)? tokens.get(0).get(CoreAnnotations.SpeakerAnnotation.class) : null;
for (CoreLabel l : tokens) {
String curSpeaker = l.get(CoreAnnotations.SpeakerAnnotation.class);
String w = l.get(CoreAnnotations.TextAnnotation.class);
if (curSpeaker!=null && !curSpeaker.equals("-")) doc.speakerInfoGiven = true;
boolean speakerChange = doc.speakerInfoGiven && curSpeaker!=null && !curSpeaker.equals(preSpeaker);
// `` and '' are directional quote tokens; a plain " opens or closes depending on state.
boolean quoteStart = w.equals("``") || (!insideQuotation && w.equals("\""));
boolean quoteEnd = w.equals("''") || (insideQuotation && w.equals("\""));
if(speakerChange) {
if(quoteStart) {
// New speaker starting with a quotation: the quotation takes the next index and
// the text after it the one following.
utterance = doc.maxUtter + 1;
outsideQuoteUtterance = utterance+1;
} else {
utterance = doc.maxUtter + 1;
outsideQuoteUtterance = utterance;
}
preSpeaker = curSpeaker;
} else {
if(quoteStart) {
utterance = doc.maxUtter + 1;
}
}
if(quoteEnd) {
utterance = outsideQuoteUtterance;
insideQuotation = false;
}
if(doc.maxUtter < utterance) doc.maxUtter = utterance;
l.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
if(quoteStart) l.set(CoreAnnotations.UtteranceAnnotation.class, outsideQuoteUtterance); // quote start got outside utterance idx
boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class)
|| l.get(CoreAnnotations.SpeakerAnnotation.class).equals("")
|| l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER");
if(noSpeakerInfo || insideQuotation){
l.set(CoreAnnotations.SpeakerAnnotation.class, "PER"+utterance);
}
// Set only after the token itself has been annotated, so the opening quote token is
// still treated as outside the quotation above.
if(quoteStart) insideQuotation = true;
}
}
/**
 * Speaker extraction.
 * <p>
 * Unless marked discourse is enabled, speakers are first searched with the strategy
 * matching the document type. Afterwards every utterance index that still has no entry
 * in {@code doc.speakers} gets the speaker annotation of the first token seen for it.
 */
private static void findSpeakers(Document doc, Dictionaries dict) {
  final boolean markedDiscourse =
      Boolean.TRUE.equals(doc.annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class));

  if (!markedDiscourse) {
    if (doc.docType == DocType.CONVERSATION) {
      findSpeakersInConversation(doc, dict);
    } else if (doc.docType == DocType.ARTICLE) {
      findSpeakersInArticle(doc, dict);
    }
  }

  // Fill in the utterances no strategy found a speaker for.
  for (CoreMap sentence : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      final int utterIndex = token.get(CoreAnnotations.UtteranceAnnotation.class);
      if (doc.speakers.containsKey(utterIndex)) {
        continue;
      }
      doc.speakers.put(utterIndex, token.get(CoreAnnotations.SpeakerAnnotation.class));
    }
  }
}
/**
 * Finds speakers of quotations in an article. A quotation is a maximal run of tokens with
 * a non-zero utterance index; for each one {@code findQuotationSpeaker} is called with the
 * position of its first token and the position of the first token after it (or the last
 * token of the document when the quotation runs to the end).
 */
private static void findSpeakersInArticle(Document doc, Dictionaries dict) {
  final List<CoreMap> sentences = doc.annotation.get(CoreAnnotations.SentencesAnnotation.class);
  IntPair quoteBegin = null;
  IntPair quoteEnd = null;
  boolean inQuote = false;
  int quoteUtterance = -1;

  for (int sentIdx = 0; sentIdx < sentences.size(); sentIdx++) {
    final List<CoreLabel> tokens = sentences.get(sentIdx).get(CoreAnnotations.TokensAnnotation.class);
    for (int tokIdx = 0; tokIdx < tokens.size(); tokIdx++) {
      final int utterIndex = tokens.get(tokIdx).get(CoreAnnotations.UtteranceAnnotation.class);
      final boolean quotedToken = utterIndex != 0;
      if (quotedToken == inQuote) {
        continue; // no boundary at this token
      }
      if (quotedToken) {
        // entering a quotation
        quoteUtterance = utterIndex;
        inQuote = true;
        quoteBegin = new IntPair(sentIdx, tokIdx);
      } else {
        // first token after a quotation
        inQuote = false;
        quoteEnd = new IntPair(sentIdx, tokIdx);
        findQuotationSpeaker(doc, quoteUtterance, sentences, quoteBegin, quoteEnd, dict);
      }
    }
  }

  if (inQuote) {
    // The document ended inside a quotation: close it at the very last token.
    final int lastSentIdx = sentences.size() - 1;
    final int lastTokIdx = sentences.get(lastSentIdx).get(CoreAnnotations.TokensAnnotation.class).size() - 1;
    quoteEnd = new IntPair(lastSentIdx, lastTokIdx);
    findQuotationSpeaker(doc, quoteUtterance, sentences, quoteBegin, quoteEnd, dict);
  }
}
/**
 * Looks for the speaker of one quotation, trying these token ranges in order and stopping
 * at the first one in which {@code findSpeaker} succeeds:
 * <ol>
 * <li>the part of the opening sentence before the quotation,</li>
 * <li>the part of the closing sentence from the quotation end onwards,</li>
 * <li>the whole previous sentence, if the quotation starts within the first two tokens,</li>
 * <li>the whole next sentence, if the quotation ends within the last two tokens.</li>
 * </ol>
 *
 * @param doc document whose speaker map is updated on success
 * @param utterNum utterance index of the quotation
 * @param sentences all sentences of the document
 * @param beginQuotation (sentence index, token index) of the first quoted token
 * @param endQuotation (sentence index, token index) where the quotation ends
 * @param dict dictionaries providing the reporting verbs
 */
private static void findQuotationSpeaker(Document doc, int utterNum, List<CoreMap> sentences,
    IntPair beginQuotation, IntPair endQuotation, Dictionaries dict) {
  final int beginSent = beginQuotation.get(0);
  final int beginTok = beginQuotation.get(1);
  final int endSent = endQuotation.get(0);
  final int endTok = endQuotation.get(1);

  if (findSpeaker(doc, utterNum, beginSent, sentences, 0, beginTok, dict)) {
    return;
  }

  final int endSentTokenCount = sentences.get(endSent).get(CoreAnnotations.TokensAnnotation.class).size();
  if (findSpeaker(doc, utterNum, endSent, sentences, endTok, endSentTokenCount, dict)) {
    return;
  }

  if (beginTok <= 1 && beginSent > 0) {
    final int prevSentTokenCount = sentences.get(beginSent - 1).get(CoreAnnotations.TokensAnnotation.class).size();
    if (findSpeaker(doc, utterNum, beginSent - 1, sentences, 0, prevSentTokenCount, dict)) {
      return;
    }
  }

  // Compare against the number of tokens in the sentence. CoreMap.size() is the number of
  // annotation keys stored on the sentence and says nothing about its length.
  if (endTok >= endSentTokenCount - 2 && sentences.size() > endSent + 1) {
    final int nextSentTokenCount = sentences.get(endSent + 1).get(CoreAnnotations.TokensAnnotation.class).size();
    findSpeaker(doc, utterNum, endSent + 1, sentences, 0, nextSentTokenCount, dict);
  }
}
/**
 * Searches tokens {@code [startIndex, endIndex)} of one sentence for a reporting verb
 * outside any quotation (utterance index 0) and tries to record its subject as the speaker
 * of utterance {@code utterNum}. If the verb itself has no subject, its ancestors on the
 * path to the root are tried for as long as they are tagged as verbs or modals.
 *
 * @return {@code true} as soon as a subject was found and recorded, {@code false} otherwise
 */
private static boolean findSpeaker(Document doc, int utterNum, int sentNum, List<CoreMap> sentences,
    int startIndex, int endIndex, Dictionaries dict) {
  final CoreMap sentence = sentences.get(sentNum);
  final List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

  for (int tokIdx = startIndex; tokIdx < endIndex; tokIdx++) {
    final CoreLabel token = tokens.get(tokIdx);
    if (token.get(CoreAnnotations.UtteranceAnnotation.class) != 0) {
      continue; // inside a quotation
    }
    final String lemma = token.lemma();
    final String word = token.word();
    final boolean reportingVerb = dict.reportVerb.contains(lemma) && token.tag().startsWith("V");
    if (!reportingVerb) {
      continue;
    }

    // find subject
    SemanticGraph graph = sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    if (graph == null) {
      graph = sentence.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    }
    final IndexedWord verbNode = graph.getNodeByWordPattern(word);
    if (verbNode == null) {
      Redwood.log("debug-preprocessor", "Cannot find node in dependency for word " + word);
      continue;
    }

    if (findSubject(doc, graph, verbNode, sentNum, utterNum)) {
      return true;
    }
    for (IndexedWord ancestor : graph.getPathToRoot(verbNode)) {
      final String ancestorTag = ancestor.tag();
      if (!(ancestorTag.startsWith("V") || ancestorTag.startsWith("MD"))) {
        break;
      }
      // handling something like "was talking", "can tell"
      if (findSubject(doc, graph, ancestor, sentNum, utterNum)) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Records the first "nsubj" child of {@code w} as the speaker of utterance {@code utterNum}.
 * The speaker is stored as the mention ID (as a string) when a mention is headed at the
 * subject's position, and as the subject's word otherwise.
 *
 * @return {@code true} if an "nsubj" child was found, {@code false} otherwise
 */
private static boolean findSubject(Document doc, SemanticGraph dependency, IndexedWord w, int sentNum, int utterNum) {
  for (Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(w)) {
    if (!child.first().getShortName().equals("nsubj")) {
      continue;
    }
    final IndexedWord subject = child.second();
    final String subjectWord = subject.word();

    final IntTuple headPosition = new IntTuple(2);
    headPosition.set(0, sentNum);
    headPosition.set(1, subject.index() - 1); // index() starts from 1

    final String speaker = doc.mentionheadPositions.containsKey(headPosition)
        ? Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID)
        : subjectWord;
    doc.speakers.put(utterNum, speaker);
    return true;
  }
  return false;
}
/**
 * Finds speakers in a conversation-type document.
 * <p>
 * First, every mention that has "I" (case-insensitively) among its predicate nominatives
 * is recorded as the speaker of the utterance of its head word. Then the sentences are
 * grouped into runs by the utterance index of their first token, and
 * {@code findParagraphSpeaker} is called for each run; the speaker it returns is passed on
 * to the call for the following run.
 *
 * @param doc document whose {@code speakers} map is filled
 * @param dict dictionaries forwarded to {@code findParagraphSpeaker}
 */
private static void findSpeakersInConversation(Document doc, Dictionaries dict) {
  for (List<Mention> l : doc.predictedMentions) {
    for (Mention m : l) {
      if (m.predicateNominatives == null) continue;
      for (Mention a : m.predicateNominatives) {
        // Locale.ROOT: under a Turkish or Azeri default locale "I" lower-cases to a
        // dotless i and would never equal "i".
        if (a.spanToString().toLowerCase(Locale.ROOT).equals("i")) {
          doc.speakers.put(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), Integer.toString(m.mentionID));
        }
      }
    }
  }

  List<CoreMap> paragraph = new ArrayList<>();
  int paragraphUtterIndex = 0;
  String nextParagraphSpeaker = "";
  int paragraphOffset = 0;
  for (CoreMap sent : doc.annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
    // NOTE(review): the sentence is added before the utterance check, so the sentence that
    // opens a new utterance is handed over with the previous run. Confirm this is intended.
    paragraph.add(sent);
    int currentUtter = sent.get(CoreAnnotations.TokensAnnotation.class).get(0).get(CoreAnnotations.UtteranceAnnotation.class);
    if (paragraphUtterIndex != currentUtter) {
      nextParagraphSpeaker = findParagraphSpeaker(doc, paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
      paragraphUtterIndex = currentUtter;
      paragraphOffset += paragraph.size();
      paragraph = new ArrayList<>();
    }
  }
  findParagraphSpeaker(doc, paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
}
/**
 * Determines the speaker of one paragraph (utterance) when none is recorded yet.
 * If a speaker was announced by the previous paragraph it is used; otherwise the last
 * sentence of the paragraph is scanned up to its first verb, and the last token tagged
 * PER* that heads a mention becomes the speaker, but only if no verb was found.
 *
 * @return the speaker announced for the next paragraph, as computed by
 *         {@code findNextParagraphSpeaker}; {@code ""} when the paragraph is empty
 */
private static String findParagraphSpeaker(Document doc, List<CoreMap> paragraph,
    int paragraphUtterIndex, String nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) {
  if (!doc.speakers.containsKey(paragraphUtterIndex)) {
    if (!nextParagraphSpeaker.isEmpty()) {
      doc.speakers.put(paragraphUtterIndex, nextParagraphSpeaker);
    } else {
      // find the speaker of this paragraph (John, nbc news)
      // cdm [Sept 2015] added this check to try to avoid crash
      if (paragraph.isEmpty()) {
        Redwood.log("debug-preprocessor", "Empty paragraph; skipping findParagraphSpeaker");
        return "";
      }
      final int lastSentIdx = paragraph.size() - 1;
      final List<CoreLabel> lastSentTokens = paragraph.get(lastSentIdx).get(CoreAnnotations.TokensAnnotation.class);
      String speaker = "";
      boolean verbSeen = false;
      for (int tokIdx = 0; tokIdx < lastSentTokens.size(); tokIdx++) {
        final CoreLabel token = lastSentTokens.get(tokIdx);
        final String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
        final String nerTag = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
        if (posTag.startsWith("V")) {
          verbSeen = true;
          break;
        }
        if (!nerTag.startsWith("PER")) {
          continue;
        }
        // (document-level sentence index, token index)
        final IntTuple headPosition = new IntTuple(2);
        headPosition.set(0, lastSentIdx + paragraphOffset);
        headPosition.set(1, tokIdx);
        if (doc.mentionheadPositions.containsKey(headPosition)) {
          speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID);
        }
      }
      if (!verbSeen && !speaker.isEmpty()) {
        doc.speakers.put(paragraphUtterIndex, speaker);
      }
    }
  }
  return findNextParagraphSpeaker(doc, paragraph, paragraphOffset, dict);
}
/**
 * Looks in the last sentence of a paragraph for the verbs "report" or "say" (by lemma) and
 * returns the mention ID, as a string, of an "nsubj" child that heads a mention whose NER
 * string starts with "PER". If several subjects qualify, the last one wins.
 * <p>
 * Tokens without a lemma, and verbs whose word cannot be located in the dependency graph,
 * are skipped.
 *
 * @param doc document providing the head-position-to-mention map
 * @param paragraph sentences of the paragraph; may be empty
 * @param paragraphOffset document-level index of the paragraph's first sentence
 * @param dict not used by this method
 * @return the speaker's mention ID as a string, or {@code ""} when none was found
 */
private static String findNextParagraphSpeaker(Document doc, List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
  if (paragraph.isEmpty()) {
    return "";
  }
  CoreMap lastSent = paragraph.get(paragraph.size() - 1);
  String speaker = "";
  for (CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
    // Constant-first comparison: a token without a lemma is simply not a match.
    String lemma = w.get(CoreAnnotations.LemmaAnnotation.class);
    if (!"report".equals(lemma) && !"say".equals(lemma)) {
      continue;
    }
    String word = w.get(CoreAnnotations.TextAnnotation.class);
    SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    if (dependency == null) {
      dependency = lastSent.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
    }
    IndexedWord t = dependency.getNodeByWordPattern(word);
    if (t == null) {
      // Same handling as in findSpeaker: the word has no node in the graph.
      Redwood.log("debug-preprocessor", "Cannot find node in dependency for word " + word);
      continue;
    }

    for (Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(t)) {
      if (child.first().getShortName().equals("nsubj")) {
        int subjectIndex = child.second().index(); // start from 1
        IntTuple headPosition = new IntTuple(2);
        headPosition.set(0, paragraph.size() - 1 + paragraphOffset);
        headPosition.set(1, subjectIndex - 1);
        if (doc.mentionheadPositions.containsKey(headPosition)
            && doc.mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
          speaker = Integer.toString(doc.mentionheadPositions.get(headPosition).mentionID);
        }
      }
    }
  }
  return speaker;
}
/**
 * Check one mention is the speaker of the other mention.
 * <p>
 * Returns {@code true} only when all of the following hold: {@code ant} is a non-plural
 * first person pronoun in the same sentence as {@code m}; exactly one quotation mark token
 * ({@code ``} or {@code ''}) lies strictly between the two heads; and the head of {@code m}
 * is the "nsubj" of a reporting verb in {@code m}'s enhanced dependency graph.
 *
 * @param m candidate speaker mention
 * @param ant candidate first person pronoun mention
 * @param dict dictionaries providing first person pronouns and reporting verbs
 * @return whether {@code m} is the speaker of {@code ant}
 */
public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {
  // Locale.ROOT keeps the pronoun lookup independent of the default locale
  // (e.g. "I" lower-cases to a dotless i under Turkish rules).
  if (!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase(Locale.ROOT))
      || ant.number == Number.PLURAL || ant.sentNum != m.sentNum) return false;

  int countQuotationMark = 0;
  for (int i = Math.min(m.headIndex, ant.headIndex) + 1; i < Math.max(m.headIndex, ant.headIndex); i++) {
    String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
    if (word.equals("``") || word.equals("''")) countQuotationMark++;
  }
  if (countQuotationMark != 1) return false;

  IndexedWord w = m.enhancedDependency.getNodeByWordPattern(m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
  if (w == null) return false;

  for (Pair<GrammaticalRelation,IndexedWord> parent : m.enhancedDependency.parentPairs(w)) {
    if (parent.first().getShortName().equals("nsubj")
        && dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
      return true;
    }
  }
  return false;
}
/**
 * Records list membership between every ordered pair of mentions of one sentence:
 * whenever one mention is a list member of the other, the list gets the member added
 * and the member is marked as belonging to the list.
 */
private static void markListMemberRelation(List<Mention> orderedMentions) {
  for (Mention outer : orderedMentions) {
    for (Mention inner : orderedMentions) {
      // Mark if inner and outer are in list relationship
      if (outer.isListMemberOf(inner)) {
        inner.addListMember(outer);
        outer.addBelongsToList(inner);
        continue;
      }
      if (inner.isListMemberOf(outer)) {
        outer.addListMember(inner);
        inner.addBelongsToList(outer);
      }
    }
  }
}
/**
 * Applies a set of (head index, head index) pairs to the mentions of one sentence.
 * For every ordered pair of distinct mentions that are not in a list relationship and
 * whose head indices equal a found pair, the relation named by {@code flag} is recorded
 * on the second mention.
 *
 * @param orderedMentions mentions of one sentence
 * @param foundPairs 0-based head index pairs, first component matched against the first mention
 * @param flag one of "APPOSITION", "PREDICATE_NOMINATIVE" or "RELATIVE_PRONOUN"
 * @throws RuntimeException if a pair matches and {@code flag} is none of the three values
 */
private static void markMentionRelation(List<Mention> orderedMentions, Set<Pair<Integer, Integer>> foundPairs, String flag) {
  for (Mention first : orderedMentions) {
    for (Mention second : orderedMentions) {
      if (first == second) {
        continue;
      }
      // Ignore if second and first are in list relationship
      final boolean listRelated =
          first.isListMemberOf(second) || second.isListMemberOf(first) || first.isMemberOfSameList(second);
      if (listRelated) {
        //Redwood.log("debug-preprocessor", "Not checking '" + first + "' and '" + second + "' for " + flag + ": in list relationship");
        continue;
      }
      for (Pair<Integer, Integer> pair : foundPairs) {
        final boolean headsMatch = pair.first() == first.headIndex && pair.second() == second.headIndex;
        if (!headsMatch) {
          continue;
        }
        switch (flag) {
          case "APPOSITION":
            if (!pair.first().equals(pair.second()) || second.insideIn(first)) {
              second.addApposition(first);
            }
            break;
          case "PREDICATE_NOMINATIVE":
            second.addPredicateNominatives(first);
            break;
          case "RELATIVE_PRONOUN":
            second.addRelativePronoun(first);
            break;
          default:
            throw new RuntimeException("check flag in markMentionRelation (dcoref/MentionExtractor.java)");
        }
      }
    }
  }
}
// private static final TregexPattern relativePronounPattern = TregexPattern.compile("NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))");
// private static void findRelativePronouns(Tree tree, Set<Pair<Integer, Integer>> relativePronounPairs) {
// findTreePattern(tree, relativePronounPattern, relativePronounPairs);
// }
}