//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
//
package edu.stanford.nlp.dcoref;
import java.io.Serializable;
import java.util.*;
import edu.stanford.nlp.dcoref.Dictionaries.Number;
import edu.stanford.nlp.dcoref.Dictionaries.Person;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.math.NumberMatchingRegex;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.TwoDimensionalMap;
import edu.stanford.nlp.util.TwoDimensionalSet;
public class Document implements Serializable {
private static final long serialVersionUID = -4139866807494603953L;
public enum DocType { CONVERSATION, ARTICLE }
/** The type of document: conversational or article */
public DocType docType;
/** Document annotation */
public Annotation annotation;
/** for conll shared task 2011 */
public CoNLL2011DocumentReader.Document conllDoc;
/** The list of gold mentions */
public List<List<Mention>> goldOrderedMentionsBySentence;
/** The list of predicted mentions */
public List<List<Mention>> predictedOrderedMentionsBySentence;
/** return the list of predicted mentions */
public List<List<Mention>> getOrderedMentions() {
return predictedOrderedMentionsBySentence;
}
/** Clusters for coreferent mentions */
public Map<Integer, CorefCluster> corefClusters;
/** Gold Clusters for coreferent mentions */
public Map<Integer, CorefCluster> goldCorefClusters;
/** For all mentions in a document, map mentionID to mention. */
public Map<Integer, Mention> allPredictedMentions;
public Map<Integer, Mention> allGoldMentions;
/** Set of roles (in role apposition) in a document */
public Set<Mention> roleSet;
/**
* Position of each mention in the input matrix
* Each mention occurrence with sentence # and position within sentence
* (Nth mention, not Nth token)
*/
public Map<Mention, IntTuple> positions; // mentions may be removed from this due to post processing
public Map<Mention, IntTuple> allPositions; // all mentions (mentions will not be removed from this)
public final Map<IntTuple, Mention> mentionheadPositions;
/** List of gold links in a document by positions */
private List<Pair<IntTuple,IntTuple>> goldLinks;
/** Map UtteranceAnnotation to String (speaker): mention ID or speaker string */
public Map<Integer, String> speakers;
/** Pair of mention id, and the mention's speaker id */
public Set<Pair<Integer, Integer>> speakerPairs;
public int maxUtter;
public int numParagraph;
public int numSentences;
/** Set of incompatible clusters pairs */
private TwoDimensionalSet<Integer, Integer> incompatibles;
private TwoDimensionalSet<Integer, Integer> incompatibleClusters;
protected TwoDimensionalMap<Integer, Integer, Boolean> acronymCache;
/** Map of speaker name/id to speaker info */
transient private Map<String, SpeakerInfo> speakerInfoMap = Generics.newHashMap();
public Document() {
positions = Generics.newHashMap();
mentionheadPositions = Generics.newHashMap();
roleSet = Generics.newHashSet();
corefClusters = Generics.newHashMap();
goldCorefClusters = null;
allPredictedMentions = Generics.newHashMap();
allGoldMentions = Generics.newHashMap();
speakers = Generics.newHashMap();
speakerPairs = Generics.newHashSet();
incompatibles = TwoDimensionalSet.hashSet();
incompatibleClusters = TwoDimensionalSet.hashSet();
acronymCache = TwoDimensionalMap.hashMap();
}
public Document(Annotation anno, List<List<Mention>> predictedMentions,
List<List<Mention>> goldMentions, Dictionaries dict) {
this();
annotation = anno;
numSentences = anno.get(CoreAnnotations.SentencesAnnotation.class).size();
predictedOrderedMentionsBySentence = predictedMentions;
goldOrderedMentionsBySentence = goldMentions;
if(goldMentions!=null) {
findTwinMentions(true);
// fill allGoldMentions
for(List<Mention> l : goldOrderedMentionsBySentence) {
for(Mention g : l) {
allGoldMentions.put(g.mentionID, g);
}
}
}
// set original ID, initial coref clusters, paragraph annotation, mention positions
initialize();
processDiscourse(dict);
printMentionDetection();
}
/** Process discourse information */
protected void processDiscourse(Dictionaries dict) {
docType = findDocType(dict);
markQuotations(this.annotation.get(CoreAnnotations.SentencesAnnotation.class), false);
findSpeakers(dict);
// find 'speaker mention' for each mention
for(Mention m : allPredictedMentions.values()) {
int utter = m.headWord.get(CoreAnnotations.UtteranceAnnotation.class);
String speaker = m.headWord.get(CoreAnnotations.SpeakerAnnotation.class);
if (speaker != null) {
// Populate speaker info
SpeakerInfo speakerInfo = speakerInfoMap.get(speaker);
if (speakerInfo == null) {
speakerInfoMap.put(speaker, speakerInfo = new SpeakerInfo(speaker));
// span indicates this is the speaker
if (Rules.mentionMatchesSpeaker(m, speakerInfo, true)) {
m.speakerInfo = speakerInfo;
}
}
if (NumberMatchingRegex.isDecimalInteger(speaker)) {
try{
int speakerMentionID = Integer.parseInt(speaker);
if (utter != 0) {
// Add pairs of mention id and the mention id of the speaker
speakerPairs.add(new Pair<>(m.mentionID, speakerMentionID));
// speakerPairs.add(new Pair<Integer, Integer>(speakerMentionID, m.mentionID));
}
} catch (Exception e){
// no mention found for the speaker
// nothing to do
}
}
}
// set generic 'you' : e.g., you know in conversation
if(docType!=DocType.ARTICLE && m.person==Person.YOU && m.endIndex < m.sentenceWords.size()-1
&& m.sentenceWords.get(m.endIndex).get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("know")) {
m.generic = true;
}
}
// now that we have identified the speakers, first pass to check if mentions should cluster with the speakers
for(Mention m : allPredictedMentions.values()) {
if (m.speakerInfo == null) {
for (SpeakerInfo speakerInfo: speakerInfoMap.values()) {
if (speakerInfo.hasRealSpeakerName()) {
// do loose match - assumes that there isn't that many speakers....
if (Rules.mentionMatchesSpeaker(m, speakerInfo, false)) {
m.speakerInfo = speakerInfo;
break;
}
}
}
}
}
}
/** Document initialize */
protected void initialize() {
if(goldOrderedMentionsBySentence==null) assignOriginalID();
setParagraphAnnotation();
initializeCorefCluster();
this.allPositions = Generics.newHashMap(this.positions);
}
/** initialize positions and corefClusters (put each mention in each CorefCluster) */
private void initializeCorefCluster() {
for(int i = 0; i < predictedOrderedMentionsBySentence.size(); i ++){
for(int j = 0; j < predictedOrderedMentionsBySentence.get(i).size(); j ++){
Mention m = predictedOrderedMentionsBySentence.get(i).get(j);
if (allPredictedMentions.containsKey(m.mentionID)) {
SieveCoreferenceSystem.logger.warning("WARNING: Already contain mention " + m.mentionID);
Mention m1 = allPredictedMentions.get(m.mentionID);
SieveCoreferenceSystem.logger.warning("OLD mention: " + m1.spanToString() + "[" + m1.startIndex + "," + m1.endIndex + "]");
SieveCoreferenceSystem.logger.warning("NEW mention: " + m.spanToString() + "[" + m.startIndex + "," + m.endIndex + "]");
// SieveCoreferenceSystem.debugPrintMentions(System.err, "PREDICTED ORDERED", predictedOrderedMentionsBySentence);
// SieveCoreferenceSystem.debugPrintMentions(System.err, "GOLD ORDERED", goldOrderedMentionsBySentence);
}
assert(!allPredictedMentions.containsKey(m.mentionID));
allPredictedMentions.put(m.mentionID, m);
IntTuple pos = new IntTuple(2);
pos.set(0, i);
pos.set(1, j);
positions.put(m, pos);
m.sentNum = i;
assert(!corefClusters.containsKey(m.mentionID));
corefClusters.put(m.mentionID, new CorefCluster(m.mentionID, Generics.newHashSet(Collections.singletonList(m))));
m.corefClusterID = m.mentionID;
IntTuple headPosition = new IntTuple(2);
headPosition.set(0, i);
headPosition.set(1, m.headIndex);
mentionheadPositions.put(headPosition, m);
}
}
}
public boolean isIncompatible(CorefCluster c1, CorefCluster c2) {
// Was any of the pairs of mentions marked as incompatible
int cid1 = Math.min(c1.clusterID, c2.clusterID);
int cid2 = Math.max(c1.clusterID, c2.clusterID);
return incompatibleClusters.contains(cid1,cid2);
}
// Update incompatibles for two clusters that are about to be merged
public void mergeIncompatibles(CorefCluster to, CorefCluster from) {
List<Pair<Pair<Integer,Integer>, Pair<Integer,Integer>>> replacements =
new ArrayList<>();
for (Pair<Integer, Integer> p : incompatibleClusters) {
Integer other = null;
if (p.first == from.clusterID) {
other = p.second;
} else if (p.second == from.clusterID) {
other = p.first;
}
if (other != null && other != to.clusterID) {
int cid1 = Math.min(other, to.clusterID);
int cid2 = Math.max(other, to.clusterID);
replacements.add(Pair.makePair(p, Pair.makePair(cid1, cid2)));
}
}
for (Pair<Pair<Integer,Integer>, Pair<Integer,Integer>> r:replacements) {
incompatibleClusters.remove(r.first.first(), r.first.second());
incompatibleClusters.add(r.second.first(), r.second.second());
}
}
public void mergeAcronymCache(CorefCluster to, CorefCluster from) {
TwoDimensionalSet<Integer, Integer> replacements = TwoDimensionalSet.hashSet();
for (Integer first : acronymCache.firstKeySet()) {
for (Integer second : acronymCache.get(first).keySet()) {
if (acronymCache.get(first, second)) {
Integer other = null;
if (first == from.clusterID) {
other = second;
} else if (second == from.clusterID) {
other = first;
}
if (other != null && other != to.clusterID) {
int cid1 = Math.min(other, to.clusterID);
int cid2 = Math.max(other, to.clusterID);
replacements.add(cid1, cid2);
}
}
}
}
for (Integer first : replacements.firstKeySet()) {
for (Integer second : replacements.secondKeySet(first)) {
acronymCache.put(first, second, true);
}
}
}
public boolean isIncompatible(Mention m1, Mention m2) {
int mid1 = Math.min(m1.mentionID, m2.mentionID);
int mid2 = Math.max(m1.mentionID, m2.mentionID);
return incompatibles.contains(mid1,mid2);
}
public void addIncompatible(Mention m1, Mention m2) {
int mid1 = Math.min(m1.mentionID, m2.mentionID);
int mid2 = Math.max(m1.mentionID, m2.mentionID);
incompatibles.add(mid1,mid2);
int cid1 = Math.min(m1.corefClusterID, m2.corefClusterID);
int cid2 = Math.max(m1.corefClusterID, m2.corefClusterID);
incompatibleClusters.add(cid1,cid2);
}
/** Mark twin mentions in gold and predicted mentions */
protected void findTwinMentions(boolean strict){
if(strict) findTwinMentionsStrict();
else findTwinMentionsRelaxed();
}
/** Mark twin mentions: All mention boundaries should be matched */
private void findTwinMentionsStrict(){
for(int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);
// For CoNLL training there are some documents with gold mentions with the same position offsets
// See /scr/nlp/data/conll-2011/v2/data/train/data/english/annotations/nw/wsj/09/wsj_0990.v2_auto_conll
// (Packwood - Roth)
CollectionValuedMap<IntPair, Mention> goldMentionPositions = new CollectionValuedMap<>();
for(Mention g : golds) {
IntPair ip = new IntPair(g.startIndex, g.endIndex);
if (goldMentionPositions.containsKey(ip)) {
StringBuilder existingMentions = new StringBuilder();
for (Mention eg: goldMentionPositions.get(ip)) {
if (existingMentions.length() > 0) {
existingMentions.append(",");
}
existingMentions.append(eg.mentionID);
}
SieveCoreferenceSystem.logger.warning("WARNING: gold mentions with the same offsets: " + ip
+ " mentions=" + g.mentionID + "," + existingMentions + ", " + g.spanToString());
}
//assert(!goldMentionPositions.containsKey(ip));
goldMentionPositions.add(new IntPair(g.startIndex, g.endIndex), g);
}
for(Mention p : predicts) {
IntPair pos = new IntPair(p.startIndex, p.endIndex);
if(goldMentionPositions.containsKey(pos)) {
Collection<Mention> cm = goldMentionPositions.get(pos);
Mention g = cm.iterator().next();
cm.remove(g);
p.mentionID = g.mentionID;
p.twinless = false;
g.twinless = false;
}
}
// temp: for making easy to recognize twinless mention
for(Mention p : predicts){
if(p.twinless) p.mentionID += 10000;
}
}
}
/** Mark twin mentions: heads of the mentions are matched */
private void findTwinMentionsRelaxed() {
for(int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);
Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
for(Mention g : golds) {
goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
if(!goldMentionHeadPositions.containsKey(g.headIndex)) {
goldMentionHeadPositions.put(g.headIndex, new LinkedList<>());
}
goldMentionHeadPositions.get(g.headIndex).add(g);
}
List<Mention> remains = new ArrayList<>();
for (Mention p : predicts) {
IntPair pos = new IntPair(p.startIndex, p.endIndex);
if(goldMentionPositions.containsKey(pos)) {
Mention g = goldMentionPositions.get(pos);
p.mentionID = g.mentionID;
p.twinless = false;
g.twinless = false;
goldMentionHeadPositions.get(g.headIndex).remove(g);
if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
goldMentionHeadPositions.remove(g.headIndex);
}
}
else remains.add(p);
}
for (Mention r : remains){
if(goldMentionHeadPositions.containsKey(r.headIndex)) {
Mention g = goldMentionHeadPositions.get(r.headIndex).poll();
r.mentionID = g.mentionID;
r.twinless = false;
g.twinless = false;
if(goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
goldMentionHeadPositions.remove(g.headIndex);
}
}
}
}
}
/** Set paragraph index */
private void setParagraphAnnotation() {
int paragraphIndex = 0;
int previousOffset = -10;
for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
if(w.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
if(w.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) > previousOffset+2) paragraphIndex++;
w.set(CoreAnnotations.ParagraphAnnotation.class, paragraphIndex);
previousOffset = w.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
} else {
w.set(CoreAnnotations.ParagraphAnnotation.class, -1);
}
}
}
for(List<Mention> l : predictedOrderedMentionsBySentence) {
for(Mention m : l){
m.paragraph = m.headWord.get(CoreAnnotations.ParagraphAnnotation.class);
}
}
numParagraph = paragraphIndex;
}
/** Find document type: Conversation or article */
private DocType findDocType(Dictionaries dict) {
boolean speakerChange = false;
Set<Integer> discourseWithIorYou = Generics.newHashSet();
for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
if(utterIndex!=0) speakerChange = true;
if(speakerChange && utterIndex==0) return DocType.ARTICLE;
if(dict.firstPersonPronouns.contains(w.get(CoreAnnotations.TextAnnotation.class).toLowerCase())
|| dict.secondPersonPronouns.contains(w.get(CoreAnnotations.TextAnnotation.class).toLowerCase())) {
discourseWithIorYou.add(utterIndex);
}
if(maxUtter < utterIndex) maxUtter = utterIndex;
}
}
if(!speakerChange) return DocType.ARTICLE;
return DocType.CONVERSATION; // in conversation, utter index keep increasing.
}
/** When there is no mentionID information (without gold annotation), assign mention IDs */
protected void assignOriginalID(){
List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
boolean hasOriginalID = true;
for(List<Mention> l : orderedMentionsBySentence){
if (l.size()==0) continue;
for(Mention m : l){
if(m.mentionID == -1){
hasOriginalID = false;
}
}
}
if(!hasOriginalID){
int id = 0;
for(List<Mention> l : orderedMentionsBySentence){
for(Mention m : l){
m.mentionID = id++;
}
}
}
}
/** Extract gold coref cluster information. */
public void extractGoldCorefClusters(){
goldCorefClusters = Generics.newHashMap();
for (List<Mention> mentions : goldOrderedMentionsBySentence) {
for (Mention m : mentions) {
int id = m.goldCorefClusterID;
if (id == -1) {
throw new RuntimeException("No gold info");
}
CorefCluster c = goldCorefClusters.get(id);
if (c == null) {
c = new CorefCluster(id);
goldCorefClusters.put(id, c);
}
c.corefMentions.add(m);
}
}
}
protected List<Pair<IntTuple, IntTuple>> getGoldLinks() {
if(goldLinks==null) this.extractGoldLinks();
return goldLinks;
}
/** Extract gold coref link information */
protected void extractGoldLinks() {
// List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
List<Pair<IntTuple, IntTuple>> links = new ArrayList<>();
// position of each mention in the input matrix, by id
Map<Integer, IntTuple> positions = Generics.newHashMap();
// positions of antecedents
Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
for(int i = 0; i < goldOrderedMentionsBySentence.size(); i ++){
for(int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j ++){
Mention m = goldOrderedMentionsBySentence.get(i).get(j);
int id = m.mentionID;
IntTuple pos = new IntTuple(2);
pos.set(0, i);
pos.set(1, j);
positions.put(id, pos);
antecedents.put(id, new ArrayList<>());
}
}
// SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
for (List<Mention> mentions : goldOrderedMentionsBySentence) {
for (Mention m : mentions) {
int id = m.mentionID;
IntTuple src = positions.get(id);
assert (src != null);
if (m.originalRef >= 0) {
IntTuple dst = positions.get(m.originalRef);
if (dst == null) {
throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
}
// to deal with cataphoric annotation
while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
m.originalRef = dstMention.originalRef;
dstMention.originalRef = id;
if (m.originalRef < 0) break;
dst = positions.get(m.originalRef);
}
if (m.originalRef < 0) continue;
// A B C: if A<-B, A<-C => make a link B<-C
for (int k = dst.get(0); k <= src.get(0); k++) {
for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) {
if (k == dst.get(0) && l < dst.get(1)) continue;
if (k == src.get(0) && l > src.get(1)) break;
IntTuple missed = new IntTuple(2);
missed.set(0, k);
missed.set(1, l);
if (links.contains(new Pair<>(missed, dst))) {
antecedents.get(id).add(missed);
links.add(new Pair<>(src, missed));
}
}
}
links.add(new Pair<>(src, dst));
assert (antecedents.get(id) != null);
antecedents.get(id).add(dst);
List<IntTuple> ants = antecedents.get(m.originalRef);
assert (ants != null);
for (IntTuple ant : ants) {
antecedents.get(id).add(ant);
links.add(new Pair<>(src, ant));
}
}
}
}
goldLinks = links;
}
/** set UtteranceAnnotation for quotations: default UtteranceAnnotation = 0 is given */
private void markQuotations(List<CoreMap> results, boolean normalQuotationType) {
boolean insideQuotation = false;
for(CoreMap m : results) {
for(CoreLabel l : m.get(CoreAnnotations.TokensAnnotation.class)) {
String w = l.get(CoreAnnotations.TextAnnotation.class);
boolean noSpeakerInfo = !l.containsKey(CoreAnnotations.SpeakerAnnotation.class)
|| l.get(CoreAnnotations.SpeakerAnnotation.class).equals("")
|| l.get(CoreAnnotations.SpeakerAnnotation.class).startsWith("PER");
if(w.equals("``")
|| (!insideQuotation && normalQuotationType && w.equals("\""))) {
insideQuotation = true;
maxUtter++;
continue;
} else if(w.equals("''")
|| (insideQuotation && normalQuotationType && w.equals("\""))) {
insideQuotation = false;
}
if(insideQuotation) {
l.set(CoreAnnotations.UtteranceAnnotation.class, maxUtter);
}
if(noSpeakerInfo){
l.set(CoreAnnotations.SpeakerAnnotation.class, "PER"+l.get(CoreAnnotations.UtteranceAnnotation.class));
}
}
}
if(maxUtter==0 && !normalQuotationType) markQuotations(results, true);
}
/** Speaker extraction */
private void findSpeakers(Dictionaries dict) {
Boolean useMarkedDiscourseBoolean = annotation.get(CoreAnnotations.UseMarkedDiscourseAnnotation.class);
boolean useMarkedDiscourse = (useMarkedDiscourseBoolean != null)? useMarkedDiscourseBoolean: false;
if (Constants.USE_GOLD_SPEAKER_TAGS || useMarkedDiscourse) {
for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
speakers.put(utterIndex, w.get(CoreAnnotations.SpeakerAnnotation.class));
}
}
} else {
if(docType==DocType.CONVERSATION) findSpeakersInConversation(dict);
else if (docType==DocType.ARTICLE) findSpeakersInArticle(dict);
// set speaker info to annotation
for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
for(CoreLabel w : sent.get(CoreAnnotations.TokensAnnotation.class)) {
int utterIndex = w.get(CoreAnnotations.UtteranceAnnotation.class);
if(speakers.containsKey(utterIndex)) {
w.set(CoreAnnotations.SpeakerAnnotation.class, speakers.get(utterIndex));
}
}
}
}
}
private void findSpeakersInArticle(Dictionaries dict) {
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
Pair<Integer, Integer> beginQuotation = new Pair<>();
Pair<Integer, Integer> endQuotation = new Pair<>();
boolean insideQuotation = false;
int utterNum = -1;
for (int i = 0 ; i < sentences.size(); i++) {
List<CoreLabel> sent = sentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
for(int j = 0 ; j < sent.size() ; j++) {
int utterIndex = sent.get(j).get(CoreAnnotations.UtteranceAnnotation.class);
if(utterIndex != 0 && !insideQuotation) {
utterNum = utterIndex;
insideQuotation = true;
beginQuotation.setFirst(i);
beginQuotation.setSecond(j);
} else if (utterIndex == 0 && insideQuotation) {
insideQuotation = false;
endQuotation.setFirst(i);
endQuotation.setSecond(j);
findQuotationSpeaker(utterNum, sentences, beginQuotation, endQuotation, dict);
}
}
}
}
private void findQuotationSpeaker(int utterNum, List<CoreMap> sentences,
Pair<Integer, Integer> beginQuotation, Pair<Integer, Integer> endQuotation, Dictionaries dict) {
if(findSpeaker(utterNum, beginQuotation.first(), sentences, 0, beginQuotation.second(), dict))
return ;
if(findSpeaker(utterNum, endQuotation.first(), sentences, endQuotation.second(),
sentences.get(endQuotation.first()).get(CoreAnnotations.TokensAnnotation.class).size(), dict))
return;
if(beginQuotation.second() <= 1 && beginQuotation.first() > 0) {
if(findSpeaker(utterNum, beginQuotation.first()-1, sentences, 0,
sentences.get(beginQuotation.first()-1).get(CoreAnnotations.TokensAnnotation.class).size(), dict))
return;
}
if(endQuotation.second() == sentences.get(endQuotation.first()).size()-1
&& sentences.size() > endQuotation.first()+1) {
if(findSpeaker(utterNum, endQuotation.first()+1, sentences, 0,
sentences.get(endQuotation.first()+1).get(CoreAnnotations.TokensAnnotation.class).size(), dict))
return;
}
}
private boolean findSpeaker(int utterNum, int sentNum, List<CoreMap> sentences,
int startIndex, int endIndex, Dictionaries dict) {
List<CoreLabel> sent = sentences.get(sentNum).get(CoreAnnotations.TokensAnnotation.class);
for(int i = startIndex ; i < endIndex ; i++) {
if(sent.get(i).get(CoreAnnotations.UtteranceAnnotation.class)!=0) continue;
String lemma = sent.get(i).get(CoreAnnotations.LemmaAnnotation.class);
String word = sent.get(i).get(CoreAnnotations.TextAnnotation.class);
if(dict.reportVerb.contains(lemma)) {
// find subject
SemanticGraph dependency = sentences.get(sentNum).get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
IndexedWord w = dependency.getNodeByWordPattern(word);
if (w != null) {
for(Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(w)){
if(child.first().getShortName().equals("nsubj")) {
String subjectString = child.second().word();
int subjectIndex = child.second().index(); // start from 1
IntTuple headPosition = new IntTuple(2);
headPosition.set(0, sentNum);
headPosition.set(1, subjectIndex-1);
String speaker;
if(mentionheadPositions.containsKey(headPosition)) {
speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
} else {
speaker = subjectString;
}
speakers.put(utterNum, speaker);
return true;
}
}
} else {
SieveCoreferenceSystem.logger.warning("Cannot find node in dependency for word " + word);
}
}
}
return false;
}
private void findSpeakersInConversation(Dictionaries dict) {
for(List<Mention> l : predictedOrderedMentionsBySentence) {
for(Mention m : l){
if(m.predicateNominatives == null) continue;
for (Mention a : m.predicateNominatives){
if(a.spanToString().toLowerCase().equals("i")) {
speakers.put(m.headWord.get(CoreAnnotations.UtteranceAnnotation.class), Integer.toString(m.mentionID));
}
}
}
}
List<CoreMap> paragraph = new ArrayList<>();
int paragraphUtterIndex = 0;
String nextParagraphSpeaker = "";
int paragraphOffset = 0;
for(CoreMap sent : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
int currentUtter = sent.get(CoreAnnotations.TokensAnnotation.class).get(0).get(CoreAnnotations.UtteranceAnnotation.class);
if(paragraphUtterIndex!=currentUtter) {
nextParagraphSpeaker = findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
paragraphUtterIndex = currentUtter;
paragraphOffset += paragraph.size();
paragraph = new ArrayList<>();
}
paragraph.add(sent);
}
findParagraphSpeaker(paragraph, paragraphUtterIndex, nextParagraphSpeaker, paragraphOffset, dict);
}
private String findParagraphSpeaker(List<CoreMap> paragraph,
int paragraphUtterIndex, String nextParagraphSpeaker, int paragraphOffset, Dictionaries dict) {
if(!speakers.containsKey(paragraphUtterIndex)) {
if(!nextParagraphSpeaker.equals("")) {
speakers.put(paragraphUtterIndex, nextParagraphSpeaker);
} else { // find the speaker of this paragraph (John, nbc news)
CoreMap lastSent = paragraph.get(paragraph.size()-1);
String speaker = "";
boolean hasVerb = false;
for(int i = 0 ; i < lastSent.get(CoreAnnotations.TokensAnnotation.class).size() ; i++){
CoreLabel w = lastSent.get(CoreAnnotations.TokensAnnotation.class).get(i);
String pos = w.get(CoreAnnotations.PartOfSpeechAnnotation.class);
String ner = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if(pos.startsWith("V")) {
hasVerb = true;
break;
}
if(ner.startsWith("PER")) {
IntTuple headPosition = new IntTuple(2);
headPosition.set(0, paragraph.size()-1 + paragraphOffset);
headPosition.set(1, i);
if(mentionheadPositions.containsKey(headPosition)) {
speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
}
}
}
if(!hasVerb && !speaker.equals("")) {
speakers.put(paragraphUtterIndex, speaker);
}
}
}
return findNextParagraphSpeaker(paragraph, paragraphOffset, dict);
}
private String findNextParagraphSpeaker(List<CoreMap> paragraph, int paragraphOffset, Dictionaries dict) {
CoreMap lastSent = paragraph.get(paragraph.size()-1);
String speaker = "";
for(CoreLabel w : lastSent.get(CoreAnnotations.TokensAnnotation.class)) {
if(w.get(CoreAnnotations.LemmaAnnotation.class).equals("report") || w.get(CoreAnnotations.LemmaAnnotation.class).equals("say")) {
String word = w.get(CoreAnnotations.TextAnnotation.class);
SemanticGraph dependency = lastSent.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
IndexedWord t = dependency.getNodeByWordPattern(word);
for(Pair<GrammaticalRelation,IndexedWord> child : dependency.childPairs(t)){
if(child.first().getShortName().equals("nsubj")) {
int subjectIndex = child.second().index(); // start from 1
IntTuple headPosition = new IntTuple(2);
headPosition.set(0, paragraph.size()-1 + paragraphOffset);
headPosition.set(1, subjectIndex-1);
if(mentionheadPositions.containsKey(headPosition)
&& mentionheadPositions.get(headPosition).nerString.startsWith("PER")) {
speaker = Integer.toString(mentionheadPositions.get(headPosition).mentionID);
}
}
}
}
}
return speaker;
}
public SpeakerInfo getSpeakerInfo(String speaker) {
return speakerInfoMap.get(speaker);
}
public int numberOfSpeakers() {
return speakerInfoMap.size();
}
/** Check one mention is the speaker of the other mention */
public static boolean isSpeaker(Mention m, Mention ant, Dictionaries dict) {
if(!dict.firstPersonPronouns.contains(ant.spanToString().toLowerCase())
|| ant.number==Number.PLURAL || ant.sentNum!=m.sentNum) return false;
int countQuotationMark = 0;
for(int i = Math.min(m.headIndex, ant.headIndex)+1 ; i < Math.max(m.headIndex, ant.headIndex) ; i++) {
String word = m.sentenceWords.get(i).get(CoreAnnotations.TextAnnotation.class);
if(word.equals("``") || word.equals("''")) countQuotationMark++;
}
if(countQuotationMark!=1) return false;
IndexedWord w = m.dependency.getNodeByWordPattern(m.sentenceWords.get(m.headIndex).get(CoreAnnotations.TextAnnotation.class));
if(w== null) return false;
for(Pair<GrammaticalRelation,IndexedWord> parent : m.dependency.parentPairs(w)){
if(parent.first().getShortName().equals("nsubj")
&& dict.reportVerb.contains(parent.second().get(CoreAnnotations.LemmaAnnotation.class))) {
return true;
}
}
return false;
}
protected void printMentionDetection() {
int foundGoldCount = 0;
for(Mention g : allGoldMentions.values()) {
if(!g.twinless) foundGoldCount++;
}
SieveCoreferenceSystem.logger.fine("# of found gold mentions: "+foundGoldCount + " / # of gold mentions: "+allGoldMentions.size());
SieveCoreferenceSystem.logger.fine("gold mentions == ");
}
}