package edu.stanford.nlp.coref.hybrid;
import java.io.FileNotFoundException;
import java.text.DecimalFormat;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import edu.stanford.nlp.coref.data.CorefCluster;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Dictionaries.MentionType;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.hybrid.sieve.DiscourseMatch;
import edu.stanford.nlp.coref.hybrid.sieve.ExactStringMatch;
import edu.stanford.nlp.coref.hybrid.sieve.PreciseConstructs;
import edu.stanford.nlp.coref.hybrid.sieve.PronounMatch;
import edu.stanford.nlp.coref.hybrid.sieve.RFSieve;
import edu.stanford.nlp.coref.hybrid.sieve.RelaxedExactStringMatch;
import edu.stanford.nlp.coref.hybrid.sieve.RelaxedHeadMatch;
import edu.stanford.nlp.coref.hybrid.sieve.Sieve;
import edu.stanford.nlp.coref.hybrid.sieve.SpeakerMatch;
import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch1;
import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch2;
import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch3;
import edu.stanford.nlp.coref.hybrid.sieve.StrictHeadMatch4;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SpeakerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.UtteranceAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.math.NumberMatchingRegex;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
/**
* Prints CoNLL-style output from a {@link Document}
* @author heeyoung
*/
public class HybridCorefPrinter {
/** A logger for this class */
private static Redwood.RedwoodChannels log = Redwood.channels(HybridCorefPrinter.class);
public static final DecimalFormat df = new DecimalFormat("#.####");
// for debug
// public static final ChineseHeadMatch dcorefChineseHeadMatch = new ChineseHeadMatch(StringUtils.argsToProperties(new String[]{"-coref.language", "zh"}));
public static final SpeakerMatch dcorefSpeaker = new SpeakerMatch();
public static final DiscourseMatch dcorefDiscourse = new DiscourseMatch();
public static final ExactStringMatch dcorefExactString = new ExactStringMatch();
public static final RelaxedExactStringMatch dcorefRelaxedExactString = new RelaxedExactStringMatch();
public static final PreciseConstructs dcorefPreciseConstructs = new PreciseConstructs();
public static final StrictHeadMatch1 dcorefHead1 = new StrictHeadMatch1();
public static final StrictHeadMatch2 dcorefHead2 = new StrictHeadMatch2();
public static final StrictHeadMatch3 dcorefHead3 = new StrictHeadMatch3();
public static final StrictHeadMatch4 dcorefHead4 = new StrictHeadMatch4();
public static final RelaxedHeadMatch dcorefRelaxedHead = new RelaxedHeadMatch();
public static final PronounMatch dcorefPronounSieve = new PronounMatch();
/** Print raw document for analysis */
public static String printRawDoc(Document document, boolean gold, boolean printClusterID) throws FileNotFoundException {
StringBuilder sb = new StringBuilder();
List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
StringBuilder doc = new StringBuilder();
for(int i = 0 ; i<sentences.size(); i++) {
doc.append(sentenceStringWithMention(i, document, gold, printClusterID));
doc.append("\n");
}
sb.append("PRINT RAW DOC START\n");
sb.append(document.annotation.get(CoreAnnotations.DocIDAnnotation.class)).append("\n");
if (gold) {
sb.append("New DOC: (GOLD MENTIONS) ==================================================\n");
} else {
sb.append("New DOC: (Predicted Mentions) ==================================================\n");
}
sb.append(doc.toString()).append("\n");
sb.append("PRINT RAW DOC END").append("\n");
return sb.toString();
}
public static String printErrorLog(Mention m, Document document, Counter<Integer> probs, int mIdx, Dictionaries dict, RFSieve sieve) throws Exception {
StringBuilder sb = new StringBuilder();
sb.append("\nERROR START-----------------------------------------------------------------------\n");
sb.append("RESOLVER TYPE: mType: "+sieve.mType +", aType: "+sieve.aType).append("\n");
sb.append("DOCUMENT: "+document.docInfo.get("DOC_ID")+", "+document.docInfo.get("DOC_PART")).append("\n");
List<Mention> orderedAnts = new ArrayList<>();
sb.append("\nGOLD CLUSTER ID\n");
for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) {
if(sentDist == sieve.maxSentDist) sb.append("\tstart compare from here-------------\n");
int sentIdx = m.sentNum-sentDist;
sb.append("\tSENT "+sentIdx+"\t"+sentenceStringWithMention(sentIdx, document, true, true)).append("\n");
}
sb.append("\nMENTION ID\n");
for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) {
if(sentDist == sieve.maxSentDist) sb.append("\tstart compare from here-------------\n");
int sentIdx = m.sentNum-sentDist;
sb.append("\tSENT "+sentIdx+"\t"+sentenceStringWithMention(sentIdx, document, false, false)).append("\n");
}
// get dcoref antecedents ordering
for(int sentDist=0 ; sentDist <= Math.min(sieve.maxSentDist, m.sentNum) ; sentDist++) {
int sentIdx = m.sentNum-sentDist;
orderedAnts.addAll(Sieve.getOrderedAntecedents(m, sentIdx, mIdx, document.predictedMentions, dict));
}
Map<Integer, Integer> orders = Generics.newHashMap();
for(int i=0 ; i<orderedAnts.size() ; i++) {
Mention ant = orderedAnts.get(i);
orders.put(ant.mentionID, i);
}
CorefCluster mC = document.corefClusters.get(m.corefClusterID);
boolean isFirstMention = isFirstMention(m, document);
boolean foundCorefAnt = (probs.size() > 0 && Counters.max(probs) > sieve.thresMerge);
boolean correctDecision = ( (isFirstMention && !foundCorefAnt)
|| (foundCorefAnt && Sieve.isReallyCoref(document, m.mentionID, Counters.argmax(probs))) );
boolean barePlural = (m.originalSpan.size()==1 && m.headWord.tag().equals("NNS"));
if(correctDecision) return "";
sb.append("\nMENTION: "+m.spanToString()+" ("+m.mentionID
+")\tperson: "+m.person+"\tsingleton? "+ (!m.hasTwin) +"\t\tisFirstMention? "+isFirstMention
+"\t\tfoundAnt? "+foundCorefAnt+"\t\tcorrectDecision? "+correctDecision+"\tbarePlural? "+barePlural);
sb.append("\n\ttype: "+m.mentionType+"\tHeadword: "+m.headWord.word()+"\tNEtype: "+m.nerString+"\tnumber: "+m.number+"\tgender: "+m.gender+"\tanimacy: "+m.animacy).append("\n");
if(m.contextParseTree!=null) sb.append(m.contextParseTree.pennString());
sb.append("\n\n\t\tOracle\t\tDcoref\t\t\tRF\t\tAntecedent\n");
for(int antID : Counters.toSortedList(probs)) {
Mention ant = document.predictedMentionsByID.get(antID);
CorefCluster aC = document.corefClusters.get(ant.corefClusterID);
boolean oracle = Sieve.isReallyCoref(document, m.mentionID, antID);
double prob = probs.getCount(antID);
int order = orders.get(antID);
String oracleStr = (oracle)? "coref " : "notcoref";
// String dcorefStr = (dcoref)? "coref " : "notcoref";
String dcorefStr = "notcoref";
if(dcorefDiscourse.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-discourse";
// else if(dcorefChineseHeadMatch.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-chineseHeadMatch";
else if(dcorefExactString.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-exactString";
else if(dcorefRelaxedExactString.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-relaxedExact";
else if(dcorefPreciseConstructs.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-preciseConstruct";
else if(dcorefHead1.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head1";
else if(dcorefHead2.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head2";
else if(dcorefHead3.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head3";
else if(dcorefHead4.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head4";
else if(dcorefRelaxedHead.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-relaxedHead";
else if(dcorefPronounSieve.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-pronounSieve";
else if(dcorefSpeaker.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-speaker";
dcorefStr += "\t"+String.valueOf(order);
String probStr = df.format(prob);
sb.append("\t\t"+oracleStr+"\t"+dcorefStr+"\t"+probStr+"\t\t"+ant.spanToString()+" ("+ant.mentionID+")\n");
}
sb.append("ERROR END -----------------------------------------------------------------------\n");
return sb.toString();
}
static boolean isFirstMention(Mention m, Document document) {
if(!m.hasTwin) return true;
Mention twinGold = document.goldMentionsByID.get(m.mentionID);
for(Mention coref : document.goldCorefClusters.get(twinGold.goldCorefClusterID).getCorefMentions()) {
if(coref==twinGold) continue;
if(coref.appearEarlierThan(twinGold)) return false;
}
return true;
}
public static String sentenceStringWithMention(int i, Document document, boolean gold, boolean printClusterID) {
StringBuilder sentStr = new StringBuilder();
List<CoreMap> sentences = document.annotation.get(CoreAnnotations.SentencesAnnotation.class);
List<List<Mention>> allMentions;
if (gold) {
allMentions = document.goldMentions;
} else {
allMentions = document.predictedMentions;
}
// String filename = document.annotation.get()
int previousOffset = 0;
CoreMap sentence = sentences.get(i);
List<Mention> mentions = allMentions.get(i);
List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
String speaker = t.get(0).get(SpeakerAnnotation.class);
if(NumberMatchingRegex.isDecimalInteger(speaker)) speaker = speaker + ": "+document.predictedMentionsByID.get(Integer.parseInt(speaker)).spanToString();
sentStr.append("\tspeaker: "+speaker+" ("+t.get(0).get(UtteranceAnnotation.class)+") ");
String[] tokens = new String[t.size()];
for(CoreLabel c : t) {
tokens[c.index()-1] = c.word();
}
// if(previousOffset+2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) && printClusterID) {
// sentStr.append("\n");
// }
previousOffset = t.get(t.size()-1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
Counter<Integer> startCounts = new ClassicCounter<>();
Counter<Integer> endCounts = new ClassicCounter<>();
Map<Integer, Deque<Mention>> endMentions = Generics.newHashMap();
for (Mention m : mentions) {
// if(!gold && (document.corefClusters.get(m.corefClusterID)==null || document.corefClusters.get(m.corefClusterID).getCorefMentions().size()<=1)) {
// continue;
// }
startCounts.incrementCount(m.startIndex);
endCounts.incrementCount(m.endIndex);
if(!endMentions.containsKey(m.endIndex)) endMentions.put(m.endIndex, new ArrayDeque<>());
endMentions.get(m.endIndex).push(m);
}
for (int j = 0 ; j < tokens.length; j++){
if(endMentions.containsKey(j)) {
for(Mention m : endMentions.get(j)){
int id = (gold)? m.goldCorefClusterID: m.corefClusterID;
id = (printClusterID)? id : m.mentionID;
sentStr.append("]_").append(id);
}
}
for (int k = 0 ; k < startCounts.getCount(j) ; k++) {
if (sentStr.length() > 0 && sentStr.charAt(sentStr.length()-1) != '[') sentStr.append(" ");
sentStr.append("[");
}
if (sentStr.length() > 0 && sentStr.charAt(sentStr.length()-1)!='[') sentStr.append(" ");
sentStr.append(tokens[j]);
}
if(endMentions.containsKey(tokens.length)) {
for(Mention m : endMentions.get(tokens.length)){
int id = (gold)? m.goldCorefClusterID: m.corefClusterID;
id = (printClusterID)? id : m.mentionID;
sentStr.append("]_").append(id); //append("_").append(m.mentionID);
}
}
// sentStr.append("\n");
return sentStr.toString();
}
public static String printMentionDetectionLog(Document document) {
StringBuilder sbLog = new StringBuilder();
List<CoreMap> sentences = document.annotation.get(SentencesAnnotation.class);
sbLog.append("\nERROR START-----------------------------------------------------------------------\n");
for(int i=0 ; i < sentences.size() ; i++) {
sbLog.append("\nSENT ").append(i).append(" GOLD : ").append(HybridCorefPrinter.sentenceStringWithMention(i, document, true, false)).append("\n");
sbLog.append("SENT ").append(i).append(" PREDICT: ").append(HybridCorefPrinter.sentenceStringWithMention(i, document, false, false)).append("\n");
// for(CoreLabel cl : sentences.get(i).get(TokensAnnotation.class)) {
// sbLog.append(cl.word()).append("-").append(cl.get(UtteranceAnnotation.class)).append("-").append(cl.get(SpeakerAnnotation.class)).append(" ");
// }
for(Mention p : document.predictedMentions.get(i)) {
sbLog.append("\n");
if(!p.hasTwin) sbLog.append("\tSPURIOUS");
sbLog.append("\tmention: ").append(p.spanToString()).append("\t\t\theadword: ").append(p.headString).append("\tPOS: ").append(p.headWord.tag()).append("\tmentiontype: ").append(p.mentionType).append("\tnumber: ").append(p.number).append("\tgender: ").append(p.gender).append("\tanimacy: ").append(p.animacy).append("\tperson: ").append(p.person).append("\tNE: ").append(p.nerString);
}
sbLog.append("\n");
for(Mention g : document.goldMentions.get(i)){
if(!g.hasTwin) {
sbLog.append("\tmissed gold: ").append(g.spanToString()).append("\tPOS: ").append(g.headWord.tag()).append("\tmentiontype: ").append(g.mentionType).append("\theadword: ").append(g.headString).append("\tnumber: ").append(g.number).append("\tgender: ").append(g.gender).append("\tanimacy: ").append(g.animacy).append("\tperson: ").append(g.person).append("\tNE: ").append(g.nerString).append("\n");
if(g.sentenceWords!=null)
if(g.sentenceWords.size() > g.endIndex) sbLog.append("\tnextword: ").append(g.sentenceWords.get(g.endIndex)).append("\t").append(g.sentenceWords.get(g.endIndex).tag()).append("\n");
if(g.contextParseTree!=null) sbLog.append(g.contextParseTree.pennString()).append("\n\n");
else sbLog.append("\n\n");
}
}
if(sentences.get(i).get(TreeAnnotation.class)!=null) sbLog.append("\n\tparse: \n").append(sentences.get(i).get(TreeAnnotation.class).pennString());
sbLog.append("\n\tcollapsedDependency: \n").append(sentences.get(i).get(BasicDependenciesAnnotation.class));
}
sbLog.append("ERROR END -----------------------------------------------------------------------\n");
return sbLog.toString();
}
public static String printErrorLogDcoref(Mention m, Mention found, Document document, Dictionaries dict, int mIdx, String whichResolver) throws Exception {
StringBuilder sb = new StringBuilder();
sb.append("\nERROR START-----------------------------------------------------------------------\n");
sb.append("RESOLVER TYPE: ").append(whichResolver).append("\n");
sb.append("DOCUMENT: "+document.docInfo.get("DOC_ID")+", "+document.docInfo.get("DOC_PART")).append("\n");
List<Mention> orderedAnts = new ArrayList<>();
sb.append("\nGOLD CLUSTER ID\n");
for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) {
int sentIdx = m.sentNum-sentDist;
sb.append("\tSENT "+sentIdx+"\t"+sentenceStringWithMention(sentIdx, document, true, true)).append("\n");
}
sb.append("\nMENTION ID\n");
for(int sentDist=m.sentNum ; sentDist >= 0 ; sentDist--) {
int sentIdx = m.sentNum-sentDist;
sb.append("\tSENT "+sentIdx+"\t"+sentenceStringWithMention(sentIdx, document, false, false)).append("\n");
}
// get dcoref antecedents ordering
for(int sentDist=0 ; sentDist <= m.sentNum ; sentDist++) {
int sentIdx = m.sentNum-sentDist;
orderedAnts.addAll(Sieve.getOrderedAntecedents(m, sentIdx, mIdx, document.predictedMentions, dict));
}
Map<Integer, Integer> orders = Generics.newHashMap();
for(int i=0 ; i<orderedAnts.size() ; i++) {
Mention ant = orderedAnts.get(i);
orders.put(ant.mentionID, i);
}
CorefCluster mC = document.corefClusters.get(m.corefClusterID);
boolean isFirstMention = isFirstMention(m, document);
boolean foundCorefAnt = true; // we're printing only mentions that found coref antecedent
boolean correctDecision = document.isCoref(m, found);
if(correctDecision) return "";
sb.append("\nMENTION: "+m.spanToString()+" ("+m.mentionID
+")\tperson: "+m.person+"\tsingleton? "+ (!m.hasTwin) +"\t\tisFirstMention? "+isFirstMention
+"\t\tfoundAnt? "+foundCorefAnt+"\t\tcorrectDecision? "+correctDecision);
sb.append("\n\ttype: "+m.mentionType+"\tHeadword: "+m.headWord.word()+"\tNEtype: "+m.nerString+"\tnumber: "+m.number+"\tgender: "+m.gender+"\tanimacy: "+m.animacy).append("\n");
if(m.contextParseTree!=null) sb.append(m.contextParseTree.pennString());
sb.append("\n\n\t\tOracle\t\tDcoref\t\t\tRF\t\tAntecedent\n");
for(Mention ant : orderedAnts) {
int antID = ant.mentionID;
CorefCluster aC = document.corefClusters.get(ant.corefClusterID);
boolean oracle = Sieve.isReallyCoref(document, m.mentionID, antID);
int order = orders.get(antID);
String oracleStr = (oracle)? "coref " : "notcoref";
// String dcorefStr = (dcoref)? "coref " : "notcoref";
String dcorefStr = "notcoref";
if(dcorefSpeaker.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-speaker";
// else if(dcorefChineseHeadMatch.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-chineseHeadMatch";
else if(dcorefDiscourse.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-discourse";
else if(dcorefExactString.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-exactString";
else if(dcorefRelaxedExactString.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-relaxedExact";
else if(dcorefPreciseConstructs.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-preciseConstruct";
else if(dcorefHead1.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head1";
else if(dcorefHead2.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head2";
else if(dcorefHead3.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head3";
else if(dcorefHead4.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-head4";
else if(dcorefRelaxedHead.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-relaxedHead";
else if(dcorefPronounSieve.coreferent(document, mC, aC, m, ant, dict, null)) dcorefStr = "coref-pronounSieve";
dcorefStr += "\t"+String.valueOf(order);
sb.append("\t\t"+oracleStr+"\t"+dcorefStr+"\t\t"+ant.spanToString()+" ("+ant.mentionID+")\n");
}
sb.append("ERROR END -----------------------------------------------------------------------\n");
return sb.toString();
}
public static void linkDistanceAnalysis(String[] args) throws Exception {
Properties props = StringUtils.argsToProperties(args);
HybridCorefSystem cs = new HybridCorefSystem(props);
cs.docMaker.resetDocs();
Counter<Integer> proper = new ClassicCounter<>();
Counter<Integer> common = new ClassicCounter<>();
Counter<Integer> pronoun = new ClassicCounter<>();
Counter<Integer> list = new ClassicCounter<>();
while(true) {
Document document = cs.docMaker.nextDoc();
if(document==null) break;
for(int sentIdx=0 ; sentIdx < document.predictedMentions.size() ; sentIdx++) {
List<Mention> predictedInSent = document.predictedMentions.get(sentIdx);
for(int mIdx = 0 ; mIdx < predictedInSent.size() ; mIdx++) {
Mention m = predictedInSent.get(mIdx);
loop:
for(int distance=0 ; distance <= sentIdx ; distance++) {
List<Mention> candidates = Sieve.getOrderedAntecedents(m, sentIdx-distance, mIdx, document.predictedMentions, cs.dictionaries);
for(Mention candidate : candidates) {
if(candidate == m) continue;
if(distance==0 && m.appearEarlierThan(candidate)) continue; // ignore cataphora
if(candidate.goldCorefClusterID == m.goldCorefClusterID) {
switch(m.mentionType) {
case NOMINAL:
if(candidate.mentionType==MentionType.NOMINAL || candidate.mentionType==MentionType.PROPER) {
common.incrementCount(distance);
break loop;
}
break;
case PROPER:
if(candidate.mentionType==MentionType.PROPER) {
proper.incrementCount(distance);
break loop;
}
break;
case PRONOMINAL:
pronoun.incrementCount(distance);
break loop;
case LIST:
if(candidate.mentionType==MentionType.LIST) {
list.incrementCount(distance);
break loop;
}
break;
default:
break;
}
}
}
}
}
}
}
System.out.println("PROPER -------------------------------------------");
Counters.printCounterSortedByKeys(proper);
System.out.println("COMMON -------------------------------------------");
Counters.printCounterSortedByKeys(common);
System.out.println("PRONOUN -------------------------------------------");
Counters.printCounterSortedByKeys(pronoun);
System.out.println("LIST -------------------------------------------");
Counters.printCounterSortedByKeys(list);
log.info();
}
public static void main(String[] args) throws Exception {
linkDistanceAnalysis(args);
}
}