//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.coref.data;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import edu.stanford.nlp.coref.docreader.CoNLLDocumentReader;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntTuple;
import edu.stanford.nlp.util.Pair;

/**
 * Container for everything the coreference code tracks about one document:
 * the annotation, gold and predicted mentions, coreference clusters, speaker
 * information, and bookkeeping about mention / cluster pairs that were marked
 * as incompatible.
 */
public class Document implements Serializable {

  private static final long serialVersionUID = -4139866807494603953L;

  public enum DocType { CONVERSATION, ARTICLE }

  /** The type of document: conversational or article */
  public DocType docType;

  /** Document annotation */
  public Annotation annotation;

  /** for conll shared task 2011  */
  public CoNLLDocumentReader.CoNLLDocument conllDoc;

  /** The list of gold mentions */
  public List<List<Mention>> goldMentions;

  /** The list of predicted mentions */
  public List<List<Mention>> predictedMentions;

  /** return the list of predicted mentions */
  public List<List<Mention>> getOrderedMentions() {
    return predictedMentions;
  }

  /** Clusters for coreferent mentions */
  public Map<Integer, CorefCluster> corefClusters;

  /** Gold Clusters for coreferent mentions */
  public Map<Integer, CorefCluster> goldCorefClusters;

  /** All mentions in a document {@literal mentionID -> mention} */
  public Map<Integer, Mention> predictedMentionsByID;

  /**
   * Gold mentions by mention ID. Not initialized by any constructor of this
   * class, so it may be {@code null}.
   */
  public Map<Integer, Mention> goldMentionsByID;

  /** Set of roles (in role apposition) in a document  */
  public Set<Mention> roleSet;

  /**
   * Position of each mention in the input matrix
   * Each mention occurrence with sentence # and position within sentence
   * (Nth mention, not Nth token)
   */
  public Map<Mention, IntTuple> positions;              // mentions may be removed from this due to post processing
  public Map<Mention, IntTuple> allPositions;           // all mentions (mentions will not be removed from this)

  public final Map<IntTuple, Mention> mentionheadPositions;

  /** List of gold links in a document by positions; built lazily by {@link #getGoldLinks()} */
  private List<Pair<IntTuple,IntTuple>> goldLinks;

  /** UtteranceAnnotation {@literal ->} String (speaker): mention ID or speaker string
   *  e.g., the value can be "34" (mentionID), "Larry" (speaker string), or "PER3" (autoassigned speaker string)
   */
  public Map<Integer, String> speakers;

  /** Pair of mention id, and the mention's speaker id
   *  the second value is the "speaker mention"'s id.
   *  e.g., Larry said, "San Francisco is a city.": (id(Larry), id(San Francisco))
   */
  public Set<Pair<Integer, Integer>> speakerPairs;

  public boolean speakerInfoGiven;

  public int maxUtter;
  public int numParagraph;
  public int numSentences;

  /** Set of incompatible mention pairs, stored as (smaller mention ID, larger mention ID) */
  private final Set<Pair<Integer, Integer>> incompatibles;

  /** Set of incompatible cluster pairs, stored as (smaller cluster ID, larger cluster ID) */
  private final Set<Pair<Integer, Integer>> incompatibleClusters;

  public Map<Pair<Integer, Integer>, Boolean> acronymCache;

  /** Map of speaker name/id to speaker info
   *  the key is the value of the variable 'speakers'
   */
  public Map<String, SpeakerInfo> speakerInfoMap = Generics.newHashMap();

  // public Counter<String> properNouns = new ClassicCounter<>();
  // public Counter<String> phraseCounter = new ClassicCounter<>();
  // public Counter<String> headwordCounter = new ClassicCounter<>();

  /** Additional information about the document. Can be used as features */
  public Map<String, String> docInfo;

  /**
   * Creates an empty document with its bookkeeping collections initialized.
   * {@code goldCorefClusters} is explicitly set to {@code null};
   * {@code goldMentionsByID}, {@code allPositions}, {@code goldMentions} and
   * {@code predictedMentions} are not assigned here.
   */
  public Document() {
    positions = Generics.newHashMap();
    mentionheadPositions = Generics.newHashMap();
    roleSet = Generics.newHashSet();
    corefClusters = Generics.newHashMap();
    goldCorefClusters = null;
    predictedMentionsByID = Generics.newHashMap();
    // goldMentionsByID = Generics.newHashMap();
    speakers = Generics.newHashMap();
    speakerPairs = Generics.newHashSet();
    incompatibles = Generics.newHashSet();
    incompatibleClusters = Generics.newHashSet();
    acronymCache = Generics.newHashMap();
  }

  /**
   * Creates a document from an annotation and already extracted mentions.
   *
   * @param anno the document annotation
   * @param predictedMentions predicted mentions, one list per sentence
   * @param goldMentions gold mentions, one list per sentence
   */
  public Document(Annotation anno, List<List<Mention>> predictedMentions, List<List<Mention>> goldMentions) {
    this();
    annotation = anno;
    this.predictedMentions = predictedMentions;
    this.goldMentions = goldMentions;
  }

  /**
   * Creates a document from a reader-produced {@link InputDoc}.
   *
   * @param input the input document; its annotation, gold mentions, doc info and CoNLL document are copied over
   * @param mentions the predicted mentions, one list per sentence
   */
  public Document(InputDoc input, List<List<Mention>> mentions) {
    this();
    this.annotation = input.annotation;
    this.predictedMentions = mentions;
    this.goldMentions = input.goldMentions;
    this.docInfo = input.docInfo;
    this.numSentences = input.annotation.get(SentencesAnnotation.class).size();
    this.conllDoc = input.conllDoc;   // null if it's not conll input
  }

  /**
   * Returns whether the two clusters were recorded as incompatible.
   *
   * @param c1 one cluster
   * @param c2 the other cluster
   * @return true if the (min ID, max ID) pair is in the incompatible-cluster set
   */
  public boolean isIncompatible(CorefCluster c1, CorefCluster c2) {
    // Was any of the pairs of mentions marked as incompatible
    int cid1 = Math.min(c1.clusterID, c2.clusterID);
    int cid2 = Math.max(c1.clusterID, c2.clusterID);
    return incompatibleClusters.contains(Pair.makePair(cid1, cid2));
  }

  /**
   * Update incompatibles for two clusters that are about to be merged.
   * Every incompatible pair that mentions {@code from} is replaced by the
   * corresponding pair that mentions {@code to}; a pair consisting of exactly
   * {@code from} and {@code to} is left untouched.
   *
   * @param to the cluster that survives the merge
   * @param from the cluster that is merged into {@code to}
   */
  public void mergeIncompatibles(CorefCluster to, CorefCluster from) {
    // Hoisted into int locals so the comparisons with the boxed pair members are numeric.
    final int fromId = from.clusterID;
    final int toId = to.clusterID;

    // Collect first, apply afterwards: the set must not be modified while it is being iterated.
    List<Pair<Pair<Integer,Integer>, Pair<Integer,Integer>>> replacements = new ArrayList<>();
    for (Pair<Integer, Integer> p : incompatibleClusters) {
      Integer other = null;
      if (p.first == fromId) {
        other = p.second;
      } else if (p.second == fromId) {
        other = p.first;
      }
      if (other != null && other != toId) {
        int cid1 = Math.min(other, toId);
        int cid2 = Math.max(other, toId);
        replacements.add(Pair.makePair(p, Pair.makePair(cid1, cid2)));
      }
    }
    for (Pair<Pair<Integer,Integer>, Pair<Integer,Integer>> r : replacements) {
      incompatibleClusters.remove(r.first);
      incompatibleClusters.add(r.second);
    }
  }

  /**
   * Propagates positive acronym-cache entries for two clusters that are about
   * to be merged: for every entry with value {@code true} that involves
   * {@code from}, an entry with value {@code true} is added for the pair made
   * of the other cluster and {@code to}. Existing entries are not removed.
   * Entries whose value is {@code false} or {@code null} are ignored.
   *
   * @param to the cluster that survives the merge
   * @param from the cluster that is merged into {@code to}
   */
  public void mergeAcronymCache(CorefCluster to, CorefCluster from) {
    // Hoisted into int locals so the comparisons with the boxed pair members are numeric.
    final int fromId = from.clusterID;
    final int toId = to.clusterID;

    // Collect first, apply afterwards: the map must not be modified while it is being iterated.
    Map<Pair<Integer, Integer>, Boolean> replacements = Generics.newHashMap();
    for (Map.Entry<Pair<Integer, Integer>, Boolean> entry : acronymCache.entrySet()) {
      // Boolean.TRUE.equals avoids an unboxing NullPointerException on a null value.
      if (!Boolean.TRUE.equals(entry.getValue())) {
        continue;
      }
      Pair<Integer, Integer> p = entry.getKey();
      Integer other = null;
      if (p.first == fromId) {
        other = p.second;
      } else if (p.second == fromId) {
        other = p.first;
      }
      if (other != null && other != toId) {
        int cid1 = Math.min(other, toId);
        int cid2 = Math.max(other, toId);
        replacements.put(Pair.makePair(cid1, cid2), true);
      }
    }
    acronymCache.putAll(replacements);
  }

  /**
   * Returns whether the two mentions were recorded as incompatible.
   *
   * @param m1 one mention
   * @param m2 the other mention
   * @return true if the (min ID, max ID) pair is in the incompatible-mention set
   */
  public boolean isIncompatible(Mention m1, Mention m2) {
    int mid1 = Math.min(m1.mentionID, m2.mentionID);
    int mid2 = Math.max(m1.mentionID, m2.mentionID);
    return incompatibles.contains(Pair.makePair(mid1, mid2));
  }

  /**
   * Records two mentions, and the clusters they currently belong to
   * (by {@code corefClusterID}), as incompatible.
   *
   * @param m1 one mention
   * @param m2 the other mention
   */
  public void addIncompatible(Mention m1, Mention m2) {
    int mid1 = Math.min(m1.mentionID, m2.mentionID);
    int mid2 = Math.max(m1.mentionID, m2.mentionID);
    incompatibles.add(Pair.makePair(mid1, mid2));
    int cid1 = Math.min(m1.corefClusterID, m2.corefClusterID);
    int cid2 = Math.max(m1.corefClusterID, m2.corefClusterID);
    incompatibleClusters.add(Pair.makePair(cid1, cid2));
  }

  /**
   * Returns the gold coreference links, extracting them from
   * {@code goldMentions} on the first call.
   *
   * @return list of (mention position, antecedent position) pairs
   */
  public List<Pair<IntTuple, IntTuple>> getGoldLinks() {
    if (goldLinks == null) {
      this.extractGoldLinks();
    }
    return goldLinks;
  }

  /**
   * Extract gold coref link information from {@code goldMentions} and store
   * it in {@code goldLinks}. Positions are (sentence index, index of the
   * mention within that sentence's gold mention list).
   *
   * <p>Note that this may rewrite {@code originalRef} on gold mentions when a
   * mention refers to one that is positioned after it.
   *
   * @throws RuntimeException if a mention's {@code originalRef} does not match any gold mention ID
   */
  protected void extractGoldLinks() {
    //    List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<>();
    // Mirrors the contents of 'links' so membership tests are hash lookups instead of list scans.
    Set<Pair<IntTuple, IntTuple>> linkSet = Generics.newHashSet();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positionsById = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for (int i = 0; i < goldMentions.size(); i++) {
      for (int j = 0; j < goldMentions.get(i).size(); j++) {
        Mention m = goldMentions.get(i).get(j);
        int id = m.mentionID;
        IntTuple pos = new IntTuple(2);
        pos.set(0, i);
        pos.set(1, j);
        positionsById.put(id, pos);
        antecedents.put(id, new ArrayList<>());
      }
    }

    //    SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List<Mention> mentions : goldMentions) {
      for (Mention m : mentions) {
        int id = m.mentionID;
        IntTuple src = positionsById.get(id);

        assert (src != null);
        if (m.originalRef >= 0) {
          IntTuple dst = positionsById.get(m.originalRef);
          if (dst == null) {
            throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
          }

          // to deal with cataphoric annotation: while the referenced mention is positioned
          // after this one, this mention takes over its reference and it is re-pointed here
          while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
            Mention dstMention = goldMentions.get(dst.get(0)).get(dst.get(1));
            m.originalRef = dstMention.originalRef;
            dstMention.originalRef = id;

            if (m.originalRef < 0) {
              break;
            }
            dst = positionsById.get(m.originalRef);
          }
          if (m.originalRef < 0) {
            continue;
          }

          // A B C: if A<-B, A<-C => make a link B<-C
          for (int k = dst.get(0); k <= src.get(0); k++) {
            for (int l = 0; l < goldMentions.get(k).size(); l++) {
              if (k == dst.get(0) && l < dst.get(1)) {
                continue;
              }
              if (k == src.get(0) && l > src.get(1)) {
                break;
              }
              IntTuple missed = new IntTuple(2);
              missed.set(0, k);
              missed.set(1, l);
              if (linkSet.contains(new Pair<>(missed, dst))) {
                antecedents.get(id).add(missed);
                recordLink(links, linkSet, src, missed);
              }
            }
          }

          recordLink(links, linkSet, src, dst);

          assert (antecedents.get(id) != null);
          antecedents.get(id).add(dst);

          List<IntTuple> ants = antecedents.get(m.originalRef);
          assert (ants != null);
          for (IntTuple ant : ants) {
            antecedents.get(id).add(ant);
            recordLink(links, linkSet, src, ant);
          }
        }
      }
    }
    goldLinks = links;
  }

  /** Appends the link (from, to) to the ordered list and registers it in the lookup set. */
  private static void recordLink(List<Pair<IntTuple, IntTuple>> links,
                                 Set<Pair<IntTuple, IntTuple>> linkSet,
                                 IntTuple from, IntTuple to) {
    Pair<IntTuple, IntTuple> link = new Pair<>(from, to);
    links.add(link);
    linkSet.add(link);
  }

  /**
   * Looks up the speaker info registered under the given key.
   *
   * @param speaker key into {@code speakerInfoMap}
   * @return the speaker info, or {@code null} if none is registered under that key
   */
  public SpeakerInfo getSpeakerInfo(String speaker) {
    return speakerInfoMap.get(speaker);
  }

  /** @return the number of entries in {@code speakerInfoMap} */
  public int numberOfSpeakers() {
    return speakerInfoMap.size();
  }

  /**
   * Returns whether two mentions are coreferent according to the gold
   * annotation: both mention IDs must be present in {@code goldMentionsByID}
   * and the gold mentions found there must share the same
   * {@code goldCorefClusterID}.
   *
   * @param m1 one mention
   * @param m2 the other mention
   * @return true if both have gold counterparts in the same gold cluster;
   *         false otherwise, including when no gold mention map is available
   */
  public boolean isCoref(Mention m1, Mention m2) {
    // No constructor of this class initializes goldMentionsByID, so it can be null.
    if (goldMentionsByID == null) {
      return false;
    }
    Mention gold1 = goldMentionsByID.get(m1.mentionID);
    Mention gold2 = goldMentionsByID.get(m2.mentionID);
    return gold1 != null
        && gold2 != null
        && gold1.goldCorefClusterID == gold2.goldCorefClusterID;
  }

}