//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2010 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.dcoref;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;

import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.TreeLemmatizer;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;

/**
 * Extracts coref mentions from CoNLL2011 data files.
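 *
 * <p>A minimal usage sketch (illustrative, not from the original source): the
 * {@code dict} and {@code semantics} arguments are assumed to be constructed
 * elsewhere in the dcoref pipeline, and the corpus path is a placeholder.
 *
 * <pre>{@code
 * Properties props = new Properties();
 * props.setProperty(Constants.CONLL2011_PROP, "/path/to/conll-2011/data");
 * CoNLLMentionExtractor extractor = new CoNLLMentionExtractor(dict, props, semantics);
 * for (Document doc = extractor.nextDoc(); doc != null; doc = extractor.nextDoc()) {
 *   // process one CoNLL document at a time
 * }
 * }</pre>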
 *
 * @author Angel Chang
 */
public class CoNLLMentionExtractor extends MentionExtractor {

  private final CoNLL2011DocumentReader reader;
  private final String corpusPath;
  private final boolean replicateCoNLL;

  private static final Logger logger = SieveCoreferenceSystem.logger;

  public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics semantics) throws Exception {
    super(dict, semantics);

    // Initialize the reader for the CoNLL 2011 corpus
    corpusPath = props.getProperty(Constants.CONLL2011_PROP);
    replicateCoNLL = Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false"));

    CoNLL2011DocumentReader.Options options = new CoNLL2011DocumentReader.Options();
    options.annotateTokenCoref = false;
    options.annotateTokenSpeaker = Constants.USE_GOLD_SPEAKER_TAGS || replicateCoNLL;
    options.annotateTokenNer = Constants.USE_GOLD_NE || replicateCoNLL;
    options.annotateTokenPos = Constants.USE_GOLD_POS || replicateCoNLL;
    if (Constants.USE_CONLL_AUTO) options.setFilter(".*_auto_conll$");
    reader = new CoNLL2011DocumentReader(corpusPath, options);

    stanfordProcessor = loadStanfordProcessor(props);
  }

  public CoNLLMentionExtractor(Dictionaries dict, Properties props, Semantics semantics,
      LogisticClassifier<String, String> singletonModel) throws Exception {
    this(dict, props, semantics);
    singletonPredictor = singletonModel;
  }

  private static final boolean LEMMATIZE = true;
  private static final boolean threadSafe = true;

  private static final TreeLemmatizer treeLemmatizer = new TreeLemmatizer();

  @Override
  public void resetDocs() {
    super.resetDocs();
    reader.reset();
  }

  @Override
  public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<>();
    List<Tree> allTrees = new ArrayList<>();

    CoNLL2011DocumentReader.Document conllDoc = reader.getNextDocument();
    if (conllDoc == null) {
      return null;
    }

    Annotation anno = conllDoc.getAnnotation();
    List<CoreMap> sentences = anno.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      if (!Constants.USE_GOLD_PARSES && !replicateCoNLL) {
        // Remove the gold tree from the annotation; the Stanford parser will
        // re-parse the sentence when the pipeline runs below
        sentence.remove(TreeCoreAnnotations.TreeAnnotation.class);
      } else {
        Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
        if (LEMMATIZE) {
          treeLemmatizer.transformTree(tree);
        }
        // Generate the dependency graphs from the gold constituency tree
        try {
          SemanticGraph deps = SemanticGraphFactory.makeFromTree(tree,
              SemanticGraphFactory.Mode.ENHANCED, GrammaticalStructure.Extras.NONE);
          SemanticGraph basicDeps = SemanticGraphFactory.makeFromTree(tree,
              SemanticGraphFactory.Mode.BASIC, GrammaticalStructure.Extras.NONE);
          sentence.set(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class, basicDeps);
          sentence.set(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class, deps);
        } catch (Exception e) {
          logger.log(Level.WARNING,
              "Exception caught during extraction of Stanford dependencies. Will ignore and continue...", e);
        }
      }
    }
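    // Segment the document into utterances: each speaker change starts a new
    // utterance, so utterance ids increase monotonically (first utterance is 0)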
    String preSpeaker = null;
    int utterance = -1;
    for (CoreLabel token : anno.get(CoreAnnotations.TokensAnnotation.class)) {
      if (!token.containsKey(CoreAnnotations.SpeakerAnnotation.class)) {
        token.set(CoreAnnotations.SpeakerAnnotation.class, "");
      }
      String curSpeaker = token.get(CoreAnnotations.SpeakerAnnotation.class);
      if (!curSpeaker.equals(preSpeaker)) {
        utterance++;
        preSpeaker = curSpeaker;
      }
      token.set(CoreAnnotations.UtteranceAnnotation.class, utterance);
    }

    // Run the Stanford pipeline on the document
    stanfordProcessor.annotate(anno);

    for (CoreMap sentence : anno.get(CoreAnnotations.SentencesAnnotation.class)) {
      allWords.add(sentence.get(CoreAnnotations.TokensAnnotation.class));
      allTrees.add(sentence.get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // Initialize gold mentions
    List<List<Mention>> allGoldMentions = extractGoldMentions(conllDoc);

    List<List<Mention>> allPredictedMentions;
    if (Constants.USE_GOLD_MENTIONS) {
      // Copy the gold mentions, since mentions may later be merged and
      // mention ids changed
      allPredictedMentions = makeCopy(allGoldMentions);
    } else if (Constants.USE_GOLD_MENTION_BOUNDARIES) {
      allPredictedMentions = ((RuleBasedCorefMentionFinder) mentionFinder)
          .filterPredictedMentions(allGoldMentions, anno, dictionaries);
    } else {
      allPredictedMentions = mentionFinder.extractPredictedMentions(anno, maxID, dictionaries);
    }

    try {
      recallErrors(allGoldMentions, allPredictedMentions, anno);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }

    Document doc = arrange(anno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
    doc.conllDoc = conllDoc;
    return doc;
  }

  private static List<List<Mention>> makeCopy(List<List<Mention>> mentions) {
    List<List<Mention>> copy = new ArrayList<>(mentions.size());
    for (List<Mention> sm : mentions) {
      List<Mention> sm2 = new ArrayList<>(sm.size());
      for (Mention m : sm) {
        Mention m2 = new Mention();
        m2.goldCorefClusterID = m.goldCorefClusterID;
        m2.mentionID = m.mentionID;
        m2.startIndex = m.startIndex;
        m2.endIndex = m.endIndex;
        m2.originalSpan = m.originalSpan;
        m2.dependency = m.dependency;
        sm2.add(m2);
      }
      copy.add(sm2);
    }
    return copy;
  }

  private static void recallErrors(List<List<Mention>> goldMentions,
      List<List<Mention>> predictedMentions, Annotation doc) throws IOException {
    List<CoreMap> coreMaps = doc.get(CoreAnnotations.SentencesAnnotation.class);
    int numSentences = goldMentions.size();
    for (int i = 0; i < numSentences; i++) {
      CoreMap coreMap = coreMaps.get(i);
      List<CoreLabel> words = coreMap.get(CoreAnnotations.TokensAnnotation.class);
      Tree tree = coreMap.get(TreeCoreAnnotations.TreeAnnotation.class);
      List<Mention> goldMentionsSent = goldMentions.get(i);
      List<Pair<Integer,Integer>> goldMentionsSpans = extractSpans(goldMentionsSent);
      // Spans the system predicted are not recall errors; keep only the gold
      // spans that were missed
      goldMentionsSpans.removeAll(extractSpans(predictedMentions.get(i)));

      for (Pair<Integer,Integer> mentionSpan : goldMentionsSpans) {
        logger.finer("RECALL ERROR\n");
        logger.finer(coreMap + "\n");
        for (int x = mentionSpan.first; x < mentionSpan.second; x++) {
          logger.finer(words.get(x).value() + " ");
        }
        logger.finer("\n" + tree + "\n");
      }
    }
  }

  private static List<Pair<Integer,Integer>> extractSpans(List<Mention> listOfMentions) {
    List<Pair<Integer,Integer>> mentionSpans = new ArrayList<>();
    for (Mention mention : listOfMentions) {
      mentionSpans.add(new Pair<>(mention.startIndex, mention.endIndex));
    }
    return mentionSpans;
  }

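  /**
   * Builds per-sentence lists of gold {@link Mention}s from the coref chains
   * stored on the CoNLL document. The first mention of each chain reuses the
   * chain id as its mention id; subsequent mentions get fresh ids above the
   * largest chain id, with {@code originalRef} pointing back to the chain.
   */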
  public List<List<Mention>> extractGoldMentions(CoNLL2011DocumentReader.Document conllDoc) {
    List<CoreMap> sentences = conllDoc.getAnnotation().get(CoreAnnotations.SentencesAnnotation.class);
    List<List<Mention>> allGoldMentions = new ArrayList<>();
    CollectionValuedMap<String,CoreMap> corefChainMap = conllDoc.getCorefChainMap();
    for (int i = 0; i < sentences.size(); i++) {
      allGoldMentions.add(new ArrayList<>());
    }
    int maxCorefClusterId = -1;
    for (String corefIdStr : corefChainMap.keySet()) {
      int id = Integer.parseInt(corefIdStr);
      if (id > maxCorefClusterId) {
        maxCorefClusterId = id;
      }
    }
    int newMentionID = maxCorefClusterId + 1;

    for (Map.Entry<String, Collection<CoreMap>> idChainEntry : corefChainMap.entrySet()) {
      int id = Integer.parseInt(idChainEntry.getKey());
      int clusterMentionCnt = 0;
      for (CoreMap m : idChainEntry.getValue()) {
        clusterMentionCnt++;
        Mention mention = new Mention();

        mention.goldCorefClusterID = id;
        if (clusterMentionCnt == 1) {
          // First mention in the cluster
          mention.mentionID = id;
          mention.originalRef = -1;
        } else {
          mention.mentionID = newMentionID;
          mention.originalRef = id;
          newMentionID++;
        }
        if (maxID < mention.mentionID) maxID = mention.mentionID;

        int sentIndex = m.get(CoreAnnotations.SentenceIndexAnnotation.class);
        CoreMap sent = sentences.get(sentIndex);
        mention.startIndex = m.get(CoreAnnotations.TokenBeginAnnotation.class)
            - sent.get(CoreAnnotations.TokenBeginAnnotation.class);
        mention.endIndex = m.get(CoreAnnotations.TokenEndAnnotation.class)
            - sent.get(CoreAnnotations.TokenBeginAnnotation.class);

        // Will be set by arrange
        mention.originalSpan = m.get(CoreAnnotations.TokensAnnotation.class);

        // The mention's dependency graph is the enhanced dependency graph of
        // the whole sentence
        mention.dependency = sentences.get(sentIndex)
            .get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
        allGoldMentions.get(sentIndex).add(mention);
      }
    }
    return allGoldMentions;
  }

}