package edu.stanford.nlp.coref.statistical;

import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;

import edu.stanford.nlp.coref.CorefAlgorithm;
import edu.stanford.nlp.coref.CorefUtils;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Document;
import edu.stanford.nlp.coref.statistical.ClustererDataLoader.ClustererDoc;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.util.Pair;

/**
 * Builds up coreference clusters incrementally with agglomerative clustering.
 * The model is described in
 * <p>
 * Kevin Clark and Christopher D. Manning. 2015.
 * <a href="http://nlp.stanford.edu/pubs/clark-manning-acl15-entity.pdf">
 * Entity-Centric Coreference Resolution with Model Stacking</a>.
 * In Proceedings of the Association for Computational Linguistics.
 * <p>
 * See {@link StatisticalCorefTrainer} for training a new model.
 *
 * @author Kevin Clark
 */
public class ClusteringCorefAlgorithm implements CorefAlgorithm {

  private final Clusterer clusterer;
  private final PairwiseModel classificationModel;
  private final PairwiseModel rankingModel;
  private final PairwiseModel anaphoricityModel;
  private final FeatureExtractor extractor;

  public ClusteringCorefAlgorithm(Properties props, Dictionaries dictionaries) {
    this(props, dictionaries,
        StatisticalCorefProperties.clusteringModelPath(props),
        StatisticalCorefProperties.classificationModelPath(props),
        StatisticalCorefProperties.rankingModelPath(props),
        StatisticalCorefProperties.anaphoricityModelPath(props),
        StatisticalCorefProperties.wordCountsPath(props));
  }

  public ClusteringCorefAlgorithm(Properties props, Dictionaries dictionaries,
      String clusteringPath, String classificationPath, String rankingPath,
      String anaphoricityPath, String wordCountsPath) {
    clusterer = new Clusterer(clusteringPath);
    classificationModel = PairwiseModel.newBuilder("classification",
        MetaFeatureExtractor.newBuilder().build())
        .modelPath(classificationPath).build();
    rankingModel = PairwiseModel.newBuilder("ranking",
        MetaFeatureExtractor.newBuilder().build())
        .modelPath(rankingPath).build();
    anaphoricityModel = PairwiseModel.newBuilder("anaphoricity",
        MetaFeatureExtractor.anaphoricityMFE())
        .modelPath(anaphoricityPath).build();
    extractor = new FeatureExtractor(props, dictionaries, null, wordCountsPath);
  }

  @Override
  public void runCoref(Document document) {
    // Gather all candidate mention pairs; nothing to do if the document has none.
    Map<Pair<Integer, Integer>, Boolean> mentionPairs =
        CorefUtils.getUnlabeledMentionPairs(document);
    if (mentionPairs.isEmpty()) {
      return;
    }

    // Extract pairwise features for every candidate mention pair.
    Compressor<String> compressor = new Compressor<>();
    DocumentExamples examples = extractor.extract(0, document, mentionPairs, compressor);

    // Score each pair with the classification and ranking models, and each
    // anaphor mention (at most once) with the anaphoricity model.
    Counter<Pair<Integer, Integer>> classificationScores = new ClassicCounter<>();
    Counter<Pair<Integer, Integer>> rankingScores = new ClassicCounter<>();
    Counter<Integer> anaphoricityScores = new ClassicCounter<>();
    for (Example example : examples.examples) {
      CorefUtils.checkForInterrupt();
      Pair<Integer, Integer> mentionPair =
          new Pair<>(example.mentionId1, example.mentionId2);
      classificationScores.incrementCount(mentionPair, classificationModel
          .predict(example, examples.mentionFeatures, compressor));
      rankingScores.incrementCount(mentionPair, rankingModel
          .predict(example, examples.mentionFeatures, compressor));
      if (!anaphoricityScores.containsKey(example.mentionId2)) {
        anaphoricityScores.incrementCount(example.mentionId2, anaphoricityModel
            .predict(new Example(example, false), examples.mentionFeatures, compressor));
      }
    }
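
    // The three score counters computed above are stacked as input to the
    // agglomerative clusterer (the "model stacking" of Clark & Manning 2015):
    // the ClustererDoc bundles them with the candidate mention pairs and each
    // mention's predicted type.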
    ClustererDoc doc = new ClustererDoc(0,
        classificationScores, rankingScores, anaphoricityScores, mentionPairs, null,
        document.predictedMentionsByID.entrySet().stream().collect(
            Collectors.toMap(Map.Entry::getKey, e -> e.getValue().mentionType.toString())));

    // Apply each merge proposed by the clusterer to the document's clusters.
    for (Pair<Integer, Integer> mentionPair : clusterer.getClusterMerges(doc)) {
      CorefUtils.mergeCoreferenceClusters(mentionPair, document);
    }
  }
}
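
/*
 * A minimal usage sketch (an illustration, not part of this class). It assumes
 * the statistical coref models are available at the default paths resolved by
 * StatisticalCorefProperties, and that `document` is an already-annotated
 * edu.stanford.nlp.coref.data.Document:
 *
 *   Properties props = new Properties();
 *   Dictionaries dictionaries = new Dictionaries(props);
 *   CorefAlgorithm algorithm = new ClusteringCorefAlgorithm(props, dictionaries);
 *   algorithm.runCoref(document);  // merges coreference clusters in place
 */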