/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.stanfordnlp;
import static java.util.Arrays.asList;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.commons.io.FilenameUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.RootKey;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.TreeUtils;
import edu.stanford.nlp.dcoref.Constants;
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefChain.CorefMention;
import edu.stanford.nlp.dcoref.Document;
import edu.stanford.nlp.dcoref.Mention;
import edu.stanford.nlp.dcoref.MentionExtractor;
import edu.stanford.nlp.dcoref.RuleBasedCorefMentionFinder;
import edu.stanford.nlp.dcoref.SieveCoreferenceSystem;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.ParserAnnotatorUtils;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.semgraph.SemanticGraphFactory.Mode;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructure.Extras;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.LabeledScoredTreeFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.util.CoreMap;
/**
 * Performs coreference resolution using the Stanford CoreNLP deterministic, sieve-based
 * coreference system (dcoref). Reads constituency parses, tokens, POS tags, lemmas, and named
 * entities from the CAS and creates {@code CoreferenceChain} / {@code CoreferenceLink}
 * annotations.
 */
@TypeCapability(
inputs = {
"de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity",
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent",
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
outputs = {
"de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain",
"de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"})
public class StanfordCoreferenceResolver
    extends JCasAnnotator_ImplBase
{
    /**
     * DCoRef parameter: Sieve passes - each class is defined in dcoref/sievepasses/.
     */
    public static final String PARAM_SIEVES = "sieves";
    @ConfigurationParameter(name = PARAM_SIEVES, defaultValue = Constants.SIEVEPASSES, mandatory = true)
    private String sieves;

    /**
     * DCoRef parameter: Scoring the output of the system
     */
    public static final String PARAM_SCORE = "score";
    @ConfigurationParameter(name = PARAM_SCORE, defaultValue = "false", mandatory = true)
    private boolean score;

    /**
     * DCoRef parameter: Do post processing
     */
    public static final String PARAM_POSTPROCESSING = "postprocessing";
    @ConfigurationParameter(name = PARAM_POSTPROCESSING, defaultValue = "false", mandatory = true)
    private boolean postprocessing;

    /**
     * DCoRef parameter: setting singleton predictor
     */
    public static final String PARAM_SINGLETON = "singleton";
    @ConfigurationParameter(name = PARAM_SINGLETON, defaultValue = "true", mandatory = true)
    private boolean singleton;

    /**
     * DCoRef parameter: Maximum sentence distance between two mentions for resolution (-1: no
     * constraint on the distance)
     */
    public static final String PARAM_MAXDIST = "maxDist";
    @ConfigurationParameter(name = PARAM_MAXDIST, defaultValue = "-1", mandatory = true)
    private int maxdist;

    // Resolves and caches the dcoref model/dictionaries per CAS configuration (language/variant).
    private CasConfigurableProviderBase<Coreferencer> modelProvider;

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        modelProvider = new ModelProviderBase<Coreferencer>() {
            {
                setContextObject(StanfordCoreferenceResolver.this);

                setDefault(ARTIFACT_ID, "${groupId}.stanfordnlp-model-coref-${language}-${variant}");
                // The "countries" file acts as a marker for the model directory; all other
                // dictionary/model files are resolved relative to it in produceResource().
                setDefault(LOCATION, "classpath:/${package}/lib/coref/${language}/${variant}/countries");
                setDefault(VARIANT, "default");

                // setOverride(LOCATION, modelLocation);
                // setOverride(LANGUAGE, language);
                // setOverride(VARIANT, variant);
            }

            /**
             * Builds the dcoref configuration from the UIMA parameters plus the dictionary
             * files located next to the resolved model URL, then instantiates the sieve
             * system and its mention extractor.
             */
            @Override
            protected Coreferencer produceResource(URL aUrl)
                throws IOException
            {
                // Directory containing all dcoref dictionaries/models (aUrl points at the
                // "countries" marker file inside it).
                String base = FilenameUtils.getFullPathNoEndSeparator(aUrl.toString()) + "/";

                Properties props = new Properties();
                props.setProperty(Constants.SIEVES_PROP, sieves);
                props.setProperty(Constants.SCORE_PROP, String.valueOf(score));
                props.setProperty(Constants.POSTPROCESSING_PROP, String.valueOf(postprocessing));
                props.setProperty(Constants.SINGLETON_PROP, String.valueOf(singleton));
                props.setProperty(Constants.SINGLETON_MODEL_PROP, base + "singleton.predictor.ser");
                props.setProperty(Constants.MAXDIST_PROP, String.valueOf(maxdist));
                // props.setProperty(Constants.BIG_GENDER_NUMBER_PROP, "false");
                props.setProperty(Constants.REPLICATECONLL_PROP, "false");
                props.setProperty(Constants.CONLL_SCORER, Constants.conllMentionEvalScript);

                // Cf. edu.stanford.nlp.dcoref.Dictionaries.Dictionaries(Properties) - each
                // property below redirects one of the DefaultPaths entries to our model dir.
                // props.getProperty(Constants.DEMONYM_PROP, DefaultPaths.DEFAULT_DCOREF_DEMONYM),
                props.setProperty(Constants.DEMONYM_PROP, base + "demonyms.txt");
                // props.getProperty(Constants.ANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_ANIMATE),
                props.setProperty(Constants.ANIMATE_PROP, base + "animate.unigrams.txt");
                // props.getProperty(Constants.INANIMATE_PROP, DefaultPaths.DEFAULT_DCOREF_INANIMATE),
                props.setProperty(Constants.INANIMATE_PROP, base + "inanimate.unigrams.txt");
                // props.getProperty(Constants.MALE_PROP),
                props.setProperty(Constants.MALE_PROP, base + "male.unigrams.txt");
                // props.getProperty(Constants.NEUTRAL_PROP),
                props.setProperty(Constants.NEUTRAL_PROP, base + "neutral.unigrams.txt");
                // props.getProperty(Constants.FEMALE_PROP),
                props.setProperty(Constants.FEMALE_PROP, base + "female.unigrams.txt");
                // props.getProperty(Constants.PLURAL_PROP),
                props.setProperty(Constants.PLURAL_PROP, base + "plural.unigrams.txt");
                // props.getProperty(Constants.SINGULAR_PROP),
                props.setProperty(Constants.SINGULAR_PROP, base + "singular.unigrams.txt");
                // props.getProperty(Constants.STATES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES),
                props.setProperty(Constants.STATES_PROP, base + "state-abbreviations.txt");
                // props.getProperty(Constants.GENDER_NUMBER_PROP, DefaultPaths.DEFAULT_DCOREF_GENDER_NUMBER);
                props.setProperty(Constants.GENDER_NUMBER_PROP, base + "gender.map.ser.gz");
                // props.getProperty(Constants.COUNTRIES_PROP, DefaultPaths.DEFAULT_DCOREF_COUNTRIES),
                props.setProperty(Constants.COUNTRIES_PROP, base + "countries");
                // props.getProperty(Constants.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES),
                props.setProperty(Constants.STATES_PROVINCES_PROP, base + "statesandprovinces");

                // The following properties are only relevant if the "CorefDictionaryMatch" sieve
                // is enabled.
                // PropertiesUtils.getStringArray(props, Constants.DICT_LIST_PROP,
                // new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2,
                // DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}),
                props.put(Constants.DICT_LIST_PROP, '[' + base + "coref.dict1.tsv" + ',' + base
                        + "coref.dict2.tsv" + ',' + base + "coref.dict3.tsv" + ',' + base
                        + "coref.dict4.tsv" + ']');
                // props.getProperty(Constants.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1),
                props.put(Constants.DICT_PMI_PROP, base + "coref.dict1.tsv");
                // props.getProperty(Constants.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES));
                props.put(Constants.SIGNATURES_PROP, base + "ne.signatures.txt");

                try {
                    Coreferencer coref = new Coreferencer();
                    coref.corefSystem = new SieveCoreferenceSystem(props);
                    coref.mentionExtractor = new MentionExtractor(coref.corefSystem.dictionaries(),
                            coref.corefSystem.semantics());
                    return coref;
                }
                catch (Exception e) {
                    // SieveCoreferenceSystem/MentionExtractor declare broad exceptions; wrap as
                    // IOException to match the provider contract and preserve the cause.
                    throw new IOException(e);
                }
            }
        };
    }

    /**
     * Converts the constituency trees in the CAS into Stanford data structures, runs the dcoref
     * system on them, and writes the resulting chains back as {@link CoreferenceChain} /
     * {@link CoreferenceLink} annotations.
     *
     * @param aJCas
     *            the CAS to process; must contain {@link ROOT} constituents.
     * @throws AnalysisEngineProcessException
     *             if the coreference system fails.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        modelProvider.configure(aJCas.getCas());

        List<Tree> trees = new ArrayList<Tree>();
        List<CoreMap> sentences = new ArrayList<CoreMap>();
        List<List<CoreLabel>> sentenceTokens = new ArrayList<List<CoreLabel>>();

        // These factories are stateless with respect to the sentence being processed, so build
        // them once instead of once per sentence.
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(
                tlp.punctuationWordRejectFilter(), tlp.typedDependencyHeadFinder());

        for (ROOT root : select(aJCas, ROOT.class)) {
            // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
            // it with PRN to avoid NPEs.
            TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory())
            {
                @Override
                public Tree newTreeNode(String aParent, List<Tree> aChildren)
                {
                    String parent = aParent;
                    if ("PRN0".equals(parent)) {
                        parent = "PRN";
                    }
                    Tree node = super.newTreeNode(parent, aChildren);
                    return node;
                }
            };

            // Convert the DKPro Core constituency tree for this sentence into a Stanford tree.
            Tree tree = TreeUtils.createStanfordTree(root, tFact);
            tree.indexSpans();
            trees.add(tree);

            // Build the tokens
            List<CoreLabel> tokens = new ArrayList<CoreLabel>();
            for (Tree leave : tree.getLeaves()) {
                tokens.add((CoreLabel) leave.label());
            }
            sentenceTokens.add(tokens);

            // Build the sentence
            CoreMap sentence = new CoreLabel();
            sentence.set(TreeAnnotation.class, tree);
            sentence.set(TokensAnnotation.class, tokens);
            sentence.set(RootKey.class, root);
            sentences.add(sentence);

            // https://github.com/dkpro/dkpro-core/issues/590
            // We currently do not copy over dependencies from the CAS. This is supposed to fill
            // in the dependencies so we do not get NPEs.
            ParserAnnotatorUtils.fillInParseAnnotations(false, true, gsf, sentence,
                    asList(tree), GrammaticalStructure.Extras.NONE);

            // https://github.com/dkpro/dkpro-core/issues/582
            SemanticGraph deps = sentence
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
            for (IndexedWord vertex : deps.vertexSet()) {
                vertex.setWord(vertex.value());
            }

            // These lines are necessary since CoreNLP 3.5.2 - without them the mentions lack
            // dependency information which causes an NPE
            SemanticGraph dependencies = SemanticGraphFactory.makeFromTree(tree,
                    Mode.COLLAPSED, Extras.NONE, null, false, true);
            sentence.set(SemanticGraphCoreAnnotations.AlternativeDependenciesAnnotation.class,
                    dependencies);

            // merge the new CoreLabels with the tree leaves
            MentionExtractor.mergeLabels(tree, tokens);
            MentionExtractor.initializeUtterance(tokens);
        }

        Annotation document = new Annotation(aJCas.getDocumentText());
        document.set(SentencesAnnotation.class, sentences);

        Coreferencer coref = modelProvider.getResource();

        // extract all possible mentions
        // Reparsing only works when the full CoreNLP pipeline system is set up! Passing false here
        // disables reparsing.
        RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(false);
        List<List<Mention>> allUnprocessedMentions = finder.extractPredictedMentions(document, 0,
                coref.corefSystem.dictionaries());

        // add the relevant info to mentions and order them for coref
        Map<Integer, CorefChain> result;
        try {
            Document doc = coref.mentionExtractor.arrange(document, sentenceTokens, trees,
                    allUnprocessedMentions);
            result = coref.corefSystem.coref(doc);
        }
        catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }

        // Convert the Stanford chains into CAS annotations. The links of a chain are connected
        // via setNext(); only the first link is referenced from the CoreferenceChain.
        // NOTE(review): sentNum/startIndex appear to be 1-based and endIndex exclusive (hence
        // the -1/-2 offsets) - confirm against the CoreNLP CorefMention documentation.
        for (CorefChain chain : result.values()) {
            CoreferenceLink last = null;
            for (CorefMention mention : chain.getMentionsInTextualOrder()) {
                CoreLabel beginLabel = sentences.get(mention.sentNum - 1)
                        .get(TokensAnnotation.class).get(mention.startIndex - 1);
                CoreLabel endLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                        .get(mention.endIndex - 2);
                // The TokenKey carries the original DKPro Core Token, giving us CAS offsets.
                CoreferenceLink link = new CoreferenceLink(aJCas, beginLabel.get(TokenKey.class)
                        .getBegin(), endLabel.get(TokenKey.class).getEnd());

                if (mention.mentionType != null) {
                    link.setReferenceType(mention.mentionType.toString());
                }

                if (last == null) {
                    // This is the first mention. Here we'll initialize the chain
                    CoreferenceChain corefChain = new CoreferenceChain(aJCas);
                    corefChain.setFirst(link);
                    corefChain.addToIndexes();
                }
                else {
                    // For the other mentions, we'll add them to the chain.
                    last.setNext(link);
                }
                last = link;

                link.addToIndexes();
            }
        }
    }

    /**
     * Converts a DKPro Core {@link Token} into a Stanford {@link CoreLabel}. The original token
     * is remembered via {@link TokenKey} (so CAS offsets can be recovered later) and the NER
     * field is set from the first {@link NamedEntity} covered by the token, or "O" if there is
     * none.
     *
     * @param aToken
     *            the token to convert.
     * @return the corresponding {@link CoreLabel}.
     */
    protected CoreLabel tokenToWord(Token aToken)
    {
        CoreLabel t = CoreNlpUtils.tokenToWord(aToken);
        t.set(TokenKey.class, aToken);
        List<NamedEntity> nes = selectCovered(NamedEntity.class, aToken);
        if (!nes.isEmpty()) {
            t.setNER(nes.get(0).getValue());
        }
        else {
            t.setNER("O");
        }
        return t;
    }

    /**
     * Bundles the two Stanford dcoref components created together by the model provider.
     */
    private static class Coreferencer {
        MentionExtractor mentionExtractor;
        SieveCoreferenceSystem corefSystem;
    }
}