/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.corenlp;
import java.io.IOException;
import java.net.URL;
import java.util.List;
import java.util.Properties;
import org.apache.commons.io.FilenameUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp;
import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro;
import edu.stanford.nlp.dcoref.Constants;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.DeterministicCorefAnnotator;
import edu.stanford.nlp.process.PTBEscapingProcessor;
/**
 * Deterministic coreference annotator from CoreNLP.
 * <p>
 * For every CAS the existing annotations are converted into a CoreNLP {@link Annotation} using
 * {@link DKPro2CoreNlp}, a {@link DeterministicCorefAnnotator} is run on that document, and the
 * resulting coreference chains are transferred back into the CAS using {@link CoreNlp2DKPro}.
 */
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity",
            "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent",
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain",
            "de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink"})
public class CoreNlpCoreferenceResolver
    extends JCasAnnotator_ImplBase
{
    /**
     * Scheme prefix that is removed from the model location to obtain a plain classpath resource
     * path.
     */
    private static final String CLASSPATH_PREFIX = "classpath:/";

    /**
     * DCoRef parameter: Sieve passes - each class is defined in dcoref/sievepasses/.
     */
    public static final String PARAM_SIEVES = "sieves";
    @ConfigurationParameter(name = PARAM_SIEVES, defaultValue = Constants.SIEVEPASSES, mandatory = true)
    private String sieves;

    /**
     * DCoRef parameter: Scoring the output of the system
     */
    public static final String PARAM_SCORE = "score";
    @ConfigurationParameter(name = PARAM_SCORE, defaultValue = "false", mandatory = true)
    private boolean score;

    /**
     * DCoRef parameter: Do post-processing
     */
    public static final String PARAM_POSTPROCESSING = "postprocessing";
    @ConfigurationParameter(name = PARAM_POSTPROCESSING, defaultValue = "false", mandatory = true)
    private boolean postprocessing;

    /**
     * DCoRef parameter: setting singleton predictor
     */
    public static final String PARAM_SINGLETON = "singleton";
    @ConfigurationParameter(name = PARAM_SINGLETON, defaultValue = "true", mandatory = true)
    private boolean singleton;

    /**
     * DCoRef parameter: Maximum sentence distance between two mentions for resolution (-1: no
     * constraint on the distance)
     */
    public static final String PARAM_MAXDIST = "maxDist";
    @ConfigurationParameter(name = PARAM_MAXDIST, defaultValue = "-1", mandatory = true)
    private int maxdist;

    /**
     * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-).
     *
     * @see PTBEscapingProcessor
     */
    public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping";
    @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true")
    private boolean ptb3Escaping;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * opening quotes and escaped accordingly before being sent to the parser.
     */
    public static final String PARAM_QUOTE_BEGIN = "quoteBegin";
    @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false)
    private List<String> quoteBegin;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * closing quotes and escaped accordingly before being sent to the parser.
     */
    public static final String PARAM_QUOTE_END = "quoteEnd";
    @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false)
    private List<String> quoteEnd;

    /** Provides the coreference annotator; it is (re-)configured for each CAS in process(). */
    private CasConfigurableProviderBase<DeterministicCorefAnnotator> annotatorProvider;

    /**
     * Initializes the component and creates the provider for the coreference annotator.
     *
     * @param aContext
     *            the UIMA context handed to the superclass.
     * @throws ResourceInitializationException
     *             if the superclass initialization fails.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        annotatorProvider = new CoreNlpCoreferenceModelProvider(this);
    }

    /**
     * Runs the deterministic coreference resolution on the given CAS.
     *
     * @param aJCas
     *            the CAS to read the input annotations from and to add the coreference chains to.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method; may be thrown while configuring the
     *             annotator provider.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();
        annotatorProvider.configure(cas);

        // Transfer from CAS to CoreNLP
        DKPro2CoreNlp converter = new DKPro2CoreNlp();
        converter.setPtb3Escaping(ptb3Escaping);
        converter.setQuoteBegin(quoteBegin);
        converter.setQuoteEnd(quoteEnd);

        Annotation document = new Annotation((String) null);
        converter.convert(aJCas, document);

        // Actual processing
        annotatorProvider.getResource().annotate(document);

        // Transfer back into the CAS
        CoreNlp2DKPro.convertCorefChains(aJCas, document);
    }

    /**
     * Model provider that creates a {@link DeterministicCorefAnnotator} whose dictionary resources
     * are all taken from the directory containing the resolved model location.
     * <p>
     * This is a non-static inner class because the annotator properties are filled from the
     * configuration parameters of the enclosing component.
     */
    private class CoreNlpCoreferenceModelProvider
        extends ModelProviderBase<DeterministicCorefAnnotator>
    {
        /**
         * @param aObject
         *            the object passed on to the {@link ModelProviderBase} constructor.
         */
        public CoreNlpCoreferenceModelProvider(Object aObject)
        {
            super(aObject, "stanfordnlp", "coref");
            setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp");
            setDefault(ARTIFACT_ID, "${groupId}.stanfordnlp-model-coref-${language}-${variant}");
            // The location points at one file of the model directory ("countries"); the
            // directory part of it is used as the base for all other resources.
            setDefault(LOCATION, "classpath:/${package}/lib/coref/${language}/${variant}/countries");
            setDefault(VARIANT, "default");
        }

        /**
         * Creates the coreference annotator for the model found at the given URL.
         *
         * @param aUrl
         *            URL of the resolved model location; only its directory part is used.
         * @return a new annotator configured from the component parameters.
         * @throws IOException
         *             declared by the overridden method.
         */
        @Override
        protected DeterministicCorefAnnotator produceResource(URL aUrl) throws IOException
        {
            String base = FilenameUtils.getFullPathNoEndSeparator(aUrl.toString()) + "/";

            // Loading gzipped files from URL is broken in CoreNLP
            // https://github.com/stanfordnlp/CoreNLP/issues/94
            // Therefore a second base is derived from the unresolved model location and turned
            // into a plain classpath resource path.
            String logicalBase = getModelLocation(getAggregatedProperties());
            logicalBase = FilenameUtils.getFullPathNoEndSeparator(logicalBase) + "/";
            // Only strip the scheme when it is actually there; cutting a fixed number of
            // characters from any other location would mangle the path or fail with a
            // StringIndexOutOfBoundsException.
            if (logicalBase.startsWith(CLASSPATH_PREFIX)) {
                logicalBase = logicalBase.substring(CLASSPATH_PREFIX.length());
            }

            return new DeterministicCorefAnnotator(buildCorefProperties(base, logicalBase));
        }

        /**
         * Assembles the dcoref properties from the component parameters and the resource bases.
         *
         * @param aBase
         *            URL-based directory prefix (ending in "/") for all resources except the
         *            gender/number map.
         * @param aLogicalBase
         *            directory prefix (ending in "/") used for the gzipped gender/number map.
         * @return the properties to construct the annotator with.
         */
        private Properties buildCorefProperties(String aBase, String aLogicalBase)
        {
            Properties props = new Properties();

            // General settings, mostly taken from the component parameters
            props.setProperty(Constants.ALLOW_REPARSING_PROP, String.valueOf(false));
            props.setProperty(Constants.SIEVES_PROP, sieves);
            props.setProperty(Constants.SCORE_PROP, String.valueOf(score));
            props.setProperty(Constants.POSTPROCESSING_PROP, String.valueOf(postprocessing));
            props.setProperty(Constants.SINGLETON_PROP, String.valueOf(singleton));
            props.setProperty(Constants.SINGLETON_MODEL_PROP, aBase + "singleton.predictor.ser");
            props.setProperty(Constants.MAXDIST_PROP, String.valueOf(maxdist));
            // NOTE(review): Constants.BIG_GENDER_NUMBER_PROP is not set here.
            props.setProperty(Constants.REPLICATECONLL_PROP, "false");
            props.setProperty(Constants.CONLL_SCORER, Constants.conllMentionEvalScript);

            // Cf. edu.stanford.nlp.dcoref.Dictionaries.Dictionaries(Properties): each of the
            // following properties names a dictionary resource; all of them are set explicitly
            // so that they point into the model directory.
            props.setProperty(Constants.DEMONYM_PROP, aBase + "demonyms.txt");
            props.setProperty(Constants.ANIMATE_PROP, aBase + "animate.unigrams.txt");
            props.setProperty(Constants.INANIMATE_PROP, aBase + "inanimate.unigrams.txt");
            props.setProperty(Constants.MALE_PROP, aBase + "male.unigrams.txt");
            props.setProperty(Constants.NEUTRAL_PROP, aBase + "neutral.unigrams.txt");
            props.setProperty(Constants.FEMALE_PROP, aBase + "female.unigrams.txt");
            props.setProperty(Constants.PLURAL_PROP, aBase + "plural.unigrams.txt");
            props.setProperty(Constants.SINGULAR_PROP, aBase + "singular.unigrams.txt");
            props.setProperty(Constants.STATES_PROP, aBase + "state-abbreviations.txt");
            // Gzipped resource: uses the logical base instead of the URL (see produceResource)
            props.setProperty(Constants.GENDER_NUMBER_PROP, aLogicalBase + "gender.map.ser.gz");
            props.setProperty(Constants.COUNTRIES_PROP, aBase + "countries");
            props.setProperty(Constants.STATES_PROVINCES_PROP, aBase + "statesandprovinces");

            // The following properties are only relevant if the "CorefDictionaryMatch" sieve
            // is enabled.
            String dictList = "[" + aBase + "coref.dict1.tsv"
                    + "," + aBase + "coref.dict2.tsv"
                    + "," + aBase + "coref.dict3.tsv"
                    + "," + aBase + "coref.dict4.tsv" + "]";
            props.setProperty(Constants.DICT_LIST_PROP, dictList);
            props.setProperty(Constants.DICT_PMI_PROP, aBase + "coref.dict1.tsv");
            props.setProperty(Constants.SIGNATURES_PROP, aBase + "ne.signatures.txt");

            return props;
        }
    }
}