/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.stanfordnlp;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.util.Level.INFO;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.util.CoreMap;
/**
 * Stanford Named Entity Recognizer component.
 * <p>
 * For every {@link Sentence} in the CAS, the covered {@link Token}s are converted to Stanford
 * words, optionally PTB-escaped, and classified by the sequence classifier resolved through the
 * model provider. Every maximal run of consecutive tokens that received the same tag other than
 * {@code "O"} is turned into one {@link NamedEntity} annotation (or a subtype chosen by the
 * mapping provider).
 */
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" })
public class StanfordNamedEntityRecognizer
    extends JCasAnnotator_ImplBase
{
    /**
     * Tag used by the CRF-NER for tokens that are not part of any named entity.
     */
    private static final String OUTSIDE_TAG = "O";

    /**
     * Log the tag set(s) when a model is loaded.
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
    protected boolean printTagSet;

    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Variant of the model. Used to address a specific model if there are multiple models for
     * one language.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Location from which the model is read.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * Location of the mapping file for named entity tags to UIMA types.
     */
    public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false)
    protected String mappingLocation;

    /**
     * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-).
     *
     * @see PTBEscapingProcessor
     */
    public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping";
    @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true")
    private boolean ptb3Escaping;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * opening quotes and escaped accordingly before being sent to the classifier. Only used when
     * {@link #PARAM_PTB3_ESCAPING} is enabled.
     */
    public static final String PARAM_QUOTE_BEGIN = "quoteBegin";
    @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false)
    private List<String> quoteBegin;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * closing quotes and escaped accordingly before being sent to the classifier. Only used when
     * {@link #PARAM_PTB3_ESCAPING} is enabled.
     */
    public static final String PARAM_QUOTE_END = "quoteEnd";
    @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false)
    private List<String> quoteEnd;

    private StanfordNlpNamedEntityRecognizerModelProvider modelProvider;
    private MappingProvider mappingProvider;

    /**
     * Sets up the model provider and the tag-to-type mapping provider.
     * <p>
     * The mapping provider defaults to a classpath mapping file selected by language and variant
     * and to {@link NamedEntity} as base type; location, language and variant are overridden by
     * the corresponding component parameters. Tag mappings declared by the model are imported
     * under the {@code "ner"} prefix.
     *
     * @param aContext
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the base class initialization fails.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        modelProvider = new StanfordNlpNamedEntityRecognizerModelProvider(this);

        mappingProvider = new MappingProvider();
        mappingProvider
                .setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-default-variants.map");
        mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/"
                + "core/stanfordnlp/lib/ner-${language}-${variant}.map");
        mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName());
        mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation);
        mappingProvider.setOverride(MappingProvider.LANGUAGE, language);
        mappingProvider.setOverride(MappingProvider.VARIANT, variant);
        mappingProvider.addTagMappingImport("ner", modelProvider);
    }

    /**
     * Classifies the tokens of every sentence and adds one named entity annotation per run of
     * consecutive tokens sharing the same non-{@code "O"} tag. Entities never span sentence
     * boundaries because the tracking state is reset for each sentence.
     *
     * @param aJCas
     *            the CAS to process; must contain {@link Sentence} and {@link Token} annotations.
     * @throws AnalysisEngineProcessException
     *             declared by the framework contract.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();
        modelProvider.configure(cas);
        mappingProvider.configure(cas);

        for (Sentence sentence : select(aJCas, Sentence.class)) {
            List<Token> tokens = selectCovered(aJCas, Token.class, sentence);
            List<HasWord> words = new ArrayList<>(tokens.size());
            for (Token token : tokens) {
                words.add(CoreNlpUtils.tokenToWord(token));
            }

            if (ptb3Escaping) {
                words = CoreNlpUtils.applyPtbEscaping(words, quoteBegin, quoteEnd);
            }

            List<CoreMap> taggedWords = modelProvider.getResource().classifySentence(words);

            // State of the entity currently being assembled; entityType == null means "none".
            int entityBegin = -1;
            int entityEnd = -1;
            String entityType = null;

            for (CoreMap taggedWord : taggedWords) {
                String tokenType = mappingProvider
                        .getTag(taggedWord.get(CoreAnnotations.AnswerAnnotation.class));

                // A token without any tag is treated like an "O" token instead of failing with
                // a NullPointerException in the comparison below.
                boolean outside = tokenType == null || OUTSIDE_TAG.equals(tokenType);

                // If an entity is currently open and this token does not continue it, close it
                if (entityType != null && (outside || !tokenType.equals(entityType))) {
                    addNamedEntity(cas, entityType, entityBegin, entityEnd);
                    entityType = null;
                }

                // If a new entity starts or continues, track it
                if (!outside) {
                    if (entityType == null) {
                        entityType = tokenType;
                        entityBegin = taggedWord
                                .get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                    }
                    entityEnd = taggedWord.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                }
            }

            // If the last entity is still open at the end of the sentence, then close it
            if (entityType != null) {
                addNamedEntity(cas, entityType, entityBegin, entityEnd);
            }
        }
    }

    /**
     * Creates a named entity annotation of the UIMA type mapped to the given tag, stores the tag
     * as its value and adds it to the CAS indexes.
     *
     * @param aCas
     *            the CAS in which the annotation is created.
     * @param aEntityType
     *            the (already mapped) entity tag.
     * @param aBegin
     *            begin character offset of the entity.
     * @param aEnd
     *            end character offset of the entity.
     */
    private void addNamedEntity(CAS aCas, String aEntityType, int aBegin, int aEnd)
    {
        Type type = mappingProvider.getTagType(aEntityType);
        NamedEntity neAnno = (NamedEntity) aCas.createAnnotation(type, aBegin, aEnd);
        neAnno.setValue(aEntityType);
        neAnno.addToIndexes();
    }

    /**
     * Model provider which loads a Stanford CRF classifier and registers the tag set found in
     * the model. This is deliberately a non-static inner class: it reads {@code printTagSet} and
     * the UIMA context of the enclosing component.
     */
    private class StanfordNlpNamedEntityRecognizerModelProvider
        extends ModelProviderBase<AbstractSequenceClassifier<CoreMap>>
    {
        /**
         * @param aObject
         *            the component whose parameters configure this provider.
         */
        public StanfordNlpNamedEntityRecognizerModelProvider(Object aObject)
        {
            super(aObject, "stanfordnlp", "ner");
            setDefault(LOCATION,
                    "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-${language}-${variant}.properties");
        }

        /**
         * Loads the classifier from the given URL (gunzipping it if the URL ends in
         * {@code .gz}) and records its tag set.
         * <p>
         * The tag set name is taken from the {@code ner.tagset} metadata property and falls back
         * to {@code "unknown"}. Each class label of the model is translated through an optional
         * {@code ner.tag.map.<label>} metadata property; the {@code "O"} label is not part of
         * the recorded tag set.
         *
         * @param aUrl
         *            location of the serialized classifier.
         * @return the loaded classifier.
         * @throws IOException
         *             if the model cannot be read or a class needed for deserialization is
         *             missing.
         */
        @Override
        protected AbstractSequenceClassifier<CoreMap> produceResource(URL aUrl)
            throws IOException
        {
            Properties metadata = getResourceMetaData();

            InputStream is = null;
            try {
                is = aUrl.openStream();

                // Transparently decompress gzipped models
                if (aUrl.toString().endsWith(".gz")) {
                    is = new GZIPInputStream(is);
                }

                // The classifier is only used to read results as CoreMap, which is the upper
                // bound of the element type of any CRF classifier.
                @SuppressWarnings("unchecked")
                AbstractSequenceClassifier<CoreMap> classifier = (AbstractSequenceClassifier<CoreMap>)
                        CRFClassifier.getClassifier(is);

                String tagsetName = metadata.getProperty("ner.tagset");
                if (tagsetName == null) {
                    tagsetName = "unknown";
                }

                SingletonTagset tsdp = new SingletonTagset(NamedEntity.class, tagsetName);
                for (String tag : classifier.classIndex) {
                    String mapped = metadata.getProperty("ner.tag.map." + tag);
                    String finalTag = mapped != null ? mapped : tag;

                    // "O" has a special meaning in the CRF-NER: not a named entity
                    if (!OUTSIDE_TAG.equals(finalTag)) {
                        tsdp.add(finalTag);
                    }
                }
                addTagset(tsdp);

                if (printTagSet) {
                    getContext().getLogger().log(INFO, tsdp.toString());
                }

                return classifier;
            }
            catch (ClassNotFoundException e) {
                throw new IOException(e);
            }
            finally {
                // Best effort: a failure while closing must not invalidate a loaded model
                closeQuietly(is);
            }
        }
    }
}