/**
 * Copyright 2007-2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.core.corenlp;

import static org.apache.uima.util.Level.INFO;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.DKPro2CoreNlp;
import de.tudarmstadt.ukp.dkpro.core.corenlp.internal.CoreNlp2DKPro;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ie.ner.CMMClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.NERCombinerAnnotator;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.util.ErasureUtils;
import edu.stanford.nlp.util.StringUtils;

/**
 * Named entity recognizer from CoreNLP.
 * <p>
 * The component converts the tokens and sentences of the CAS into a CoreNLP {@link Annotation},
 * runs a {@link NERCombinerAnnotator} on it and writes the result back into the CAS as
 * {@link NamedEntity} annotations.
 */
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" })
public class CoreNlpNamedEntityRecognizer
    extends JCasAnnotator_ImplBase
{
    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false")
    protected boolean printTagSet;

    /**
     * Use this language instead of the document language to resolve the model and tag set mapping.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    private String language;

    /**
     * Variant of the model. Used to address a specific model if there are multiple models
     * for one language.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    private String variant;

    /**
     * Location from which the model is read.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    private String modelLocation;

    /**
     * The character encoding used by the model.
     */
    public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false)
    private String modelEncoding;

    /**
     * Location of the mapping file for named entity tags to UIMA types.
     */
    public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false)
    private String mappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internStrings;

    /**
     * Maximum sentence length handed to the {@link NERCombinerAnnotator}. How longer sentences
     * are treated is decided by CoreNLP.
     *
     * Default: {@code 2147483647} ({@link Integer#MAX_VALUE})
     */
    public static final String PARAM_MAX_SENTENCE_LENGTH = ComponentParameters.PARAM_MAX_SENTENCE_LENGTH;
    @ConfigurationParameter(name = PARAM_MAX_SENTENCE_LENGTH, mandatory = true, defaultValue = "2147483647")
    private int maxSentenceLength;

    /**
     * Maximum processing time handed to the {@link NERCombinerAnnotator}. The unit and the
     * meaning of negative values are defined by CoreNLP (presumably {@code -1} disables the
     * limit; verify against the CoreNLP documentation).
     *
     * Default: {@code -1}
     */
    public static final String PARAM_MAX_TIME = "maxTime";
    @ConfigurationParameter(name = PARAM_MAX_TIME, mandatory = true, defaultValue = "-1")
    private int maxTime;

    /**
     * Number of threads handed to the {@link NERCombinerAnnotator}. The configured value is
     * resolved through {@link ComponentParameters#computeNumThreads(int)} during initialization.
     *
     * Default: {@link ComponentParameters#AUTO_NUM_THREADS}
     */
    public static final String PARAM_NUM_THREADS = ComponentParameters.PARAM_NUM_THREADS;
    @ConfigurationParameter(name = PARAM_NUM_THREADS, mandatory = true, defaultValue = ComponentParameters.AUTO_NUM_THREADS)
    private int numThreads;

    /**
     * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-).
     *
     * Default: {@code true}
     *
     * @see PTBEscapingProcessor
     */
    public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping";
    @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true")
    private boolean ptb3Escaping;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * opening quotes and escaped accordingly before being sent to the named entity recognizer.
     */
    public static final String PARAM_QUOTE_BEGIN = "quoteBegin";
    @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false)
    private List<String> quoteBegin;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * closing quotes and escaped accordingly before being sent to the named entity recognizer.
     */
    public static final String PARAM_QUOTE_END = "quoteEnd";
    @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false)
    private List<String> quoteEnd;

    /**
     * Passed to the {@link NERClassifierCombiner} as its "apply numeric classifiers" switch.
     *
     * Default: {@code true}
     *
     * @see NERClassifierCombiner#APPLY_NUMERIC_CLASSIFIERS_DEFAULT
     */
    public static final String PARAM_APPLY_NUMERIC_CLASSIFIERS = "applyNumericClassifiers";
    @ConfigurationParameter(name = PARAM_APPLY_NUMERIC_CLASSIFIERS, mandatory = true, defaultValue="true")
    boolean applyNumericClassifiers;

    // FIXME Using USE_SUTIME_DEFAULT autodetects presence of SUTime. Need three values here:
    // on, off, auto
    /**
     * Passed to the {@link NERClassifierCombiner} as its "use SUTime" switch.
     *
     * Default: {@code false}
     */
    public static final String PARAM_USE_SUTIME = "useSUTime";
    @ConfigurationParameter(name = PARAM_USE_SUTIME, mandatory = true, defaultValue="false")
    boolean useSUTime; // = NumberSequenceClassifier.USE_SUTIME_DEFAULT;

    /**
     * Passed to the {@link NERClassifierCombiner} as its "augment regex NER" switch.
     *
     * Default: {@code false}
     */
    public static final String PARAM_AUGMENT_REGEX_NER = "augmentRegexNER";
    @ConfigurationParameter(name = PARAM_AUGMENT_REGEX_NER, mandatory = true, defaultValue="false")
    boolean augmentRegexNER; // = NERClassifierCombiner.APPLY_GAZETTE_PROPERTY;

    // Verbosity flag forwarded to the NERCombinerAnnotator; not exposed as a parameter.
    boolean verbose = false;

    private ModelProviderBase<NERCombinerAnnotator> annotatorProvider;
    private MappingProvider mappingProvider;

    /**
     * Sets up the model provider and the named entity mapping provider and resolves the
     * effective number of threads.
     *
     * @param aContext
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the super class fails to initialize.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        annotatorProvider = new CoreNlpNamedEntityRecognizerModelProvider(this);

        mappingProvider = new MappingProvider();
        mappingProvider
                .setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/corenlp/lib/ner-default-variants.map");
        mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/"
                + "core/corenlp/lib/ner-${language}-${variant}.map");
        mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName());
        // Explicitly configured parameters take precedence over the defaults above.
        mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation);
        mappingProvider.setOverride(MappingProvider.LANGUAGE, language);
        mappingProvider.setOverride(MappingProvider.VARIANT, variant);

        numThreads = ComponentParameters.computeNumThreads(numThreads);
    }

    /**
     * Runs the named entity recognizer on the given CAS.
     *
     * @param aJCas
     *            the CAS to process; results are added to it.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();

        annotatorProvider.configure(cas);
        mappingProvider.configure(cas);

        // Transfer from CAS to CoreNLP
        DKPro2CoreNlp converter = new DKPro2CoreNlp();
        converter.setPtb3Escaping(ptb3Escaping);
        converter.setQuoteBegin(quoteBegin);
        converter.setQuoteEnd(quoteEnd);
        converter.setEncoding(modelEncoding);

        Annotation document = new Annotation((String) null);
        converter.convert(aJCas, document);

        // Actual processing
        annotatorProvider.getResource().annotate(document);

        // Transfer back into the CAS
        CoreNlp2DKPro.convertNamedEntities(aJCas, document, mappingProvider, internStrings);
    }

    /**
     * Model provider that loads a serialized sequence classifier and wraps it into a
     * {@link NERCombinerAnnotator} configured from the parameters of the enclosing component.
     */
    private class CoreNlpNamedEntityRecognizerModelProvider
        extends ModelProviderBase<NERCombinerAnnotator>
    {
        public CoreNlpNamedEntityRecognizerModelProvider(Object aObject)
        {
            super(aObject, "stanfordnlp", "ner");
            // setDefault(PACKAGE, "de/tudarmstadt/ukp/dkpro/core/stanfordnlp");
            setDefault(LOCATION,
                    "classpath:/de/tudarmstadt/ukp/dkpro/core/stanfordnlp/lib/ner-${language}-${variant}.properties");
        }

        /**
         * Loads the classifier at the given URL, first as a {@link CRFClassifier} and, if that
         * fails, as a {@link CMMClassifier}. URLs ending in {@code .gz} are decompressed on the
         * fly.
         *
         * @param aUrl
         *            the location of the serialized classifier.
         * @return an annotator wrapping the loaded classifier.
         * @throws IOException
         *             if the model can be loaded with neither classifier type. The failure of
         *             the first attempt is attached as cause, the failure of the second attempt
         *             as suppressed exception.
         */
        @Override
        protected NERCombinerAnnotator produceResource(URL aUrl)
            throws IOException
        {
            final boolean gzipped = aUrl.toString().endsWith(".gz");

            AbstractSequenceClassifier<CoreLabel> classifier = null;
            Exception crfFailure = null;
            Exception cmmFailure = null;

            // Try loading as a CRFClassifier. The model type cannot be told from the URL, so any
            // failure here is recorded and the next classifier type is tried. The (optional)
            // GZIP wrapper is a managed resource of its own so that its inflater is released;
            // for plain models both resources are the same stream and the second close is a
            // no-op.
            try (InputStream is = aUrl.openStream();
                    InputStream zis = gzipped ? new GZIPInputStream(is) : is) {
                classifier = ErasureUtils.uncheckedCast(CRFClassifier.getClassifier(zis));
            }
            catch (Exception e) {
                crfFailure = e;
            }

            // Try loading as a CMMClassifier
            if (classifier == null) {
                try (InputStream is = aUrl.openStream();
                        InputStream zis = gzipped ? new GZIPInputStream(is) : is) {
                    classifier = ErasureUtils.uncheckedCast(CMMClassifier.getClassifier(zis));
                }
                catch (Exception e) {
                    cmmFailure = e;
                }
            }

            if (classifier == null) {
                getLogger().error("Unable to load as CRFClassifier", crfFailure);
                getLogger().error("Unable to load as CMMClassifier", cmmFailure);

                // Keep the root causes on the exception as well, so callers that do not have
                // access to the log can still diagnose the problem.
                Exception primary = crfFailure != null ? crfFailure : cmmFailure;
                IOException failure = new IOException(
                        "Unable to load model - see log for details.", primary);
                if (cmmFailure != null && cmmFailure != primary) {
                    failure.addSuppressed(cmmFailure);
                }
                throw failure;
            }

            if (printTagSet) {
                StringBuilder sb = new StringBuilder();
                sb.append("Model contains [").append(classifier.classIndex.size())
                        .append("] tags: ");

                // Sort the tags to get a stable, readable log message.
                List<String> tags = new ArrayList<>();
                for (String t : classifier.classIndex) {
                    tags.add(t);
                }

                Collections.sort(tags);
                sb.append(StringUtils.join(tags, " "));
                getContext().getLogger().log(INFO, sb.toString());
            }

            NERClassifierCombiner combiner = new NERClassifierCombiner(applyNumericClassifiers,
                    useSUTime, augmentRegexNER, classifier);

            NERCombinerAnnotator annotator = new NERCombinerAnnotator(combiner, verbose,
                    numThreads, maxTime, maxSentenceLength);

            return annotator;
        }
    }
}