/** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package de.tudarmstadt.ukp.dkpro.core.lingpipe; import static java.util.Arrays.asList; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.toText; import static org.apache.uima.util.Level.INFO; import java.io.InputStream; import java.io.ObjectInputStream; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.reflect.FieldUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import com.aliasi.chunk.AbstractCharLmRescoringChunker; import com.aliasi.chunk.Chunk; import com.aliasi.chunk.Chunker; import com.aliasi.chunk.Chunking; import com.aliasi.chunk.HmmChunker; import com.aliasi.chunk.TokenShapeChunker; import com.aliasi.hmm.HiddenMarkovModel; import com.aliasi.symbol.SymbolTable; import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * LingPipe named entity recognizer. */ @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" }) public class LingPipeNamedEntityRecognizer extends JCasAnnotator_ImplBase { /** * Log the tag set(s) when a model is loaded. */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false") protected boolean printTagSet; /** * Use this language instead of the document language to resolve the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Variant of a model the model. Used to address a specific model if here are multiple models * for one language. */ public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; /** * Location from which the model is read. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; /** * Location of the mapping file for named entity tags to UIMA types. */ public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) protected String mappingLocation; private ModelProviderBase<Chunker> modelProvider; private MappingProvider mappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); modelProvider = new ModelProviderBase<Chunker>(this, "lingpipe", "ner") { @Override protected Chunker produceResource(InputStream aStream) throws Exception { ObjectInputStream ois = new ObjectInputStream(aStream); Chunker chunker = (Chunker) ois.readObject(); System.out.println(chunker.getClass()); SingletonTagset tags = new SingletonTagset(NamedEntity.class, null); if (chunker instanceof HmmChunker) { HiddenMarkovModel hmm = ((HmmChunker) chunker).getDecoder().getHmm(); List<String> prefixes = asList("B_", "M_", "E_", "W_", "BB_O_", "EE_O_", "WW_O_"); for (int n = 0; n < hmm.stateSymbolTable().numSymbols(); n++) { String tag = hmm.stateSymbolTable().idToSymbol(n); if (prefixes.contains(StringUtils.substring(tag, 0, 5))) { tag = tag.substring(5); } else if (prefixes.contains(StringUtils.substring(tag, 0, 2))) { tag = tag.substring(2); } if ("BOS".equals(tag) || "MM_O".equals(tag)) { // BOS is reserved by the system continue; } tags.add(tag); } } else if (chunker instanceof TokenShapeChunker) { Object decoder = FieldUtils.readField(chunker, "mDecoder", true); Object estimator = FieldUtils.readField(decoder, "mEstimator", true); SymbolTable tagTable = (SymbolTable) FieldUtils.readField(estimator, "mTagSymbolTable", true); for (int n = 0; n < tagTable.numSymbols(); n++) { String tag = tagTable.idToSymbol(n); // Handle BIO encoding if (tag.startsWith("B-") || tag.startsWith("I-")) { tag = tag.substring(2); } if ("O".equals(tag)) { continue; } tags.add(tag); } } else if (chunker instanceof AbstractCharLmRescoringChunker) { @SuppressWarnings("unchecked") Map<String, Character> typeToChar = (Map<String, Character>) FieldUtils .readField(chunker, "mTypeToChar", true); for (String tag : typeToChar.keySet()) { tags.add(tag); } } addTagset(tags); if (printTagSet) { getContext().getLogger().log(INFO, tags.toString()); } return chunker; } }; mappingProvider = new MappingProvider(); mappingProvider.setDefaultVariantsLocation( "de/tudarmstadt/ukp/dkpro/core/lingpipe/lib/ner-default-variants.map"); mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" + "core/lingpipe/lib/ner-${language}-${variant}.map"); mappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName()); mappingProvider.setOverride(MappingProvider.LOCATION, mappingLocation); mappingProvider.setOverride(MappingProvider.LANGUAGE, language); mappingProvider.setOverride(MappingProvider.VARIANT, variant); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); modelProvider.configure(cas); mappingProvider.configure(cas); // get the document text List<Token> tokenList = new ArrayList<Token>(select(aJCas, Token.class)); String[] tokens = toText(tokenList).toArray(new String[tokenList.size()]); Chunking chunking = modelProvider.getResource().chunk(cas.getDocumentText()); // get the named entities and their character offsets for (Chunk namedEntity : chunking.chunkSet()) { Type type = mappingProvider.getTagType(namedEntity.type()); NamedEntity neAnno = (NamedEntity) cas.createAnnotation(type, namedEntity.start(), namedEntity.end()); neAnno.setValue(namedEntity.type()); neAnno.addToIndexes(); } } }