/** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package de.tudarmstadt.ukp.dkpro.core.lingpipe; import static java.util.Arrays.asList; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.toText; import static org.apache.uima.util.Level.INFO; import java.io.InputStream; import java.io.ObjectInputStream; import java.util.List; import java.util.Locale; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import com.aliasi.hmm.HiddenMarkovModel; import com.aliasi.hmm.HmmDecoder; import com.aliasi.tag.Tagging; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * LingPipe part-of-speech tagger. */ @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) public class LingPipePosTagger extends JCasAnnotator_ImplBase { /** * Use this language instead of the document language to resolve the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Override the default variant used to locate the model. */ public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) protected String variant; /** * Load the model from this location instead of locating the model automatically. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) protected String modelLocation; /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid * spaming the heap with thousands of strings representing only a few different tags. * * Default: {@code true} */ public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") private boolean internTags; /** * Log the tag set(s) when a model is loaded. * * Default: {@code false} */ public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET; @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false") protected boolean printTagSet; /** * Lingpipe models tend to be trained on lower-case tags, but our POS mappings use uppercase. */ public static final String PARAM_UPPERCASE_TAGS = "uppercaseTags"; @ConfigurationParameter(name = PARAM_UPPERCASE_TAGS, mandatory = true, defaultValue="true") protected boolean uppercaseTags; private CasConfigurableProviderBase<HmmDecoder> modelProvider; private MappingProvider mappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); modelProvider = new ModelProviderBase<HmmDecoder>(this, "lingpipe", "tagger") { @Override protected HmmDecoder produceResource(InputStream aStream) throws Exception { ObjectInputStream ois = new ObjectInputStream(aStream); HiddenMarkovModel hmm = (HiddenMarkovModel) ois.readObject(); SingletonTagset tags = new SingletonTagset(POS.class, getResourceMetaData() .getProperty(("pos.tagset"))); for (int n = 0; n < hmm.stateSymbolTable().numSymbols(); n++) { String tag = hmm.stateSymbolTable().idToSymbol(n); if (uppercaseTags) { tag = tag.toUpperCase(Locale.US); } tags.add(tag); } addTagset(tags); if (printTagSet) { getContext().getLogger().log(INFO, getTagset().toString()); } return new HmmDecoder(hmm); } }; mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, language, modelProvider); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); modelProvider.configure(cas); mappingProvider.configure(cas); for (Sentence sentence : select(aJCas, Sentence.class)) { List<Token> tokens = selectCovered(aJCas, Token.class, sentence); String[] tokenTexts = toText(tokens).toArray(new String[tokens.size()]); Tagging<String> tagging = modelProvider.getResource().tag(asList(tokenTexts)); for (int n = 0; n < tagging.size(); n++) { Token t = tokens.get(n); String tag = tagging.tag(n); if (uppercaseTags) { tag = tag.toUpperCase(Locale.US); } Type posTag = mappingProvider.getTagType(tag); POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd()); posAnno.setPosValue(internTags ? tag.intern() : tag); posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null : posAnno.getType().getShortName().intern()); posAnno.addToIndexes(); t.setPos(posAnno); } } } }