/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.gate; import static java.util.Collections.singletonList; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.toText; import hepple.postag.InvalidRuleException; import hepple.postag.POSTagger; import java.io.IOException; import java.net.URL; import java.util.List; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import 
de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * GATE Hepple part-of-speech tagger.
 * <p>
 * For every {@link Sentence} in the CAS, the covered {@link Token}s are passed to a Hepple
 * {@link POSTagger}. For each token a {@link POS} annotation (or a subtype selected through the
 * tag mapping) is created, added to the indexes and attached to the token.
 * <p>
 * The underlying tagger is created lazily and kept between calls to {@link #process(JCas)}. It is
 * only re-created when the resolved lexicon or ruleset location changes.
 */
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" })
public class HepplePosTagger
    extends JCasAnnotator_ImplBase
{
    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Override the default variant used to locate the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Load the lexicon from this location instead of locating it automatically.
     */
    public static final String PARAM_LEXICON_LOCATION = "lexiconLocation";
    @ConfigurationParameter(name = PARAM_LEXICON_LOCATION, mandatory = false)
    protected String lexiconLocation;

    /**
     * Load the ruleset from this location instead of locating it automatically.
     */
    public static final String PARAM_RULESET_LOCATION = "rulesetLocation";
    @ConfigurationParameter(name = PARAM_RULESET_LOCATION, mandatory = false)
    protected String rulesetLocation;

    /**
     * Load the part-of-speech tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false")
    protected boolean printTagSet;

    /** Resolves the location of the ruleset. */
    private CasConfigurableProviderBase<URL> ruleProvider;

    /** Resolves the location of the lexicon. */
    private CasConfigurableProviderBase<URL> lexiconProvider;

    /** Maps tagger output tags to UIMA annotation types. */
    private MappingProvider mappingProvider;

    /** Tagger kept between {@link #process(JCas)} calls; {@code null} until first use. */
    private POSTagger cachedTagger;

    /** String form of the lexicon location {@link #cachedTagger} was built from. */
    private String cachedLexiconLocation;

    /** String form of the ruleset location {@link #cachedTagger} was built from. */
    private String cachedRulesetLocation;

    /**
     * Sets up the providers for the ruleset, the lexicon and the tag mapping. Both resource
     * providers default to the {@code annie} variant and honor the location, language and variant
     * parameters of this component as overrides.
     *
     * @param aContext
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the initialization of the super class fails.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        ruleProvider = new CasConfigurableProviderBase<URL>() {
            {
                setContextObject(HepplePosTagger.this);

                setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/gate/lib/" +
                        "tagger/${language}/${variant}/ruleset");
                setDefault(VARIANT, "annie");

                setOverride(LOCATION, rulesetLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);
            }

            @Override
            protected URL produceResource(URL aUrl) throws IOException
            {
                // The tagger loads the data itself, so the location is simply passed through.
                return aUrl;
            }
        };

        lexiconProvider = new CasConfigurableProviderBase<URL>() {
            {
                // Same context object as for the rule provider so that both providers are
                // configured consistently.
                setContextObject(HepplePosTagger.this);

                setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/gate/lib/" +
                        "tagger/${language}/${variant}/lexicon");
                setDefault(VARIANT, "annie");

                setOverride(LOCATION, lexiconLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);
            }

            @Override
            protected URL produceResource(URL aUrl) throws IOException
            {
                // The tagger loads the data itself, so the location is simply passed through.
                return aUrl;
            }
        };

        mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                language, ruleProvider);
    }

    /**
     * Tags all tokens of all sentences in the given CAS and attaches the resulting {@link POS}
     * annotations to the tokens.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if the tagger cannot be created from the resolved lexicon and ruleset.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();

        lexiconProvider.configure(cas);
        ruleProvider.configure(cas);
        mappingProvider.configure(cas);

        POSTagger posTagger = obtainTagger();

        for (Sentence sentence : select(aJCas, Sentence.class)) {
            List<Token> tokens = selectCovered(aJCas, Token.class, sentence);
            if (tokens.isEmpty()) {
                // Nothing to annotate in this sentence.
                continue;
            }

            List<String> tokenTexts = toText(tokens);

            // The tagger API is not generic. Each entry of the per-sentence result is read as an
            // array whose element at index 1 is the tag of the corresponding token.
            @SuppressWarnings("unchecked")
            List<String[]> tagged = (List<String[]>) posTagger
                    .runTagger(singletonList(tokenTexts)).get(0);

            int i = 0;
            for (Token t : tokens) {
                String tag = tagged.get(i)[1];

                Type posTag = mappingProvider.getTagType(tag);
                POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(), t.getEnd());
                posAnno.setPosValue(internTags ? tag.intern() : tag);
                // No coarse value is set if the tag was mapped to the generic POS type itself.
                posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null
                        : posAnno.getType().getShortName().intern());
                posAnno.addToIndexes();
                t.setPos(posAnno);
                i++;
            }
        }
    }

    /**
     * Returns a tagger for the currently resolved lexicon and ruleset. The tagger from the
     * previous call is reused as long as both locations are unchanged.
     *
     * @return the tagger to use for the current CAS.
     * @throws AnalysisEngineProcessException
     *             if the tagger cannot be created.
     */
    private POSTagger obtainTagger()
        throws AnalysisEngineProcessException
    {
        try {
            URL lexicon = lexiconProvider.getResource();
            URL ruleset = ruleProvider.getResource();

            // Compare the string forms: URL.equals() may perform blocking host name resolution.
            String lexiconKey = String.valueOf(lexicon);
            String rulesetKey = String.valueOf(ruleset);

            if (cachedTagger == null || !lexiconKey.equals(cachedLexiconLocation)
                    || !rulesetKey.equals(cachedRulesetLocation)) {
                POSTagger fresh = new POSTagger(lexicon, ruleset, "UTF-8");
                // Only update the cache once the tagger has been created successfully.
                cachedTagger = fresh;
                cachedLexiconLocation = lexiconKey;
                cachedRulesetLocation = rulesetKey;
            }

            return cachedTagger;
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
        catch (InvalidRuleException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }
}