/*
 * Licensed to the Technische Universität Darmstadt under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The Technische Universität Darmstadt
 * licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.dkpro.core.udpipe;

import static org.apache.uima.fit.util.JCasUtil.indexCovered;
import static org.apache.uima.fit.util.JCasUtil.select;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.Map;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.udpipe.internal.DKPro2UDPipe;
import org.dkpro.core.udpipe.internal.UDPipe2DKPro;
import org.dkpro.core.udpipe.internal.UDPipeUtils;

import cz.cuni.mff.ufal.udpipe.Model;
import cz.cuni.mff.ufal.udpipe.ProcessingError;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * Part-of-speech tagger, lemmatizer, and morphological analyzer backed by UDPipe. For this task
 * UDPipe relies on MorphoDiTa, a morphological dictionary and tagger.
 *
 * <p>Each {@link Sentence} of the document is handed to the UDPipe model together with the
 * {@link Token}s it covers; the tagging result is then written back to the CAS.</p>
 *
 * @see <a href="https://github.com/ufal/udpipe/tree/master/src/morphodita">MorphoDiTa in UDPipe</a>
 */
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures" })
public class UDPipePosTagger extends JCasAnnotator_ImplBase {

    /**
     * Language to use for resolving the model, taking precedence over the document language.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Model variant to use in place of the default variant when locating the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Explicit location of the model; when set, the model is not located automatically.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * Explicit location of the mapping from part-of-speech tags to UIMA types; when set, the
     * mapping is not located automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION =
            ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Whether to apply {@link String#intern()} to tags. This is usually advisable, since it
     * avoids filling the heap with thousands of strings that represent only a handful of
     * distinct tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    private ModelProviderBase<Model> modelProvider;
    private MappingProvider mappingProvider;

    /**
     * Sets up the model provider and the part-of-speech mapping provider.
     *
     * <p>The model provider defaults to the variant {@code ud} and to a classpath properties
     * location parameterized by language and variant. The location, language and variant
     * configuration parameters are registered as overrides of those defaults.</p>
     *
     * @param aContext
     *            the UIMA context this component is initialized with.
     * @throws ResourceInitializationException
     *             if the superclass initialization fails.
     */
    @Override
    public void initialize(UimaContext aContext) throws ResourceInitializationException {
        super.initialize(aContext);

        modelProvider = new ModelProviderBase<Model>() {
            {
                setContextObject(UDPipePosTagger.this);

                setDefault(LOCATION,
                        "classpath:/org/dkpro/core/udpipe/lib/tagger-${language}-${variant}.properties");
                setDefault(VARIANT, "ud");

                setOverride(LOCATION, modelLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);
            }

            /**
             * Loads the UDPipe model found at the given URL.
             */
            @Override
            protected Model produceResource(URL aUrl) throws IOException {
                UDPipeUtils.init();
                // Model.load() is given a file system path, so the URL is resolved to a file
                // first.
                // NOTE(review): the boolean argument presumably enables caching of that file -
                // confirm against ResourceUtils.getUrlAsFile.
                File localModel = ResourceUtils.getUrlAsFile(aUrl, true);
                String localModelPath = localModel.getAbsolutePath();
                return Model.load(localModelPath);
            }
        };

        mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                language, modelProvider);
    }

    /**
     * Tags every sentence of the document with UDPipe and stores the outcome in the CAS.
     *
     * @param aJCas
     *            the document to annotate.
     * @throws AnalysisEngineProcessException
     *             if UDPipe reports an error while tagging a sentence.
     */
    @Override
    public void process(JCas aJCas) throws AnalysisEngineProcessException {
        CAS documentCas = aJCas.getCas();

        // Both providers are configured per document. The model provider has to come first
        // because the mapping provider relies on the model metadata.
        modelProvider.configure(documentCas);
        mappingProvider.configure(documentCas);

        Map<Sentence, Collection<Token>> tokensBySentence =
                indexCovered(aJCas, Sentence.class, Token.class);

        for (Sentence current : select(aJCas, Sentence.class)) {
            tagSentence(aJCas, tokensBySentence.get(current));
        }
    }

    /**
     * Runs the UDPipe tagger over the tokens of a single sentence and converts the result back
     * into DKPro annotations.
     *
     * @param aJCas
     *            the document the tokens belong to.
     * @param aTokens
     *            the tokens covered by the sentence being processed.
     * @throws AnalysisEngineProcessException
     *             wrapping an {@link IllegalStateException} that carries the UDPipe error
     *             message, if UDPipe reports an error.
     */
    private void tagSentence(JCas aJCas, Collection<Token> aTokens)
            throws AnalysisEngineProcessException {
        cz.cuni.mff.ufal.udpipe.Sentence udSentence = new cz.cuni.mff.ufal.udpipe.Sentence();
        DKPro2UDPipe.convert(aTokens, udSentence);

        ProcessingError taggingError = new ProcessingError();
        Model model = modelProvider.getResource();
        model.tag(udSentence, Model.getDEFAULT(), taggingError);

        if (taggingError.occurred()) {
            IllegalStateException cause = new IllegalStateException(taggingError.getMessage());
            throw new AnalysisEngineProcessException(cause);
        }

        UDPipe2DKPro.convertPosLemmaMorph(udSentence, aTokens, aJCas, mappingProvider,
                internTags);
    }
}