/** * Copyright 2007-2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package de.tudarmstadt.ukp.dkpro.core.matetools; import is2.data.SentenceData09; import is2.io.CONLLReader09; import is2.lemmatizer.Lemmatizer; import java.io.File; import java.io.IOException; import java.net.URL; import java.util.LinkedList; import java.util.List; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase; import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * DKPro Annotator for the MateToolsLemmatizer. */ @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = {"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}) public class MateLemmatizer extends JCasAnnotator_ImplBase { /** * Use this language instead of the document language to resolve the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) private String language; /** * Override the default variant used to locate the model. */ public static final String PARAM_VARIANT = "variant"; @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false) private String variant; /** * Load the model from this location instead of locating the model automatically. */ public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION; @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false) private String modelLocation; /** * Try reconstructing proper casing for lemmata. This is useful for German, but e.g. for * English creates odd results. */ public static final String PARAM_UPPERCASE = "uppercase"; @ConfigurationParameter(name = PARAM_UPPERCASE, mandatory = true, defaultValue="false") private boolean uppercase; private CasConfigurableProviderBase<Lemmatizer> modelProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); modelProvider = new ModelProviderBase<Lemmatizer>(this, "matetools", "lemmatizer") { @Override protected Lemmatizer produceResource(URL aUrl) throws IOException { File modelFile = ResourceUtils.getUrlAsFile(aUrl, true); return new Lemmatizer(modelFile.getPath(), uppercase); // create a lemmatizer } }; } @Override public void process(JCas jcas) throws AnalysisEngineProcessException { CAS cas = jcas.getCas(); modelProvider.configure(cas); for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) { List<Token> tokens = JCasUtil.selectCovered(Token.class, sentence); List<String> forms = new LinkedList<String>(); forms.add(CONLLReader09.ROOT); forms.addAll(JCasUtil.toText(tokens)); SentenceData09 sd = new SentenceData09(); sd.init(forms.toArray(new String[0])); String[] lemmas = modelProvider.getResource().apply(sd).plemmas; for (int i = 0; i < lemmas.length; i++) { Token token = tokens.get(i); if (lemmas[i] == null) { lemmas[i] = token.getCoveredText(); } Lemma lemma = new Lemma(jcas, token.getBegin(), token.getEnd()); lemma.setValue(lemmas[i]); lemma.addToIndexes(); token.setLemma(lemma); } } } }