/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.languagetool; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import static org.apache.uima.fit.util.JCasUtil.toText; import java.io.IOException; import java.util.List; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.LanguageCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.languagetool.AnalyzedSentence; import org.languagetool.AnalyzedToken; import org.languagetool.AnalyzedTokenReadings; import org.languagetool.Language; import org.languagetool.Languages; import de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * Naive lexicon-based lemmatizer. The words are looked up using the wordform lexicons of * LanguageTool. Multiple readings are produced. The annotator simply takes the most frequent * lemma from those readings. If no readings could be found, the original text is assigned as * lemma. */ @LanguageCapability({ "en", "fa", "fr", "de", "pl", "ca", "it", "br", "nl", "pt", "ru", "be", "zh", "da", "eo", "gl", "el", "is", "ja", "km", "lt", "ml", "ro", "sk", "sl", "es", "sv", "ta", "tl", "uk" }) @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" }) public class LanguageToolLemmatizer extends JCasAnnotator_ImplBase { public static final String PARAM_SANITIZE = "sanitize"; @ConfigurationParameter(name=PARAM_SANITIZE, mandatory=true, defaultValue="true") private boolean sanitize; public static final String PARAM_SANTIZE_CHARS = "sanitizeChars"; @ConfigurationParameter(name = PARAM_SANTIZE_CHARS, mandatory = true, defaultValue = { "(", ")", "[", "]" }) private String[] sanitizeChars; private MappingProvider mappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); mappingProvider = new MappingProvider(); mappingProvider.setDefault(MappingProvider.VARIANT, "default"); mappingProvider.setDefaultVariantsLocation( "de/tudarmstadt/ukp/dkpro/core/languagetool/lib/language-tagset.map"); mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/api/lexmorph/tagset/${language}-${variant}.map"); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { mappingProvider.configure(aJCas.getCas()); try { Language lang = Languages.getLanguageForShortName(aJCas.getDocumentLanguage()); Language defaultVariant = lang.getDefaultLanguageVariant(); if (defaultVariant != null) { getLogger().info( "Using default variant [" + defaultVariant.getShortNameWithCountryAndVariant() + "] for language [" + aJCas.getDocumentLanguage() + "]"); lang = defaultVariant; } for (Sentence s : select(aJCas, Sentence.class)) { // Get the tokens from the sentence List<Token> tokens = selectCovered(Token.class, s); List<String> tokenText = toText(tokens); // Let LanguageTool analyze the tokens List<AnalyzedTokenReadings> rawTaggedTokens = lang.getTagger().tag(tokenText); AnalyzedSentence as = new AnalyzedSentence( rawTaggedTokens.toArray(new AnalyzedTokenReadings[rawTaggedTokens.size()])); as = lang.getDisambiguator().disambiguate(as); for (int i = 0; i < tokens.size(); i++) { Token token = tokens.get(i); String l = null; // Try using the POS to disambiguate the lemma if (token.getPos() != null) { l = getByPos(token.getPos(), as.getTokens()[i]); } // Get the most frequent lemma if (l == null) { l = getMostFrequentLemma(as.getTokens()[i]); } // Sanitize if we have a lemma by now if (sanitize && l != null) { l = sanitizeLemma(token.getCoveredText(), l); } if (l == null) { l = token.getCoveredText(); } // Create the annotation Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); lemma.setValue(l); lemma.addToIndexes(); token.setLemma(lemma); } } } catch (IOException e) { throw new AnalysisEngineProcessException(e); } } private String getByPos(POS aPos, AnalyzedTokenReadings aReadings) { String tag = aPos.getPosValue(); //System.out.printf("%s %n", tag); for (AnalyzedToken t : aReadings.getReadings()) { //System.out.printf("-- %s %s ", t.getPOSTag(), t.getLemma()); if (t.getPOSTag() == null) { return null; } // Lets see if we have mapped tagsets try { String typeName = mappingProvider.getTagType(t.getPOSTag()).getName(); if (aPos.getClass().getName().equals(typeName)) { //System.out.printf("- mapped match%n"); return t.getLemma(); } } catch (IllegalStateException e) { // Type could not be looked up. Go on with other types of matching } // Full match... feeling lucky ;) This is quite unlikely to happen because the tagset // used by LanguageTool is most certainly different from tagset used by POS tagger. if (tag.equals(t.getPOSTag())) { //System.out.printf("- full match%n"); return t.getLemma(); } // Some tagsets used by LanguageTool use ':' as separator. If we are lucky, the string // before the first ':' matches our POS tag. if (t.getPOSTag().length() > 1 && tag.equals(t.getPOSTag().split(":")[0])) { //System.out.printf("- first element match%n"); return t.getLemma(); } //System.out.printf("- no match%n"); } //System.out.printf("- no reading matches%n"); return null; } private String getMostFrequentLemma(AnalyzedTokenReadings aReadings) { FrequencyDistribution<String> freq = new FrequencyDistribution<String>(); for (AnalyzedToken t : aReadings.getReadings()) { if (t.getLemma() != null) { freq.inc(t.getLemma()); } } String best = null; for (String l : freq.getKeys()) { if (best == null) { best = l; } else if (freq.getCount(best) < freq.getCount(l)) { best = l; } } return best; } private String sanitizeLemma(String aWordForm, String aLemma) { String sanitized = aLemma; for (String c : sanitizeChars) { if (!aWordForm.contains(c)) { sanitized = sanitized.replace(c, ""); } } return sanitized; } }