/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.snowball; import static org.apache.commons.lang.StringUtils.isBlank; import java.util.Collections; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.reflect.MethodUtils; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.Feature; import org.apache.uima.cas.FeaturePath; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.LanguageCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.tartarus.snowball.SnowballProgram; import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathAnnotatorBase; import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * <p>UIMA wrapper for the Snowball stemmer. Annotation types to be stemmed can beconfigured by a * {@link FeaturePath}.</p> * <p>If you use this component in a pipeline which uses stop word removal, make sure that it * runs after the stop word removal step, so only words that are no stop words are stemmed.</p> * * @see <a href="http://snowball.tartarus.org/">Snowball stemmer homepage</a> * @see FeaturePathAnnotatorBase * @since 1.1.0 */ @LanguageCapability({ "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt", "ro", "ru", "es", "sv", "tr" }) @TypeCapability( outputs={ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem"}) public class SnowballStemmer extends FeaturePathAnnotatorBase { private static final String MESSAGE_DIGEST = SnowballStemmer.class.getName()+"_Messages"; private static final String SNOWBALL_PACKAGE = "org.tartarus.snowball.ext."; /** * Use this language instead of the document language to resolve the model. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Per default the stemmer runs in case-sensitive mode. If this parameter is enabled, tokens * are lower-cased before being passed to the stemmer. * * <table border="1" cellspacing="0"> * <caption>Examples</caption> * <tr><th></th><th>false (default)</th><th>true</th></tr> * <tr><td>EDUCATIONAL</td><td>EDUCATIONAL</td><td>educ</td></tr> * <tr><td>Educational</td><td>Educat</td><td>educ</td></tr> * <tr><td>educational</td><td>educ</td><td>educ</td></tr> * </table> */ public static final String PARAM_LOWER_CASE = "lowerCase"; @ConfigurationParameter(name = PARAM_LOWER_CASE, mandatory = false, defaultValue="false") protected boolean lowerCase; public static final Map<String, String> languages = new HashMap<String, String>(); static { languages.put("da", "Danish"); languages.put("nl", "Dutch"); languages.put("en", "English"); languages.put("fi", "Finnish"); languages.put("fr", "French"); languages.put("de", "German"); languages.put("hu", "Hungarian"); languages.put("it", "Italian"); languages.put("no", "Norwegian"); languages.put("pt", "Portuguese"); languages.put("ro", "Romanian"); languages.put("ru", "Russian"); languages.put("es", "Spanish"); languages.put("sv", "Swedish"); languages.put("tr", "Turkish"); } private SnowballProgram snowballProgram; private String snowballProgramLanguage; @Override protected Set<String> getDefaultPaths() { return Collections.singleton(Token.class.getName()); } @Override protected void generateAnnotations(JCas jcas) throws AnalysisEngineProcessException, FeaturePathException { // CAS is necessary to retrieve values CAS currCAS = jcas.getCas(); for (String path : paths) { // Separate Typename and featurepath String[] segments = path.split("/", 2); String typeName = segments[0]; // Try to get the type from the typesystem of the CAS Type t = currCAS.getTypeSystem().getType(typeName); if (t == null) { throw new IllegalStateException("Type [" + typeName + "] not found in type system"); } // get an fpi object and initialize it // initialize the FeaturePathInfo with the corresponding part initializeFeaturePathInfoFrom(fp, segments); // get the annotations AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t); FSIterator<?> iterator = idx.iterator(); while (iterator.hasNext()) { AnnotationFS fs = (AnnotationFS) iterator.next(); try { if (this.filterFeaturePath != null) { // check annotation filter condition if (this.filterFeaturePathInfo.match(fs, this.filterCondition)) { createStemAnnotation(jcas, fs); } } else { // no annotation filter specified createStemAnnotation(jcas, fs); } } catch (AnalysisEngineProcessException e) { // TODO Auto-generated catch block throw new IllegalStateException( "error occured while creating a stem annotation", e); } } } } private SnowballProgram getSnowballProgram(JCas aCas) throws AnalysisEngineProcessException { // Try language set on analysis engine String lang = language; if (isBlank(lang)) { lang = aCas.getDocumentLanguage(); } // Try language set in CAS. if (isBlank(lang)) { throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null); } lang = lang.toLowerCase(Locale.US); if (!lang.equals(snowballProgramLanguage)) { try { String langPart = languages.get(lang); if (langPart == null) { throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "unsupported_language_error", new Object[] { lang }); } String snowballStemmerClass = SNOWBALL_PACKAGE + languages.get(lang) + "Stemmer"; @SuppressWarnings("unchecked") Class<SnowballProgram> stemClass = (Class<SnowballProgram>) Class .forName(snowballStemmerClass); snowballProgram = stemClass.newInstance(); snowballProgramLanguage = lang; } catch (Exception e) { throw new AnalysisEngineProcessException(e); } } return snowballProgram; } /** * Creates a Stem annotation with same begin and end as the AnnotationFS fs, the value is the * stemmed value derived by applying the featurepath. * * @param jcas * the JCas * @param fs * the AnnotationFS where the Stem annotation is created * @throws AnalysisEngineProcessException * if the {@code stem} method from the snowball stemmer cannot be invoked. */ private void createStemAnnotation(JCas jcas, AnnotationFS fs) throws AnalysisEngineProcessException { // Check for blank text, it makes no sense to add a stem then (and raised an exception) String value = fp.getValue(fs); if (!StringUtils.isBlank(value)) { if (lowerCase) { // Fixme - should use locale/language defined in CAS. value = value.toLowerCase(Locale.US); } Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd()); SnowballProgram programm = getSnowballProgram(jcas); programm.setCurrent(value); try { // The patched snowball from Lucene has this as a method on SnowballProgram // but if we have some other snowball also in the classpath, Java might // choose to use the other. So to be safe, we use a reflection here. // -- REC, 2011-04-17 MethodUtils.invokeMethod(programm, "stem", null); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } stemAnnot.setValue(programm.getCurrent()); stemAnnot.addToIndexes(jcas); // Try setting the "stem" feature on Tokens. Feature feat = fs.getType().getFeatureByBaseName("stem"); if (feat != null && feat.getRange() != null && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) { fs.setFeatureValue(feat, stemAnnot); } } } }