/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.core.lancaster;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.LanguageCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathAnnotatorBase;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * This Paice/Husk Lancaster stemmer implementation only works with the English language so far.
 * <p>
 * For every annotation selected by the configured feature paths (by default {@link Token}), a
 * {@link Stem} annotation with the same offsets is created. If the annotated type has a
 * {@code stem} feature whose range accepts a {@link Stem}, that feature is set as well.
 */
@LanguageCapability("en")
@TypeCapability(
        inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" },
        outputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem" })
public class LancasterStemmer
    extends FeaturePathAnnotatorBase
{
    /** Name of the message bundle used for the exceptions raised during processing. */
    private static final String MESSAGE_DIGEST = LancasterStemmer.class.getName() + "_Messages";

    /**
     * True if the stemmer will strip prefix such as kilo, micro, milli, intra, ultra, mega, nano,
     * pico, pseudo.
     */
    public static final String PARAM_STRIP_PREFIXES = "stripPrefix";
    @ConfigurationParameter(name = PARAM_STRIP_PREFIXES, mandatory = true, defaultValue = "false")
    private boolean stripPrefix;

    /**
     * Specifies an URL that should resolve to a location from where to load custom rules. If the
     * location starts with {@code classpath:} the location is interpreted as a classpath location,
     * e.g. "classpath:my/path/to/the/rules". Otherwise it is tried as an URL, file and at last UIMA
     * resource.
     *
     * @see ResourceUtils
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    private String modelLocation;

    /**
     * Specifies the language supported by the stemming model. Default value is "en" (English).
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = true, defaultValue = "en")
    protected String language;

    /**
     * The stemmer only has to be initialized once since it's used like a pure function with the
     * given configuration parameters.
     */
    private smile.nlp.stemmer.LancasterStemmer stemmer;

    /**
     * Returns the feature paths used when none are configured explicitly.
     *
     * @return a singleton set containing the fully qualified name of {@link Token}.
     */
    @Override
    protected Set<String> getDefaultPaths()
    {
        return Collections.singleton(Token.class.getName());
    }

    /**
     * Normalizes the configured language and creates the stemmer, either from the custom rules at
     * {@link #PARAM_MODEL_LOCATION} or, if no location is configured, from the stemmer's built-in
     * rules.
     *
     * @param aContext
     *            the UIMA context; also used to resolve the rule location.
     * @throws ResourceInitializationException
     *             if the rule location cannot be resolved or the rules cannot be opened.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        // Use the same fixed locale as for the document language in generateAnnotations() so that
        // the comparison of the two does not depend on the platform default locale.
        language = language.toLowerCase(Locale.US);

        if (modelLocation == null) {
            stemmer = new smile.nlp.stemmer.LancasterStemmer(stripPrefix);
            return;
        }

        try {
            URL url = ResourceUtils.resolveLocation(modelLocation, this, aContext);
            // Close the rule stream once the stemmer has been constructed.
            // NOTE(review): assumes the smile constructor consumes the rules before returning and
            // does not keep reading from the stream afterwards - confirm against the smile API.
            try (InputStream rules = url.openStream()) {
                stemmer = new smile.nlp.stemmer.LancasterStemmer(rules, stripPrefix);
            }
        }
        catch (MalformedURLException e) {
            throw new ResourceInitializationException(e);
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Creates stem annotations for all annotations selected by the configured feature paths.
     *
     * @param jcas
     *            the document to process; its document language must be set and must equal the
     *            configured {@link #PARAM_LANGUAGE}.
     * @throws FeaturePathException
     *             if a feature path cannot be initialized.
     * @throws AnalysisEngineProcessException
     *             if the document language is blank or differs from the configured language.
     * @throws IllegalStateException
     *             if the type lookup for a feature path yields {@code null}.
     */
    @Override
    protected void generateAnnotations(JCas jcas)
        throws FeaturePathException, AnalysisEngineProcessException
    {
        // CAS is necessary to retrieve values
        CAS currCAS = jcas.getCas();

        // Try language set in CAS.
        String lang = jcas.getDocumentLanguage();
        if (StringUtils.isBlank(lang)) {
            throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "no_language_error", null);
        }

        lang = lang.toLowerCase(Locale.US);
        if (!language.equals(lang)) { // Only specified language is supported
            throw new AnalysisEngineProcessException(MESSAGE_DIGEST, "unsupported_language_error",
                    new Object[] { lang });
        }

        for (String path : paths) {
            // Separate type name and feature path
            String[] segments = path.split("/", 2);
            String typeName = segments[0];

            // Try to get the type from the type system of the CAS
            Type t = CasUtil.getType(currCAS, typeName);
            if (t == null) {
                throw new IllegalStateException("Type [" + typeName + "] not found in type system");
            }

            // Initialize the feature path info with the corresponding part of the path
            initializeFeaturePathInfoFrom(fp, segments);

            // Walk over all annotations of the selected type
            AnnotationIndex<?> idx = currCAS.getAnnotationIndex(t);
            FSIterator<?> iterator = idx.iterator();
            while (iterator.hasNext()) {
                AnnotationFS fs = (AnnotationFS) iterator.next();
                // Without a filter every annotation is stemmed; with a filter only those
                // matching the filter condition.
                if (this.filterFeaturePath == null
                        || this.filterFeaturePathInfo.match(fs, this.filterCondition)) {
                    createStemAnnotation(jcas, stemmer, fs);
                }
            }
        }
    }

    /**
     * Adds a {@link Stem} annotation covering the given annotation, unless the value selected by
     * the feature path is blank.
     *
     * @param jcas
     *            the document the stem annotation is added to.
     * @param stemmer
     *            the stemmer used to compute the stem value.
     * @param fs
     *            the annotation to stem; provides the offsets of the new stem annotation.
     * @throws AnalysisEngineProcessException
     *             declared for callers; not thrown directly by this method.
     */
    private void createStemAnnotation(JCas jcas, smile.nlp.stemmer.LancasterStemmer stemmer,
            AnnotationFS fs)
        throws AnalysisEngineProcessException
    {
        // Check for blank text, it makes no sense to add a stem then (and raised an exception)
        String value = fp.getValue(fs);
        if (StringUtils.isBlank(value)) {
            return;
        }

        Stem stemAnnot = new Stem(jcas, fs.getBegin(), fs.getEnd());
        stemAnnot.setValue(stemmer.stem(value));
        stemAnnot.addToIndexes(jcas);

        // Try setting the "stem" feature on Tokens. Only done when the annotated type has such a
        // feature and its range can hold a Stem annotation.
        Feature feat = fs.getType().getFeatureByBaseName("stem");
        if (feat != null && feat.getRange() != null
                && jcas.getTypeSystem().subsumes(feat.getRange(), stemAnnot.getType())) {
            fs.setFeatureValue(feat, stemAnnot);
        }
    }
}