/*
* Copyright 2010
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.stopwordremover;
import static de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils.resolveLocation;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.CasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.getView;
import static org.apache.uima.util.Level.FINE;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Logger;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathInfo;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
 * Remove all of the specified types from the CAS if their covered text is in the stop word
 * dictionary. Also remove any other of the specified types that is covered by a matching instance.
 * <p>
 * If the CAS has a view named "doc" and/or a view named "topic", those views are processed;
 * otherwise the CAS handed to {@link #process(JCas)} is processed directly.
 */
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.StopWord" })
public class StopWordRemover
    extends JCasAnnotator_ImplBase
{
    // VIEW NAMES
    private static final String TOPIC_VIEW = "topic";
    private static final String DOC_VIEW = "doc";

    // Key under which stop word lists without a language restriction are stored.
    private static final String ANY_LOCALE = "*";

    /**
     * A list of URLs from which to load the stop word lists. If a URL is prefixed with a language
     * code in square brackets, the stop word list is only used for documents in that language.
     * Using no prefix or the prefix "[*]" causes the list to be used for every document.
     * Example: "[de]classpath:/stopwords/de_articles.txt"
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true)
    private Set<String> swFileNames;

    /**
     * The character encoding used by the model.
     */
    public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String modelEncoding;

    /**
     * Feature paths for annotations that should be matched/removed. The default is
     *
     * <pre>
     * StopWord.class.getName()
     * Token.class.getName()
     * Lemma.class.getName()+"/value"
     * </pre>
     */
    public static final String PARAM_PATHS = "Paths";
    @ConfigurationParameter(name = PARAM_PATHS, mandatory = false)
    private Set<String> paths;

    /**
     * Anything annotated with this type will be removed even if it does not match any word in the
     * lists.
     */
    public static final String PARAM_STOP_WORD_TYPE = "StopWordType";
    @ConfigurationParameter(name = PARAM_STOP_WORD_TYPE, mandatory = false)
    private String stopWordType;

    // Stop word sets keyed by the language prefix of their location ("*" = any language).
    private Map<String, StopWordSet> stopWordSets;

    /**
     * Applies the parameter defaults and loads every configured stop word list into the set of
     * the language it was declared for.
     *
     * @param context
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if a location has an unterminated language prefix or a stop word list cannot
     *             be resolved or read.
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        // Set default paths. This cannot be done in the annotation because we cannot call
        // methods there.
        if (paths == null || paths.isEmpty()) {
            paths = new HashSet<String>();
            paths.add(StopWord.class.getName());
            paths.add(Token.class.getName());
            paths.add(Lemma.class.getName()+"/value");
        }

        // Set default stop word type. This cannot be done in the annotation because we cannot
        // call methods there.
        if (stopWordType == null) {
            stopWordType = StopWord.class.getName();
        }

        try {
            stopWordSets = new HashMap<String, StopWordSet>();
            for (String swFileName : swFileNames) {
                String fileLocale = ANY_LOCALE;
                String location = swFileName;

                // Check if a locale is defined for the file
                if (swFileName.startsWith("[")) {
                    int closing = swFileName.indexOf(']');
                    if (closing < 0) {
                        // Fail with a readable message instead of an index error from substring
                        throw new ResourceInitializationException(new IllegalArgumentException(
                                "Stop word list location [" + swFileName
                                        + "] starts with '[' but has no closing ']'"));
                    }
                    fileLocale = swFileName.substring(1, closing);
                    location = swFileName.substring(closing + 1);
                }

                // Fetch the set for the specified locale
                StopWordSet set = stopWordSets.get(fileLocale);
                if (set == null) {
                    set = new StopWordSet();
                    stopWordSets.put(fileLocale, set);
                }

                // Load the set
                URL source = resolveLocation(location, this, context);
                InputStream is = null;
                try {
                    is = source.openStream();
                    set.load(is, modelEncoding);
                }
                finally {
                    closeQuietly(is);
                }

                getLogger().info(
                        "Loaded stopwords for locale [" + fileLocale + "] from [" + source + "]");
            }
        }
        catch (IOException e1) {
            throw new ResourceInitializationException(e1);
        }
    }

    /**
     * Removes stop words from the "doc" and "topic" views if present; if neither view exists,
     * the given CAS itself is processed.
     *
     * @param jcas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if evaluating a feature path fails.
     */
    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        JCas doc = getView(jcas, DOC_VIEW, null);
        JCas topic = getView(jcas, TOPIC_VIEW, null);

        try {
            if (doc != null) {
                check(doc);
            }
            if (topic != null) {
                check(topic);
            }
            if (topic == null && doc == null) {
                check(jcas);
            }
        }
        catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Removes from the indexes of the given CAS every annotation selected by one of the
     * configured paths whose (lower-cased) feature path value is a stop word, together with all
     * candidate annotations located within the span of such an annotation.
     *
     * @param aJCas
     *            the CAS (view) to clean.
     * @throws FeaturePathException
     *             if a feature path cannot be initialized or evaluated.
     * @throws IllegalStateException
     *             if a configured type is not part of the type system.
     */
    private void check(JCas aJCas)
        throws FeaturePathException
    {
        Logger log = getContext().getLogger();

        Locale casLocale = new Locale(aJCas.getDocumentLanguage());
        StopWordSet anyLocaleSet = stopWordSets.get(ANY_LOCALE);
        StopWordSet casLocaleSet = stopWordSets.get(aJCas.getDocumentLanguage());

        // Now really do the removal part
        FeaturePathInfo fp = new FeaturePathInfo();
        for (String path : paths) {
            // Create a sorted list of annotations that we can quickly search on. It is rebuilt
            // for every path because the previous iteration may have removed annotations from
            // the indexes.
            AnnotationFS[] candidates = getCandidates(aJCas);

            // Initialize list of annotations to remove
            List<AnnotationFS> toRemove = new ArrayList<AnnotationFS>();

            // Separate type name and feature path
            String[] segments = path.split("/", 2);
            String typeName = segments[0];
            boolean isStopWordType = stopWordType.equals(typeName);
            Type t = resolveType(aJCas, typeName);

            // Initialize the FeaturePathInfo with the corresponding part
            if (segments.length > 1) {
                fp.initialize(segments[1]);
            }
            else {
                fp.initialize("");
            }

            int safeStart = 0;
            Iterator<Annotation> i = aJCas.getAnnotationIndex(t).iterator();
            while (i.hasNext()) {
                Annotation anno = i.next();

                // Move the start of the containment scanning range ahead if possible
                while ((safeStart + 1) < candidates.length
                        && candidates[safeStart + 1].getEnd() < anno.getBegin()) {
                    safeStart++;
                }

                // NOTE(review): the feature path may yield no value (e.g. an unset feature);
                // such an annotation cannot match a list entry and must not cause an NPE.
                String value = fp.getValue(anno);
                String candidate = (value == null) ? null : value.toLowerCase(casLocale);
                boolean inList = candidate != null
                        && (inSet(anyLocaleSet, candidate) || inSet(casLocaleSet, candidate));

                if (isStopWordType || inList) {
                    // Remove the annotation that matched the stop word
                    toRemove.add(anno);
                    if (log.isLoggable(FINE)) {
                        log.log(FINE, "Removing ["
                                + typeName.substring(typeName.lastIndexOf('.') + 1)
                                + "] annotated as stop word [" + anno.getCoveredText() + "]@"
                                + anno.getBegin() + ".." + anno.getEnd());
                    }

                    // Scan all potential annotations that may be covered by the current
                    // annotation and remove them as well
                    int n = safeStart;
                    while (n < candidates.length && candidates[n].getBegin() < anno.getEnd()) {
                        if ((anno.getBegin() <= candidates[n].getBegin())
                                && (candidates[n].getEnd() <= anno.getEnd())) {
                            if (log.isLoggable(FINE)) {
                                log.log(FINE, "Removing as well ["
                                        + candidates[n].getClass().getSimpleName()
                                        + "] annotated as stop word ["
                                        + candidates[n].getCoveredText() + "]@"
                                        + candidates[n].getBegin() + ".." + candidates[n].getEnd());
                            }
                            toRemove.add(candidates[n]);
                        }
                        n++;
                    }
                }
            }

            // Remove from the CAS
            for (AnnotationFS anno : toRemove) {
                aJCas.removeFsFromIndexes(anno);
            }
        }
    }

    /**
     * Tells whether the given set exists and contains the given word.
     */
    private static boolean inSet(StopWordSet aSet, String aWord)
    {
        return (aSet != null) && aSet.contains(aWord);
    }

    /**
     * Looks up a type by name in the type system of the given CAS.
     *
     * @throws IllegalStateException
     *             if the type system does not contain the type.
     */
    private static Type resolveType(JCas aJCas, String aTypeName)
    {
        Type t = aJCas.getTypeSystem().getType(aTypeName);
        if (t == null) {
            throw new IllegalStateException("Type [" + aTypeName + "] not found in type system");
        }
        return t;
    }

    /**
     * Collects the annotations of all types named by the configured paths, sorted by begin and
     * then by end offset.
     *
     * @param aJCas
     *            the CAS (view) to collect from.
     * @return the sorted candidate annotations.
     * @throws IllegalStateException
     *             if a configured type is not part of the type system.
     */
    private AnnotationFS[] getCandidates(JCas aJCas)
    {
        // Make a list of all the annotations that can be matched by the given paths. If any one
        // of the paths match, we want to remove instances of all others being covered by the
        // match as well.
        List<AnnotationFS> candidateList = new ArrayList<AnnotationFS>();
        for (String path : paths) {
            String[] segments = path.split("/", 2);
            Type t = resolveType(aJCas, segments[0]);
            for (AnnotationFS fs : select(aJCas.getCas(), t)) {
                candidateList.add(fs);
            }
        }
        AnnotationFS[] candidates = candidateList.toArray(new AnnotationFS[candidateList.size()]);
        Arrays.sort(candidates, new BeginEndComparator());
        return candidates;
    }

    /**
     * Orders annotations by ascending begin offset; annotations with the same begin offset are
     * ordered by ascending end offset.
     */
    static class BeginEndComparator implements Comparator<AnnotationFS>
    {
        @Override
        public int compare(AnnotationFS aO1, AnnotationFS aO2)
        {
            int begin1 = aO1.getBegin();
            int begin2 = aO2.getBegin();
            if (begin1 != begin2) {
                // Explicit comparison instead of subtraction so the result can never overflow
                return (begin1 < begin2) ? -1 : 1;
            }
            int end1 = aO1.getEnd();
            int end2 = aO2.getEnd();
            if (end1 == end2) {
                return 0;
            }
            return (end1 < end2) ? -1 : 1;
        }
    }
}