/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.textnormalizer.annotations;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException;
import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
/**
 * Reads a list of words from a text file (one entry per line) and retains only those annotations
 * of the configured type whose text matches one of these words. Every other annotation of that
 * type is removed from the CAS indexes.
 */
public class AnnotationByTextFilter
    extends JCasAnnotator_ImplBase
{
    /**
     * Location of the word list file. Each line is trimmed and treated as one entry; blank lines
     * are ignored.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = true)
    private File modelLocation;

    /** Entries of the word list, already normalized via {@link #normalize(String)}. */
    private Set<String> words;

    /**
     * If true, annotation texts are matched case-independently, i.e. both the word list entries
     * and the annotation texts are lower-cased before they are compared. Default: true.
     */
    public static final String PARAM_IGNORE_CASE = "ignoreCase";
    @ConfigurationParameter(name = PARAM_IGNORE_CASE, mandatory = true, defaultValue = "true")
    private boolean ignoreCase;

    /**
     * Character encoding used to read the word list file. Default: UTF-8.
     */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String modelEncoding;

    /**
     * Annotation type to filter. Default:
     * {@link de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token}.
     */
    public static final String PARAM_TYPE_NAME = "typeName";
    @ConfigurationParameter(name = PARAM_TYPE_NAME, mandatory = true, defaultValue = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token")
    private String typeName;

    /**
     * Initializes the component and loads the word list.
     *
     * @param context
     *            the UIMA context.
     * @throws ResourceInitializationException
     *             if the word list file cannot be read.
     */
    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);
        try {
            readWords();
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Loads the word list from {@link #modelLocation} into {@link #words}.
     *
     * @throws IOException
     *             if the file cannot be read.
     */
    private void readWords()
        throws IOException
    {
        Set<String> loaded = new HashSet<>();
        for (String line : FileUtils.readLines(modelLocation, modelEncoding)) {
            String word = normalize(line.trim());
            // A blank line is not a word; keeping "" in the set would accidentally retain every
            // annotation whose text is empty.
            if (!word.isEmpty()) {
                loaded.add(word);
            }
        }
        words = loaded;
    }

    /**
     * Brings a text into the form used for comparison against the word list.
     *
     * @param text
     *            the text to normalize; must not be null.
     * @return the lower-cased text if {@link #ignoreCase} is set, otherwise the text unchanged.
     */
    private String normalize(String text)
    {
        // Locale.ROOT keeps the case mapping independent of the JVM default locale (under a
        // Turkish locale, for instance, "I" would otherwise be lower-cased to a dotless "ı").
        return ignoreCase ? text.toLowerCase(Locale.ROOT) : text;
    }

    /**
     * Removes from the indexes every annotation of the configured type whose text is not
     * contained in the word list.
     *
     * @param aJCas
     *            the CAS to filter.
     * @throws AnalysisEngineProcessException
     *             if the configured type name cannot be resolved by the feature path machinery.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        // Removal is deferred until after the iteration so that the indexes are not modified
        // while they are being traversed.
        Set<AnnotationFS> toRemove = new HashSet<>();
        try {
            for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(aJCas.getCas(),
                    typeName)) {
                String value = entry.getValue();
                // A missing value can never match a word. Checking it explicitly avoids a
                // NullPointerException in the case-insensitive branch and makes both branches
                // behave the same.
                if (value == null || !words.contains(normalize(value))) {
                    toRemove.add(entry.getKey());
                }
            }
        }
        catch (FeaturePathException e) {
            throw new AnalysisEngineProcessException(e);
        }
        for (AnnotationFS annotation : toRemove) {
            aJCas.removeFsFromIndexes(annotation);
        }
    }
}