/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.tokit; import java.util.ArrayList; import java.util.Collection; import java.util.Map.Entry; import java.util.Set; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory; /** * Removes annotations that do not conform to minimum or maximum length constraints. * * (This was previously called TokenFilter). * */ public class AnnotationByLengthFilter extends JCasAnnotator_ImplBase { /** * A set of annotation types that should be filtered. */ public static final String PARAM_FILTER_ANNOTATION_TYPES = "FilterTypes"; @ConfigurationParameter(name=PARAM_FILTER_ANNOTATION_TYPES, mandatory=true, defaultValue={}) private Set<String> filterTypes; /** * Any annotation in filterTypes shorter than this value will be removed. */ public static final String PARAM_MIN_LENGTH = "MinLengthFilter"; @ConfigurationParameter(name=PARAM_MIN_LENGTH, mandatory=true, defaultValue="0") private int minTokenLength; /** * Any annotation in filterAnnotations shorter than this value will be removed. */ public static final String PARAM_MAX_LENGTH = "MaxLengthFilter"; @ConfigurationParameter(name=PARAM_MAX_LENGTH, mandatory=true, defaultValue="1000") private int maxTokenLength; @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { for (String filterType : filterTypes) { try { Collection<Annotation> toRemove = new ArrayList<Annotation>(); for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(aJCas.getCas(), filterType)) { int length = entry.getKey().getCoveredText().length(); if (length < minTokenLength || length > maxTokenLength) { toRemove.add((Annotation)entry.getKey()); } } for (Annotation anno : toRemove) { anno.removeFromIndexes(); } } catch (FeaturePathException e) { throw new AnalysisEngineProcessException(e); } } } }