/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.posfilter; import java.util.ArrayList; import java.util.List; import java.util.Map.Entry; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Feature; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.util.Level; import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathException; import de.tudarmstadt.ukp.dkpro.core.api.featurepath.FeaturePathFactory; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADJ; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADP; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.ADV; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.AUX; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.CONJ; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.DET; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.INTJ; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NOUN; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.NUM; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PART; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PRON; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PROPN; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.PUNCT; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.SCONJ; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.SYM; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.VERB; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Stem; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * Removes all tokens/lemmas/stems/POS tags (depending on the "Mode" setting) that do not match the * given parts of speech. * */ @TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" }) public class PosFilter extends JCasAnnotator_ImplBase { /** * The fully qualified name of the type that should be filtered. */ public static final String PARAM_TYPE_TO_REMOVE = "typeToRemove"; @ConfigurationParameter(name = PARAM_TYPE_TO_REMOVE, mandatory = true) private String typeToRemove; /** * Keep/remove adjectives (true: keep, false: remove) */ public static final String PARAM_ADJ = "adj"; @ConfigurationParameter(name = PARAM_ADJ, mandatory = true, defaultValue = "false") private boolean adj; /** * Keep/remove adpositions (true: keep, false: remove) */ public static final String PARAM_ADP = "adp"; @ConfigurationParameter(name = PARAM_ADP, mandatory = true, defaultValue = "false") private boolean adp; /** * Keep/remove adverbs (true: keep, false: remove) */ public static final String PARAM_ADV = "adv"; @ConfigurationParameter(name = PARAM_ADV, mandatory = true, defaultValue = "false") private boolean adv; /** * Keep/remove auxiliary verbs (true: keep, false: remove) */ public static final String PARAM_AUX = "aux"; @ConfigurationParameter(name = PARAM_AUX, mandatory = true, defaultValue = "false") private boolean aux; /** * Keep/remove conjunctions (true: keep, false: remove) */ public static final String PARAM_CONJ = "conj"; @ConfigurationParameter(name = PARAM_CONJ, mandatory = true, defaultValue = "false") private boolean conj; /** * Keep/remove articles (true: keep, false: remove) */ public static final String PARAM_DET = "det"; @ConfigurationParameter(name = PARAM_DET, mandatory = true, defaultValue = "false") private boolean det; /** * Keep/remove interjections (true: keep, false: remove) */ public static final String PARAM_INTJ = "intj"; @ConfigurationParameter(name = PARAM_INTJ, mandatory = true, defaultValue = "false") private boolean intj; /** * Keep/remove nouns (true: keep, false: remove) */ public static final String PARAM_NOUN = "noun"; @ConfigurationParameter(name = PARAM_NOUN, mandatory = true, defaultValue = "false") private boolean noun; /** * Keep/remove numerals (true: keep, false: remove) */ public static final String PARAM_NUM = "num"; @ConfigurationParameter(name = PARAM_NUM, mandatory = true, defaultValue = "false") private boolean num; /** * Keep/remove particles (true: keep, false: remove) */ public static final String PARAM_PART = "part"; @ConfigurationParameter(name = PARAM_PART, mandatory = true, defaultValue = "false") private boolean part; /** * Keep/remove pronnouns (true: keep, false: remove) */ public static final String PARAM_PRON = "pron"; @ConfigurationParameter(name = PARAM_PRON, mandatory = true, defaultValue = "false") private boolean pron; /** * Keep/remove proper nouns (true: keep, false: remove) */ public static final String PARAM_PROPN = "propn"; @ConfigurationParameter(name = PARAM_PROPN, mandatory = true, defaultValue = "false") private boolean propn; /** * Keep/remove punctuation (true: keep, false: remove) */ public static final String PARAM_PUNCT = "punct"; @ConfigurationParameter(name = PARAM_PUNCT, mandatory = true, defaultValue = "false") private boolean punct; /** * Keep/remove conjunctions (true: keep, false: remove) */ public static final String PARAM_SCONJ = "sconj"; @ConfigurationParameter(name = PARAM_SCONJ, mandatory = true, defaultValue = "false") private boolean sconj; /** * Keep/remove symbols (true: keep, false: remove) */ public static final String PARAM_SYM = "sym"; @ConfigurationParameter(name = PARAM_SYM, mandatory = true, defaultValue = "false") private boolean sym; /** * Keep/remove verbs (true: keep, false: remove) */ public static final String PARAM_VERB = "verb"; @ConfigurationParameter(name = PARAM_VERB, mandatory = true, defaultValue = "false") private boolean verb; /** * Keep/remove other (true: keep, false: remove) */ public static final String PARAM_X = "x"; @ConfigurationParameter(name = PARAM_X, mandatory = true, defaultValue = "false") private boolean x; @Override public void process(JCas jcas) throws AnalysisEngineProcessException { getContext().getLogger().log(Level.CONFIG, "Entering " + this.getClass().getSimpleName()); Type tokenType = jcas.getCas().getTypeSystem().getType(Token.class.getCanonicalName()); Type stemType = jcas.getCas().getTypeSystem().getType(Stem.class.getCanonicalName()); Type lemmaType = jcas.getCas().getTypeSystem().getType(Lemma.class.getCanonicalName()); Type posType = jcas.getCas().getTypeSystem().getType(POS.class.getCanonicalName()); Type typeToRemoveType = jcas.getCas().getTypeSystem().getType(typeToRemove); if (typeToRemoveType == null) { throw new AnalysisEngineProcessException(new Throwable( "Could not get type for feature path: " + typeToRemove)); } List<AnnotationFS> toRemove = new ArrayList<AnnotationFS>(); try { for (Entry<AnnotationFS, String> entry : FeaturePathFactory.select(jcas.getCas(), typeToRemove)) { AnnotationFS annotation = entry.getKey(); AnnotationFS pos; if (typeToRemoveType.equals(posType)) { pos = annotation; } else { pos = getAnnotation(posType, annotation); if (pos == null) { continue; } } String posString = pos.getType().getShortName(); if (posString.equals(ADJ.class.getSimpleName()) && !adj) { toRemove.add(annotation); continue; } if (posString.equals(ADP.class.getSimpleName()) && !adp) { toRemove.add(annotation); continue; } if (posString.equals(ADV.class.getSimpleName()) && !adv) { toRemove.add(annotation); continue; } if (posString.equals(AUX.class.getSimpleName()) && !aux) { toRemove.add(annotation); continue; } if (posString.equals(CONJ.class.getSimpleName()) && !conj) { toRemove.add(annotation); continue; } if (posString.equals(DET.class.getSimpleName()) && !det) { toRemove.add(annotation); continue; } if (posString.equals(INTJ.class.getSimpleName()) && !intj) { toRemove.add(annotation); continue; } if (posString.equals(NOUN.class.getSimpleName()) && !noun) { toRemove.add(annotation); continue; } if (posString.equals(NUM.class.getSimpleName()) && !num) { toRemove.add(annotation); continue; } if (posString.equals(PART.class.getSimpleName()) && !part) { toRemove.add(annotation); continue; } if (posString.equals(PRON.class.getSimpleName()) && !pron) { toRemove.add(annotation); continue; } if (posString.equals(PROPN.class.getSimpleName()) && !propn) { toRemove.add(annotation); continue; } if (posString.equals(PUNCT.class.getSimpleName()) && !punct) { toRemove.add(annotation); continue; } if (posString.equals(SCONJ.class.getSimpleName()) && !sconj) { toRemove.add(annotation); continue; } if (posString.equals(SYM.class.getSimpleName()) && !sym) { toRemove.add(annotation); continue; } if (posString.equals(VERB.class.getSimpleName()) && !verb) { toRemove.add(annotation); continue; } } } catch (FeaturePathException e) { throw new AnalysisEngineProcessException(e); } for (AnnotationFS fs : toRemove) { // If we want to remove tokens, we also remove accompanying lemma, stem, POS tag. if (fs.getType().equals(tokenType)) { AnnotationFS stemFS = getAnnotation(stemType, fs); if (stemFS != null) { jcas.getCas().removeFsFromIndexes(stemFS); } AnnotationFS lemmaFS = getAnnotation(lemmaType, fs); if (lemmaFS != null) { jcas.getCas().removeFsFromIndexes(lemmaFS); } AnnotationFS posFS = getAnnotation(posType, fs); if (posFS != null) { jcas.getCas().removeFsFromIndexes(posFS); } } // We don't want to keep the feature in the token, remove it here. else { if (fs.getType().equals(stemType) || fs.getType().equals(lemmaType)) { Token token = (Token) getAnnotation(tokenType, fs); if (token != null) { String fbn = fs.getType().getShortName().toLowerCase(); Feature f = tokenType.getFeatureByBaseName(fbn); token.setFeatureValue(f, null); } } else if (fs instanceof POS) { Token token = (Token) getAnnotation(tokenType, fs); if (token != null) { token.setPos(null); } } } jcas.getCas().removeFsFromIndexes(fs); } } /** * Returns the (one) annotation of a given type that is aligned with another annotation. * * @param type * The annotation type to be looked up. * @param annotation * An annotation. * @return The annotation aligned with another annotation. */ private AnnotationFS getAnnotation(Type type, AnnotationFS annotation) { List<AnnotationFS> annotations = CasUtil.selectCovered(annotation.getCAS(), type, annotation); if (annotations.size() != 1) { getLogger().debug( "Could not find matching annotation of type " + type + " for annotation: " + annotation.getCoveredText()); return null; } return annotations.get(0); } }