/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.tokit; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.List; import org.apache.commons.jxpath.JXPathContext; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.component.JCasAnnotator_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; /** * Merges any Tokens that are covered by a given annotation type. E.g. this component can be used * to create a single tokens from all tokens that constitute a multi-token named entity. * */ @TypeCapability( inputs={ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}, outputs={ "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma"}) public class TokenMerger extends JCasAnnotator_ImplBase { public static enum LemmaMode { JOIN, REMOVE, LEAVE } /** * Annotation type for which tokens should be merged. */ public static final String PARAM_ANNOTATION_TYPE = "annotationType"; @ConfigurationParameter(name=PARAM_ANNOTATION_TYPE, mandatory=true) private String annotationType; /** * A constraint on the annotations that should be considered in form of a JXPath statement. * Example: set {@link #PARAM_ANNOTATION_TYPE} to a {@code NamedEntity} type and set the * {@link #PARAM_CONSTRAINT} to {@code ".[value = 'LOCATION']"} to merge only tokens that are * part of a location named entity. */ public static final String PARAM_CONSTRAINT = "constraint"; @ConfigurationParameter(name=PARAM_CONSTRAINT, mandatory=false) private String constraint; /** * Configure what should happen to the lemma of the merged tokens. It is possible to JOIN * the lemmata to a single lemma (space separated), to REMOVE the lemma or LEAVE the lemma * of the first token as-is. */ public static final String PARAM_LEMMA_MODE = "lemmaMode"; @ConfigurationParameter(name=PARAM_LEMMA_MODE, mandatory=true, defaultValue="JOIN") private LemmaMode lemmaMode; /** * Set a new POS value for the new merged token. This is the actual tag set value and is subject * to tagset mapping. For example when merging tokens for named entities, the new POS value * may be set to "NNP" (English/Penn Treebank Tagset). */ public static final String PARAM_POS_VALUE = "posValue"; @ConfigurationParameter(name=PARAM_POS_VALUE, mandatory=false) private String posValue; /** * Set a new coarse POS value for the new merged token. This is the actual tag set value and is * subject to tagset mapping. For example when merging tokens for named entities, the new POS * value may be set to "NNP" (English/Penn Treebank Tagset). */ public static final String PARAM_CPOS_VALUE = "cposValue"; @ConfigurationParameter(name=PARAM_CPOS_VALUE, mandatory=false) private String cposValue; /** * Set a new POS tag for the new merged token. This is the mapped type. If this is specified, * tag set mapping will not be performed. This parameter has no effect unless PARAM_POS_VALUE * is also set. */ public static final String PARAM_POS_TYPE = "posType"; @ConfigurationParameter(name=PARAM_POS_TYPE, mandatory=false) private String posType; /** * Use this language instead of the document language to resolve the model and tag set mapping. */ public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE; @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false) protected String language; /** * Override the tagset mapping. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; private MappingProvider mappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); mappingProvider = new MappingProvider(); mappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/" + "core/api/lexmorph/tagset/${language}-${pos.tagset}-pos.map"); mappingProvider.setDefault(MappingProvider.BASE_TYPE, POS.class.getName()); mappingProvider.setDefault("pos.tagset", "default"); mappingProvider.setOverride(MappingProvider.LOCATION, posMappingLocation); mappingProvider.setOverride(MappingProvider.LANGUAGE, language); } @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { CAS cas = aJCas.getCas(); if (posValue != null) { mappingProvider.configure(cas); } List<AnnotationFS> covers = new ArrayList<>(CasUtil.select(cas, CasUtil.getAnnotationType(cas, annotationType))); Collection<Annotation> toRemove = new ArrayList<Annotation>(); for (AnnotationFS cover : covers) { List<Token> covered = selectCovered(Token.class, cover); if (covered.size() < 2) { continue; } if (constraint != null) { JXPathContext ctx = JXPathContext.newContext(cover); boolean match = ctx.iterate(constraint).hasNext(); if (!match) { continue; } } Iterator<Token> i = covered.iterator(); // Extend first token Token token = i.next(); token.removeFromIndexes(); token.setEnd(covered.get(covered.size() - 1).getEnd()); token.addToIndexes(); // Optionally update the POS value if (posValue != null) { updatePos(token, toRemove); } // Record lemma - may be needed for join later List<String> lemmata = new ArrayList<String>(); if (token.getLemma() != null) { lemmata.add(token.getLemma().getValue()); } // Mark the rest for deletion - record lemmata if desired for later join while (i.hasNext()) { Token t = i.next(); Lemma lemma = t.getLemma(); if (lemma != null) { lemmata.add(lemma.getValue()); toRemove.add(lemma); } POS pos = t.getPos(); if (pos != null) { toRemove.add(pos); } toRemove.add(t); } // Join lemmata if desired if (lemmaMode == LemmaMode.JOIN) { Lemma lemma = token.getLemma(); if (!lemmata.isEmpty()) { if (lemma == null) { lemma = new Lemma(aJCas); } lemma.setValue(StringUtils.join(lemmata, " ")); } // Remove if there was nothing to join... I don't really ever expect to get here else if (lemma != null) { token.setLemma(null); toRemove.add(lemma); } } // Remove the lemma - if desired else if (lemmaMode == LemmaMode.REMOVE) { Lemma lemma = token.getLemma(); if (lemma != null) { token.setLemma(null); toRemove.add(lemma); } } // Update offsets for lemma if (token.getLemma() != null) { Lemma lemma = token.getLemma(); lemma.removeFromIndexes(); lemma.setBegin(token.getBegin()); lemma.setEnd(token.getEnd()); lemma.addToIndexes(); } } // Remove tokens no longer needed for (Annotation t : toRemove) { t.removeFromIndexes(); } } private void updatePos(Token aToken, Collection<Annotation> aToRemove) { // Determine the mapped type Type type; if (posType != null) { type = CasUtil.getType(aToken.getCAS(), posType); } else { type = mappingProvider.getTagType(posValue); } POS pos = aToken.getPos(); if (pos != null && !pos.getType().equals(type)) { // Remove wrong existing POS annotation aToRemove.add(pos); pos = null; } if (pos == null) { // Create correct annotation pos = (POS) aToken.getCAS().createAnnotation(type, aToken.getBegin(), aToken.getEnd()); pos.addToIndexes(); } else { // Update offsets - no need to add to indexes, was in CAS already pos.setBegin(aToken.getBegin()); pos.setEnd(aToken.getEnd()); } // Update the POS value pos.setPosValue(posValue); pos.setCoarseValue(cposValue); aToken.setPos(pos); } }