/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package de.tudarmstadt.ukp.dkpro.core.decompounding.uima.annotator;

import static org.apache.uima.fit.util.JCasUtil.select;

import java.util.ArrayList;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.DummyRanker;
import de.tudarmstadt.ukp.dkpro.core.decompounding.ranking.Ranker;
import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.DecompoundedWord;
import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.Fragment;
import de.tudarmstadt.ukp.dkpro.core.decompounding.splitter.SplitterAlgorithm;

/**
 * Annotates compound parts and linking morphemes.
 *
 * <p>
 * Every {@link Token} in the CAS is handed to the configured splitter; the ranker then picks one
 * of the proposed decompositions. If that decomposition is a compound, a {@link Compound}
 * annotation spanning the token is created and its fragments are attached to it as nested
 * {@link CompoundPart} / {@link LinkingMorpheme} annotations.
 * </p>
 */
@TypeCapability(
        inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" },
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Compound",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Split",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.CompoundPart",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.LinkingMorpheme" })
public class CompoundAnnotator extends JCasAnnotator_ImplBase {

    /**
     * This component allows the user to create different strategies for decompounding words,
     * combining different splitting algorithms with different ranking algorithms. This external
     * resource wraps the splitter algorithm which shall be used by the annotator.
     */
    public static final String PARAM_SPLITTING_ALGO = "splittingAlgorithm";
    @ExternalResource(key = PARAM_SPLITTING_ALGO)
    private SplitterAlgorithm splitter;

    /**
     * This external resource wraps the ranking algorithm which shall be used by the annotator.
     * It is optional; when it is not bound, a {@link DummyRanker} is used instead.
     */
    public static final String PARAM_RANKING_ALGO = "rankingAlgorithm";
    @ExternalResource(key = PARAM_RANKING_ALGO, mandatory = false)
    private Ranker ranker;

    /**
     * Runs the standard uimaFIT initialization and installs the fallback ranker if none was
     * supplied as an external resource.
     *
     * @param context
     *            the UIMA context passed on to the superclass.
     * @throws ResourceInitializationException
     *             if the superclass initialization fails.
     */
    @Override
    public void initialize(final UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        if (ranker == null) {
            // The ranking resource is declared non-mandatory, so it may be absent here.
            ranker = new DummyRanker();
        }
    }

    /**
     * Decompounds every token of the CAS and annotates those that turn out to be compounds.
     *
     * @param aJCas
     *            the CAS whose tokens are examined and to which the annotations are added.
     * @throws AnalysisEngineProcessException
     *             declared by the overridden method; not thrown explicitly by this code.
     */
    @Override
    public void process(final JCas aJCas) throws AnalysisEngineProcessException {
        for (final Token token : select(aJCas, Token.class)) {
            final DecompoundedWord best = ranker
                    .highestRank(splitter.split(token.getCoveredText()));
            if (best.isCompound()) {
                annotateCompound(aJCas, token, best);
            }
        }
    }

    /**
     * Creates the {@link Compound} annotation for one token, attaches the split hierarchy to it
     * and finally adds the compound itself to the indexes.
     *
     * @param aJCas
     *            the CAS receiving the annotations.
     * @param token
     *            the token that was recognized as a compound; supplies the offsets.
     * @param decompounded
     *            the selected decomposition of the token's text.
     */
    private void annotateCompound(final JCas aJCas, final Token token,
            final DecompoundedWord decompounded) {
        final int tokenBegin = token.getBegin();
        final int tokenEnd = token.getEnd();

        final Compound compound = new Compound(aJCas, tokenBegin, tokenEnd);
        // A null parent tells indexSplits to hang the top-level children on the compound.
        indexSplits(aJCas, decompounded.getSplits(), tokenBegin, tokenEnd, null, compound);
        compound.addToIndexes();
    }

    /**
     * Recursively turns a list of fragments into nested split annotations.
     *
     * <p>
     * On each level the first fragment becomes a {@link CompoundPart}, optionally followed by a
     * {@link LinkingMorpheme}, and everything up to the end of the token becomes one further
     * {@link CompoundPart} representing the remainder. These are stored as the children of
     * {@code parentSplit} (or of {@code compound} when there is no parent), and the method then
     * recurses into the remainder with the first fragment dropped. Recursion stops once only a
     * single fragment is left.
     * </p>
     *
     * @param aJCas
     *            the CAS receiving the annotations.
     * @param splits
     *            the fragments still to be processed; the first one is consumed on this level.
     * @param beginIndex
     *            document offset at which the first fragment of {@code splits} starts.
     * @param tokenEndIndex
     *            document offset at which the token ends.
     * @param parentSplit
     *            the split that receives the children created here, or {@code null} on the top
     *            level.
     * @param compound
     *            the compound annotation; receives the children when {@code parentSplit} is
     *            {@code null}.
     */
    private void indexSplits(final JCas aJCas, final List<Fragment> splits, final int beginIndex,
            final int tokenEndIndex, final Split parentSplit, final Compound compound) {
        // With one fragment left there is nothing further to separate it from.
        if (splits.size() == 1) {
            return;
        }

        final Fragment head = splits.get(0);
        final List<Split> children = new ArrayList<Split>();

        // Leading part: covers exactly the word of the first fragment.
        final int headEnd = beginIndex + head.getWord().length();
        final Split headPart = new CompoundPart(aJCas, beginIndex, headEnd);
        headPart.addToIndexes();
        children.add(headPart);

        // Optional linking morpheme directly behind the leading part.
        int restBegin = headEnd;
        if (head.hasMorpheme()) {
            final int morphemeEnd = headEnd + head.getMorpheme().length();
            final Split linker = new LinkingMorpheme(aJCas, headEnd, morphemeEnd);
            linker.addToIndexes();
            children.add(linker);
            restBegin = morphemeEnd;
        }

        // Remainder: everything from here up to the end of the token.
        final Split rest = new CompoundPart(aJCas, restBegin, tokenEndIndex);
        children.add(rest);

        final FSArray childArray = FSCollectionFactory.createFSArray(aJCas, children);
        if (parentSplit != null) {
            parentSplit.setSplits(childArray);
        }
        else {
            compound.setSplits(childArray);
        }

        // The remainder is added to the indexes only after the recursive call has had the
        // chance to attach its own children to it.
        indexSplits(aJCas, splits.subList(1, splits.size()), restBegin, tokenEndIndex, rest,
                compound);
        rest.addToIndexes();
    }
}