/******************************************************************************* * Copyright 2011 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package org.dkpro.lab.ml.example; import static java.util.Arrays.asList; import static java.util.Collections.singletonList; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.List; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.TOP; import org.apache.uima.resource.ResourceInitializationException; import org.cleartk.classifier.CleartkSequenceAnnotator; import org.cleartk.classifier.Instance; import org.cleartk.classifier.feature.extractor.ContextExtractor; import org.cleartk.classifier.feature.extractor.ContextExtractor.Following; import org.cleartk.classifier.feature.extractor.ContextExtractor.Preceding; import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor; import org.cleartk.classifier.feature.extractor.simple.SpannedTextExtractor; import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor; import org.cleartk.classifier.feature.proliferate.CapitalTypeProliferator; import org.cleartk.classifier.feature.proliferate.CharacterNGramProliferator; import org.cleartk.classifier.feature.proliferate.LowerCaseProliferator; import org.cleartk.classifier.feature.proliferate.NumericTypeProliferator; import org.cleartk.classifier.feature.proliferate.ProliferatingExtractor; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; public class ExamplePosAnnotator extends CleartkSequenceAnnotator<String> { private List<SimpleFeatureExtractor> tokenFeatureExtractors; private List<ContextExtractor<Token>> contextFeatureExtractors; @Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // alias for NGram feature parameters int fromRight = CharacterNGramProliferator.RIGHT_TO_LEFT; // a list of feature extractors that require only the token: the stem of the word, the text // of the word itself, plus features created from the word text like character ngrams this.tokenFeatureExtractors = asList( new TypePathExtractor(Token.class, "stem/value"), new ProliferatingExtractor( new SpannedTextExtractor(), new LowerCaseProliferator(), new CapitalTypeProliferator(), new NumericTypeProliferator(), new CharacterNGramProliferator(fromRight, 0, 2), new CharacterNGramProliferator(fromRight, 0, 3))); // a list of feature extractors that require the token and the sentence this.contextFeatureExtractors = singletonList(new ContextExtractor<Token>(Token.class, new TypePathExtractor(Token.class, "stem"), new Preceding(2), new Following(2))); } @Override public void process(JCas jCas) throws AnalysisEngineProcessException { Collection<TOP> addToIndexes = new ArrayList<TOP>(); // generate a list of training instances for each sentence in the document for (Sentence sentence : select(jCas, Sentence.class)) { List<Instance<String>> instances = new ArrayList<Instance<String>>(); List<Token> tokens = selectCovered(jCas, Token.class, sentence); // for each token, extract all feature values and the label for (Token token : tokens) { Instance<String> instance = new Instance<String>(); // extract all features that require only the token annotation for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) { instance.addAll(extractor.extract(jCas, token)); } // extract all features that require the token and sentence annotations for (ContextExtractor<Token> extractor : this.contextFeatureExtractors) { instance.addAll(extractor.extractWithin(jCas, token, sentence)); } // set the instance label from the token's part of speech if (this.isTraining()) { instance.setOutcome(token.getPos().getPosValue()); } // add the instance to the list instances.add(instance); } if (this.isTraining()) { // for training, write instances to the data write this.dataWriter.write(instances); } else { // for classification, set the labels as the token POS labels Iterator<Token> tokensIter = tokens.iterator(); List<String> labels = classify(instances); for (String label : labels) { Token t = tokensIter.next(); POS pos = t.getPos(); if (pos == null) { pos = new POS(jCas, t.getBegin(), t.getEnd()); addToIndexes.add(pos); t.setPos(pos); } pos.setPosValue(label); } } for (TOP fs : addToIndexes) { fs.addToIndexes(); } } } }