/**
 * Copyright 2007-2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.core.stanfordnlp;

import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;

import java.util.ArrayList;
import java.util.List;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.LanguageCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.Messages;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.internal.TokenKey;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.IndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentenceIndexAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.MorphaAnnotator;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBEscapingProcessor;
import edu.stanford.nlp.util.CoreMap;

/**
 * Stanford Lemmatizer component. The Stanford Morphology class computes the base form of English
 * words by removing inflections only (not derivational morphology). That is, it handles noun
 * plurals, pronoun case, and verb endings, but not things like comparative adjectives or derived
 * nominals. It is based on a finite-state transducer implemented by John Carroll et al., written
 * in flex and publicly available. See:
 * http://www.informatics.susx.ac.uk/research/nlp/carroll/morph.html
 *
 * <p>This component only works for English.</p>
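 *
 * <p>
 * A minimal usage sketch (illustrative only): the segmenter and POS tagger engines named below
 * are assumptions about the surrounding pipeline and are not provided by this class; any
 * components that produce {@code Token}, {@code Sentence}, and {@code POS} annotations will do.
 * </p>
 *
 * <pre>{@code
 * // Hypothetical pipeline setup (uimaFIT factories, DKPro Core Stanford components assumed)
 * AnalysisEngineDescription seg = AnalysisEngineFactory
 *         .createEngineDescription(StanfordSegmenter.class);
 * AnalysisEngineDescription pos = AnalysisEngineFactory
 *         .createEngineDescription(StanfordPosTagger.class);
 * AnalysisEngineDescription lemma = AnalysisEngineFactory.createEngineDescription(
 *         StanfordLemmatizer.class,
 *         StanfordLemmatizer.PARAM_PTB3_ESCAPING, true);
 *
 * JCas jcas = JCasFactory.createJCas();
 * jcas.setDocumentText("The dogs were barking loudly.");
 * jcas.setDocumentLanguage("en");
 * SimplePipeline.runPipeline(jcas, seg, pos, lemma);
 *
 * for (Lemma l : JCasUtil.select(jcas, Lemma.class)) {
 *     System.out.printf("%s -> %s%n", l.getCoveredText(), l.getValue());
 * }
 * }</pre>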
 */
@LanguageCapability("en")
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" },
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" })
public class StanfordLemmatizer
    extends JCasAnnotator_ImplBase
{
    /**
     * Enable all traditional PTB3 token transforms (like -LRB-, -RRB-).
     *
     * @see PTBEscapingProcessor
     */
    public static final String PARAM_PTB3_ESCAPING = "ptb3Escaping";
    @ConfigurationParameter(name = PARAM_PTB3_ESCAPING, mandatory = true, defaultValue = "true")
    private boolean ptb3Escaping;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * opening quotes and escaped accordingly before being sent to the lemmatizer.
     */
    public static final String PARAM_QUOTE_BEGIN = "quoteBegin";
    @ConfigurationParameter(name = PARAM_QUOTE_BEGIN, mandatory = false)
    private List<String> quoteBegin;

    /**
     * List of extra token texts (usually single character strings) that should be treated like
     * closing quotes and escaped accordingly before being sent to the lemmatizer.
     */
    public static final String PARAM_QUOTE_END = "quoteEnd";
    @ConfigurationParameter(name = PARAM_QUOTE_END, mandatory = false)
    private List<String> quoteEnd;

    private MorphaAnnotator annotator;

    private CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory();

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        annotator = new MorphaAnnotator(false);
    }

    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        if (!"en".equals(aJCas.getDocumentLanguage())) {
            throw new AnalysisEngineProcessException(Messages.BUNDLE,
                    Messages.ERR_UNSUPPORTED_LANGUAGE,
                    new String[] { aJCas.getDocumentLanguage() });
        }

        Annotation document = new Annotation(aJCas.getDocumentText());

        List<CoreMap> sentences = new ArrayList<>();
        for (Sentence s : select(aJCas, Sentence.class)) {
            Annotation sentence = new Annotation(s.getCoveredText());
            sentence.set(CharacterOffsetBeginAnnotation.class, s.getBegin());
            sentence.set(CharacterOffsetEndAnnotation.class, s.getEnd());
            sentence.set(SentenceIndexAnnotation.class, sentences.size());

            List<CoreLabel> tokens = new ArrayList<>();
            for (Token t : selectCovered(Token.class, s)) {
                CoreLabel token = tokenFactory.makeToken(t.getCoveredText(), t.getBegin(),
                        t.getEnd() - t.getBegin());
                // Add the token first so that tokens.size() yields the 1-based index required by
                // IndexAnnotation
                tokens.add(token);
                token.set(SentenceIndexAnnotation.class, sentences.size());
                token.set(IndexAnnotation.class, tokens.size());
                token.set(TokenKey.class, t);

                POS pos = t.getPos();
                if (pos == null) {
                    throw new AnalysisEngineProcessException(new IllegalStateException(
                            "No POS tag available for token:\n" + t));
                }
                else {
                    token.set(PartOfSpeechAnnotation.class, pos.getPosValue());
                }
            }

            if (ptb3Escaping) {
                tokens = CoreNlpUtils.applyPtbEscaping(tokens, quoteBegin, quoteEnd);
            }

            sentence.set(TokensAnnotation.class, tokens);
            sentences.add(sentence);
        }
        document.set(SentencesAnnotation.class, sentences);
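
        // Run the Stanford MorphaAnnotator over the assembled document. It reads the POS tags
        // set above and attaches a LemmaAnnotation to each CoreLabel.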
        annotator.annotate(document);

        // Copy the computed lemmas back into the CAS, creating one Lemma annotation per token.
        for (CoreMap s : document.get(SentencesAnnotation.class)) {
            for (CoreLabel t : s.get(TokensAnnotation.class)) {
                Token token = t.get(TokenKey.class);
                String lemma = t.get(LemmaAnnotation.class);

                Lemma anno = new Lemma(aJCas, token.getBegin(), token.getEnd());
                anno.setValue(lemma);
                anno.addToIndexes();
                token.setLemma(anno);
            }
        }
    }
}