package de.tudarmstadt.ukp.dkpro.core.arktools;
/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.component.CasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.resource.ResourceInitializationException;
import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.impl.Model;
import cmu.arktweetnlp.impl.ModelSentence;
import cmu.arktweetnlp.impl.Sentence;
import cmu.arktweetnlp.impl.features.FeatureExtractor;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
* Wrapper for Twitter Tokenizer and POS Tagger.
*
* As described in: Olutobi Owoputi, Brendan O’Connor, Chris Dyer, Kevin Gimpel, Nathan Schneider
* and Noah A. Smith. Improved Part-of-Speech Tagging for Online Conversational Text with Word
* Clusters In Proceedings of NAACL 2013.
*
*
*/
public class ArktweetPosTagger
extends CasAnnotator_ImplBase
{
/**
* Use this language instead of the document language to resolve the model and tag set mapping.
*/
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
@ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
protected String language;
/**
* Variant of a model the model. Used to address a specific model if here are multiple models
* for one language.
*/
public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
@ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
protected String variant;
/**
* Location from which the model is read.
*/
public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
@ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
protected String modelLocation;
/**
* Location of the mapping file for part-of-speech tags to UIMA types.
*/
public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
@ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
protected String posMappingLocation;
private Type tokenType;
private Feature featPos;
private CasConfigurableProviderBase<TweetTagger> modelProvider;
private MappingProvider mappingProvider;
/**
* Loads a model from a file. The tagger should be ready to tag after calling this.
*/
public class TweetTagger
{
Model model;
FeatureExtractor featureExtractor;
public void loadModel(String modelFilename)
throws IOException
{
model = Model.loadModelFromText(modelFilename);
featureExtractor = new FeatureExtractor(model, false);
}
}
/**
* One token and its tag.
**/
public static class TaggedToken
{
public AnnotationFS token;
public String tag;
}
@Override
public void initialize(UimaContext context)
throws ResourceInitializationException
{
super.initialize(context);
modelProvider = new ModelProviderBase<TweetTagger>()
{
{
setContextObject(ArktweetPosTagger.this);
setDefault(ARTIFACT_ID, "${groupId}.arktools-model-tagger-${language}-${variant}");
setDefault(LOCATION,
"classpath:/${package}/lib/tagger-${language}-${variant}.properties");
setDefault(VARIANT, "default");
setOverride(LOCATION, modelLocation);
setOverride(LANGUAGE, language);
setOverride(VARIANT, variant);
}
@Override
protected TweetTagger produceResource(URL aUrl)
throws IOException
{
try {
TweetTagger model = new TweetTagger();
model.loadModel(ResourceUtils.getUrlAsFile(aUrl, false).getAbsolutePath());
return model;
}
catch (Exception e) {
throw new IOException(e);
}
}
};
mappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
language, modelProvider);
}
@Override
public void typeSystemInit(TypeSystem aTypeSystem)
throws AnalysisEngineProcessException
{
super.typeSystemInit(aTypeSystem);
tokenType = aTypeSystem.getType(Token.class.getName());
featPos = tokenType.getFeatureByBaseName("pos");
}
@Override
public void process(CAS cas)
throws AnalysisEngineProcessException
{
modelProvider.configure(cas);
mappingProvider.configure(cas);
List<AnnotationFS> tokens = CasUtil.selectCovered(cas, tokenType, 0, cas.getDocumentText()
.length());
List<TaggedToken> taggedTokens = tagTweetTokens(tokens, modelProvider.getResource());
for (TaggedToken taggedToken : taggedTokens) {
Type posType = mappingProvider.getTagType(taggedToken.tag);
AnnotationFS posAnno = cas.createAnnotation(posType, taggedToken.token.getBegin(),
taggedToken.token.getEnd());
posAnno.setStringValue(posType.getFeatureByBaseName("PosValue"), taggedToken.tag);
cas.addFsToIndexes(posAnno);
taggedToken.token.setFeatureValue(featPos, posAnno);
}
}
private List<TaggedToken> tagTweetTokens(List<AnnotationFS> annotatedTokens,
TweetTagger tweetTagModel)
{
List<String> tokens = new LinkedList<String>();
for (AnnotationFS a : annotatedTokens) {
String tokenText = a.getCoveredText();
tokenText = Twokenize.normalizeTextForTagger(tokenText);
tokens.add(tokenText);
}
Sentence sentence = new Sentence();
sentence.tokens = tokens;
ModelSentence ms = new ModelSentence(sentence.T());
tweetTagModel.featureExtractor.computeFeatures(sentence, ms);
tweetTagModel.model.greedyDecode(ms, false);
ArrayList<TaggedToken> taggedTokens = new ArrayList<TaggedToken>();
for (int t = 0; t < sentence.T(); t++) {
TaggedToken tt = new TaggedToken();
tt.token = annotatedTokens.get(t);
tt.tag = tweetTagModel.model.labelVocab.name(ms.labels[t]);
taggedTokens.add(tt);
}
return taggedTokens;
}
}