/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.treetagger;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.util.Level.INFO;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;
import org.annolab.tt4j.TokenAdapter;
import org.annolab.tt4j.TokenHandler;
import org.annolab.tt4j.TreeTaggerException;
import org.annolab.tt4j.TreeTaggerModelUtil;
import org.annolab.tt4j.TreeTaggerWrapper;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.treetagger.internal.DKProExecutableResolver;
/**
 * Part-of-Speech and lemmatizer annotator using TreeTagger.
 * <p>
 * For every {@link Token} found in the CAS, the tag and lemma reported by TreeTagger are turned
 * into {@link POS} and {@link Lemma} annotations. These are attached to the token and added to
 * the indexes once the whole token list has been processed.
 */
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" },
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma" })
public class TreeTaggerPosTagger
    extends JCasAnnotator_ImplBase
{
    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Override the default variant used to locate the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Use this TreeTagger executable instead of trying to locate the executable automatically.
     */
    public static final String PARAM_EXECUTABLE_PATH = "executablePath";
    @ConfigurationParameter(name = PARAM_EXECUTABLE_PATH, mandatory = false)
    private File executablePath;

    /**
     * Load the model from this location instead of locating the model automatically.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * The character encoding used by the model. If not set, the {@code encoding} property of
     * the model meta data is used.
     */
    public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = false)
    protected String modelEncoding;

    /**
     * Load the part-of-speech tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     * Note that this component applies the same setting to lemma values.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue="false")
    protected boolean printTagSet;

    /**
     * TT4J setting: Disable some sanity checks, e.g. whether tokens contain line breaks (which is
     * not allowed). Turning this on will increase your performance, but the wrapper may throw
     * exceptions if illegal data is provided.
     */
    public static final String PARAM_PERFORMANCE_MODE = "performanceMode";
    @ConfigurationParameter(name = PARAM_PERFORMANCE_MODE, mandatory = true, defaultValue = "false")
    private boolean performanceMode;

    /**
     * Write part-of-speech information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS;
    @ConfigurationParameter(name=PARAM_WRITE_POS, mandatory=true, defaultValue="true")
    private boolean writePos;

    /**
     * Write lemma information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA;
    @ConfigurationParameter(name=PARAM_WRITE_LEMMA, mandatory=true, defaultValue="true")
    private boolean writeLemma;

    private CasConfigurableProviderBase<TreeTaggerWrapper<Token>> modelProvider;
    private MappingProvider posMappingProvider;

    /**
     * The single TT4J wrapper used by this component. It is handed out (re-configured) by the
     * model provider and kept here so that it can be released in {@link #destroy()}.
     */
    private TreeTaggerWrapper<Token> treetagger;

    /**
     * Creates the TT4J wrapper, the model provider that re-configures the wrapper whenever a
     * model is resolved, and the provider for the part-of-speech tag mapping.
     *
     * @param aContext
     *            the UIMA context of this component.
     * @throws ResourceInitializationException
     *             if the component cannot be initialized.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        final TreeTaggerWrapper<Token> wrapper = new TreeTaggerWrapper<Token>();
        wrapper.setPerformanceMode(performanceMode);
        DKProExecutableResolver executableProvider = new DKProExecutableResolver(wrapper);
        executableProvider.setExecutablePath(executablePath);
        wrapper.setExecutableProvider(executableProvider);
        wrapper.setAdapter(new TokenAdapter<Token>()
        {
            @Override
            public String getText(Token aObject)
            {
                // NOTE(review): the CAS is locked here and in the token handler, presumably
                // because TT4J invokes these callbacks from a thread of its own - confirm.
                synchronized (aObject.getCAS()) {
                    return aObject.getCoveredText();
                }
            }
        });
        treetagger = wrapper;

        modelProvider = new ModelProviderBase<TreeTaggerWrapper<Token>>()
        {
            {
                setContextObject(TreeTaggerPosTagger.this);

                setDefault(ARTIFACT_ID, "${groupId}.treetagger-model-tagger-${language}-${variant}");
                setDefault(LOCATION, "classpath:/${package}/lib/tagger-${language}-${variant}.properties");
                //setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/treetagger/lib/tagger-default-variants.map");
                setDefault(VARIANT, "le"); // le = little-endian

                setOverride(LOCATION, modelLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);
            }

            /**
             * Points the shared wrapper at the model behind the given URL and registers the
             * tag set read from that model.
             */
            @Override
            protected TreeTaggerWrapper<Token> produceResource(URL aUrl)
                throws IOException
            {
                Properties meta = getResourceMetaData();

                // An explicitly configured encoding wins over the one from the meta data
                String encoding = modelEncoding != null ? modelEncoding
                        : meta.getProperty("encoding");
                if (encoding == null) {
                    // Fail early with a clear message instead of building the model
                    // specification "<path>:null" and passing a null encoding on.
                    throw new IOException("No encoding available for TreeTagger model [" + aUrl
                            + "]. Set the parameter [" + PARAM_MODEL_ENCODING
                            + "] or provide an [encoding] property in the model meta data.");
                }
                String tagset = meta.getProperty("pos.tagset");

                File modelFile = ResourceUtils.getUrlAsFile(aUrl, true);

                // Reconfigure tagger
                wrapper.setModel(modelFile.getPath() + ":" + encoding);

                // Get tagset
                List<String> tags = TreeTaggerModelUtil.getTagset(modelFile, encoding);
                SingletonTagset posTags = new SingletonTagset(POS.class, tagset);
                posTags.addAll(tags);
                addTagset(posTags);

                if (printTagSet) {
                    getContext().getLogger().log(INFO, getTagset().toString());
                }

                return wrapper;
            }
        };

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                language, modelProvider);
    }

    /**
     * Tags all tokens of the given CAS. Depending on {@link #PARAM_WRITE_POS} and
     * {@link #PARAM_WRITE_LEMMA}, a {@link POS} and/or a {@link Lemma} annotation is created
     * for each token for which TreeTagger reports a value. The annotations are set on the
     * token immediately and added to the indexes after the token list has been processed.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if TreeTagger fails or an I/O error occurs while tagging.
     */
    @Override
    public void process(final JCas aJCas)
        throws AnalysisEngineProcessException
    {
        final CAS cas = aJCas.getCas();

        modelProvider.configure(cas);
        posMappingProvider.configure(cas);

        TreeTaggerWrapper<Token> tagger = modelProvider.getResource();

        try {
            List<Token> tokens = new ArrayList<Token>(select(aJCas, Token.class));
            final POS[] pos = new POS[tokens.size()];
            final Lemma[] lemma = new Lemma[tokens.size()];

            // Set the handler creating new UIMA annotations from the analyzed
            // tokens
            final AtomicInteger count = new AtomicInteger(0);
            tagger.setHandler(new TokenHandler<Token>()
            {
                @Override
                public void token(Token aToken, String aPos, String aLemma)
                {
                    synchronized (cas) {
                        // Add the Part of Speech
                        if (writePos && aPos != null) {
                            Type posTag = posMappingProvider.getTagType(aPos);
                            POS posAnno = (POS) cas.createAnnotation(posTag, aToken.getBegin(),
                                    aToken.getEnd());
                            posAnno.setPosValue(internTags ? aPos.intern() : aPos);
                            // No coarse value if the tag was mapped to the generic POS type
                            posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null
                                    : posAnno.getType().getShortName().intern());
                            aToken.setPos(posAnno);
                            pos[count.get()] = posAnno;
                        }

                        // Add the lemma
                        if (writeLemma && aLemma != null) {
                            Lemma lemmaAnno = new Lemma(aJCas, aToken.getBegin(), aToken.getEnd());
                            lemmaAnno.setValue(internTags ? aLemma.intern() : aLemma);
                            aToken.setLemma(lemmaAnno);
                            lemma[count.get()] = lemmaAnno;
                        }

                        count.getAndIncrement();
                    }
                }
            });

            tagger.process(tokens);

            // Add the annotations to the indexes
            int processed = count.get();
            for (int i = 0; i < processed; i++) {
                if (pos[i] != null) {
                    pos[i].addToIndexes();
                }
                if (lemma[i] != null) {
                    lemma[i].addToIndexes();
                }
            }
        }
        catch (TreeTaggerException e) {
            throw new AnalysisEngineProcessException(e);
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
        finally {
            // The wrapper outlives this call. Drop the handler so that it does not keep the
            // CAS and the annotation arrays of this document reachable.
            tagger.setHandler(null);
        }
    }

    /**
     * Releases the resources held by the TT4J wrapper (see {@code TreeTaggerWrapper.destroy()})
     * when the component is destroyed instead of leaving this to garbage collection.
     */
    @Override
    public void destroy()
    {
        try {
            if (treetagger != null) {
                treetagger.destroy();
            }
        }
        finally {
            super.destroy();
        }
    }
}