/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.treetagger;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import static org.apache.uima.util.Level.INFO;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import org.annolab.tt4j.DefaultModel;
import org.annolab.tt4j.TokenAdapter;
import org.annolab.tt4j.TokenHandler;
import org.annolab.tt4j.TreeTaggerException;
import org.annolab.tt4j.TreeTaggerModelUtil;
import org.annolab.tt4j.TreeTaggerWrapper;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk;
import de.tudarmstadt.ukp.dkpro.core.treetagger.internal.DKProExecutableResolver;
/**
 * Chunk annotator using TreeTagger.
 * <p>
 * For every {@link Sentence} the covered {@link POS} annotations are passed to a TT4J
 * {@link TreeTaggerWrapper}. The tags returned by TreeTagger are merged into {@link Chunk}
 * annotations which are added to the CAS indexes.
 */
@TypeCapability(
        inputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" },
        outputs = {
            "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk" })
public class TreeTaggerChunker
    extends JCasAnnotator_ImplBase
{
    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Override the default variant used to locate the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Use this TreeTagger executable instead of trying to locate the executable automatically.
     */
    public static final String PARAM_EXECUTABLE_PATH = "executablePath";
    @ConfigurationParameter(name = PARAM_EXECUTABLE_PATH, mandatory = false)
    private File executablePath;

    /**
     * Load the model from this location instead of locating the model automatically.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * Location of the mapping file for chunk tags to UIMA types.
     */
    public static final String PARAM_CHUNK_MAPPING_LOCATION = ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false)
    protected String chunkMappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
    protected boolean printTagSet;

    /**
     * TT4J setting: Disable some sanity checks, e.g. whether tokens contain line breaks (which is
     * not allowed). Turning this on will increase your performance, but the wrapper may throw
     * exceptions if illegal data is provided.
     */
    public static final String PARAM_PERFORMANCE_MODE = "performanceMode";
    @ConfigurationParameter(name = PARAM_PERFORMANCE_MODE, mandatory = true, defaultValue = "false")
    private boolean performanceMode;

    /**
     * A sequence to flush the internal TreeTagger buffer and to force it to output the rest of the
     * completed analysis. This is typically just a sequence of like 5-10 full stops (".") separated
     * by new line characters. However, some models may require a different flush sequence, e.g. a
     * short sentence in the respective language. For chunker models, mind that the sentence must
     * also be POS tagged, e.g. {@code Nous-PRO:PER\n...}.
     */
    public static final String PARAM_FLUSH_SEQUENCE = "flushSequence";
    @ConfigurationParameter(name = PARAM_FLUSH_SEQUENCE, mandatory = false)
    private String flushSequence;

    /** Provides the TreeTagger wrapper configured with the model matching the current CAS. */
    private CasConfigurableProviderBase<TreeTaggerWrapper<POS>> modelProvider;

    /** Maps chunk tags produced by the model to UIMA annotation types. */
    private MappingProvider mappingProvider;

    /**
     * Sets up the model provider (which owns the single {@link TreeTaggerWrapper} instance used
     * by this component) and the chunk tag mapping provider.
     *
     * @param aContext
     *            the UIMA context carrying the configuration parameters.
     * @throws ResourceInitializationException
     *             if the super class fails to initialize.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        modelProvider = new ModelProviderBase<TreeTaggerWrapper<POS>>() {
            // One wrapper is created up-front and re-configured with a new model every time
            // produceResource() is called.
            private TreeTaggerWrapper<POS> treetagger;

            {
                setContextObject(TreeTaggerChunker.this);

                setDefault(ARTIFACT_ID, "${groupId}.treetagger-model-chunker-${language}-${variant}");
                setDefault(LOCATION, "classpath:/${package}/lib/chunker-${language}-${variant}.properties");
                //setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/treetagger/lib/chunker-default-variants.map");
                setDefault(VARIANT, "le"); // le = little-endian

                setOverride(LOCATION, modelLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);

                treetagger = new TreeTaggerWrapper<POS>();
                treetagger.setPerformanceMode(performanceMode);
                treetagger.setEpsilon(0.00000001);
                treetagger.setHyphenHeuristics(true);
                DKProExecutableResolver executableProvider = new DKProExecutableResolver(treetagger);
                executableProvider.setExecutablePath(executablePath);
                treetagger.setExecutableProvider(executableProvider);
            }

            /**
             * Loads the model at the given URL into the shared wrapper and records the chunk
             * tagset of the model.
             */
            @Override
            protected TreeTaggerWrapper<POS> produceResource(URL aUrl)
                throws IOException
            {
                Properties meta = getResourceMetaData();
                String encoding = meta.getProperty("encoding");
                String tagset = meta.getProperty("chunk.tagset");

                // The flush sequence from the model meta data is only a default; an explicitly
                // configured PARAM_FLUSH_SEQUENCE takes precedence.
                String flush = meta.getProperty("flushSequence",
                        DefaultModel.DEFAULT_FLUSH_SEQUENCE);
                if (flushSequence != null) {
                    flush = flushSequence;
                }

                File modelFile = ResourceUtils.getUrlAsFile(aUrl, true);
                DefaultModel model = new DefaultModel(modelFile.getPath() + ":" + encoding,
                        modelFile, encoding, flush);

                // Reconfigure tagger
                treetagger.setModel(model);
                treetagger.setAdapter(new MappingTokenAdapter(meta));

                // Get tagset
                List<String> tags = TreeTaggerModelUtil.getTagset(modelFile, encoding);
                SingletonTagset chunkTags = new SingletonTagset(Chunk.class, tagset);
                for (String tag : tags) {
                    // Tags are expected to look like "<x>/<flag>-<chunk>" or "<x>/<chunk>";
                    // only the chunk part is recorded. A tag without "/" is not handled here.
                    String fields1[] = tag.split("/");
                    String fields2[] = fields1[1].split("-");
                    String chunkTag = fields2.length == 2 ? fields2[1] : fields2[0];
                    chunkTags.add(chunkTag);
                }
                addTagset(chunkTags);

                if (printTagSet) {
                    getContext().getLogger().log(INFO, getTagset().toString());
                }

                return treetagger;
            }
        };

        mappingProvider = MappingProviderFactory.createChunkMappingProvider(chunkMappingLocation,
                language, modelProvider);
    }

    /**
     * Runs TreeTagger over the {@link POS} annotations of each {@link Sentence} in the CAS and
     * creates a {@link Chunk} annotation for every run of tokens belonging to the same chunk.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if TreeTagger fails or an I/O error occurs while talking to it.
     */
    @Override
    public void process(final JCas aJCas)
        throws AnalysisEngineProcessException
    {
        final CAS cas = aJCas.getCas();

        modelProvider.configure(cas);
        mappingProvider.configure(cas);

        // Set the handler creating new UIMA annotations from the analyzed tokens
        final TokenHandler<POS> handler = new TokenHandler<POS>()
        {
            // Tag of the chunk currently being assembled, or null if no chunk is open
            private String openChunk;
            // Character offsets covered by the open chunk so far
            private int start;
            private int end;

            /**
             * Receives one analyzed token. A {@code null} chunk tag is used by the enclosing
             * method as a signal to commit the chunk that is still open.
             */
            @Override
            public void token(POS aPOS, String aChunk, String aDummy)
            {
                // NOTE(review): presumably guards against the handler being invoked from a
                // TT4J worker thread while the CAS is in use elsewhere - confirm.
                synchronized (cas) {
                    if (aChunk == null) {
                        // End of processing signal
                        chunkComplete();
                        return;
                    }

                    // Expected form: "<x>/<flag>-<chunk>" or "<x>/<chunk>"
                    String fields1[] = aChunk.split("/");
                    String fields2[] = fields1[1].split("-");
                    //String tag = fields1[0];
                    String flag = fields2.length == 2 ? fields2[0] : "NONE";
                    String chunk = fields2.length == 2 ? fields2[1] : fields2[0];

                    // Start of a new chunk
                    if (!chunk.equals(openChunk) || "B".equals(flag)) {
                        if (openChunk != null) {
                            // End of previous chunk
                            chunkComplete();
                        }
                        openChunk = chunk;
                        start = aPOS.getBegin();
                    }

                    // Record how much of the chunk we have seen so far
                    end = aPOS.getEnd();
                }
            }

            /**
             * Creates the annotation for the open chunk (if any), adds it to the indexes and
             * resets the open chunk.
             */
            private void chunkComplete()
            {
                if (openChunk != null) {
                    Type chunkType = mappingProvider.getTagType(openChunk);
                    Chunk chunk = (Chunk) cas.createAnnotation(chunkType, start, end);
                    chunk.setChunkValue(internTags ? openChunk.intern() : openChunk);
                    cas.addFsToIndexes(chunk);
                    openChunk = null;
                }
            }
        };

        try {
            TreeTaggerWrapper<POS> treetagger = modelProvider.getResource();
            treetagger.setHandler(handler);

            // Issue #636 - process each sentence individually to ensure that sentence boundaries
            // are respected
            for (Sentence sentence : select(aJCas, Sentence.class)) {
                List<POS> posTags = new ArrayList<POS>(selectCovered(POS.class, sentence));
                treetagger.process(posTags);

                // Commit the final chunk
                handler.token(null, null, null);
            }
        }
        catch (TreeTaggerException e) {
            throw new AnalysisEngineProcessException(e);
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Releases the TreeTagger wrapper when the component is destroyed. Without this, the wrapper
     * (and whatever external resources it holds) is only cleaned up if and when it is garbage
     * collected.
     */
    @Override
    public void destroy()
    {
        try {
            // modelProvider is null if initialize() failed early; the resource is null if no
            // CAS was ever processed.
            if (modelProvider != null) {
                TreeTaggerWrapper<POS> treetagger = modelProvider.getResource();
                if (treetagger != null) {
                    treetagger.destroy();
                }
            }
        }
        finally {
            super.destroy();
        }
    }

    /**
     * Converts a {@link POS} annotation into the text sent to TreeTagger, i.e. the covered text
     * and the POS value joined by a dash. POS values can be replaced using the
     * {@code pos.tag.map.*} entries of the model meta data.
     */
    private static class MappingTokenAdapter implements TokenAdapter<POS>
    {
        // Maps POS values found in the CAS to the values to send to TreeTagger
        private Map<String, String> mapping;

        /**
         * Collects all {@code pos.tag.map.<old>=<replacement>} entries from the given meta data.
         *
         * @param aMetadata
         *            the model meta data.
         */
        public MappingTokenAdapter(Properties aMetadata)
        {
            mapping = new HashMap<String, String>();

            for (Entry<Object, Object> e : aMetadata.entrySet()) {
                String key = String.valueOf(e.getKey());
                if (key.startsWith("pos.tag.map.")) {
                    String old = key.substring("pos.tag.map.".length());
                    String rep = String.valueOf(e.getValue());
                    mapping.put(old, rep);
                }
            }
        }

        /**
         * Returns {@code <covered text>-<pos>}, where the POS value is replaced by its mapped
         * value if a mapping exists.
         */
        @Override
        public String getText(POS aPos)
        {
            synchronized (aPos.getCAS()) {
                String pos = mapping.get(aPos.getPosValue());
                if (pos == null) {
                    // No replacement configured - use the POS value as is
                    pos = aPos.getPosValue();
                }

                return aPos.getCoveredText() + "-" + pos;
            }
        }
    }
}