/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.hunpos;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.ProcessBuilder.Redirect;
import java.net.URL;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
 * Part-of-Speech annotator using HunPos. Requires {@link Sentence}s and {@link Token}s to be
 * annotated before.
 *
 * <p>For every processed CAS the external {@code hunpos-tag} executable is started with the
 * resolved model, the tokens of each sentence are written to its standard input (one token per
 * line, followed by an empty line) and one {@code token<TAB>tag} line per token is read back from
 * its standard output.</p>
 *
 * <p><b>References</b></p>
 * <ul>
 * <li>HALÁCSY, Péter; KORNAI, András; ORAVECZ, Csaba. HunPos: an open source trigram tagger. In:
 * Proceedings of the 45th annual meeting of the ACL on interactive poster and demonstration
 * sessions. Association for Computational Linguistics, 2007. S. 209-212.
 * <a href="http://aclweb.org/anthology/P/P07/P07-2053.pdf">(pdf)</a>
 * <a href="http://aclweb.org/anthology/P/P07/P07-2053.bib">(bibtex)</a></li>
 * </ul>
 */
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" }, outputs = { "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS" })
public class HunPosTagger
    extends JCasAnnotator_ImplBase
{
    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /**
     * Override the default variant used to locate the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    protected String variant;

    /**
     * Load the model from this location instead of locating the model automatically.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    protected String modelLocation;

    /**
     * Load the part-of-speech tag to UIMA type mapping from this location instead of locating the
     * mapping automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid spamming
     * the heap with thousands of strings representing only a few different tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    /**
     * Log the tag set(s) when a model is loaded.
     *
     * Default: {@code false}
     */
    // NOTE(review): this field is not read anywhere inside this class - confirm whether tag set
    // printing is expected to happen here.
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
    protected boolean printTagSet;

    private CasConfigurableProviderBase<File> modelProvider;
    private RuntimeProvider runtimeProvider;
    private MappingProvider posMappingProvider;

    /**
     * Sets up the model provider, the provider for the native executable and the POS mapping
     * provider. No external process is started here.
     *
     * @param aContext
     *            the UIMA context carrying the configuration parameters.
     * @throws ResourceInitializationException
     *             if the super class or one of the providers cannot be initialized.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        modelProvider = new CasConfigurableProviderBase<File>()
        {
            {
                setContextObject(HunPosTagger.this);

                setDefault(ARTIFACT_ID, "${groupId}.hunpos-model-tagger-${language}-${variant}");
                setDefault(LOCATION, "classpath:/de/tudarmstadt/ukp/dkpro/core/hunpos/lib/"
                        + "tagger-${language}-${variant}.model");
                setDefault(VARIANT, "default");
                setDefaultVariantsLocation("de/tudarmstadt/ukp/dkpro/core/hunpos/lib/tagger-default-variants.map");

                setOverride(LOCATION, modelLocation);
                setOverride(LANGUAGE, language);
                setOverride(VARIANT, variant);
            }

            @Override
            protected File produceResource(URL aUrl)
                throws IOException
            {
                // The external tagger needs the model as a file on disk.
                return ResourceUtils.getUrlAsFile(aUrl, true);
            }
        };

        // provider for the hunpos-tag binary
        runtimeProvider = new RuntimeProvider(
                "classpath:/de/tudarmstadt/ukp/dkpro/core/hunpos/bin/");

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                language, modelProvider);
    }

    /**
     * Tags all tokens of all sentences in the given CAS by piping them through a freshly started
     * {@code hunpos-tag} process. For every token a {@link POS} annotation (or a subtype chosen by
     * the mapping provider) is created, indexed and attached to the token.
     *
     * @param aJCas
     *            the CAS to process.
     * @throws AnalysisEngineProcessException
     *             if the model lacks the {@code model.encoding} metadata, the executable cannot be
     *             obtained, or communication with the process fails or yields unexpected output.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();

        modelProvider.configure(cas);
        posMappingProvider.configure(cas);

        // The encoding is used for both directions of the pipe to the tagger process.
        String modelEncoding = (String) modelProvider.getResourceMetaData().get("model.encoding");
        if (modelEncoding == null) {
            throw new AnalysisEngineProcessException(
                    new IllegalStateException("Model should contain encoding metadata"));
        }

        File model = modelProvider.getResource();
        File executable;
        try {
            executable = runtimeProvider.getFile("hunpos-tag");
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }

        ProcessBuilder pb = new ProcessBuilder(executable.getAbsolutePath(),
                model.getAbsolutePath());
        // Let the tagger's diagnostics show up on our own stderr instead of filling a pipe.
        pb.redirectError(Redirect.INHERIT);

        // Diagnostic state, only logged when processing fails.
        StringBuilder lastOut = new StringBuilder();
        String lastIn = null;
        boolean success = false;

        Process proc = null;
        try {
            proc = pb.start();

            try (PrintWriter out = new PrintWriter(new OutputStreamWriter(
                    proc.getOutputStream(), modelEncoding));
                    BufferedReader in = new BufferedReader(new InputStreamReader(
                            proc.getInputStream(), modelEncoding))) {
                for (Sentence sentence : select(aJCas, Sentence.class)) {
                    List<Token> tokens = selectCovered(Token.class, sentence);

                    // Skip empty sentences
                    if (tokens.isEmpty()) {
                        continue;
                    }

                    // Send full sentence: one token per line, terminated by an empty line
                    for (Token token : tokens) {
                        lastOut.append(token.getCoveredText()).append(' ');
                        out.printf("%s%n", token.getCoveredText());
                    }
                    out.printf("%n");
                    // PrintWriter swallows I/O errors; checkError() flushes and reports them.
                    if (out.checkError()) {
                        throw new IOException("Unable to send sentence to hunpos-tag");
                    }

                    // Read sentence tags: one "token<TAB>tag" line per token
                    String[] tags = new String[tokens.size()];
                    for (int i = 0; i < tags.length; i++) {
                        lastIn = in.readLine();
                        tags[i] = parseTag(lastIn);
                    }
                    in.readLine(); // Read extra new line after sentence

                    int i = 0;
                    for (Token t : tokens) {
                        Type posTag = posMappingProvider.getTagType(tags[i]);
                        POS posAnno = (POS) cas.createAnnotation(posTag, t.getBegin(),
                                t.getEnd());
                        posAnno.setPosValue(internTags ? tags[i].intern() : tags[i]);
                        // Only subtypes of POS carry a coarse-grained value.
                        posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null
                                : posAnno.getType().getShortName().intern());
                        posAnno.addToIndexes();
                        t.setPos(posAnno);
                        i++;
                    }

                    lastOut.setLength(0);
                }

                success = true;
            }
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
        finally {
            if (!success) {
                getLogger().error("Sent before error: [" + lastOut + "]");
                getLogger().error("Last response before error: [" + lastIn + "]");
            }
            if (proc != null) {
                proc.destroy();
            }
        }
    }

    /**
     * Extracts the tag from one output line of the tagger, i.e. the trimmed text following the
     * first tab character.
     *
     * @param aLine
     *            a line read from the tagger, or {@code null} if the stream has ended.
     * @return the tag.
     * @throws IOException
     *             if the stream ended prematurely or the line contains no tab separator.
     */
    private static String parseTag(String aLine)
        throws IOException
    {
        if (aLine == null) {
            throw new IOException("Unexpected end of output from hunpos-tag");
        }
        int separator = aLine.indexOf('\t');
        if (separator < 0) {
            throw new IOException("Unexpected output from hunpos-tag: [" + aLine + "]");
        }
        return aLine.substring(separator + 1).trim();
    }

    /**
     * Removes the installed native executable and then delegates to the super class.
     */
    @Override
    public void destroy()
    {
        // May still be null if initialize() failed before the provider was created.
        if (runtimeProvider != null) {
            runtimeProvider.uninstall();
        }
        super.destroy();
    }
}