/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.sfst;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.ProcessBuilder.Redirect;
import java.net.URL;
import java.util.List;
import java.util.Properties;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.morph.MorphologicalFeaturesParser;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.SingletonTagset;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.LittleEndianDataInputStream;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
 * Sfst morphological analyzer.
 *
 * <p>
 * For every call to {@link #process(JCas)} the external {@code fst-infl2} executable is started
 * with the model resolved by the model provider. The tokens of each sentence are written to the
 * standard input of that process, one token per line, and the analyses are read back from its
 * standard output and attached to the tokens as {@link MorphologicalFeatures} annotations.
 * </p>
 */
@TypeCapability(
        inputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence" },
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
                "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures" })
public class SfstAnnotator
    extends JCasAnnotator_ImplBase
{
    /**
     * Marker line sent to the analyzer after every token. The analyzer's response to this marker
     * is used to detect the end of the analyses belonging to the preceding token.
     */
    private static final String FLUSH_TOKEN = "-= FLUSH =-";

    /**
     * Controls how many of the analyses returned for a token are recorded.
     */
    public static enum Mode {
        /** Record only the first analysis returned for a token. */
        FIRST,
        /** Record every analysis returned for a token. */
        ALL
    }

    /**
     * Write part-of-speech information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS;
    @ConfigurationParameter(name=PARAM_WRITE_POS, mandatory=true, defaultValue="true")
    private boolean writePos;

    /**
     * Write lemma information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA;
    @ConfigurationParameter(name=PARAM_WRITE_LEMMA, mandatory=true, defaultValue="true")
    private boolean writeLemma;

    /**
     * Use this language instead of the document language to resolve the model.
     */
    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    private String language;

    /**
     * Override the default variant used to locate the model.
     */
    public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
    @ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
    private String variant;

    /**
     * Load the model from this location instead of locating the model automatically.
     */
    public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
    @ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
    private String modelLocation;

    /**
     * Write the tag set(s) to the log when a model is loaded.
     */
    public static final String PARAM_PRINT_TAGSET = ComponentParameters.PARAM_PRINT_TAGSET;
    @ConfigurationParameter(name = PARAM_PRINT_TAGSET, mandatory = true, defaultValue = "false")
    protected boolean printTagSet;

    /**
     * Specifies the model encoding.
     */
    // NOTE(review): process() takes the encoding from the "model.encoding" entry of the model
    // metadata and does not read this field - confirm whether the parameter is meant to act as
    // an override.
    public static final String PARAM_MODEL_ENCODING = ComponentParameters.PARAM_MODEL_ENCODING;
    @ConfigurationParameter(name = PARAM_MODEL_ENCODING, mandatory = true, defaultValue="UTF-8")
    private String modelEncoding;

    /**
     * Select whether only the first analysis ({@link Mode#FIRST}) or all analyses
     * ({@link Mode#ALL}) of a token are recorded.
     *
     * Default: {@code FIRST}
     */
    public static final String PARAM_MODE = "mode";
    @ConfigurationParameter(name = PARAM_MODE, mandatory = true, defaultValue="FIRST")
    private Mode mode;

    /**
     * Location of the mapping for morphological features.
     */
    // NOTE(review): this field is not read in this class; presumably it is picked up by the
    // MorphologicalFeaturesParser - verify.
    public static final String PARAM_MORPH_MAPPING_LOCATION = ComponentParameters.PARAM_MORPH_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_MORPH_MAPPING_LOCATION, mandatory = false)
    private String morphMappingLocation;

    private ModelProviderBase<File> modelProvider;
    private MorphologicalFeaturesParser featuresParser;
    private RuntimeProvider runtimeProvider;

    /**
     * Sets up the model provider, the morphological features parser and the provider for the
     * native executable.
     *
     * @param aContext
     *            the UIMA context.
     * @throws ResourceInitializationException
     *             if the super class initialization fails.
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        // Returns FST automaton for specified language, which is then passed to fst-infl from SFST.
        // Currently available for Turkish and German.
        modelProvider = new ModelProviderBase<File>(this, "sfst", "morph")
        {
            /**
             * Reads the alphabet stored at the beginning of the model to collect the
             * morphological tags and returns the model as a local file.
             */
            @Override
            protected File produceResource(URL aUrl)
                throws IOException
            {
                Properties metadata = getResourceMetaData();

                SingletonTagset morphFeats = new SingletonTagset(
                        MorphologicalFeatures.class, metadata.getProperty("morph.tagset"));

                try (LittleEndianDataInputStream is = new LittleEndianDataInputStream(
                        aUrl.openStream())) {
                    byte type = is.readByte(); // "c" for "compact"
                    if (type != 0x63) {
                        throw new IOException("Incompatible model. Must be a compact model.");
                    }

                    byte enc = is.readByte(); // "0" for ??? - "1" for UTF-8
                    getLogger().info("Model encoding: " + (enc == 0 ? "unknown" : "UTF-8"));

                    short n = is.readShort(); // alphabet size
                    for (int i = 0; i < n; i++) {
                        // The symbol index is not needed, but it has to be consumed to get to
                        // the symbol string that follows it.
                        is.readShort();
                        String symbol = readZeroTerminatedString(is, "UTF-8");
                        // Only symbols of the form <...> are recorded as tags
                        if (symbol.startsWith("<") && symbol.endsWith(">") && symbol.length() > 2) {
                            morphFeats.add(symbol);
                        }
                    }
                }

                addTagset(morphFeats);

                if (printTagSet) {
                    getLogger().info(getTagset().toString());
                }

                return ResourceUtils.getUrlAsFile(aUrl, true);
            }

            /**
             * Reads bytes up to (and consuming) the next zero byte and decodes them using the
             * given encoding.
             */
            private String readZeroTerminatedString(DataInput aIn, String aEncoding)
                throws IOException
            {
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                byte b = aIn.readByte();
                while (b != 0) {
                    bos.write(b);
                    b = aIn.readByte();
                }
                return new String(bos.toByteArray(), aEncoding);
            }
        };

        featuresParser = new MorphologicalFeaturesParser(this, modelProvider);

        // provider for the sfst binary
        runtimeProvider = new RuntimeProvider("classpath:/de/tudarmstadt/ukp/dkpro/core/sfst/bin/");
    }

    /**
     * Analyzes all tokens of the document sentence by sentence using a freshly started
     * {@code fst-infl2} process.
     *
     * @param aJCas
     *            the document to process.
     * @throws AnalysisEngineProcessException
     *             if the model carries no encoding metadata, if the executable cannot be
     *             provided, or if the communication with the process fails - including the case
     *             that the process stops responding before all tokens have been analyzed.
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        CAS cas = aJCas.getCas();

        modelProvider.configure(cas);
        featuresParser.configure(cas);

        // The encoding used to talk to the process comes from the model metadata.
        String encoding = (String) modelProvider.getResourceMetaData().get("model.encoding");
        if (encoding == null) {
            throw new AnalysisEngineProcessException(
                    new IllegalStateException("Model should contain encoding metadata"));
        }

        File model = modelProvider.getResource();
        File executable;
        try {
            executable = runtimeProvider.getFile("fst-infl2");
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }

        ProcessBuilder pb = new ProcessBuilder(executable.getAbsolutePath(), "-s", "-q",
                model.getAbsolutePath());
        pb.redirectError(Redirect.INHERIT);

        // Bookkeeping for the diagnostic log messages written when something goes wrong
        StringBuilder lastOut = new StringBuilder();
        String lastIn = null;
        boolean success = false;

        Process proc = null;
        try {
            proc = pb.start();

            try (PrintWriter out = new PrintWriter(
                         new OutputStreamWriter(proc.getOutputStream(), encoding));
                 BufferedReader in = new BufferedReader(
                         new InputStreamReader(proc.getInputStream(), encoding))) {

                for (Sentence sentence : select(aJCas, Sentence.class)) {
                    List<Token> tokens = selectCovered(Token.class, sentence);

                    // Skip empty sentences
                    if (tokens.isEmpty()) {
                        continue;
                    }

                    // Send full sentence, each token followed by the flush marker
                    for (Token token : tokens) {
                        lastOut.append(token.getCoveredText()).append(' ');
                        out.printf("%s%n", token.getCoveredText());
                        out.printf("%s%n", FLUSH_TOKEN);
                    }

                    // PrintWriter never throws on write errors. checkError() flushes the
                    // stream and reports whether any write or the flush itself has failed.
                    if (out.checkError()) {
                        throw new IOException("Unable to send data to the fst-infl2 process");
                    }

                    // Read sentence tags
                    tokenLoop: for (Token token : tokens) {
                        boolean skip = false;
                        while ((lastIn = in.readLine()) != null) {
                            if (lastIn.startsWith(">")) {
                                // Echo line, ignore. This check must come before the flush
                                // marker check because the marker itself is echoed as well.
                                continue;
                            }

                            if (lastIn.contains(FLUSH_TOKEN)) {
                                // End of analysis
                                continue tokenLoop;
                            }

                            if (lastIn.startsWith("no result for")) {
                                // No analysis for this token
                                MorphologicalFeatures morph = new MorphologicalFeatures(aJCas,
                                        token.getBegin(), token.getEnd());
                                morph.setValue("");
                                morph.addToIndexes();
                                if (token.getMorph() == null) {
                                    token.setMorph(morph);
                                }
                                // We need to keep reading because we still need to consume
                                // the flush marker.
                                continue;
                            }

                            // Analysis line
                            if (!skip) {
                                MorphologicalFeatures morph = featuresParser
                                        .parse(aJCas, token, lastIn);
                                if (token.getMorph() == null) {
                                    token.setMorph(morph);
                                }
                            }

                            switch (mode) {
                            case FIRST:
                                // Ignore the remaining analyses of this token
                                skip = true;
                                break;
                            case ALL:
                                // We record all analyses
                                break;
                            }
                        }

                        // The only way to get here is that the output of the process ended
                        // before the flush marker of the current token was seen. Fail instead
                        // of silently leaving the remaining tokens without analysis.
                        throw new IOException(
                                "Unexpected end of output from fst-infl2 while reading analysis "
                                        + "for token [" + token.getCoveredText() + "]");
                    }

                    lastOut.setLength(0);
                }
            }

            success = true;
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
        finally {
            if (!success) {
                getLogger().error("Sent before error: [" + lastOut + "]");
                getLogger().error("Last response before error: [" + lastIn + "]");
            }
            if (proc != null) {
                proc.destroy();
            }
        }
    }

    /**
     * Uninstalls the files provided by the runtime provider and destroys the component.
     */
    @Override
    public void destroy()
    {
        // The provider is not set if initialization failed before it was created
        if (runtimeProvider != null) {
            runtimeProvider.uninstall();
        }
        super.destroy();
    }
}