/**
* Copyright 2007-2014
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package de.tudarmstadt.ukp.dkpro.core.matetools;
import static org.apache.uima.fit.util.JCasUtil.indexCovered;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.zip.ZipFile;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.Morpheme;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CasConfigurableProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ModelProviderBase;
import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import se.lth.cs.srl.SemanticRoleLabeler;
import se.lth.cs.srl.corpus.Predicate;
import se.lth.cs.srl.corpus.Word;
import se.lth.cs.srl.languages.Language;
import se.lth.cs.srl.languages.Language.L;
import se.lth.cs.srl.pipeline.Pipeline;
/**
* DKPro Annotator for the MateTools Semantic Role Labeler.
*<p>
* Please cite the following paper, if you use the semantic role labeler
* Anders Björkelund, Love Hafdell, and Pierre Nugues. Multilingual semantic role labeling.
* In Proceedings of The Thirteenth Conference on Computational Natural Language Learning (CoNLL-2009),
* pages 43--48, Boulder, June 4--5 2009.
* </p>
*/
@TypeCapability(
inputs = {
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" },
outputs = {
"de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred",
"de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg" })
public class MateSemanticRoleLabeler extends JCasConsumer_ImplBase {
/**
* Use this language instead of the document language to resolve the model.
*/
public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
@ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
protected String language;
/**
* Load the model from this location instead of locating the model automatically.
*/
public static final String PARAM_MODEL_LOCATION = ComponentParameters.PARAM_MODEL_LOCATION;
@ConfigurationParameter(name = PARAM_MODEL_LOCATION, mandatory = false)
protected String modelLocation;
/**
* Override the default variant used to locate the model.
*/
public static final String PARAM_VARIANT = ComponentParameters.PARAM_VARIANT;
@ConfigurationParameter(name = PARAM_VARIANT, mandatory = false)
protected String variant;
private CasConfigurableProviderBase<SemanticRoleLabeler> modelProvider;
private static final String UNUSED = "_";
private static final int UNUSED_INT = -1;
private static final Pattern NEWLINE_PATTERN=Pattern.compile("\n");
@Override
public void initialize(UimaContext context)
throws ResourceInitializationException
{
super.initialize(context);
modelProvider = new ModelProviderBase<SemanticRoleLabeler>(this, "matetools", "srl")
{
@Override
protected SemanticRoleLabeler produceResource(URL aUrl)
throws IOException
{
File modelFile = ResourceUtils.getUrlAsFile(aUrl, false);
try {
ZipFile zipFile = new ZipFile(modelFile);
SemanticRoleLabeler srl = Pipeline.fromZipFile(zipFile);
zipFile.close();
return srl;
}
catch (Exception e) {
throw new IOException(e);
}
}
};
}
@Override
public void process(JCas jcas)
throws AnalysisEngineProcessException
{
modelProvider.configure(jcas.getCas());
SemanticRoleLabeler srl = modelProvider.getResource();
//Set the language information for SRL
switch(jcas.getDocumentLanguage()){
case "de": Language.setLanguage(L.ger); break;
case "en": Language.setLanguage(L.eng); break;
case "zh": Language.setLanguage(L.chi); break;
case "es": Language.setLanguage(L.spa); break;
default: throw new AnalysisEngineProcessException("Language not supported", null);
}
for(Sentence s : JCasUtil.select(jcas, Sentence.class)) {
String conll2009String = convert(jcas, s);
se.lth.cs.srl.corpus.Sentence sen = se.lth.cs.srl.corpus.Sentence.newDepsOnlySentence(NEWLINE_PATTERN.split(conll2009String));
srl.parseSentence(sen);
List<Predicate> preds = sen.getPredicates();
List<Token> tokens = JCasUtil.selectCovered(Token.class, s);
for(Predicate pred : preds) {
//Add the predicates
Token predToken = tokens.get(pred.getIdx()-1);
SemPred semanticPredicate = new SemPred(jcas, predToken.getBegin(), predToken.getEnd());
semanticPredicate.setCategory(pred.getSense());
semanticPredicate.addToIndexes();
//Add the arguments
Map<Word, String> argmap = pred.getArgMap();
List<SemArgLink> arguments = new LinkedList<>();
for(Map.Entry<Word, String> entry : argmap.entrySet()) {
Token argumentToken = tokens.get(entry.getKey().getIdx()-1);
SemArg arg = new SemArg(jcas, argumentToken.getBegin(), argumentToken.getEnd());
arg.addToIndexes();
SemArgLink link = new SemArgLink(jcas);
link.setRole(pred.getArgumentTag(entry.getKey()));
link.setTarget(arg);
arguments.add(link);
}
//Add the arguments to the predicate
semanticPredicate.setArguments(
FSCollectionFactory.createFSArray(jcas, arguments));
}
}
}
private String convert(JCas aJCas, Sentence sentence)
{
Map<Token, Collection<SemPred>> predIdx = indexCovered(aJCas, Token.class, SemPred.class);
Map<SemArg, Collection<Token>> argIdx = indexCovered(aJCas, SemArg.class, Token.class);
HashMap<Token, Row> ctokens = new LinkedHashMap<Token, Row>();
StringBuilder conll2009String = new StringBuilder();
// Tokens
List<Token> tokens = selectCovered(Token.class, sentence);
// Check if we should try to include the FEATS in output
List<Morpheme> morphology = selectCovered(Morpheme.class, sentence);
boolean useFeats = tokens.size() == morphology.size();
int tokenSize = tokens.size();
int morhSize = morphology.size();
List<SemPred> preds = selectCovered(SemPred.class, sentence);
for (int i = 0; i < tokens.size(); i++) {
Row row = new Row();
row.id = i+1;
row.token = tokens.get(i);
row.args = new SemArgLink[preds.size()];
if (useFeats) {
row.feats = morphology.get(i);
}
// If there are multiple semantic predicates for the current token, then
// we keep only the first
Collection<SemPred> predsForToken = predIdx.get(row.token);
if (predsForToken != null && !predsForToken.isEmpty()) {
row.pred = predsForToken.iterator().next();
}
ctokens.put(row.token, row);
}
// Dependencies
for (Dependency rel : selectCovered(Dependency.class, sentence)) {
ctokens.get(rel.getDependent()).deprel = rel;
}
// Semantic arguments
for (int p = 0; p < preds.size(); p++) {
FSArray args = preds.get(p).getArguments();
for (SemArgLink link : select(args, SemArgLink.class)) {
for (Token t : argIdx.get(link.getTarget())) {
Row row = ctokens.get(t);
row.args[p] = link;
}
}
}
// Write sentence in CONLL 2009 format
for (Row row : ctokens.values()) {
int id = row.id;
String form = row.token.getCoveredText();
String lemma = UNUSED;
if (row.token.getLemma() != null) {
lemma = row.token.getLemma().getValue();
}
String plemma = lemma;
String pos = UNUSED;
if (row.token.getPos() != null) {
POS posAnno = row.token.getPos();
pos = posAnno.getPosValue();
}
String ppos = pos;
String feat = UNUSED;
if (row.feats != null) {
feat = row.feats.getMorphTag();
}
String pfeat = feat;
int headId = UNUSED_INT;
String deprel = UNUSED;
if (row.deprel != null) {
deprel = row.deprel.getDependencyType();
headId = ctokens.get(row.deprel.getGovernor()).id;
if (headId == row.id) {
// ROOT dependencies may be modeled as a loop, ignore these.
headId = 0;
}
} else {
headId = 0; //Mate SRL expects the head to have id = 0
}
String head = UNUSED;
if (headId != UNUSED_INT) {
head = Integer.toString(headId);
}
String phead = head;
String pdeprel = deprel;
String fillpred = UNUSED;
String pred = UNUSED;
StringBuilder apreds = new StringBuilder();
conll2009String.append(
String.format("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id, form,
lemma, plemma, pos, ppos, feat, pfeat, head, phead, deprel, pdeprel, fillpred, pred, apreds)
);
}
return conll2009String.toString();
}
private static final class Row {
int id;
Token token;
Morpheme feats;
Dependency deprel;
SemPred pred;
SemArgLink[] args; // These are the arguments roles for the current token!
}
}