/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.tiger;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Unmarshaller;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeNode;
import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.IllegalAnnotationStructureException;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.AnnotationDecl;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.Meta;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerEdge;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerFeNode;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerFrame;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerFrameElement;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerGraph;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerNode;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerNonTerminal;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerPart;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerSem;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerSentence;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerSplitword;
import de.tudarmstadt.ukp.dkpro.core.io.tiger.internal.model.TigerTerminal;
/**
* UIMA collection reader for TIGER-XML files. Also supports the augmented format used in the
* Semeval 2010 task which includes semantic role data.
*/
@MimeTypeCapability({MimeTypes.APPLICATION_X_TIGER_XML, MimeTypes.APPLICATION_X_SEMEVAL_2010_XML})
@TypeCapability(
outputs = {
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent",
"de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg",
"de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred" })
public class TigerXmlReader
extends JCasResourceCollectionReader_ImplBase
{
/**
* Location of the mapping file for part-of-speech tags to UIMA types.
*/
public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION;
@ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
protected String mappingPosLocation;
/**
* Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the
* tag set defined as part of the model meta data. This can be useful if a custom model is
* specified which does not have such meta data, or it can be used in readers.
*/
public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
@ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
protected String posTagset;
/**
* Write Penn Treebank bracketed structure information. Mind this may not work with all tagsets,
* in particular not with such that contain "(" or ")" in their tags. The tree is generated
* using the original tag set in the corpus, not using the mapped tagset!
*
* Default: {@code false}
*/
public static final String PARAM_READ_PENN_TREE = ComponentParameters.PARAM_READ_PENN_TREE;
@ConfigurationParameter(name = PARAM_READ_PENN_TREE, mandatory = true, defaultValue = "false")
private boolean pennTreeEnabled;
/**
* If a sentence has an illegal structure (e.g. TIGER 2.0 has non-terminal nodes that do not
* have child nodes), then just ignore these sentences.
*
* Default: {@code false}
*/
public static final String PARAM_IGNORE_ILLEGAL_SENTENCES = "ignoreIllegalSentences";
@ConfigurationParameter(name = PARAM_IGNORE_ILLEGAL_SENTENCES, mandatory = true, defaultValue = "false")
private boolean ignoreIllegalSentences;
private MappingProvider posMappingProvider;
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException
{
super.initialize(aContext);
posMappingProvider = MappingProviderFactory.createPosMappingProvider(mappingPosLocation,
posTagset, getLanguage());
}
@Override
public void getNext(JCas aJCas)
throws IOException, CollectionException
{
Resource res = nextFile();
initCas(aJCas, res);
try {
posMappingProvider.configure(aJCas.getCas());
}
catch (AnalysisEngineProcessException e) {
throw new IOException(e);
}
InputStream is = null;
try {
is = CompressionUtils.getInputStream(res.getLocation(), res.getInputStream());
XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
XMLEventReader xmlEventReader = xmlInputFactory.createXMLEventReader(is);
JAXBContext context = JAXBContext.newInstance(Meta.class, AnnotationDecl.class,
TigerSentence.class);
Unmarshaller unmarshaller = context.createUnmarshaller();
JCasBuilder jb = new JCasBuilder(aJCas);
XMLEvent e = null;
while ((e = xmlEventReader.peek()) != null) {
if (isStartElement(e, "s")) {
TigerSentence sentence = unmarshaller
.unmarshal(xmlEventReader, TigerSentence.class).getValue();
try {
readSentence(jb, sentence);
}
catch (IllegalAnnotationStructureException ex) {
if (ignoreIllegalSentences) {
getLogger().warn("Unable to read sentence [" + sentence.id + "]: "
+ ex.getMessage());
}
else {
getLogger().error("Unable to read sentence [" + sentence.id + "]: "
+ ex.getMessage());
throw new CollectionException(ex);
}
}
}
else {
xmlEventReader.next();
}
}
jb.close();
// Can only do that after the builder is closed, otherwise the text is not yet set in
// the CAS and we get "null" for all token strings.
if (pennTreeEnabled) {
for (ROOT root : select(aJCas, ROOT.class)) {
PennTree pt = new PennTree(aJCas, root.getBegin(), root.getEnd());
PennTreeNode rootNode = PennTreeUtils.convertPennTree(root);
pt.setPennTree(PennTreeUtils.toPennTree(rootNode));
pt.addToIndexes();
}
}
}
catch (XMLStreamException ex1) {
throw new IOException(ex1);
}
catch (JAXBException ex2) {
throw new IOException(ex2);
}
finally {
closeQuietly(is);
}
}
protected void readSentence(JCasBuilder aBuilder, TigerSentence aSentence)
throws IllegalAnnotationStructureException
{
int sentenceBegin = aBuilder.getPosition();
int sentenceEnd = aBuilder.getPosition();
Map<String, Token> terminals = new LinkedHashMap<>();
Map<String, Constituent> nonterminals = new HashMap<>();
Map<String, String> tokenIdToTextMap = new HashMap<>();
for (TigerTerminal t : aSentence.graph.terminals) {
Token token = aBuilder.add(t.word, Token.class);
token.setId(t.id);
terminals.put(t.id, token);
tokenIdToTextMap.put(t.id, t.word);
if (t.lemma != null) {
Lemma lemma = new Lemma(aBuilder.getJCas(), token.getBegin(), token.getEnd());
lemma.setValue(t.lemma);
lemma.addToIndexes();
token.setLemma(lemma);
}
if (t.pos != null) {
Type posType = posMappingProvider.getTagType(t.pos);
POS posAnno = (POS) aBuilder.getJCas().getCas().createAnnotation(posType,
token.getBegin(), token.getEnd());
posAnno.setPosValue(t.pos.intern());
posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null
: posAnno.getType().getShortName().intern());
posAnno.addToIndexes();
token.setPos(posAnno);
}
// Remember position before adding space
sentenceEnd = aBuilder.getPosition();
aBuilder.add(" ");
}
aBuilder.add("\n");
Sentence sentence = new Sentence(aBuilder.getJCas(), sentenceBegin, sentenceEnd);
sentence.setId(aSentence.id);
sentence.addToIndexes();
if (aSentence.graph.root != null) {
readNode(aBuilder.getJCas(), terminals, nonterminals, aSentence.graph, null, null,
aSentence.graph.get(aSentence.graph.root));
}
// Read Semeval 2010 frame and role annotations
if (aSentence.sem != null) {
if (aSentence.sem.splitwords != null) {
// read splitwords as terminals/tokens
readSplit(aBuilder.getJCas(), terminals, aSentence.sem.splitwords, tokenIdToTextMap);
}
readSem(aBuilder.getJCas(), terminals, nonterminals, aSentence.sem, tokenIdToTextMap);
}
}
private void readSplit(JCas jCas, Map<String, Token> terminals, List<TigerSplitword> splitwords, Map<String, String> tokenIdToTextMap)
{
for (TigerSplitword split : splitwords) {
Token orig = terminals.get(split.idref);
int begin = orig.getBegin();
int end = 0;
for (TigerPart part : split.parts) {
end = begin + part.word.length();
Token t = new Token(jCas, begin, end);
t.addToIndexes();
terminals.put(part.id, t);
begin = end;
tokenIdToTextMap.put(part.id, part.word);
}
}
}
private void readSem(JCas jCas, Map<String, Token> terminals,
Map<String, Constituent> nonterminals, TigerSem sem, Map<String, String> tokenIdToTextMap)
{
if (sem.frames != null) {
for (TigerFrame frame : sem.frames) {
SemPred p = new SemPred(jCas);
p.setCategory(frame.name);
Set<Token> frameTokenSet = new HashSet<Token>();
for (TigerFeNode fenode : frame.target.fenodes) {
String reference = fenode.idref;
if (terminals.containsKey(reference)) {
Token target = terminals.get(reference);
frameTokenSet.add(target);
}
else if (nonterminals.containsKey(reference)) {
Constituent target = nonterminals.get(reference);
addAllTokens(frameTokenSet, target, nonterminals);
}
}
int[] boundary = getBoundaryOfFirstContiguousElement(frameTokenSet, terminals,
frame.id, tokenIdToTextMap);
p.setBegin(boundary[0]);
p.setEnd(boundary[1]);
List<SemArgLink> arguments = new ArrayList<>();
if (frame.fes != null) {
for (TigerFrameElement fe : frame.fes) {
if (fe.fenodes != null) {
for (TigerFeNode fenode : fe.fenodes) {
if (terminals.containsKey(fenode.idref)) {
Token argument = terminals.get(fenode.idref);
SemArg a = new SemArg(jCas, argument.getBegin(),
argument.getEnd());
a.addToIndexes();
SemArgLink link = new SemArgLink(jCas);
link.setRole(fe.name);
link.setTarget(a);
arguments.add(link);
}
else if (nonterminals.containsKey(fenode.idref)) {
Constituent argument = nonterminals.get(fenode.idref);
SemArg a = new SemArg(jCas, argument.getBegin(),
argument.getEnd());
a.addToIndexes();
SemArgLink link = new SemArgLink(jCas);
link.setRole(fe.name);
link.setTarget(a);
arguments.add(link);
}
}
}
}
FSArray fsa = new FSArray(jCas, arguments.size());
for (int i = 0; i < arguments.size(); i++) {
fsa.set(i, arguments.get(i));
}
p.setArguments(fsa);
}
p.addToIndexes();
}
}
}
private void addAllTokens(Set<Token> frameTokenSet, Constituent target,
Map<String, Constituent> nonterminals)
{
for (Token child : selectCovered(Token.class, target)) {
frameTokenSet.add((Token) child);
}
}
/**
* returns begin-end offset of first contiguous frame element in frameTokenSet
*
* @param frameTokenSet
* list of tokens in the current frame
* @param terminals
* all tokens of the sentence
* @return
*/
private int[] getBoundaryOfFirstContiguousElement(Set<Token> frameTokenSet,
Map<String, Token> terminals, String frameName, Map<String, String> tokenIdToTextMap)
{
// sort frameTokenSet
Token[] tokenArray = frameTokenSet.toArray(new Token[0]);
if (tokenArray.length > 1) {
// avoid unnecessary computation for single token frames
for (int i = 0, j; i < tokenArray.length; ++i) {
int minValue = tokenArray[i].getBegin();
int minValueIndex = i;
for (j = i + 1; j < tokenArray.length; ++j) {
if (tokenArray[j].getBegin() < minValue) {
minValue = tokenArray[j].getBegin();
minValueIndex = j;
}
}
Token temp = tokenArray[i];
tokenArray[i] = tokenArray[minValueIndex];
tokenArray[minValueIndex] = temp;
}
}
// merge begin-end boundary of nearby tokens
int i = 0;
List<int[]> tokenBoundaryList = new ArrayList<>();
List<String> tokenList = new ArrayList<>();
boolean continuousToken = false;
if (tokenArray.length > 1) {
// avoid unnecessary computation for single token frames
for (Entry<String, Token> entry : terminals.entrySet()) {
if (tokenArray[i].equals(entry.getValue())) {
if (continuousToken == false) {
tokenBoundaryList.add(
new int[] { tokenArray[i].getBegin(), tokenArray[i].getEnd() });
tokenList.add(entry.getKey());
}
else {
tokenBoundaryList.get(tokenBoundaryList.size() - 1)[1] = tokenArray[i]
.getEnd();
tokenList.set(tokenList.size() - 1,
tokenList.get(tokenList.size() - 1) + " " + entry.getKey());
}
continuousToken = true;
++i;
}
else {
continuousToken = false;
}
if (i >= tokenArray.length) {
break;
}
}
}
else {
tokenBoundaryList.add(new int[] { tokenArray[0].getBegin(), tokenArray[0].getEnd() });
}
//Give warning for noncontiguous frame targets
if (tokenBoundaryList.size() > 1) {
String completeFrameTarget = "";
for (String word : tokenList) {
String textRepresentation = tokenIdToTextMap.get(word);
if(textRepresentation == null){
textRepresentation = "";
for(String part:word.split(" ")) {
textRepresentation += tokenIdToTextMap.get(part) + " ";
}
textRepresentation = textRepresentation.trim();
}
completeFrameTarget += "<" + word + "," + textRepresentation + "> ";
}
getLogger().warn("Target of [" + frameName
+ "] frame consists of noncontiguous tokens! Tokens are: "
+ completeFrameTarget);
}
// return begin and end for first element
int begin = tokenBoundaryList.get(0)[0];
int end = tokenBoundaryList.get(0)[1];
return new int[] { begin, end };
}
private Annotation readNode(JCas aJCas, Map<String, Token> aTerminals,
Map<String, Constituent> aNonTerminals, TigerGraph aGraph, Constituent aParent,
TigerEdge aInEdge, TigerNode aNode)
throws IllegalAnnotationStructureException
{
int begin = Integer.MAX_VALUE;
int end = 0;
List<Annotation> children = new ArrayList<Annotation>();
if (aNode instanceof TigerNonTerminal) {
Constituent con;
if (aParent == null) {
con = new ROOT(aJCas);
}
else {
con = new Constituent(aJCas);
}
// TIGER 2.0 has some invalid non-terminal nodes without edges
if (aNode.edges == null) {
throw new IllegalAnnotationStructureException(
"Non-terminal node [" + aNode.id + "] has no edges.");
}
for (TigerEdge edge : aNode.edges) {
Annotation child = readNode(aJCas, aTerminals, aNonTerminals, aGraph, con, edge,
aGraph.get(edge.idref));
if (child instanceof Token) {
((Token) child).setParent(con);
}
children.add(child);
begin = Math.min(child.getBegin(), begin);
end = Math.max(child.getEnd(), end);
}
if (aInEdge != null && !"-".equals(aInEdge.label) && !"--".equals(aInEdge.label)) {
con.setSyntacticFunction(aInEdge.label);
}
con.setParent(aParent);
con.setConstituentType(((TigerNonTerminal) aNode).cat);
con.setChildren(FSCollectionFactory.createFSArray(aJCas, children));
con.setBegin(begin);
con.setEnd(end);
con.addToIndexes();
aNonTerminals.put(aNode.id, con);
return con;
}
else /* Terminal node */ {
return aTerminals.get(aNode.id);
}
}
public static boolean isStartElement(XMLEvent aEvent, String aElement)
{
return aEvent.isStartElement()
&& ((StartElement) aEvent).getName().getLocalPart().equals(aElement);
}
}