/*
* Copyright 2012
* Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.tcf;
import static org.apache.commons.io.IOUtils.closeQuietly;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT;
import eu.clarin.weblicht.wlfxb.io.TextCorpusStreamed;
import eu.clarin.weblicht.wlfxb.io.WLDObjector;
import eu.clarin.weblicht.wlfxb.io.WLFormatException;
import eu.clarin.weblicht.wlfxb.tc.api.DependencyParse;
import eu.clarin.weblicht.wlfxb.tc.api.DependencyParsingLayer;
import eu.clarin.weblicht.wlfxb.tc.api.Reference;
import eu.clarin.weblicht.wlfxb.tc.api.TextCorpus;
import eu.clarin.weblicht.wlfxb.xb.WLData;
/**
* Reader for the WebLicht TCF format. It reads all the available annotation layers from the TCF
* file and converts them to CAS annotations. The TCF data does not provide begin/end offsets for
* all of its annotations, but CAS annotations require them. Hence, the offsets are calculated
* manually per token and stored in a map (token_id, token(CAS object)) from which the offsets can
* later be looked up.
*/
@MimeTypeCapability({MimeTypes.TEXT_TCF})
@TypeCapability(outputs = {
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity",
"de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
"de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain",
"de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink",
"de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class TcfReader
extends JCasResourceCollectionReader_ImplBase
{
// NOTE(review): this field is not read or written anywhere in the visible source. It is
// package-private, so verify that no other class in the package uses it before removing it.
int j = 0;
/**
 * Reads the next TCF resource and converts its content into the given CAS.
 *
 * @param aJCas
 *            the JCas to populate.
 * @throws IOException
 *             if the resource cannot be read.
 * @throws CollectionException
 *             if the resource is not valid TCF.
 */
@Override
public void getNext(JCas aJCas)
    throws IOException, CollectionException
{
    Resource resource = nextFile();
    initCas(aJCas, resource);

    InputStream stream = null;
    try {
        stream = new BufferedInputStream(resource.getInputStream());
        TextCorpus corpus = WLDObjector.read(stream).getTextCorpus();
        convertToCas(aJCas, corpus);
    }
    catch (WLFormatException e) {
        // Malformed TCF is reported as a collection-level problem, keeping the cause.
        throw new CollectionException(e);
    }
    finally {
        closeQuietly(stream);
    }
}
/**
 * Converts the supported TCF layers into CAS annotations. Text and tokens are converted first;
 * all other layers are anchored on tokens and are therefore only converted if at least one
 * token was created.
 *
 * @param aJCas
 *            the JCas to populate.
 * @param aCorpusData
 *            the TCF document.
 */
private void convertToCas(JCas aJCas, TextCorpus aCorpusData)
{
    convertText(aJCas, aCorpusData);

    Map<String, Token> tokensById = convertTokens(aJCas, aCorpusData);
    if (tokensById.isEmpty()) {
        // Nothing to anchor the remaining layers on.
        return;
    }

    convertPos(aJCas, aCorpusData, tokensById);
    convertLemma(aJCas, aCorpusData, tokensById);
    convertSentences(aJCas, aCorpusData, tokensById);
    convertDependencies(aJCas, aCorpusData, tokensById);
    convertNamedEntities(aJCas, aCorpusData, tokensById);
    convertCoreference(aJCas, aCorpusData, tokensById);
}
/**
 * Builds the document text from the {@link eu.clarin.weblicht.wlfxb.tc.api.Token} annotation
 * layer by joining the token strings with single spaces. The getText method of
 * {@link TextCorpusStreamed} is not used because some tokens, such as special characters, are
 * represented differently than in the original text.
 * <p>
 * If the TCF document has no tokens layer, the document text is set to the empty string, which
 * is the same result as for an empty tokens layer. The document language is set in either case.
 *
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 */
private void convertText(JCas aJCas, TextCorpus aCorpusData)
{
    StringBuilder text = new StringBuilder();

    // A document without a tokens layer must not fail here; convertTokens() treats the missing
    // layer as "no tokens" as well.
    if (aCorpusData.getTokensLayer() != null) {
        int tokenCount = aCorpusData.getTokensLayer().size();
        for (int i = 0; i < tokenCount; i++) {
            if (i > 0) {
                text.append(' ');
            }
            text.append(aCorpusData.getTokensLayer().getToken(i).getString());
        }
    }

    aJCas.setDocumentText(text.toString());
    aJCas.setDocumentLanguage(aCorpusData.getLanguage());
}
/**
 * Converts the TCF tokens layer to CAS {@link Token} annotations. The offsets are determined by
 * searching for each token string in the document text, starting at the end of the previous
 * token.
 *
 * @param aJCas
 *            the JCas; its document text must already be set.
 * @param aCorpusData
 *            the TCF document.
 * @return a map of (token_id, Token) for later reference; empty if there is no tokens layer.
 * @throws IllegalStateException
 *             if a token string cannot be found in the document text.
 */
private Map<String, Token> convertTokens(JCas aJCas, TextCorpus aCorpusData)
{
    Map<String, Token> tokens = new HashMap<String, Token>();
    if (aCorpusData.getTokensLayer() == null) {
        // No layer to read from.
        return tokens;
    }

    String text = aJCas.getDocumentText();
    int searchFrom = 0;
    int tokenCount = aCorpusData.getTokensLayer().size();
    for (int i = 0; i < tokenCount; i++) {
        eu.clarin.weblicht.wlfxb.tc.api.Token token = aCorpusData.getTokensLayer().getToken(i);
        String form = token.getString();

        int begin = text.indexOf(form, searchFrom);
        if (begin < 0) {
            // Fail with context instead of creating an annotation at a negative offset.
            throw new IllegalStateException("Token [" + token.getID() + "] with text [" + form
                    + "] not found in document text at or after offset " + searchFrom);
        }
        // The end follows directly from the begin; no second search is needed.
        int end = begin + form.length();

        Token outToken = new Token(aJCas, begin, end);
        outToken.addToIndexes();
        tokens.put(token.getID(), outToken);

        // Continue after this token so that repeated token strings map to later positions.
        searchFrom = end;
    }
    return tokens;
}
/**
 * Converts the TCF POS tags layer to CAS {@link POS} annotations and attaches each of them to
 * its token. Only the first token referenced by a tag is used for the offsets.
 *
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 * @param aTokens
 *            id/token map.
 */
private void convertPos(JCas aJCas, TextCorpus aCorpusData, Map<String, Token> aTokens)
{
    if (aCorpusData.getPosTagsLayer() == null) {
        // No layer to read from.
        return;
    }

    for (int index = 0; index < aCorpusData.getPosTagsLayer().size(); index++) {
        String tagValue = aCorpusData.getPosTagsLayer().getTag(index).getString();
        eu.clarin.weblicht.wlfxb.tc.api.Token[] taggedTokens = aCorpusData.getPosTagsLayer()
                .getTokens(aCorpusData.getPosTagsLayer().getTag(index));
        Token casToken = aTokens.get(taggedTokens[0].getID());

        POS pos = new POS(aJCas, casToken.getBegin(), casToken.getEnd());
        pos.setPosValue(tagValue);
        // The coarse value is only derived from the type name for subtypes of POS.
        if (POS.class.equals(pos.getClass())) {
            pos.setCoarseValue(null);
        }
        else {
            pos.setCoarseValue(pos.getType().getShortName().intern());
        }
        pos.addToIndexes();

        // Make the POS reachable from its token.
        casToken.setPos(pos);
    }
}
/**
 * Converts the TCF lemmas layer to CAS {@link Lemma} annotations and attaches each of them to
 * its token. Only the first token referenced by a lemma is used for the offsets.
 *
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 * @param aTokens
 *            id/token map.
 */
private void convertLemma(JCas aJCas, TextCorpus aCorpusData, Map<String, Token> aTokens)
{
    if (aCorpusData.getLemmasLayer() == null) {
        // No layer to read from.
        return;
    }

    for (int index = 0; index < aCorpusData.getLemmasLayer().size(); index++) {
        String lemmaValue = aCorpusData.getLemmasLayer().getLemma(index).getString();
        eu.clarin.weblicht.wlfxb.tc.api.Token[] lemmatizedTokens = aCorpusData.getLemmasLayer()
                .getTokens(aCorpusData.getLemmasLayer().getLemma(index));
        Token casToken = aTokens.get(lemmatizedTokens[0].getID());

        Lemma lemma = new Lemma(aJCas, casToken.getBegin(), casToken.getEnd());
        lemma.setValue(lemmaValue);
        lemma.addToIndexes();

        // Make the lemma reachable from its token.
        casToken.setLemma(lemma);
    }
}
/**
 * Converts the TCF sentences layer to CAS {@link Sentence} annotations. A sentence spans from
 * the begin of its first token to the end of its last token.
 *
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 * @param aTokens
 *            id/token map.
 */
private void convertSentences(JCas aJCas, TextCorpus aCorpusData,
        Map<String, Token> aTokens)
{
    if (aCorpusData.getSentencesLayer() == null) {
        // No layer to read from.
        return;
    }

    for (int index = 0; index < aCorpusData.getSentencesLayer().size(); index++) {
        eu.clarin.weblicht.wlfxb.tc.api.Token[] members = aCorpusData.getSentencesLayer()
                .getTokens(aCorpusData.getSentencesLayer().getSentence(index));

        Token firstToken = aTokens.get(members[0].getID());
        Token lastToken = aTokens.get(members[members.length - 1].getID());

        Sentence sentence = new Sentence(aJCas, firstToken.getBegin(), lastToken.getEnd());
        sentence.addToIndexes();
    }
}
/**
 * Converts the TCF dependency parsing layer to CAS {@link Dependency} annotations. A dependency
 * without governor tokens is treated as a root node and converted to a {@link ROOT} annotation
 * whose governor is the dependent itself. Tokens taking part in a dependency but lacking a POS
 * receive the placeholder POS {@code --}; the only exception is a governor of a dependency
 * whose function is {@code ROOT}.
 *
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 * @param aTokens
 *            id/token map.
 */
private void convertDependencies(JCas aJCas, TextCorpus aCorpusData,
        Map<String, Token> aTokens)
{
    DependencyParsingLayer depLayer = aCorpusData.getDependencyParsingLayer();
    if (depLayer == null) {
        // No layer to read from.
        return;
    }

    for (int i = 0; i < depLayer.size(); i++) {
        DependencyParse dependencyParse = depLayer.getParse(i);
        for (eu.clarin.weblicht.wlfxb.tc.api.Dependency dependency : dependencyParse
                .getDependencies()) {
            eu.clarin.weblicht.wlfxb.tc.api.Token[] governorTokens = depLayer
                    .getGovernorTokens(dependency);
            eu.clarin.weblicht.wlfxb.tc.api.Token[] dependentTokens = depLayer
                    .getDependentTokens(dependency);

            Token dependent = aTokens.get(dependentTokens[0].getID());
            if (dependent.getPos() == null) {
                addPlaceholderPos(aJCas, dependent);
            }

            // A dependency without governor tokens is a root node.
            boolean isRoot = governorTokens == null;
            Token governor;
            if (isRoot) {
                governor = dependent;
            }
            else {
                governor = aTokens.get(governorTokens[0].getID());
                // Constant-first comparison: a dependency without a function must not fail.
                if (governor.getPos() == null && !"ROOT".equals(dependency.getFunction())) {
                    addPlaceholderPos(aJCas, governor);
                }
            }

            Dependency outDependency = isRoot ? new ROOT(aJCas) : new Dependency(aJCas);
            outDependency.setDependencyType(dependency.getFunction());
            outDependency.setGovernor(governor);
            outDependency.setDependent(dependent);
            // Dependencies are anchored on the dependent token.
            outDependency.setBegin(dependent.getBegin());
            outDependency.setEnd(dependent.getEnd());
            outDependency.setFlavor(depLayer.hasMultipleGovernors()
                    ? DependencyFlavor.ENHANCED : DependencyFlavor.BASIC);
            outDependency.addToIndexes();
        }
    }
}

/**
 * Creates the placeholder POS {@code --} (both as POS value and as coarse value) for a token
 * without POS, adds it to the indexes and attaches it to the token.
 *
 * @param aJCas
 *            the JCas.
 * @param aToken
 *            the token lacking a POS.
 */
private void addPlaceholderPos(JCas aJCas, Token aToken)
{
    getLogger().warn("There is no pos for this token, added [--] as a pos");
    POS placeholder = new POS(aJCas);
    placeholder.setBegin(aToken.getBegin());
    placeholder.setEnd(aToken.getEnd());
    placeholder.setPosValue("--");
    placeholder.setCoarseValue("--");
    placeholder.addToIndexes();
    aToken.setPos(placeholder);
}
/**
 * Converts the TCF named entities layer to CAS {@link NamedEntity} annotations. Each entity
 * spans all of its tokens and carries the TCF entity type as its value.
 *
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 * @param aTokens
 *            id/token map.
 */
private void convertNamedEntities(JCas aJCas, TextCorpus aCorpusData,
        Map<String, Token> aTokens)
{
    if (aCorpusData.getNamedEntitiesLayer() == null) {
        // No layer to read from.
        return;
    }

    for (int index = 0; index < aCorpusData.getNamedEntitiesLayer().size(); index++) {
        eu.clarin.weblicht.wlfxb.tc.api.NamedEntity tcfEntity = aCorpusData
                .getNamedEntitiesLayer().getEntity(index);
        eu.clarin.weblicht.wlfxb.tc.api.Token[] entityTokens = aCorpusData
                .getNamedEntitiesLayer().getTokens(tcfEntity);

        // span[0] is the begin offset, span[1] the end offset.
        int[] span = getOffsets(entityTokens, aTokens);

        NamedEntity casEntity = new NamedEntity(aJCas, span[0], span[1]);
        casEntity.setValue(tcfEntity.getType());
        casEntity.addToIndexes();
    }
}
/**
 * Coreferences in the CAS are represented by {@link CoreferenceChain} and
 * {@link CoreferenceLink}. The TCF representation uses <b>rel</b> and <b>target</b> to build
 * chains. Example: <br>
 * <i>
 * {@literal <entity><reference ID="rc_0" tokenIDs="t_0" mintokIDs="t_0" type="nam"/> } <br>
 * {@literal <reference ID="rc_1" tokenIDs="t_6" mintokIDs="t_6" type="pro.per3" rel="anaphoric" target="rc_0"/></entity>
 * }</i> <br>
 * For each referenced entity, the references are first converted to {@link CoreferenceLink}
 * annotations, which are collected in a map sorted by CAS address. The links are then connected
 * in that order to form a {@link CoreferenceChain}. A link without a reference relation takes
 * over the relation of the link following it.
 *
 * @param aJCas
 *            the JCas.
 * @param aCorpusData
 *            the TCF document.
 * @param aTokens
 *            id/token map.
 */
private void convertCoreference(JCas aJCas, TextCorpus aCorpusData,
        Map<String, Token> aTokens)
{
    if (aCorpusData.getReferencesLayer() == null) {
        // No layer to read from.
        return;
    }

    for (int i = 0; i < aCorpusData.getReferencesLayer().size(); i++) {
        eu.clarin.weblicht.wlfxb.tc.api.ReferencedEntity entity = aCorpusData
                .getReferencesLayer().getReferencedEntity(i);

        // TreeMap: iteration follows the CAS addresses of the links.
        Map<Integer, CoreferenceLink> referencesMap = new TreeMap<Integer, CoreferenceLink>();
        storeReferencesAndTargetsInMap(referencesMap, entity, aCorpusData, aTokens, aJCas);

        CoreferenceChain chain = new CoreferenceChain(aJCas);
        CoreferenceLink previous = null;
        for (CoreferenceLink current : referencesMap.values()) {
            if (previous == null) {
                // The chain is only indexed once it has a first link.
                chain.setFirst(current);
                chain.addToIndexes();
            }
            else {
                previous.setNext(current);
                if (previous.getReferenceRelation() == null) {
                    previous.setReferenceRelation(current.getReferenceRelation());
                }
            }
            // The links were already added to the indexes by storeReferencesAndTargetsInMap(),
            // so they must not be added again here.
            previous = current;
        }
    }
}
/**
 * Creates a {@link CoreferenceLink} for every reference of the given entity, adds it to the
 * indexes and stores it in the given map under its CAS address. A reference without a type is
 * given the type {@code nam}.
 *
 * @param aReferencesMap
 *            the map receiving the created links, keyed by CAS address.
 * @param entity
 *            the TCF referenced entity.
 * @param aCorpusData
 *            the TCF document.
 * @param aTokens
 *            id/token map.
 * @param aJcas
 *            the JCas.
 */
private void storeReferencesAndTargetsInMap(Map<Integer, CoreferenceLink> aReferencesMap,
        eu.clarin.weblicht.wlfxb.tc.api.ReferencedEntity entity, TextCorpus aCorpusData,
        Map<String, Token> aTokens, JCas aJcas)
{
    for (Reference reference : entity.getReferences()) {
        // Collect the token ids directly instead of joining them into a string and splitting
        // it again, which breaks on ids containing whitespace.
        List<String> tokenIds = new ArrayList<String>();
        for (eu.clarin.weblicht.wlfxb.tc.api.Token token : aCorpusData.getReferencesLayer()
                .getTokens(reference)) {
            tokenIds.add(token.getID());
        }
        // offsets[0] is the begin offset, offsets[1] the end offset.
        int[] offsets = getOffsets(tokenIds.toArray(new String[tokenIds.size()]), aTokens);

        CoreferenceLink link = new CoreferenceLink(aJcas);
        link.setBegin(offsets[0]);
        link.setEnd(offsets[1]);
        link.setReferenceType(reference.getType() == null ? "nam" : reference.getType());
        if (reference.getRelation() != null) {
            link.setReferenceRelation(reference.getRelation());
        }
        link.addToIndexes();
        aReferencesMap.put(link.getAddress(), link);
    }
}
/**
 * Determines the span covered by a set of TCF tokens: the smallest begin offset and the largest
 * end offset of the corresponding CAS tokens.
 *
 * @param aSpanTokens
 *            the {@link eu.clarin.weblicht.wlfxb.tc.api.Token}s making up the span
 * @param aAllTokens
 *            all available tokens in the file
 * @return a two-element array holding the begin offset at index 0 and the end offset at index 1.
 */
private int[] getOffsets(eu.clarin.weblicht.wlfxb.tc.api.Token[] aSpanTokens,
        Map<String, Token> aAllTokens)
{
    List<Integer> begins = new ArrayList<Integer>();
    List<Integer> ends = new ArrayList<Integer>();
    for (eu.clarin.weblicht.wlfxb.tc.api.Token spanToken : aSpanTokens) {
        Token casToken = aAllTokens.get(spanToken.getID());
        begins.add(casToken.getBegin());
        ends.add(casToken.getEnd());
    }

    int spanBegin = Collections.min(begins);
    int spanEnd = Collections.max(ends);
    return new int[] { spanBegin, spanEnd };
}
/**
 * Determines the span covered by a set of token ids: the smallest begin offset and the largest
 * end offset of the corresponding CAS tokens.
 *
 * @param aSpanTokens
 *            the ids of the tokens making up the span, e.g. [t_3, t_5, t_1]
 * @param aAllTokens
 *            all available tokens in the file
 * @return a two-element array holding the begin offset at index 0 and the end offset at index 1.
 */
private int[] getOffsets(String[] aSpanTokens, Map<String, Token> aAllTokens)
{
    List<Integer> begins = new ArrayList<Integer>();
    List<Integer> ends = new ArrayList<Integer>();
    for (String tokenId : aSpanTokens) {
        Token casToken = aAllTokens.get(tokenId);
        begins.add(casToken.getBegin());
        ends.add(casToken.getEnd());
    }

    int spanBegin = Collections.min(begins);
    int spanEnd = Collections.max(ends);
    return new int[] { spanBegin, spanEnd };
}
}