/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.tsv;
import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.io.IOUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.CASRuntimeException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.impl.LowLevelCAS;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.TagsetDescription;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
/**
* Exports annotations in a TAB-separated format. The header includes information about the UIMA
* types and features, so the number of columns depends on the number of types/features that exist.
* All the spans are written first and subsequently all the relations. A relation is given in the
* form of Source-->Target and the relation type is added to the Target token. The next column
* indicates the source of the relation (the token the arc is drawn from).
*
* @author Seid Muhie Yimam
*
*/
public class WebannoTsv2Writer
extends JCasFileWriter_ImplBase
{
/**
* Name of the configuration parameter that contains the character encoding used when writing the
* TSV output (the value is handed to the writer in {@code process}).
*/
public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
@ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
private String encoding;
/**
* Name of the configuration parameter holding the file name suffix that is passed to
* {@code getOutputStream} when the output stream for a document is obtained.
*/
public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix";
@ConfigurationParameter(name = PARAM_FILENAME_SUFFIX, mandatory = true, defaultValue = ".tsv")
private String filenameSuffix;
/**
* Name of the configuration parameter listing the type names whose values are written with
* {@code B-}/{@code I-} prefixes and whose empty cells are written as {@code O} instead of
* {@code _}.
*/
public static final String MULTIPLE_SPAN_ANNOTATIONS = "multipleSpans";
@ConfigurationParameter(name = MULTIPLE_SPAN_ANNOTATIONS, mandatory = true, defaultValue = {})
private List<String> multipleSpans;
// Short names of the two features that identify the end points of a relation annotation.
private final String DEPENDENT = "Dependent";
private final String GOVERNOR = "Governor";
// Short names of the features that mark a type as a coreference chain; such types are skipped.
private final String FIRST = "first";
private final String NEXT = "next";
// Maps the low-level FS reference of a token to its "<sentence number>-<token number>" id.
Map<Integer, String> tokenIds;
// Maps the begin offset of a token to the low-level FS reference of that token.
NavigableMap<Integer, Integer> tokenPositions;
/**
 * Exports the given CAS as TSV into the stream obtained from {@code getOutputStream} using the
 * configured file name suffix and encoding.
 *
 * @param aJCas
 *            the CAS to export.
 * @throws AnalysisEngineProcessException
 *             wrapping any exception raised while obtaining the stream or writing to it.
 */
@Override
public void process(JCas aJCas)
    throws AnalysisEngineProcessException
{
    OutputStream target = null;
    try {
        target = getOutputStream(aJCas, filenameSuffix);
        convertToTsv(aJCas, target, encoding);
    }
    catch (Exception cause) {
        throw new AnalysisEngineProcessException(cause);
    }
    finally {
        // Tolerates a null stream and suppresses any error raised while closing.
        IOUtils.closeQuietly(target);
    }
}
/**
 * Writes the whole document as TSV: one header line describing the exported types and features,
 * then, for every sentence, an {@code #id=} line, a {@code #text=} line and one line per token.
 * Span feature columns come first, followed by the relation feature columns and, per relation
 * type, a column with the id of the governor token.
 * <p>
 * The instance fields {@code tokenIds} and {@code tokenPositions} are rebuilt on every call
 * because {@code setRelationGovernorPos} reads them.
 * <p>
 * All text goes through a single {@link Writer}, so the charset is resolved once and encodings
 * that emit a byte order mark or keep encoder state (e.g. UTF-16) produce a well-formed stream.
 * The writer is flushed but not closed; closing {@code aOs} stays the caller's responsibility.
 *
 * @param aJCas
 *            the CAS to export.
 * @param aOs
 *            the stream receiving the TSV data.
 * @param aEncoding
 *            name of the charset used to encode the output.
 * @throws IOException
 *             if writing fails or the charset name is not supported.
 * @throws CASException
 *             if a helper cannot obtain the JCas view of the CAS.
 */
private void convertToTsv(JCas aJCas, OutputStream aOs, String aEncoding)
    throws IOException, ResourceInitializationException, CASRuntimeException, CASException
{
    LowLevelCAS llCas = aJCas.getLowLevelCas();
    CAS cas = aJCas.getCas();
    Writer out = new OutputStreamWriter(aOs, aEncoding);

    tokenIds = new HashMap<Integer, String>();
    setTokenId(aJCas, tokenIds);
    tokenPositions = new TreeMap<Integer, Integer>();
    setTokenPosition(aJCas, tokenPositions);

    // Types of all annotations present in the CAS, except the structural ones.
    Set<Type> presentTypes = new LinkedHashSet<Type>();
    for (Annotation a : select(aJCas, Annotation.class)) {
        if (!(a instanceof Token || a instanceof Sentence || a instanceof DocumentMetaData
                || a instanceof TagsetDescription || a instanceof CoreferenceLink)) {
            presentTypes.add(a.getType());
        }
    }

    // A type is a relation type if it declares a feature with the governor short name.
    Set<Type> relationTypes = new LinkedHashSet<Type>();
    for (Type type : presentTypes) {
        for (Feature feature : type.getFeatures()) {
            if (feature.getShortName().equals(GOVERNOR)) {
                relationTypes.add(type);
                break;
            }
        }
    }
    presentTypes.removeAll(relationTypes);

    // Span types that are actually exported: coreference chains are not supported.
    Set<Type> spanTypes = new LinkedHashSet<Type>();
    for (Type type : presentTypes) {
        if (type.getFeatures().size() != 0 && !isChainType(type)) {
            spanTypes.add(type);
        }
    }

    // Name of the type each relation type attaches to (reported in the header).
    Map<Type, String> attachTypes = new HashMap<Type, String>();
    for (Type type : relationTypes) {
        if (type.getName().equals(Dependency.class.getName())) {
            attachTypes.put(type, POS.class.getName());
            continue;
        }
        for (AnnotationFS anno : CasUtil.select(cas, type)) {
            for (Feature feature : type.getFeatures()) {
                if (feature.getShortName().equals(GOVERNOR)) {
                    attachTypes.put(type, anno.getFeatureValue(feature).getType().getName());
                }
            }
        }
    }

    // Span types: header entries plus the per-token values of every exported feature.
    // NOTE(review): both maps are keyed by Feature, so a feature reachable from two exported
    // types ends up with a single column of values - confirm this is intended.
    Map<Feature, Type> spanFeatures = new LinkedHashMap<Feature, Type>();
    Map<Feature, Map<Integer, String>> spanAnnos = new HashMap<Feature, Map<Integer, String>>();
    for (Type type : spanTypes) {
        out.write(" # " + type.getName());
        for (Feature feature : type.getFeatures()) {
            if (isBuiltInFeature(feature)) {
                continue;
            }
            spanFeatures.put(feature, type);
            out.write(" | " + feature.getShortName());
            Map<Integer, String> perToken = new TreeMap<Integer, String>();
            setTokenAnnos(cas, perToken, type, feature);
            spanAnnos.put(feature, perToken);
        }
    }

    // Relation types: header entries, per-token feature values (on the dependent side) and
    // the id of the governor token.
    Map<Feature, Map<Integer, String>> relAnnos = new HashMap<Feature, Map<Integer, String>>();
    Map<Type, Map<Integer, String>> governorAnnos = new HashMap<Type, Map<Integer, String>>();
    for (Type type : relationTypes) {
        out.write(" # " + type.getName());
        for (Feature feature : type.getFeatures()) {
            if (isBuiltInFeature(feature) || isRelationEndpoint(feature)) {
                continue;
            }
            out.write(" | " + feature.getShortName());
            Map<Integer, String> perToken = new HashMap<Integer, String>();
            setRelationFeatureAnnos(cas, perToken, type, feature);
            relAnnos.put(feature, perToken);
        }
        // Add the attach type for the relation annotation
        out.write(" | AttachTo=" + attachTypes.get(type));

        Map<Integer, String> governors = new HashMap<Integer, String>();
        setRelationGovernorPos(cas, governors, type);
        governorAnnos.put(type, governors);
    }
    out.write("\n");

    int sentId = 1;
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        out.write("#id=" + sentId++ + "\n");
        // The sentence text has to fit on one line, so line feeds are removed.
        out.write("#text=" + sentence.getCoveredText().replace("\n", "") + "\n");
        for (Token token : selectCovered(Token.class, sentence)) {
            int tokenRef = llCas.ll_getFSRef(token);
            StringBuilder line = new StringBuilder();
            line.append(tokenIds.get(tokenRef)).append("\t");
            line.append(token.getCoveredText()).append("\t");

            // all span annotations on this token
            for (Map.Entry<Feature, Type> column : spanFeatures.entrySet()) {
                String value = spanAnnos.get(column.getKey()).get(tokenRef);
                if (value == null) {
                    // Empty cell: "O" for types configured as multiple-span, "_" otherwise.
                    value = multipleSpans.contains(column.getValue().getName()) ? "O" : "_";
                }
                line.append(value).append("\t");
            }

            // all relation features, followed by the governor position of each relation type
            for (Type type : relationTypes) {
                for (Feature feature : type.getFeatures()) {
                    if (isBuiltInFeature(feature) || isRelationEndpoint(feature)) {
                        continue;
                    }
                    String value = relAnnos.get(feature).get(tokenRef);
                    line.append(value == null ? "_" : value).append("\t");
                }
                String govPos = governorAnnos.get(type).get(tokenRef);
                line.append(govPos == null ? "_" : govPos).append("\t");
            }
            line.append("\n");
            out.write(line.toString());
        }
        out.write("\n");
    }
    // Push everything into aOs; the stream itself is closed by the caller.
    out.flush();
}

/**
 * Tells whether the feature is one of the built-in sofa/begin/end features, which are never
 * exported as columns. The check uses the fully qualified feature name.
 */
private boolean isBuiltInFeature(Feature aFeature)
{
    String name = aFeature.getName();
    return "uima.cas.AnnotationBase:sofa".equals(name)
            || "uima.tcas.Annotation:begin".equals(name)
            || "uima.tcas.Annotation:end".equals(name);
}

/**
 * Tells whether the feature is the governor or dependent end point of a relation.
 */
private boolean isRelationEndpoint(Feature aFeature)
{
    String shortName = aFeature.getShortName();
    return shortName.equals(GOVERNOR) || shortName.equals(DEPENDENT);
}

/**
 * Tells whether the type declares a "first" or "next" feature, which marks it as part of a
 * coreference chain; such types are not exported.
 */
private boolean isChainType(Type aType)
{
    for (Feature feature : aType.getFeatures()) {
        if (feature.getShortName().equals(FIRST) || feature.getShortName().equals(NEXT)) {
            return true;
        }
    }
    return false;
}
/**
 * Fills the given map with one entry per token that is covered by a sentence: the key is the
 * low-level FS reference of the token, the value the low-level FS reference of the sentence.
 *
 * @param aJCas
 *            the CAS to read sentences and tokens from.
 * @param aTokenListInSentence
 *            the map to populate.
 */
private void setTokenSentenceAddress(JCas aJCas, Map<Integer, Integer> aTokenListInSentence)
{
    LowLevelCAS lowLevel = aJCas.getLowLevelCas();
    for (Sentence sent : select(aJCas, Sentence.class)) {
        for (Token tok : selectCovered(Token.class, sent)) {
            int tokenRef = lowLevel.ll_getFSRef(tok);
            int sentenceRef = lowLevel.ll_getFSRef(sent);
            aTokenListInSentence.put(tokenRef, sentenceRef);
        }
    }
}
/**
 * Assigns every token covered by a sentence an id of the form
 * {@code <sentence number>-<token number>}, both counted from 1, keyed by the low-level FS
 * reference of the token.
 *
 * @param aJCas
 *            the CAS to read sentences and tokens from.
 * @param aTokenAddress
 *            the map to populate.
 */
private void setTokenId(JCas aJCas, Map<Integer, String> aTokenAddress)
{
    LowLevelCAS lowLevel = aJCas.getLowLevelCas();
    int sentenceNo = 0;
    for (Sentence sent : select(aJCas, Sentence.class)) {
        sentenceNo++;
        int tokenNo = 0;
        for (Token tok : selectCovered(Token.class, sent)) {
            tokenNo++;
            int tokenRef = lowLevel.ll_getFSRef(tok);
            aTokenAddress.put(tokenRef, sentenceNo + "-" + tokenNo);
        }
    }
}
/**
 * Records, for every token in the CAS, its begin offset together with its low-level FS
 * reference.
 *
 * @param aJCas
 *            the CAS to read tokens from.
 * @param aTokenAddress
 *            the map to populate (begin offset to token reference).
 */
private void setTokenPosition(JCas aJCas, Map<Integer, Integer> aTokenAddress)
{
    LowLevelCAS lowLevel = aJCas.getLowLevelCas();
    for (Token tok : select(aJCas, Token.class)) {
        int beginOffset = tok.getBegin();
        int tokenRef = lowLevel.ll_getFSRef(tok);
        aTokenAddress.put(beginOffset, tokenRef);
    }
}
/**
 * Collects, per token, the values of one feature over all annotations of one span type.
 * <p>
 * A missing feature value is written as the type name followed by {@code _}. Values of several
 * annotations on the same token are joined with {@code |}. For types listed in
 * {@code multipleSpans} the first written token of an annotation gets a {@code B-} prefix and
 * the following ones {@code I-}; if an earlier token of the same annotation already carried
 * another value, a token without one gets the place-holder {@code O-_|} in front.
 *
 * @param aCas
 *            the CAS to read annotations from.
 * @param aTokenAnnoMap
 *            the map to populate (token FS reference to cell text).
 * @param aType
 *            the span type to export.
 * @param aFeature
 *            the feature of the type whose values are exported.
 */
private void setTokenAnnos(CAS aCas, Map<Integer, String> aTokenAnnoMap, Type aType,
        Feature aFeature)
{
    LowLevelCAS lowLevel = aCas.getLowLevelCAS();
    for (AnnotationFS span : CasUtil.select(aCas, aType)) {
        boolean atSpanStart = true;
        // set once a token of this span already held a value from another annotation
        boolean stackedEarlier = false;
        for (Token tok : selectCovered(Token.class, span)) {
            if (span.getBegin() > tok.getBegin() || span.getEnd() < tok.getEnd()) {
                continue;
            }
            String label = span.getFeatureValueAsString(aFeature);
            if (label == null) {
                label = aType.getName() + "_";
            }
            int tokenRef = lowLevel.ll_getFSRef(tok);
            String existing = aTokenAnnoMap.get(tokenRef);
            boolean bioEncoded = multipleSpans.contains(aType.getName());
            if (bioEncoded) {
                label = (atSpanStart ? "B-" : "I-") + label;
                atSpanStart = false;
            }

            String cell;
            if (existing != null) {
                cell = existing + "|" + label;
                stackedEarlier = true;
            }
            else if (bioEncoded && stackedEarlier) {
                cell = "O-_|" + label;
            }
            else {
                cell = label;
            }
            aTokenAnnoMap.put(tokenRef, cell);
        }
    }
}
/**
 * Collects, per token, the values of one feature over all relations of one type. The value is
 * recorded on the first token covered by the relation's dependent annotation; several values
 * on the same token are joined with {@code |}.
 * <p>
 * A missing feature value is written as the type name followed by {@code _}. If the relation
 * type is listed in {@code multipleSpans}, the value gets a {@code B-} prefix.
 *
 * @param aCas
 *            the CAS to read relations from.
 * @param aRelAnnoMap
 *            the map to populate (token FS reference to cell text).
 * @param aType
 *            the relation type to export.
 * @param aFeature
 *            the feature of the relation whose values are exported.
 * @throws CASException
 *             if the JCas view of the CAS cannot be obtained.
 */
private void setRelationFeatureAnnos(CAS aCas, Map<Integer, String> aRelAnnoMap, Type aType,
        Feature aFeature)
    throws CASRuntimeException, CASException
{
    LowLevelCAS lowLevel = aCas.getLowLevelCAS();

    Feature dependentFeature = null;
    for (Feature candidate : aType.getFeatures()) {
        if (candidate.getShortName().equals(DEPENDENT)) {
            dependentFeature = candidate;
        }
    }

    for (AnnotationFS relation : CasUtil.select(aCas, aType)) {
        // The relation runs from governor to dependent; the entry is made on the dependent.
        AnnotationFS dependentSpan = (AnnotationFS) relation.getFeatureValue(dependentFeature);
        List<Token> covered = selectCovered(aCas.getJCas(), Token.class,
                dependentSpan.getBegin(), dependentSpan.getEnd());
        if (covered.isEmpty()) {
            continue;
        }
        // If the dependent covers several tokens, only the first one receives the entry.
        Token head = covered.get(0);
        if (dependentSpan.getBegin() > head.getBegin()
                || dependentSpan.getEnd() < head.getEnd()) {
            continue;
        }

        String label = relation.getFeatureValueAsString(aFeature);
        if (label == null) {
            label = aType.getName() + "_";
        }
        int tokenRef = lowLevel.ll_getFSRef(head);
        String existing = aRelAnnoMap.get(tokenRef);
        if (multipleSpans.contains(aType.getName())) {
            label = "B-" + label;
        }
        aRelAnnoMap.put(tokenRef, existing == null ? label : existing + "|" + label);
    }
}
/**
 * Collects, per token, the ids of the governor tokens of all relations of one type. The id is
 * recorded on the first token covered by the relation's dependent annotation; several ids on
 * the same token are joined with {@code |}.
 * <p>
 * The governor token is looked up in {@code tokenPositions} as the token with the greatest
 * begin offset not after the begin of the governor annotation, and translated to its id via
 * {@code tokenIds}.
 *
 * @param aCas
 *            the CAS to read relations from.
 * @param aRelationGovernorMap
 *            the map to populate (token FS reference to governor token ids).
 * @param aType
 *            the relation type to export.
 * @throws CASException
 *             if the JCas view of the CAS cannot be obtained.
 */
private void setRelationGovernorPos(CAS aCas, Map<Integer, String> aRelationGovernorMap,
        Type aType)
    throws CASRuntimeException, CASException
{
    Feature governorFeature = null;
    Feature dependentFeature = null;
    for (Feature candidate : aType.getFeatures()) {
        if (candidate.getShortName().equals(GOVERNOR)) {
            governorFeature = candidate;
        }
        if (candidate.getShortName().equals(DEPENDENT)) {
            dependentFeature = candidate;
        }
    }

    LowLevelCAS lowLevel = aCas.getLowLevelCAS();
    for (AnnotationFS relation : CasUtil.select(aCas, aType)) {
        // The relation runs from governor to dependent; the entry is made on the dependent.
        AnnotationFS dependentSpan = (AnnotationFS) relation.getFeatureValue(dependentFeature);
        List<Token> covered = selectCovered(aCas.getJCas(), Token.class,
                dependentSpan.getBegin(), dependentSpan.getEnd());
        if (covered.isEmpty()) {
            continue;
        }
        // If the dependent covers several tokens, only the first one receives the entry.
        Token head = covered.get(0);
        if (dependentSpan.getBegin() > head.getBegin()
                || dependentSpan.getEnd() < head.getEnd()) {
            continue;
        }

        int tokenRef = lowLevel.ll_getFSRef(head);
        String existing = aRelationGovernorMap.get(tokenRef);
        AnnotationFS governorSpan = (AnnotationFS) relation.getFeatureValue(governorFeature);
        String governorId = tokenIds
                .get(tokenPositions.floorEntry(governorSpan.getBegin()).getValue());
        if (existing == null) {
            aRelationGovernorMap.put(tokenRef, governorId);
        }
        else {
            aRelationGovernorMap.put(tokenRef, existing + "|" + governorId);
        }
    }
}
}