/*
* Copyright 2014
* Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.tsv;
import static org.apache.commons.lang3.StringEscapeUtils.escapeJava;
import static org.apache.uima.fit.util.CasUtil.getType;
import static org.apache.uima.fit.util.CasUtil.selectFS;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.ArrayFS;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.jcas.JCas;
import de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
/**
* Exports annotations in a TAB-separated format. The header includes information about
* the UIMA types and their features. The number of columns depends on the number of
* types/features that exist. All the spans are written first and subsequently
* all the relations. A relation is given in the form Source-->Target and
* the RelationType is added to the Target token. The next column indicates the
* source of the relation (the source of the arc drawn).
*/
public class WebannoTsv3Writer
extends JCasFileWriter_ImplBase
{
/**
* Name of the configuration parameter that contains the character encoding used
* when writing the output (it is passed to every {@code IOUtils.write} call).
*/
public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
@ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
private String encoding;
// Suffix handed to getOutputStream(...) when the output for a CAS is opened.
public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix";
@ConfigurationParameter(name = PARAM_FILENAME_SUFFIX, mandatory = true, defaultValue = ".tsv")
private String filenameSuffix;
// Type names of the span layers to export.
public static final String PARAM_SPAN_LAYERS = "spanLayers";
@ConfigurationParameter(name = PARAM_SPAN_LAYERS, mandatory = true, defaultValue = {})
private List<String> spanLayers;
// Names of the features treated as slot features (compared against Feature.getName()).
public static final String PARAM_SLOT_FEATS = "slotFeatures";
@ConfigurationParameter(name = PARAM_SLOT_FEATS, mandatory = true, defaultValue = {})
private List<String> slotFeatures;
// Link type names, paired by index with slotFeatures (see setSlotLinkTypes).
public static final String PARAM_LINK_TYPES = "linkTypes";
@ConfigurationParameter(name = PARAM_LINK_TYPES, mandatory = true, defaultValue = {})
private List<String> linkTypes;
// Target type names of the slot features, consumed by index in setSpanAnnotation.
public static final String PARAM_SLOT_TARGETS = "slotTargets";
@ConfigurationParameter(name = PARAM_SLOT_TARGETS, mandatory = true, defaultValue = {})
private List<String> slotTargets;
// Base names of the chain layers; the "Chain" and "Link" suffixes are appended to obtain type names.
public static final String PARAM_CHAIN_LAYERS = "chainLayers";
@ConfigurationParameter(name = PARAM_CHAIN_LAYERS, mandatory = true, defaultValue = {})
private List<String> chainLayers;
// Type names of the relation layers to export.
public static final String PARAM_RELATION_LAYERS = "relationLayers";
@ConfigurationParameter(name = PARAM_RELATION_LAYERS, mandatory = true, defaultValue = {})
private List<String> relationLayers;
// Column and line separators of the output.
private static final String TAB = "\t";
private static final String LF = "\n";
// Short names of the two end-point features of a relation annotation.
private static final String DEPENDENT = "Dependent";
private static final String GOVERNOR = "Governor";
// Short name of the chain link feature whose value is written as "value->chainNo-linkNo".
private static final String REF_REL = "referenceRelation";
// Suffixes appended to a chain layer name to get the chain and the link type names.
private static final String CHAIN = "Chain";
private static final String LINK = "Link";
// Base names of the features pointing to the first link of a chain and to the next link.
private static final String FIRST = "first";
private static final String NEXT = "next";
// Header prefixes written as "#<prefix>=<layer>|<features>".
public static final String SP = "T_SP"; // span annotation type
public static final String CH = "T_CH"; // chain annotation type
public static final String RL = "T_RL"; // relation annotation type
// Prefix of the header column that holds the roles of a slot feature.
public static final String ROLE = "ROLE_";
// Prefix of the header column naming the base type a relation layer is attached to.
public static final String BT = "BT_";
// All annotation units: one per token, plus the sub-token units inserted by updateUnitLists.
private List<AnnotationUnit> units = new ArrayList<>();
// number of subunits under this Annotation Unit
private Map<AnnotationUnit, Integer> subUnits = new HashMap<>();
// Layer name -> ordered set of column (feature) names; drives the header and the column order.
private Map<String, Set<String>> featurePerLayer = new LinkedHashMap<>();
// Unit -> address "sentenceNo-tokenNo"; sub-units get an additional ".n" suffix.
private Map<AnnotationUnit, String> unitsLineNumber = new HashMap<>();
// First unit of each sentence -> covered text of that sentence.
private Map<AnnotationUnit, String> sentenceUnits = new HashMap<>();
// Layer name -> unit -> one list of column values per annotation on that unit.
private Map<String, Map<AnnotationUnit, List<List<String>>>> annotationsPerPostion = new HashMap<>();
// Slot feature -> type resolved from the matching slotTargets entry.
private Map<Feature, Type> slotFeatureTypes = new HashMap<>();
// Type -> annotation -> reference id assigned by getRefId.
private Map<Type,Map<FeatureStructure, Integer>> annotaionRefPerType = new HashMap<>();
// Type name -> unit -> flag set to true for multi-token and stacked annotations (see setAmbiguity).
private Map<String, Map<AnnotationUnit, Boolean>> ambigUnits = new HashMap<>();
// Type -> unit -> annotation -> reference id; written by getRefId, not read anywhere in this file.
private Map<Type, Map<AnnotationUnit, Map<FeatureStructure, Integer>>> multiAnnosPerUnit = new HashMap<>();
// Slot feature name -> link type name.
private Map<String, String> slotLinkTypes = new HashMap<>();
// Type -> 1-based number in the order the layers are registered by setLinkMaps.
private Map<Type, Integer> layerMaps = new LinkedHashMap<>();
/**
 * Writes the annotations of one CAS in WebAnno TSV 3.1 format to the output stream
 * obtained for that CAS.
 *
 * @param aJCas
 *            the document to export
 * @throws AnalysisEngineProcessException
 *             wrapping any exception raised while collecting or writing the annotations
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // process() is invoked once per CAS on the same component instance, so all
    // per-document bookkeeping must start empty; otherwise the units and annotations
    // of previously written documents would show up in this one.
    resetVariables();
    try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) {
        setSlotLinkTypes();
        setLinkMaps(aJCas);
        setTokenSentenceAddress(aJCas);
        setAmbiguity(aJCas);
        setSpanAnnotation(aJCas);
        setChainAnnotation(aJCas);
        setRelationAnnotation(aJCas);
        writeHeader(docOS);
        for (AnnotationUnit unit : units) {
            if (sentenceUnits.containsKey(unit)) {
                writeSentenceText(docOS, sentenceUnits.get(unit));
            }
            IOUtils.write(
                    unitsLineNumber.get(unit) + TAB + unit.begin + "-" + unit.end + TAB + unit.token + TAB,
                    docOS, encoding);
            for (Map.Entry<String, Set<String>> layer : featurePerLayer.entrySet()) {
                writeLayerColumns(docOS, unit, layer.getKey(), layer.getValue().size());
            }
            IOUtils.write(LF, docOS, encoding);
        }
    }
    catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}

/** Clears every per-document collection so that a new CAS starts from a clean state. */
private void resetVariables() {
    units.clear();
    subUnits.clear();
    featurePerLayer.clear();
    unitsLineNumber.clear();
    sentenceUnits.clear();
    annotationsPerPostion.clear();
    slotFeatureTypes.clear();
    annotaionRefPerType.clear();
    ambigUnits.clear();
    multiAnnosPerUnit.clear();
    slotLinkTypes.clear();
    layerMaps.clear();
}

/**
 * Writes the "#Text=" line(s) that precede the first unit of a sentence. A sentence containing
 * line breaks produces one "#Text=" line per text line (GITHUB ISSUE 318).
 */
private void writeSentenceText(OutputStream docOS, String sentenceText) throws IOException {
    String[] lines = sentenceText.split("\n");
    // split() yields an empty array for a text made only of line breaks
    String firstLine = lines.length > 0 ? lines[0] : "";
    IOUtils.write(LF + "#Text=" + escapeJava(firstLine) + LF, docOS, encoding);
    for (int i = 1; i < lines.length; i++) {
        IOUtils.write("#Text=" + escapeJava(lines[i]) + LF, docOS, encoding);
    }
}

/**
 * Writes the columns of one layer for one unit: the values of all annotations on the unit
 * joined per column with "|", or "_" placeholders when the unit carries no annotation of
 * that layer.
 */
private void writeLayerColumns(OutputStream docOS, AnnotationUnit unit, String layer, int featureCount)
        throws IOException {
    Map<AnnotationUnit, List<List<String>>> perUnit = annotationsPerPostion.get(layer);
    List<List<String>> annos = perUnit == null ? null : perUnit.get(unit);
    if (annos == null || annos.isEmpty()) {
        // a layer without any feature still occupies one column
        int placeholders = Math.max(1, featureCount);
        for (int i = 0; i < placeholders; i++) {
            IOUtils.write("_" + TAB, docOS, encoding);
        }
        return;
    }
    // merge into a copy so that the collected annotation values stay untouched
    List<String> merged = new ArrayList<>(annos.get(0));
    for (int a = 1; a < annos.size(); a++) {
        List<String> stacked = annos.get(a);
        for (int i = 0; i < stacked.size(); i++) {
            merged.set(i, merged.get(i) + "|" + stacked.get(i));
        }
    }
    for (String value : merged) {
        IOUtils.write(value + TAB, docOS, encoding);
    }
}
/**
 * Pairs every configured slot feature with the link type configured at the same index and
 * stores the pairs in {@code slotLinkTypes}.
 *
 * @throws IllegalStateException
 *             if fewer link types than slot features are configured
 */
private void setSlotLinkTypes() {
    // the other methods tolerate an absent slot feature list, so do the same here
    if (slotFeatures == null || slotFeatures.isEmpty()) {
        return;
    }
    int available = linkTypes == null ? 0 : linkTypes.size();
    if (available < slotFeatures.size()) {
        throw new IllegalStateException("Parameter [" + PARAM_LINK_TYPES + "] provides " + available
                + " link type(s) but parameter [" + PARAM_SLOT_FEATS + "] lists " + slotFeatures.size()
                + " slot feature(s)");
    }
    for (int i = 0; i < slotFeatures.size(); i++) {
        slotLinkTypes.put(slotFeatures.get(i), linkTypes.get(i));
    }
}
/**
 * Numbers the exported layers consecutively, starting at one: first the span layers (the
 * Token type is left out), then the link types of the chain layers, then the relation layers.
 */
private void setLinkMaps(JCas aJCas) {
    for (String spanLayer : spanLayers) {
        if (!spanLayer.equals(Token.class.getName())) {
            Type spanType = getType(aJCas.getCas(), spanLayer);
            layerMaps.put(spanType, layerMaps.size() + 1);
        }
    }
    for (String chainLayer : chainLayers) {
        Type linkType = getType(aJCas.getCas(), chainLayer + LINK);
        layerMaps.put(linkType, layerMaps.size() + 1);
    }
    for (String relationLayer : relationLayers) {
        Type relationType = getType(aJCas.getCas(), relationLayer);
        layerMaps.put(relationType, layerMaps.size() + 1);
    }
}
/**
 * Writes the format line followed by one header line per layer of the form<br>
 * {@code #<kind>=<layer>|<feature>|<feature>...}<br>
 * and a terminating empty line. The kind is the span, relation or chain marker.
 */
private void writeHeader(OutputStream docOS) throws IOException {
    IOUtils.write("#FORMAT=WebAnno TSV 3.1" + LF, docOS, encoding);
    for (Map.Entry<String, Set<String>> layer : featurePerLayer.entrySet()) {
        String layerName = layer.getKey();
        // anything that is neither a span nor a relation layer is reported as a chain
        String kind = spanLayers.contains(layerName) ? SP
                : (relationLayers.contains(layerName) ? RL : CH);
        IOUtils.write("#" + kind + "=" + layerName + "|", docOS, encoding);
        StringBuilder columns = new StringBuilder();
        for (String featureName : layer.getValue()) {
            if (columns.length() > 0) {
                columns.append("|");
            }
            columns.append(featureName);
        }
        IOUtils.write(columns.toString() + LF, docOS, encoding);
    }
    IOUtils.write(LF, docOS, encoding);
}
/**
 * Records for every span layer and for the Token type which units need a disambiguation id:
 * all units of an annotation covering several tokens, and every unit on which a second
 * annotation of the same type starts (stacking). Units seen only once are stored with
 * {@code false}.
 */
private void setAmbiguity(JCas aJCas) {
    // Work on a copy: appending the Token type to the configured list itself would alter the
    // component parameter, growing it with every processed CAS.
    List<String> spanAndTokenLayers = new ArrayList<>(spanLayers);
    if (!spanAndTokenLayers.contains(Token.class.getName())) {
        spanAndTokenLayers.add(Token.class.getName());
    }
    for (String l : spanAndTokenLayers) {
        Type type = getType(aJCas.getCas(), l);
        ambigUnits.putIfAbsent(type.getName(), new HashMap<>());
        Map<AnnotationUnit, Boolean> layerAmbiguity = ambigUnits.get(type.getName());
        for (AnnotationFS fs : CasUtil.select(aJCas.getCas(), type)) {
            AnnotationUnit unit = getFirstUnit(fs);
            // multiple token anno
            if (isMultipleTokenAnnotation(fs.getBegin(), fs.getEnd())) {
                SubTokenAnno sta = new SubTokenAnno();
                sta.setBegin(fs.getBegin());
                sta.setEnd(fs.getEnd());
                sta.setText(fs.getCoveredText());
                Set<AnnotationUnit> sus = new LinkedHashSet<>();
                for (AnnotationUnit newUnit : getSubUnits(sta, sus)) {
                    layerAmbiguity.put(newUnit, true);
                }
            }
            // stacked anno: the unit has been seen before for this type
            else if (layerAmbiguity.get(unit) != null) {
                layerAmbiguity.put(unit, true);
            }
            // single or first occurrence of stacked anno
            else {
                layerAmbiguity.put(unit, false);
            }
        }
    }
}
/**
 * Collects the column values of all span layers. A first pass resolves the target type of
 * every slot feature (the entries of {@code slotTargets} are consumed in the order the slot
 * features are met). A second pass visits the annotations of each span layer except Token and
 * records their values, splitting annotations that do not match a known unit into sub-units.
 */
private void setSpanAnnotation(JCas aJCas) {
    // store slot targets for each slot feature
    int targetIndex = 0;
    for (String layerName : spanLayers) {
        Type layerType = getType(aJCas.getCas(), layerName);
        for (Feature candidate : layerType.getFeatures()) {
            if (slotFeatures == null || !slotFeatures.contains(candidate.getName())) {
                continue;
            }
            slotFeatureTypes.put(candidate, getType(aJCas.getCas(), slotTargets.get(targetIndex)));
            targetIndex++;
        }
    }
    for (String layerName : spanLayers) {
        if (layerName.equals(Token.class.getName())) {
            continue;
        }
        Map<AnnotationUnit, List<List<String>>> valuesPerUnit = annotationsPerPostion.get(layerName);
        if (valuesPerUnit == null) {
            valuesPerUnit = new HashMap<>();
        }
        Type layerType = getType(aJCas.getCas(), layerName);
        for (AnnotationFS anno : CasUtil.select(aJCas.getCas(), layerType)) {
            AnnotationUnit exactUnit = new AnnotationUnit(anno.getBegin(), anno.getEnd(), false,
                    anno.getCoveredText());
            if (units.contains(exactUnit)) {
                // the annotation sits exactly on a known unit
                setSpanAnnoPerFeature(valuesPerUnit, layerType, anno, exactUnit, false, false);
                continue;
            }
            // the annotation is on a sub-token or on multiple tokens
            SubTokenAnno span = new SubTokenAnno();
            span.setBegin(anno.getBegin());
            span.setEnd(anno.getEnd());
            span.setText(anno.getCoveredText());
            boolean coversSeveralTokens = isMultiToken(anno);
            boolean firstPart = true;
            Set<AnnotationUnit> parts = new LinkedHashSet<>();
            for (AnnotationUnit part : getSubUnits(span, parts)) {
                setSpanAnnoPerFeature(valuesPerUnit, layerType, anno, part, coversSeveralTokens, firstPart);
                firstPart = false;
            }
        }
        if (!valuesPerUnit.isEmpty()) {
            annotationsPerPostion.put(layerName, valuesPerUnit);
        }
    }
}
/**
 * Collects the column values of all chain layers. For every chain the links are followed
 * from the first one through their "next" feature; chains are numbered from one per layer
 * and links from one per chain. The values are stored under the name of the link type.
 */
private void setChainAnnotation(JCas aJCas) {
    for (String chainLayer : chainLayers) {
        if (chainLayer.equals(Token.class.getName())) {
            continue;
        }
        Type chainType = getType(aJCas.getCas(), chainLayer + CHAIN);
        Feature chainFirst = chainType.getFeatureByBaseName(FIRST);
        int chainNo = 1;
        for (FeatureStructure chainFs : selectFS(aJCas.getCas(), chainType)) {
            AnnotationFS linkFs = (AnnotationFS) chainFs.getFeatureValue(chainFirst);
            if (linkFs == null) {
                // a chain without any link has nothing to export; it does not use up a number
                continue;
            }
            Type linkType = linkFs.getType();
            // this is the layer with annotations
            String linkLayer = linkType.getName();
            Map<AnnotationUnit, List<List<String>>> annotationsPertype = annotationsPerPostion.get(linkLayer);
            if (annotationsPertype == null) {
                annotationsPertype = new HashMap<>();
            }
            Feature linkNext = linkType.getFeatureByBaseName(NEXT);
            int linkNo = 1;
            while (linkFs != null) {
                AnnotationUnit unit = getUnit(linkFs.getBegin(), linkFs.getEnd(), linkFs.getCoveredText());
                addChinFeatureAnno(annotationsPertype, linkType, linkFs, unit, linkNo, chainNo);
                linkFs = (AnnotationFS) linkFs.getFeatureValue(linkNext);
                linkNo++;
            }
            if (!annotationsPertype.isEmpty()) {
                annotationsPerPostion.put(linkLayer, annotationsPertype);
            }
            chainNo++;
        }
    }
}
/**
 * Collects the column values of all relation layers. For every relation annotation the units
 * of its governor and dependent are determined, reference ids are looked up for end points
 * marked as ambiguous, and the values are stored on the dependent unit through
 * setRelationAnnoPerFeature.
 */
private void setRelationAnnotation(JCas aJCas) {
for (String l : relationLayers) {
if (l.equals(Token.class.getName())) {
continue;
}
// continue with the values already collected for this layer name, if any
Map<AnnotationUnit, List<List<String>>> annotationsPertype;
if (annotationsPerPostion.get(l) == null) {
annotationsPertype = new HashMap<>();
} else {
annotationsPertype = annotationsPerPostion.get(l);
}
Type type = getType(aJCas.getCas(), l);
// locate the two end-point features by their short names
Feature dependentFeature = null;
Feature governorFeature = null;
for (Feature feature : type.getFeatures()) {
if (feature.getShortName().equals(DEPENDENT)) {
dependentFeature = feature;
}
if (feature.getShortName().equals(GOVERNOR)) {
governorFeature = feature;
}
}
for (AnnotationFS fs : CasUtil.select(aJCas.getCas(), type)) {
AnnotationFS depFs = (AnnotationFS) fs.getFeatureValue(dependentFeature);
AnnotationFS govFs = (AnnotationFS) fs.getFeatureValue(governorFeature);
Type govType = govFs.getType();
// prefer the first unit of the governor span; fall back to the unit matching the whole
// span when no ambiguity information exists for that first unit
AnnotationUnit govUnit = getFirstUnit(
getUnit(govFs.getBegin(), govFs.getEnd(), govFs.getCoveredText()));
if(ambigUnits.get(govType.getName()).get(govUnit)==null){
govUnit = getUnit(govFs.getBegin(), govFs.getEnd(), govFs.getCoveredText());
}
// same for the dependent
// NOTE(review): the lookup uses the governor's type for the dependent as well; presumably
// both end points always share a type - confirm
AnnotationUnit depUnit = getFirstUnit(
getUnit(depFs.getBegin(), depFs.getEnd(), depFs.getCoveredText()));
if(ambigUnits.get(govType.getName()).get(depUnit)==null){
depUnit = getUnit(depFs.getBegin(), depFs.getEnd(), depFs.getCoveredText());
}
// Since de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency is drawn
// over POS annotations, which are themselves attached to Tokens, we need the POS type here
if (type.getName().equals(Dependency.class.getName())){
govType = aJCas.getCas().getTypeSystem().getType(POS.class.getName());
}
int govRef = 0;
int depRef = 0;
// The Token check is needed only for the unit test case where annotations are on Tokens.
// WebAnno itself never processes Token as an annotation layer
if(!govType.getName().equals(Token.class.getName()) && ambigUnits.get(govType.getName()).get(govUnit).equals(true)){
govRef = annotaionRefPerType.get(govType).get(govFs);
}
if(!govType.getName().equals(Token.class.getName()) && ambigUnits.get(govType.getName()).get(depUnit).equals(true)){
depRef = annotaionRefPerType.get(govType).get(depFs);
}
setRelationAnnoPerFeature(annotationsPertype, type, fs, depUnit, govUnit, govRef,
depRef, govType);
}
// register the layer only when at least one value was collected
if (annotationsPertype.keySet().size() > 0) {
annotationsPerPostion.put(l, annotationsPertype);
}
}
}
/**
 * Tells whether some known unit contains the begin of the annotation but ends before the
 * annotation does, i.e. whether the annotation continues past the unit it starts in.
 */
private boolean isMultiToken(AnnotationFS aFs) {
    return units.stream().anyMatch(candidate -> candidate.begin <= aFs.getBegin()
            && candidate.end > aFs.getBegin()
            && candidate.end < aFs.getEnd());
}
/**
 * Returns the first known unit with exactly the given offsets, or a new unit (not added to
 * the unit list) built from the arguments when there is none.
 */
private AnnotationUnit getUnit(int aBegin, int aEnd, String aText) {
    return units.stream()
            .filter(known -> known.begin == aBegin && known.end == aEnd)
            .findFirst()
            .orElseGet(() -> new AnnotationUnit(aBegin, aEnd, false, aText));
}
/**
 * Splits the span described by {@code aSTA} into annotation units aligned with the known
 * units and collects them in {@code aSubUnits}.
 * <p>
 * Units that are not known yet are registered through {@code updateUnitLists} in a working
 * copy of the unit list; that copy replaces the {@code units} field before the method
 * returns. While a span crossing several units is consumed, the begin and text of
 * {@code aSTA} are advanced and the method calls itself for the remainder.
 *
 * @param aSTA
 *            the span to split; modified by this method as described above
 * @param aSubUnits
 *            accumulator the created units are added to
 * @return {@code aSubUnits}
 */
private Set<AnnotationUnit> getSubUnits(SubTokenAnno aSTA, Set<AnnotationUnit> aSubUnits) {
AnnotationUnit prevUnit = null;
// working copy; the loops below iterate the list held by the field at this moment, while
// insertions go to the copy
List<AnnotationUnit> tmpUnits = new ArrayList<>(units);
// zero-width span: create an empty unit and register it after the first unit that begins
// and ends at or after that position
if(aSTA.getBegin() == aSTA.getEnd()){
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, "");
for(AnnotationUnit unit: units){
if(unit.begin>=newUnit.begin && unit.end>=newUnit.end){
updateUnitLists(tmpUnits, unit, newUnit);
aSubUnits.add(newUnit);
units = new ArrayList<>(tmpUnits);
return aSubUnits;
}
}
}
for (AnnotationUnit unit : units) {
// a unit ending beyond the span stops the scan; a sub-unit is created only when that
// unit starts exactly where the span starts
if (unit.end > aSTA.end) {
if(unit.begin==aSTA.begin){
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, aSTA.getText());
updateUnitLists(tmpUnits, unit, newUnit);
aSubUnits.add(newUnit);
}
break;
}
// this is a sub-token annotation: the span begins inside the unit and does not end after it
if (unit.begin <= aSTA.getBegin() && aSTA.getBegin() <= unit.end && aSTA.getEnd() <= unit.end) {
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), aSTA.getEnd(), false, aSTA.getText());
updateUnitLists(tmpUnits, unit, newUnit);
aSubUnits.add(newUnit);
}
// if sub-token annotation crosses multiple tokens: cut off the part up to the end of this
// unit, move the span begin to the next non-sub-token unit and recurse for the rest
else if ((unit.begin <= aSTA.getBegin() && aSTA.getBegin() < unit.end && aSTA.getEnd() > unit.end)) {
int thisSubTextLen = unit.end - aSTA.begin;
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), unit.end, false,
aSTA.getText().substring(0, thisSubTextLen));
aSubUnits.add(newUnit);
updateUnitLists(tmpUnits, unit, newUnit);
aSTA.setBegin(getNextUnitBegin(aSTA.getBegin()));
aSTA.setText(aSTA.getText().trim().substring(thisSubTextLen));
getSubUnits(aSTA, aSubUnits);
}
// empty annotation between tokens: the span begins at or before this unit and there is a gap
// between the previous unit and this one; the part up to this unit's begin is registered
// after the previous unit, then the rest is handled recursively
else if(aSTA.getBegin()<=unit.begin && prevUnit !=null && prevUnit.end<unit.begin){
int thisSubTextLen = unit.begin - aSTA.begin;
AnnotationUnit newUnit = new AnnotationUnit(aSTA.getBegin(), unit.begin, false,
aSTA.getText().substring(0, thisSubTextLen));
aSubUnits.add(newUnit);
updateUnitLists(tmpUnits, prevUnit, newUnit);
aSTA.setBegin(unit.begin);
aSTA.setText(aSTA.getText().trim().substring(thisSubTextLen));
getSubUnits(aSTA, aSubUnits);
}
else{
prevUnit = unit;
}
}
// NOTE(review): a recursive call above assigns its own copy to the field first; this
// assignment then replaces it with the copy of this invocation - confirm that no unit
// registered by the recursion is meant to survive in the list
units = new ArrayList<>(tmpUnits);
return aSubUnits;
}
/**
 * Returns the begin offset of the first non-sub-token unit in the list that starts after the
 * given offset, or the given offset itself when there is no such unit.
 */
private int getNextUnitBegin(int aSTABegin) {
    for (int i = 0; i < units.size(); i++) {
        AnnotationUnit candidate = units.get(i);
        if (candidate.isSubtoken || candidate.begin <= aSTABegin) {
            continue;
        }
        return candidate.begin;
    }
    // this is the last token
    return aSTABegin;
}
/**
 * Tells whether a non-sub-token unit begins strictly inside the given span, which makes the
 * span a multiple-token (or token-crossing) annotation.
 */
private boolean isMultipleTokenAnnotation(int aBegin, int aEnd) {
    for (AnnotationUnit candidate : units) {
        if (candidate.isSubtoken) {
            continue;
        }
        boolean beginsInside = candidate.begin > aBegin && candidate.begin < aEnd;
        if (beginsInside) {
            return true;
        }
    }
    // no unit starts within the span
    return false;
}
/**
 * Registers {@code newUnit} as a sub-unit of {@code unit} unless the list already contains it:
 * the new unit is flagged as sub-token, inserted right after {@code unit}, counted, and given
 * the address of {@code unit} extended by "." and the running sub-unit number.
 */
private void updateUnitLists(List<AnnotationUnit> tmpUnits, AnnotationUnit unit, AnnotationUnit newUnit) {
    if (tmpUnits.contains(newUnit)) {
        return;
    }
    newUnit.isSubtoken = true;
    // checked again after flagging: is this sub-token already there
    if (tmpUnits.contains(newUnit)) {
        return;
    }
    int insertAt = tmpUnits.indexOf(unit) + 1;
    tmpUnits.add(insertAt, newUnit);
    int ordinal = subUnits.getOrDefault(unit, 0) + 1;
    subUnits.put(unit, ordinal);
    unitsLineNumber.put(newUnit, unitsLineNumber.get(unit) + "." + ordinal);
}
/**
 * Builds the column values of one span annotation for one unit and appends them to the values
 * collected for that unit. Slot features contribute two columns (roles and targets, entries
 * separated by ";"); every other feature contributes its escaped value, followed by the
 * reference id in brackets when the unit is ambiguous. Column names are registered in
 * {@code featurePerLayer} on the way.
 *
 * @param aAnnotationsPertype
 *            the values collected so far for the layer, per unit
 * @param aType
 *            the type of the layer
 * @param aFs
 *            the annotation
 * @param aUnit
 *            the unit the values are written for
 * @param aIsMultiToken
 *            whether the annotation covers several tokens
 * @param aIsFirst
 *            whether {@code aUnit} is the first unit of the annotation
 */
private void setSpanAnnoPerFeature(Map<AnnotationUnit, List<List<String>>> aAnnotationsPertype, Type aType,
        AnnotationFS aFs, AnnotationUnit aUnit, boolean aIsMultiToken, boolean aIsFirst) {
    List<String> annoPerFeatures = new ArrayList<>();
    featurePerLayer.putIfAbsent(aType.getName(), new LinkedHashSet<>());
    int ref = getRefId(aType, aFs, aUnit);
    Map<AnnotationUnit, Boolean> layerAmbiguity = ambigUnits.get(aType.getName());
    // getFirstUnit may register sub-units, so it is invoked exactly as often as before
    // instead of caching its result. Unknown (null) ambiguity leaves the reference id in place.
    if (layerAmbiguity.get(getFirstUnit(aUnit)) != null
            && Boolean.FALSE.equals(layerAmbiguity.get(getFirstUnit(aUnit)))) {
        ref = 0;
    }
    if (layerAmbiguity.get(getFirstUnit(aUnit)) == null
            && Boolean.FALSE.equals(layerAmbiguity.get(aUnit))) {
        ref = 0;
    }
    for (Feature feature : aType.getFeatures()) {
        if (feature.toString().equals("uima.cas.AnnotationBase:sofa")
                || feature.toString().equals("uima.tcas.Annotation:begin")
                || feature.toString().equals("uima.tcas.Annotation:end")
                || feature.getShortName().equals(GOVERNOR)
                || feature.getShortName().equals(DEPENDENT)
                || feature.getShortName().equals(FIRST)
                || feature.getShortName().equals(NEXT)) {
            continue;
        }
        // if slot feature
        if (slotFeatures != null && slotFeatures.contains(feature.getName())) {
            FeatureStructure slotValue = aFs.getFeatureValue(feature);
            if (slotValue != null) {
                StringBuilder sbRole = new StringBuilder();
                StringBuilder sbTarget = new StringBuilder();
                for (FeatureStructure linkFS : ((ArrayFS) slotValue).toArray()) {
                    String role = linkFS.getStringValue(linkFS.getType().getFeatureByBaseName("role"));
                    AnnotationFS targetFs = (AnnotationFS) linkFS
                            .getFeatureValue(linkFS.getType().getFeatureByBaseName("target"));
                    Type tType = targetFs.getType();
                    AnnotationUnit firstUnit = getFirstUnit(targetFs);
                    // The target gets its own variable: reusing "ref" here would replace the
                    // id of this annotation for every feature written after the slot feature.
                    int targetRef = getRefId(tType, targetFs, firstUnit);
                    // Check if the target is ambiguous or not
                    if (Boolean.FALSE.equals(ambigUnits.get(tType.getName()).get(firstUnit))) {
                        targetRef = 0;
                    }
                    // Escape special characters
                    role = role == null ? "*" : replaceEscapeChars(role);
                    // The target text is never empty, so its length tells reliably whether an
                    // entry precedes this one (a role may well be the empty string).
                    if (sbTarget.length() > 0) {
                        sbRole.append(";");
                        sbTarget.append(";");
                    }
                    sbRole.append(role);
                    // record the actual target type column number if slot target is
                    // uima.tcas.Annotation
                    int targetTypeNumber = 0;
                    if (slotFeatureTypes.get(feature).getName().equals(CAS.TYPE_NAME_ANNOTATION)) {
                        targetTypeNumber = layerMaps.get(tType);
                    }
                    sbTarget.append(unitsLineNumber.get(firstUnit)
                            + (targetTypeNumber == 0 ? "" : "-" + targetTypeNumber)
                            + (targetRef > 0 ? "[" + targetRef + "]" : ""));
                }
                annoPerFeatures.add(sbRole.toString().isEmpty() ? "_" : sbRole.toString());
                annoPerFeatures.add(sbTarget.toString().isEmpty() ? "_" : sbTarget.toString());
            } else {
                // no value set for the slot feature
                annoPerFeatures.add("_");
                annoPerFeatures.add("_");
            }
            featurePerLayer.get(aType.getName())
                    .add(ROLE + feature.getName() + "_" + slotLinkTypes.get(feature.getName()));
            featurePerLayer.get(aType.getName()).add(slotFeatureTypes.get(feature).getName());
        } else {
            String annotation = aFs.getFeatureValueAsString(feature);
            // Escape special characters
            annotation = annotation == null ? "*" : replaceEscapeChars(annotation);
            annotation = annotation + (ref > 0 ? "[" + ref + "]" : "");
            setAnnoFeature(aIsMultiToken, aIsFirst, annoPerFeatures, annotation);
            featurePerLayer.get(aType.getName()).add(feature.getShortName());
        }
    }
    aAnnotationsPertype.putIfAbsent(aUnit, new ArrayList<>());
    // If the layer does not have a feature at all, add a dummy * as a place holder
    if (annoPerFeatures.size() == 0) {
        setAnnoFeature(aIsMultiToken, aIsFirst, annoPerFeatures, "*" + (ref > 0 ? "[" + ref + "]" : ""));
    }
    aAnnotationsPertype.get(aUnit).add(annoPerFeatures);
}
/**
 * Records the values of one chain link, either directly on its unit when that unit is known,
 * or on each of the sub-units the link is split into.
 *
 * @param aAnnotationsPertype
 *            the values collected so far for the link layer, per unit
 * @param aType
 *            the type of the chain link
 * @param aFs
 *            the chain link
 * @param aUnit
 *            the unit matching the offsets of the link
 * @param aLinkNo
 *            position of the link within its chain, starting at one
 * @param achainNo
 *            number of the chain, starting at one
 */
private void addChinFeatureAnno(Map<AnnotationUnit, List<List<String>>> aAnnotationsPertype, Type aType,
        AnnotationFS aFs, AnnotationUnit aUnit, int aLinkNo, int achainNo) {
    featurePerLayer.putIfAbsent(aType.getName(), new LinkedHashSet<>());
    if (units.contains(aUnit)) {
        // the link sits exactly on a known unit
        setChainAnnoPerFeature(aAnnotationsPertype, aType, aFs, aUnit, aLinkNo, achainNo, false, false);
        return;
    }
    // the link is on a sub-token or on multiple tokens
    SubTokenAnno span = new SubTokenAnno();
    span.setBegin(aFs.getBegin());
    span.setEnd(aFs.getEnd());
    span.setText(aFs.getCoveredText());
    boolean coversSeveralTokens = isMultiToken(aFs);
    boolean firstPart = true;
    Set<AnnotationUnit> parts = new LinkedHashSet<>();
    for (AnnotationUnit part : getSubUnits(span, parts)) {
        setChainAnnoPerFeature(aAnnotationsPertype, aType, aFs, part, aLinkNo, achainNo,
                coversSeveralTokens, firstPart);
        firstPart = false;
    }
}
/**
 * Builds the column values of one chain link for one unit. The reference-relation feature is
 * written as {@code value->chainNo-linkNo}, every other feature as {@code value[chainNo]}.
 * The unit is always marked as ambiguous for the link type.
 */
private void setChainAnnoPerFeature(Map<AnnotationUnit, List<List<String>>> aAnnotationsPertype, Type aType,
        AnnotationFS aFs, AnnotationUnit aUnit, int aLinkNo, int achainNo, boolean aMultiUnit, boolean aFirst) {
    List<String> columns = new ArrayList<>();
    for (Feature feature : aType.getFeatures()) {
        String qualifiedName = feature.toString();
        String shortName = feature.getShortName();
        boolean structural = qualifiedName.equals("uima.cas.AnnotationBase:sofa")
                || qualifiedName.equals("uima.tcas.Annotation:begin")
                || qualifiedName.equals("uima.tcas.Annotation:end")
                || shortName.equals(GOVERNOR) || shortName.equals(DEPENDENT)
                || shortName.equals(FIRST) || shortName.equals(NEXT);
        if (structural) {
            continue;
        }
        String raw = aFs.getFeatureValueAsString(feature);
        String value = raw == null ? "*" : replaceEscapeChars(raw);
        if (shortName.equals(REF_REL)) {
            value = value + "->" + achainNo + "-" + aLinkNo;
        }
        else {
            // single-unit and multi-unit links are labelled the same way
            value = value + "[" + achainNo + "]";
        }
        featurePerLayer.get(aType.getName()).add(shortName);
        columns.add(value);
    }
    aAnnotationsPertype.putIfAbsent(aUnit, new ArrayList<>());
    ambigUnits.putIfAbsent(aType.getName(), new HashMap<>());
    ambigUnits.get(aType.getName()).put(aUnit, true); // coref are always ambig
    if (columns.isEmpty()) {
        columns.add("*" + "[" + achainNo + "]");
    }
    aAnnotationsPertype.get(aUnit).add(columns);
}
/**
 * Builds the column values of one relation annotation and stores them on the dependent unit:
 * the escaped value of every feature, followed by the address of the governor unit. That
 * address carries {@code [govRef_depRef]} when at least one of the two ids is positive.
 */
private void setRelationAnnoPerFeature(Map<AnnotationUnit, List<List<String>>> annotationsPertype, Type type,
        AnnotationFS fs, AnnotationUnit depUnit, AnnotationUnit govUnit, int aGovRef, int aDepRef, Type aDepType) {
    List<String> columns = new ArrayList<>();
    featurePerLayer.putIfAbsent(type.getName(), new LinkedHashSet<>());
    for (Feature feature : type.getFeatures()) {
        String qualifiedName = feature.toString();
        String shortName = feature.getShortName();
        boolean structural = qualifiedName.equals("uima.cas.AnnotationBase:sofa")
                || qualifiedName.equals("uima.tcas.Annotation:begin")
                || qualifiedName.equals("uima.tcas.Annotation:end")
                || shortName.equals(GOVERNOR) || shortName.equals(DEPENDENT)
                || shortName.equals(FIRST) || shortName.equals(NEXT);
        if (structural) {
            continue;
        }
        // the returned id is not written, but the call registers the annotation
        getRefId(type, fs, depUnit);
        String raw = fs.getFeatureValueAsString(feature);
        columns.add(raw == null ? "*" : replaceEscapeChars(raw));
        featurePerLayer.get(type.getName()).add(shortName);
    }
    // add the governor unit address, with both reference ids separated by _ when needed
    String governorAddress = unitsLineNumber.get(govUnit);
    boolean needsRefs = aDepRef > 0 || aGovRef > 0;
    String refSuffix = needsRefs ? "[" + aGovRef + "_" + aDepRef + "]" : "";
    columns.add(governorAddress + refSuffix);
    featurePerLayer.get(type.getName()).add(BT + aDepType.getName());
    // the values go to the dependent unit
    annotationsPertype.putIfAbsent(depUnit, new ArrayList<>());
    annotationsPertype.get(depUnit).add(columns);
}
/**
 * Escapes the characters and sequences that have a meaning in the TSV format by putting a
 * backslash in front of them; tab and line feed become the two-character sequences
 * backslash-t and backslash-n.
 *
 * @param annotation
 *            the raw value
 * @return the escaped value
 */
public static String replaceEscapeChars(String annotation)
{
    String[] reserved = {"\\", "[", "]", "|", "_", "->", ";", "\t", "\n", "*"};
    String[] escaped = {"\\\\", "\\[", "\\]", "\\|", "\\_", "\\->", "\\;", "\\t", "\\n", "\\*"};
    return StringUtils.replaceEach(annotation, reserved, escaped);
}
/**
 * Appends the value to the list of column values. The two flags are accepted but the value
 * is added unchanged in every case.
 */
private void setAnnoFeature(boolean aIsMultiToken, boolean aIsFirst, List<String> aAnnoPerFeatures,
        String annotation) {
    aAnnoPerFeatures.add(annotation);
}
/**
 * Returns the first unit obtained when splitting the span of the given annotation with
 * getSubUnits, or {@code null} when the split yields no unit.
 */
private AnnotationUnit getFirstUnit(AnnotationFS targetFs) {
    SubTokenAnno span = new SubTokenAnno();
    span.setBegin(targetFs.getBegin());
    span.setEnd(targetFs.getEnd());
    span.setText(targetFs.getCoveredText());
    Set<AnnotationUnit> collected = new LinkedHashSet<>();
    for (AnnotationUnit head : getSubUnits(span, collected)) {
        return head;
    }
    return null;
}
/**
 * Returns the first unit obtained when splitting the span of the given unit with getSubUnits,
 * or {@code null} when the split yields no unit. For a relation drawn on a span of several
 * units the information is put on the first unit only.
 */
private AnnotationUnit getFirstUnit(AnnotationUnit aUnit)
{
    SubTokenAnno span = new SubTokenAnno();
    span.setBegin(aUnit.begin);
    span.setEnd(aUnit.end);
    span.setText(aUnit.token);
    Set<AnnotationUnit> collected = new LinkedHashSet<>();
    for (AnnotationUnit head : getSubUnits(span, collected)) {
        return head;
    }
    return null;
}
/**
 * Returns the reference number of an annotation, assigning the next free number of its type
 * when the annotation is seen for the first time. The numbers start at one and are
 * incremental per type; an annotation that was already numbered (for instance because it is
 * visited once per covered unit) keeps its number. Callers attach the number to the written
 * value so that the parts of one annotation, and several annotations on the same unit, can
 * be told apart.
 *
 * @param type
 *            The annotation type
 * @param fs
 *            the annotation
 * @param unit
 *            the annotation element (Token or sub-tokens)
 * @return the reference number to be attached on this annotation value
 */
private int getRefId(Type type, AnnotationFS fs, AnnotationUnit unit) {
    Map<FeatureStructure, Integer> annoRefs = annotaionRefPerType.computeIfAbsent(type, t -> new HashMap<>());
    // This is a multiple token annotation, re-use the reference id
    Integer known = annoRefs.get(fs);
    if (known != null) {
        return known;
    }
    // Numbers are handed out as 1, 2, 3, ... and entries are only ever added here, so the
    // highest number so far equals the map size; no scan over all values is needed.
    int ref = annoRefs.size() + 1;
    annoRefs.put(fs, ref);
    multiAnnosPerUnit.computeIfAbsent(type, t -> new HashMap<>())
            .computeIfAbsent(unit, u -> new HashMap<>())
            .put(fs, ref);
    return ref;
}
/**
 * Creates one unit per token covered by a sentence and gives it the address
 * {@code sentenceNo-tokenNo} (both starting at one). The first unit of every sentence is
 * additionally mapped to the text of that sentence.
 */
private void setTokenSentenceAddress(JCas aJCas) {
    int sentenceNo = 0;
    for (Sentence sentence : select(aJCas, Sentence.class)) {
        sentenceNo++;
        int tokenNo = 0;
        for (Token token : selectCovered(Token.class, sentence)) {
            tokenNo++;
            AnnotationUnit tokenUnit = new AnnotationUnit(token.getBegin(), token.getEnd(), false,
                    token.getCoveredText());
            units.add(tokenUnit);
            if (tokenNo == 1) {
                sentenceUnits.put(tokenUnit, sentence.getCoveredText());
            }
            unitsLineNumber.put(tokenUnit, sentenceNo + "-" + tokenNo);
        }
    }
}
/**
 * Mutable holder for a span of text: begin offset, end offset and the text itself.
 */
class SubTokenAnno {
    int begin;
    int end;
    String text;

    /** @return the begin offset */
    public int getBegin() {
        return this.begin;
    }

    /** @param begin the new begin offset */
    public void setBegin(int begin) {
        this.begin = begin;
    }

    /** @return the end offset */
    public int getEnd() {
        return this.end;
    }

    /** @param end the new end offset */
    public void setEnd(int end) {
        this.end = end;
    }

    /** @return the text of the span */
    public String getText() {
        return this.text;
    }

    /** @param text the new text of the span */
    public void setText(String text) {
        this.text = text;
    }
}
}