/* * Copyright 2015 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.brat; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Feature; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; import org.apache.uima.cas.TypeSystem; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.FSUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotation; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAnnotationDocument; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratAttribute; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventAnnotation; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratEventArgument; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratRelationAnnotation; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.BratTextAnnotation; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.RelationParam; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.TextAnnotationParam; import de.tudarmstadt.ukp.dkpro.core.io.brat.internal.model.TypeMapping; /** * Reader for the brat format. * * @see <a href="http://brat.nlplab.org/standoff.html">brat standoff format</a> * @see <a href="http://brat.nlplab.org/configuration.html">brat configuration format</a> */ public class BratReader extends JCasResourceCollectionReader_ImplBase { /** * Name of configuration parameter that contains the character encoding used by the input files. */ public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") private String encoding; /** * Types that are relations. It is mandatory to provide the type name followed by two feature * names that represent Arg1 and Arg2 separated by colons, e.g. * <code>de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent{A}</code>. * Additionally, a subcategorization feature may be specified. */ public static final String PARAM_RELATION_TYPES = "relationTypes"; @ConfigurationParameter(name = PARAM_RELATION_TYPES, mandatory = true, defaultValue = { "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency:Governor:Dependent{A}" }) private Set<String> relationTypes; private Map<String, RelationParam> parsedRelationTypes; /** * Types that are text annotations. It is mandatory to provide the type name which can * optionally be followed by a subcategorization feature. Using this parameter is * only necessary to specify a subcategorization feature. Otherwise, text annotation types are * automatically detected. */ public static final String PARAM_TEXT_ANNOTATION_TYPES = "textAnnotationTypes"; @ConfigurationParameter(name = PARAM_TEXT_ANNOTATION_TYPES, mandatory = true, defaultValue = {}) private Set<String> textAnnotationTypes; private Map<String, TextAnnotationParam> parsedTextAnnotationTypes; public static final String PARAM_TYPE_MAPPINGS = "typeMappings"; @ConfigurationParameter(name = PARAM_TYPE_MAPPINGS, mandatory = false, defaultValue = { // "Token -> de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", // "Organization -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Organization", // "Location -> de.tudarmstadt.ukp.dkpro.core.api.ner.type.Location" }) private String[] typeMappings; private TypeMapping typeMapping; private Map<String, AnnotationFS> spanIdMap; private Set<String> warnings; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); parsedRelationTypes = new HashMap<>(); for (String rel : relationTypes) { RelationParam p = RelationParam.parse(rel); parsedRelationTypes.put(p.getType(), p); } parsedTextAnnotationTypes = new HashMap<>(); for (String rel : textAnnotationTypes) { TextAnnotationParam p = TextAnnotationParam.parse(rel); parsedTextAnnotationTypes.put(p.getType(), p); } typeMapping = new TypeMapping(typeMappings); warnings = new LinkedHashSet<String>(); } @Override public void close() throws IOException { super.close(); for (String warning : warnings) { getLogger().warn(warning); } } @Override public void getNext(JCas aJCas) throws IOException, CollectionException { spanIdMap = new HashMap<>(); Resource res = nextFile(); initCas(aJCas, res); readText(aJCas, res); readAnnotations(aJCas, res); } private void readAnnotations(JCas aJCas, Resource aRes) throws IOException { BratAnnotationDocument doc; try (Reader r = new InputStreamReader(aRes.getInputStream(), encoding)) { doc = BratAnnotationDocument.read(r); } CAS cas = aJCas.getCas(); TypeSystem ts = aJCas.getTypeSystem(); List<BratRelationAnnotation> relations = new ArrayList<>(); List<BratEventAnnotation> events = new ArrayList<>(); for (BratAnnotation anno : doc.getAnnotations()) { Type type = typeMapping.getUimaType(ts, anno); if (anno instanceof BratTextAnnotation) { create(cas, type, (BratTextAnnotation) anno); } else if (anno instanceof BratRelationAnnotation) { relations.add((BratRelationAnnotation) anno); } else if (anno instanceof BratEventAnnotation) { create(cas, type, (BratEventAnnotation) anno); events.add((BratEventAnnotation) anno); } else { throw new IllegalStateException("Annotation type [" + anno.getClass() + "] is currently not supported."); } } // Go through the relations now for (BratRelationAnnotation rel : relations) { Type type = typeMapping.getUimaType(ts, rel); create(cas, type, rel); } // Go through the events again and handle the slots for (BratEventAnnotation e : events) { Type type = typeMapping.getUimaType(ts, e); fillSlots(cas, type, doc, e); } } private void readText(JCas aJCas, Resource res) throws IOException { String annUrl = res.getResource().getURL().toString(); String textUrl = FilenameUtils.removeExtension(annUrl) + ".txt"; try (InputStream is = new BufferedInputStream(new URL(textUrl).openStream())) { aJCas.setDocumentText(IOUtils.toString(is, encoding)); } } private void create(CAS aCAS, Type aType, BratTextAnnotation aAnno) { TextAnnotationParam param = parsedTextAnnotationTypes.get(aType.getName()); AnnotationFS anno = aCAS.createAnnotation(aType, aAnno.getBegin(), aAnno.getEnd()); if (param != null && param.getSubcat() != null) { anno.setStringValue(getFeature(anno, param.getSubcat()), aAnno.getType()); } fillAttributes(anno, aAnno.getAttributes()); aCAS.addFsToIndexes(anno); spanIdMap.put(aAnno.getId(), anno); } private void create(CAS aCAS, Type aType, BratEventAnnotation aAnno) { AnnotationFS anno = aCAS.createAnnotation(aType, aAnno.getTriggerAnnotation().getBegin(), aAnno.getTriggerAnnotation().getEnd()); fillAttributes(anno, aAnno.getAttributes()); // Slots cannot be handled yet because they might point to events that have not been // created yet. aCAS.addFsToIndexes(anno); spanIdMap.put(aAnno.getId(), anno); } private void create(CAS aCAS, Type aType, BratRelationAnnotation aAnno) { RelationParam param = parsedRelationTypes.get(aType.getName()); AnnotationFS arg1 = spanIdMap.get(aAnno.getArg1Target()); AnnotationFS arg2 = spanIdMap.get(aAnno.getArg2Target()); FeatureStructure anno = aCAS.createFS(aType); anno.setFeatureValue(getFeature(anno, param.getArg1()), arg1); anno.setFeatureValue(getFeature(anno, param.getArg2()), arg2); AnnotationFS anchor = null; if (param.getFlags1().contains(RelationParam.FLAG_ANCHOR) && param.getFlags2().contains(RelationParam.FLAG_ANCHOR)) { throw new IllegalStateException("Only one argument can be the anchor."); } else if (param.getFlags1().contains(RelationParam.FLAG_ANCHOR)) { anchor = arg1; } else if (param.getFlags2().contains(RelationParam.FLAG_ANCHOR)) { anchor = arg2; } if (param.getSubcat() != null) { anno.setStringValue(getFeature(anno, param.getSubcat()), aAnno.getType()); } if (anchor != null) { anno.setIntValue(anno.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_BEGIN), anchor.getBegin()); anno.setIntValue(anno.getType().getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END), anchor.getEnd()); } else { TypeSystem ts = aCAS.getTypeSystem(); if (ts.subsumes(ts.getType(CAS.TYPE_NAME_ANNOTATION), anno.getType())) { warnings.add("Relation type [" + aType.getName() + "] has offsets but no anchor is specified."); } } fillAttributes(anno, aAnno.getAttributes()); aCAS.addFsToIndexes(anno); } private void fillAttributes(FeatureStructure aAnno, Collection<BratAttribute> aAttributes) { for (BratAttribute attr : aAttributes) { // Try treating the attribute name as an unqualified name, then as a qualified name. Feature feat = aAnno.getType().getFeatureByBaseName(attr.getName()); if (feat == null) { String featName = attr.getName().replace('_', ':'); featName = featName.substring(featName.indexOf(TypeSystem.FEATURE_SEPARATOR) + 1); feat = aAnno.getType().getFeatureByBaseName(featName); } // FIXME HACK! We may not find a "role" feature from slot links in the target type // because it should be in the link type. This here is a bad hack, but it should work // as long as the target type doesn't define a "role" feature itself. if ((("role".equals(attr.getName())) || attr.getName().endsWith("_role")) && feat == null) { return; } if (feat == null) { throw new IllegalStateException("Type [" + aAnno.getType().getName() + "] has no feature named [" + attr.getName() + "]"); } if (attr.getValues().length == 0) { // Nothing to do } else if (attr.getValues().length == 1) { aAnno.setFeatureValueFromString(feat, attr.getValues()[0]); } else { throw new IllegalStateException("Multi-valued attributes currently not supported"); } } } private void fillSlots(CAS aCas, Type aType, BratAnnotationDocument aDoc, BratEventAnnotation aE) { AnnotationFS event = spanIdMap.get(aE.getId()); Map<String, List<BratEventArgument>> groupedArgs = aE.getGroupedArguments(); for (Entry<String, List<BratEventArgument>> slot : groupedArgs.entrySet()) { // Resolve the target IDs to feature structures List<FeatureStructure> targets = new ArrayList<>(); // Lets see if there is a multi-valued feature by the name of the slot if (FSUtil.hasFeature(event, slot.getKey()) && FSUtil.isMultiValuedFeature(event, slot.getKey())) { for (BratEventArgument arg : slot.getValue()) { FeatureStructure target = spanIdMap.get(arg.getTarget()); if (target == null) { throw new IllegalStateException("Unable to resolve id [" + arg.getTarget() + "]"); } // Handle WebAnno-style slot links // FIXME It would be better if the link type could be configured, e.g. what // is the name of the link feature and what is the name of the role feature... // but right now we just keep it hard-coded to the values that are used // in the DKPro Core SemArgLink and that are also hard-coded in WebAnno Type componentType = event.getType().getFeatureByBaseName(slot.getKey()) .getRange().getComponentType(); if (CAS.TYPE_NAME_TOP .equals(aCas.getTypeSystem().getParent(componentType).getName())) { BratAnnotation targetAnno = aDoc.getAnnotation(arg.getTarget()); BratAttribute roleAttr = targetAnno.getAttribute("role"); if (roleAttr == null) { roleAttr = targetAnno.getAttribute( target.getType().getName().replace('.', '-') + "_role"); } FeatureStructure link = aCas.createFS(componentType); FSUtil.setFeature(link, "role", roleAttr.getValues()); FSUtil.setFeature(link, "target", target); target = link; } targets.add(target); } FSUtil.setFeature(event, slot.getKey(), targets); } // Lets see if there is a single-valued feature by the name of the slot else if (FSUtil.hasFeature(event, slot.getKey())) { for (BratEventArgument arg : slot.getValue()) { AnnotationFS target = spanIdMap.get(arg.getTarget()); if (target == null) { throw new IllegalStateException("Unable to resolve id [" + arg.getTarget() + "]"); } String fname = arg.getSlot() + (arg.getIndex() > 0 ? arg.getIndex() : ""); if (FSUtil.hasFeature(event, fname)) { FSUtil.setFeature(event, fname, target); } else { throw new IllegalStateException("Type [" + event.getType().getName() + "] has no feature naemd [" + fname + "]"); } } } else { throw new IllegalStateException("Type [" + event.getType().getName() + "] has no feature naemd [" + slot.getKey() + "]"); } } } private Feature getFeature(FeatureStructure aFS, String aName) { Feature f = aFS.getType().getFeatureByBaseName(aName); if (f == null) { throw new IllegalArgumentException("Type [" + aFS.getType().getName() + "] has no feature called [" + aName + "]"); } return f; } }