/*
 * Copyright 2012
 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.clarin.webanno.tsv;

import static org.apache.commons.io.IOUtils.closeQuietly;
import static org.apache.uima.fit.util.CasUtil.getType;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.LineIterator;
import org.apache.commons.lang3.StringUtils;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.CasUtil;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;

import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
// Needed for the nested Resource type returned by nextFile() in getNext() below.
import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 * Reads WebAnno-compatible TSV files and creates annotations from the information they provide.
 * The header at the very beginning of the file lists the existing annotation layers together with
 * their feature structure information. Annotation layers are separated by the # character and
 * features by the | character. If a layer is a relation layer, its header entry includes the
 * string AttachTo=... naming the layer the relation attaches to. There is no chain TSV
 * reader/writer yet.
 */
public class WebannoTsv2Reader
    extends JCasResourceCollectionReader_ImplBase
{
    private String fileName;

    Map<String, Token> indexedTokens;

    public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding)
        throws IOException
    {
        StringBuilder text = new StringBuilder();
        DocumentMetaData documentMetadata = DocumentMetaData.get(aJCas);
        fileName = documentMetadata.getDocumentTitle();
        setAnnotations(aJCas, aIs, aEncoding, text);
        aJCas.setDocumentText(text.toString());
    }
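
    /*
     * For orientation, a minimal sketch of the input this reader expects (hypothetical values;
     * <TAB> stands for a real tab character, and the columns depend entirely on the header of the
     * actual file):
     *
     *   # de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS | PosValue
     *   #id=1
     *   #text=Hello world .
     *   1-1<TAB>Hello<TAB>NN
     *   1-2<TAB>world<TAB>NN
     *   1-3<TAB>.<TAB>PUNC
     *
     * A blank line ends a sentence. "_" and "O" mark the absence of an annotation, B-/I- prefixes
     * mark the begin and continuation of a multi-token span, and "|" separates stacked
     * annotations within one column.
     */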
    /**
     * Iterates through the lines and creates span annotations accordingly. For a multi-token span
     * annotation, based on the position of the annotation in the line, only the end position of
     * the annotation is updated.
     */
    private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException
    {
        // getting header information
        LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
        int columns = 1; // token number + token columns (minimum required)
        int tokenStart = 0, sentenceStart = 0;
        Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
        Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();

        // an annotation for every feature in a layer
        Map<Type, Map<Integer, AnnotationFS>> annotations =
                new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();

        // store if this is a Begin/Intermediate/End of an annotation
        Map<Type, Map<Integer, String>> beginEndAnno =
                new LinkedHashMap<Type, Map<Integer, String>>();

        // store annotations of tokens so that they can be used later for relation annotations
        Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations =
                new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();

        // store target token ids used for a relation
        Map<Type, Map<String, List<String>>> relationTargets =
                new LinkedHashMap<Type, Map<String, List<String>>>();

        // store tokens indexed by the concatenation of their begin-end offsets so that lemma and
        // pos annotations can be attached later, if they exist
        indexedTokens = new HashMap<String, Token>();

        while (lineIterator.hasNext()) {
            String line = lineIterator.next().trim();
            if (line.trim().equals("") && sentenceStart == tokenStart) {
                continue;
            }
            if (line.trim().equals("")) {
                text.replace(tokenStart - 1, tokenStart, "");
                tokenStart = tokenStart - 1;
                Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
                sentence.addToIndexes();
                tokenStart++;
                sentenceStart = tokenStart;
                text.append("\n");
                continue;
            }
            // sentence
            if (line.startsWith("#text=")) {
                continue;
            }
            if (line.startsWith("#id=")) {
                continue; // it is a comment line
            }
            if (line.startsWith("#")) {
                columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
                continue;
            }
            // sometimes, the sentence in #text= might contain a new line which breaks this
            // reader, so skip such lines
            if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
                continue;
            }
            // If we are still unlucky, the line starts with a number from the sentence but not
            // with a token number; skip it unless it is in the NUM-NUM format (the length check
            // guards against lines without any "-")
            String[] tokenNumberParts = line.split("-");
            if (tokenNumberParts.length < 2 || !Character.isDigit(tokenNumberParts[1].charAt(0))) {
                continue;
            }

            int count = StringUtils.countMatches(line, "\t");

            if (columns != count) {
                throw new IOException(fileName + " This is not a valid TSV File. "
                        + "Check this line: " + line);
            }

            // adding tokens and sentence
            StringTokenizer lineTk = new StringTokenizer(line, "\t");
            String tokenNumberColumn = lineTk.nextToken();
            String tokenColumn = lineTk.nextToken();
            Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
            token.addToIndexes();
            Type posType = JCasUtil.getType(aJcas, POS.class);
            Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
            if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
                indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
            }

            // adding the annotations
            createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations,
                    beginEndAnno, tokenAnnotations, relationTargets, lineTk, tokenColumn,
                    tokenNumberColumn);

            tokenStart = tokenStart + tokenColumn.length() + 1;
            text.append(tokenColumn + " ");
        }
        if (tokenStart > sentenceStart) {
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            text.append("\n");
        }

        createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
    }
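
    /*
     * A worked example of the offset bookkeeping above (illustrative values): for the tokens
     * "Hello", "world" and ".", tokenStart advances by token length + 1 (for the joining space),
     * giving Token offsets (0,5), (6,11) and (12,13). When the blank line after the sentence is
     * seen, the trailing space is removed again and a newline is appended, so the reconstructed
     * document text is "Hello world .\n" with the Sentence covering (0,13).
     */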
    private int getLayerAndFeature(JCas aJcas, int columns, Map<Type, Set<Feature>> spanLayers,
            Map<Type, Type> relationayers, String line)
        throws IOException
    {
        StringTokenizer headerTk = new StringTokenizer(line, "#");
        while (headerTk.hasMoreTokens()) {
            String layerNames = headerTk.nextToken().trim();
            StringTokenizer layerTk = new StringTokenizer(layerNames, "|");

            Set<Feature> features = new LinkedHashSet<Feature>();
            String layerName = layerTk.nextToken().trim();

            Iterator<Type> types = aJcas.getTypeSystem().getTypeIterator();
            boolean layerExists = false;
            while (types.hasNext()) {
                if (types.next().getName().equals(layerName)) {
                    layerExists = true;
                    break;
                }
            }
            if (!layerExists) {
                throw new IOException(fileName + " This is not a valid TSV File. The layer "
                        + layerName + " is not created in the project.");
            }

            Type layer = CasUtil.getType(aJcas.getCas(), layerName);

            while (layerTk.hasMoreTokens()) {
                String ft = layerTk.nextToken().trim();
                if (ft.startsWith("AttachTo=")) {
                    Type attachLayer = CasUtil.getType(aJcas.getCas(), ft.substring(9));
                    relationayers.put(layer, attachLayer);
                    columns++;
                    continue;
                }
                Feature feature = layer.getFeatureByBaseName(ft);
                if (feature == null) {
                    throw new IOException(fileName + " This is not a valid TSV File. The feature "
                            + ft + " is not created for the layer " + layerName);
                }
                features.add(feature);
                columns++;
            }
            spanLayers.put(layer, features);
        }
        return columns;
    }
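
    /*
     * Column counting, sketched on a hypothetical header: given
     * "# ...pos.POS | PosValue # ...dependency.Dependency | DependencyType | AttachTo=...pos.POS",
     * columns grows from 1 (token number + token) by one per feature and by one per AttachTo
     * entry, here to 4, so every token line must contain exactly four tab characters (five
     * fields).
     */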
    /**
     * Creates a relation layer. For every token, store the governor positions and the dependent
     * annotation.
     */
    private void createRelationLayer(JCas aJcas, Map<Type, Type> relationayers,
            Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations,
            Map<Type, Map<String, List<String>>> relationTargets)
    {
        for (Type layer : relationayers.keySet()) {
            if (relationTargets.get(layer) == null) {
                continue;
            }
            Feature dependentFeature = layer.getFeatureByBaseName("Dependent");
            Feature governorFeature = layer.getFeatureByBaseName("Governor");

            Map<String, List<String>> tokenIdMaps = relationTargets.get(layer);
            Map<String, List<AnnotationFS>> tokenAnnos = tokenAnnotations
                    .get(relationayers.get(layer));
            Map<String, List<AnnotationFS>> relationAnnos = tokenAnnotations.get(layer);
            for (String dependnetId : tokenIdMaps.keySet()) {
                int i = 0;
                for (String governorId : tokenIdMaps.get(dependnetId)) {

                    AnnotationFS relationAnno = relationAnnos.get(dependnetId).get(i);
                    AnnotationFS dependentAnno = tokenAnnos.get(dependnetId).get(0);
                    AnnotationFS governorAnno = tokenAnnos.get(governorId).get(0);

                    // built-in dependencies attach to tokens via the token's "pos" feature, so
                    // swap the POS annotations for the covering tokens here
                    if (layer.getName().equals(Dependency.class.getName())) {
                        Type tokenType = getType(aJcas.getCas(), Token.class.getName());
                        Feature attachFeature = tokenType.getFeatureByBaseName("pos");
                        AnnotationFS posDependentAnno = dependentAnno;
                        dependentAnno = CasUtil.selectCovered(aJcas.getCas(), tokenType,
                                dependentAnno.getBegin(), dependentAnno.getEnd()).get(0);
                        dependentAnno.setFeatureValue(attachFeature, posDependentAnno);

                        AnnotationFS posGovernorAnno = governorAnno;
                        governorAnno = CasUtil.selectCovered(aJcas.getCas(), tokenType,
                                governorAnno.getBegin(), governorAnno.getEnd()).get(0);
                        governorAnno.setFeatureValue(attachFeature, posGovernorAnno);
                    }

                    // update begin/end of the relation annotation so that it spans both endpoints
                    if (dependentAnno.getEnd() <= governorAnno.getEnd()) {
                        ((Annotation) relationAnno).setBegin(dependentAnno.getBegin());
                        ((Annotation) relationAnno).setEnd(governorAnno.getEnd());
                    }
                    else {
                        ((Annotation) relationAnno).setBegin(governorAnno.getBegin());
                        ((Annotation) relationAnno).setEnd(dependentAnno.getEnd());
                    }

                    relationAnno.setFeatureValue(dependentFeature, dependentAnno);
                    relationAnno.setFeatureValue(governorFeature, governorAnno);
                    i++;
                }
            }
        }
    }
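
    /*
     * Relation columns, sketched on hypothetical values: with the header from the sketch above, a
     * line such as "1-1<TAB>Hello<TAB>NN<TAB>nsubj<TAB>1-3" makes token 1-1 the dependent and
     * token 1-3 the governor of an nsubj relation. Stacked relations on one token are separated
     * by "|" in both the label column and the target column, which is why the targets are split
     * on "|" in step with the annotations.
     */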
    private void createSpanAnnotation(JCas aJcas, int aTokenStart, Map<Type, Set<Feature>> aLayers,
            Map<Type, Type> aRelationayers, Map<Type, Map<Integer, AnnotationFS>> aAnnotations,
            Map<Type, Map<Integer, String>> aBeginEndAnno,
            Map<Type, Map<String, List<AnnotationFS>>> aTokenAnnotations,
            Map<Type, Map<String, List<String>>> aRelationTargets, StringTokenizer lineTk,
            String aToken, String aTokenNumberColumn)
    {
        for (Type layer : aLayers.keySet()) {
            int lastIndex = 1;
            // if a layer is bound to a single token but has multiple features, the annotation is
            // created once and the feature values are appended
            Map<Integer, AnnotationFS> singleTokenMultiFeature =
                    new HashMap<Integer, AnnotationFS>();

            // the relation target column should be read once all feature columns are obtained
            int numberOfFeaturesPerLayer = aLayers.get(layer).size();
            for (Feature feature : aLayers.get(layer)) {
                numberOfFeaturesPerLayer--;
                int index = 1;
                String multipleAnnotations = lineTk.nextToken();
                String relationTargetNumbers = null;
                if (aRelationayers.containsKey(layer) && numberOfFeaturesPerLayer == 0) {
                    relationTargetNumbers = lineTk.nextToken();
                }
                int i = 0;
                String[] relationTargets = null;
                if (relationTargetNumbers != null) {
                    relationTargets = relationTargetNumbers.split("\\|");
                }
                for (String annotation : multipleAnnotations.split("\\|")) {
                    // if the annotation is not on multiple spans
                    if (!(annotation.startsWith("B-") || annotation.startsWith("I-")
                            || annotation.startsWith("O-"))
                            && !(annotation.equals("_") || annotation.equals("O"))) {

                        AnnotationFS newAnnotation;
                        // if the layer has multiple features, create the new annotation only once
                        if (singleTokenMultiFeature.get(index) == null) {
                            newAnnotation = aJcas.getCas().createAnnotation(layer, aTokenStart,
                                    aTokenStart + aToken.length());
                            singleTokenMultiFeature.put(index, newAnnotation);
                        }
                        else {
                            newAnnotation = singleTokenMultiFeature.get(index);
                        }
                        // annotations without a feature value set, i.e. those with the layer name
                        // prefix, should be stripped out - make them null
                        if (annotation.startsWith(layer.getName())) {
                            annotation = null;
                        }
                        newAnnotation.setFeatureValueFromString(feature, annotation);
                        aJcas.addFsToIndexes(newAnnotation);

                        // set the POS on the token
                        if (layer.getName().equals(POS.class.getName())) {
                            indexedTokens.get(aTokenStart + "-" + aTokenStart + aToken.length())
                                    .setPos((POS) newAnnotation);
                        }

                        // set the Lemma on the token
                        if (layer.getName().equals(Lemma.class.getName())) {
                            indexedTokens.get(aTokenStart + "-" + aTokenStart + aToken.length())
                                    .setLemma((Lemma) newAnnotation);
                        }

                        if (aRelationayers.containsKey(layer) && numberOfFeaturesPerLayer == 0) {
                            Map<String, List<String>> targets = aRelationTargets.get(layer);
                            if (targets == null) {
                                List<String> governors = new ArrayList<String>();
                                governors.add(relationTargets[i]);
                                targets = new HashMap<String, List<String>>();
                                targets.put(aTokenNumberColumn, governors);
                                i++;
                                aRelationTargets.put(layer, targets);
                            }
                            else {
                                List<String> governors = targets.get(aTokenNumberColumn);
                                if (governors == null) {
                                    governors = new ArrayList<String>();
                                }
                                governors.add(relationTargets[i]);
                                targets.put(aTokenNumberColumn, governors);
                                i++;
                                aRelationTargets.put(layer, targets);
                            }
                        }

                        Map<String, List<AnnotationFS>> tokenAnnotations =
                                aTokenAnnotations.get(layer);
                        if (tokenAnnotations == null) {
                            tokenAnnotations = new HashMap<String, List<AnnotationFS>>();
                        }
                        List<AnnotationFS> relAnnos = tokenAnnotations.get(aTokenNumberColumn);
                        if (relAnnos == null) {
                            relAnnos = new ArrayList<AnnotationFS>();
                        }
                        relAnnos.add(newAnnotation);
                        tokenAnnotations.put(aTokenNumberColumn, relAnnos);
                        aTokenAnnotations.put(layer, tokenAnnotations);
                        index++;
                    }
                    // for annotations such as B-LOC|B-_|I-PER and the like, O-_ is a position
                    // marker
                    else if (annotation.equals("O-_") || annotation.equals("B-_")
                            || annotation.equals("I-_")) {
                        index++;
                    }
                    else if (annotation.startsWith("B-")) {
                        boolean isNewAnnotation = true;
                        Map<Integer, AnnotationFS> indexedAnnos = aAnnotations.get(layer);
                        Map<Integer, String> indexedBeginEndAnnos = aBeginEndAnno.get(layer);
                        AnnotationFS newAnnotation;

                        if (indexedAnnos == null) {
                            newAnnotation = aJcas.getCas().createAnnotation(layer, aTokenStart,
                                    aTokenStart + aToken.length());
                            indexedAnnos = new LinkedHashMap<Integer, AnnotationFS>();
                            indexedBeginEndAnnos = new LinkedHashMap<Integer, String>();
                        }
                        else if (indexedAnnos.get(index) == null) {
                            newAnnotation = aJcas.getCas().createAnnotation(layer, aTokenStart,
                                    aTokenStart + aToken.length());
                        }
                        else if (indexedAnnos.get(index) != null
                                && indexedBeginEndAnnos.get(index).equals("E-")) {
                            newAnnotation = aJcas.getCas().createAnnotation(layer, aTokenStart,
                                    aTokenStart + aToken.length());
                        }
                        // B-LOC I-LOC B-LOC - the last B-LOC is a new annotation
                        else if (indexedBeginEndAnnos.get(index).equals("I-")) {
                            newAnnotation = aJcas.getCas().createAnnotation(layer, aTokenStart,
                                    aTokenStart + aToken.length());
                        }
                        else {
                            newAnnotation = indexedAnnos.get(index);
                            isNewAnnotation = false;
                        }
                        // remove prefixes such as B-/I- before creating the annotation
                        annotation = annotation.substring(2);
                        if (annotation.startsWith(layer.getName())) {
                            annotation = null;
                        }

                        newAnnotation.setFeatureValueFromString(feature, annotation);
                        aJcas.addFsToIndexes(newAnnotation);
                        indexedAnnos.put(index, newAnnotation);
                        indexedBeginEndAnnos.put(index, "B-");
                        aAnnotations.put(layer, indexedAnnos);

                        if (aRelationayers.containsKey(layer)) {
                            Map<String, List<String>> targets = aRelationTargets.get(layer);
                            if (targets == null) {
                                List<String> governors = new ArrayList<String>();
                                governors.add(relationTargets[i]);
                                targets = new HashMap<String, List<String>>();
                                targets.put(aTokenNumberColumn, governors);
                                i++;
                                aRelationTargets.put(layer, targets);
                            }
                            else {
                                List<String> governors = targets.get(aTokenNumberColumn);
                                if (governors == null) {
                                    governors = new ArrayList<String>();
                                }
                                governors.add(relationTargets[i]);
                                targets.put(aTokenNumberColumn, governors);
                                i++;
                                aRelationTargets.put(layer, targets);
                            }
                        }

                        Map<String, List<AnnotationFS>> tokenAnnotations =
                                aTokenAnnotations.get(layer);
                        if (isNewAnnotation) {
                            if (tokenAnnotations == null) {
                                tokenAnnotations = new HashMap<String, List<AnnotationFS>>();
                            }
                            List<AnnotationFS> relAnnos = tokenAnnotations.get(aTokenNumberColumn);
                            if (relAnnos == null) {
                                relAnnos = new ArrayList<AnnotationFS>();
                            }
                            relAnnos.add(newAnnotation);
                            tokenAnnotations.put(aTokenNumberColumn, relAnnos);
                            aTokenAnnotations.put(layer, tokenAnnotations);
                        }

                        aBeginEndAnno.put(layer, indexedBeginEndAnnos);
                        index++;
                    }
                    else if (annotation.startsWith("I-")) {
                        // extend the end offset of the annotation started by the preceding B-
                        Map<Integer, String> indexedBeginEndAnnos = aBeginEndAnno.get(layer);
                        indexedBeginEndAnnos.put(index, "I-");
                        aBeginEndAnno.put(layer, indexedBeginEndAnnos);
                        Map<Integer, AnnotationFS> indexedAnnos = aAnnotations.get(layer);
                        AnnotationFS newAnnotation = indexedAnnos.get(index);
                        ((Annotation) newAnnotation).setEnd(aTokenStart + aToken.length());
                        index++;
                    }
                    else {
                        aAnnotations.put(layer, null);
                        index++;
                    }
                }
                lastIndex = index - 1;
            }
            // tokens annotated as B-X B-X with no intervening I- mean the annotation ends by
            // itself
            for (int i = 1; i <= lastIndex; i++) {
                if (aBeginEndAnno.get(layer) != null && aBeginEndAnno.get(layer).get(i) != null
                        && aBeginEndAnno.get(layer).get(i).equals("B-")) {
                    aBeginEndAnno.get(layer).put(i, "E-");
                }
            }
        }
    }
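
    /*
     * Slot semantics above, sketched on hypothetical annotations: in a column "B-PER|B-_", the
     * position before the first "|" is slot 1 and the one after is slot 2; "B-PER" on one line
     * followed by "I-PER" on the next extends the same annotation over both tokens, while "_"/"O"
     * mean no annotation at all and "B-_"/"I-_"/"O-_" merely keep the slot numbering aligned
     * across stacked annotations.
     */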
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /*
     * public static final String MULTIPLE_SPAN_ANNOTATIONS = "multipleSpans";
     *
     * @ConfigurationParameter(name = MULTIPLE_SPAN_ANNOTATIONS, mandatory = true, defaultValue =
     * {}) private List<String> multipleSpans;
     */

    @Override
    public void getNext(JCas aJCas)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        initCas(aJCas, res);
        InputStream is = null;
        try {
            is = res.getInputStream();
            convertToCas(aJCas, is, encoding);
        }
        finally {
            closeQuietly(is);
        }
    }
}
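
// A minimal usage sketch with uimaFIT (assuming the PARAM_SOURCE_LOCATION and PARAM_PATTERNS
// parameters inherited from DKPro Core's ResourceCollectionReaderBase; paths and patterns are
// illustrative, not part of this class):
//
//   CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
//           WebannoTsv2Reader.class,
//           WebannoTsv2Reader.PARAM_SOURCE_LOCATION, "src/test/resources/tsv/",
//           WebannoTsv2Reader.PARAM_PATTERNS, "[+]*.tsv",
//           WebannoTsv2Reader.PARAM_ENCODING, "UTF-8");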