/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.clarin.webanno.tsv; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.commons.lang3.StringEscapeUtils.unescapeJava; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import java.util.TreeMap; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang3.StringUtils; import org.apache.uima.cas.ArrayFS; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Feature; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.util.CasUtil; import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.clarin.webanno.tsv.util.AnnotationUnit; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import 
de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * This class reads a WebAnno compatible TSV file and creates annotations from the information
 * provided. The header of the file records the existing annotation layers with their feature
 * names.<br>
 * If the annotation type or a feature in the type does not exist in the CAS, it throws an
 * error.<br>
 * Span types start with the prefix <b> #T_SP=</b>. <br>
 * Relation types start with the prefix <b> #T_RL=</b>. <br>
 * Chain types start with the prefix <b> #T_CH=</b>. <br>
 * Slot features start with prefix <b> ROLE_</b>. <br>
 * All features of a type follow the type name, separated by the <b>|</b> character. <br>
 */
public class WebannoTsv3Reader
    extends JCasResourceCollectionReader_ImplBase
{
    private static final String TAB = "\t";
    private static final String LF = "\n";
    private static final String REF_REL = "referenceRelation";
    private static final String REF_LINK = "referenceType";
    private static final String CHAIN = "Chain";
    private static final String FIRST = "first";
    private static final String NEXT = "next";
    public static final String ROLE = "ROLE_";
    public static final String BT = "BT_"; // base type for the relation annotation
    private static final String DEPENDENT = "Dependent";
    private static final String GOVERNOR = "Governor";

    private String fileName;
    private int columns = 2; // token number + token columns (minimum required)
    private Map<Type, Set<Feature>> allLayers = new LinkedHashMap<Type, Set<Feature>>();
    private Map<Feature, Type> roleLinks = new HashMap<>();
    private Map<Feature, Type> roleTargets = new HashMap<>();
    private Map<Feature, Type> slotLinkTypes = new HashMap<>();
    private StringBuilder coveredText = new StringBuilder();

    // for each type, for each unit, annotations per position
    private Map<Type, Map<AnnotationUnit, List<String>>> annotationsPerPostion =
            new LinkedHashMap<>();

    // For multiple span annotations and stacked annotations
    // NOTE(review): this field is never read or written in this file — candidate for removal,
    // kept to avoid touching anything outside the visible class.
    private Map<Type, Map<Integer, String>> annotationsPerTyep = new LinkedHashMap<>();

    // chain link annotations per chain type: chain number -> link number -> link FS
    private Map<Type, Map<Integer, Map<Integer, AnnotationFS>>> chainAnnosPerTyep = new HashMap<>();
    private List<AnnotationUnit> units = new ArrayList<>();
    private Map<String, AnnotationUnit> token2Units = new HashMap<>();
    private Map<AnnotationUnit, Token> units2Tokens = new HashMap<>();

    // custom layer number (order of appearance in the header) -> layer type
    private Map<Integer, Type> layerMaps = new LinkedHashMap<>();
    private Map<Type, Map<AnnotationUnit, Map<Integer, AnnotationFS>>> annosPerRef =
            new HashMap<>();
    private Map<Type, Feature> depFeatures = new HashMap<>();
    private Map<Type, Type> depTypess = new HashMap<>();

    // record the annotation at ref position when it is a multiple-token annotation
    private Map<Type, Map<AnnotationUnit, Map<Integer, AnnotationFS>>> annoUnitperAnnoFs =
            new HashMap<>();

    /**
     * Parse the given TSV stream into the CAS: reads header/layer declarations, tokens,
     * sentences and all annotations, then sets the reconstructed document text.
     *
     * @param aJCas the CAS to populate (must already carry {@link DocumentMetaData})
     * @param aIs the TSV input stream
     * @param aEncoding the character encoding of the stream
     * @throws IOException if the file is not a valid TSV file for this project
     */
    public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding)
        throws IOException
    {
        DocumentMetaData documentMetadata = DocumentMetaData.get(aJCas);
        fileName = documentMetadata.getDocumentTitle();
        // setLayerAndFeature(aJCas, aIs, aEncoding);
        setAnnotations(aJCas, aIs, aEncoding);
        aJCas.setDocumentText(coveredText.toString());
    }

    /**
     * Iterate through lines and create span annotations accordingly. For a multiple-span
     * annotation, based on the position of the annotation in the line, update only the end
     * position of the annotation.
     */
    private void setAnnotations(JCas aJCas, InputStream aIs, String aEncoding)
        throws IOException
    {
        // getting header information
        LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
        int sentBegin = -1, sentEnd = 0;
        int prevSentEnd = 0;
        StringBuilder sentLineSb = new StringBuilder();
        String lastSent = "";
        int format = -1; // 3 = TSV 3, 31 = TSV 3.1 (3.1 escapes the #Text= line)
        while (lineIterator.hasNext()) {
            String line = lineIterator.next();
            if (line.startsWith("#T_")) {
                setLayerAndFeature(aJCas, line);
                continue;
            }
            if (line.startsWith("#Text=")) {
                String text = line.substring(line.indexOf("=") + 1);
                if (format == 31) {
                    // TSV 3.1 stores the sentence text Java-escaped
                    text = unescapeJava(text);
                }
                if (sentLineSb.toString().isEmpty()) {
                    sentLineSb.append(text);
                }
                else {
                    sentLineSb.append(LF + text);
                }
                lastSent = sentLineSb.toString();
                continue;
            }
            if (line.startsWith("#FORMAT=")) {
                if ("#FORMAT=WebAnno TSV 3".equals(line)) {
                    format = 3;
                }
                else if ("#FORMAT=WebAnno TSV 3.1".equals(line)) {
                    format = 31;
                }
                continue;
            }
            if (line.trim().isEmpty()) {
                // a blank line terminates the current sentence
                if (!sentLineSb.toString().isEmpty()) {
                    createSentence(aJCas, sentLineSb.toString(), sentBegin, sentEnd, prevSentEnd);
                    prevSentEnd = sentEnd;
                    sentBegin = -1; // reset for next sentence begin
                    sentLineSb = new StringBuilder();
                }
                continue;
            }
            line = line.trim();
            int count = StringUtils.countMatches(line, "\t");
            if (columns != count) {
                throw new IOException(
                        fileName + " This is not a valid TSV File. check this line: " + line);
            }

            // Split the token line on tabs not preceded by a backslash (escaped tabs belong to
            // the cell content). FIX: the regex was "(?<!\\\\)*" — the stray '*' quantified the
            // lookbehind into an always-empty match, defeating the escape handling; all sibling
            // splits in this class use the unquantified lookbehind.
            String regex = "(?<!\\\\)" + Pattern.quote(TAB);
            String[] lines = line.split(regex);

            int begin = Integer.parseInt(lines[1].split("-")[0]);
            int end = Integer.parseInt(lines[1].split("-")[1]);
            if (sentBegin == -1) {
                sentBegin = begin;
            }
            sentEnd = end;

            AnnotationUnit unit = createTokens(aJCas, lines, begin, end);

            int ind = 3; // first annotation column (after token number, offsets, token text)
            setAnnosPerTypePerUnit(lines, unit, ind);
        }

        // the last sentence
        if (!lastSent.isEmpty()) {
            createSentence(aJCas, lastSent, sentBegin, sentEnd, prevSentEnd);
        }

        Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> annosPerTypePerUnit = new HashMap<>();
        setAnnosPerUnit(aJCas, annosPerTypePerUnit);
        addAnnotations(aJCas, annosPerTypePerUnit);
        addChainAnnotations(aJCas);
    }

    /**
     * The individual link annotations are stored in a {@link TreeMap} (chainAnnosPerTyep) with
     * chain number and link number references, sorted in an ascending order. <br>
     * Iterate over each chain number and link number references and construct the chain.
     */
    private void addChainAnnotations(JCas aJCas)
    {
        for (Type linkType : chainAnnosPerTyep.keySet()) {
            for (int chainNo : chainAnnosPerTyep.get(linkType).keySet()) {
                // derive the chain type name from the link type name ("...Link" -> "...Chain")
                Type chainType = aJCas.getCas().getTypeSystem().getType(
                        linkType.getName().substring(0, linkType.getName().length() - 4) + CHAIN);
                Feature firstF = chainType.getFeatureByBaseName(FIRST);
                Feature nextF = linkType.getFeatureByBaseName(NEXT);
                FeatureStructure chain = aJCas.getCas().createFS(chainType);
                aJCas.addFsToIndexes(chain);

                // link numbers start at 1; wire first -> next -> next ...
                AnnotationFS firstFs = chainAnnosPerTyep.get(linkType).get(chainNo).get(1);
                AnnotationFS linkFs = firstFs;
                chain.setFeatureValue(firstF, firstFs);
                for (int i = 2; i <= chainAnnosPerTyep.get(linkType).get(chainNo).size(); i++) {
                    linkFs.setFeatureValue(nextF,
                            chainAnnosPerTyep.get(linkType).get(chainNo).get(i));
                    linkFs = chainAnnosPerTyep.get(linkType).get(chainNo).get(i);
                }
            }
        }
    }

    /**
     * Importing span annotations including slot annotations.
     */
    private void addAnnotations(JCas aJCas,
            Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> aAnnosPerTypePerUnit)
    {
        for (Type type : annotationsPerPostion.keySet()) {
            Map<AnnotationUnit, Map<Integer, AnnotationFS>> multiTokUnits = new HashMap<>();
            int ref = 1;
            AnnotationFS prevAnnoFs = null; // to see if it is on multiple token
            for (AnnotationUnit unit : annotationsPerPostion.get(type).keySet()) {
                int end = unit.end;
                List<AnnotationFS> annos = aAnnosPerTypePerUnit.get(type).get(unit);
                int j = 0;
                Feature linkeF = null;
                Map<AnnotationFS, List<FeatureStructure>> linkFSesPerSlotAnno = new HashMap<>();

                // layers without features only carry the placeholder column
                if (allLayers.get(type).size() == 0) {
                    ref = addAnnotationWithNoFeature(aJCas, type, unit, annos, multiTokUnits, end,
                            ref);
                    continue;
                }

                for (Feature feat : allLayers.get(type)) {
                    String anno = annotationsPerPostion.get(type).get(unit).get(j);
                    if (!anno.equals("_")) {
                        int i = 0;
                        // if it is a slot annotation (multiple slots per single annotation
                        // (Target1<--role1--Base--role2-->Target2)
                        int slot = 0;
                        boolean targetAdd = false;
                        String stackedAnnoRegex = "(?<!\\\\)" + Pattern.quote("|");
                        String[] stackedAnnos = anno.split(stackedAnnoRegex);
                        for (String mAnnos : stackedAnnos) {
                            String multipleSlotAnno = "(?<!\\\\)" + Pattern.quote(";");
                            for (String mAnno : mAnnos.split(multipleSlotAnno)) {
                                String depRef = "";
                                String multSpliter = "(?<!\\\\)" + Pattern.quote("[");
                                // is this slot target ambiguous?
                                boolean ambigTarget = false;
                                if (mAnno.split(multSpliter).length > 1) {
                                    ambigTarget = true;
                                    // the "[...]" suffix carries the disambiguation reference
                                    depRef = mAnno.substring(mAnno.indexOf("[") + 1,
                                            mAnno.length() - 1);
                                    ref = depRef.contains("_") ? ref
                                            : Integer.valueOf(mAnno.substring(
                                                    mAnno.indexOf("[") + 1, mAnno.length() - 1));
                                    mAnno = mAnno.substring(0, mAnno.indexOf("["));
                                }
                                if (mAnno.equals("*")) {
                                    mAnno = null; // "*" means a feature with no value
                                }
                                boolean isMultitoken = false;

                                // continuing annotation from a previous unit?
                                if (!multiTokUnits.isEmpty() && prevAnnoFs != null
                                        && prevAnnoFs.getBegin() != unit.begin) {
                                    contAnno: for (AnnotationUnit u : multiTokUnits.keySet()) {
                                        for (Integer r : multiTokUnits.get(u).keySet()) {
                                            if (ref == r) {
                                                isMultitoken = true;
                                                prevAnnoFs = multiTokUnits.get(u).get(r);
                                                break contAnno;
                                            }
                                        }
                                    }
                                }
                                if (isMultitoken) {
                                    // extend the existing annotation instead of creating a new one
                                    Feature endF = type
                                            .getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
                                    prevAnnoFs.setIntValue(endF, end);
                                    mAnno = getEscapeChars(mAnno);
                                    prevAnnoFs.setFeatureValueFromString(feat, mAnno);
                                    if (feat.getShortName().equals(REF_LINK)) {
                                        // since REF_REL do not start with BIO, update it it...
                                        annos.set(i, prevAnnoFs);
                                    }
                                    setAnnoRefPerUnit(unit, type, ref, prevAnnoFs);
                                }
                                else {
                                    if (roleLinks.containsKey(feat)) {
                                        // slot role column: create the link FS and remember it
                                        linkeF = feat;
                                        FeatureStructure link = aJCas.getCas()
                                                .createFS(slotLinkTypes.get(feat));
                                        Feature roleFeat = link.getType()
                                                .getFeatureByBaseName("role");
                                        mAnno = getEscapeChars(mAnno);
                                        link.setStringValue(roleFeat, mAnno);
                                        linkFSesPerSlotAnno.putIfAbsent(annos.get(i),
                                                new ArrayList<>());
                                        linkFSesPerSlotAnno.get(annos.get(i)).add(link);
                                    }
                                    else if (roleTargets.containsKey(feat)) {
                                        // slot target column: resolve the target FS for the link
                                        FeatureStructure link = linkFSesPerSlotAnno
                                                .get(annos.get(i)).get(slot);
                                        int customTypeNumber = 0;
                                        if (mAnno.split("-").length > 2) {
                                            // target on a custom layer: "tok-num-layerNum"
                                            customTypeNumber = Integer.valueOf(
                                                    mAnno.substring(mAnno.lastIndexOf("-") + 1));
                                            mAnno = mAnno.substring(0, mAnno.lastIndexOf("-"));
                                        }
                                        AnnotationUnit targetUnit = token2Units.get(mAnno);
                                        Type tType = null;
                                        if (customTypeNumber == 0) {
                                            tType = roleTargets.get(feat);
                                        }
                                        else {
                                            tType = layerMaps.get(customTypeNumber);
                                        }
                                        AnnotationFS targetFs;
                                        if (ambigTarget) {
                                            targetFs = annosPerRef.get(tType).get(targetUnit)
                                                    .get(ref);
                                        }
                                        else {
                                            targetFs = annosPerRef.get(tType).get(targetUnit)
                                                    .entrySet().iterator().next().getValue();
                                        }
                                        link.setFeatureValue(feat, targetFs);
                                        addSlotAnnotations(linkFSesPerSlotAnno, linkeF);
                                        targetAdd = true;
                                        slot++;
                                    }
                                    else if (feat.getShortName().equals(REF_REL)) {
                                        // chain relation column: "label->chainNo-linkNo"
                                        int chainNo = Integer
                                                .valueOf(mAnno.split("->")[1].split("-")[0]);
                                        int LinkNo = Integer
                                                .valueOf(mAnno.split("->")[1].split("-")[1]);
                                        chainAnnosPerTyep.putIfAbsent(type, new TreeMap<>());
                                        if (chainAnnosPerTyep.get(type).get(chainNo) != null
                                                && chainAnnosPerTyep.get(type).get(chainNo)
                                                        .get(LinkNo) != null) {
                                            continue; // link already recorded
                                        }
                                        String refRel = mAnno.split("->")[0];
                                        refRel = getEscapeChars(refRel);
                                        if (refRel.equals("*")) {
                                            refRel = null;
                                        }
                                        annos.get(i).setFeatureValueFromString(feat, refRel);
                                        chainAnnosPerTyep.putIfAbsent(type, new TreeMap<>());
                                        chainAnnosPerTyep.get(type).putIfAbsent(chainNo,
                                                new TreeMap<>());
                                        chainAnnosPerTyep.get(type).get(chainNo).put(LinkNo,
                                                annos.get(i));
                                    }
                                    else if (feat.getShortName().equals(REF_LINK)) {
                                        mAnno = getEscapeChars(mAnno);
                                        annos.get(i).setFeatureValueFromString(feat, mAnno);
                                        aJCas.addFsToIndexes(annos.get(i));
                                    }
                                    else if (depFeatures.get(type) != null
                                            && depFeatures.get(type).equals(feat)) {
                                        // relation layer: resolve governor/dependent
                                        int g = depRef.isEmpty() ? 0
                                                : Integer.valueOf(depRef.split("_")[0]);
                                        int d = depRef.isEmpty() ? 0
                                                : Integer.valueOf(depRef.split("_")[1]);
                                        Type depType = depTypess.get(type);
                                        AnnotationUnit govUnit = token2Units.get(mAnno);
                                        int l = annotationsPerPostion.get(type).get(unit).size();
                                        // the last stored entry is the token number of this unit
                                        String thisUnit = annotationsPerPostion.get(type).get(unit)
                                                .get(l - 1);
                                        AnnotationUnit depUnit = token2Units.get(thisUnit);
                                        AnnotationFS govFs;
                                        AnnotationFS depFs;

                                        if (depType.getName().equals(POS.class.getName())) {
                                            depType = aJCas.getCas().getTypeSystem()
                                                    .getType(Token.class.getName());
                                            govFs = units2Tokens.get(govUnit);
                                            depFs = units2Tokens.get(unit);
                                        }
                                        // to pass the test case, which have relation on Token
                                        // which not the case in WebAnno world :)(!
                                        else if (depType.getName().equals(Token.class.getName())) {
                                            govFs = units2Tokens.get(govUnit);
                                            depFs = units2Tokens.get(unit);
                                        }
                                        else if (g == 0 && d == 0) {
                                            govFs = annosPerRef.get(depType).get(govUnit)
                                                    .entrySet().iterator().next().getValue();
                                            depFs = annosPerRef.get(depType).get(depUnit)
                                                    .entrySet().iterator().next().getValue();
                                        }
                                        else if (g == 0) {
                                            govFs = annosPerRef.get(depType).get(govUnit)
                                                    .entrySet().iterator().next().getValue();
                                            depFs = annosPerRef.get(depType).get(depUnit).get(d);
                                        }
                                        else {
                                            govFs = annosPerRef.get(depType).get(govUnit).get(g);
                                            depFs = annosPerRef.get(depType).get(depUnit)
                                                    .entrySet().iterator().next().getValue();
                                        }
                                        annos.get(i).setFeatureValue(feat, depFs);
                                        annos.get(i).setFeatureValue(
                                                type.getFeatureByBaseName(GOVERNOR), govFs);
                                        // stretch the relation annotation to cover both ends
                                        if (depFs.getBegin() <= annos.get(i).getBegin()) {
                                            Feature beginF = type.getFeatureByBaseName(
                                                    CAS.FEATURE_BASE_NAME_BEGIN);
                                            annos.get(i).setIntValue(beginF, depFs.getBegin());
                                        }
                                        else {
                                            Feature endF = type.getFeatureByBaseName(
                                                    CAS.FEATURE_BASE_NAME_END);
                                            annos.get(i).setIntValue(endF, depFs.getEnd());
                                        }
                                        aJCas.addFsToIndexes(annos.get(i));
                                    }
                                    else {
                                        // plain span feature value
                                        mAnno = getEscapeChars(mAnno);
                                        multiTokUnits.putIfAbsent(unit, new HashMap<>());
                                        multiTokUnits.get(unit).put(ref, annos.get(i));
                                        prevAnnoFs = annos.get(i);
                                        annos.get(i).setFeatureValueFromString(feat, mAnno);
                                        aJCas.addFsToIndexes(annos.get(i));
                                        setAnnoRefPerUnit(unit, type, ref, annos.get(i));
                                    }
                                }
                                if (stackedAnnos.length > 1) {
                                    ref++;
                                }
                            }
                            // keep the Token's pos/lemma pointers in sync with the layer
                            if (type.getName().equals(POS.class.getName())) {
                                units2Tokens.get(unit).setPos((POS) annos.get(i));
                            }
                            if (type.getName().equals(Lemma.class.getName())) {
                                units2Tokens.get(unit).setLemma((Lemma) annos.get(i));
                            }
                            i++;
                        }
                        if (targetAdd) {
                            linkFSesPerSlotAnno = new HashMap<>();
                        }
                    }
                    else {
                        prevAnnoFs = null;
                    }
                    j++;
                }
                if (prevAnnoFs != null) {
                    ref++;
                }
            }
            annosPerRef.put(type, multiTokUnits);
        }
    }

    /**
     * Add annotations of a layer that declares no features (placeholder column only). Handles
     * stacking and multi-token continuation; returns the next free reference number.
     */
    private int addAnnotationWithNoFeature(JCas aJCas, Type aType, AnnotationUnit aUnit,
            List<AnnotationFS> aAnnos, Map<AnnotationUnit, Map<Integer, AnnotationFS>>
            aMultiTokUnits, int aEnd, int aRef)
    {
        String anno = annotationsPerPostion.get(aType).get(aUnit).get(0);
        if (!anno.equals("_")) {
            int i = 0;
            String stackedAnnoRegex = "(?<!\\\\)" + Pattern.quote("|");
            for (String mAnnos : anno.split(stackedAnnoRegex)) {
                String multipleSlotAnno = "(?<!\\\\)" + Pattern.quote(";");
                for (String mAnno : mAnnos.split(multipleSlotAnno)) {
                    String depRef = "";
                    if (mAnno.endsWith("]")) {
                        // "[...]" suffix carries the reference number
                        depRef = mAnno.substring(mAnno.indexOf("[") + 1, mAnno.length() - 1);
                        aRef = depRef.contains("_") ? 0
                                : Integer.valueOf(mAnno.substring(mAnno.indexOf("[") + 1,
                                        mAnno.length() - 1));
                        mAnno = mAnno.substring(0, mAnno.indexOf("["));
                    }
                    boolean isMultitoken = false;
                    AnnotationFS multiAnnoFs = null;
                    // continuation of an annotation started on an earlier unit?
                    if (!aMultiTokUnits.isEmpty()) {
                        for (AnnotationUnit u : aMultiTokUnits.keySet()) {
                            for (Integer r : aMultiTokUnits.get(u).keySet()) {
                                if (aRef == r) {
                                    isMultitoken = true;
                                    multiAnnoFs = aMultiTokUnits.get(u).get(r);
                                    break;
                                }
                            }
                        }
                    }
                    if (isMultitoken) {
                        // only extend the end offset of the existing annotation
                        Feature endF = aType.getFeatureByBaseName(CAS.FEATURE_BASE_NAME_END);
                        multiAnnoFs.setIntValue(endF, aEnd);
                        setAnnoRefPerUnit(aUnit, aType, aRef, multiAnnoFs);
                    }
                    else {
                        aMultiTokUnits.putIfAbsent(aUnit, new HashMap<>());
                        aMultiTokUnits.get(aUnit).put(aRef, aAnnos.get(i));
                        aJCas.addFsToIndexes(aAnnos.get(i));
                        setAnnoRefPerUnit(aUnit, aType, aRef, aAnnos.get(i));
                    }
                    aRef++;
                }
                i++;
            }
        }
        return aRef;
    }

    /**
     * Unescape Java escape sequences in a feature value; null-safe.
     */
    private String getEscapeChars(String aAnno)
    {
        if (aAnno == null) {
            return null;
        }
        return unescapeJava(aAnno);
    }

    /**
     * update a base annotation with slot annotations
     *
     * @param linkFSesPerAnno
     *            contains list of slot annotations per a base annotation
     * @param aLinkeF
     *            The link slot annotation feature
     */
    private void addSlotAnnotations(Map<AnnotationFS, List<FeatureStructure>> linkFSesPerAnno,
            Feature aLinkeF)
    {
        for (AnnotationFS anno : linkFSesPerAnno.keySet()) {
            ArrayFS array = anno.getCAS().createArrayFS(linkFSesPerAnno.get(anno).size());
            array.copyFromArray(
                    linkFSesPerAnno.get(anno)
                            .toArray(new FeatureStructure[linkFSesPerAnno.get(anno).size()]),
                    0, 0, linkFSesPerAnno.get(anno).size());
            anno.setFeatureValue(aLinkeF, array);
            anno.getCAS().addFsToIndexes(anno);
        }
    }

    /**
     * Gets annotations from lines (of {@link AnnotationUnit}s) and save for the later access,
     * while reading the document the first time. <br>
     *
     * @param lines
     *            TSV lines exported from WebAnno
     * @param unit
     *            the annotation unit (Token or sub-tokens)
     * @param ind
     *            index of the annotation, from the TAB separated annotations in the TSV lines
     */
    private void setAnnosPerTypePerUnit(String[] lines, AnnotationUnit unit, int ind)
    {
        for (Type type : allLayers.keySet()) {
            annotationsPerPostion.putIfAbsent(type, new LinkedHashMap<>());
            if (allLayers.get(type).size() == 0) {
                // layer without features: a single placeholder column
                annotationsPerPostion.get(type).put(unit,
                        annotationsPerPostion.get(type).getOrDefault(unit, new ArrayList<>()));
                annotationsPerPostion.get(type).get(unit).add(lines[ind]);
                ind++;
                continue;
            }
            for (Feature f : allLayers.get(type)) {
                annotationsPerPostion.get(type).put(unit,
                        annotationsPerPostion.get(type).getOrDefault(unit, new ArrayList<>()));
                annotationsPerPostion.get(type).get(unit).add(lines[ind]);
                ind++;
            }
            // Add at the last position the line number
            // It will be used to get Annotation unit
            annotationsPerPostion.get(type).get(unit).add(lines[0]);
        }
    }

    /**
     * Pre-create one annotation FS per stacked value for each unit of each layer; filled in
     * later by {@link #addAnnotations}.
     */
    private void setAnnosPerUnit(JCas aJCas,
            Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> aAnnosPerTypePerUnit)
    {
        for (Type type : annotationsPerPostion.keySet()) {
            Map<AnnotationUnit, List<AnnotationFS>> annosPerUnit = new HashMap<>();
            for (AnnotationUnit unit : annotationsPerPostion.get(type).keySet()) {
                int begin = unit.begin;
                int end = unit.end;
                List<AnnotationFS> annos = new ArrayList<>();
                // if there are multiple annos
                int multAnnos = 1;
                for (String anno : annotationsPerPostion.get(type).get(unit)) {
                    String stackedAnnoRegex = "(?<!\\\\)" + Pattern.quote("|");
                    if (anno.split(stackedAnnoRegex).length > multAnnos) {
                        multAnnos = anno.split(stackedAnnoRegex).length;
                    }
                }
                for (int i = 0; i < multAnnos; i++) {
                    annos.add(aJCas.getCas().createAnnotation(type, begin, end));
                }
                annosPerUnit.put(unit, annos);
            }
            aAnnosPerTypePerUnit.put(type, annosPerUnit);
        }
    }

    /**
     * Record the annotation FS under its (type, unit, reference-number) key for later lookup
     * by relation and slot resolution.
     */
    private void setAnnoRefPerUnit(AnnotationUnit unit, Type type, int ref, AnnotationFS aAnnoFs)
    {
        annoUnitperAnnoFs.putIfAbsent(type, new HashMap<>());
        annoUnitperAnnoFs.get(type).putIfAbsent(unit, new HashMap<>());
        annoUnitperAnnoFs.get(type).get(unit).put(ref, aAnnoFs);
    }

    /**
     * Create a Token (unless the line is a sub-token) and its {@link AnnotationUnit}, and
     * register both in the lookup maps.
     */
    private AnnotationUnit createTokens(JCas aJCas, String[] lines, int begin, int end)
    {
        // subtokens should not be consider as tokens. example 1-2.1 ==> subtoken under token 2
        if (!lines[0].contains(".")) {
            Token token = new Token(aJCas, begin, end);
            AnnotationUnit unit = new AnnotationUnit(begin, end, false, "");
            units.add(unit);
            token.addToIndexes();
            token2Units.put(lines[0], unit);
            units2Tokens.put(unit, token);
            return unit;
        }
        else {
            AnnotationUnit unit = new AnnotationUnit(begin, end, true, "");
            units.add(unit);
            token2Units.put(lines[0], unit);
            return unit;
        }
    }

    /**
     * Append the sentence text to the covered document text (padding any offset gap with
     * spaces) and index a {@link Sentence} annotation.
     */
    private void createSentence(JCas aJCas, String aLine, int aBegin, int aEnd, int aPrevEnd)
    {
        // If the next sentence immediately follows the last one without any space or line break
        // in between, then we need to chop off again the linebreak that we added at the end of
        // the last sentence - otherwise offsets will be off on a round-trip.
        if (aPrevEnd == aBegin && coveredText.length() > 0
                && (coveredText.charAt(coveredText.length() - 1) == '\n')) {
            coveredText.deleteCharAt(coveredText.length() - 1);
        }
        if (aPrevEnd + 1 < aBegin) {
            // pad the gap between sentences with spaces
            // (resolves the former FIXME about the slow char-by-char concatenation loop)
            String pad = StringUtils.repeat(' ', aBegin - aPrevEnd - 1);
            coveredText.append(pad + aLine + LF);
        }
        else {
            coveredText.append(aLine + LF);
        }
        Sentence sentence = new Sentence(aJCas, aBegin, aEnd);
        sentence.addToIndexes();
    }

    /**
     * Get the type and feature information from the TSV file header
     *
     * @param header
     *            the header line
     * @throws IOException
     *             If the type or the feature do not exist in the CAs
     */
    private void setLayerAndFeature(JCas aJcas, String header)
        throws IOException
    {
        try {
            StringTokenizer headerTk = new StringTokenizer(header, "#");
            while (headerTk.hasMoreTokens()) {
                String layerNames = headerTk.nextToken().trim();
                StringTokenizer layerTk = new StringTokenizer(layerNames, "|");

                Set<Feature> features = new LinkedHashSet<Feature>();
                String layerName = layerTk.nextToken().trim();
                layerName = layerName.substring(layerName.indexOf("=") + 1);

                Iterator<Type> types = aJcas.getTypeSystem().getTypeIterator();
                boolean layerExists = false;
                while (types.hasNext()) {
                    if (types.next().getName().equals(layerName)) {
                        layerExists = true;
                        break;
                    }
                }
                if (!layerExists) {
                    throw new IOException(fileName + " This is not a valid TSV File. The layer "
                            + layerName + " is not created in the project.");
                }
                Type layer = CasUtil.getType(aJcas.getCas(), layerName);

                // if the layer do not have a feature, just update columns count for the place
                // holder
                if (!layerTk.hasMoreTokens()) {
                    columns++;
                    allLayers.put(layer, features);
                    layerMaps.put(layerMaps.size() + 1, layer);
                    return;
                }
                while (layerTk.hasMoreTokens()) {
                    String ft = layerTk.nextToken().trim();
                    columns++;
                    Feature feature;
                    if (ft.startsWith(BT)) {
                        // relation layer: the column refers to the dependent's base type
                        feature = layer.getFeatureByBaseName(DEPENDENT);
                        depFeatures.put(layer, feature);
                        depTypess.put(layer, CasUtil.getType(aJcas.getCas(), ft.substring(3)));
                    }
                    else {
                        feature = layer.getFeatureByBaseName(ft);
                    }
                    if (ft.startsWith(ROLE)) {
                        // slot feature: "ROLE_<name>_<linkType>" followed by the target type
                        ft = ft.substring(5);
                        String t = layerTk.nextToken().toString();
                        columns++;
                        Type tType = CasUtil.getType(aJcas.getCas(), t);
                        String fName = ft.substring(0, ft.indexOf("_"));
                        Feature slotF = layer
                                .getFeatureByBaseName(fName.substring(fName.indexOf(":") + 1));
                        if (slotF == null) {
                            throw new IOException(fileName
                                    + " This is not a valid TSV File. The feature " + ft
                                    + " is not created for the layer " + layerName);
                        }
                        features.add(slotF);
                        roleLinks.put(slotF, tType);

                        Type slotType = CasUtil.getType(aJcas.getCas(),
                                ft.substring(ft.indexOf("_") + 1));
                        Feature tFeatore = slotType.getFeatureByBaseName("target");
                        if (tFeatore == null) {
                            throw new IOException(fileName
                                    + " This is not a valid TSV File. The feature " + ft
                                    + " is not created for the layer " + layerName);
                        }
                        roleTargets.put(tFeatore, tType);
                        features.add(tFeatore);
                        slotLinkTypes.put(slotF, slotType);
                        continue;
                    }
                    if (feature == null) {
                        throw new IOException(fileName
                                + " This is not a valid TSV File. The feature " + ft
                                + " is not created for the layer " + layerName);
                    }
                    features.add(feature);
                }
                allLayers.put(layer, features);
                layerMaps.put(layerMaps.size() + 1, layer);
            }
        }
        catch (Exception e) {
            // FIX: preserve the original exception as the cause instead of dropping it
            throw new IOException(e.getMessage() + "\nTSV header:\n" + header, e);
        }
    }

    /**
     * Source document encoding.
     */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    @Override
    public void getNext(JCas aJCas)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        initCas(aJCas, res);
        // try-with-resources replaces the manual null-check/closeQuietly pattern;
        // getNext already declares IOException, so a close failure surfaces correctly
        try (InputStream is = res.getInputStream()) {
            convertToCas(aJCas, is, encoding);
        }
    }
}