/* * Copyright 2012 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.clarin.webanno.tsv; import static org.apache.commons.io.IOUtils.closeQuietly; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.StringTokenizer; import org.apache.commons.io.IOUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; import org.apache.uima.cas.CAS; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.util.Level; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; /** * Reads a specific TSV File (9 TAB separated) annotation and change it to CAS object. Example of * Input Files: <br> * 1 Heutzutage heutzutage ADV _ _ 2 ADV _ _ <br> * Columns are separated by a TAB character and sentences are separated by a blank new line. * * */ public class WebannoTsv1Reader extends JCasResourceCollectionReader_ImplBase { private String fileName; public void convertToCas(JCas aJCas, InputStream aIs, String aEncoding) throws IOException { StringBuilder text = new StringBuilder(); Map<Integer, String> tokens = new HashMap<Integer, String>(); Map<Integer, String> pos = new HashMap<Integer, String>(); Map<Integer, String> lemma = new HashMap<Integer, String>(); Map<Integer, String> namedEntity = new HashMap<Integer, String>(); Map<Integer, String> dependencyFunction = new HashMap<Integer, String>(); Map<Integer, Integer> dependencyDependent = new HashMap<Integer, Integer>(); List<Integer> firstTokenInSentence = new ArrayList<Integer>(); DocumentMetaData documentMetadata = DocumentMetaData.get(aJCas); fileName = documentMetadata.getDocumentTitle(); setAnnotations(aIs, aEncoding, text, tokens, pos, lemma, namedEntity, dependencyFunction, dependencyDependent, firstTokenInSentence); aJCas.setDocumentText(text.toString()); Map<String, Token> tokensStored = new HashMap<String, Token>(); createToken(aJCas, text, tokens, pos, lemma, tokensStored); createNamedEntity(namedEntity, aJCas, tokens, tokensStored); createDependency(aJCas, tokens, dependencyFunction, dependencyDependent, tokensStored); createSentence(aJCas, firstTokenInSentence, tokensStored); } /** * Create {@link Token} in the {@link CAS}. If the lemma and pos columns are not empty it will * create {@link Lemma} and {@link POS} annotations */ private void createToken(JCas aJCas, StringBuilder text, Map<Integer, String> tokens, Map<Integer, String> pos, Map<Integer, String> lemma, Map<String, Token> tokensStored) { int tokenBeginPosition = 0; int tokenEndPosition = 0; for (int i = 1; i <= tokens.size(); i++) { tokenBeginPosition = text.indexOf(tokens.get(i), tokenBeginPosition); Token outToken = new Token(aJCas, tokenBeginPosition, text.indexOf(tokens.get(i), tokenBeginPosition) + tokens.get(i).length()); tokenEndPosition = text.indexOf(tokens.get(i), tokenBeginPosition) + tokens.get(i).length(); tokenBeginPosition = tokenEndPosition; outToken.addToIndexes(); // Add pos to CAS if exist if (!pos.get(i).equals("_")) { POS outPos = new POS(aJCas, outToken.getBegin(), outToken.getEnd()); outPos.setPosValue(pos.get(i)); outPos.addToIndexes(); outToken.setPos(outPos); } // Add lemma if exist if (!lemma.get(i).equals("_")) { Lemma outLemma = new Lemma(aJCas, outToken.getBegin(), outToken.getEnd()); outLemma.setValue(lemma.get(i)); outLemma.addToIndexes(); outToken.setLemma(outLemma); } tokensStored.put("t_" + i, outToken); } } /** * add dependency parsing to CAS */ private void createDependency(JCas aJCas, Map<Integer, String> tokens, Map<Integer, String> dependencyFunction, Map<Integer, Integer> dependencyDependent, Map<String, Token> tokensStored) { for (int i = 1; i <= tokens.size(); i++) { if (dependencyFunction.get(i) != null) { Dependency outDependency = new Dependency(aJCas); outDependency.setDependencyType(dependencyFunction.get(i)); // if span A has (start,end)= (20, 26) and B has (start,end)= (30, 36) // arc drawn from A to B, dependency will have (start, end) = (20, 36) // arc drawn from B to A, still dependency will have (start, end) = (20, 36) int begin = 0, end = 0; // if not ROOT if (dependencyDependent.get(i) != 0) { begin = tokensStored.get("t_" + i).getBegin() > tokensStored.get( "t_" + dependencyDependent.get(i)).getBegin() ? tokensStored.get( "t_" + dependencyDependent.get(i)).getBegin() : tokensStored.get( "t_" + i).getBegin(); end = tokensStored.get("t_" + i).getEnd() < tokensStored.get( "t_" + dependencyDependent.get(i)).getEnd() ? tokensStored.get( "t_" + dependencyDependent.get(i)).getEnd() : tokensStored .get("t_" + i).getEnd(); } else { begin = tokensStored.get("t_" + i).getBegin(); end = tokensStored.get("t_" + i).getEnd(); } outDependency.setBegin(begin); outDependency.setEnd(end); outDependency.setDependent(tokensStored.get("t_" + i)); if (dependencyDependent.get(i) == 0) { outDependency.setGovernor(tokensStored.get("t_" + i)); } else { outDependency.setGovernor(tokensStored.get("t_" + dependencyDependent.get(i))); } outDependency.addToIndexes(); } } } /** * Add sentence layer to CAS */ private void createSentence(JCas aJCas, List<Integer> firstTokenInSentence, Map<String, Token> tokensStored) { for (int i = 0; i < firstTokenInSentence.size(); i++) { Sentence outSentence = new Sentence(aJCas); // Only last sentence, and no the only sentence in the document (i!=0) if (i == firstTokenInSentence.size() - 1 && i != 0) { outSentence.setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)).getEnd()); outSentence.setEnd(tokensStored.get("t_" + (tokensStored.size())).getEnd()); outSentence.addToIndexes(); break; } if (i == firstTokenInSentence.size() - 1 && i == 0) { outSentence.setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)) .getBegin()); outSentence.setEnd(tokensStored.get("t_" + (tokensStored.size())).getEnd()); outSentence.addToIndexes(); } else if (i == 0) { outSentence.setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)) .getBegin()); outSentence.setEnd(tokensStored.get("t_" + firstTokenInSentence.get(i + 1)) .getEnd()); outSentence.addToIndexes(); } else { outSentence .setBegin(tokensStored.get("t_" + firstTokenInSentence.get(i)).getEnd() + 1); outSentence.setEnd(tokensStored.get("t_" + firstTokenInSentence.get(i + 1)) .getEnd()); outSentence.addToIndexes(); } } } /** * Iterate through all lines and get available annotations<br> * First column is sentence number and a blank new line marks end of a sentence<br> * The Second column is the token <br> * The third column is the lemma annotation <br> * The fourth column is the POS annotation <br> * The fifth column is used for Named Entity annotations (Multiple annotations separeted by | * character) <br> * The sixth column is the origin token number of dependency parsing <br> * The seventh column is the function/type of the dependency parsing <br> * eighth and ninth columns are undefined currently */ private void setAnnotations(InputStream aIs, String aEncoding, StringBuilder text, Map<Integer, String> tokens, Map<Integer, String> pos, Map<Integer, String> lemma, Map<Integer, String> namedEntity, Map<Integer, String> dependencyFunction, Map<Integer, Integer> dependencyDependent, List<Integer> firstTokenInSentence) throws IOException { int tokenNumber = 0; boolean first = true; int base = 0; LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding); boolean textFound = false; StringBuffer tmpText = new StringBuffer(); while (lineIterator.hasNext()) { String line = lineIterator.next().trim(); if (line.startsWith("#text=")) { text.append(line.substring(6) + "\n"); textFound = true; continue; } if (line.startsWith("#")) { continue;// it is a comment line } int count = StringUtils.countMatches(line, "\t"); if (line.isEmpty()) { continue; } if (count != 9) {// not a proper TSV file getUimaContext().getLogger().log(Level.INFO, "This is not a valid TSV File"); throw new IOException(fileName + " This is not a valid TSV File"); } StringTokenizer lineTk = new StringTokenizer(line, "\t"); if (first) { tokenNumber = Integer.parseInt(line.substring(0, line.indexOf("\t"))); firstTokenInSentence.add(tokenNumber); first = false; } else { int lineNumber = Integer.parseInt(line.substring(0, line.indexOf("\t"))); if (lineNumber == 1) { base = tokenNumber; firstTokenInSentence.add(base); } tokenNumber = base + Integer.parseInt(line.substring(0, line.indexOf("\t"))); } while (lineTk.hasMoreElements()) { lineTk.nextToken(); String token = lineTk.nextToken(); // for backward compatibility tmpText.append(token + " "); tokens.put(tokenNumber, token); lemma.put(tokenNumber, lineTk.nextToken()); pos.put(tokenNumber, lineTk.nextToken()); String ne = lineTk.nextToken(); lineTk.nextToken();// make it compatible with prev WebAnno TSV reader namedEntity.put(tokenNumber, (ne.equals("_")||ne.equals("-")) ? "O" : ne); String dependentValue = lineTk.nextToken(); if (NumberUtils.isDigits(dependentValue)) { int dependent = Integer.parseInt(dependentValue); dependencyDependent.put(tokenNumber, dependent == 0 ? 0 : base + dependent); dependencyFunction.put(tokenNumber, lineTk.nextToken()); } else { lineTk.nextToken(); } lineTk.nextToken(); lineTk.nextToken(); } } if (!textFound) { text.append(tmpText); } } public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") private String encoding; @Override public void getNext(JCas aJCas) throws IOException, CollectionException { Resource res = nextFile(); initCas(aJCas, res); InputStream is = null; try { is = res.getInputStream(); convertToCas(aJCas, is, encoding); } finally { closeQuietly(is); } } /** * Creates Named Entities from CoNLL BIO format to CAS format */ private void createNamedEntity(Map<Integer, String> aNamedEntityMap, JCas aJCas, Map<Integer, String> aTokensMap, Map<String, Token> aJcasTokens) { Map<Integer, NamedEntity> indexedNeAnnos = new LinkedHashMap<Integer, NamedEntity>(); for (int i = 1; i <= aTokensMap.size(); i++) { if (aNamedEntityMap.get(i).equals("O")) { continue; } int index = 1;// to maintain multiple span ne annotation in the same index for (String ne : aNamedEntityMap.get(i).split("\\|")) { if (ne.equals("O")) {// for annotations such as B_LOC|O|I_PER and the like index++; } else if (ne.startsWith("B_") || ne.startsWith("B-")) { NamedEntity outNamedEntity = new NamedEntity(aJCas, aJcasTokens.get("t_" + i) .getBegin(), aJcasTokens.get("t_" + i).getEnd()); outNamedEntity.setValue(ne.substring(2)); outNamedEntity.addToIndexes(); indexedNeAnnos.put(index, outNamedEntity); index++; } else if (ne.startsWith("I_")||ne.startsWith("I-")) { NamedEntity outNamedEntity = indexedNeAnnos.get(index); outNamedEntity.setEnd(aJcasTokens.get("t_" + i).getEnd()); outNamedEntity.addToIndexes(); index++; } else {// NE is not in IOB format. store one NE per token. No way to detect multiple // token NE NamedEntity outNamedEntity = new NamedEntity(aJCas, aJcasTokens.get("t_" + i) .getBegin(), aJcasTokens.get("t_" + i).getEnd()); outNamedEntity.setValue(ne); outNamedEntity.addToIndexes(); indexedNeAnnos.put(index, outNamedEntity); index++; } } } } }