/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.conll;
import static org.apache.commons.io.IOUtils.closeQuietly;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.io.IobDecoder;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
/**
* <p>Reads by default the CoNLL 2002 named entity format.</p>
*
* <p>The reader is also compatible with the CoNLL-based GermEval 2014 named entity format,
* in which the columns are separated by a tab, and there is an extra column for embedded named entities,
* besides the token number being put in the first column (see below).
* For that, additional parameters are provided, by which one can determine the column separator,
* whether there is an additional first column for token numbers, and whether embedded
* named entities should be read.
* (Note: Currently, the reader only reads the outer named entities, not the embedded ones.</p>
*
* <pre><code>
* The following snippet shows an example of the TSV format
* # http://de.wikipedia.org/wiki/Manfred_Korfmann [2009-10-17]
* 1 Aufgrund O O
* 2 seiner O O
* 3 Initiative O O
* 4 fand O O
* 5 2001/2002 O O
* 6 in O O
* 7 Stuttgart B-LOC O
* 8 , O O
* 9 Braunschweig B-LOC O
* 10 und O O
* 11 Bonn B-LOC O
* 12 eine O O
* 13 große O O
* 14 und O O
* 15 publizistisch O O
* 16 vielbeachtete O O
* 17 Troia-Ausstellung B-LOCpart O
* 18 statt O O
* 19 , O O
* 20 „ O O
* 21 Troia B-OTH B-LOC
* 22 - I-OTH O
* 23 Traum I-OTH O
* 24 und I-OTH O
* 25 Wirklichkeit I-OTH O
* 26 “ O O
* 27 . O O
* </code></pre>
*
* <ol>
* <li>WORD_NUMBER - token number</li>
* <li>FORM - token</li>
* <li>NER1 - outer named entity (BIO encoded)</li>
* <li>NER2 - embedded named entity (BIO encoded)</li>
* </ol>
* The sentence is encoded as one token per line, with information provided in tab-separated columns.
* The first column contains either a #, which signals the source the sentence is cited from and the date it was retrieved,
* or the token number within the sentence. The second column contains the token.
* Name spans are encoded in the BIO-scheme. Outer spans are encoded in the third column,
* embedded spans in the fourth column.
*
* @see <a href="http://www.clips.ua.ac.be/conll2002/ner/">CoNLL 2002 shared task</a>
* @see <a href="https://sites.google.com/site/germeval2014ner/data">GermEval 2014 NER task</a>
*/
@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2002, MimeTypes.TEXT_X_GERMEVAL_2014})
@TypeCapability(
outputs = {
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
"de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
"de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"})
public class Conll2002Reader
extends JCasResourceCollectionReader_ImplBase
{
/**
*
* Column Separators
*
*/
public enum ColumnSeparators
{
SPACE("space", " "),
TAB("tab", "\t"),
INVALID("", "");
private String name;
private String value;
private ColumnSeparators(String aName, String aValue)
{
name = aName;
value = aValue;
}
public String getName()
{
return name;
}
private String getValue()
{
return value;
}
private static ColumnSeparators getInstance(String Name) {
for (ColumnSeparators cs : ColumnSeparators.values()) {
if (Name.equals(cs.getName())) {
return cs;
}
}
return INVALID;
}
}
/**
* Column separator
*/
ColumnSeparators columnSeparator;
/**
* Column positions
*/
private int FORM = 0;
private int IOB = 1;
/**
* Column separator parameter. Acceptable input values come from {@link ColumnSeparators}.<br>
* Example usage: if you want to define 'tab' as the column separator the following value should be input for
* this parameter {@code Conll2002Reader.ColumnSeparators.TAB.getName()}
*/
public static final String PARAM_COLUMN_SEPARATOR = "columnSeparator";
@ConfigurationParameter(name = PARAM_COLUMN_SEPARATOR, mandatory = false, defaultValue = "space")
private String columnSeparatorName;
/**
* Token number flag. When true, the first column contains the token number
* inside the sentence (as in GermEval 2014 format)
*/
public static final String PARAM_HAS_TOKEN_NUMBER = "hasTokenNumber";
@ConfigurationParameter(name = PARAM_HAS_TOKEN_NUMBER, mandatory = false, defaultValue = "false")
private boolean hasTokenNumber;
/**
* Indicates that there is a header line before the sentence
*/
public static final String PARAM_HAS_HEADER = "hasHeader";
@ConfigurationParameter(name = PARAM_HAS_HEADER, mandatory = false, defaultValue = "false")
private boolean hasHeader;
/**
* Character encoding of the input data.
*/
public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
@ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
private String encoding;
/**
* Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
* spamming the heap with thousands of strings representing only a few different tags.
*
* Default: {@code true}
*/
public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
@ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
private boolean internTags;
/**
* Read named entity information.
*
* Default: {@code true}
*/
public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY;
@ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true")
private boolean namedEntityEnabled;
/**
* Has embedded named entity extra column.
*
* Default: {@code false}
*/
public static final String PARAM_HAS_EMBEDDED_NAMED_ENTITY = "hasEmbeddedNamedEntity";
@ConfigurationParameter(name = PARAM_HAS_EMBEDDED_NAMED_ENTITY, mandatory = false, defaultValue = "false")
private boolean hasEmbeddedNamedEntity;
/**
* Location of the mapping file for named entity tags to UIMA types.
*/
public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION;
@ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false)
private String namedEntityMappingLocation;
private MappingProvider namedEntityMappingProvider;
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException
{
super.initialize(aContext);
namedEntityMappingProvider = new MappingProvider();
namedEntityMappingProvider.setDefault(MappingProvider.LOCATION, "classpath:/there/is/no/mapping/yet");
namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE, NamedEntity.class.getName());
namedEntityMappingProvider.setOverride(MappingProvider.LOCATION, namedEntityMappingLocation);
namedEntityMappingProvider.setOverride(MappingProvider.LANGUAGE, getLanguage());
// Configure column positions. First column may be used for token number
FORM = hasTokenNumber?1:0;
IOB = hasTokenNumber?2:1;
// Configure column separator
columnSeparator = ColumnSeparators.getInstance(columnSeparatorName);
if (columnSeparator == ColumnSeparators.INVALID) {
Object[] params = {columnSeparatorName, PARAM_COLUMN_SEPARATOR};
throw new ResourceInitializationException(
ResourceInitializationException.RESOURCE_DATA_NOT_VALID, params);
}
}
@Override
public void getNext(JCas aJCas)
throws IOException, CollectionException
{
try{
if (namedEntityEnabled) {
namedEntityMappingProvider.configure(aJCas.getCas());
}
}
catch(AnalysisEngineProcessException e){
throw new IOException(e);
}
Resource res = nextFile();
initCas(aJCas, res);
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(
CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()),
encoding));
convert(aJCas, reader);
}
finally {
closeQuietly(reader);
}
}
private void convert(JCas aJCas, BufferedReader aReader)
throws IOException
{
JCasBuilder doc = new JCasBuilder(aJCas);
Type namedEntityType = JCasUtil.getType(aJCas, NamedEntity.class);
Feature namedEntityValue = namedEntityType.getFeatureByBaseName("value");
IobDecoder decoder = new IobDecoder(aJCas.getCas(), namedEntityValue, namedEntityMappingProvider);
decoder.setInternTags(internTags);
List<String[]> words;
while ((words = readSentence(aReader)) != null) {
if (words.isEmpty()) {
continue;
}
int sentenceBegin = doc.getPosition();
int sentenceEnd = sentenceBegin;
List<Token> tokens = new ArrayList<Token>();
String[] namedEntityTags = new String[words.size()];
// Tokens, POS
int i = 0;
Iterator<String[]> wordIterator = words.iterator();
while (wordIterator.hasNext()) {
String[] word = wordIterator.next();
// Read token
Token token = doc.add(word[FORM], Token.class);
sentenceEnd = token.getEnd();
if (wordIterator.hasNext()) {
doc.add(" ");
}
tokens.add(token);
namedEntityTags[i] = word[IOB];
i++;
}
if (namedEntityEnabled) {
decoder.decode(tokens, namedEntityTags);
}
// Sentence
Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
sentence.addToIndexes();
// Once sentence per line.
doc.add("\n");
}
doc.close();
}
/**
* Read a single sentence.
*/
private List<String[]> readSentence(BufferedReader aReader)
throws IOException
{
List<String[]> words = new ArrayList<String[]>();
String line;
boolean beginSentence = true;
while ((line = aReader.readLine()) != null) {
if (StringUtils.isBlank(line)) {
beginSentence = true;
break; // End of sentence
}
if (hasHeader && beginSentence) {
// Ignore header line
beginSentence = false;
continue;
}
String[] fields = line.split(columnSeparator.getValue());
if (!hasEmbeddedNamedEntity && fields.length != 2 + FORM) {
throw new IOException(String.format(
"Invalid file format. Line needs to have %d %s-separated fields: [%s]", 2 + FORM,
columnSeparator.getName(), line));
}
else if (hasEmbeddedNamedEntity && fields.length != 3 + FORM) {
throw new IOException(String.format(
"Invalid file format. Line needs to have %d %s-separated fields: [%s]", 3 + FORM,
columnSeparator.getName(), line));
}
words.add(fields);
}
if (line == null && words.isEmpty()) {
return null;
}
else {
return words;
}
}
}