/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.conll;

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.io.IobDecoder;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk;

/**
 * Reads the CoNLL 2003 format: one token per line with four space-separated columns
 * (form, POS tag, chunk tag in IOB format, named entity tag in IOB format); sentences
 * are separated by blank lines.
 *
 * @see <a href="http://www.cnts.ua.ac.be/conll2003/ner/">CoNLL 2003 shared task</a>
 */
@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2003})
@TypeCapability(
        outputs = {
                "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
                "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
                "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk",
                "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity" })
public class Conll2003Reader
    extends JCasResourceCollectionReader_ImplBase
{
    // Column indices in the four-column CoNLL 2003 format.
    private static final int FORM = 0;
    private static final int POSTAG = 1;
    private static final int CHUNK = 2;
    private static final int NAMED_ENTITY = 3;

    /**
     * Character encoding of the input data.
     */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /**
     * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid
     * spamming the heap with thousands of strings representing only a few different tags.
     *
     * Default: {@code true}
     */
    public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS;
    @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true")
    private boolean internTags;

    /**
     * Write part-of-speech information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS;
    @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true")
    private boolean posEnabled;

    /**
     * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the
     * tag set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
    @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
    protected String posTagset;

    /**
     * Load the part-of-speech tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION =
            ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Write chunk information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_CHUNK = ComponentParameters.PARAM_READ_CHUNK;
    @ConfigurationParameter(name = PARAM_READ_CHUNK, mandatory = true, defaultValue = "true")
    private boolean chunkEnabled;

    /**
     * Use this chunk tag set to use to resolve the tag set mapping instead of using the
     * tag set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_CHUNK_TAG_SET = ComponentParameters.PARAM_CHUNK_TAG_SET;
    @ConfigurationParameter(name = PARAM_CHUNK_TAG_SET, mandatory = false)
    protected String chunkTagset;

    /**
     * Load the chunk tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_CHUNK_MAPPING_LOCATION =
            ComponentParameters.PARAM_CHUNK_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_CHUNK_MAPPING_LOCATION, mandatory = false)
    protected String chunkMappingLocation;

    /**
     * Read named entity information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_READ_NAMED_ENTITY =
            ComponentParameters.PARAM_READ_NAMED_ENTITY;
    @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true,
            defaultValue = "true")
    private boolean namedEntityEnabled;

    /**
     * Location of the mapping file for named entity tags to UIMA types.
     */
    public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION =
            ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false)
    private String namedEntityMappingLocation;

    private MappingProvider posMappingProvider;
    private MappingProvider chunkMappingProvider;
    private MappingProvider namedEntityMappingProvider;

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                posTagset, getLanguage());

        chunkMappingProvider = MappingProviderFactory.createChunkMappingProvider(
                chunkMappingLocation, chunkTagset, getLanguage());

        // There is no named entity mapping file yet, so the default location intentionally
        // points to a non-existent resource; only an explicit override takes effect.
        namedEntityMappingProvider = new MappingProvider();
        namedEntityMappingProvider.setDefault(MappingProvider.LOCATION,
                "classpath:/there/is/no/mapping/yet");
        namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE,
                NamedEntity.class.getName());
        namedEntityMappingProvider.setOverride(MappingProvider.LOCATION,
                namedEntityMappingLocation);
        namedEntityMappingProvider.setOverride(MappingProvider.LANGUAGE, getLanguage());
    }

    @Override
    public void getNext(JCas aJCas)
        throws IOException, CollectionException
    {
        // Configure only the mapping providers that are actually needed for the enabled layers.
        try {
            if (posEnabled) {
                posMappingProvider.configure(aJCas.getCas());
            }
            if (chunkEnabled) {
                chunkMappingProvider.configure(aJCas.getCas());
            }
            if (namedEntityEnabled) {
                namedEntityMappingProvider.configure(aJCas.getCas());
            }
        }
        catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }

        Resource res = nextFile();
        initCas(aJCas, res);

        // try-with-resources guarantees the reader is closed even if convert() throws.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()),
                encoding))) {
            convert(aJCas, reader);
        }
    }

    /**
     * Convert the token/tag lines from the reader into CAS annotations: tokens, optional POS
     * tags, and IOB-decoded chunk and named entity spans. Sentences are separated by newlines
     * in the generated document text.
     */
    private void convert(JCas aJCas, BufferedReader aReader)
        throws IOException
    {
        JCasBuilder doc = new JCasBuilder(aJCas);

        Type chunkType = JCasUtil.getType(aJCas, Chunk.class);
        Feature chunkValue = chunkType.getFeatureByBaseName("chunkValue");
        IobDecoder chunkDecoder = new IobDecoder(aJCas.getCas(), chunkValue,
                chunkMappingProvider);
        chunkDecoder.setInternTags(internTags);

        Type namedEntityType = JCasUtil.getType(aJCas, NamedEntity.class);
        Feature namedEntityValue = namedEntityType.getFeatureByBaseName("value");
        IobDecoder neDecoder = new IobDecoder(aJCas.getCas(), namedEntityValue,
                namedEntityMappingProvider);
        neDecoder.setInternTags(internTags);

        List<String[]> words;
        while ((words = readSentence(aReader)) != null) {
            if (words.isEmpty()) {
                // Ignore consecutive blank lines.
                continue;
            }

            int sentenceBegin = doc.getPosition();
            int sentenceEnd = sentenceBegin;

            List<Token> tokens = new ArrayList<Token>();
            String[] chunkTags = new String[words.size()];
            String[] namedEntityTags = new String[words.size()];

            // Tokens, POS
            int i = 0;
            for (String[] word : words) {
                // Read token
                Token token = doc.add(word[FORM], Token.class);
                sentenceEnd = token.getEnd();
                doc.add(" ");

                if (posEnabled) {
                    Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                    POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                            token.getEnd());
                    // Honor PARAM_INTERN_TAGS; previously POS tags were interned
                    // unconditionally, ignoring the parameter.
                    pos.setPosValue(internTags ? word[POSTAG].intern() : word[POSTAG]);
                    // Only set a coarse value when the tag mapped to a POS subtype.
                    String coarseValue = pos.getClass().equals(POS.class) ? null
                            : posTag.getShortName();
                    pos.setCoarseValue(coarseValue != null && internTags ? coarseValue.intern()
                            : coarseValue);
                    pos.addToIndexes();
                    token.setPos(pos);
                }

                tokens.add(token);
                chunkTags[i] = word[CHUNK];
                namedEntityTags[i] = word[NAMED_ENTITY];
                i++;
            }

            if (chunkEnabled) {
                chunkDecoder.decode(tokens, chunkTags);
            }

            if (namedEntityEnabled) {
                neDecoder.decode(tokens, namedEntityTags);
            }

            // Sentence
            Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
            sentence.addToIndexes();

            // One sentence per line.
            doc.add("\n");
        }

        doc.close();
    }

    /**
     * Read a single sentence: all consecutive non-blank lines up to the next blank line,
     * each split into its four fields.
     *
     * @return the field arrays of the sentence (possibly empty for consecutive blank lines),
     *         or {@code null} at end of input.
     * @throws IOException if a line does not have exactly four space-separated fields.
     */
    private static List<String[]> readSentence(BufferedReader aReader)
        throws IOException
    {
        List<String[]> words = new ArrayList<String[]>();
        String line;
        while ((line = aReader.readLine()) != null) {
            if (StringUtils.isBlank(line)) {
                break; // End of sentence
            }
            String[] fields = line.split(" ");
            if (fields.length != 4) {
                throw new IOException(
                        "Invalid file format. Line needs to have 4 space-separated fields: ["
                                + line + "]");
            }
            words.add(fields);
        }

        // Distinguish "no more data" (null) from "blank separator line" (empty list).
        if (line == null && words.isEmpty()) {
            return null;
        }
        else {
            return words;
        }
    }
}