/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.fit.util.FSCollectionFactory; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense; import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeToJCasConverter; import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils; /** * Reads a file in the CoNLL-2012 format. * * @see <a href="http://conll.cemantix.org/2012/data.html">CoNLL 2012 Shared Task: * Modeling Multilingual Unrestricted Coreference in OntoNotes</a> */ @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2012}) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg"}) public class Conll2012Reader extends JCasResourceCollectionReader_ImplBase { public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") private String encoding; public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPos; /** * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; /** * Disabled by default because CoNLL 2012 format does not include lemmata for all words, only * for predicates. */ public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "false") private boolean readLemma; public static final String PARAM_READ_SEMANTIC_PREDICATE = "readSemanticPredicate"; @ConfigurationParameter(name = PARAM_READ_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean readSemanticPredicate; public static final String PARAM_READ_WORD_SENSE = "readWordSense"; @ConfigurationParameter(name = PARAM_READ_WORD_SENSE, mandatory = true, defaultValue = "true") private boolean readWordSense; public static final String PARAM_READ_CONSTITUENT = ComponentParameters.PARAM_READ_CONSTITUENT; @ConfigurationParameter(name = PARAM_READ_CONSTITUENT, mandatory = true, defaultValue = "true") private boolean readConstituent; public static final String PARAM_READ_COREFERENCE = ComponentParameters.PARAM_READ_COREFERENCE; @ConfigurationParameter(name = PARAM_READ_COREFERENCE, mandatory = true, defaultValue = "true") private boolean readCoreference; public static final String PARAM_READ_NAMED_ENTITY = ComponentParameters.PARAM_READ_NAMED_ENTITY; @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") private boolean readNamedEntity; /** * Use this constituent tag set to use to resolve the tag set mapping instead of using the * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ public static final String PARAM_CONSTITUENT_TAG_SET = ComponentParameters.PARAM_CONSTITUENT_TAG_SET; @ConfigurationParameter(name = PARAM_CONSTITUENT_TAG_SET, mandatory = false) protected String constituentTagset; /** * Load the constituent tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = ComponentParameters.PARAM_CONSTITUENT_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_CONSTITUENT_MAPPING_LOCATION, mandatory = false) protected String constituentMappingLocation; /** * Use the {@link String#intern()} method on tags. This is usually a good idea to avoid * spaming the heap with thousands of strings representing only a few different tags. * * Default: {@code true} */ public static final String PARAM_INTERN_TAGS = ComponentParameters.PARAM_INTERN_TAGS; @ConfigurationParameter(name = PARAM_INTERN_TAGS, mandatory = false, defaultValue = "true") private boolean internTags; public static final String PARAM_WRITE_TRACES_TO_TEXT = "writeTracesToText"; @ConfigurationParameter(name = PARAM_WRITE_TRACES_TO_TEXT, mandatory = false, defaultValue = "false") private boolean writeTracesToText; /** * Use the document ID declared in the file header instead of using the filename. */ public static final String PARAM_USE_HEADER_METADATA = "useHeaderMetadata"; @ConfigurationParameter(name = PARAM_USE_HEADER_METADATA, mandatory = true, defaultValue = "true") private boolean useHeaderMetadata; private static final String UNUSED = "-"; // private static final int DOCUMENT_ID = 0; // Ignored // private static final int PART_NUMBER = 1; // Ignored private static final int ID = 2; private static final int FORM = 3; private static final int POS = 4; private static final int PARSE = 5; private static final int LEMMA = 6; private static final int PRED = 7; private static final int WORD_SENSE = 8; // private static final int SPEAKER = 9; // Ignored private static final int NAMED_ENTITIES = 10; private static final int APRED = 11; private MappingProvider posMappingProvider; private MappingProvider constituentMappingProvider; private PennTreeToJCasConverter converter; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, posTagset, getLanguage()); constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider( constituentMappingLocation, constituentTagset, getLanguage()); converter = new PennTreeToJCasConverter(posMappingProvider, constituentMappingProvider); converter.setInternTags(internTags); converter.setWriteTracesToText(writeTracesToText); converter.setCreatePosTags(false); // We handle POS tags via the column already converter.setRootLabel("TOP"); } @Override public void getNext(JCas aJCas) throws IOException, CollectionException { Resource res = nextFile(); initCas(aJCas, res); BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader( CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), encoding)); convert(aJCas, reader); } finally { closeQuietly(reader); } } public void convert(JCas aJCas, BufferedReader aReader) throws IOException { try { if (readPos) { posMappingProvider.configure(aJCas.getCas()); } if (readConstituent) { constituentMappingProvider.configure(aJCas.getCas()); } } catch (AnalysisEngineProcessException e) { throw new IOException(e); } Map<String, CoreferenceLink> chains = new HashMap<>(); JCasBuilder doc = new JCasBuilder(aJCas); List<String[]> words; while ((words = readSentence(aJCas, aReader)) != null) { if (words.isEmpty()) { // Ignore empty sentences. This can happen when there are multiple end-of-sentence // markers following each other. continue; } int sentenceBegin = doc.getPosition(); int sentenceEnd = sentenceBegin; StringBuilder parse = new StringBuilder(); // Tokens, Lemma, POS Map<Integer, Token> tokenById = new HashMap<Integer, Token>(); List<SemPred> preds = new ArrayList<>(); Iterator<String[]> wordIterator = words.iterator(); while (wordIterator.hasNext()) { String[] word = wordIterator.next(); // Read token Token token = doc.add(word[FORM], Token.class); tokenById.put(Integer.valueOf(word[ID]), token); if (wordIterator.hasNext()) { doc.add(" "); } // Read lemma if (!UNUSED.equals(word[LEMMA]) && readLemma) { Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); lemma.setValue(word[LEMMA]); lemma.addToIndexes(); token.setLemma(lemma); } // Read part-of-speech tag if (!UNUSED.equals(word[POS]) && readPos) { Type posTag = posMappingProvider.getTagType(word[POS]); POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); pos.setPosValue(word[POS].intern()); pos.setCoarseValue(pos.getClass().equals(POS.class) ? null : posTag.getShortName().intern()); pos.addToIndexes(); token.setPos(pos); } if (!UNUSED.equals(word[PRED]) && readSemanticPredicate) { SemPred pred = new SemPred(aJCas, token.getBegin(), token.getEnd()); pred.setCategory(word[PRED]); pred.addToIndexes(); preds.add(pred); } if (!UNUSED.equals(word[PARSE]) && readConstituent) { String fixed = word[PARSE].replace("*", "(" + word[POS] + " " + word[FORM] + ")"); parse.append(fixed); } if (!UNUSED.equals(word[WORD_SENSE]) && readWordSense) { WordSense wordSense = new WordSense(aJCas, token.getBegin(), token.getEnd()); wordSense.setValue(word[WORD_SENSE]); wordSense.addToIndexes(); } if (!UNUSED.equals(word[word.length-1]) && readCoreference) { String[] chainFragments = word[word.length-1].split("\\|"); for (String chainFragment : chainFragments) { boolean beginning = chainFragment.startsWith("("); boolean ending = chainFragment.endsWith(")"); String chainId = chainFragment.substring(beginning ? 1 : 0, ending ? chainFragment.length() -1 : chainFragment.length()); CoreferenceLink link = chains.get(chainId); if (beginning) { if (link == null) { link = new CoreferenceLink(aJCas); CoreferenceChain chain = new CoreferenceChain(aJCas); chain.setFirst(link); chain.addToIndexes(); } else { CoreferenceLink newLink = new CoreferenceLink(aJCas); link.setNext(newLink); link = newLink; } link.setReferenceType(chainId); link.setBegin(token.getBegin()); } if (ending) { link.setEnd(token.getEnd()); link.addToIndexes(); } chains.put(chainId, link); } } sentenceEnd = token.getEnd(); } // Named entities if (readNamedEntity) { int currentNeBegin = -1; String currentNeType = null; for (int i = 0; i < words.size(); i++) { String ne = words.get(i)[NAMED_ENTITIES]; boolean beginning = ne.startsWith("("); boolean ending = ne.endsWith(")"); // When a NE is beginning, we remember what the NE is and where it began if (beginning) { // The NE is beginning with "(" and either ending with "(" or "*", so we trim // the first and last character currentNeType = ne.substring(1, ne.length()-1); currentNeBegin = i; } // We need to create an annotation if the current token is the end of an annotation if (ending) { // Determine begin and end of named entity int begin = tokenById.get(currentNeBegin).getBegin(); int end = tokenById.get(i).getEnd(); // Add named entity NamedEntity namedEntity = new NamedEntity(aJCas, begin, end); namedEntity.setValue(currentNeType); namedEntity.addToIndexes(); // Forget remembered named entity currentNeBegin = -1; currentNeType = null; } } } // Semantic arguments if (readSemanticPredicate) { // Get arguments for one predicate at a time for (int p = 0; p < preds.size(); p++) { SemPred pred = preds.get(p); List<SemArgLink> args = new ArrayList<>(); int currentArgBegin = -1; String currentArgType = null; for (int i = 0; i < words.size(); i++) { String ne = words.get(i)[APRED + p]; boolean beginning = ne.startsWith("("); boolean ending = ne.endsWith(")"); // When a arg is beginning, we remember what the NE is and where it began if (beginning) { // The arg is beginning with "(" and either ending with "(" or "*", so // we trim the first and last character currentArgType = ne.substring(1, ne.length()-1); currentArgBegin = i; } // We need to create an annotation if the current token is the end of an // annotation if (ending) { // Determine begin and end of argument int begin = tokenById.get(currentArgBegin).getBegin(); int end = tokenById.get(i).getEnd(); // Add named entity unless it is a (V*) which has the same offsets as // the predicate if (!(pred.getBegin() == begin && pred.getEnd() == end)) { SemArg arg = new SemArg(aJCas, begin, end); arg.addToIndexes(); SemArgLink link = new SemArgLink(aJCas); link.setRole(currentArgType); link.setTarget(arg); args.add(link); } // Forget remembered arg currentArgBegin = -1; currentArgType = null; } } pred.setArguments(FSCollectionFactory.createFSArray(aJCas, args)); } } // Sentence Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); sentence.addToIndexes(); converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString())); // Once sentence per line. doc.add("\n"); } doc.close(); } /** * Read a single sentence. */ private List<String[]> readSentence(JCas aJCas, BufferedReader aReader) throws IOException { List<String[]> words = new ArrayList<String[]>(); String line; while ((line = aReader.readLine()) != null) { if (StringUtils.isBlank(line)) { break; // End of sentence } if (line.startsWith("#")) { if (line.startsWith("#begin") && useHeaderMetadata) { Pattern pattern = Pattern.compile("^#begin document \\((.*)\\); part (\\d+)$"); Matcher matcher = pattern.matcher(line); if (matcher.matches()) { DocumentMetaData meta = DocumentMetaData.get(aJCas); meta.setDocumentId(matcher.group(1)+'#'+matcher.group(2)); } } // Comment/header line continue; } if (line.startsWith("<")) { // FinnTreeBank uses pseudo-XML to attach extra metadata to sentences. // Currently, we just ignore this. break; // Consider end of sentence } String[] fields = line.split("\\s+"); // if (fields.length != 10) { // throw new IOException( // "Invalid file format. Line needs to have 10 tab-separated fields, but it has " // + fields.length + ": [" + line + "]"); // } words.add(fields); } if (line == null && words.isEmpty()) { return null; } else { return words; } } }