/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.conll; import static org.apache.commons.io.IOUtils.closeQuietly; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Feature; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; import it.unimi.dsi.fastutil.ints.Int2ObjectMap; import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; /** * Reads a file in the CoNLL-U format. * * @see <a href="http://universaldependencies.github.io/docs/format.html">CoNLL-U Format</a> */ @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_U}) @TypeCapability( outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) public class ConllUReader extends JCasResourceCollectionReader_ImplBase { public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") private String encoding; public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") private boolean readPos; public static final String PARAM_READ_CPOS = ComponentParameters.PARAM_READ_CPOS; @ConfigurationParameter(name = PARAM_READ_CPOS, mandatory = true, defaultValue = "true") private boolean readCPos; public static final String PARAM_USE_CPOS_AS_POS = "useCPosAsPos"; @ConfigurationParameter(name = PARAM_USE_CPOS_AS_POS, mandatory = true, defaultValue = "false") private boolean useCPosAsPos; /** * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the * tag set defined as part of the model meta data. This can be useful if a custom model is * specified which does not have such meta data, or it can be used in readers. */ public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) protected String posTagset; /** * Load the part-of-speech tag to UIMA type mapping from this location instead of locating * the mapping automatically. */ public static final String PARAM_POS_MAPPING_LOCATION = ComponentParameters.PARAM_POS_MAPPING_LOCATION; @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) protected String posMappingLocation; public static final String PARAM_READ_MORPH = ComponentParameters.PARAM_READ_MORPH; @ConfigurationParameter(name = PARAM_READ_MORPH, mandatory = true, defaultValue = "true") private boolean readMorph; public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") private boolean readLemma; public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") private boolean readDependency; private static final String UNUSED = "_"; private static final int ID = 0; private static final int FORM = 1; private static final int LEMMA = 2; private static final int CPOSTAG = 3; private static final int POSTAG = 4; private static final int FEATS = 5; private static final int HEAD = 6; private static final int DEPREL = 7; private static final int DEPS = 8; private static final int MISC = 9; private MappingProvider posMappingProvider; @Override public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation, posTagset, getLanguage()); } @Override public void getNext(JCas aJCas) throws IOException, CollectionException { Resource res = nextFile(); initCas(aJCas, res); BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader( CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), encoding)); convert(aJCas, reader); } finally { closeQuietly(reader); } } public void convert(JCas aJCas, BufferedReader aReader) throws IOException { if (readPos) { try{ posMappingProvider.configure(aJCas.getCas()); } catch(AnalysisEngineProcessException e){ throw new IOException(e); } } JCasBuilder doc = new JCasBuilder(aJCas); List<String[]> words; while ((words = readSentence(aReader)) != null) { if (words.isEmpty()) { // Ignore empty sentences. This can happen when there are multiple end-of-sentence // markers following each other. continue; } int sentenceBegin = doc.getPosition(); int sentenceEnd = sentenceBegin; int surfaceBegin = -1; int surfaceEnd = -1; String surfaceString = null; // Tokens, Lemma, POS Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>(); Iterator<String[]> wordIterator = words.iterator(); while (wordIterator.hasNext()) { String[] word = wordIterator.next(); if (word[ID].contains("-")) { String[] fragments = word[ID].split("-"); surfaceBegin = Integer.valueOf(fragments[0]); surfaceEnd = Integer.valueOf(fragments[1]); surfaceString = word[FORM]; continue; } // Read token int tokenIdx = Integer.valueOf(word[ID]); Token token = doc.add(word[FORM], Token.class); tokens.put(tokenIdx, token); if (!StringUtils.contains(word[MISC], "SpaceAfter=No") && wordIterator.hasNext()) { doc.add(" "); } // Read lemma if (!UNUSED.equals(word[LEMMA]) && readLemma) { Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); lemma.setValue(word[LEMMA]); lemma.addToIndexes(); token.setLemma(lemma); } // Read part-of-speech tag POS pos = null; String tag = useCPosAsPos ? word[CPOSTAG] : word[POSTAG]; if (!UNUSED.equals(tag) && readPos) { Type posTag = posMappingProvider.getTagType(tag); pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), token.getEnd()); pos.setPosValue(tag.intern()); } // Read coarse part-of-speech tag if (!UNUSED.equals(word[CPOSTAG]) && readCPos && pos != null) { pos.setCoarseValue(word[CPOSTAG].intern()); } if (pos != null) { pos.addToIndexes(); token.setPos(pos); } // Read morphological features if (!UNUSED.equals(word[FEATS]) && readMorph) { MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, token.getBegin(), token.getEnd()); morphtag.setValue(word[FEATS]); morphtag.addToIndexes(); token.setMorph(morphtag); // Try parsing out individual feature values. Since the DKPro Core // MorphologicalFeatures type is based on the definition from the UD project, // we can do this rather straightforwardly. Type morphType = morphtag.getType(); String[] items = word[FEATS].split("\\|"); for (String item : items) { String[] keyValue = item.split("="); StringBuilder key = new StringBuilder(keyValue[0]); key.setCharAt(0, Character.toLowerCase(key.charAt(0))); String value = keyValue[1]; Feature feat = morphType.getFeatureByBaseName(key.toString()); if (feat != null) { morphtag.setStringValue(feat, value); } } } // Read surface form if (tokenIdx == surfaceEnd) { int begin = tokens.get(surfaceBegin).getBegin(); int end = tokens.get(surfaceEnd).getEnd(); SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end); surfaceForm.setValue(surfaceString); surfaceForm.addToIndexes(); surfaceBegin = -1; surfaceEnd = -1; surfaceString = null; } sentenceEnd = token.getEnd(); } // Dependencies if (readDependency) { for (String[] word : words) { if (!UNUSED.equals(word[DEPREL])) { int depId = Integer.valueOf(word[ID]); int govId = Integer.valueOf(word[HEAD]); // Model the root as a loop onto itself makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC, tokens, word); } if (!UNUSED.equals(word[DEPS])) { // list items separated by vertical bar String[] items = word[DEPS].split("\\|"); for (String item : items) { String[] sItem = item.split(":"); int depId = Integer.valueOf(word[ID]); int govId = Integer.valueOf(sItem[0]); makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED, tokens, word); } } } } // Sentence Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); sentence.addToIndexes(); // Once sentence per line. doc.add("\n"); } doc.close(); } private Dependency makeDependency(JCas aJCas, int govId, int depId, String label, String flavor, Int2ObjectMap<Token> tokens, String[] word) { Dependency rel; if (govId == 0) { rel = new ROOT(aJCas); rel.setGovernor(tokens.get(depId)); rel.setDependent(tokens.get(depId)); } else { rel = new Dependency(aJCas); rel.setGovernor(tokens.get(govId)); rel.setDependent(tokens.get(depId)); } rel.setDependencyType(label); rel.setFlavor(flavor); rel.setBegin(rel.getDependent().getBegin()); rel.setEnd(rel.getDependent().getEnd()); rel.addToIndexes(); return rel; } /** * Read a single sentence. */ private static List<String[]> readSentence(BufferedReader aReader) throws IOException { List<String[]> words = new ArrayList<String[]>(); String line; while ((line = aReader.readLine()) != null) { if (StringUtils.isBlank(line)) { break; // End of sentence } if (line.startsWith("#")) { // Comment line continue; } String[] fields = line.split("\t"); if (fields.length != 10) { throw new IOException( "Invalid file format. Line needs to have 10 tab-separated fields, but it has " + fields.length + ": [" + line + "]"); } words.add(fields); } if (line == null && words.isEmpty()) { return null; } else { return words; } } }