/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.clarin.webanno.conllu; import static org.apache.commons.io.IOUtils.closeQuietly; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; import org.apache.commons.lang3.StringUtils; import org.apache.uima.UimaContext; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Feature; import org.apache.uima.cas.Type; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.fit.factory.JCasBuilder; import org.apache.uima.fit.util.FSUtil; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory; import 
de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; import it.unimi.dsi.fastutil.ints.Int2ObjectMap; import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; /** * Reads a file in the CoNLL-U format. * * <ol> * <li>ID - <b>(ignored)</b> Word index, integer starting at 1 for each new sentence; may be a range * for tokens with multiple words.</li> * <li>FORM - <b>(Token)</b> Word form or punctuation symbol.</li> * <li>LEMMA - <b>(Lemma)</b> Lemma or stem of word form.</li> * <li>CPOSTAG - <b>(unused)</b> Google universal part-of-speech tag from the universal POS tag set. * </li> * <li>POSTAG - <b>(POS)</b> Language-specific part-of-speech tag; underscore if not available.</li> * <li>FEATS - <b>(MorphologicalFeatures)</b> List of morphological features from the universal * feature inventory or from a defined language-specific extension; underscore if not available.</li> * <li>HEAD - <b>(Dependency)</b> Head of the current token, which is either a value of ID or zero * (0).</li> * <li>DEPREL - <b>(Dependency)</b> Universal Stanford dependency relation to the HEAD (root iff * HEAD = 0) or a defined language-specific subtype of one.</li> * <li>DEPS - <b>(Dependency)</b> List of secondary dependencies (head-deprel pairs).</li> * <li>MISC - <b>(unused)</b> Any other annotation.</li> * </ol> * * Sentences are separated by a blank new line. 
*
 * @see <a href="http://universaldependencies.github.io/docs/format.html">CoNLL-U Format</a>
 */
@TypeCapability(outputs = {
        "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures",
        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
        "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" })
public class ConllUReader
    extends JCasResourceCollectionReader_ImplBase
{
    /**
     * Character encoding used to decode the input stream.
     */
    public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING;
    @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8")
    private String encoding;

    /**
     * Create part-of-speech annotations from the POSTAG column.
     */
    public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS;
    @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true")
    private boolean readPos;

    /**
     * Use this part-of-speech tag set to resolve the tag set mapping instead of using the
     * tag set defined as part of the model meta data. This can be useful if a custom model is
     * specified which does not have such meta data, or it can be used in readers.
     */
    public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET;
    @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false)
    protected String posTagset;

    /**
     * Load the part-of-speech tag to UIMA type mapping from this location instead of locating
     * the mapping automatically.
     */
    public static final String PARAM_POS_MAPPING_LOCATION = 
            ComponentParameters.PARAM_POS_MAPPING_LOCATION;
    @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false)
    protected String posMappingLocation;

    /**
     * Create morphological feature annotations from the FEATS column.
     */
    public static final String PARAM_READ_MORPH = ComponentParameters.PARAM_READ_MORPH;
    @ConfigurationParameter(name = PARAM_READ_MORPH, mandatory = true, defaultValue = "true")
    private boolean readMorph;

    /**
     * Create lemma annotations from the LEMMA column.
     */
    public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA;
    @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true")
    private boolean readLemma;

    /**
     * Create dependency annotations from the HEAD/DEPREL and DEPS columns.
     */
    public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY;
    @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true")
    private boolean readDependency;

    /** Placeholder used in a column that carries no value. */
    private static final String UNUSED = "_";

    // Zero-based column indexes of a CoNLL-U line. Column 3 (CPOSTAG) is not read.
    private static final int ID = 0;
    private static final int FORM = 1;
    private static final int LEMMA = 2;
    private static final int POSTAG = 4;
    private static final int FEATS = 5;
    private static final int HEAD = 6;
    private static final int DEPREL = 7;
    private static final int DEPS = 8;
    private static final int MISC = 9;

    private MappingProvider posMappingProvider;

    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(posMappingLocation,
                posTagset, getLanguage());
    }

    @Override
    public void getNext(JCas aJCas)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        initCas(aJCas, res);
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(res.getInputStream(), encoding));
            convert(aJCas, reader);
        }
        finally {
            closeQuietly(reader);
        }
    }

    /**
     * Reads all sentences from the given reader and adds the document text together with the
     * enabled annotation layers to the given CAS.
     *
     * @param aJCas
     *            the CAS to fill.
     * @param aReader
     *            the source of the CoNLL-U data.
     * @throws IOException
     *             if the data cannot be read, a line does not have 10 columns, the POS mapping
     *             cannot be configured, or a DEPS entry is not of the form {@code head:deprel}.
     */
    public void convert(JCas aJCas, BufferedReader aReader)
        throws IOException
    {
        if (readPos) {
            try {
                posMappingProvider.configure(aJCas.getCas());
            }
            catch (AnalysisEngineProcessException e) {
                throw new IOException(e);
            }
        }

        JCasBuilder doc = new JCasBuilder(aJCas);

        List<String[]> words;
        while ((words = readSentence(aReader)) != null) {
            if (words.isEmpty()) {
                // Ignore empty sentences. This can happen when there are multiple
                // end-of-sentence markers following each other.
                continue;
            }

            int sentenceBegin = doc.getPosition();
            int sentenceEnd = sentenceBegin;

            // State of the multi-word token (ID range) currently being read, if any
            int surfaceBegin = -1;
            int surfaceEnd = -1;
            String surfaceString = null;

            // Tokens, Lemma, POS
            Int2ObjectMap<Token> tokens = new Int2ObjectOpenHashMap<>();
            for (String[] word : words) {
                if (word[ID].contains("-")) {
                    // ID range: remember the surface form; the words it covers follow
                    String[] fragments = word[ID].split("-");
                    surfaceBegin = Integer.parseInt(fragments[0]);
                    surfaceEnd = Integer.parseInt(fragments[1]);
                    surfaceString = word[FORM];
                    continue;
                }

                // Read token
                int tokenIdx = Integer.parseInt(word[ID]);
                Token token = doc.add(word[FORM], Token.class);
                tokens.put(tokenIdx, token);
                if (!StringUtils.contains(word[MISC], "SpaceAfter=No")) {
                    doc.add(" ");
                }

                // Read lemma
                if (readLemma && !UNUSED.equals(word[LEMMA])) {
                    Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd());
                    lemma.setValue(word[LEMMA]);
                    lemma.addToIndexes();
                    token.setLemma(lemma);
                }

                // Read part-of-speech tag
                if (readPos && !UNUSED.equals(word[POSTAG])) {
                    Type posTag = posMappingProvider.getTagType(word[POSTAG]);
                    POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(),
                            token.getEnd());
                    pos.setPosValue(word[POSTAG]);
                    pos.addToIndexes();
                    token.setPos(pos);
                }

                // Read morphological features
                if (readMorph && !UNUSED.equals(word[FEATS])) {
                    MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas,
                            token.getBegin(), token.getEnd());
                    morphtag.setValue(word[FEATS]);
                    morphtag.addToIndexes();
                    token.setMorph(morphtag);

                    // Try parsing out individual feature values. Since the DKPro Core
                    // MorphologicalFeatures type is based on the definition from the UD
                    // project, we can do this rather straightforwardly.
                    setMorphFeatureValues(morphtag, word[FEATS]);
                }

                // Read surface form
                if (tokenIdx == surfaceEnd) {
                    int begin = tokens.get(surfaceBegin).getBegin();
                    int end = tokens.get(surfaceEnd).getEnd();
                    SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end);
                    surfaceForm.setValue(surfaceString);
                    surfaceForm.addToIndexes();
                    surfaceBegin = -1;
                    surfaceEnd = -1;
                    surfaceString = null;
                }

                sentenceEnd = token.getEnd();
            }

            // Dependencies
            if (readDependency) {
                for (String[] word : words) {
                    if (!UNUSED.equals(word[DEPREL])) {
                        int depId = Integer.parseInt(word[ID]);
                        int govId = Integer.parseInt(word[HEAD]);
                        makeDependency(aJCas, govId, depId, word[DEPREL],
                                DependencyFlavor.BASIC, tokens, word);
                    }

                    if (!UNUSED.equals(word[DEPS])) {
                        // list items separated by vertical bar
                        for (String item : word[DEPS].split("\\|")) {
                            // Split at the first colon only: the relation label may itself
                            // contain colons (subtypes such as "nmod:poss").
                            String[] sItem = item.split(":", 2);
                            if (sItem.length < 2) {
                                throw new IOException("Invalid file format. DEPS entry [" + item
                                        + "] of token [" + word[ID]
                                        + "] is not of the form head:deprel");
                            }

                            int depId = Integer.parseInt(word[ID]);
                            int govId = Integer.parseInt(sItem[0]);
                            makeDependency(aJCas, govId, depId, sItem[1],
                                    DependencyFlavor.ENHANCED, tokens, word);
                        }
                    }
                }
            }

            // Sentence
            Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd);
            sentence.addToIndexes();

            // One sentence per line.
            doc.add("\n");
        }

        doc.close();
    }

    /**
     * Copies the individual {@code Key=Value} items of a FEATS column into the like-named
     * features of the annotation (first character of the key lower-cased). Items for which the
     * annotation type has no such feature, and items that are not of the form
     * {@code Key=Value}, are skipped.
     *
     * @param aMorph
     *            the annotation to fill.
     * @param aFeats
     *            the content of the FEATS column, items separated by vertical bars.
     */
    private static void setMorphFeatureValues(MorphologicalFeatures aMorph, String aFeats)
    {
        Type morphType = aMorph.getType();
        for (String item : aFeats.split("\\|")) {
            String[] keyValue = item.split("=");
            if (keyValue.length < 2 || keyValue[0].isEmpty()) {
                // Not a key/value pair - the unparsed FEATS string remains available via
                // the annotation's value.
                continue;
            }

            String key = Character.toLowerCase(keyValue[0].charAt(0))
                    + keyValue[0].substring(1);
            Feature feat = morphType.getFeatureByBaseName(key);
            if (feat != null) {
                aMorph.setStringValue(feat, keyValue[1]);
            }
        }
    }

    /**
     * Creates a dependency relation between two tokens of the current sentence and adds it to
     * the indexes. The relation is anchored on the span of the dependent token.
     *
     * @param aJCas
     *            the CAS in which to create the relation.
     * @param govId
     *            the ID of the governor token; 0 denotes the root.
     * @param depId
     *            the ID of the dependent token.
     * @param label
     *            the dependency type.
     * @param flavor
     *            the value for the "flavor" feature of the relation.
     * @param tokens
     *            the tokens of the current sentence by ID.
     * @param word
     *            the columns of the line being processed (currently not used).
     * @return the new relation.
     */
    private Dependency makeDependency(JCas aJCas, int govId, int depId, String label,
            String flavor, Int2ObjectMap<Token> tokens, String[] word)
    {
        Token dependent = tokens.get(depId);
        // Model the root as a loop onto itself
        Token governor = (govId == 0) ? dependent : tokens.get(govId);

        Dependency rel = new Dependency(aJCas);
        rel.setGovernor(governor);
        rel.setDependent(dependent);
        rel.setDependencyType(label);
        // This is set via FSUtil because we still use the DKPro Core 1.7.0 JCas classes
        FSUtil.setFeature(rel, "flavor", flavor);
        rel.setBegin(dependent.getBegin());
        rel.setEnd(dependent.getEnd());
        rel.addToIndexes();

        return rel;
    }

    /**
     * Read a single sentence. Lines starting with {@code #} are skipped; a blank line ends the
     * sentence.
     *
     * @param aReader
     *            the source of the CoNLL-U data.
     * @return the columns of each line of the sentence (possibly an empty list if a blank line
     *         was encountered right away), or {@code null} if the end of the input was reached
     *         without reading any line.
     * @throws IOException
     *             if the data cannot be read or a line does not have 10 tab-separated fields.
     */
    private static List<String[]> readSentence(BufferedReader aReader)
        throws IOException
    {
        List<String[]> words = new ArrayList<>();
        String line;
        while ((line = aReader.readLine()) != null) {
            if (StringUtils.isBlank(line)) {
                break; // End of sentence
            }
            if (line.startsWith("#")) {
                // Comment line
                continue;
            }
            String[] fields = line.split("\t");
            if (fields.length != 10) {
                throw new IOException(
                        "Invalid file format. Line needs to have 10 tab-separated fields, but it has "
                                + fields.length + ": [" + line + "]");
            }
            words.add(fields);
        }

        if (line == null && words.isEmpty()) {
            return null;
        }
        else {
            return words;
        }
    }
}