/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.conll; import static java.util.Arrays.asList; import static org.apache.commons.io.IOUtils.closeQuietly; import static org.apache.uima.fit.util.JCasUtil.indexCovered; import static org.apache.uima.fit.util.JCasUtil.indexCovering; import static org.apache.uima.fit.util.JCasUtil.select; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.TypeCapability; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.cas.FSArray; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain; import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink; import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArgLink; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred; import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeNode; import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils; /** * Writer for the CoNLL-2012 format. */ @MimeTypeCapability({MimeTypes.TEXT_X_CONLL_2012}) @TypeCapability( inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemPred", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemArg" }) public class Conll2012Writer extends JCasFileWriter_ImplBase { private static final String UNUSED = "-"; private static final String ALT_UNUSED = "*"; /** * Name of configuration parameter that contains the character encoding used by the input files. */ public static final String PARAM_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; @ConfigurationParameter(name = PARAM_ENCODING, mandatory = true, defaultValue = "UTF-8") private String encoding; public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION; @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") private String filenameSuffix; public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") private boolean writePos; public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") private boolean writeLemma; public static final String PARAM_WRITE_SEMANTIC_PREDICATE = "writeSemanticPredicate"; @ConfigurationParameter(name = PARAM_WRITE_SEMANTIC_PREDICATE, mandatory = true, defaultValue = "true") private boolean writeSemanticPredicate; @Override public void process(JCas aJCas) throws AnalysisEngineProcessException { PrintWriter out = null; try { out = new PrintWriter(new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), encoding)); String documentId = DocumentMetaData.get(aJCas).getDocumentId(); int partNumber = 0; if (documentId.contains("#")) { partNumber = Integer.parseInt(StringUtils.substringAfterLast(documentId, "#")); documentId = StringUtils.substringBeforeLast(documentId, "#"); } out.printf("#begin document (%s); part %03d%n", documentId, partNumber); convert(aJCas, out); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } finally { closeQuietly(out); } } private void convert(JCas aJCas, PrintWriter aOut) { Map<Token, Collection<SemPred>> predIdx = indexCovered(aJCas, Token.class, SemPred.class); Map<SemArg, Collection<Token>> argIdx = indexCovered(aJCas, SemArg.class, Token.class); Map<Token, Collection<NamedEntity>> neIdx = indexCovering(aJCas, Token.class, NamedEntity.class); Map<Token, Collection<WordSense>> wordSenseIdx = indexCovered(aJCas, Token.class, WordSense.class); Map<Token, Collection<CoreferenceLink>> corefIdx = indexCovering(aJCas, Token.class, CoreferenceLink.class); Map<CoreferenceLink, Integer> corefChainIdx = new HashMap<>(); int chainId = 1; for (CoreferenceChain chain : select(aJCas, CoreferenceChain.class)) { for (CoreferenceLink link : chain.links()) { corefChainIdx.put(link, chainId); } chainId++; } for (Sentence sentence : select(aJCas, Sentence.class)) { HashMap<Token, Row> ctokens = new LinkedHashMap<Token, Row>(); // Tokens List<Token> tokens = selectCovered(Token.class, sentence); List<SemPred> preds = selectCovered(SemPred.class, sentence); String[] parseFragments = null; List<ROOT> root = selectCovered(ROOT.class, sentence); if (root.size() == 1) { PennTreeNode rootNode = PennTreeUtils.convertPennTree(root.get(0)); if ("ROOT".equals(rootNode.getLabel())) { rootNode.setLabel("TOP"); } parseFragments = toPrettyPennTree(rootNode); } if (parseFragments != null && parseFragments.length != tokens.size()) { throw new IllegalStateException("Parse fragments do not match tokens - tokens: " + tokens + " parse: " + asList(parseFragments)); } for (int i = 0; i < tokens.size(); i++) { Row row = new Row(); row.id = i; row.token = tokens.get(i); row.args = new SemArgLink[preds.size()]; row.parse = parseFragments != null ? parseFragments[i] : UNUSED; // If there are multiple semantic predicates for the current token, then // we keep only the first Collection<SemPred> predsForToken = predIdx.get(row.token); if (predsForToken != null && !predsForToken.isEmpty()) { row.pred = predsForToken.iterator().next(); } // If there are multiple named entities for the current token, we keep only the // first Collection<NamedEntity> neForToken = neIdx.get(row.token); if (neForToken != null && !neForToken.isEmpty()) { row.ne = neForToken.iterator().next(); } // If there are multiple word senses for the current token, we keep only the // first Collection<WordSense> senseForToken = wordSenseIdx.get(row.token); if (senseForToken != null && !senseForToken.isEmpty()) { row.wordSense = senseForToken.iterator().next(); } row.coref = corefIdx.get(row.token); ctokens.put(row.token, row); } // Semantic arguments for (int p = 0; p < preds.size(); p++) { FSArray args = preds.get(p).getArguments(); for (SemArgLink arg : select(args, SemArgLink.class)) { for (Token t : argIdx.get(arg.getTarget())) { Row row = ctokens.get(t); row.args[p] = arg; } } } // Write sentence in CONLL 2012 format for (Row row : ctokens.values()) { String documentId = DocumentMetaData.get(aJCas).getDocumentId(); if (StringUtils.isBlank(documentId)) { documentId = UNUSED; } int partNumber = 0; if (documentId.contains("#")) { partNumber = Integer.parseInt(StringUtils.substringAfterLast(documentId, "#")); documentId = StringUtils.substringBeforeLast(documentId, "#"); } int id = row.id; String form = row.token.getCoveredText(); String lemma = UNUSED + " "; if (writeLemma && (row.token.getLemma() != null)) { lemma = row.token.getLemma().getValue(); } String pos = UNUSED; if (writePos && (row.token.getPos() != null)) { POS posAnno = row.token.getPos(); pos = posAnno.getPosValue(); } String parse = row.parse; if (!parse.endsWith(")")) { // This is just the curious way that the CoNLL files are encoded... parse += " "; } String wordSense = UNUSED; if (row.wordSense != null) { wordSense = row.wordSense.getValue(); } String speaker = UNUSED; // FIXME String namedEntity = ALT_UNUSED + " "; if (row.ne != null) { namedEntity = encodeMultiTokenAnnotation(row.token, row.ne, row.ne.getValue()); } String pred = UNUSED; StringBuilder apreds = new StringBuilder(); if (writeSemanticPredicate) { if (row.pred != null) { pred = row.pred.getCategory(); } for (SemArgLink link : row.args) { if (apreds.length() > 0) { apreds.append(" "); } String value; if (link == null) { if (row.pred != null && row.pred.getBegin() == row.token.getBegin() && row.pred.getEnd() == row.token.getEnd()) { value = "(V*)"; } else { value = ALT_UNUSED + ' '; } } else { value = encodeMultiTokenAnnotation(row.token, link.getTarget(), link.getRole()); } apreds.append(String.format("%10s", value)); } } StringBuilder coref = new StringBuilder(); if (!row.coref.isEmpty()) { for (CoreferenceLink link : row.coref) { if (coref.length() > 0) { coref.append('|'); } coref.append(encodeMultiTokenLink(row.token, link, corefChainIdx.get(link))); } } if (coref.length() == 0) { coref.append(UNUSED); } aOut.printf("%s %3d %3d %10s %5s %13s %9s %3s %3s %10s %10s %10s %s\n", documentId, partNumber, id, form, pos, parse, lemma, pred, wordSense, speaker, namedEntity, apreds, coref); } aOut.println(); } aOut.println("#end document"); } private String encodeMultiTokenAnnotation(Token aToken, AnnotationFS aAnnotation, String aLabel) { boolean begin = aAnnotation.getBegin() == aToken.getBegin(); boolean end = aAnnotation.getEnd() == aToken.getEnd(); StringBuilder buf = new StringBuilder(); if (begin) { buf.append('('); buf.append(aLabel); if (!end) { buf.append('*'); } } else { buf.append('*'); } if (end) { buf.append(')'); } else { buf.append(' '); } return buf.toString(); } private String encodeMultiTokenLink(Token aToken, AnnotationFS aAnnotation, Integer aChainId) { boolean begin = aAnnotation.getBegin() == aToken.getBegin(); boolean end = aAnnotation.getEnd() == aToken.getEnd(); StringBuilder buf = new StringBuilder(); if (begin) { buf.append('('); } if (begin|end) { buf.append(aChainId); } if (end) { buf.append(')'); } return buf.toString(); } private static final class Row { NamedEntity ne; String parse; WordSense wordSense; int id; Token token; SemPred pred; SemArgLink[] args; // These are the arguments roles for the current token! Collection<CoreferenceLink> coref; } public static String[] toPrettyPennTree(PennTreeNode aNode) { StringBuilder sb = new StringBuilder(); toPennTree(sb, aNode); return sb.toString().trim().split("\n+"); } private static void toPennTree(StringBuilder aSb, PennTreeNode aNode) { // This is a "(Label Token)" if (aNode.isPreTerminal()) { aSb.append("*"); } else { aSb.append('('); aSb.append(aNode.getLabel()); Iterator<PennTreeNode> i = aNode.getChildren().iterator(); while (i.hasNext()) { PennTreeNode child = i.next(); toPennTree(aSb, child); if (i.hasNext()) { aSb.append("\n"); } } aSb.append(')'); } } }