/*
 * Copyright 2011
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.annis;

import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;

/**
 * This consumer writes the content of all CASes in the relAnnis file format. The produced files
 * can be fed into ANNIS 2 (http://www.sfb632.uni-potsdam.de/d1/annis/) to visualize the data,
 * e.g. the constituent and dependency structures.
 */
@TypeCapability(inputs = {
        "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma",
        "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS",
        "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency",
        "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent" })
public class RelAnnisWriter
    extends JCasConsumer_ImplBase
{
    /**
     * Location to which the output is written.
     */
    public static final String PARAM_PATH = ComponentParameters.PARAM_TARGET_LOCATION;
    @ConfigurationParameter(name = PARAM_PATH, mandatory = true)
    private String path;
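
    // A minimal usage sketch (illustrative, not part of this class): the writer can be
    // wired into a uimaFIT pipeline like any other consumer. The "reader" and "parser"
    // descriptions below are assumptions standing in for a concrete DKPro Core reader
    // and a parser that produces constituents/dependencies (see
    // org.apache.uima.fit.factory.AnalysisEngineFactory and
    // org.apache.uima.fit.pipeline.SimplePipeline):
    //
    //   AnalysisEngineDescription writer = AnalysisEngineFactory.createEngineDescription(
    //           RelAnnisWriter.class,
    //           RelAnnisWriter.PARAM_PATH, "target/relannis");
    //   SimplePipeline.runPipeline(reader, parser, writer);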

    /**
     * Write part-of-speech information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS;
    @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true")
    private boolean writePos;

    /**
     * Write lemma information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA;
    @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true")
    private boolean writeLemma;

    /**
     * Write constituent structure information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_CONSTITUENT = ComponentParameters.PARAM_WRITE_CONSTITUENT;
    @ConfigurationParameter(name = PARAM_WRITE_CONSTITUENT, mandatory = true, defaultValue = "true")
    private boolean writeConstituents;

    /**
     * Write dependency relation information.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY;
    @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true")
    private boolean writeDependencies;

    private int textId;
    private int documentId;
    private int nodeId;
    private int rank;
    private int componentId;

    private static final String[] FILE_IDS = new String[] { "component", "corpus",
            "corpus_annotation", "edge_annotation", "node", "node_annotation", "rank",
            "resolver_vis_map", "text" };

    private Map<String, PrintWriter> writers;
    private Map<Token, Integer> nodes;
    private Map<Token, List<Dependency>> dependencies;

    @Override
    public void initialize(UimaContext context)
        throws ResourceInitializationException
    {
        super.initialize(context);

        File f = new File(path);
        if (!f.exists()) {
            f.mkdirs();
        }

        textId = 0;
        documentId = 1; // 0 is CORPUS
        nodeId = 0;
        rank = 0;
        componentId = 0;

        writers = new HashMap<String, PrintWriter>();

        // open streams for all files
        for (String fileId : FILE_IDS) {
            File filePath = new File(path, fileId + ".tab");
            try {
                PrintWriter pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(
                        filePath), "UTF-8"));
                writers.put(fileId, pw);
            }
            catch (UnsupportedEncodingException e) {
                throw new ResourceInitializationException(e);
            }
            catch (FileNotFoundException e) {
                throw new ResourceInitializationException(e);
            }
        }
    }

    @Override
    public void process(JCas jcas)
        throws AnalysisEngineProcessException
    {
        export(jcas);
        if (writeDependencies) {
            export_dependencies(jcas);
        }
        export_text(jcas);
        textId++;
        documentId++;
    }

    @Override
    public void collectionProcessComplete()
    {
        // write last files
        export_corpus();
        export_corpus_annotation();
        export_resolver_vis_map();

        for (PrintWriter pw : writers.values()) {
            IOUtils.closeQuietly(pw);
        }
    }

    /**
     * Write corpus.tab
     */
    private void export_corpus()
    {
        // DocumentMetaData meta = JCasUtil.selectSingle(jcas, DocumentMetaData.class);
        // TODO use meta.getDocumentId() or meta.getDocumentTitle() as name
        // TODO for that, change export_corpus and call it for each jcas

        // write corpus entry
        writeToFile("corpus", 0, // id
                "c0", // name
                "CORPUS", // type (CORPUS|DOCUMENT)
                "NULL", // version
                "0", // pre-order
                documentId * 2 - 1); // post-order

        // write document entries
        for (int i = 1; i < documentId; i++) {
            writeToFile("corpus", i, "d" + i, "DOCUMENT", "NULL", i * 2 - 1, i * 2);
        }
    }
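
    // Illustrative example of the pre/post-order scheme above (not emitted by the code,
    // just spelling out the arithmetic): after processing two documents, documentId is 3,
    // so export_corpus() writes
    //   corpus c0: pre=0, post=5
    //   doc    d1: pre=1, post=2
    //   doc    d2: pre=3, post=4
    // i.e. each document interval nests inside the corpus interval.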

    /**
     * Write corpus_annotation.tab<br>
     * dummy file
     */
    private void export_corpus_annotation()
    {
        // write an empty corpus_annotation, because it is not essential
        writeToFile("corpus_annotation");
    }

    /**
     * Traverse the constituent structure beginning from all roots. The traversal eventually
     * reaches the token level, so this method must be called even when the constituent structure
     * itself is not wanted.
     *
     * @param jcas
     *            the CAS.
     */
    // id | text-id (text.tab) | corpus-id (corpus.tab) | annotation engine |
    // tok_someid | t.begin | t.end | sentence-position | continuous (true if
    // span is gap-free) | token-text
    private void export(JCas jcas)
    {
        nodes = new LinkedHashMap<Token, Integer>();
        for (Constituent root : select(jcas, ROOT.class)) {
            traverseConstituents(jcas, root, -1);
        }
    }

    /**
     * Recursively traverse the constituent structure, writing<br>
     * component.tab, edge_annotation.tab, node.tab, node_annotation.tab, rank.tab
     *
     * @param jcas
     *            the CAS.
     * @param currAnno
     *            the parent annotation from where to start the traversal
     * @param parent_rankPre
     *            the pre-rank value of the parent
     */
    private void traverseConstituents(JCas jcas, Annotation currAnno, int parent_rankPre)
    {
        Constituent c;
        int currNodeId = nodeId;
        int rankPre = rank;

        if (currAnno == null) {
            return;
        }

        nodeId++;
        rank++;

        if (currAnno instanceof Token) {
            Token t = (Token) currAnno;

            // get the token position; a more efficient method possible?
            int pos = selectCovered(jcas, Token.class, 0, t.getBegin()).size();

            // store node (token)
            writeToFile("node", currNodeId, textId, documentId, "token_merged",
                    "tok_" + currNodeId, t.getBegin(), t.getEnd(), pos, "true",
                    t.getCoveredText());

            // store node_annotation (token)
            if (writePos && (t.getPos() != null)) {
                writeToFile("node_annotation", currNodeId, "token_merged", "pos",
                        t.getPos().getPosValue());
            }
            if (writeLemma && (t.getLemma() != null)) {
                writeToFile("node_annotation", currNodeId, "token_merged", "lemma",
                        t.getLemma().getValue());
            }

            // store token with corresponding nodeId in hashmap for dependency output
            nodes.put(t, currNodeId);

            // store rank
            writeToFile("rank", rankPre, rank, currNodeId, componentId,
                    (parent_rankPre >= 0 ? parent_rankPre : "NULL"));

            // store edge annotation
            // TODO proper syntax function annotation;
            // use subiterate or iterate to get Tag annotation with same span as token?
            writeToFile("edge_annotation", rankPre, "tiger", "func", "SF (T)");
        }
        else {
            if (!(currAnno instanceof Constituent)) {
                return;
            }
            c = (Constituent) currAnno;

            // store node (const)
            writeToFile("node", currNodeId, textId, documentId, "tiger", "const_" + currNodeId,
                    c.getBegin(), c.getEnd(), "NULL", "true", "NULL");

            // store node_annotation (cat)
            writeToFile("node_annotation", currNodeId, "tiger", "cat", c.getConstituentType());

            FSArray children = c.getChildren();
            for (int i = 0; i < children.size(); i++) {
                traverseConstituents(jcas, c.getChildren(i), rankPre);
            }

            // store component
            writeToFile("component", componentId, "d", "tiger", "edge");

            // store rank
            writeToFile("rank", rankPre, rank, currNodeId, componentId,
                    (parent_rankPre >= 0 ? parent_rankPre : "NULL"));

            // store edge_annotation
            String synFunc = c.getSyntacticFunction();
            // ANNIS does not render the constituent tree if syntactic functions
            // are not at least 1 character wide
            if (synFunc == null) {
                synFunc = " ";
            }
            writeToFile("edge_annotation", rankPre, "tiger", "func", synFunc);

            componentId++;
        }
        rank++;
    }
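
    // Illustrative rank example (assumptions: a single constituent with exactly two
    // token children, rank counter starting at 0): the traversal above writes
    //   token 1: pre=1, post=2
    //   token 2: pre=3, post=4
    //   const:   pre=0, post=5
    // so every subtree occupies a contiguous, nested pre/post interval in rank.tab.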

    /**
     * Traverse the dependency structure beginning from all "roots", i.e. non-governed tokens.
     *
     * @param jcas
     *            the CAS.
     */
    private void export_dependencies(JCas jcas)
    {
        dependencies = new LinkedHashMap<Token, List<Dependency>>();
        List<Token> nonGoverned = new ArrayList<Token>(nodes.keySet());

        // fill governor->dependents hashmap
        for (Dependency dependency : select(jcas, Dependency.class)) {
            Token governor = dependency.getGovernor();
            Token dependent = dependency.getDependent();

            List<Dependency> l = dependencies.get(governor);
            if (l == null) {
                l = new ArrayList<Dependency>();
                dependencies.put(governor, l);
            }
            l.add(dependency);

            nonGoverned.remove(dependent);
        }

        for (Token t : nonGoverned) {
            traverseDependents(t, "", -1);
            writeToFile("component", componentId, "p", "dep", "dep");
            componentId++;
        }
    }

    /**
     * Recursively traverse the dependency structure, writing to<br>
     * edge_annotation.tab, rank.tab
     *
     * @param governor
     *            the annotation whose dependents shall be visited
     * @param func
     *            the function of the dependency pointing <b>to</b> the governor (from its
     *            governor)
     * @param parent_rankPre
     *            the pre-rank value of the governor
     */
    private void traverseDependents(Token governor, String func, int parent_rankPre)
    {
        int rankPre = rank;
        rank++;

        List<Dependency> dependents = dependencies.get(governor);
        if (dependents != null) {
            for (Dependency d : dependents) {
                traverseDependents(d.getDependent(), d.getDependencyType(), rankPre);
            }
        }

        int node_ref = nodes.get(governor);

        // parent_rankPre == -1 only for manual calls of non-governed tokens
        if (parent_rankPre >= 0) {
            writeToFile("rank", rankPre, rank, node_ref, componentId, parent_rankPre);
            writeToFile("edge_annotation", rankPre, "dep", "func", func);
        }
        else {
            writeToFile("rank", rankPre, rank, node_ref, componentId, "NULL");
            // no edge annotation for "NULL"-parent
        }
        rank++;
    }

    /**
     * Write resolver_vis_map.tab<br>
     * corpus name | ??? | namespace (annotation engine) | element (node|edge) | vis_type
     * (tree,discourse,grid,...) | display name | layer-order | mappings (optional)
     */
    private void export_resolver_vis_map()
    {
        // constituents
        if (writeConstituents) {
            writeToFile("resolver_vis_map", "c0", "NULL", "tiger", "node", "tree",
                    "constituents (tree)", "1", "NULL");
        }
        // dependencies
        if (writeDependencies) {
            writeToFile("resolver_vis_map", "c0", "NULL", "dep", "edge", "arch_dependency",
                    "dependencies (arches)", "2", "NULL");
        }
    }

    /**
     * Write text.tab<br>
     * id | some-text-identifier | text
     *
     * @param jcas
     *            the CAS.
     */
    private void export_text(JCas jcas)
    {
        StringBuilder text = new StringBuilder();
        String docId;

        for (Token token : select(jcas, Token.class)) {
            text.append(token.getCoveredText()).append(' ');
        }

        try {
            DocumentMetaData d = DocumentMetaData.get(jcas);
            docId = d.getDocumentId();
        }
        catch (IllegalArgumentException e) {
            docId = "generic-" + textId;
        }

        writeToFile("text", textId, docId, text.toString());
    }

    /**
     * Concatenates the elements of input (tab-separated) and appends the resulting line to the
     * file fileId.tab.
     *
     * @param fileId
     *            the tab-filename to write input to (without ".tab")
     * @param input
     *            the values to write into tab-separated columns
     */
    private void writeToFile(String fileId, Object... input)
    {
        String line = StringUtils.join(input, "\t");
        PrintWriter pw = writers.get(fileId);
        if (line.length() > 0) {
            pw.println(line);
        }
    }
}
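
// Illustrative writeToFile() behaviour (values hypothetical): a call such as
//   writeToFile("text", 0, "doc1", "The fox .")
// appends the line "0\tdoc1\tThe fox ." to text.tab, while a call with no input
// values (as in export_corpus_annotation) produces an empty join and is skipped,
// leaving corpus_annotation.tab empty.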