/**
 * Copyright 2007-2014
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see http://www.gnu.org/licenses/.
 */
package de.tudarmstadt.ukp.dkpro.core.io.tgrep;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod;
import de.tudarmstadt.ukp.dkpro.core.api.resources.RuntimeProvider;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;

/**
 * TGrep2 corpus file writer. Requires {@link PennTree}s to be annotated before.
 * <p>
 * For every collection id encountered, one plain-text corpus file is written to the target
 * location while documents are processed. When the collection is complete, each corpus file is
 * optionally converted into a TGrep2 binary corpus by invoking the bundled {@code tgrep2}
 * executable.
 */
@MimeTypeCapability({MimeTypes.APPLICATION_X_TGREP2})
@TypeCapability(
        inputs={
            "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree"})
public class TGrepWriter
    extends JCasAnnotator_ImplBase
{
    /**
     * Path to which the output is written.
     */
    public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION;
    @ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true)
    private File outputPath;

    /**
     * Set this parameter to true if you want to add a comment to each PennTree which is written to
     * the output files. The comment is of the form {@code documentId,beginOffset,endOffset}.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_COMMENTS = "writeComments";
    @ConfigurationParameter(name = PARAM_WRITE_COMMENTS, mandatory = true, defaultValue="true")
    private boolean writeComments;

    /**
     * Set this parameter to true if you want to encode directly into the tgrep2 binary format.
     *
     * Default: {@code true}
     */
    public static final String PARAM_WRITE_T2C = "writeT2c";
    @ConfigurationParameter(name = PARAM_WRITE_T2C, mandatory = true, defaultValue = "true")
    private boolean writeT2c;

    /**
     * Method to compress the tgrep file (only used if PARAM_WRITE_T2C is true). Only NONE, GZIP and
     * BZIP2 are supported.
     *
     * Default: {@link CompressionMethod#NONE}
     *
     * @see CompressionMethod
     */
    public static final String PARAM_COMPRESSION = "compression";
    @ConfigurationParameter(name = PARAM_COMPRESSION, mandatory = true, defaultValue = "NONE")
    private CompressionMethod compression;

    /**
     * If true, silently drops malformed Penn Trees instead of throwing an exception.
     *
     * Default: {@code false}
     */
    public static final String PARAM_DROP_MALFORMED_TREES = "dropMalformedTrees";
    @ConfigurationParameter(name = PARAM_DROP_MALFORMED_TREES, mandatory = true, defaultValue = "false")
    private boolean dropMalformedTrees;

    /** Extension of the plain-text corpus files written during processing. */
    private static final String EXT_CORPUS = ".txt";

    /** Extension of the binary corpus files produced by tgrep2. */
    private static final String EXT_BINARY = ".t2c";

    /** Collection id used when the CAS has no usable collection id. */
    private static final String DEFAULT_COLLECTION_ID = "defaultCollectionId";

    /** Document id used when the CAS carries no document meta data. */
    private static final String DEFAULT_DOCUMENT_ID = "defaultDocumentId";

    /** Open corpus writers, keyed by the sanitized collection id (file name without extension). */
    private Map<String, PrintWriter> writers;

    /** Location of the tgrep2 executable; only set while binaries are being produced. */
    private File tgrep2File;

    /**
     * Validates the configured compression method and creates the target directory.
     *
     * @param aContext
     *            the UIMA context
     * @throws ResourceInitializationException
     *             if an unsupported compression method was configured or the target directory
     *             cannot be created
     */
    @Override
    public void initialize(UimaContext aContext)
        throws ResourceInitializationException
    {
        super.initialize(aContext);

        if (compression != CompressionMethod.NONE && compression != CompressionMethod.GZIP
                && compression != CompressionMethod.BZIP2) {
            throw new ResourceInitializationException(new IllegalArgumentException(
                    "Only gzip and bzip2 compression are supported by TGrep2, but ["
                            + compression + "] was specified."));
        }

        try {
            FileUtils.forceMkdir(outputPath);
        }
        catch (IOException e) {
            throw new ResourceInitializationException(e);
        }

        writers = new HashMap<String, PrintWriter>();
    }

    /**
     * Appends all {@link PennTree}s of the given CAS to the corpus file of the document's
     * collection. Malformed trees are dropped or cause an exception, depending on
     * {@link #PARAM_DROP_MALFORMED_TREES}.
     *
     * @param aJCas
     *            the CAS to process
     * @throws AnalysisEngineProcessException
     *             if a malformed tree is found and must not be dropped, or if the corpus file
     *             cannot be opened
     */
    @Override
    public void process(JCas aJCas)
        throws AnalysisEngineProcessException
    {
        String collectionId;
        String documentId;

        try {
            DocumentMetaData meta = DocumentMetaData.get(aJCas);
            collectionId = meta.getCollectionId();
            documentId = meta.getDocumentId();
        }
        catch (IllegalArgumentException e) {
            getLogger().warn("No DocumentMetaData found.");
            collectionId = DEFAULT_COLLECTION_ID;
            documentId = DEFAULT_DOCUMENT_ID;
        }

        // Meta data may be present without a collection id; fall back to the default instead of
        // failing with a NullPointerException when deriving the file name.
        if (collectionId == null) {
            collectionId = DEFAULT_COLLECTION_ID;
        }

        // if the collectionId contains inconvenient characters, remove them for the filename
        String filename = collectionId.replaceAll("\\W", "");

        try {
            PrintWriter pw = getWriter(filename);

            for (PennTree pt : JCasUtil.select(aJCas, PennTree.class)) {
                String tree = StringUtils.normalizeSpace(pt.getPennTree());

                // detect and handle malformed trees
                if (!isTermiteFree(tree)) {
                    if (dropMalformedTrees) {
                        getLogger().warn("Dropping malformed tree: [" + tree + "].");
                        continue;
                    }
                    else {
                        throw new AnalysisEngineProcessException(new IllegalArgumentException(
                                "Found malformed tree: [" + tree + "]."));
                    }
                }

                // write comments and trees
                if (writeComments) {
                    pw.printf("# %s,%d,%d\n", documentId, pt.getBegin(), pt.getEnd());
                }
                pw.printf("%s\n", tree);
            }
        }
        catch (IOException e) {
            throw new AnalysisEngineProcessException(e);
        }
    }

    /**
     * Returns the corpus writer for the given file name, opening it on first use.
     *
     * @param aFilename
     *            the corpus file name without extension
     * @return the writer appending to the corpus file
     * @throws IOException
     *             if the corpus file cannot be opened
     */
    private PrintWriter getWriter(String aFilename)
        throws IOException
    {
        PrintWriter pw = writers.get(aFilename);
        if (pw == null) {
            pw = new PrintWriter(new OutputStreamWriter(new FileOutputStream(new File(
                    outputPath, aFilename + EXT_CORPUS)), "UTF-8"));
            writers.put(aFilename, pw);
        }
        return pw;
    }

    /**
     * Check if a given Penn tree will be rejected by TGrep2.
     *
     * @param aTree
     *            the Penn tree to check, may be {@code null}
     * @return true if aTree is fit for use with Tgrep2, false otherwise ({@code null} and empty
     *         strings are never fit)
     */
    private boolean isTermiteFree(String aTree)
    {
        int bracketCount = 0;
        boolean justOpened = false;

        if (aTree == null || aTree.isEmpty() || aTree.charAt(0) != '(') {
            return false;
        }

        for (int idx = 0; idx < aTree.length(); idx++) {
            char c = aTree.charAt(idx);
            switch (c) {
            case '(':
                bracketCount++;
                if (justOpened) {
                    // "((" is illegal, also with spaces in between
                    return false;
                }
                justOpened = true;
                break;
            case ' ':
                // spaces neither open nor close anything; keep the current state
                break;
            case ')':
                bracketCount--;
                if (justOpened) {
                    // "()" is illegal, also with spaces in between
                    return false;
                }
                if (bracketCount < 0) {
                    // more closing than opening brackets at any point are illegal
                    return false;
                }
                justOpened = false;
                break;
            default:
                justOpened = false;
                break;
            }
        }

        // if not all brackets are closed, the next sentence is thought to be part of this one
        // we consider these cases as illegal, as the files are usually built one sentence/line
        return bracketCount == 0;
    }

    /**
     * Closes all corpus files and, if {@link #PARAM_WRITE_T2C} is enabled, converts each of them
     * into a TGrep2 binary corpus.
     *
     * @throws AnalysisEngineProcessException
     *             if writing a corpus file failed or if tgrep2 could not be run successfully
     */
    @Override
    public void collectionProcessComplete()
        throws AnalysisEngineProcessException
    {
        // Remember which corpora were written before the writer map is emptied below.
        List<String> filenames = new ArrayList<String>(writers.keySet());

        String failedCorpus = null;
        for (Map.Entry<String, PrintWriter> entry : writers.entrySet()) {
            PrintWriter pw = entry.getValue();
            // PrintWriter never throws on write; checkError() flushes and reports whether any
            // earlier write failed, so truncated corpus files do not go unnoticed.
            if (pw.checkError() && failedCorpus == null) {
                failedCorpus = entry.getKey();
            }
            IOUtils.closeQuietly(pw);
        }
        // Do not keep closed writers around: a later process() call must open a fresh one
        // instead of silently writing into a closed stream.
        writers.clear();

        if (failedCorpus != null) {
            throw new AnalysisEngineProcessException(new IOException(
                    "Failed to write corpus file ["
                            + new File(outputPath, failedCorpus + EXT_CORPUS) + "]."));
        }

        if (writeT2c) {
            RuntimeProvider runtime = new RuntimeProvider(
                    "classpath:/de/tudarmstadt/ukp/dkpro/core/io/tgrep/bin/");
            try {
                tgrep2File = runtime.getFile("tgrep2");
                for (String filename : filenames) {
                    writeTgrepBinary(filename);
                }
            }
            catch (IOException e) {
                throw new AnalysisEngineProcessException(e);
            }
            finally {
                runtime.uninstall();
            }
        }
    }

    /**
     * Produces a TGrep2 binary corpus file.
     *
     * @param aFilename
     *            the name of the file from which a corpus file shall be created, without extension
     * @throws IOException
     *             if the employed tgrep2 process is interrupted or if it reports an error
     */
    private void writeTgrepBinary(String aFilename)
        throws IOException
    {
        List<String> cmd = new ArrayList<String>();
        cmd.add(tgrep2File.getAbsolutePath());
        if (writeComments) {
            // enable writing comments
            cmd.add("-C");
        }
        // specify corpus
        cmd.add("-p");
        cmd.add(new File(outputPath, aFilename + EXT_CORPUS).getAbsolutePath());
        cmd.add(new File(outputPath, aFilename + EXT_BINARY + compression.getExtension())
                .getAbsolutePath());

        getLogger().info("Running tgrep2 command: [" + StringUtils.join(cmd, " ") + "].");

        Process tgrepProcess = new ProcessBuilder(cmd).start();
        String error;
        try {
            // Drain stderr to EOF *before* waiting for the process: if the process writes more
            // than the pipe buffer holds while nobody reads, it blocks and waitFor() never
            // returns. Reading to EOF also yields the complete message rather than only the
            // bytes that happen to be available.
            InputStream stderr = tgrepProcess.getErrorStream();
            error = IOUtils.toString(stderr, "UTF-8");
            tgrepProcess.waitFor();
        }
        catch (InterruptedException e) {
            // Restore the interrupt flag for the caller and keep the cause in the exception.
            Thread.currentThread().interrupt();
            throw new IOException("Interrupted while waiting for tgrep2 to finish.", e);
        }
        finally {
            // Release the pipes of the child process; destroy() is a no-op if it already exited.
            IOUtils.closeQuietly(tgrepProcess.getOutputStream());
            IOUtils.closeQuietly(tgrepProcess.getInputStream());
            IOUtils.closeQuietly(tgrepProcess.getErrorStream());
            tgrepProcess.destroy();
        }

        // Any output on stderr is treated as a failure report of tgrep2. This is checked outside
        // of the finally block so that it cannot mask an exception that is already in flight.
        if (error.length() > 0) {
            getLogger().error(error);
            throw new IOException(error);
        }
    }
}