/*
 * Copyright 2010
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.api.io;

import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.output.CloseShieldOutputStream;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasConsumer_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;

/**
 * Base class for consumers that write one output per CAS. Depending on the configuration, the
 * stream handed to the subclass is backed by
 * <ul>
 * <li>stdout, if no target location is configured,</li>
 * <li>an entry in a ZIP/JAR archive, if the target location starts with {@link #JAR_PREFIX},</li>
 * <li>one shared file, if {@link #PARAM_SINGULAR_TARGET} is enabled, or</li>
 * <li>an individual file below the target location otherwise.</li>
 * </ul>
 * Streams that are shared between documents (the archive and the singular target) are closed in
 * {@link #collectionProcessComplete()}.
 */
public abstract class JCasFileWriter_ImplBase
    extends JCasConsumer_ImplBase
{
    /** Prefix marking a target location as a ZIP/JAR archive (optionally with {@code !prefix}). */
    protected static final String JAR_PREFIX = "jar:file:";

    /**
     * Target location. If this parameter is not set, data is written to stdout.
     */
    public static final String PARAM_TARGET_LOCATION = ComponentParameters.PARAM_TARGET_LOCATION;
    @ConfigurationParameter(name=PARAM_TARGET_LOCATION, mandatory=false)
    private String targetLocation;

    /**
     * Treat target location as a single file name. This is particularly useful if only a single
     * input file is processed and the result should be written to a pre-defined output file
     * instead of deriving the file name from the document URI or document ID. It can also be
     * useful if the user wishes to force multiple input files to be written to a single target
     * file. The latter case does not work for all formats (e.g. binary, XMI, etc.), but can be
     * useful, e.g. for Conll-based formats. This option has no effect if the target location
     * points to an archive location (ZIP/JAR). In this mode the target file name is used exactly
     * as configured: no format or compression extension is appended, and
     * {@link #PARAM_STRIP_EXTENSION} has no effect as the original file name is not used. The
     * value of {@link #PARAM_COMPRESSION} is not consulted by this class in this mode; the stream
     * is obtained from {@link CompressionUtils#getOutputStream(File)} for the configured file.
     */
    public static final String PARAM_SINGULAR_TARGET = "singularTarget";
    @ConfigurationParameter(name=PARAM_SINGULAR_TARGET, mandatory=true, defaultValue="false")
    private boolean singularTarget;

    /**
     * Choose a compression method. (default: {@link CompressionMethod#NONE})
     *
     * @see CompressionMethod
     */
    public static final String PARAM_COMPRESSION = "compression";
    @ConfigurationParameter(name=PARAM_COMPRESSION, mandatory=false, defaultValue="NONE")
    private CompressionMethod compression;

    /**
     * Remove the original extension.
     */
    public static final String PARAM_STRIP_EXTENSION = "stripExtension";
    @ConfigurationParameter(name=PARAM_STRIP_EXTENSION, mandatory=true, defaultValue="false")
    private boolean stripExtension;

    /**
     * Use the document ID as file name even if a relative path information is present.
     */
    public static final String PARAM_USE_DOCUMENT_ID = "useDocumentId";
    @ConfigurationParameter(name=PARAM_USE_DOCUMENT_ID, mandatory=true, defaultValue="false")
    private boolean useDocumentId;

    /**
     * URL-encode the document ID in the file name to avoid illegal characters (e.g. \, :, etc.)
     */
    public static final String PARAM_ESCAPE_DOCUMENT_ID = "escapeDocumentId";
    @ConfigurationParameter(name=PARAM_ESCAPE_DOCUMENT_ID, mandatory=true, defaultValue="true")
    private boolean escapeDocumentId;

    /**
     * Allow overwriting target files. When writing to a ZIP/JAR archive, the check is applied to
     * the archive file as a whole at the time it is first opened; individual entries are not
     * checked.
     */
    public static final String PARAM_OVERWRITE = "overwrite";
    @ConfigurationParameter(name = PARAM_OVERWRITE, mandatory = true, defaultValue = "false")
    private boolean overwrite;

    // State of the archive that is kept open across documents (ZIP/JAR mode only).
    private ZipOutputStream zipOutputStream;
    private String zipPath;
    private String zipEntryPrefix;

    // Stream that is kept open across documents in singular target mode.
    private OutputStream singularTargetStream;

    /**
     * @return the configured compression method.
     */
    protected CompressionMethod getCompressionMethod()
    {
        return compression;
    }

    /**
     * @return whether the original extension is removed from the relative path.
     */
    protected boolean isStripExtension()
    {
        return stripExtension;
    }

    /**
     * @return whether the document ID is used as file name even if path information is present.
     */
    protected boolean isUseDocumentId()
    {
        return useDocumentId;
    }

    /**
     * Closes the streams shared between documents (archive and singular target). Both streams are
     * always attempted; the first failure is reported because a failed close can leave the output
     * incomplete (e.g. closing the archive is what finishes it).
     *
     * @throws AnalysisEngineProcessException
     *             if one of the shared streams could not be closed.
     */
    @Override
    public void collectionProcessComplete()
        throws AnalysisEngineProcessException
    {
        IOException zipFailure = tryClose(zipOutputStream);
        zipOutputStream = null;

        IOException singularFailure = tryClose(singularTargetStream);
        singularTargetStream = null;

        IOException failure = (zipFailure != null) ? zipFailure : singularFailure;
        if (failure != null) {
            throw new AnalysisEngineProcessException(failure);
        }

        super.collectionProcessComplete();
    }

    /**
     * Releases any shared stream that is still open, e.g. because processing was aborted before
     * {@link #collectionProcessComplete()} was reached. This is a best-effort cleanup; errors are
     * deliberately ignored because there is nobody left to report them to at this point.
     */
    @Override
    public void destroy()
    {
        closeQuietly(zipOutputStream);
        zipOutputStream = null;

        closeQuietly(singularTargetStream);
        singularTargetStream = null;

        super.destroy();
    }

    /**
     * Closes the given stream and hands back the failure instead of throwing it, so the caller
     * can continue closing other streams.
     *
     * @param aStream
     *            the stream to close; may be {@code null}.
     * @return the exception raised while closing, or {@code null} if closing succeeded or there
     *         was nothing to close.
     */
    private static IOException tryClose(OutputStream aStream)
    {
        if (aStream == null) {
            return null;
        }
        try {
            aStream.close();
            return null;
        }
        catch (IOException e) {
            return e;
        }
    }

    /**
     * Get the output stream for the document in the given CAS. The relative path is derived via
     * {@link #getRelativePath(JCas)} unless output goes to stdout.
     *
     * @param aJCas
     *            the CAS being written.
     * @param aExtension
     *            the extension appended to the relative path.
     * @return the stream to write to; closing it does not close stdout or a shared stream.
     * @throws IOException
     *             if the target cannot be opened or already exists and overwriting is disabled.
     */
    protected NamedOutputStream getOutputStream(JCas aJCas, String aExtension)
        throws IOException
    {
        // Checked here as well so that no document metadata is required when writing to stdout.
        if (targetLocation == null) {
            return new NamedOutputStream(null, new CloseShieldOutputStream(System.out));
        }
        return getOutputStream(getRelativePath(aJCas), aExtension);
    }

    /**
     * @return the configured target location, or {@code null} if writing to stdout.
     */
    protected String getTargetLocation()
    {
        return targetLocation;
    }

    /**
     * Get the output stream for the given relative path.
     *
     * @param aRelativePath
     *            the path relative to the target location (ignored for stdout and in singular
     *            target mode).
     * @param aExtension
     *            the extension appended to the relative path.
     * @return the stream to write to; closing it does not close stdout or a shared stream.
     * @throws IOException
     *             if the target cannot be opened or already exists and overwriting is disabled.
     */
    protected NamedOutputStream getOutputStream(String aRelativePath, String aExtension)
        throws IOException
    {
        if (targetLocation == null) {
            return new NamedOutputStream(null, new CloseShieldOutputStream(System.out));
        }
        else if (targetLocation.startsWith(JAR_PREFIX)) {
            if (zipOutputStream == null) {
                // Split "jar:file:<archive>!<prefix>" into the archive path and the entry prefix
                zipPath = targetLocation.substring(JAR_PREFIX.length());
                zipEntryPrefix = "";
                int sep = zipPath.indexOf('!');
                if (sep > -1) {
                    zipEntryPrefix = zipPath.substring(sep + 1);
                    zipPath = zipPath.substring(0, sep);
                }
                if (zipEntryPrefix.length() > 0 && !zipEntryPrefix.endsWith("/")) {
                    zipEntryPrefix += '/';
                }

                File zipFile = new File(zipPath);
                if (!overwrite && zipFile.exists()) {
                    throw new IOException("Target file [" + zipFile
                            + "] already exists and overwriting not enabled.");
                }

                zipOutputStream = new ZipOutputStream(new FileOutputStream(zipFile));
            }

            // Begin new entry
            // NOTE(review): the entry name carries the compression extension, but the data is
            // written straight to the ZIP stream without an additional compression wrapper in
            // this class - confirm that this naming is intended.
            ZipEntry entry = new ZipEntry(zipEntryPrefix + aRelativePath + aExtension
                    + compression.getExtension());
            zipOutputStream.putNextEntry(entry);

            // We return an OutputStream for an individual entry. When this is closed by the
            // caller, it actually closes the entry. The full ZIP stream is closed when the
            // collectionProcessComplete event is triggered
            return new ZipEntryOutputStream(JAR_PREFIX + zipPath + '!' + entry.getName(),
                    zipOutputStream);
        }
        else if (singularTarget) {
            File outputFile = new File(targetLocation);

            if (singularTargetStream == null) {
                // The overwrite check only applies when the shared file is first opened
                if (!overwrite && outputFile.exists()) {
                    throw new IOException("Target file [" + outputFile
                            + "] already exists and overwriting not enabled.");
                }

                singularTargetStream = CompressionUtils.getOutputStream(outputFile);
            }

            // Shield the shared stream so that callers closing their stream do not end it
            return new NamedOutputStream(outputFile.getAbsolutePath(),
                    new CloseShieldOutputStream(singularTargetStream));
        }
        else {
            File outputFile = new File(targetLocation, aRelativePath + aExtension
                    + compression.getExtension());

            if (!overwrite && outputFile.exists()) {
                throw new IOException("Target file [" + outputFile
                        + "] already exists and overwriting not enabled.");
            }

            return new NamedOutputStream(outputFile.getAbsolutePath(),
                    CompressionUtils.getOutputStream(outputFile));
        }
    }

    /**
     * Get the relative path from the CAS. If the CAS does not contain relative path information or
     * if {@link #PARAM_USE_DOCUMENT_ID} is set, the document ID is used.
     *
     * @param aJCas
     *            a CAS.
     * @return the relative target path.
     * @throws IllegalStateException
     *             if the base URI is not a prefix of the document URI, or if the document ID is
     *             needed but not set.
     */
    protected String getRelativePath(JCas aJCas)
    {
        DocumentMetaData meta = DocumentMetaData.get(aJCas);
        String baseUri = meta.getDocumentBaseUri();
        String docUri = meta.getDocumentUri();

        if (!useDocumentId && StringUtils.isNotEmpty(baseUri)) {
            // In some cases, the baseUri may not end with a slash - if so, we add one
            if (!baseUri.endsWith("/")) {
                baseUri += '/';
            }

            if ((docUri == null) || !docUri.startsWith(baseUri)) {
                throw new IllegalStateException("Base URI [" + baseUri
                        + "] is not a prefix of document URI [" + docUri + "]");
            }

            String relativeDocumentPath = docUri.substring(baseUri.length());
            if (stripExtension) {
                relativeDocumentPath = FilenameUtils.removeExtension(relativeDocumentPath);
            }

            // relativeDocumentPath must not start with a slash - if there are any, remove them
            while (relativeDocumentPath.startsWith("/")) {
                relativeDocumentPath = relativeDocumentPath.substring(1);
            }

            return relativeDocumentPath;
        }
        else {
            if (meta.getDocumentId() == null) {
                throw new IllegalStateException(
                        "Neither base URI/document URI nor document ID set");
            }

            String relativeDocumentPath = meta.getDocumentId();
            if (stripExtension) {
                relativeDocumentPath = FilenameUtils.removeExtension(relativeDocumentPath);
            }

            if (escapeDocumentId) {
                try {
                    relativeDocumentPath = URLEncoder.encode(relativeDocumentPath, "UTF-8");
                }
                catch (UnsupportedEncodingException e) {
                    // UTF-8 must be supported on all Java platforms per specification. This should
                    // not happen.
                    throw new IllegalStateException(e);
                }
            }

            return relativeDocumentPath;
        }
    }

    /**
     * Output stream that delegates to another stream and additionally carries a name describing
     * where the data goes. A {@code null} name stands for stdout.
     */
    public static class NamedOutputStream
        extends OutputStream
    {
        private final String name;
        protected final OutputStream outputStream;

        /**
         * @param aName
         *            the name of the target, or {@code null} for stdout.
         * @param aOutputStream
         *            the stream all operations are delegated to.
         */
        public NamedOutputStream(String aName, OutputStream aOutputStream)
        {
            super();
            name = aName;
            outputStream = aOutputStream;
        }

        /**
         * @return the name of the target, or {@code null} for stdout.
         */
        public String getName()
        {
            return name;
        }

        @Override
        public void write(int paramInt)
            throws IOException
        {
            outputStream.write(paramInt);
        }

        @Override
        public void write(byte[] paramArrayOfByte)
            throws IOException
        {
            outputStream.write(paramArrayOfByte);
        }

        @Override
        public void write(byte[] paramArrayOfByte, int paramInt1, int paramInt2)
            throws IOException
        {
            outputStream.write(paramArrayOfByte, paramInt1, paramInt2);
        }

        @Override
        public void flush()
            throws IOException
        {
            outputStream.flush();
        }

        @Override
        public void close()
            throws IOException
        {
            outputStream.close();
        }

        @Override
        public String toString()
        {
            return getName() != null ? getName() : "<stdout>";
        }
    }

    /**
     * Stream for a single archive entry: closing it only closes the current entry and leaves the
     * underlying archive stream open for further entries.
     */
    private static class ZipEntryOutputStream
        extends NamedOutputStream
    {
        public ZipEntryOutputStream(String aName, ZipOutputStream aOutputStream)
        {
            super(aName, aOutputStream);
        }

        @Override
        public void close()
            throws IOException
        {
            ((ZipOutputStream) outputStream).closeEntry();
        }
    }
}