/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.bincas;
import static org.apache.uima.cas.SerialFormat.BINARY;
import static org.apache.uima.cas.SerialFormat.COMPRESSED;
import static org.apache.uima.cas.SerialFormat.COMPRESSED_FILTERED;
import static org.apache.uima.cas.SerialFormat.COMPRESSED_FILTERED_TS;
import static org.apache.uima.cas.SerialFormat.COMPRESSED_FILTERED_TSI;
import static org.apache.uima.cas.SerialFormat.COMPRESSED_TSI;
import static org.apache.uima.cas.SerialFormat.SERIALIZED;
import static org.apache.uima.cas.SerialFormat.SERIALIZED_TSI;
import static org.apache.uima.cas.impl.Serialization.serializeCASMgr;
import static org.apache.uima.cas.impl.Serialization.serializeWithCompression;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.SerialFormat;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.impl.CASCompleteSerializer;
import org.apache.uima.cas.impl.CASMgrSerializer;
import org.apache.uima.cas.impl.CASSerializer;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.CasIOUtils;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasFileWriter_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
/**
* <p>Write CAS in one of the UIMA binary formats.</p>
*
* <p>All the supported formats except <code>6+</code> can also be loaded and saved via the UIMA
* {@link CasIOUtils}.</p>
*
* <table>
* <caption>Supported formats</caption>
* <tr>
* <th>Format</th>
* <th>Description</th>
* <th>Type system on load</th>
* <th>CAS Addresses preserved</th>
* </tr>
* <tr>
* <td><code>SERIALIZED</code> or <code>S</code></td>
* <td>CAS structures are dumped to disc as they are using Java serialization ({@link CASSerializer}
* ). Because these structures are pre-allocated in memory at larger sizes than what is actually
* required, files in this format may be larger than necessary. However, the CAS addresses of
* feature structures are preserved in this format. When the data is loaded back into a CAS, it must
* have been initialized with the same type system as the original CAS.</td>
* <td>must be the same</td>
* <td>yes</td>
* </tr>
* <tr>
* <td><code>SERIALIZED_TSI</code> or <code>S+</code></td>
* <td>CAS structures are dumped to disc as they are using Java serialization as in form 0, but
* now using the {@link CASCompleteSerializer} which includes CAS metadata like type system and
* index repositories.</td>
* <td>is reinitialized</td>
* <td>yes</td>
* </tr>
* <tr>
* <td><code>BINARY</code> or 0</td>
* <td>CAS structures are dumped to disc as they are using Java serialization ({@link CASSerializer}
* ). This is basically the same as format {@code S} but includes a UIMA header and can be read
* using {@link org.apache.uima.cas.impl.Serialization#deserializeCAS}.</td>
* <td>must be the same</td>
* <td>yes</td>
* </tr>
* <tr>
* <td><code>BINARY_TSI</code></td>
* <td>The same as <code>BINARY</code>, except that the type system and index configuration
* are also stored in the file. However, lenient loading or reinitializing the CAS with this
* information is presently not supported.</td>
* <td>must be the same</td>
* <td>yes</td>
* </tr>
* <tr>
* <td><code>COMPRESSED</code> or <code>4</code></td>
* <td>
* UIMA binary serialization saving all feature structures (reachable or not). This format
* internally uses gzip compression and a binary representation of the CAS, making it much more
* efficient than format 0.</td>
* <td>must be the same</td>
* <td>yes</td>
* </tr>
* <tr>
* <td><code>COMPRESSED_FILTERED</code> or <code>6</code></td>
* <td>
* UIMA binary serialization as format 4, but saving only reachable feature structures.</td>
* <td>must be the same</td>
* <td>no</td>
* </tr>
* <tr>
* <td>6+</td>
* <td>
* <b>This is a legacy format specific to DKPro Core.</b> Since UIMA 2.9.0, <code>COMPRESSED_FILTERED_TSI</code>
* is supported and should be used instead of this format. UIMA binary serialization as format 6,
* but also contains the type system definition. This allows the {@link BinaryCasReader} to load data
* leniently into a CAS that has been initialized with a different type system.</td>
* <td>lenient loading</td>
* <td>no</td>
* </tr>
* <tr>
* <td><code>COMPRESSED_FILTERED_TS</code></td>
* <td>
* Same as <code>COMPRESSED_FILTERED</code>, but also contains the type system definition. This
* allows the {@link BinaryCasReader} to load data leniently into a CAS that has been initialized
* with a different type system.</td>
* <td>lenient loading</td>
* <td>no</td>
* </tr>
* <tr>
* <td><code>COMPRESSED_FILTERED_TSI</code></td>
* <td>
* <b>Default</b>. UIMA binary serialization as format 6, but also contains the type system
* definition and index definitions. This allows the {@link BinaryCasReader} to load data leniently
* into a CAS that has been initialized with a different type system.</td>
* <td>lenient loading</td>
* <td>no</td>
* </tr>
* </table>
*
* @see <a href="http://uima.apache.org/d/uimaj-2.9.0/references.html#ugr.ref.compress">Compressed
* Binary CASes</a>
*/
@MimeTypeCapability({ MimeTypes.APPLICATION_X_UIMA_BINARY })
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
public class BinaryCasWriter
extends JCasFileWriter_ImplBase
{
public static final String AUTO = "AUTO";
/**
* Location to write the type system to. The type system is saved using Java serialization, it
* is not saved as a XML type system description. We recommend to use the name
* {@code typesystem.ser}.
* <br>
* The {@link #PARAM_COMPRESSION} parameter has no effect on the
* type system. Instead, if the type system file should be compressed or not is detected from
* the file name extension (e.g. ".gz").
* <br>
* If this parameter is set, the type system and index repository are no longer serialized into
* the same file as the test of the CAS. The {@link SerializedCasReader} can currently not
* read such files. Use this only if you really know what you are doing.
* <br>
* This parameter has no effect if formats S+ or 6+ are used as the type system information
* is embedded in each individual file. Otherwise, it is recommended that this parameter be
* set unless some other mechanism is used to initialize the CAS with the same type system and
* index repository during reading that was used during writing.
*/
public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation";
@ConfigurationParameter(name = PARAM_TYPE_SYSTEM_LOCATION, mandatory = false)
private String typeSystemLocation;
public static final String PARAM_FORMAT = "format";
@ConfigurationParameter(name = PARAM_FORMAT, mandatory = true, defaultValue = "COMPRESSED_FILTERED_TSI")
private String format;
/**
* The file extension. If this is set to {@link AUTO}, then the extension will be chosen based
* on the default extension specified by the UIMA {@link SerialFormat} class. However, this
* only works when using the new long format names (e.g. <code>COMPRESSED_FILTERED_TSI</code>).
* When using the old short names (e.g. <code>6</code>), the default extension <i>.bin</i> is
* used.
*/
public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION;
@ConfigurationParameter(name=PARAM_FILENAME_EXTENSION, mandatory=true, defaultValue=AUTO)
private String filenameExtension;
private boolean typeSystemWritten;
@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException
{
super.initialize(aContext);
if (AUTO.equals(filenameExtension)) {
try {
filenameExtension = SerialFormat.valueOf(format).getDefaultFileExtension();
}
catch (IllegalArgumentException e) {
filenameExtension = ".bin";
}
}
}
@Override
public void process(JCas aJCas)
throws AnalysisEngineProcessException
{
try (NamedOutputStream docOS = getOutputStream(aJCas, filenameExtension)) {
if ("S".equals(format) || SERIALIZED.toString().equals(format)) {
// Java-serialized CAS without type system
getLogger().debug("Writing CAS to [" + docOS + "]");
// CASSerializer serializer = new CASSerializer();
// serializer.addCAS(aJCas.getCasImpl());
// ObjectOutputStream objOS = new ObjectOutputStream(docOS);
// objOS.writeObject(serializer);
// objOS.flush();
CasIOUtils.save(aJCas.getCas(), docOS, SERIALIZED);
}
else if ("S+".equals(format) || SERIALIZED_TSI.toString().equals(format)) {
// Java-serialized CAS with type system
// ObjectOutputStream objOS = new ObjectOutputStream(docOS);
// CASCompleteSerializer serializer = serializeCASComplete(aJCas.getCasImpl());
// objOS.writeObject(serializer);
// objOS.flush();
CasIOUtils.save(aJCas.getCas(), docOS, SERIALIZED_TSI);
typeSystemWritten = true; // Embedded type system
}
else if ("0".equals(format) || BINARY.toString().equals(format)) {
// Java-serialized CAS without type system
// serializeCAS(aJCas.getCas(), docOS);
CasIOUtils.save(aJCas.getCas(), docOS, BINARY);
}
else if (BINARY.toString().equals(format)) {
// Java-serialized CAS without type system
CasIOUtils.save(aJCas.getCas(), docOS, SerialFormat.BINARY_TSI);
}
else if ("4".equals(format) || COMPRESSED.toString().equals(format)) {
// Binary compressed CAS without type system (form 4)
// serializeWithCompression(aJCas.getCas(), docOS);
CasIOUtils.save(aJCas.getCas(), docOS, COMPRESSED);
}
else if (COMPRESSED_TSI.toString().equals(format)) {
CasIOUtils.save(aJCas.getCas(), docOS, COMPRESSED_TSI);
}
else if (format.equals("6") || COMPRESSED_FILTERED.toString().equals(format)) {
CasIOUtils.save(aJCas.getCas(), docOS, COMPRESSED_FILTERED);
}
else if (COMPRESSED_FILTERED_TS.toString().equals(format)) {
CasIOUtils.save(aJCas.getCas(), docOS, COMPRESSED_FILTERED_TS);
}
else if (COMPRESSED_FILTERED_TSI.toString().equals(format)) {
CasIOUtils.save(aJCas.getCas(), docOS, COMPRESSED_FILTERED_TSI);
}
else if (format.equals("6+")) {
// LEGACY ... with embedded Java-serialized type system DKPro-Style
writeHeader(docOS);
writeTypeSystem(aJCas, docOS);
typeSystemWritten = true; // Embedded type system
serializeWithCompression(aJCas.getCas(), docOS, (TypeSystem) null);
}
else {
throw new IllegalArgumentException("Unknown format [" + format
+ "]. Must be S, S+, 0, 4, 6, or 6+");
}
}
catch (Exception e) {
throw new AnalysisEngineProcessException(e);
}
// To support writing to ZIPs, the type system must be written separately from the CAS data
try {
if (typeSystemLocation != null && !typeSystemWritten) {
writeTypeSystem(aJCas);
typeSystemWritten = true;
}
}
catch (IOException e) {
throw new AnalysisEngineProcessException(e);
}
}
private void writeTypeSystem(JCas aJCas)
throws IOException
{
// If the type system location is an absolute file system location, write it there,
// otherwise use the default storage which places the file relative to the target location
if (!typeSystemLocation.startsWith(JAR_PREFIX) && new File(typeSystemLocation).isAbsolute()) {
try (OutputStream typeOS = CompressionUtils.getOutputStream(new File(typeSystemLocation))) {
getLogger().debug("Writing type system to [" + typeSystemLocation + "]");
writeTypeSystem(aJCas, typeOS);
}
}
else {
try (NamedOutputStream typeOS = getOutputStream(typeSystemLocation, "")) {
getLogger().debug("Writing type system to [" + typeOS + "]");
writeTypeSystem(aJCas, typeOS);
}
}
}
private void writeHeader(OutputStream aOS)
throws IOException
{
byte[] header = new byte[] { 'D', 'K', 'P', 'r', 'o', '1' };
DataOutputStream dataOS = new DataOutputStream(aOS);
dataOS.write(header);
dataOS.flush();
}
private void writeTypeSystem(JCas aJCas, OutputStream aOS)
throws IOException
{
ObjectOutputStream typeOS = new ObjectOutputStream(aOS);
CASMgrSerializer casMgrSerializer = serializeCASMgr(aJCas.getCasImpl());
typeOS.writeObject(casMgrSerializer);
typeOS.flush();
}
}