/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.bincas;
import static org.apache.commons.io.IOUtils.closeQuietly;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.SerialFormat;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.impl.CASImpl;
import org.apache.uima.cas.impl.CASMgrSerializer;
import org.apache.uima.cas.impl.Serialization;
import org.apache.uima.cas.impl.TypeSystemImpl;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.FsIndexDescription;
import org.apache.uima.resource.metadata.TypePriorities;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.CasIOUtils;
import org.apache.uima.util.CasLoadMode;
import org.apache.uima.util.TypeSystemUtil;
import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.parameter.MimeTypes;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils;
/**
 * UIMA Binary CAS formats reader.
 *
 * <p>
 * Each input file is inspected for the DKPro Core header ({@code DKPro1}). If the header is
 * present, a Java-serialized {@link CASMgrSerializer} carrying the type system is read from the
 * file before the CAS data. If it is absent and {@link #PARAM_TYPE_SYSTEM_LOCATION} is set, the
 * type system is read from that external location instead. In all other cases the file is handed
 * to {@link CasIOUtils} as-is.
 * </p>
 */
@MimeTypeCapability({ MimeTypes.APPLICATION_X_UIMA_BINARY })
public class BinaryCasReader
    extends ResourceCollectionReaderBase
{
    /** Magic bytes which precede a type system embedded by DKPro Core. */
    private static final byte[] DKPRO_HEADER = new byte[] { 'D', 'K', 'P', 'r', 'o', '1' };

    /** Read-ahead limit used when marking the stream for format detection. */
    private static final int HEADER_READ_LIMIT = 32;

    /**
     * The location from which to obtain the type system when the CAS is stored in form 0.
     */
    public static final String PARAM_TYPE_SYSTEM_LOCATION = "typeSystemLocation";
    @ConfigurationParameter(name=PARAM_TYPE_SYSTEM_LOCATION, mandatory=false)
    private String typeSystemLocation;

    /**
     * Determines whether the type system from a currently read file should be merged
     * with the current type system
     */
    public static final String PARAM_MERGE_TYPE_SYSTEM = "mergeTypeSystem";
    @ConfigurationParameter(name = PARAM_MERGE_TYPE_SYSTEM, mandatory = true, defaultValue = "false")
    private boolean mergeTypeSystem;

    /** Cached result of reading the external type system file; {@code null} until first read. */
    private CASMgrSerializer casMgrSerializer;

    /**
     * The reader's current type system. Set by {@link #typeSystemInit(TypeSystem)} when no
     * external type system location is configured, otherwise by {@link #readTypeSystem()}.
     */
    private TypeSystemImpl typeSystem;

    /**
     * Loads the next resource into the given CAS.
     *
     * <p>
     * If type system merging is enabled, the file is read twice: a first pass obtains the type
     * system of the file and re-initializes the CAS with the merge of that type system and the
     * reader's current one, and a second pass loads the actual data.
     * </p>
     *
     * @param aCAS
     *            the CAS to fill.
     * @throws IOException
     *             if the file cannot be read during the type system pass.
     * @throws CollectionException
     *             if the data cannot be loaded, the merged CAS cannot be created, or the JCas
     *             cannot be initialized.
     */
    @Override
    public void getNext(CAS aCAS)
        throws IOException, CollectionException
    {
        Resource res = nextFile();
        getLogger().debug("Reading CAS from [" + res.getLocation() + "]");

        TypeSystemImpl xts = null;

        if (mergeTypeSystem) {
            // First pass: find out which type system the input file uses
            TypeSystemDescription tsd;
            try (InputStream is = CompressionUtils.getInputStream(res.getLocation(),
                    res.getInputStream())) {
                BufferedInputStream bis = new BufferedInputStream(is);
                xts = detectTypeSystem(bis, aCAS, xts);

                if (xts != null) {
                    // Use the embedded or external type system if there is one
                    tsd = TypeSystemUtil.typeSystem2TypeSystemDescription(xts);
                }
                else {
                    // Otherwise load the CAS from the input file and use its type system
                    CasIOUtils.load(bis, null, aCAS, CasLoadMode.REINIT);
                    tsd = TypeSystemUtil.typeSystem2TypeSystemDescription(aCAS.getTypeSystem());
                }
            }

            try {
                // Merge the current type system with the one specified by the file being read.
                // NOTE(review): typeSystem is only assigned by typeSystemInit() when no type
                // system location is configured, or by readTypeSystem(). If a location is
                // configured and the file carries a DKPro header, it may still be null here -
                // confirm how typeSystem2TypeSystemDescription() copes with that.
                TypeSystemDescription mergedTypeSystem = CasCreationUtils.mergeTypeSystems(Arrays
                        .asList(TypeSystemUtil.typeSystem2TypeSystemDescription(typeSystem), tsd));

                // Create a new CAS based on the merged type system
                JCas mergedTypeSystemCas = CasCreationUtils.createCas(mergedTypeSystem,
                        (TypePriorities) null, (FsIndexDescription[]) null).getJCas();

                // Create a holder for the CAS metadata
                CASMgrSerializer mergedCasMgr = Serialization
                        .serializeCASMgr(mergedTypeSystemCas.getCasImpl());

                // Reinitialize CAS with merged type system
                ((CASImpl) aCAS).setupCasFromCasMgrSerializer(mergedCasMgr);
            }
            catch (CASException | ResourceInitializationException e) {
                throw new CollectionException(e);
            }
        }

        // Load the data. When merging, this is the second time the file is read, now into a CAS
        // which has been prepared with the merged type system.
        try (InputStream is = CompressionUtils.getInputStream(res.getLocation(),
                res.getInputStream())) {
            BufferedInputStream bis = new BufferedInputStream(is);
            xts = detectTypeSystem(bis, aCAS, xts);

            SerialFormat format;
            if (xts != null) {
                format = CasIOUtils.load(bis, aCAS, xts);
            }
            else {
                format = CasIOUtils.load(bis, aCAS);
            }
            getLogger().debug("Found format " + format);
        }
        catch (IOException e) {
            throw new CollectionException(e);
        }

        // Initialize the JCas sub-system which is the most often used API in DKPro Core components
        try {
            aCAS.getJCas();
        }
        catch (CASException e) {
            throw new CollectionException(e);
        }
    }

    /**
     * Checks whether the stream starts with the DKPro Core header and determines the type system
     * to use for loading.
     *
     * <p>
     * If the header is found, the stream is left positioned behind the embedded type system.
     * Otherwise the stream is reset to its start and, if an external type system location is
     * configured, the external type system is used.
     * </p>
     *
     * @param aStream
     *            the (already decompressed) input, positioned at its start.
     * @param aCAS
     *            the CAS which may get re-initialized with the external type system.
     * @param aKnownTs
     *            a type system obtained in a previous pass over the same file, or {@code null}.
     * @return the type system to load the data with, or {@code aKnownTs} if the file has no DKPro
     *         header and no external type system location is configured.
     */
    private TypeSystemImpl detectTypeSystem(BufferedInputStream aStream, CAS aCAS,
            TypeSystemImpl aKnownTs)
        throws IOException, CollectionException
    {
        // Prepare for format detection
        aStream.mark(HEADER_READ_LIMIT);
        byte[] header = readHeader(aStream);

        // If it is DKPro Core format, read the embedded type system
        if (Arrays.equals(header, DKPRO_HEADER)) {
            return readDKProHeader(aStream, aKnownTs);
        }

        // No embedded DKPro TS, rewind so that the loader sees the complete file
        aStream.reset();

        // Try reading an externalized type system instead
        if (typeSystemLocation != null) {
            TypeSystemImpl externalTs = readTypeSystem();
            initCasFromExternalTypeSystem(header, aCAS);
            return externalTs;
        }

        return aKnownTs;
    }

    /**
     * Reads as many leading bytes as the DKPro Core header is long.
     *
     * <p>
     * A single {@code read} call is allowed to return fewer bytes than requested, so the call is
     * repeated until the buffer is full or the end of the stream is reached.
     * </p>
     *
     * @param aStream
     *            the stream to read from.
     * @return a fresh buffer; bytes beyond the end of a shorter stream are left at zero.
     */
    private static byte[] readHeader(InputStream aStream)
        throws IOException
    {
        byte[] header = new byte[DKPRO_HEADER.length];
        int filled = 0;
        while (filled < header.length) {
            int count = aStream.read(header, filled, header.length - filled);
            if (count < 0) {
                break;
            }
            filled += count;
        }
        return header;
    }

    /**
     * Reads the Java-serialized {@link CASMgrSerializer} which follows the DKPro Core header.
     *
     * @param aStream
     *            the input, positioned directly behind the DKPro Core header.
     * @param aKnownTs
     *            a type system obtained earlier, or {@code null}.
     * @return {@code aKnownTs} if it is not {@code null}, otherwise the committed type system
     *         from the file.
     * @throws CollectionException
     *             if the embedded type system cannot be deserialized.
     */
    private TypeSystemImpl readDKProHeader(BufferedInputStream aStream, TypeSystemImpl aKnownTs)
        throws CollectionException
    {
        getLogger().debug("Found DKPro-Core-style embedded type system");
        TypeSystemImpl ts = aKnownTs;
        try {
            // The object stream is deliberately not closed: closing it would also close the
            // underlying stream, from which the caller still has to load the CAS data.
            // NOTE(review): this is native Java deserialization of file content - only use this
            // reader with trusted input.
            ObjectInputStream ois = new ObjectInputStream(aStream);
            // The object is always consumed, even if its type system is not needed, so that the
            // stream ends up positioned at the CAS data.
            CASMgrSerializer casMgr = (CASMgrSerializer) ois.readObject();
            if (ts == null) {
                ts = casMgr.getTypeSystem();
                ts.commit();
            }
        }
        catch (IOException | ClassNotFoundException e) {
            throw new CollectionException(e);
        }
        return ts;
    }

    /**
     * Remembers the type system handed in by the framework, unless an external type system
     * location is configured.
     */
    @Override
    public void typeSystemInit(TypeSystem aTypeSystem)
        throws ResourceInitializationException
    {
        if (typeSystemLocation == null) {
            typeSystem = (TypeSystemImpl) aTypeSystem;
        }
    }

    /**
     * It is possible that the type system overlaps with the scan pattern for files, e.g. because
     * the type system ends in {@code .ser} and the resources also end in {@code .ser}. If this is
     * the case, we filter the type system file from the resource files during scanning.
     */
    @Override
    protected Collection<Resource> scan(String aBase, Collection<String> aIncludes,
            Collection<String> aExcludes)
        throws IOException
    {
        Collection<Resource> resources = super.scan(aBase, aIncludes, aExcludes);
        if (typeSystemLocation != null) {
            org.springframework.core.io.Resource r = getTypeSystemResource();
            resources.remove(new Resource(null, null, r.getURI(), null, null, r));
        }
        return resources;
    }

    /**
     * Resolves the configured type system location. Callers must make sure that a location is
     * configured.
     *
     * @return the resource for the type system location, resolved as-is if the location looks
     *         absolute and relative to the base location otherwise.
     */
    protected org.springframework.core.io.Resource getTypeSystemResource()
        throws MalformedURLException
    {
        // A location with a scheme/drive separator or a leading slash is treated as absolute
        boolean absolute = typeSystemLocation.indexOf(':') != -1
                || typeSystemLocation.startsWith("/")
                || typeSystemLocation.startsWith(File.separator);

        if (absolute) {
            // If the type system location is absolute, resolve it absolute
            return getResolver().getResource(locationToUrl(typeSystemLocation));
        }

        // If the type system is not absolute, resolve it relative to the base location
        return getResolver().getResource(getBase() + typeSystemLocation);
    }

    /**
     * Returns the reader's current type system, reading it from the external type system file if
     * none is set yet.
     *
     * @return the type system, or {@code null} if no type system location is configured.
     */
    private TypeSystemImpl readTypeSystem()
        throws IOException
    {
        if (typeSystemLocation == null) {
            return null;
        }

        if (typeSystem == null) {
            CASMgrSerializer casMgr = readCasManager();
            typeSystem = casMgr.getTypeSystem();
            typeSystem.commit();
        }

        return typeSystem;
    }

    /**
     * If we encounter a Java-serialized file with an external type system, then we reinitialize
     * the CAS with the external type system prior to loading the data.
     *
     * @param aHeader
     *            the leading bytes of the data file.
     * @param aCAS
     *            the CAS to re-initialize.
     */
    private void initCasFromExternalTypeSystem(byte[] aHeader, CAS aCAS)
        throws IOException
    {
        // 0xAC 0xED are the first two bytes of a Java object serialization stream
        boolean javaSerialized = aHeader[0] == (byte) 0xAC && aHeader[1] == (byte) 0xED;
        if (javaSerialized) {
            CASMgrSerializer casMgr = readCasManager();
            ((CASImpl) aCAS).setupCasFromCasMgrSerializer(casMgr);
        }
    }

    /**
     * Reads the Java-serialized {@link CASMgrSerializer} from the external type system location.
     * The result is cached, so the file is read at most once.
     *
     * @return the CAS manager, or {@code null} if no type system location is configured.
     * @throws IOException
     *             if the file cannot be read or does not contain a known class.
     */
    private CASMgrSerializer readCasManager()
        throws IOException
    {
        if (typeSystemLocation == null) {
            return null;
        }

        // If we already read the type system, return it - do not read it again.
        if (casMgrSerializer != null) {
            return casMgrSerializer;
        }

        org.springframework.core.io.Resource r = getTypeSystemResource();
        getLogger().debug("Reading type system from [" + r.getURI() + "]");

        // Every layer is tracked separately so that the layers opened so far get closed even if
        // wrapping them fails, e.g. because the file is not a Java object stream.
        InputStream raw = null;
        InputStream unpacked = null;
        ObjectInputStream ois = null;
        try {
            raw = r.getInputStream();
            unpacked = CompressionUtils.getInputStream(typeSystemLocation, raw);
            // NOTE(review): this is native Java deserialization of file content - only point the
            // type system location at trusted files.
            ois = new ObjectInputStream(unpacked);
            casMgrSerializer = (CASMgrSerializer) ois.readObject();
        }
        catch (ClassNotFoundException e) {
            throw new IOException(e);
        }
        finally {
            closeQuietly(ois);
            closeQuietly(unpacked);
            closeQuietly(raw);
        }

        return casMgrSerializer;
    }
}