/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.bincas;
import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.initRandomCas;
import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.measureReadPerformance;
import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.measureWritePerformance;
import static de.tudarmstadt.ukp.dkpro.core.performance.PerformanceTestUtil.repeat;
import static org.apache.commons.io.FileUtils.readFileToString;
import static org.apache.uima.cas.impl.Serialization.deserializeCASComplete;
import static org.apache.uima.cas.impl.Serialization.serializeCASComplete;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
import static org.apache.uima.fit.factory.TypeSystemDescriptionFactory.createTypeSystemDescription;
import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.PrefixFileFilter;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.SerialFormat;
import org.apache.uima.cas.impl.CASCompleteSerializer;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.CasCreationUtils;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Rule;
import org.junit.Test;
import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionMethod;
import de.tudarmstadt.ukp.dkpro.core.io.text.TextReader;
import de.tudarmstadt.ukp.dkpro.core.testing.DkproTestContext;
public class BinaryCasWriterReaderTest
{
private static final int NONE = 1;
private static final int METADATA = 2;
private static final int ALL = 3;
private File testFolder;
@Before
public void setup()
{
testFolder = testContext.getTestOutputFolder();
}
@Test
public void testSReinitialize()
throws Exception
{
write(testFolder.getPath(), SerialFormat.SERIALIZED.toString(), true);
read(testFolder.getPath(), NONE, true, false); // Type system is reinitialized from the persisted type system
read(testFolder.getPath(), NONE, true, true);
}
@Test
public void testSReinitializeInZIP()
throws Exception
{
write("jar:file:" + testFolder.getPath() + "/archive.zip", "S", true);
read("jar:file:" + testFolder.getPath() + "/archive.zip", NONE, true, false); // Type system is reinitialized from the persisted type system
read("jar:file:" + testFolder.getPath() + "/archive.zip", NONE, true, true);
}
@Test
public void testSPreinitialized()
throws Exception
{
write(testFolder.getPath(), "S", false);
read(testFolder.getPath(), ALL, false, false);
read(testFolder.getPath(), ALL, false, true);
}
@Test
public void testSplusReinitialize()
throws Exception
{
write(testFolder.getPath(), "S+", false);
read(testFolder.getPath(), NONE, false, false); // Type system is reinitialized from the persisted CAS
read(testFolder.getPath(), NONE, false, true);
}
@Test
public void test0Preinitialized()
throws Exception
{
write(testFolder.getPath(), SerialFormat.BINARY.toString(), false);
read(testFolder.getPath(), ALL, false, false);
read(testFolder.getPath(), ALL, false, true);
}
@Test
public void test4Preinitialized()
throws Exception
{
write(testFolder.getPath(), "4", false);
read(testFolder.getPath(), ALL, false, false);
read(testFolder.getPath(), ALL, false, true);
}
/**
* The type system in the CAS is different from the one in the file. To do lenient loading with
* format 6, we need to know the type system that was used to originally store the CAS.
*/
@Test
public void test6Lenient()
throws Exception
{
write(testFolder.getPath(), SerialFormat.COMPRESSED_FILTERED.toString(), true);
read(testFolder.getPath(), METADATA, true, false);
read(testFolder.getPath(), METADATA, true, true);
}
@Test
public void test6Preinitialized()
throws Exception
{
write(testFolder.getPath(), "6", false);
read(testFolder.getPath(), ALL, false, false);
read(testFolder.getPath(), ALL, false, true);
}
@Test
public void test_COMPRESSED_FILTERED_TSI_preinitialized()
throws Exception
{
write(testFolder.getPath(), SerialFormat.COMPRESSED_FILTERED_TSI.toString(), false);
read(testFolder.getPath(), ALL, false, false);
read(testFolder.getPath(), ALL, false, true);
}
@Test
public void test_COMPRESSED_FILTERED_TSI_lenient()
throws Exception
{
write(testFolder.getPath(), SerialFormat.COMPRESSED_FILTERED_TSI.toString(), false);
read(testFolder.getPath(), METADATA, false, false);
read(testFolder.getPath(), METADATA, false, true);
}
@Test
public void test6plusPreinitialized()
throws Exception
{
write(testFolder.getPath(), "6+", false);
read(testFolder.getPath(), ALL, false, false);
read(testFolder.getPath(), ALL, false, true);
}
@Test
public void test6plusLenient()
throws Exception
{
write(testFolder.getPath(), "6+", false);
read(testFolder.getPath(), METADATA, false, false);
read(testFolder.getPath(), METADATA, false, true);
}
@Test
public void testSerializedEmbeddedTypeSystem()
throws Exception
{
writeSerialized(testFolder.getPath(), false);
read(testFolder.getPath(), NONE, false, false); // Type system is reinitialized from the persisted CAS
read(testFolder.getPath(), NONE, false, true);
}
@Test
public void testSerializedSeparateTypeSystem()
throws Exception
{
writeSerialized(testFolder.getPath(), true);
read(testFolder.getPath(), NONE, true, false); // Type system is reinitialized from the persisted CAS
read(testFolder.getPath(), NONE, true, true);
}
@Test
public void readWriteZipMinimal()
throws Exception
{
String targetZip = "jar:file:target/archive.zip";
JCas out = JCasFactory.createJCas();
out.setDocumentLanguage("en");
out.setDocumentText("This is a test.");
DocumentMetaData meta = DocumentMetaData.create(out);
meta.setDocumentId("document");
AnalysisEngine writer = createEngine(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_FORMAT, "6",
BinaryCasWriter.PARAM_TARGET_LOCATION, targetZip,
BinaryCasWriter.PARAM_OVERWRITE, true,
BinaryCasWriter.PARAM_TYPE_SYSTEM_LOCATION, "typesystem.bin");
writer.process(out);
writer.collectionProcessComplete();
CollectionReader reader = CollectionReaderFactory.createReader(
BinaryCasReader.class,
BinaryCasReader.PARAM_SOURCE_LOCATION, targetZip,
BinaryCasReader.PARAM_PATTERNS, "*.bin",
BinaryCasReader.PARAM_TYPE_SYSTEM_LOCATION, "typesystem.bin");
JCas in = JCasFactory.createJCas();
reader.getNext(in.getCas());
assertEquals(out.getDocumentLanguage(), in.getDocumentLanguage());
assertEquals(out.getDocumentText(), in.getDocumentText());
assertEquals(DocumentMetaData.get(out).getDocumentId(), DocumentMetaData.get(in).getDocumentId());
}
public void write(String aLocation, String aFormat, boolean aWriteTypeSystem)
throws Exception
{
System.out.println("--- WRITING ---");
CollectionReader textReader = CollectionReaderFactory.createReader(TextReader.class,
ResourceCollectionReaderBase.PARAM_SOURCE_LOCATION, "src/test/resources/texts",
ResourceCollectionReaderBase.PARAM_PATTERNS, "*.txt",
ResourceCollectionReaderBase.PARAM_LANGUAGE, "latin");
AnalysisEngine writer;
if (false) {
writer = createEngine(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_FORMAT, aFormat,
BinaryCasWriter.PARAM_TARGET_LOCATION, aLocation,
BinaryCasWriter.PARAM_FILENAME_EXTENSION, ".bin",
BinaryCasWriter.PARAM_TYPE_SYSTEM_LOCATION,
aWriteTypeSystem ? new File(aLocation, "typesystem.bin") : null);
}
else {
writer = createEngine(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_FORMAT, aFormat,
BinaryCasWriter.PARAM_TARGET_LOCATION, aLocation,
BinaryCasWriter.PARAM_FILENAME_EXTENSION, ".bin",
BinaryCasWriter.PARAM_TYPE_SYSTEM_LOCATION,
aWriteTypeSystem ? "typesystem.bin" : null);
}
// AnalysisEngine dumper = createEngine(CASDumpWriter.class);
runPipeline(textReader, /* dumper, */writer);
if (aLocation.startsWith("jar:")) {
assertTrue(new File(testFolder, "archive.zip").exists());
}
else {
assertTrue(new File(testFolder, "example1.txt.bin").exists());
assertTrue(new File(testFolder, "example2.txt.bin").exists());
}
}
public void writeSerialized(String aLocation, boolean aWriteTypeSystem)
throws Exception
{
System.out.println("--- WRITING ---");
CollectionReader reader = CollectionReaderFactory.createReader(
TextReader.class,
TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/texts",
TextReader.PARAM_PATTERNS, "*.txt",
TextReader.PARAM_LANGUAGE, "latin");
AnalysisEngine writer;
if (false) {
writer = AnalysisEngineFactory.createEngine(
SerializedCasWriter.class,
SerializedCasWriter.PARAM_TARGET_LOCATION, aLocation,
SerializedCasWriter.PARAM_FILENAME_EXTENSION, ".bin",
SerializedCasWriter.PARAM_TYPE_SYSTEM_LOCATION,
aWriteTypeSystem ? new File(aLocation, "typesystem.bin") : null);
}
else {
writer = AnalysisEngineFactory.createEngine(
SerializedCasWriter.class,
SerializedCasWriter.PARAM_TARGET_LOCATION, testFolder,
SerializedCasWriter.PARAM_FILENAME_EXTENSION, ".bin",
SerializedCasWriter.PARAM_TYPE_SYSTEM_LOCATION,
aWriteTypeSystem ? "typesystem.bin" : null);
}
runPipeline(reader, writer);
assertTrue(new File(testFolder, "example1.txt.bin").exists());
}
public void read(String aLocation, int aMode, boolean aLoadExternal, boolean aMergeTS)
throws Exception
{
TypeSystemDescription tsd;
switch (aMode) {
case NONE:
tsd = null;
break;
case METADATA:
tsd = createTypeSystemDescription("desc.type.metadata", "desc.type.metadata_customized");
break;
case ALL:
tsd = createTypeSystemDescription();
break;
default:
throw new IllegalArgumentException("Unknown mode");
}
System.out.println("--- READING ---");
CollectionReader reader;
if (false) {
reader = CollectionReaderFactory.createReader(
BinaryCasReader.class,
BinaryCasReader.PARAM_SOURCE_LOCATION, aLocation,
BinaryCasReader.PARAM_PATTERNS, "*.bin",
// Allow loading only if TSD is not specified
BinaryCasReader.PARAM_TYPE_SYSTEM_LOCATION,
aLoadExternal ? new File(aLocation, "typesystem.bin") : null);
}
else {
reader = CollectionReaderFactory.createReader(
BinaryCasReader.class,
BinaryCasReader.PARAM_SOURCE_LOCATION, aLocation,
BinaryCasReader.PARAM_PATTERNS, "*.bin",
BinaryCasReader.PARAM_MERGE_TYPE_SYSTEM, aMergeTS,
// Allow loading only if TSD is not specified
BinaryCasReader.PARAM_TYPE_SYSTEM_LOCATION,
aLoadExternal ? "typesystem.bin" : null);
}
// Test reading into CAS
CAS cas = CasCreationUtils.createCas(tsd, null, null);
reader.typeSystemInit(cas.getTypeSystem());
reader.getNext(cas);
String refText1 = readFileToString(new File("src/test/resources/texts/example1.txt"));
assertEquals(refText1, cas.getDocumentText());
assertEquals("latin", cas.getDocumentLanguage());
// Test reading into JCas
JCas jcas = JCasFactory.createJCas();
reader.getNext(jcas.getCas());
assertEquals("latin", DocumentMetaData.get(jcas).getLanguage());
String refText2 = readFileToString(new File("src/test/resources/texts/example2.txt"));
assertEquals(refText2, jcas.getDocumentText());
assertEquals("latin", jcas.getDocumentLanguage());
assertFalse(reader.hasNext());
}
@Test
public void measureSerializedCas()
throws UIMAException, IOException
{
File file = new File(testFolder, "dummy.bin");
Iterable<JCas> data = repeat(generateRandomCas(), 100);
System.out.printf("= write%n");
SummaryStatistics statsWrite = measureWriteSerializedCas(data, file);
System.out.printf("= read%n");
SummaryStatistics statsRead = measureReadSerializedCas(file, 100);
printStats(statsWrite, statsRead);
}
private static SummaryStatistics measureWriteSerializedCas(Iterable<JCas> aTestData, File aFile)
throws IOException
{
SummaryStatistics stats = new SummaryStatistics();
for (JCas jcas : aTestData) {
long begin = System.currentTimeMillis();
writeSerializedCas(jcas, aFile);
stats.addValue(System.currentTimeMillis() - begin);
}
return stats;
}
private static SummaryStatistics measureReadSerializedCas(File aFile, int aRepeat)
throws IOException, UIMAException
{
SummaryStatistics stats = new SummaryStatistics();
JCas jcas = JCasFactory.createJCas();
for (int n = 0; n < aRepeat; n++) {
long begin = System.currentTimeMillis();
readSerializedCas(jcas, aFile);
stats.addValue(System.currentTimeMillis() - begin);
}
return stats;
}
@Test
public void measureCasCreation()
throws UIMAException
{
SummaryStatistics statsRead = measureCasCreation(100);
printStats("CREATE", statsRead);
}
private static SummaryStatistics measureCasCreation(int aRepeat)
throws UIMAException
{
SummaryStatistics stats = new SummaryStatistics();
for (int n = 0; n < aRepeat; n++) {
long begin = System.currentTimeMillis();
// JCas jcas = JCasFactory.createJCas();
JCas jcas = CasCreationUtils.createCas((TypeSystemDescription) null, null, null).getJCas();
stats.addValue(System.currentTimeMillis() - begin);
}
return stats;
}
private static void writeSerializedCas(JCas aJCas, File aFile)
throws IOException
{
try (ObjectOutputStream os = new ObjectOutputStream(new FileOutputStream(aFile))) {
CASCompleteSerializer serializer = serializeCASComplete(aJCas.getCasImpl());
os.writeObject(serializer);
}
}
private static void readSerializedCas(JCas aJCas, File aFile)
throws IOException
{
try (ObjectInputStream is = new ObjectInputStream(new FileInputStream(aFile))) {
CASCompleteSerializer serializer = (CASCompleteSerializer) is.readObject();
deserializeCASComplete(serializer, aJCas.getCasImpl());
// // Initialize the JCas sub-system which is the most often used API in DKPro Core components
// try {
// aJCas.getCas().getJCas();
// }
// catch (CASException e) {
// throw new IOException(e);
// }
}
catch (ClassNotFoundException e) {
throw new IOException(e);
}
}
private JCas generateRandomCas()
throws UIMAException
{
// Generate test data
System.out.printf("Generating test data... ");
JCas jcas = JCasFactory.createJCas();
DocumentMetaData.create(jcas).setDocumentId("dummy");
initRandomCas(jcas, 10000, 30000, 0);
System.out.printf("done%n");
return jcas;
}
@Ignore("Run this only when you want to compare performance")
@Test
public void performanceTest()
throws Exception
{
int REPEATS = 100;
// Generate test data
Iterable<JCas> testdata = repeat(generateRandomCas(), REPEATS);
System.out.printf("Data serialized to %s %n", testFolder);
// Set up configurations
Map<String, AnalysisEngineDescription> configs = new LinkedHashMap<String, AnalysisEngineDescription>();
configs.put(
"Format S - no compression",
createEngineDescription(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_OVERWRITE, true,
BinaryCasWriter.PARAM_FORMAT, "S",
BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE,
BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
configs.put(
"Format S+ - no compression",
createEngineDescription(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_OVERWRITE, true,
BinaryCasWriter.PARAM_FORMAT, "S+",
BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE,
BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
configs.put(
"Format 0 - no compression",
createEngineDescription(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_OVERWRITE, true,
BinaryCasWriter.PARAM_FORMAT, "0",
BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE,
BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
configs.put(
"Format 4 - no compression",
createEngineDescription(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_OVERWRITE, true,
BinaryCasWriter.PARAM_FORMAT, "4",
BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE,
BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
configs.put(
"Format 6 - no compression",
createEngineDescription(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_OVERWRITE, true,
BinaryCasWriter.PARAM_FORMAT, "6",
BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE,
BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
configs.put(
"Format 6+ - no compression",
createEngineDescription(
BinaryCasWriter.class,
BinaryCasWriter.PARAM_OVERWRITE, true,
BinaryCasWriter.PARAM_FORMAT, "6+",
BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.NONE,
BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
// configs.put(
// "Format 6+ - GZip compression",
// createEngineDescription(
// BinaryCasWriter.class,
// BinaryCasWriter.PARAM_FORMAT, "6+",
// BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.GZIP,
// BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
// configs.put(
// "Format 6+ - BZIP2 compression",
// createEngineDescription(
// BinaryCasWriter.class,
// BinaryCasWriter.PARAM_FORMAT, "6+",
// BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.BZIP2,
// BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
// configs.put(
// "Format 6+ - XZ compression",
// createEngineDescription(
// BinaryCasWriter.class,
// BinaryCasWriter.PARAM_FORMAT, "6+",
// BinaryCasWriter.PARAM_COMPRESSION, CompressionMethod.XZ,
// BinaryCasWriter.PARAM_TARGET_LOCATION, testFolder));
// Run tests
System.out.printf("--------------------------------------------%n");
for (Entry<String, AnalysisEngineDescription> cfg : configs.entrySet()) {
System.out.printf("%s%n", cfg.getKey());
System.out.printf(" Measuring WRITE%n");
for (File f : FileUtils.listFiles(testFolder, new PrefixFileFilter("dummy.bin"), null)) {
f.delete();
}
SummaryStatistics writeStats = measureWritePerformance(cfg.getValue(), testdata);
Collection<File> files = FileUtils.listFiles(testFolder, new PrefixFileFilter("dummy.bin"), null);
assertEquals(1, files.size());
File f = files.iterator().next();
// For some readers, we may need a CAS with is already initialized with the proper
// type system, so we create one here
JCas jcas = JCasFactory.createJCas();
System.out.printf(" Measuring READ%n");
CollectionReaderDescription reader = createReaderDescription(
BinaryCasReader.class,
BinaryCasReader.PARAM_SOURCE_LOCATION, f);
SummaryStatistics readStats = measureReadPerformance(reader, jcas, REPEATS);
printStats(writeStats, readStats);
System.out.printf(" Size %10d bytes%n", f.length());
System.out.printf("--------------------------------------------%n");
}
measureWriteSerializedCas(testdata, new File(testFolder, "dummy.bin"));
}
private static void printStats(String aTitle, SummaryStatistics aStats)
{
System.out.printf(" %10s%n", aTitle, "READ");
System.out.printf(" Repeat %10d times%n", aStats.getN());
System.out.printf(" Total %10.0f ms%n", aStats.getSum());
System.out.printf(" Mean %10.0f ms%n", aStats.getMean());
System.out.printf(" Min %10.0f ms%n", aStats.getMin());
System.out.printf(" Max %10.0f ms%n", aStats.getMax());
}
private static void printStats(SummaryStatistics aWrite, SummaryStatistics aRead)
{
System.out.printf(" %10s %10s%n", "WRITE", "READ");
System.out.printf(" Repeat %10d times %10d times%n", aWrite.getN(), aRead.getN());
System.out.printf(" Total %10.0f ms %10.0f ms%n", aWrite.getSum(), aRead.getSum());
System.out.printf(" Mean %10.0f ms %10.0f ms%n", aWrite.getMean(), aRead.getMean());
System.out.printf(" Min %10.0f ms %10.0f ms%n", aWrite.getMin(), aRead.getMin());
System.out.printf(" Max %10.0f ms %10.0f ms%n", aWrite.getMax(), aRead.getMax());
}
@Rule
public DkproTestContext testContext = new DkproTestContext();
}