package net.contrapunctus.rngzip.io;
import com.colloquial.arithcode.ArithCodeInputStream;
import com.colloquial.arithcode.ArithCodeOutputStream;
import com.colloquial.arithcode.PPMModel;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.Constructor;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import net.contrapunctus.lzma.LzmaInputStream;
import net.contrapunctus.lzma.LzmaOutputStream;
import net.contrapunctus.rngzip.util.*;
import org.apache.commons.compress.bzip2.CBZip2InputStream;
import org.apache.commons.compress.bzip2.CBZip2OutputStream;
/**
* This class represents the various configurable settings for a
* compressed XML stream. To compress a stream, these settings are
* specified on the command line (or otherwise set by the client), but
* to decompress a stream they must be extracted from the stream
* itself.
*
* <p class='license'>This is free software; you may modify and/or
* redistribute it under the terms of the GNU General Public License,
* but it comes with <b>absolutely no warranty.</b>
*
* @author Christopher League
*/
public class RNGZSettings
{
/**
* This enumeration represents the different ways to encode choice
* points as bits.
*
* <p>The declaration order of the constants is part of the stream
* format: {@link #writeTo} stores {@code ordinal()} as a single byte
* and {@link #fromStream} uses that byte as an index into
* {@code values()}. Append new constants at the end; never reorder.
*/
public enum BitCoding
{
/**
* Uses a fixed-length representation. That is, if a particular
* choice point has 6 possible choices, we will use 3 bits to
* represent them: 000 for choice 0, 100 for choice 4, 101 for
* choice 5, etc. (Ordinal 0.)
* @see SimpleChoiceCoder
*/
FIXED,
/**
* Uses an adaptive Huffman algorithm. More frequently traveled
* paths through each choice point will eventually use
* proportionally fewer bits. Choice points with exactly two
* alternatives are still given a {@code SimpleChoiceCoder} by
* {@link #makeChoiceCoder}. (Ordinal 1.)
* @see HuffmanChoiceCoder
*/
HUFFMAN,
/**
* Uses a full byte for each choice point. Requires that there
* are 256 or fewer choices. (Ordinal 2.)
* @see ByteChoiceCoder
*/
BYTE;
}
/**
* This enumeration represents the different ways to compress
* sub-streams within the compressed XML format.
*
* <p>The declaration order of the constants is part of the stream
* format: {@link #writeTo} stores {@code ordinal()} as a single byte
* and {@link #fromStream} uses that byte as an index into
* {@code values()}. The resulting codes are 0=NONE, 1=GZ, 2=LZMA,
* 3=BZ2, 4=PPM, 5=PPMX. Append new constants at the end; never
* reorder.
*/
public enum DataCompression
{
/**
* Does not apply any compression to the stream.
*/
NONE,
/**
* Applies GZIP compression to the stream.
* @see GZIPOutputStream
*/
GZ,
/**
* Applies LZMA (7-Zip) compression to the stream.
* @see LzmaOutputStream
*/
LZMA,
/**
* Applies BZip2 compression to the stream.
* @see CBZip2OutputStream
*/
BZ2,
/**
* Applies Prediction by Partial Match (PPM) compression to the
* stream. The size of the context is currently 4. When this
* is applied to a data stream, it seeds the model with the
* parent tag as a context.
* @see ArithCodeOutputStream
* @see PPMModel
* @see PPMContextOutputStream
*/
PPM,
/**
* Same as PPM, except the context is extended to 5.
*/
PPMX
}
/** Context length handed to the PPM model for {@link DataCompression#PPM}. */
private static final int PPM_SMALL_LENGTH = 4;
/** Context length handed to the PPM model for {@link DataCompression#PPMX}. */
private static final int PPM_LARGE_LENGTH = 5;
/**
* Cached copy of {@code BitCoding.values()}; {@link #fromStream}
* indexes it with the coder byte read from the config stream.
*/
private static final BitCoding[] BitCoding_values = BitCoding.values();
/**
* Cached copy of {@code DataCompression.values()}; {@link #fromStream}
* indexes it with the compression bytes read from the config stream.
*/
private static final DataCompression[] DataCompression_values =
DataCompression.values();
/** Bit coding that a freshly constructed settings object starts with. */
public static final BitCoding DEFAULT_CODER = BitCoding.HUFFMAN;
/**
* Compression that a freshly constructed settings object starts with,
* for both the tree stream and the data stream.
*/
public static final DataCompression DEFAULT_COMPRESSOR = DataCompression.GZ;
/**
* The strategy used to encode choice points as bits. The default
* is <code>HUFFMAN</code> ({@link #DEFAULT_CODER}). Changed by the
* {@code setBitCoder} methods and overwritten by {@link #fromStream}.
*/
protected BitCoding coding = DEFAULT_CODER;
/**
* The type of compression applied to the bit stream representing
* the XML tree structure. The default is <code>GZ</code>
* ({@link #DEFAULT_COMPRESSOR}). Changed by the
* {@code setTreeCompressor} methods and overwritten by
* {@link #fromStream}.
*/
protected DataCompression treeCompr = DEFAULT_COMPRESSOR;
/**
* The type of compression applied to the character data from the
* XML document. The default is <code>GZ</code>
* ({@link #DEFAULT_COMPRESSOR}). Changed by the
* {@code setDataCompressor} methods and overwritten by
* {@link #fromStream}.
*/
protected DataCompression dataCompr = DEFAULT_COMPRESSOR;
/**
 * Creates a settings object in which every option still has its
 * default value (see {@link #DEFAULT_CODER} and
 * {@link #DEFAULT_COMPRESSOR}).
 */
public RNGZSettings()
{
  // nothing to do: the field initializers supply the defaults
}

/**
 * Creates a settings object with every option given explicitly.
 * The arguments are stored as-is, without validation.
 *
 * @param bc how choice points are encoded as bits.
 * @param tc compression for the bit stream that represents the XML
 *           tree structure.
 * @param dc compression for the character data of the XML document.
 */
public RNGZSettings(BitCoding bc,
                    DataCompression tc,
                    DataCompression dc)
{
  this.coding = bc;
  this.treeCompr = tc;
  this.dataCompr = dc;
}
/**
 * Replaces the strategy used to encode choice points as bits.
 *
 * @param bc the new strategy; stored as-is.
 * @see #coding
 */
public void setBitCoder(BitCoding bc)
{
  this.coding = bc;
}
/**
 * Interprets the string parameter as a strategy used to encode
 * choice points as bits.
 *
 * @param bc a case-insensitive representation of the {@link
 *           BitCoding} strategy, such as {@code "fixed"} or
 *           {@code "Huffman"}.
 * @throws IllegalArgumentException if {@code bc} does not name a
 *         {@link BitCoding} constant.
 * @throws NullPointerException if {@code bc} is {@code null}.
 */
public void setBitCoder(String bc)
{
  // Upper-case with a fixed locale: under e.g. a Turkish default
  // locale, "fixed".toUpperCase() yields a dotted capital I and the
  // lookup below would fail.
  String name = bc.toUpperCase(java.util.Locale.ENGLISH);
  setBitCoder(BitCoding.valueOf(name));
}
/**
 * Replaces the type of compression used on the tree representation.
 *
 * @param tc the new compression type; stored as-is.
 * @see #treeCompr
 */
public void setTreeCompressor(DataCompression tc)
{
  this.treeCompr = tc;
}
/**
 * Sets the type of compression used on the tree representation
 * from its textual name.
 *
 * @param tc a case-insensitive representation of the {@link
 *           DataCompression} type, such as {@code "none"} or
 *           {@code "gz"}.
 * @throws IllegalArgumentException if {@code tc} does not name a
 *         {@link DataCompression} constant.
 * @throws NullPointerException if {@code tc} is {@code null}.
 */
public void setTreeCompressor(String tc)
{
  // Upper-case with a fixed locale so the result does not depend on
  // the platform default (e.g. Turkish dotted/dotless i).
  String name = tc.toUpperCase(java.util.Locale.ENGLISH);
  setTreeCompressor(DataCompression.valueOf(name));
}
/**
 * Replaces the type of compression used on the character data.
 *
 * @param dc the new compression type; stored as-is.
 * @see #dataCompr
 */
public void setDataCompressor(DataCompression dc)
{
  this.dataCompr = dc;
}
/**
 * Sets the type of compression used on the character data from its
 * textual name.
 *
 * @param dc a case-insensitive representation of the {@link
 *           DataCompression} type, such as {@code "none"} or
 *           {@code "gz"}.
 * @throws IllegalArgumentException if {@code dc} does not name a
 *         {@link DataCompression} constant.
 * @throws NullPointerException if {@code dc} is {@code null}.
 */
public void setDataCompressor(String dc)
{
  // Upper-case with a fixed locale so the result does not depend on
  // the platform default (e.g. Turkish dotted/dotless i).
  String name = dc.toUpperCase(java.util.Locale.ENGLISH);
  setDataCompressor(DataCompression.valueOf(name));
}
/**
 * Renders these settings as a short, human-readable string of the
 * form <i>coding</i>-<i>treeCompr</i>-<i>dataCompr</i>.
 */
@Override
public String toString()
{
  StringBuilder text = new StringBuilder();
  text.append(coding);
  text.append("-");
  text.append(treeCompr);
  text.append("-");
  text.append(dataCompr);
  return text.toString();
}
/**
* Bytes 5–8 of the stream are given by this ‘magic’ number, in
* Java-standard big-endian byte order. They follow the four bytes
* of {@link MultiplexOutputStream#MAGIC}. These bytes represent
* the letters “rnZ” in ASCII (72 6E 5A in hexadecimal) followed by
* a one-byte version number, currently 01.
*
* <p>{@link #magic()} returns this value by default, and
* {@link #fromStream} compares the stream's magic number against
* {@code magic()}.
*/
public static final int MAGIC = 0x726E5A01;
/* ----------------------------------------------------------------
* COMPRESSOR INTERFACE
* ----------------------------------------------------------------
*/
/**
 * Reflectively instantiates the class named {@code nm} through its
 * public one-argument constructor. This lets a compressor class be
 * loaded by name instead of being linked at compile time (see the
 * commented-out example in {@link #wrapOutput}).
 *
 * @param nm  fully qualified name of the class to instantiate.
 * @param ty  declared parameter type of the constructor to look up.
 * @param arg the argument passed to that constructor.
 * @return the newly created instance.
 * @throws IOException if the class cannot be found, has no matching
 *         public constructor, or the constructor fails; the
 *         underlying exception is attached as the cause.
 */
private static Object externalInstance
  (String nm, Class<?> ty, Object arg)
  throws IOException
{
  try {
    Class<?> c = Class.forName(nm);
    Constructor<?> k = c.getConstructor(ty);
    return k.newInstance(arg);
  }
  catch(Exception x) {
    // Keep the original exception (and its stack trace) reachable;
    // initCause is used rather than the two-argument constructor so
    // that this also compiles against older class libraries.
    IOException wrapped = new IOException(x.getMessage());
    wrapped.initCause(x);
    throw wrapped;
  }
}
/**
 * Filters an output stream through a compressor, as specified by
 * {@code cm}. That is, if {@code cm} is <code>GZ</code>, this method
 * returns <code>new GZIPOutputStream(out)</code>; for
 * <code>NONE</code> the stream is returned unchanged.
 *
 * @param out the stream that receives the compressed bytes.
 * @param cm  the compression to apply.
 * @return {@code out} itself, or a compressing stream wrapped around it.
 * @throws IOException if the compressor's constructor fails.
 * @throws NullPointerException if {@code cm} is {@code null}.
 * @throws AssertionError if {@code cm} is a constant this method does
 *         not handle.
 */
public static OutputStream wrapOutput
  (OutputStream out, DataCompression cm)
  throws IOException
{
  switch(cm) {
  case NONE:
    return out;
  case GZ:
    return new GZIPOutputStream(out);
  case BZ2:
    return new CBZip2OutputStream(out);
  case LZMA:
    return new LzmaOutputStream(out);
  case PPM:
    return new ArithCodeOutputStream(out, new PPMModel(PPM_SMALL_LENGTH));
  case PPMX:
    return new ArithCodeOutputStream(out, new PPMModel(PPM_LARGE_LENGTH));
  // here's how it would work for external stuff:
  //return (OutputStream) externalInstance
  //  ("org.apache.commons.compress.bzip2.CBZip2OutputStream",
  //   OutputStream.class, out);
  default:
    // Fail even when assertions are disabled: silently handing back
    // the uncompressed stream would produce output that claims a
    // compression it does not have.
    throw new AssertionError("unhandled compression: " + cm);
  }
}
/**
 * Records a representation of these settings onto the designated
 * sub-stream of {@code mux}. {@link #fromStream} reads the same
 * four bytes back when decompressing.
 *
 * @param mux    the multiplexed output to write to.
 * @param stream identifier of the sub-stream that holds the settings.
 * @throws IOException if opening or writing the sub-stream fails.
 */
public void writeTo(MultiplexOutputStream mux, int stream)
  throws IOException
{
  /* For future compatibility, the config stream tells how many
     other embedded streams will exist, then gives the compression
     scheme for each one.  Each value is the ordinal of the enum
     constant, written as one byte:
       zz  encoder id (0=FIXED, 1=HUFFMAN, 2=BYTE)
       02  number of streams
       xx  compression for bit stream
           (0=NONE, 1=GZ, 2=LZMA, 3=BZ2, 4=PPM, 5=PPMX)
       yy  compression for data stream (same codes)
   */
  OutputStreamFilter<DataOutputStream> asData =
    new OutputStreamFilter<DataOutputStream>() {
      public DataOutputStream wrap(OutputStream raw)
        throws IOException {
        return new DataOutputStream(raw);
      }
    };
  DataOutputStream config = mux.open(stream, asData);
  config.write(coding.ordinal());
  config.write(2);
  config.write(treeCompr.ordinal());
  config.write(dataCompr.ordinal());
}
/**
 * Opens a sub-stream of {@code mux} as a <code>BitOutputStream</code>
 * whose bytes pass through the tree compressor of these settings.
 * The value of {@link #treeCompr} is read when the filter's
 * {@code wrap} method is invoked.
 *
 * @param mux    the multiplexed output to open the sub-stream on.
 * @param stream identifier of the sub-stream.
 * @throws IOException if the sub-stream or compressor cannot be set up.
 * @see #treeCompr
 */
protected BitOutputStream newBitOutput(MultiplexOutputStream mux,
                                       int stream)
  throws IOException
{
  OutputStreamFilter<BitOutputStream> bitFilter =
    new OutputStreamFilter<BitOutputStream>() {
      public BitOutputStream wrap (OutputStream raw) throws IOException {
        OutputStream compressed = wrapOutput(raw, treeCompr);
        return new BitOutputStream(compressed);
      }
    };
  return mux.open(stream, bitFilter);
}
/**
 * Opens a sub-stream of {@code mux} as a
 * <code>ContextualOutputStream</code> compressed according to the
 * data compressor of these settings. For <code>PPM</code> and
 * <code>PPMX</code> the result is a
 * <code>PPMContextOutputStream</code> with the corresponding context
 * length; every other type is wrapped by {@link #wrapOutput} inside
 * a <code>ContextFreeOutputStream</code>.
 *
 * @param mux    the multiplexed output to open the sub-stream on.
 * @param stream identifier of the sub-stream.
 * @throws IOException if the sub-stream or compressor cannot be set up.
 * @see #dataCompr
 */
protected ContextualOutputStream newDataOutput
  (MultiplexOutputStream mux, int stream)
  throws IOException
{
  OutputStreamFilter<ContextualOutputStream> dataFilter =
    new OutputStreamFilter<ContextualOutputStream>() {
      public ContextualOutputStream wrap (OutputStream raw)
        throws IOException {
        final DataCompression mode = dataCompr;
        if(mode == DataCompression.PPM) {
          return new PPMContextOutputStream(raw, PPM_SMALL_LENGTH);
        }
        if(mode == DataCompression.PPMX) {
          return new PPMContextOutputStream(raw, PPM_LARGE_LENGTH);
        }
        return new ContextFreeOutputStream(wrapOutput(raw, mode));
      }
    };
  return mux.open(stream, dataFilter);
}
/**
 * Builds a <code>ChoiceCoder</code> for a choice point with
 * {@code limit} alternatives, according to these settings.
 * A single alternative always gets the shared trivial coder;
 * <code>HUFFMAN</code> applies only to more than two alternatives;
 * <code>BYTE</code> gets a byte coder; everything else gets a
 * <code>SimpleChoiceCoder</code>.
 *
 * @param limit number of alternatives at the choice point.
 * @param id    passed through to the coder's constructor.
 * @throws IllegalArgumentException if {@code limit} is less than 1.
 * @see #coding
 */
protected ChoiceCoder makeChoiceCoder(int limit, Object id)
{
  if(limit < 1) {
    throw new IllegalArgumentException("limit < 1");
  }
  if(limit == 1) {
    return TrivialChoiceCoder.instance;
  }
  final boolean huffman = (coding == BitCoding.HUFFMAN);
  if(huffman && limit > 2) {
    return new HuffmanChoiceCoder(limit, id);
  }
  if(coding == BitCoding.BYTE) {
    return new ByteChoiceCoder(limit, id);
  }
  // What remains is FIXED, or a two-way choice under HUFFMAN.
  assert coding == BitCoding.FIXED || limit == 2;
  return new SimpleChoiceCoder(limit, id);
}
/* ----------------------------------------------------------------
* DECOMPRESSOR INTERFACE
* ----------------------------------------------------------------
*/
/**
 * Supplies the magic value against which {@link #fromStream}
 * validates the input stream during decompression. It is a
 * protected method rather than a direct use of the constant so that
 * a subclass can substitute a different magic number.
 *
 * @return {@link #MAGIC} in this class.
 * @see #MAGIC
 */
protected int magic()
{
  return RNGZSettings.MAGIC;
}
/**
 * Reconstitutes the settings from a given sub-stream, as written by
 * {@link #writeTo}: one byte for the bit coding, one byte holding
 * the stream count (which must be 2), and one byte each for the
 * tree and data compression.
 *
 * <p>The fields of this object are only updated once all four bytes
 * have been read and validated, so a malformed stream leaves the
 * previous settings untouched.
 *
 * @param mux    the multiplexed input to read from.
 * @param stream identifier of the sub-stream that holds the settings.
 * @return this object, updated with the settings that were read.
 * @throws RNGZFormatException if the magic number of {@code mux}
 *         differs from {@link #magic()}, the stream count is not 2,
 *         or a code byte is out of range (including end of stream).
 * @throws IOException if reading the sub-stream fails.
 */
protected RNGZSettings fromStream(MultiplexInputStream mux,
                                  int stream)
  throws IOException
{
  if(mux.magic() != magic()) {
    throw new RNGZFormatException("bad magic");
  }
  DataInputStream config = new DataInputStream(mux.open(stream));
  final BitCoding bc;
  final DataCompression tc;
  final DataCompression dc;
  try {
    // read() yields -1 at end of stream; like any other value
    // outside the table, that lands in the catch clause below.
    bc = BitCoding_values[config.read()];
    if(config.read() != 2) {
      throw new RNGZFormatException("invalid config data");
    }
    tc = DataCompression_values[config.read()];
    dc = DataCompression_values[config.read()];
  }
  catch(IndexOutOfBoundsException x) {
    throw new RNGZFormatException("unknown coding");
  }
  // Commit only after everything parsed successfully.
  coding = bc;
  treeCompr = tc;
  dataCompr = dc;
  return this;
}
/**
 * Creates a decompressing input stream, according to the value of
 * {@code cm}. That is, if {@code cm} is <code>GZ</code>, this method
 * returns <code>new GZIPInputStream(in)</code>; for
 * <code>NONE</code> the stream is returned unchanged.
 *
 * @param in the stream that supplies the compressed bytes.
 * @param cm the compression that was applied to the stream.
 * @return {@code in} itself, or a decompressing stream wrapped around it.
 * @throws IOException if the decompressor's constructor fails.
 * @throws NullPointerException if {@code cm} is {@code null}.
 * @throws AssertionError if {@code cm} is a constant this method does
 *         not handle.
 */
public static InputStream wrapInput
  (InputStream in, DataCompression cm)
  throws IOException
{
  switch(cm) {
  case NONE:
    return in;
  case GZ:
    return new GZIPInputStream(in);
  case BZ2:
    return new CBZip2InputStream(in);
  case LZMA:
    return new LzmaInputStream(in);
  case PPM:
    return new ArithCodeInputStream(in, new PPMModel(PPM_SMALL_LENGTH));
  case PPMX:
    return new ArithCodeInputStream(in, new PPMModel(PPM_LARGE_LENGTH));
  default:
    // Fail even when assertions are disabled: handing back the raw
    // stream would make the caller read compressed bytes as data.
    throw new AssertionError("unhandled compression: " + cm);
  }
}
/**
 * Opens a sub-stream of {@code mux} and returns it as a
 * <code>BitInputStream</code> that decompresses with the tree
 * compressor of these settings.
 *
 * @param mux    the multiplexed input to open the sub-stream on.
 * @param stream identifier of the sub-stream.
 * @throws IOException if the sub-stream or decompressor cannot be set up.
 * @see #treeCompr
 */
protected BitInputStream newBitInput(MultiplexInputStream mux,
                                     int stream)
  throws IOException
{
  InputStream raw = mux.open(stream);
  InputStream decompressed = wrapInput(raw, treeCompr);
  return new BitInputStream(decompressed);
}
/**
 * Opens a sub-stream of {@code mux} and returns it as a
 * <code>ContextualInputStream</code> that decompresses with the data
 * compressor of these settings. For <code>PPM</code> and
 * <code>PPMX</code> the result is a
 * <code>PPMContextInputStream</code> with the corresponding context
 * length; every other type is wrapped by {@link #wrapInput} inside
 * a <code>ContextFreeInputStream</code>.
 *
 * @param mux    the multiplexed input to open the sub-stream on.
 * @param stream identifier of the sub-stream.
 * @throws IOException if the sub-stream or decompressor cannot be set up.
 * @see #dataCompr
 */
protected ContextualInputStream newDataInput
  (MultiplexInputStream mux, int stream)
  throws IOException
{
  final InputStream raw = mux.open(stream);
  final DataCompression mode = dataCompr;
  if(mode == DataCompression.PPM) {
    return new PPMContextInputStream(raw, PPM_SMALL_LENGTH);
  }
  if(mode == DataCompression.PPMX) {
    return new PPMContextInputStream(raw, PPM_LARGE_LENGTH);
  }
  return new ContextFreeInputStream(wrapInput(raw, mode));
}
}