package org.apache.hadoop.io.simpleseekableformat;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.SortedMap;
import java.util.TreeMap;
/**
* SimpleSeekableFormat supports seek based on compressed byte offsets as well
* as uncompressed byte offsets.
*
* File Format Description:
*
* 0. Definition
* * Metadata block: a fixed size block of 1024 bytes storing metadata of the
* file.
* * Data block: a fixed size block of 1023K bytes storing actual data.
* * Data segment: a variable-length segment of bytes storing a logical
* data unit that needs to be processed together.
*
* 1. File Format Layout
* The first 1K bytes of every 1 MB of the file are a metadata block.
* The remaining 1023K bytes of that MB are a data block.
*
* 2. Metadata block (1024 bytes):
* Each metadata block looks like this:
* 32 bytes: "SSF_Magic_C17e5C697a00bB1A859aD\n"
* 4 bytes: version number, now is 1.
* 16 bytes: 8-byte of uncompressed data stream offset
* + 8-byte of compressed data stream offset
*
* 3. Data block (1023 * 1024 bytes):
* All data blocks are concatenated to form a single stream. The stream
* consists of consecutive data segments, back to back.
*
* 4. Data segment:
* Each data segment looks like this:
* 4 bytes: length (implies that a single data segment cannot be longer than
* 4GB). It does not include the length field itself, but includes
* all following fields like codec name and crc32 checksum.
* 2 bytes: byte length of compression codec class name. Or 0 for uncompressed.
* x-bytes: UTF-8 encoded compression codec class name.
* (Only when x = 0) 8 bytes: crc32 checksum of the data following.
* (length - 2 - x - (x==0?8:0) ) bytes: actual data
*
* This class encapsulates all the underlying logic of the SimpleSeekableFormat.
*
* NOTE: Requirement on the CompressionCodec InputStream: available() should
* only return 0 when EOF. Otherwise SeekableFileInputStream.available() will
* break.
*/
public class SimpleSeekableFormat {
public static final String FILEFORMAT_SSF_CODEC_CONF = "fileformat.ssf.codec";
public static final String FILEFORMAT_SSF_MAX_UNCOMPRESSED_SEGMENT_LENGTH =
"fileformat.ssf.max.uncompressed.segment.length";
static final int METADATA_BLOCK_LENGTH = 1024;
static final int DATA_BLOCK_LENGTH = 1024 * 1024 - METADATA_BLOCK_LENGTH;
static final int VERSION = 1;
static final String MAGIC_HEADER = "SSF_Magic_C17e5C697a00bB1A859aD\n";
static final byte[] MAGIC_HEADER_BYTES;
static {
try {
MAGIC_HEADER_BYTES = MAGIC_HEADER.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
/**
 * Mutable holder for two stream positions: an offset in the uncompressed
 * stream and an offset in the compressed stream.
 */
public static class OffsetPair {
  private long uncompressedOffset;
  private long compressedOffset;

  /** Creates a pair with both offsets at zero (the default value of a long field). */
  public OffsetPair() {
  }

  /**
   * Creates a pair holding the given offsets.
   *
   * @param uncompressedOffset offset in the uncompressed stream
   * @param compressedOffset offset in the compressed stream
   */
  public OffsetPair(long uncompressedOffset, long compressedOffset) {
    this.uncompressedOffset = uncompressedOffset;
    this.compressedOffset = compressedOffset;
  }

  /** @return the stored uncompressed offset */
  public long getUncompressedOffset() {
    return this.uncompressedOffset;
  }

  /** @return the stored compressed offset */
  public long getCompressedOffset() {
    return this.compressedOffset;
  }

  /** @param uncompressedOffset new uncompressed offset */
  public void setUncompressedOffset(long uncompressedOffset) {
    this.uncompressedOffset = uncompressedOffset;
  }

  /** @param compressedOffset new compressed offset */
  public void setCompressedOffset(long compressedOffset) {
    this.compressedOffset = compressedOffset;
  }
}
/**
 * Holder for the offset pairs carried by a metadata block. The map is stored
 * and returned by reference; no copy is made.
 */
public static class MetaData {
  // Maps an uncompressed stream offset (key) to the corresponding
  // compressed data stream offset (value).
  SortedMap<Long, Long> offsetPairs;

  /** @return the map last passed to the setter, or null if none was set */
  public SortedMap<Long, Long> getOffsetPairs() {
    return this.offsetPairs;
  }

  /** @param offsetPairs map from uncompressed offset to compressed offset */
  public void setOffsetPairs(final SortedMap<Long, Long> offsetPairs) {
    this.offsetPairs = offsetPairs;
  }
}
/**
 * MetaDataConsumer parses one metadata block from the stream and stores the
 * decoded offsets in the {@link MetaData} instance it was constructed with.
 */
public static class MetaDataConsumer implements InterleavedInputStream.MetaDataConsumer {
  private final MetaData metaData;

  /**
   * @param metaData receives the offsets decoded by {@link #readMetaData}
   */
  public MetaDataConsumer(MetaData metaData) {
    this.metaData = metaData;
  }

  /**
   * Reads exactly {@code metaDataBlockSize} bytes from {@code in}, validates
   * the magic header and version, and replaces the offset pairs of the
   * target {@link MetaData} with the single pair stored in the block.
   *
   * @param in stream positioned at the start of a metadata block
   * @param metaDataBlockSize total size of the metadata block in bytes
   * @throws IOException if the stream ends early, the magic header does not
   *         match, or the version is not one this class can decode
   */
  @Override
  public void readMetaData(InputStream in, int metaDataBlockSize)
      throws IOException {
    // Pull the whole block into memory first, so the stream has advanced by
    // exactly one block before any field is parsed.
    byte[] metaDataBlock = new byte[metaDataBlockSize];
    new DataInputStream(in).readFully(metaDataBlock);
    DataInputStream din = new DataInputStream(new ByteArrayInputStream(metaDataBlock));

    // verify magic header
    byte[] magicHeaderBytes = new byte[MAGIC_HEADER_BYTES.length];
    din.readFully(magicHeaderBytes);
    if (!Arrays.equals(magicHeaderBytes, MAGIC_HEADER_BYTES)) {
      throw new IOException("Wrong Magic Header Bytes");
    }

    // Dispatch on the version. Every value without an explicit case is
    // rejected, including zero and negative values, which would otherwise
    // leave the MetaData silently unpopulated.
    int version = din.readInt();
    switch (version) {
      case 1: {
        // one pair of offsets
        long uncompressedOffset = din.readLong();
        long compressedOffset = din.readLong();
        SortedMap<Long, Long> offsetPairs = new TreeMap<Long, Long>();
        offsetPairs.put(uncompressedOffset, compressedOffset);
        metaData.setOffsetPairs(offsetPairs);
        // the rest of the block is thrown away
        break;
      }
      default:
        throw new IOException("Unknown version " + version);
    }
  }
}
/**
 * MetaDataProducer writes one metadata block, taking the offset pair to
 * record from the {@link MetaData} instance it was constructed with.
 */
public static class MetaDataProducer implements InterleavedOutputStream.MetaDataProducer {
  // Bytes used by the fixed fields: magic header, 4-byte version and two
  // 8-byte offsets.
  private static final int HEADER_LENGTH =
      SimpleSeekableFormat.MAGIC_HEADER_BYTES.length + 4 + 8 + 8;

  private final MetaData metaData;

  /**
   * @param metaData source of the offset pair written by {@link #writeMetaData}
   */
  public MetaDataProducer(MetaData metaData) {
    this.metaData = metaData;
  }

  /**
   * Writes the magic header, the version, the single offset pair and zero
   * padding, for a total of {@code metaDataBlockSize} bytes.
   *
   * @param out The raw output stream.
   * @param metaDataBlockSize total size of the metadata block in bytes
   * @throws IOException if writing to {@code out} fails
   * @throws IllegalArgumentException if the block is too small for the fixed fields
   * @throws IllegalStateException if the metadata does not hold exactly one offset pair
   */
  @Override
  public void writeMetaData(DataOutputStream out, int metaDataBlockSize) throws IOException {
    // Validate everything before the first write so that a failure never
    // leaves a partially written block in the stream.
    if (metaDataBlockSize < HEADER_LENGTH) {
      throw new IllegalArgumentException("Metadata block size " + metaDataBlockSize
          + " is smaller than the " + HEADER_LENGTH + " bytes needed for the fixed fields");
    }
    SortedMap<Long, Long> offsetPairs = metaData.getOffsetPairs();
    // An explicit check instead of an assert: assertions are disabled by
    // default, and version 1 of the format stores exactly one pair.
    if (offsetPairs == null || offsetPairs.size() != 1) {
      throw new IllegalStateException("Expected exactly one offset pair but found "
          + (offsetPairs == null ? "none" : String.valueOf(offsetPairs.size())));
    }
    long uncompressedOffset = offsetPairs.firstKey();
    long compressedOffset = offsetPairs.get(uncompressedOffset);

    // Magic header and version
    out.write(SimpleSeekableFormat.MAGIC_HEADER_BYTES);
    out.writeInt(SimpleSeekableFormat.VERSION);

    // Write out the offset pair
    out.writeLong(uncompressedOffset);
    out.writeLong(compressedOffset);

    // Pad the rest of the block with zero bytes, at most NULLS.length at a time.
    int left = metaDataBlockSize - HEADER_LENGTH;
    while (left > 0) {
      int toWrite = Math.min(left, NULLS.length);
      out.write(NULLS, 0, toWrite);
      left -= toWrite;
    }
  }
}
private static byte[] NULLS = new byte[1024];
static {
Arrays.fill(NULLS, (byte)0);
}
/**
 * A ByteArrayOutputStream that exposes its backing array, so the bytes
 * written so far can be read without the copy made by toByteArray().
 */
static class Buffer extends ByteArrayOutputStream {
  /**
   * @return the backing array itself, not a copy; only the first
   *         {@link #getLength()} bytes hold written data
   */
  public byte[] getData() {
    return buf;
  }

  /** @return the number of bytes written since creation or the last reset */
  public int getLength() {
    return count;
  }

  /**
   * Discards the written bytes while keeping the backing array. Declared
   * synchronized, as the overridden ByteArrayOutputStream.reset() is, so the
   * override does not drop the superclass locking.
   */
  @Override
  public synchronized void reset() {
    count = 0;
  }
}
}