package org.apache.hadoop.io.simpleseekableformat;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.CRC32;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Compressor;
/**
 * Builds a single data segment: it optionally compresses the data, keeps the
 * compressed form only when that makes the segment smaller overall, and
 * serializes the result.
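 *
 * <p>A minimal usage sketch (illustrative only; the variables {@code buffer},
 * {@code codec}, {@code compressor}, and {@code dataOut} are assumed to be
 * supplied by the caller):
 * <pre>{@code
 *   // buffer holds the uncompressed bytes of one segment
 *   DataSegmentWriter segment = new DataSegmentWriter(buffer, codec, compressor);
 *   int onDiskSize = segment.size(); // total size, length field included
 *   segment.writeTo(dataOut);        // dataOut is a DataOutputStream
 * }</pre>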
*/
class DataSegmentWriter {
  // Codec class name; an empty string means no compression.
  private final String codecName;
  private final byte[] codecNameUTF8;
  // The stored bytes: either the uncompressed or the compressed data.
  private final SimpleSeekableFormat.Buffer storedData;
  // CRC32 of the stored data; computed only when there is no compression.
  private final long crc32Value;
  /**
   * Create a new data segment from uncompressed data and an optional codec.
   * This is called by the writer.
   * @param uncompressedData the raw, uncompressed data of this segment
   * @param codec the compression codec to try; null means no compression
   * @param compressor a Compressor to reuse; may be null, in which case a
   *        new one is created from the codec
   */
DataSegmentWriter(SimpleSeekableFormat.Buffer uncompressedData,
CompressionCodec codec,
Compressor compressor) throws IOException {
    // Try to compress the data first.
if (codec != null) {
SimpleSeekableFormat.Buffer compressedData = new SimpleSeekableFormat.Buffer();
OutputStream out;
      if (compressor == null) {
        compressor = codec.createCompressor();
      } else {
        // Reuse the caller's compressor; reset() clears any state left
        // over from a previous segment.
        compressor.reset();
      }
out = codec.createOutputStream(compressedData, compressor);
out.write(uncompressedData.getData(), 0, uncompressedData.getLength());
out.close();
      // Keep the compressed form only if the whole segment gets smaller:
      // a compressed segment stores the codec class name, while an
      // uncompressed segment stores an 8-byte CRC32 instead.
      if (compressedData.getLength() + codec.getClass().getName().length()
          < uncompressedData.getLength() + 8) {
codecName = codec.getClass().getName();
storedData = compressedData;
} else {
codecName = "";
storedData = uncompressedData;
}
} else {
// no compression
codecName = "";
storedData = uncompressedData;
}
codecNameUTF8 = getCodecNameUTF8(codecName);
    // Calculate the CRC32 only when there is no compression.
if (codecName.length() == 0) {
CRC32 crc32 = new CRC32();
crc32.update(storedData.getData(), 0, storedData.getLength());
crc32Value = crc32.getValue();
} else {
crc32Value = 0;
}
}
/**
   * Returns the on-disk size of this data segment after (potential)
   * compression.  This includes the 4-byte length field.
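   * For example, an uncompressed segment holding 100 bytes of data occupies
   * 4 (length) + 8 (CRC32) + 2 (codec name length) + 0 (empty codec name)
   * + 100 = 114 bytes.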
*/
int size() {
return 4 /*length field*/
+ (codecNameUTF8.length == 0 ? 8 : 0) /*optional crc32*/
+ 2 /*utf8 length*/ + codecNameUTF8.length
+ storedData.getLength();
}
/**
* Write this data segment into an OutputStream.
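   *
   * <p>The segment layout, as produced by the code below, is:
   * <pre>
   *   int32   length           bytes that follow this field (size() - 4)
   *   int16   codecNameLength  UTF-8 byte count of the codec class name;
   *                            0 means no compression
   *   byte[]  codecName        UTF-8 bytes of the codec class name
   *   int64   crc32            present only when codecNameLength == 0
   *   byte[]  data             the stored (possibly compressed) payload
   * </pre>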
*/
void writeTo(DataOutputStream out) throws IOException {
    // We do the UTF-8 conversion ourselves instead of relying on DataOutput
    // so that we strictly follow the UTF-8 standard, get better performance,
    // and avoid separate code to count the UTF-8 bytes (which we need in
    // order to calculate the total length).
int length = size() - 4;
out.writeInt(length);
out.writeShort(codecNameUTF8.length);
out.write(codecNameUTF8);
if (codecNameUTF8.length == 0) {
out.writeLong(crc32Value);
}
out.write(storedData.getData(), 0, storedData.getLength());
}
  /**
   * Cache from codec class name to its UTF-8 bytes, shared across segments
   * so that the same name is not re-encoded for every segment.
   */
static final ConcurrentHashMap<String, byte[]> CODEC_NAME_CACHE = new ConcurrentHashMap<String, byte[]>();
/**
   * Convert a codec name from String to its UTF-8 byte array, with caching.
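   * The lookup is a lock-free get-then-put: two threads may race to encode
   * the same name, which is harmless because they compute identical bytes
   * and the second put simply overwrites the first.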
*/
static byte[] getCodecNameUTF8(String compressionCodecName) {
byte[] codecNameBytes = CODEC_NAME_CACHE.get(compressionCodecName);
if (codecNameBytes == null) {
try {
codecNameBytes = compressionCodecName.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
CODEC_NAME_CACHE.put(compressionCodecName, codecNameBytes);
}
return codecNameBytes;
}
}