package org.apache.hadoop.io.simpleseekableformat;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.CRC32;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * This class holds the data related to a single data segment.
 */
class DataSegmentReader {

  static class EmptyDataSegmentException extends EOFException {
  }

  // uncompressed data stream
  private final InputStream uncompressedData;

  /**
   * May throw EOFException if the InputStream does not have a
   * complete data segment.
   *
   * NOTE: This class holds a reference to the Decompressor in
   * the decompressorCache until the return value of
   * getInputStream() is closed.
   *
   * @param decompressorCache
   * @throws EmptyDataSegmentException if there is nothing to read.
   * @throws EOFException if the data segment is not complete.
   */
  DataSegmentReader(DataInputStream in, Configuration conf,
      HashMap<Text, Decompressor> decompressorCache)
      throws EmptyDataSegmentException, EOFException,
      ClassNotFoundException, IOException {

    // Read from DataInputStream
    // 1. Read length
    int length = 0;
    try {
      length = in.readInt();
    } catch (EOFException e) {
      throw new EmptyDataSegmentException();
    }

    // 2. Read codec
    int codecNameUTF8Length = in.readShort();
    byte[] codecNameUTF8 = new byte[codecNameUTF8Length];
    in.readFully(codecNameUTF8);
    Text codecNameText = new Text(codecNameUTF8);

    // 3. Read CRC32 (only present when uncompressed)
    boolean hasCrc32 = (codecNameUTF8Length == 0);
    long crc32Value = 0;
    if (hasCrc32) {
      crc32Value = in.readLong();
    }

    // 4. Read data
    byte[] storedData = new byte[length - (hasCrc32 ? 8 : 0)/*crc32*/
        - 2/*codec length*/ - codecNameUTF8Length];
    in.readFully(storedData);

    // Verify the checksum
    if (hasCrc32) {
      CRC32 crc32 = new CRC32();
      crc32.update(storedData);
      if (crc32.getValue() != crc32Value) {
        throw new CorruptedDataException("Corrupted data segment with length "
            + length + " crc32 expected " + crc32Value + " but got "
            + crc32.getValue());
      }
    }

    // Uncompress the data if needed
    if (codecNameUTF8Length == 0) {
      // no compression
      uncompressedData = new ByteArrayInputStream(storedData);
    } else {
      CompressionCodec codec = getCodecFromName(codecNameText, conf);
      Decompressor decompressor = null;
      if (decompressorCache != null) {
        // Create decompressor and add to cache if needed.
        decompressor = decompressorCache.get(codecNameText);
        if (decompressor == null) {
          decompressor = codec.createDecompressor();
          decompressorCache.put(codecNameText, decompressor);
        } else {
          decompressor.reset();
        }
      }
      if (decompressor == null) {
        uncompressedData = codec.createInputStream(
            new ByteArrayInputStream(storedData));
      } else {
        uncompressedData = codec.createInputStream(
            new ByteArrayInputStream(storedData), decompressor);
      }
    }
  }

  InputStream getInputStream() {
    return uncompressedData;
  }

  /**
   * A simple cache to save the cost of reflection and object creation.
   */
  private static final ConcurrentHashMap<Text, CompressionCodec> CODEC_CACHE =
      new ConcurrentHashMap<Text, CompressionCodec>();

  @SuppressWarnings("unchecked")
  private static CompressionCodec getCodecFromName(Text codecName,
      Configuration conf) throws ClassNotFoundException {
    CompressionCodec result = CODEC_CACHE.get(codecName);
    if (result == null) {
      Class<? extends CompressionCodec> codecClass =
          (Class<? extends CompressionCodec>) Class.forName(codecName.toString());
      result = ReflectionUtils.newInstance(codecClass, conf);
      CODEC_CACHE.put(codecName, result);
    }
    return result;
  }
}
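
/*
 * Usage sketch (an illustrative addition, not part of the original file):
 * drains every data segment from a stream by constructing DataSegmentReader
 * instances until EmptyDataSegmentException signals a clean end of input.
 * The shared decompressorCache lets consecutive segments that use the same
 * codec reuse one Decompressor instead of allocating a new one per segment.
 */
class DataSegmentReaderUsageSketch {

  /** Copies the uncompressed bytes of every segment in {@code in} to {@code out}. */
  static void copyAllSegments(DataInputStream in, Configuration conf,
      java.io.OutputStream out) throws ClassNotFoundException, IOException {
    HashMap<Text, Decompressor> decompressorCache =
        new HashMap<Text, Decompressor>();
    byte[] buffer = new byte[4096];
    while (true) {
      DataSegmentReader reader;
      try {
        reader = new DataSegmentReader(in, conf, decompressorCache);
      } catch (DataSegmentReader.EmptyDataSegmentException e) {
        break;  // no more segments in the stream
      }
      InputStream segment = reader.getInputStream();
      try {
        int n;
        while ((n = segment.read(buffer)) != -1) {
          out.write(buffer, 0, n);
        }
      } finally {
        // Closing the stream releases the cached Decompressor reference
        // (see the NOTE on the DataSegmentReader constructor).
        segment.close();
      }
    }
  }
}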