package org.apache.hadoop.io.simpleseekableformat;

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.SortedMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPrematureEOFException;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.simpleseekableformat.DataSegmentReader.EmptyDataSegmentException;

/**
 * The reader for the Simple Seekable Format (SSF).
 *
 * This class inherits {@link CompressionInputStream} because an instance of
 * this will be returned by SimpleSeekableFormatCodec.
 *
 * The stream is a sequence of data segments interleaved with metadata blocks;
 * {@link #interleavedIn} strips out the metadata blocks (feeding them to
 * {@link #metaData}) while {@link #dataIn} exposes the remaining data-segment
 * bytes. {@link #dataSegmentIn} is the decoded view of the current data
 * segment, replaced lazily as segments are exhausted.
 *
 * See {@link SimpleSeekableFormat}
 */
public class SimpleSeekableFormatInputStream extends CompressionInputStream {

  private final InterleavedInputStream interleavedIn;
  private final DataInputStream dataIn;
  // Decoded view of the current data segment; null until the first segment
  // is opened and between segments.
  private InputStream dataSegmentIn;
  // Stores the latest metadata block seen by interleavedIn.
  private final SimpleSeekableFormat.MetaData metaData;
  // Decompressors are cached per codec name so that repeated segments using
  // the same codec do not re-allocate native resources.
  private final HashMap<Text, Decompressor> decompressorCache =
      new HashMap<Text, Decompressor>();
  private final Configuration conf = new Configuration();

  public SimpleSeekableFormatInputStream(InputStream in) {
    // We don't use the inherited field "in" at all: all reads go through
    // interleavedIn / dataIn below.
    super(null);
    metaData = new SimpleSeekableFormat.MetaData();
    interleavedIn = createInterleavedInputStream(in,
        SimpleSeekableFormat.METADATA_BLOCK_LENGTH,
        SimpleSeekableFormat.DATA_BLOCK_LENGTH,
        new SimpleSeekableFormat.MetaDataConsumer(metaData));
    this.dataIn = new DataInputStream(interleavedIn);
  }

  /**
   * This factory method can be overwritten by a subclass to provide different
   * behavior. It's only called in the constructor.
   */
  protected InterleavedInputStream createInterleavedInputStream(InputStream in,
      int metaDataBlockLength, int dataBlockLength,
      SimpleSeekableFormat.MetaDataConsumer consumer) {
    return new InterleavedInputStream(in, metaDataBlockLength,
        dataBlockLength, consumer);
  }

  protected InterleavedInputStream getInterleavedIn() {
    return interleavedIn;
  }

  protected SimpleSeekableFormat.MetaData getMetaData() {
    return metaData;
  }

  /**
   * Makes sure {@link #dataSegmentIn} points at a data segment, opening the
   * first one if none is open yet.
   *
   * @return false if there are no more data segments.
   */
  private boolean ensureDataSegment() throws IOException {
    return dataSegmentIn != null || moveToNextDataSegment();
  }

  @Override
  public int read() throws IOException {
    if (!ensureDataSegment()) {
      return -1;
    }
    do {
      int result = dataSegmentIn.read();
      if (result != -1) {
        return result;
      }
      // Current segment is exhausted; advance to the next one.
      if (!moveToNextDataSegment()) {
        return -1;
      }
    } while (true);
  }

  @Override
  public int read(byte[] b, int start, int length) throws IOException {
    if (!ensureDataSegment()) {
      return -1;
    }
    do {
      int result = dataSegmentIn.read(b, start, length);
      if (result != -1) {
        return result;
      }
      // Current segment is exhausted; advance to the next one.
      if (!moveToNextDataSegment()) {
        return -1;
      }
    } while (true);
  }

  @Override
  public void close() throws IOException {
    clearDataSegment();
    dataIn.close();
  }

  /**
   * This function depends on the fact that the underlying
   * dataSegmentIn.available() only returns 0 at EOF. Otherwise it will break
   * because it jumps over a dataSegmentIn that has available() == 0.
   */
  @Override
  public int available() throws IOException {
    if (!ensureDataSegment()) {
      return 0;
    }
    do {
      int result = dataSegmentIn.available();
      if (result != 0) {
        return result;
      }
      if (!moveToNextDataSegment()) {
        return 0;
      }
    } while (true);
  }

  /**
   * Closes the current data segment (if any) and opens the next one from the
   * underlying stream.
   *
   * @return false if there are no more data segments.
   * @throws CodecPrematureEOFException if the underlying stream ends in the
   *         middle of a data segment (e.g. a truncated file).
   */
  private boolean moveToNextDataSegment() throws IOException {
    try {
      clearDataSegment();
      DataSegmentReader dataSegmentReader =
          new DataSegmentReader(dataIn, conf, decompressorCache);
      dataSegmentIn = dataSegmentReader.getInputStream();
    } catch (EmptyDataSegmentException e) {
      // No more data segments available: normal end of stream.
      return false;
    } catch (EOFException e) {
      // The underlying stream ended in the middle of a data segment, e.g. a
      // truncated file. Surface it as a codec-level premature EOF, keeping
      // the original exception as the cause (EOFException has no
      // (String, Throwable) constructor, so chain via initCause).
      CodecPrematureEOFException premature =
          new CodecPrematureEOFException("Truncated .SSF file detected.");
      premature.initCause(e);
      throw premature;
    } catch (ClassNotFoundException e) {
      // The data segment names a codec class that cannot be loaded.
      throw new RuntimeException(e);
    }
    return true;
  }

  /**
   * Called by subclass to clear out the current dataSegmentIn.
   */
  protected void clearDataSegment() throws IOException {
    if (dataSegmentIn != null) {
      dataSegmentIn.close();
      dataSegmentIn = null;
    }
  }

  @Override
  public void resetState() throws IOException {
    // UnsupportedOperationException is a RuntimeException, so existing
    // callers that catch RuntimeException are unaffected.
    throw new UnsupportedOperationException(
        "SimpleSeekableFormatInputStream does not support resetState()");
  }

  /**
   * This function seeks forward using all "available" bytes.
   * It returns the (uncompressed) offset after the seek.
   *
   * This function throws EOFException if there is no available complete
   * metaDataBlock, or the metaDataBlock points to a position after the file
   * end (e.g. truncated files).
   */
  public long seekForward() throws IOException {
    // Try to read the last metadata block.
    interleavedIn.skipToLastAvailableMetaDataBlock();
    if (!interleavedIn.readMetaDataIfNeeded()) {
      throw new EOFException("Cannot get a complete metadata block");
    }
    // Move the interleavedIn to the beginning of a dataSegment.
    SortedMap<Long, Long> offsetPairs = metaData.getOffsetPairs();
    // The last key in the offsetPairs points to the farthest position that
    // we can seek to.
    long uncompressedDataOffset = offsetPairs.lastKey();
    long compressedDataOffset = offsetPairs.get(uncompressedDataOffset);
    long toSkip = compressedDataOffset - interleavedIn.getDataOffset();
    if (toSkip < 0) {
      throw new CorruptedDataException("SSF format error: The last offset pair"
          + " is before the current position in InterleaveStream!");
    }
    try {
      interleavedIn.skipExactly(toSkip);
    } catch (EOFException ignored) {
      // Ignore this exception.
      // This is the PTail use case. We don't care about this
      // CodecPrematureEOFException.
    }
    clearDataSegment();
    return uncompressedDataOffset;
  }
}