package FlexibleEncoding.Parquet;

/*
 * adapted from Parquet
 */

//import static parquet.Log.DEBUG;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

/**
 * Decodes values written in the grammar described in {@link RunLengthBitPackingHybridEncoder}.
 *
 * <p>The input is consumed as a sequence of runs. Each run starts with an unsigned varint
 * header: the lowest bit selects the mode (0 = RLE, 1 = bit-packed) and the remaining bits
 * give the run length, which is a value count for RLE runs and a count of 8-value groups
 * for bit-packed runs.
 *
 * @author Julien Le Dem
 */
public class RunLengthBitPackingHybridDecoder {
  private static final Log LOG = Log.getLog(RunLengthBitPackingHybridDecoder.class);

  /** Number of values unpacked per bit-packed group. */
  private static final int VALUES_PER_GROUP = 8;

  private enum MODE { RLE, PACKED }

  private final int bitWidth;
  private final BytePacker packer;
  private final ByteArrayInputStream in;

  /** Mode of the run currently being served by {@link #readInt()}. */
  private MODE mode;
  /** Number of values of the current run that have not been returned yet. */
  private int currentCount;
  /** Repeated value of the current run; only meaningful in RLE mode. */
  private int currentValue;
  /** Unpacked values of the current run; only meaningful in PACKED mode. */
  private int[] currentBuffer;

  /**
   * Creates a decoder reading from the given stream.
   *
   * @param bitWidth number of bits per encoded value; must be between 0 and 32 inclusive
   *     (enforced through {@code Preconditions.checkArgument})
   * @param in the stream holding the encoded runs
   */
  public RunLengthBitPackingHybridDecoder(int bitWidth, ByteArrayInputStream in) {
    if (Log.DEBUG) {
      LOG.debug("decoding bitWidth " + bitWidth);
    }
    Preconditions.checkArgument(bitWidth >= 0 && bitWidth <= 32, "bitWidth must be >= 0 and <= 32");
    this.bitWidth = bitWidth;
    this.packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth);
    this.in = in;
  }

  /**
   * Returns the next decoded value, loading further runs from the stream as needed.
   *
   * @return the next value
   * @throws IOException if reading from the underlying stream fails
   */
  public int readInt() throws IOException {
    // A header may announce an empty run (count 0). Keep reading headers until one actually
    // yields values; otherwise the decrement below would drive currentCount negative and the
    // decoder would never load another run.
    while (currentCount == 0) {
      readNext();
    }
    --currentCount;
    switch (mode) {
      case RLE:
        return currentValue;
      case PACKED:
        // Values are served front to back; currentCount counts the ones still pending.
        return currentBuffer[currentBuffer.length - 1 - currentCount];
      default:
        throw new ParquetDecodingException("not a valid mode " + mode);
    }
  }

  /**
   * Reads the next run header and loads the run it describes into the decoder state
   * ({@code mode}, {@code currentCount} and either {@code currentValue} or
   * {@code currentBuffer}).
   *
   * @throws IOException if reading from the underlying stream fails
   */
  private void readNext() throws IOException {
    Preconditions.checkArgument(in.available() > 0, "Reading past RLE/BitPacking stream.");
    final int header = BytesUtils.readUnsignedVarInt(in);
    mode = (header & 1) == 0 ? MODE.RLE : MODE.PACKED;
    switch (mode) {
      case RLE:
        currentCount = header >>> 1;
        if (Log.DEBUG) {
          LOG.debug("reading " + currentCount + " values RLE");
        }
        currentValue = BytesUtils.readIntLittleEndianPaddedOnBitWidth(in, bitWidth);
        break;
      case PACKED:
        final int numGroups = header >>> 1;
        // Each group packs 8 values of bitWidth bits, i.e. exactly bitWidth bytes.
        final long packedByteCount = (long) numGroups * bitWidth;
        // Reject headers whose sizes would silently overflow int arithmetic below.
        if (numGroups > Integer.MAX_VALUE / VALUES_PER_GROUP || packedByteCount > Integer.MAX_VALUE) {
          throw new ParquetDecodingException(
              "bit packed run too large: " + numGroups + " groups with bitWidth " + bitWidth);
        }
        currentCount = numGroups * VALUES_PER_GROUP;
        if (Log.DEBUG) {
          LOG.debug("reading " + currentCount + " values BIT PACKED");
        }
        currentBuffer = new int[currentCount]; // TODO: reuse a buffer
        byte[] bytes = new byte[(int) packedByteCount];
        // At the end of the file RLE data though, there might not be that many bytes left.
        // Bytes that are not available stay zero in the buffer.
        int bytesToRead = Math.min(bytes.length, in.available());
        new DataInputStream(in).readFully(bytes, 0, bytesToRead);
        for (int valueIndex = 0, byteIndex = 0;
            valueIndex < currentCount;
            valueIndex += VALUES_PER_GROUP, byteIndex += bitWidth) {
          packer.unpack8Values(bytes, byteIndex, currentBuffer, valueIndex);
        }
        break;
      default:
        throw new ParquetDecodingException("not a valid mode " + mode);
    }
  }
}