/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package parquet.column.values.rle;

import static org.junit.Assert.assertEquals;

import java.io.ByteArrayInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.junit.Test;

import parquet.bytes.BytesUtils;
import parquet.column.values.bitpacking.BytePacker;
import parquet.column.values.bitpacking.Packer;

/**
 * Tests for {@link RunLengthBitPackingHybridEncoder}.
 *
 * <p>Each test writes a sequence of ints to an encoder, then reads the produced bytes back and
 * checks the run headers and payloads one by one. As the inline comments spell out, an RLE run
 * header is expected to be {@code count << 1} and a bit-packed run header is expected to be
 * {@code ((numValues / 8) << 1) | 1}.
 *
 * @author Alex Levenson
 */
public class TestRunLengthBitPackingHybridEncoder {

  /** Two long runs of a single value each must come out as two RLE runs and nothing else. */
  @Test
  public void testRLEOnly() throws Exception {
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(3, 5);
    for (int i = 0; i < 100; i++) {
      encoder.writeInt(4);
    }
    for (int i = 0; i < 100; i++) {
      encoder.writeInt(5);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // header = 100 << 1 = 200
    assertEquals(200, BytesUtils.readUnsignedVarInt(is));
    // payload = 4
    assertEquals(4, BytesUtils.readIntLittleEndianOnOneByte(is));

    // header = 100 << 1 = 200
    assertEquals(200, BytesUtils.readUnsignedVarInt(is));
    // payload = 5
    assertEquals(5, BytesUtils.readIntLittleEndianOnOneByte(is));

    // end of stream
    assertEquals(-1, is.read());
  }

  /** A stream that starts with repeated zeros must be encoded as a single RLE run of zeros. */
  @Test
  public void testRepeatedZeros() throws Exception {
    // previousValue is initialized to 0
    // make sure that repeated 0s at the beginning
    // of the stream don't trip up the repeat count
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(3, 5);
    for (int i = 0; i < 10; i++) {
      encoder.writeInt(0);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // header = 10 << 1 = 20
    assertEquals(20, BytesUtils.readUnsignedVarInt(is));
    // payload = 0
    assertEquals(0, BytesUtils.readIntLittleEndianOnOneByte(is));

    // end of stream
    assertEquals(-1, is.read());
  }

  /** With a first constructor argument of 0, an RLE run must consist of a header only. */
  @Test
  public void testBitWidthZero() throws Exception {
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(0, 5);
    for (int i = 0; i < 10; i++) {
      encoder.writeInt(0);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // header = 10 << 1 = 20
    assertEquals(20, BytesUtils.readUnsignedVarInt(is));

    // no payload byte follows the header: end of stream
    assertEquals(-1, is.read());
  }

  /** 100 non-repeating values must be emitted as one bit-packed run of 104 (13 groups of 8). */
  @Test
  public void testBitPackingOnly() throws Exception {
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(3, 5);
    for (int i = 0; i < 100; i++) {
      encoder.writeInt(i % 3);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // header = ((104/8) << 1) | 1 = 27
    assertEquals(27, BytesUtils.readUnsignedVarInt(is));

    List<Integer> values = unpack(3, 104, is);
    // only the first 100 unpacked values were written by the test; the rest is padding
    for (int i = 0; i < 100; i++) {
      assertEquals(i % 3, (int) values.get(i));
    }

    // end of stream
    assertEquals(-1, is.read());
  }

  /** 1000 non-repeating values must be split into bit-packed runs of 504 and 496 values. */
  @Test
  public void testBitPackingOverflow() throws Exception {
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(3, 5);
    for (int i = 0; i < 1000; i++) {
      encoder.writeInt(i % 3);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // 504 is the max number of values in a bit packed run
    // that still has a header of 1 byte
    // header = ((504/8) << 1) | 1 = 127
    assertEquals(127, BytesUtils.readUnsignedVarInt(is));
    List<Integer> values = unpack(3, 504, is);
    for (int i = 0; i < 504; i++) {
      assertEquals(i % 3, (int) values.get(i));
    }

    // there should now be 496 values in another bit-packed run
    // header = ((496/8) << 1) | 1 = 125
    assertEquals(125, BytesUtils.readUnsignedVarInt(is));
    values = unpack(3, 496, is);
    for (int i = 0; i < 496; i++) {
      assertEquals((i + 504) % 3, (int) values.get(i));
    }

    // end of stream
    assertEquals(-1, is.read());
  }

  /**
   * A short repeat that completes a bit-packed group of 8 must stay bit-packed; the long repeat
   * of the same value that follows must become an RLE run.
   */
  @Test
  public void testTransitionFromBitPackingToRle() throws Exception {
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(3, 5);

    // 5 obviously bit-packed values
    encoder.writeInt(0);
    encoder.writeInt(1);
    encoder.writeInt(0);
    encoder.writeInt(1);
    encoder.writeInt(0);

    // three repeated values, that ought to be bit-packed as well
    encoder.writeInt(2);
    encoder.writeInt(2);
    encoder.writeInt(2);

    // lots more repeated values, that should be rle-encoded
    for (int i = 0; i < 100; i++) {
      encoder.writeInt(2);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // header = ((8/8) << 1) | 1 = 3
    assertEquals(3, BytesUtils.readUnsignedVarInt(is));

    List<Integer> values = unpack(3, 8, is);
    assertEquals(Arrays.asList(0, 1, 0, 1, 0, 2, 2, 2), values);

    // header = 100 << 1 = 200
    assertEquals(200, BytesUtils.readUnsignedVarInt(is));
    // payload = 2
    assertEquals(2, BytesUtils.readIntLittleEndianOnOneByte(is));

    // end of stream
    assertEquals(-1, is.read());
  }

  /** 9 values must be flushed as a bit-packed run of 16, the last 7 slots being zeros. */
  @Test
  public void testPaddingZerosOnUnfinishedBitPackedRuns() throws Exception {
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(5, 5);
    for (int i = 0; i < 9; i++) {
      encoder.writeInt(i + 1);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // header = ((16/8) << 1) | 1 = 5
    assertEquals(5, BytesUtils.readUnsignedVarInt(is));

    List<Integer> values = unpack(5, 16, is);
    assertEquals(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0), values);

    // end of stream
    assertEquals(-1, is.read());
  }

  /**
   * Alternates between the two run kinds: an RLE run, a bit-packed run of 16 that absorbs the
   * first 6 values of the following repeat, and then two more RLE runs.
   */
  @Test
  public void testSwitchingModes() throws Exception {
    RunLengthBitPackingHybridEncoder encoder = new RunLengthBitPackingHybridEncoder(9, 100);

    // rle first
    for (int i = 0; i < 25; i++) {
      encoder.writeInt(17);
    }

    // bit-packing
    for (int i = 0; i < 7; i++) {
      encoder.writeInt(7);
    }
    encoder.writeInt(8);
    encoder.writeInt(9);
    encoder.writeInt(10);

    // bit-packing followed by rle
    for (int i = 0; i < 25; i++) {
      encoder.writeInt(6);
    }

    // followed by a different rle
    for (int i = 0; i < 8; i++) {
      encoder.writeInt(5);
    }

    ByteArrayInputStream is = new ByteArrayInputStream(encoder.toBytes().toByteArray());

    // header = 25 << 1 = 50
    assertEquals(50, BytesUtils.readUnsignedVarInt(is));
    // payload = 17, stored in 2 bytes
    assertEquals(17, BytesUtils.readIntLittleEndianOnTwoBytes(is));

    // header = ((16/8) << 1) | 1 = 5
    assertEquals(5, BytesUtils.readUnsignedVarInt(is));
    List<Integer> values = unpack(9, 16, is);
    int v = 0;
    for (int i = 0; i < 7; i++) {
      assertEquals(7, (int) values.get(v++));
    }
    assertEquals(8, (int) values.get(v++));
    assertEquals(9, (int) values.get(v++));
    assertEquals(10, (int) values.get(v++));
    // the first 6 of the 25 sixes fill up the second group of 8
    for (int i = 0; i < 6; i++) {
      assertEquals(6, (int) values.get(v++));
    }

    // the remaining 19 sixes: header = 19 << 1 = 38
    assertEquals(38, BytesUtils.readUnsignedVarInt(is));
    // payload = 6, stored in 2 bytes
    assertEquals(6, BytesUtils.readIntLittleEndianOnTwoBytes(is));

    // header = 8 << 1 = 16
    assertEquals(16, BytesUtils.readUnsignedVarInt(is));
    // payload = 5, stored in 2 bytes
    assertEquals(5, BytesUtils.readIntLittleEndianOnTwoBytes(is));

    // end of stream
    assertEquals(-1, is.read());
  }

  /**
   * Decodes a hand-built stream holding a single bit-packed group in which only 3 values are
   * meaningful, and checks that the decoder consumed the whole stream.
   */
  @Test
  public void testGroupBoundary() throws Exception {
    byte[] bytes = new byte[2];
    // Create an RLE byte stream that has 3 values (1 literal group) with
    // bit width 2.
    bytes[0] = (1 << 1) | 1;
    bytes[1] = (1 << 0) | (2 << 2) | (3 << 4);
    ByteArrayInputStream stream = new ByteArrayInputStream(bytes);
    RunLengthBitPackingHybridDecoder decoder = new RunLengthBitPackingHybridDecoder(2, stream);

    // JUnit's contract is assertEquals(expected, actual)
    assertEquals(1, decoder.readInt());
    assertEquals(2, decoder.readInt());
    assertEquals(3, decoder.readInt());
    assertEquals(0, stream.available());
  }

  /**
   * Reads bit-packed values from {@code is}, 8 values ({@code bitWidth} bytes) at a time, using
   * the little-endian byte packer for {@code bitWidth}.
   *
   * <p>Whole groups of 8 are always unpacked, so the returned list holds {@code numValues}
   * rounded up to the next multiple of 8 elements.
   *
   * @param bitWidth number of bits per value, which is also the number of bytes per group of 8
   * @param numValues minimum number of values to read
   * @param is stream positioned at the first byte of the bit-packed payload
   * @return the unpacked values, in stream order
   * @throws IllegalStateException if the stream ends in the middle of a group
   */
  private static List<Integer> unpack(int bitWidth, int numValues, ByteArrayInputStream is)
      throws Exception {
    BytePacker packer = Packer.LITTLE_ENDIAN.newBytePacker(bitWidth);
    int[] unpacked = new int[8];
    byte[] next8Values = new byte[bitWidth];
    List<Integer> values = new ArrayList<Integer>(numValues);

    while (values.size() < numValues) {
      for (int i = 0; i < bitWidth; i++) {
        int b = is.read();
        if (b < 0) {
          // Without this check the -1 end-of-stream marker would be cast to the byte 0xFF and
          // silently decoded as data.
          throw new IllegalStateException(
              "unexpected end of stream after " + values.size() + " of " + numValues + " values");
        }
        next8Values[i] = (byte) b;
      }

      packer.unpack8Values(next8Values, 0, unpacked, 0);

      for (int v = 0; v < 8; v++) {
        values.add(unpacked[v]);
      }
    }

    return values;
  }
}