/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package parquet.column.values.dictionary;
import static org.junit.Assert.assertEquals;
import static parquet.column.Encoding.PLAIN;
import static parquet.column.Encoding.PLAIN_DICTIONARY;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.DOUBLE;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT;
import static parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import java.io.IOException;
import org.junit.Assert;
import org.junit.Test;
import parquet.bytes.BytesInput;
import parquet.column.ColumnDescriptor;
import parquet.column.Dictionary;
import parquet.column.Encoding;
import parquet.column.page.DictionaryPage;
import parquet.column.values.ValuesReader;
import parquet.column.values.ValuesWriter;
import parquet.column.values.dictionary.DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter;
import parquet.column.values.dictionary.DictionaryValuesWriter.PlainDoubleDictionaryValuesWriter;
import parquet.column.values.dictionary.DictionaryValuesWriter.PlainFloatDictionaryValuesWriter;
import parquet.column.values.dictionary.DictionaryValuesWriter.PlainIntegerDictionaryValuesWriter;
import parquet.column.values.dictionary.DictionaryValuesWriter.PlainLongDictionaryValuesWriter;
import parquet.column.values.fallback.FallbackValuesWriter;
import parquet.column.values.plain.BinaryPlainValuesReader;
import parquet.column.values.plain.PlainValuesReader;
import parquet.column.values.plain.PlainValuesWriter;
import parquet.io.api.Binary;
import parquet.schema.PrimitiveType.PrimitiveTypeName;
/**
 * Tests for the plain dictionary values writers (binary, long, int, double, float) when wrapped
 * in a {@link FallbackValuesWriter} whose fallback is a {@link PlainValuesWriter}.
 *
 * <p>Each test writes one or more pages, checks which {@link Encoding} the writer reports for the
 * page, and reads the page back with the reader matching that encoding.
 */
public class TestDictionary {

  /**
   * Wraps a dictionary writer so that it falls back to a {@link PlainValuesWriter}.
   *
   * @param dvw the dictionary writer used as the initial writer
   * @param initialSize the size passed to the {@link PlainValuesWriter} constructor
   * @return the combined fallback writer
   */
  private <I extends DictionaryValuesWriter> FallbackValuesWriter<I, PlainValuesWriter> plainFallBack(I dvw, int initialSize) {
    return FallbackValuesWriter.of(dvw, new PlainValuesWriter(initialSize));
  }

  /** Creates a binary dictionary writer with a plain fallback. */
  private FallbackValuesWriter<PlainBinaryDictionaryValuesWriter, PlainValuesWriter> newPlainBinaryDictionaryValuesWriter(int maxDictionaryByteSize, int initialSize) {
    return plainFallBack(new PlainBinaryDictionaryValuesWriter(maxDictionaryByteSize, PLAIN_DICTIONARY, PLAIN_DICTIONARY), initialSize);
  }

  /** Creates a long dictionary writer with a plain fallback. */
  private FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> newPlainLongDictionaryValuesWriter(int maxDictionaryByteSize, int initialSize) {
    return plainFallBack(new PlainLongDictionaryValuesWriter(maxDictionaryByteSize, PLAIN_DICTIONARY, PLAIN_DICTIONARY), initialSize);
  }

  /** Creates an integer dictionary writer with a plain fallback. */
  private FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> newPlainIntegerDictionaryValuesWriter(int maxDictionaryByteSize, int initialSize) {
    return plainFallBack(new PlainIntegerDictionaryValuesWriter(maxDictionaryByteSize, PLAIN_DICTIONARY, PLAIN_DICTIONARY), initialSize);
  }

  /** Creates a double dictionary writer with a plain fallback. */
  private FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> newPlainDoubleDictionaryValuesWriter(int maxDictionaryByteSize, int initialSize) {
    return plainFallBack(new PlainDoubleDictionaryValuesWriter(maxDictionaryByteSize, PLAIN_DICTIONARY, PLAIN_DICTIONARY), initialSize);
  }

  /** Creates a float dictionary writer with a plain fallback. */
  private FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> newPlainFloatDictionaryValuesWriter(int maxDictionaryByteSize, int initialSize) {
    return plainFallBack(new PlainFloatDictionaryValuesWriter(maxDictionaryByteSize, PLAIN_DICTIONARY, PLAIN_DICTIONARY), initialSize);
  }

  /**
   * Writes two pages of repeated strings (expected PLAIN_DICTIONARY) followed by a page of
   * distinct strings (expected PLAIN), then reads all three pages back.
   */
  @Test
  public void testBinaryDictionary() throws IOException {
    int count = 100;
    ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(200, 10000);
    writeRepeated(count, cw, "a");
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    writeRepeated(count, cw, "b");
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    // now we will fall back
    writeDistinct(count, cw, "c");
    BytesInput bytes3 = getBytesAndCheckEncoding(cw, PLAIN);

    DictionaryValuesReader cr = initDicReader(cw, BINARY);
    checkRepeated(count, bytes1, cr, "a");
    checkRepeated(count, bytes2, cr, "b");
    BinaryPlainValuesReader cr2 = new BinaryPlainValuesReader();
    checkDistinct(count, bytes3, cr2, "c");
  }

  /**
   * Writes 100 distinct strings into a writer with a 50-byte dictionary limit and checks, after
   * every write, that the reported encoding switches from PLAIN_DICTIONARY to PLAIN once the
   * accumulated size (string length + 4 per value) reaches the limit. The page is then read back
   * with a plain reader and the writer is reset.
   */
  @Test
  public void testBinaryDictionaryFallBack() throws IOException {
    int slabSize = 100;
    int maxDictionaryByteSize = 50;
    int valueCount = 100;
    final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(maxDictionaryByteSize, slabSize);
    int fallBackThreshold = maxDictionaryByteSize;
    int dataSize = 0;
    for (long i = 0; i < valueCount; i++) {
      Binary binary = Binary.fromString("str" + i);
      cw.writeBytes(binary);
      // each value is accounted as its byte length plus 4 extra bytes
      dataSize += (binary.length() + 4);
      if (dataSize < fallBackThreshold) {
        assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
      } else {
        assertEquals(PLAIN, cw.getEncoding());
      }
    }

    // Fell back to plain encoding, therefore use a plain values reader to read it back
    ValuesReader reader = new BinaryPlainValuesReader();
    reader.initFromPage(valueCount, cw.getBytes().toByteArray(), 0);

    for (long i = 0; i < valueCount; i++) {
      assertEquals(Binary.fromString("str" + i), reader.readBytes());
    }

    // simulate cutting the page
    cw.reset();
    assertEquals(0, cw.getBufferedSize());
  }

  /**
   * Same scenario as {@link #testBinaryDictionary()}, but the repeated values are written through
   * a single {@link Binary} instance whose backing bytes are overwritten before each write.
   */
  @Test
  public void testBinaryDictionaryChangedValues() throws IOException {
    int count = 100;
    ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(200, 10000);
    writeRepeatedWithReuse(count, cw, "a");
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    writeRepeatedWithReuse(count, cw, "b");
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    // now we will fall back
    writeDistinct(count, cw, "c");
    BytesInput bytes3 = getBytesAndCheckEncoding(cw, PLAIN);

    DictionaryValuesReader cr = initDicReader(cw, BINARY);
    checkRepeated(count, bytes1, cr, "a");
    checkRepeated(count, bytes2, cr, "b");
    BinaryPlainValuesReader cr2 = new BinaryPlainValuesReader();
    checkDistinct(count, bytes3, cr2, "c");
  }

  /**
   * Writes a first page of distinct values (expected PLAIN) and then a page of repeated values,
   * which is expected to stay PLAIN as well.
   */
  @Test
  public void testFirstPageFallBack() throws IOException {
    int count = 1000;
    ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(10000, 10000);
    writeDistinct(count, cw, "a");
    // not efficient so falls back
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN);
    writeRepeated(count, cw, "b");
    // still plain because we fell back on first page
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN);

    ValuesReader cr = new BinaryPlainValuesReader();
    checkDistinct(count, bytes1, cr, "a");
    checkRepeated(count, bytes2, cr, "b");
  }

  /**
   * Writes a dictionary-encoded first page, a second page of distinct values (expected PLAIN) and
   * a third page of repeated values, which is expected to stay PLAIN.
   */
  @Test
  public void testSecondPageFallBack() throws IOException {
    int count = 1000;
    ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(1000, 10000);
    writeRepeated(count, cw, "a");
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    writeDistinct(count, cw, "b");
    // not efficient so falls back
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN);
    writeRepeated(count, cw, "a");
    // still plain because we fell back on previous page
    BytesInput bytes3 = getBytesAndCheckEncoding(cw, PLAIN);

    ValuesReader cr = initDicReader(cw, BINARY);
    checkRepeated(count, bytes1, cr, "a");
    cr = new BinaryPlainValuesReader();
    checkDistinct(count, bytes2, cr, "b");
    checkRepeated(count, bytes3, cr, "a");
  }

  /**
   * Writes two pages of longs drawn from 50 distinct values, checks the dictionary holds 50
   * entries after each page, and reads both pages back through a dictionary reader.
   */
  @Test
  public void testLongDictionary() throws IOException {
    int count = 1000;
    int count2 = 2000;
    final FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw = newPlainLongDictionaryValuesWriter(10000, 10000);
    for (long i = 0; i < count; i++) {
      cw.writeLong(i % 50);
    }
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    for (long i = count2; i > 0; i--) {
      cw.writeLong(i % 50);
    }
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    DictionaryValuesReader cr = initDicReader(cw, PrimitiveTypeName.INT64);

    cr.initFromPage(count, bytes1.toByteArray(), 0);
    for (long i = 0; i < count; i++) {
      long back = cr.readLong();
      assertEquals(i % 50, back);
    }

    cr.initFromPage(count2, bytes2.toByteArray(), 0);
    for (long i = count2; i > 0; i--) {
      long back = cr.readLong();
      assertEquals(i % 50, back);
    }
  }

  /**
   * Writes the longs 0..99, asserting after each write that the writer reports PLAIN_DICTIONARY
   * while the value is below {@code maxDictionaryByteSize / 8} and PLAIN from then on, and then
   * reads the 100 values back through {@code reader}.
   *
   * @param cw the writer under test
   * @param reader the reader used to decode the resulting page
   * @param maxDictionaryByteSize the dictionary byte limit the writer was created with
   */
  private void roundTripLong(FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException {
    int valueCount = 100;
    int fallBackThreshold = maxDictionaryByteSize / 8;
    for (long i = 0; i < valueCount; i++) {
      cw.writeLong(i);
      if (i < fallBackThreshold) {
        assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
      } else {
        assertEquals(PLAIN, cw.getEncoding());
      }
    }

    reader.initFromPage(valueCount, cw.getBytes().toByteArray(), 0);

    for (long i = 0; i < valueCount; i++) {
      assertEquals(i, reader.readLong());
    }
  }

  /**
   * Runs {@link #roundTripLong} on a writer with a 50-byte dictionary limit, resets the writer
   * and its dictionary, and runs the same round trip a second time.
   */
  @Test
  public void testLongDictionaryFallBack() throws IOException {
    int slabSize = 100;
    int maxDictionaryByteSize = 50;
    final FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw = newPlainLongDictionaryValuesWriter(maxDictionaryByteSize, slabSize);
    // Falls back to plain encoding, therefore use a plain values reader to read it back
    ValuesReader reader = new PlainValuesReader.LongPlainValuesReader();

    roundTripLong(cw, reader, maxDictionaryByteSize);
    // simulate cutting the page
    cw.reset();
    assertEquals(0, cw.getBufferedSize());
    cw.resetDictionary();

    roundTripLong(cw, reader, maxDictionaryByteSize);
  }

  /**
   * Writes two pages of doubles drawn from 50 distinct values, checks the dictionary holds 50
   * entries after each page, and reads both pages back through a dictionary reader.
   */
  @Test
  public void testDoubleDictionary() throws IOException {
    int count = 1000;
    int count2 = 2000;
    final FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> cw = newPlainDoubleDictionaryValuesWriter(10000, 10000);

    for (double i = 0; i < count; i++) {
      cw.writeDouble(i % 50);
    }
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    for (double i = count2; i > 0; i--) {
      cw.writeDouble(i % 50);
    }
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    final DictionaryValuesReader cr = initDicReader(cw, DOUBLE);

    cr.initFromPage(count, bytes1.toByteArray(), 0);
    for (double i = 0; i < count; i++) {
      double back = cr.readDouble();
      assertEquals(i % 50, back, 0.0);
    }

    cr.initFromPage(count2, bytes2.toByteArray(), 0);
    for (double i = count2; i > 0; i--) {
      double back = cr.readDouble();
      assertEquals(i % 50, back, 0.0);
    }
  }

  /**
   * Writes the doubles 0..99, asserting after each write that the writer reports
   * PLAIN_DICTIONARY while the value is below {@code maxDictionaryByteSize / 8} and PLAIN from
   * then on, and then reads the 100 values back through {@code reader}.
   *
   * @param cw the writer under test
   * @param reader the reader used to decode the resulting page
   * @param maxDictionaryByteSize the dictionary byte limit the writer was created with
   */
  private void roundTripDouble(FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException {
    int valueCount = 100;
    int fallBackThreshold = maxDictionaryByteSize / 8;
    for (double i = 0; i < valueCount; i++) {
      cw.writeDouble(i);
      if (i < fallBackThreshold) {
        assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
      } else {
        assertEquals(PLAIN, cw.getEncoding());
      }
    }

    reader.initFromPage(valueCount, cw.getBytes().toByteArray(), 0);

    for (double i = 0; i < valueCount; i++) {
      assertEquals(i, reader.readDouble(), 0.00001);
    }
  }

  /**
   * Runs {@link #roundTripDouble} on a writer with a 50-byte dictionary limit, resets the writer
   * and its dictionary, and runs the same round trip a second time.
   */
  @Test
  public void testDoubleDictionaryFallBack() throws IOException {
    int slabSize = 100;
    int maxDictionaryByteSize = 50;
    final FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> cw = newPlainDoubleDictionaryValuesWriter(maxDictionaryByteSize, slabSize);

    // Falls back to plain encoding, therefore use a plain values reader to read it back
    ValuesReader reader = new PlainValuesReader.DoublePlainValuesReader();

    roundTripDouble(cw, reader, maxDictionaryByteSize);
    // simulate cutting the page
    cw.reset();
    assertEquals(0, cw.getBufferedSize());
    cw.resetDictionary();

    roundTripDouble(cw, reader, maxDictionaryByteSize);
  }

  /**
   * Writes two pages of ints drawn from 50 distinct values, checks the dictionary holds 50
   * entries after each page, and reads both pages back through a dictionary reader.
   */
  @Test
  public void testIntDictionary() throws IOException {
    int count = 2000;
    int count2 = 4000;
    final FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw = newPlainIntegerDictionaryValuesWriter(10000, 10000);

    for (int i = 0; i < count; i++) {
      cw.writeInteger(i % 50);
    }
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    for (int i = count2; i > 0; i--) {
      cw.writeInteger(i % 50);
    }
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    DictionaryValuesReader cr = initDicReader(cw, INT32);

    cr.initFromPage(count, bytes1.toByteArray(), 0);
    for (int i = 0; i < count; i++) {
      int back = cr.readInteger();
      assertEquals(i % 50, back);
    }

    cr.initFromPage(count2, bytes2.toByteArray(), 0);
    for (int i = count2; i > 0; i--) {
      int back = cr.readInteger();
      assertEquals(i % 50, back);
    }
  }

  /**
   * Writes the ints 0..99, asserting after each write that the writer reports PLAIN_DICTIONARY
   * while the value is below {@code maxDictionaryByteSize / 4} and PLAIN from then on, and then
   * reads the 100 values back through {@code reader}.
   *
   * @param cw the writer under test
   * @param reader the reader used to decode the resulting page
   * @param maxDictionaryByteSize the dictionary byte limit the writer was created with
   */
  private void roundTripInt(FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException {
    int valueCount = 100;
    int fallBackThreshold = maxDictionaryByteSize / 4;
    for (int i = 0; i < valueCount; i++) {
      cw.writeInteger(i);
      if (i < fallBackThreshold) {
        assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
      } else {
        assertEquals(PLAIN, cw.getEncoding());
      }
    }

    reader.initFromPage(valueCount, cw.getBytes().toByteArray(), 0);

    for (int i = 0; i < valueCount; i++) {
      assertEquals(i, reader.readInteger());
    }
  }

  /**
   * Runs {@link #roundTripInt} on a writer with a 50-byte dictionary limit, resets the writer and
   * its dictionary, and runs the same round trip a second time.
   */
  @Test
  public void testIntDictionaryFallBack() throws IOException {
    int slabSize = 100;
    int maxDictionaryByteSize = 50;
    final FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw = newPlainIntegerDictionaryValuesWriter(maxDictionaryByteSize, slabSize);

    // Falls back to plain encoding, therefore use a plain values reader to read it back
    ValuesReader reader = new PlainValuesReader.IntegerPlainValuesReader();

    roundTripInt(cw, reader, maxDictionaryByteSize);
    // simulate cutting the page
    cw.reset();
    assertEquals(0, cw.getBufferedSize());
    cw.resetDictionary();

    roundTripInt(cw, reader, maxDictionaryByteSize);
  }

  /**
   * Writes two pages of floats drawn from 50 distinct values, checks the dictionary holds 50
   * entries after each page, and reads both pages back through a dictionary reader.
   */
  @Test
  public void testFloatDictionary() throws IOException {
    int count = 2000;
    int count2 = 4000;
    final FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> cw = newPlainFloatDictionaryValuesWriter(10000, 10000);

    for (float i = 0; i < count; i++) {
      cw.writeFloat(i % 50);
    }
    BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    for (float i = count2; i > 0; i--) {
      cw.writeFloat(i % 50);
    }
    BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    assertEquals(50, cw.initialWriter.getDictionarySize());

    DictionaryValuesReader cr = initDicReader(cw, FLOAT);

    cr.initFromPage(count, bytes1.toByteArray(), 0);
    for (float i = 0; i < count; i++) {
      float back = cr.readFloat();
      assertEquals(i % 50, back, 0.0f);
    }

    cr.initFromPage(count2, bytes2.toByteArray(), 0);
    for (float i = count2; i > 0; i--) {
      float back = cr.readFloat();
      assertEquals(i % 50, back, 0.0f);
    }
  }

  /**
   * Writes the floats 0..99, asserting after each write that the writer reports PLAIN_DICTIONARY
   * while the value is below {@code maxDictionaryByteSize / 4} and PLAIN from then on, and then
   * reads the 100 values back through {@code reader}.
   *
   * @param cw the writer under test
   * @param reader the reader used to decode the resulting page
   * @param maxDictionaryByteSize the dictionary byte limit the writer was created with
   */
  private void roundTripFloat(FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException {
    int valueCount = 100;
    int fallBackThreshold = maxDictionaryByteSize / 4;
    for (float i = 0; i < valueCount; i++) {
      cw.writeFloat(i);
      if (i < fallBackThreshold) {
        assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
      } else {
        assertEquals(PLAIN, cw.getEncoding());
      }
    }

    reader.initFromPage(valueCount, cw.getBytes().toByteArray(), 0);

    for (float i = 0; i < valueCount; i++) {
      assertEquals(i, reader.readFloat(), 0.00001);
    }
  }

  /**
   * Runs {@link #roundTripFloat} on a writer with a 50-byte dictionary limit, resets the writer
   * and its dictionary, and runs the same round trip a second time.
   */
  @Test
  public void testFloatDictionaryFallBack() throws IOException {
    int slabSize = 100;
    int maxDictionaryByteSize = 50;
    final FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> cw = newPlainFloatDictionaryValuesWriter(maxDictionaryByteSize, slabSize);

    // Falls back to plain encoding, therefore use a plain values reader to read it back
    ValuesReader reader = new PlainValuesReader.FloatPlainValuesReader();

    roundTripFloat(cw, reader, maxDictionaryByteSize);
    // simulate cutting the page
    cw.reset();
    assertEquals(0, cw.getBufferedSize());
    cw.resetDictionary();

    roundTripFloat(cw, reader, maxDictionaryByteSize);
  }

  /**
   * Initializes a dictionary reader on a page whose start offset equals the length of the byte
   * array. The test has no assertions; it passes when {@code initFromPage} does not throw.
   */
  @Test
  public void testZeroValues() throws IOException {
    FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw = newPlainIntegerDictionaryValuesWriter(100, 100);
    cw.writeInteger(34);
    cw.writeInteger(34);
    getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
    DictionaryValuesReader reader = initDicReader(cw, INT32);

    // pretend there are 100 nulls. what matters is offset = bytes.length.
    byte[] bytes = {0x00, 0x01, 0x02, 0x03}; // data doesn't matter
    int offset = bytes.length;
    reader.initFromPage(100, bytes, offset);
  }

  /**
   * Builds a {@link DictionaryValuesReader} from a copy of the dictionary page created by the
   * writer.
   *
   * @param cw the writer whose dictionary page is read
   * @param type the primitive type used for the column descriptor
   * @return a reader backed by the writer's dictionary
   */
  private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type)
      throws IOException {
    final DictionaryPage dictionaryPage = cw.createDictionaryPage().copy();
    final ColumnDescriptor descriptor = new ColumnDescriptor(new String[] {"foo"}, type, 0, 0);
    final Dictionary dictionary = PLAIN.initDictionary(descriptor, dictionaryPage);
    return new DictionaryValuesReader(dictionary);
  }

  /**
   * Reads {@code count} values from the page and asserts that value {@code i} is
   * {@code prefix + i}.
   */
  private void checkDistinct(int count, BytesInput bytes, ValuesReader cr, String prefix) throws IOException {
    cr.initFromPage(count, bytes.toByteArray(), 0);
    for (int i = 0; i < count; i++) {
      Assert.assertEquals(prefix + i, cr.readBytes().toStringUsingUTF8());
    }
  }

  /**
   * Reads {@code count} values from the page and asserts that value {@code i} is
   * {@code prefix + (i % 10)}.
   */
  private void checkRepeated(int count, BytesInput bytes, ValuesReader cr, String prefix) throws IOException {
    cr.initFromPage(count, bytes.toByteArray(), 0);
    for (int i = 0; i < count; i++) {
      Assert.assertEquals(prefix + (i % 10), cr.readBytes().toStringUsingUTF8());
    }
  }

  /** Writes {@code count} strings of the form {@code prefix + i}. */
  private void writeDistinct(int count, ValuesWriter cw, String prefix) {
    for (int i = 0; i < count; i++) {
      cw.writeBytes(Binary.fromString(prefix + i));
    }
  }

  /** Writes {@code count} strings of the form {@code prefix + (i % 10)}. */
  private void writeRepeated(int count, ValuesWriter cw, String prefix) {
    for (int i = 0; i < count; i++) {
      cw.writeBytes(Binary.fromString(prefix + (i % 10)));
    }
  }

  /**
   * Writes the same values as {@link #writeRepeated}, but always passes the same {@link Binary}
   * instance to the writer, copying the bytes of each new value into it before the write.
   */
  private void writeRepeatedWithReuse(int count, ValuesWriter cw, String prefix) {
    Binary reused = Binary.fromString(prefix + "0");
    for (int i = 0; i < count; i++) {
      Binary content = Binary.fromString(prefix + (i % 10));
      // overwrite the reused instance's bytes with the new value before handing it to the writer
      System.arraycopy(content.getBytes(), 0, reused.getBytes(), 0, reused.length());
      cw.writeBytes(reused);
    }
  }

  /**
   * Copies the writer's current bytes, asserts the encoding it reports, and resets the writer.
   *
   * @param cw the writer holding the page
   * @param encoding the encoding the writer is expected to report
   * @return a copy of the page bytes taken before the reset
   */
  private BytesInput getBytesAndCheckEncoding(ValuesWriter cw, Encoding encoding)
      throws IOException {
    BytesInput bytes = BytesInput.copy(cw.getBytes());
    assertEquals(encoding, cw.getEncoding());
    cw.reset();
    return bytes;
  }
}