package FlexibleEncoding.Parquet;

/*
 * adapted from Parquet
 */

//import static parquet.column.values.bitpacking.Packer.BIG_ENDIAN;
//import static parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
//import static parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
//import static parquet.schema.PrimitiveType.PrimitiveTypeName.BOOLEAN;

import java.io.IOException;

//import parquet.io.ParquetDecodingException;

/**
 * Encoding of the data.
 *
 * @author Julien Le Dem
 */
public enum Encoding {

  PLAIN {
    @Override
    public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
      switch (descriptor.getType()) {
      case BOOLEAN:
        return new BooleanPlainValuesReader();
      case BINARY:
        return new BinaryPlainValuesReader();
      case FLOAT:
        return new PlainValuesReader.FloatPlainValuesReader();
      case DOUBLE:
        return new PlainValuesReader.DoublePlainValuesReader();
      case INT32:
        return new PlainValuesReader.IntegerPlainValuesReader();
      case INT64:
        return new PlainValuesReader.LongPlainValuesReader();
      case FIXED_LEN_BYTE_ARRAY:
        return new FixedLenByteArrayPlainValuesReader(descriptor.getTypeLength());
      default:
        throw new ParquetDecodingException("no plain reader for type " + descriptor.getType());
      }
    }
  },

  /**
   * Actually a combination of bit packing and run length encoding.
   * TODO: Should we rename this to be clearer?
   */
  RLE {
    @Override
    public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
      int bitWidth = BytesUtils.getWidthFromMaxInt(getMaxLevel(descriptor, valuesType));
      if (bitWidth == 0) {
        return new ZeroIntegerValuesReader();
      }
      return new RunLengthBitPackingHybridValuesReader(bitWidth);
    }
  },

  /**
   * This is no longer used, and has been replaced by {@link #RLE},
   * which is a combination of bit packing and RLE.
   */
  @Deprecated
  BIT_PACKED {
    @Override
    public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
      return new ByteBitPackingValuesReader(getMaxLevel(descriptor, valuesType), Packer.BIG_ENDIAN);
    }
  },

  PLAIN_DICTIONARY {
    @Override
    public ValuesReader getDictionaryBasedValuesReader(ColumnDescriptor descriptor, ValuesType valuesType, Dictionary dictionary) {
      switch (descriptor.getType()) {
      case BINARY:
      case INT64:
      case DOUBLE:
      case INT32:
      case FLOAT:
        return new DictionaryValuesReader(dictionary);
      default:
        throw new ParquetDecodingException("Dictionary encoding not supported for type: " + descriptor.getType());
      }
    }

    @Override
    public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage) throws IOException {
      switch (descriptor.getType()) {
      case BINARY:
        return new PlainValuesDictionary.PlainBinaryDictionary(dictionaryPage);
      case INT64:
        return new PlainValuesDictionary.PlainLongDictionary(dictionaryPage);
      case DOUBLE:
        return new PlainValuesDictionary.PlainDoubleDictionary(dictionaryPage);
      case INT32:
        return new PlainValuesDictionary.PlainIntegerDictionary(dictionaryPage);
      case FLOAT:
        return new PlainValuesDictionary.PlainFloatDictionary(dictionaryPage);
      default:
        throw new ParquetDecodingException("Dictionary encoding not supported for type: " + descriptor.getType());
      }
    }

    @Override
    public boolean usesDictionary() {
      return true;
    }
  },

  /**
   * Delta encoding for integers. This can be used for int columns and works best
   * on sorted data.
   */
  DELTA_BINARY_PACKED {
    @Override
    public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
      if (descriptor.getType() != PrimitiveType.PrimitiveTypeName.INT32) {
        throw new ParquetDecodingException("Encoding DELTA_BINARY_PACKED is only supported for type INT32");
      }
      return new DeltaBinaryPackingValuesReader();
    }
  },

  /**
   * Encoding for byte arrays that separates the length values from the data.
   * The lengths are encoded using DELTA_BINARY_PACKED.
   */
  DELTA_LENGTH_BYTE_ARRAY {
    @Override
    public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
      if (descriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
        throw new ParquetDecodingException("Encoding DELTA_LENGTH_BYTE_ARRAY is only supported for type BINARY");
      }
      return new DeltaLengthByteArrayValuesReader();
    }
  },

  /**
   * Incremental-encoded byte arrays. Prefix lengths are encoded using DELTA_BINARY_PACKED.
   * Suffixes are stored as delta length byte arrays.
   */
  DELTA_BYTE_ARRAY {
    @Override
    public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
      if (descriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
        throw new ParquetDecodingException("Encoding DELTA_BYTE_ARRAY is only supported for type BINARY");
      }
      return new DeltaByteArrayReader();
    }
  },

  /**
   * Dictionary encoding: the ids are encoded using the RLE encoding.
   */
  RLE_DICTIONARY;

  int getMaxLevel(ColumnDescriptor descriptor, ValuesType valuesType) {
    int maxLevel;
    switch (valuesType) {
    case REPETITION_LEVEL:
      maxLevel = descriptor.getMaxRepetitionLevel();
      break;
    case DEFINITION_LEVEL:
      maxLevel = descriptor.getMaxDefinitionLevel();
      break;
    case VALUES:
      if (descriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
        maxLevel = 1;
        break;
      }
      // intentional fall-through: only BOOLEAN values have a defined bit width here
    default:
      throw new ParquetDecodingException("Unsupported encoding for values: " + this);
    }
    return maxLevel;
  }

  /**
   * @return whether this encoding requires a dictionary
   */
  public boolean usesDictionary() {
    return false;
  }

  /**
   * Initializes a dictionary from a page.
   *
   * @param descriptor the column the dictionary belongs to
   * @param dictionaryPage the page containing the dictionary data
   * @return the corresponding dictionary
   * @throws IOException if the dictionary page cannot be read
   * @throws UnsupportedOperationException if the encoding is not dictionary based
   */
  public Dictionary initDictionary(ColumnDescriptor descriptor, DictionaryPage dictionaryPage) throws IOException {
    throw new UnsupportedOperationException(this.name() + " does not support dictionary");
  }

  /**
   * To read decoded values that don't require a dictionary.
   *
   * @param descriptor the column to read
   * @param valuesType the type of values
   * @return the proper values reader for the given column
   * @throws UnsupportedOperationException if the encoding is dictionary based
   */
  public ValuesReader getValuesReader(ColumnDescriptor descriptor, ValuesType valuesType) {
    throw new UnsupportedOperationException("Error decoding " + descriptor + ". " + this.name() + " is dictionary based");
  }

  /**
   * To read decoded values that require a dictionary.
   *
   * @param descriptor the column to read
   * @param valuesType the type of values
   * @param dictionary the dictionary
   * @return the proper values reader for the given column
   * @throws UnsupportedOperationException if the encoding is not dictionary based
   */
  public ValuesReader getDictionaryBasedValuesReader(ColumnDescriptor descriptor, ValuesType valuesType, Dictionary dictionary) {
    throw new UnsupportedOperationException(this.name() + " is not dictionary based");
  }
}
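/*
 * Usage sketch (not part of the adapted Parquet code): a minimal illustration of
 * how a page decoder would typically pick a reader from an Encoding value,
 * dispatching on usesDictionary() as the API above intends. It only calls the
 * methods declared in the enum; the helper class and method names, and the idea
 * that the caller already holds the ColumnDescriptor and DictionaryPage, are
 * assumptions made for illustration.
 */
class EncodingUsageExample {

  /**
   * Chooses a ValuesReader for a page of the given column.
   */
  static ValuesReader readerFor(Encoding encoding,
                                ColumnDescriptor descriptor,
                                ValuesType valuesType,
                                DictionaryPage dictionaryPage) throws IOException {
    if (encoding.usesDictionary()) {
      // Dictionary-based encodings (e.g. PLAIN_DICTIONARY) need the dictionary
      // page decoded first; the resulting Dictionary backs the values reader.
      Dictionary dictionary = encoding.initDictionary(descriptor, dictionaryPage);
      return encoding.getDictionaryBasedValuesReader(descriptor, valuesType, dictionary);
    }
    // Non-dictionary encodings (PLAIN, RLE, DELTA_*) read values directly.
    return encoding.getValuesReader(descriptor, valuesType);
  }
}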