/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.hive.parquet.reader; import com.facebook.presto.hive.parquet.ParquetDataPage; import com.facebook.presto.hive.parquet.ParquetDataPageV1; import com.facebook.presto.hive.parquet.ParquetDataPageV2; import com.facebook.presto.hive.parquet.ParquetDictionaryPage; import com.facebook.presto.hive.parquet.ParquetEncoding; import com.facebook.presto.hive.parquet.RichColumnDescriptor; import com.facebook.presto.hive.parquet.dictionary.ParquetDictionary; import com.facebook.presto.spi.PrestoException; import com.facebook.presto.spi.block.Block; import com.facebook.presto.spi.block.BlockBuilder; import com.facebook.presto.spi.block.BlockBuilderStatus; import com.facebook.presto.spi.type.DecimalType; import com.facebook.presto.spi.type.Type; import io.airlift.slice.Slice; import it.unimi.dsi.fastutil.ints.IntList; import parquet.bytes.BytesUtils; import parquet.column.ColumnDescriptor; import parquet.column.values.ValuesReader; import parquet.column.values.rle.RunLengthBitPackingHybridDecoder; import parquet.io.ParquetDecodingException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.Optional; import static com.facebook.presto.hive.parquet.ParquetTypeUtils.createDecimalType; import static com.facebook.presto.hive.parquet.ParquetValidationUtils.validateParquet; import static com.facebook.presto.hive.parquet.ParquetValuesType.DEFINITION_LEVEL; import static com.facebook.presto.hive.parquet.ParquetValuesType.REPETITION_LEVEL; import static com.facebook.presto.hive.parquet.ParquetValuesType.VALUES; import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; import static com.google.common.base.Preconditions.checkArgument; import static java.util.Objects.requireNonNull; public abstract class ParquetColumnReader { protected final ColumnDescriptor columnDescriptor; protected int definitionLevel; protected ValuesReader valuesReader; protected int nextBatchSize; private ParquetLevelReader repetitionReader; private ParquetLevelReader definitionReader; private int repetitionLevel; private long totalValueCount; private ParquetPageReader pageReader; private ParquetDictionary dictionary; private int currentValueCount; private ParquetDataPage page; private int remainingValueCountInPage; private int readOffset; protected abstract void readValue(BlockBuilder blockBuilder, Type type); protected abstract void skipValue(); public static ParquetColumnReader createReader(RichColumnDescriptor descriptor) { switch (descriptor.getType()) { case BOOLEAN: return new ParquetBooleanColumnReader(descriptor); case INT32: return createDecimalColumnReader(descriptor).orElse(new ParquetIntColumnReader(descriptor)); case INT64: return createDecimalColumnReader(descriptor).orElse(new ParquetLongColumnReader(descriptor)); case INT96: return new ParquetTimestampColumnReader(descriptor); case FLOAT: return new ParquetFloatColumnReader(descriptor); case DOUBLE: return new ParquetDoubleColumnReader(descriptor); case BINARY: return createDecimalColumnReader(descriptor).orElse(new ParquetBinaryColumnReader(descriptor)); case FIXED_LEN_BYTE_ARRAY: return createDecimalColumnReader(descriptor) .orElseThrow(() -> new PrestoException(NOT_SUPPORTED, "Parquet type FIXED_LEN_BYTE_ARRAY supported as DECIMAL; got " + descriptor.getPrimitiveType().getOriginalType())); default: throw new PrestoException(NOT_SUPPORTED, "Unsupported parquet type: " + descriptor.getType()); } } private static Optional<ParquetColumnReader> createDecimalColumnReader(RichColumnDescriptor descriptor) { Optional<Type> type = createDecimalType(descriptor); if (type.isPresent()) { DecimalType decimalType = (DecimalType) type.get(); return Optional.of(ParquetDecimalColumnReaderFactory.createReader(descriptor, decimalType.getPrecision(), decimalType.getScale())); } return Optional.empty(); } public ParquetColumnReader(ColumnDescriptor columnDescriptor) { this.columnDescriptor = requireNonNull(columnDescriptor, "columnDescriptor"); pageReader = null; } public ParquetPageReader getPageReader() { return pageReader; } public void setPageReader(ParquetPageReader pageReader) { this.pageReader = requireNonNull(pageReader, "pageReader"); ParquetDictionaryPage dictionaryPage = pageReader.readDictionaryPage(); if (dictionaryPage != null) { try { dictionary = dictionaryPage.getEncoding().initDictionary(columnDescriptor, dictionaryPage); } catch (IOException e) { throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, e); } } else { dictionary = null; } checkArgument(pageReader.getTotalValueCount() > 0, "page is empty"); totalValueCount = pageReader.getTotalValueCount(); } public void prepareNextRead(int batchSize) { readOffset = readOffset + nextBatchSize; nextBatchSize = batchSize; } public ColumnDescriptor getDescriptor() { return columnDescriptor; } public Block readPrimitive(Type type, IntList positions) throws IOException { seek(); BlockBuilder blockBuilder = type.createBlockBuilder(new BlockBuilderStatus(), nextBatchSize); int valueCount = 0; while (valueCount < nextBatchSize) { if (page == null) { readNextPage(); } int numValues = Math.min(remainingValueCountInPage, nextBatchSize - valueCount); readValues(blockBuilder, numValues, type, positions); valueCount += numValues; updatePosition(numValues); } checkArgument(valueCount == nextBatchSize, "valueCount " + valueCount + " not equals to batchSize " + nextBatchSize); readOffset = 0; nextBatchSize = 0; return blockBuilder.build(); } private void readValues(BlockBuilder blockBuilder, int numValues, Type type, IntList positions) { definitionLevel = definitionReader.readLevel(); repetitionLevel = repetitionReader.readLevel(); int valueCount = 0; for (int i = 0; i < numValues; i++) { do { readValue(blockBuilder, type); try { valueCount++; repetitionLevel = repetitionReader.readLevel(); if (repetitionLevel == 0) { positions.add(valueCount); valueCount = 0; if (i == numValues - 1) { return; } } definitionLevel = definitionReader.readLevel(); } catch (IllegalArgumentException expected) { // Reading past repetition stream, RunLengthBitPackingHybridDecoder throws IllegalArgumentException positions.add(valueCount); return; } } while (repetitionLevel != 0); } } private void skipValues(int offset) { definitionLevel = definitionReader.readLevel(); repetitionLevel = repetitionReader.readLevel(); for (int i = 0; i < offset; i++) { do { skipValue(); try { repetitionLevel = repetitionReader.readLevel(); if (i == offset - 1 && repetitionLevel == 0) { return; } definitionLevel = definitionReader.readLevel(); } catch (IllegalArgumentException expected) { // Reading past repetition stream, RunLengthBitPackingHybridDecoder throws IllegalArgumentException return; } } while (repetitionLevel != 0); } } private void seek() throws IOException { checkArgument(currentValueCount <= totalValueCount, "Already read all values in column chunk"); if (readOffset == 0) { return; } int valuePosition = 0; while (valuePosition < readOffset) { if (page == null) { readNextPage(); } int offset = Math.min(remainingValueCountInPage, readOffset - valuePosition); skipValues(offset); valuePosition = valuePosition + offset; updatePosition(offset); } checkArgument(valuePosition == readOffset, "valuePosition " + valuePosition + " must be equal to readOffset " + readOffset); } private void readNextPage() throws IOException { page = pageReader.readPage(); validateParquet(page != null, "Not enough values to read in column chunk"); remainingValueCountInPage = page.getValueCount(); if (page instanceof ParquetDataPageV1) { valuesReader = readPageV1((ParquetDataPageV1) page); } else { valuesReader = readPageV2((ParquetDataPageV2) page); } } private void updatePosition(int numValues) { if (numValues == remainingValueCountInPage) { page = null; valuesReader = null; } remainingValueCountInPage = remainingValueCountInPage - numValues; currentValueCount += numValues; } private ValuesReader readPageV1(ParquetDataPageV1 page) { ValuesReader rlReader = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL); ValuesReader dlReader = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL); repetitionReader = new ParquetLevelValuesReader(rlReader); definitionReader = new ParquetLevelValuesReader(dlReader); try { byte[] bytes = page.getSlice().getBytes(); rlReader.initFromPage(page.getValueCount(), bytes, 0); int offset = rlReader.getNextOffset(); dlReader.initFromPage(page.getValueCount(), bytes, offset); offset = dlReader.getNextOffset(); return initDataReader(page.getValueEncoding(), bytes, offset, page.getValueCount()); } catch (IOException e) { throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e); } } private ValuesReader readPageV2(ParquetDataPageV2 page) { repetitionReader = buildLevelRLEReader(columnDescriptor.getMaxRepetitionLevel(), page.getRepetitionLevels()); definitionReader = buildLevelRLEReader(columnDescriptor.getMaxDefinitionLevel(), page.getDefinitionLevels()); return initDataReader(page.getDataEncoding(), page.getSlice().getBytes(), 0, page.getValueCount()); } private ParquetLevelReader buildLevelRLEReader(int maxLevel, Slice slice) { if (maxLevel == 0) { return new ParquetLevelNullReader(); } return new ParquetLevelRLEReader(new RunLengthBitPackingHybridDecoder(BytesUtils.getWidthFromMaxInt(maxLevel), new ByteArrayInputStream(slice.getBytes()))); } private ValuesReader initDataReader(ParquetEncoding dataEncoding, byte[] bytes, int offset, int valueCount) { ValuesReader valuesReader; if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException("Dictionary is missing for Page"); } valuesReader = dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary); } else { valuesReader = dataEncoding.getValuesReader(columnDescriptor, VALUES); } try { valuesReader.initFromPage(valueCount, bytes, offset); return valuesReader; } catch (IOException e) { throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e); } } }