/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive.parquet.reader;

import com.facebook.presto.hive.parquet.ParquetCorruptionException;
import com.facebook.presto.hive.parquet.ParquetDataPage;
import com.facebook.presto.hive.parquet.ParquetDataPageV1;
import com.facebook.presto.hive.parquet.ParquetDataPageV2;
import com.facebook.presto.hive.parquet.ParquetDictionaryPage;
import io.airlift.slice.Slice;
import parquet.column.Encoding;
import parquet.format.DataPageHeader;
import parquet.format.DataPageHeaderV2;
import parquet.format.DictionaryPageHeader;
import parquet.format.PageHeader;
import parquet.format.Util;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static com.facebook.presto.hive.parquet.ParquetTypeUtils.getParquetEncoding;
import static io.airlift.slice.Slices.wrappedBuffer;

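/**
 * A single column chunk of a Parquet row group, held fully in memory. Extends
 * ByteArrayInputStream so that the Thrift-encoded page headers can be
 * deserialized directly from the buffer, with {@code pos} tracking the read offset.
 *
 * <p>A minimal usage sketch, assuming the caller has already read the raw chunk
 * bytes and built the matching {@code ParquetColumnChunkDescriptor}:
 * <pre>{@code
 * ParquetColumnChunk chunk = new ParquetColumnChunk(descriptor, chunkBytes, 0);
 * ParquetPageReader pageReader = chunk.readAllPages();
 * }</pre>
 */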
public class ParquetColumnChunk
        extends ByteArrayInputStream
{
    private final ParquetColumnChunkDescriptor descriptor;

    public ParquetColumnChunk(
            ParquetColumnChunkDescriptor descriptor,
            byte[] data,
            int offset)
    {
        super(data);
        this.descriptor = descriptor;
        this.pos = offset;
    }

    public ParquetColumnChunkDescriptor getDescriptor()
    {
        return descriptor;
    }

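    // deserializes the next Thrift-encoded page header from the current stream position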
    protected PageHeader readPageHeader()
            throws IOException
    {
        return Util.readPageHeader(this);
    }

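    /**
     * Reads every page in the chunk, collecting data pages (and at most one
     * dictionary page) until the value count recorded in the column chunk
     * metadata has been consumed, then wraps them in a ParquetPageReader
     * along with the chunk's compression codec.
     */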
    public ParquetPageReader readAllPages()
            throws IOException
    {
        List<ParquetDataPage> pages = new ArrayList<>();
        ParquetDictionaryPage dictionaryPage = null;
        long valueCount = 0;
        while (valueCount < descriptor.getColumnChunkMetaData().getValueCount()) {
            PageHeader pageHeader = readPageHeader();
            int uncompressedPageSize = pageHeader.getUncompressed_page_size();
            int compressedPageSize = pageHeader.getCompressed_page_size();
            switch (pageHeader.type) {
                case DICTIONARY_PAGE:
                    // the Parquet format allows at most one dictionary page per column chunk
                    if (dictionaryPage != null) {
                        throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                    }
                    dictionaryPage = readDictionaryPage(pageHeader, uncompressedPageSize, compressedPageSize);
                    break;
                case DATA_PAGE:
                    valueCount += readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                    break;
                case DATA_PAGE_V2:
                    valueCount += readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                    break;
                default:
                    // unknown page type: skip its body and keep scanning for data pages
                    skip(compressedPageSize);
                    break;
            }
        }
        return new ParquetPageReader(descriptor.getColumnChunkMetaData().getCodec(), pages, dictionaryPage);
    }

    public int getPosition()
    {
        return pos;
    }

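    // returns a zero-copy view over the next size bytes of the buffer and advances the stream position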
    private Slice getSlice(int size)
    {
        Slice slice = wrappedBuffer(buf, pos, size);
        pos += size;
        return slice;
    }

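    // slices the dictionary page body out of the buffer still compressed; the codec
    // handed to ParquetPageReader is responsible for decompressing it later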
    private ParquetDictionaryPage readDictionaryPage(PageHeader pageHeader, int uncompressedPageSize, int compressedPageSize)
            throws IOException
    {
        DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
        return new ParquetDictionaryPage(
                getSlice(compressedPageSize),
                uncompressedPageSize,
                dicHeader.getNum_values(),
                getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name())));
    }

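    // reads a V1 data page, where repetition levels, definition levels, and values
    // are compressed together as a single block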
    private long readDataPageV1(PageHeader pageHeader,
            int uncompressedPageSize,
            int compressedPageSize,
            List<ParquetDataPage> pages)
            throws IOException
    {
        DataPageHeader dataHeaderV1 = pageHeader.getData_page_header();
        pages.add(new ParquetDataPageV1(
                getSlice(compressedPageSize),
                dataHeaderV1.getNum_values(),
                uncompressedPageSize,
                ParquetMetadataReader.readStats(
                        dataHeaderV1.getStatistics(),
                        descriptor.getColumnDescriptor().getType()),
                getParquetEncoding(Encoding.valueOf(dataHeaderV1.getRepetition_level_encoding().name())),
                getParquetEncoding(Encoding.valueOf(dataHeaderV1.getDefinition_level_encoding().name())),
                getParquetEncoding(Encoding.valueOf(dataHeaderV1.getEncoding().name()))));
        return dataHeaderV1.getNum_values();
    }

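    // reads a V2 data page. Unlike V1, the repetition and definition levels are stored
    // uncompressed at the front of the page body, so only the remaining values section
    // is subject to the chunk's codec (when is_compressed is set)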
    private long readDataPageV2(PageHeader pageHeader,
            int uncompressedPageSize,
            int compressedPageSize,
            List<ParquetDataPage> pages)
            throws IOException
    {
        DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2();
        // the values section is whatever remains after the level runs at the front of the page
        int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length();
        pages.add(new ParquetDataPageV2(
                dataHeaderV2.getNum_rows(),
                dataHeaderV2.getNum_nulls(),
                dataHeaderV2.getNum_values(),
                getSlice(dataHeaderV2.getRepetition_levels_byte_length()),
                getSlice(dataHeaderV2.getDefinition_levels_byte_length()),
                getParquetEncoding(Encoding.valueOf(dataHeaderV2.getEncoding().name())),
                getSlice(dataSize),
                uncompressedPageSize,
                ParquetMetadataReader.readStats(
                        dataHeaderV2.getStatistics(),
                        descriptor.getColumnDescriptor().getType()),
                dataHeaderV2.isIs_compressed()));
        return dataHeaderV2.getNum_values();
    }
}