/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.parquet.columnreaders;
import com.google.common.base.Stopwatch;
import io.netty.buffer.ByteBufUtil;
import org.apache.drill.exec.util.filereader.BufferedDirectBufInputStream;
import io.netty.buffer.ByteBuf;
import io.netty.buffer.DrillBuf;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.exec.memory.BufferAllocator;
import org.apache.drill.exec.store.parquet.ParquetFormatPlugin;
import org.apache.drill.exec.store.parquet.ParquetReaderStats;
import org.apache.drill.exec.util.filereader.DirectBufInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.ValuesType;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.column.values.dictionary.DictionaryValuesReader;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.PrimitiveType;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import static org.apache.parquet.column.Encoding.valueOf;
/**
 * Reads the pages of a single Parquet column chunk and keeps track of the read position within the
 * current page, which is needed for variable length columns where values and lengths are intermixed.
 */
class PageReader {
static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(PageReader.class);
public static final ParquetMetadataConverter METADATA_CONVERTER = ParquetFormatPlugin.parquetMetadataConverter;
protected final ColumnReader<?> parentColumnReader;
protected final DirectBufInputStream dataReader;
//buffer to store bytes of current page
protected DrillBuf pageData;
// for variable length data we need to keep track of our current position in the page data
// as the values and lengths are intermixed, making random access to the length data impossible
long readyToReadPosInBytes;
// read position in the current page, stored in the ByteBuf in ParquetRecordReader called bufferWithAllData
long readPosInBytes;
// storage space for extra bits at the end of a page if they did not line up with a byte boundary
// prevents the need to keep the entire last page, as these extra bits need to be carried over to the next batch
//byte extraBits;
// used for columns where the number of values that will fit in a vector is unknown
// currently used for variable length
// TODO - reuse this when compressed vectors are added, where fixed length values will take up a
// variable amount of space
// For example: if nulls are stored without extra space left in the data vector
// (this is currently simplifying random access to the data during processing, but increases the size of the vectors)
int valuesReadyToRead;
// the number of values read out of the last page
int valuesRead;
int byteLength;
//int rowGroupIndex;
ValuesReader definitionLevels;
ValuesReader repetitionLevels;
ValuesReader valueReader;
ValuesReader dictionaryLengthDeterminingReader;
ValuesReader dictionaryValueReader;
Dictionary dictionary;
PageHeader pageHeader = null;
int currentPageCount = -1;
protected FSDataInputStream inputStream;
// These need to be held throughout reading of the entire column chunk
List<ByteBuf> allocatedDictionaryBuffers;
protected final CodecFactory codecFactory;
protected final String fileName;
protected final ParquetReaderStats stats;
private final boolean useBufferedReader;
private final int scanBufferSize;
private final boolean useFadvise;
private final boolean enforceTotalSize;
protected final String debugName;
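/**
 * Open the column chunk for reading and, if a dictionary page is present, read and decode it.
 *
 * @param parentStatus the column reader that owns this page reader
 * @param fs the file system used to open the Parquet file
 * @param path the path of the Parquet file
 * @param columnChunkMetaData metadata describing the column chunk to read
 * @throws ExecutionSetupException if the file cannot be opened or the metadata cannot be read
 */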
PageReader(ColumnReader<?> parentStatus, FileSystem fs, Path path, ColumnChunkMetaData columnChunkMetaData)
throws ExecutionSetupException {
this.parentColumnReader = parentStatus;
allocatedDictionaryBuffers = new ArrayList<>();
codecFactory = parentColumnReader.parentReader.getCodecFactory();
this.stats = parentColumnReader.parentReader.parquetReaderStats;
this.fileName = path.toString();
debugName = new StringBuilder()
.append(this.parentColumnReader.parentReader.getFragmentContext().getFragIdString())
.append(":")
.append(this.parentColumnReader.parentReader.getOperatorContext().getStats().getId())
.append(this.parentColumnReader.columnChunkMetaData.toString())
.toString();
try {
inputStream = fs.open(path);
BufferAllocator allocator = parentColumnReader.parentReader.getOperatorContext().getAllocator();
columnChunkMetaData.getTotalUncompressedSize();
useBufferedReader = parentColumnReader.parentReader.useBufferedReader;
scanBufferSize = parentColumnReader.parentReader.bufferedReadSize;
useFadvise = parentColumnReader.parentReader.useFadvise;
enforceTotalSize = parentColumnReader.parentReader.enforceTotalSize;
if (useBufferedReader) {
this.dataReader = new BufferedDirectBufInputStream(inputStream, allocator, path.getName(),
columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), scanBufferSize,
enforceTotalSize, useFadvise);
} else {
this.dataReader = new DirectBufInputStream(inputStream, allocator, path.getName(),
columnChunkMetaData.getStartingPos(), columnChunkMetaData.getTotalSize(), enforceTotalSize,
useFadvise);
}
dataReader.init();
loadDictionaryIfExists(parentStatus, columnChunkMetaData, dataReader);
} catch (IOException e) {
throw new ExecutionSetupException("Error opening or reading metadata for parquet file at location: "
+ path.getName(), e);
}
}
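/**
 * If the column chunk declares a dictionary page, position the stream at its offset, read the page
 * header, and decode the dictionary so that dictionary encoded data pages can be read later.
 */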
protected void loadDictionaryIfExists(final ColumnReader<?> parentStatus,
final ColumnChunkMetaData columnChunkMetaData, final DirectBufInputStream f) throws IOException {
Stopwatch timer = Stopwatch.createUnstarted();
if (columnChunkMetaData.getDictionaryPageOffset() > 0) {
dataReader.skip(columnChunkMetaData.getDictionaryPageOffset() - dataReader.getPos());
long start = dataReader.getPos();
timer.start();
final PageHeader pageHeader = Util.readPageHeader(f);
long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
long pageHeaderBytes = dataReader.getPos() - start;
this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
assert pageHeader.type == PageType.DICTIONARY_PAGE;
readDictionaryPage(pageHeader, parentStatus);
}
}
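/**
 * Read a dictionary page, decompressing it if necessary, and initialize the dictionary used to
 * decode dictionary encoded values in this column chunk.
 */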
private void readDictionaryPage(final PageHeader pageHeader,
final ColumnReader<?> parentStatus) throws IOException {
int compressedSize = pageHeader.getCompressed_page_size();
int uncompressedSize = pageHeader.getUncompressed_page_size();
final DrillBuf dictionaryData = readPage(pageHeader, compressedSize, uncompressedSize);
allocatedDictionaryBuffers.add(dictionaryData);
DictionaryPage page = new DictionaryPage(
asBytesInput(dictionaryData, 0, uncompressedSize),
pageHeader.uncompressed_page_size,
pageHeader.dictionary_page_header.num_values,
valueOf(pageHeader.dictionary_page_header.encoding.name()));
this.dictionary = page.getEncoding().initDictionary(parentStatus.columnDescriptor, page);
}
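/**
 * Read the body of a page into a DrillBuf. For uncompressed column chunks the bytes are read
 * directly from the stream; otherwise the compressed bytes are read into a temporary buffer and
 * decompressed into a newly allocated buffer. The caller is responsible for releasing the returned
 * buffer.
 */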
private DrillBuf readPage(PageHeader pageHeader, int compressedSize, int uncompressedSize) throws IOException {
DrillBuf pageDataBuf = null;
Stopwatch timer = Stopwatch.createUnstarted();
long timeToRead;
long start = dataReader.getPos();
if (parentColumnReader.columnChunkMetaData.getCodec() == CompressionCodecName.UNCOMPRESSED) {
timer.start();
pageDataBuf = dataReader.getNext(compressedSize);
if (logger.isTraceEnabled()) {
logger.trace("PageReaderTask==> Col: {} readPos: {} Uncompressed_size: {} pageData: {}",
parentColumnReader.columnChunkMetaData.toString(), dataReader.getPos(),
pageHeader.getUncompressed_page_size(), ByteBufUtil.hexDump(pageDataBuf));
}
timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
this.updateStats(pageHeader, "Page Read", start, timeToRead, compressedSize, uncompressedSize);
} else {
DrillBuf compressedData = null;
pageDataBuf = allocateTemporaryBuffer(uncompressedSize);
try {
timer.start();
compressedData = dataReader.getNext(compressedSize);
timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
timer.reset();
this.updateStats(pageHeader, "Page Read", start, timeToRead, compressedSize, compressedSize);
start = dataReader.getPos();
timer.start();
codecFactory.getDecompressor(parentColumnReader.columnChunkMetaData.getCodec())
.decompress(compressedData.nioBuffer(0, compressedSize), compressedSize,
pageDataBuf.nioBuffer(0, uncompressedSize), uncompressedSize);
pageDataBuf.writerIndex(uncompressedSize);
timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
this.updateStats(pageHeader, "Decompress", start, timeToRead, compressedSize, uncompressedSize);
} finally {
if (compressedData != null) {
compressedData.release();
}
}
}
return pageDataBuf;
}
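/**
 * Expose a slice of a DrillBuf as a Parquet {@link BytesInput}.
 */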
public static BytesInput asBytesInput(DrillBuf buf, int offset, int length) throws IOException {
return BytesInput.from(buf.nioBuffer(offset, length), 0, length);
}
/**
 * Read the page header and the uncompressed page data for the next data page, decoding any
 * dictionary page encountered along the way.
 */
protected void nextInternal() throws IOException {
Stopwatch timer = Stopwatch.createUnstarted();
// Read page headers until a data page is found; any dictionary page encountered on the way is
// read and decoded immediately.
// TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one
// I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary
do {
long start = dataReader.getPos();
timer.start();
pageHeader = Util.readPageHeader(dataReader);
long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
long pageHeaderBytes = dataReader.getPos() - start;
this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
this.parentColumnReader.parentReader.hadoopPath,
this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
timer.reset();
if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
readDictionaryPage(pageHeader, parentColumnReader);
}
} while (pageHeader.getType() == PageType.DICTIONARY_PAGE);
int compressedSize = pageHeader.getCompressed_page_size();
int uncompressedSize = pageHeader.getUncompressed_page_size();
pageData = readPage(pageHeader, compressedSize, uncompressedSize);
}
/**
 * Grab the next page.
 *
 * @return true if another page was present and has been loaded
 * @throws IOException if an error occurs while reading the page
 */
public boolean next() throws IOException {
Stopwatch timer = Stopwatch.createUnstarted();
currentPageCount = -1;
valuesRead = 0;
valuesReadyToRead = 0;
// TODO - the metadata for total size appears to be incorrect for impala generated files, need to find cause
// and submit a bug report
long totalValueCount = parentColumnReader.columnChunkMetaData.getValueCount();
if (parentColumnReader.totalValuesRead >= totalValueCount) {
return false;
}
clearBuffers();
nextInternal();
if (pageData == null || pageHeader == null) {
//TODO: Is this an error condition or a normal condition??
return false;
}
timer.start();
currentPageCount = pageHeader.data_page_header.num_values;
final Encoding rlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.repetition_level_encoding);
final Encoding dlEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.definition_level_encoding);
final Encoding valueEncoding = METADATA_CONVERTER.getEncoding(pageHeader.data_page_header.encoding);
byteLength = pageHeader.uncompressed_page_size;
final ByteBuffer pageDataBuffer = pageData.nioBuffer(0, pageData.capacity());
readPosInBytes = 0;
if (parentColumnReader.getColumnDescriptor().getMaxRepetitionLevel() > 0) {
repetitionLevels = rlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.REPETITION_LEVEL);
repetitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
// The first repetition level is always 0, and each subsequent 0 marks the start of a new record;
// since the levels are a one-way stream of integers, the length of a record is not known until the
// next 0 is hit. Reading this leading 0 here lets the first value be processed the same way as all
// the rest: effectively we 'read' the non-existent value in front of the first record, giving
// direct access to the first list of repetition levels.
readPosInBytes = repetitionLevels.getNextOffset();
repetitionLevels.readInteger();
}
if (parentColumnReader.columnDescriptor.getMaxDefinitionLevel() != 0) {
parentColumnReader.currDefLevel = -1;
definitionLevels = dlEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.DEFINITION_LEVEL);
definitionLevels.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
readPosInBytes = definitionLevels.getNextOffset();
if (!valueEncoding.usesDictionary()) {
valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
}
}
if (parentColumnReader.columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.BOOLEAN) {
valueReader = valueEncoding.getValuesReader(parentColumnReader.columnDescriptor, ValuesType.VALUES);
valueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
}
if (valueEncoding.usesDictionary()) {
// initialize two dictionary readers: one is used to determine the length of each value, the second
// actually copies the values out into the vectors
dictionaryLengthDeterminingReader = new DictionaryValuesReader(dictionary);
dictionaryLengthDeterminingReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
dictionaryValueReader = new DictionaryValuesReader(dictionary);
dictionaryValueReader.initFromPage(currentPageCount, pageDataBuffer, (int) readPosInBytes);
parentColumnReader.usingDictionary = true;
} else {
parentColumnReader.usingDictionary = false;
}
// readPosInBytes is used for actually reading the values after we determine how many will fit in the vector
// readyToReadPosInBytes serves a similar purpose for the vector types where we must count up the values that will
// fit one record at a time, such as for variable length data. Both operations must start in the same location after the
// definition and repetition level data which is stored alongside the page data itself
readyToReadPosInBytes = readPosInBytes;
long timeDecode = timer.elapsed(TimeUnit.NANOSECONDS);
stats.numDataPagesDecoded.incrementAndGet();
stats.timeDataPageDecode.addAndGet(timeDecode);
return true;
}
/**
 * Allocate a temporary buffer. The caller is responsible for releasing it; the reader does not
 * track or release these buffers.
 */
protected DrillBuf allocateTemporaryBuffer(int size) {
return parentColumnReader.parentReader.getOperatorContext().getAllocator().buffer(size);
}
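/**
 * @return true if a page has been read and its value count is known
 */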
protected boolean hasPage() {
return currentPageCount != -1;
}
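/**
 * Log a trace record for a page operation and accumulate the corresponding read or decompress
 * statistics, separately for data pages and dictionary pages. Equal input and output byte counts
 * are recorded as a plain read; differing counts are recorded as a decompression.
 */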
protected void updateStats(PageHeader pageHeader, String op, long start, long time, long bytesin, long bytesout) {
String pageType = "Data Page";
if (pageHeader.type == PageType.DICTIONARY_PAGE) {
pageType = "Dictionary Page";
}
logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", op, pageType.toString(),
this.parentColumnReader.parentReader.hadoopPath,
this.parentColumnReader.columnDescriptor.toString(), start, bytesin, bytesout, time);
if (pageHeader.type != PageType.DICTIONARY_PAGE) {
if (bytesin == bytesout) {
this.stats.timeDataPageLoads.addAndGet(time);
this.stats.numDataPageLoads.incrementAndGet();
this.stats.totalDataPageReadBytes.addAndGet(bytesin);
} else {
this.stats.timeDataPagesDecompressed.addAndGet(time);
this.stats.numDataPagesDecompressed.incrementAndGet();
this.stats.totalDataDecompressedBytes.addAndGet(bytesin);
}
} else {
if (bytesin == bytesout) {
this.stats.timeDictPageLoads.addAndGet(time);
this.stats.numDictPageLoads.incrementAndGet();
this.stats.totalDictPageReadBytes.addAndGet(bytesin);
} else {
this.stats.timeDictPagesDecompressed.addAndGet(time);
this.stats.numDictPagesDecompressed.incrementAndGet();
this.stats.totalDictDecompressedBytes.addAndGet(bytesin);
}
}
}
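/**
 * Release the buffer holding the current page data, if any.
 */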
protected void clearBuffers() {
if (pageData != null) {
pageData.release();
pageData = null;
}
}
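/**
 * Release all dictionary page buffers allocated while reading this column chunk.
 */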
protected void clearDictionaryBuffers() {
for (ByteBuf b : allocatedDictionaryBuffers) {
b.release();
}
allocatedDictionaryBuffers.clear();
}
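/**
 * Close the underlying input stream (owned by the data reader) and release all page and dictionary
 * buffers held by this reader.
 */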
public void clear() {
try {
// data reader also owns the input stream and will close it.
this.dataReader.close();
} catch (IOException e) {
// Swallow the exception; failing to close an input stream is not fatal here
}
// Free all memory, including fixed length types. (Data is being copied for all types not just var length types)
clearBuffers();
clearDictionaryBuffers();
}
}