/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.store.parquet.columnreaders;

import com.google.common.base.Stopwatch;
import io.netty.buffer.DrillBuf;
import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.exec.ExecConstants;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.DirectDecompressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.codec.SnappyCodec;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.drill.exec.util.filereader.DirectBufInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.PageType;
import org.apache.parquet.format.Util;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.xerial.snappy.Snappy;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import static org.apache.parquet.column.Encoding.valueOf;

/**
 * The AsyncPageReader reads one page of data at a time asynchronously from the provided InputStream. The
 * first request to the page reader creates a Future task (AsyncPageReaderTask) and submits it to the
 * scan thread pool. The result of the Future task (a page) is put into a (blocking) queue and the scan
 * thread starts processing the data as soon as the Future task is complete.
 * This is a simple producer-consumer queue: the AsyncPageReaderTask is the producer and the ParquetScan is
 * the consumer.
 * The AsyncPageReaderTask submits another Future task for reading the next page as soon as it is done,
 * as long as the results queue is not full. Until the queue is full, therefore, the scan thread pool keeps the
 * disk as busy as possible.
 * If the disk is slower than the processing, the queue never fills up after the processing of the
 * pages begins. In this case, the next disk read begins immediately after the previous read completes
 * and the disk is never idle. The query in this case is effectively bounded by the disk.
 * If, however, the processing is slower than the disk (which can happen with SSDs, data cached by the
 * FileSystem, or processing that is necessarily complex and slow), the queue fills up.
 * Once the queue is full, the AsyncPageReaderTask does not submit any new Future tasks. The next Future
 * task is submitted by the *processing* thread as soon as it pulls a page out of the queue. (Note that the
 * invariant here is that there is space for at least one more page in the queue before the Future read task
 * is submitted to the pool.) This sequence is important: not following it can lead to deadlocks, with producer
 * threads blocked trying to put data into a queue that is full while consumer threads are blocked trying
 * to read from a queue that has no data.
 * The first request to the page reader can be either to load a dictionary page or a data page; this leads
 * to the rather odd looking code in the constructor, since the parent PageReader calls
 * loadDictionaryIfExists in the constructor.
 * The Future tasks created are kept in a non-blocking queue and each Future object is checked for any
 * exceptions that might have occurred during its execution. The queue of Futures is also used to cancel
 * any pending Futures at close (this may happen as a result of a cancel).
 */
class AsyncPageReader extends PageReader {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AsyncPageReader.class);

  private ExecutorService threadPool;
  private long queueSize;
  private LinkedBlockingQueue<ReadStatus> pageQueue;
  private ConcurrentLinkedQueue<Future<Void>> asyncPageRead;
  private long totalPageValuesRead = 0;

  AsyncPageReader(ColumnReader<?> parentStatus, FileSystem fs, Path path,
      ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
    super(parentStatus, fs, path, columnChunkMetaData);
    if (threadPool == null && asyncPageRead == null) {
      threadPool = parentColumnReader.parentReader.getOperatorContext().getScanExecutor();
      queueSize = parentColumnReader.parentReader.readQueueSize;
      pageQueue = new LinkedBlockingQueue<>((int) queueSize);
      asyncPageRead = new ConcurrentLinkedQueue<>();
      asyncPageRead.offer(threadPool.submit(new AsyncPageReaderTask(debugName, pageQueue)));
    }
  }

  @Override
  protected void loadDictionaryIfExists(final ColumnReader<?> parentStatus,
      final ColumnChunkMetaData columnChunkMetaData, final DirectBufInputStream f) throws UserException {
    if (columnChunkMetaData.getDictionaryPageOffset() > 0) {
      try {
        assert (columnChunkMetaData.getDictionaryPageOffset() >= dataReader.getPos());
        dataReader.skip(columnChunkMetaData.getDictionaryPageOffset() - dataReader.getPos());
      } catch (IOException e) {
        handleAndThrowException(e, "Error Reading dictionary page.");
      }
      // The parent constructor may call this method before the thread pool is set.
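      // If so, initialize the page queue, the scan thread pool, and the queue of outstanding read
      // futures here, using the queue size configured by the PARQUET_PAGEREADER_QUEUE_SIZE option,
      // and submit the first read task so the dictionary page is fetched asynchronously as well.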
      if (threadPool == null && asyncPageRead == null) {
        threadPool = parentColumnReader.parentReader.getOperatorContext().getScanExecutor();
        queueSize = parentColumnReader.parentReader.getFragmentContext().getOptions()
            .getOption(ExecConstants.PARQUET_PAGEREADER_QUEUE_SIZE).num_val;
        pageQueue = new LinkedBlockingQueue<>((int) queueSize);
        asyncPageRead = new ConcurrentLinkedQueue<>();
        asyncPageRead.offer(threadPool.submit(new AsyncPageReaderTask(debugName, pageQueue)));
      }
    }
  }

  private DrillBuf getDecompressedPageData(ReadStatus readStatus) {
    DrillBuf data;
    boolean isDictionary = false;
    synchronized (this) {
      data = readStatus.getPageData();
      readStatus.setPageData(null);
      isDictionary = readStatus.isDictionaryPage;
    }
    if (parentColumnReader.columnChunkMetaData.getCodec() != CompressionCodecName.UNCOMPRESSED) {
      DrillBuf compressedData = data;
      data = decompress(readStatus.getPageHeader(), compressedData);
      synchronized (this) {
        readStatus.setPageData(null);
      }
      compressedData.release();
    } else {
      if (isDictionary) {
        stats.totalDictPageReadBytes.addAndGet(readStatus.bytesRead);
      } else {
        stats.totalDataPageReadBytes.addAndGet(readStatus.bytesRead);
      }
    }
    return data;
  }

  // Read and decode the dictionary page and its header
  private void readDictionaryPage(final ColumnReader<?> parentStatus) throws UserException {
    try {
      Stopwatch timer = Stopwatch.createStarted();
      ReadStatus readStatus = null;
      synchronized (pageQueue) {
        boolean pageQueueFull = pageQueue.remainingCapacity() == 0;
        asyncPageRead.poll().get(); // get the result of execution
        readStatus = pageQueue.take(); // get the data if no exception has been thrown
        assert (readStatus.pageData != null);
        // If the queue was full before we took a page out, then there would
        // have been no new read tasks scheduled. In that case, schedule a new read.
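        // (Scheduling the next read only after a slot has been freed preserves the invariant
        // described in the class javadoc: a read task is submitted only when the queue has room
        // for its result, so the producer never blocks on a full queue.)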
        if (pageQueueFull) {
          asyncPageRead.offer(threadPool.submit(new AsyncPageReaderTask(debugName, pageQueue)));
        }
      }
      long timeBlocked = timer.elapsed(TimeUnit.NANOSECONDS);
      stats.timeDiskScanWait.addAndGet(timeBlocked);
      stats.timeDiskScan.addAndGet(readStatus.getDiskScanTime());
      stats.numDictPageLoads.incrementAndGet();
      stats.timeDictPageLoads.addAndGet(timeBlocked + readStatus.getDiskScanTime());
      readDictionaryPageData(readStatus, parentStatus);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    } catch (Exception e) {
      handleAndThrowException(e, "Error reading dictionary page.");
    }
  }

  // Read and decode the dictionary data
  private void readDictionaryPageData(final ReadStatus readStatus, final ColumnReader<?> parentStatus)
      throws UserException {
    try {
      pageHeader = readStatus.getPageHeader();
      int uncompressedSize = pageHeader.getUncompressed_page_size();
      final DrillBuf dictionaryData = getDecompressedPageData(readStatus);
      Stopwatch timer = Stopwatch.createStarted();
      allocatedDictionaryBuffers.add(dictionaryData);
      DictionaryPage page = new DictionaryPage(asBytesInput(dictionaryData, 0, uncompressedSize),
          pageHeader.uncompressed_page_size, pageHeader.dictionary_page_header.num_values,
          valueOf(pageHeader.dictionary_page_header.encoding.name()));
      this.dictionary = page.getEncoding().initDictionary(parentStatus.columnDescriptor, page);
      long timeToDecode = timer.elapsed(TimeUnit.NANOSECONDS);
      stats.timeDictPageDecode.addAndGet(timeToDecode);
    } catch (Exception e) {
      handleAndThrowException(e, "Error decoding dictionary page.");
    }
  }

  private void handleAndThrowException(Exception e, String msg) throws UserException {
    UserException ex = UserException.dataReadError(e).message(msg)
        .pushContext("Row Group Start: ", this.parentColumnReader.columnChunkMetaData.getStartingPos())
        .pushContext("Column: ", this.parentColumnReader.schemaElement.getName())
        .pushContext("File: ", this.fileName).build(logger);
    throw ex;
  }

  private DrillBuf decompress(PageHeader pageHeader, DrillBuf compressedData) {
    DrillBuf pageDataBuf = null;
    Stopwatch timer = Stopwatch.createUnstarted();
    long timeToRead;
    int compressedSize = pageHeader.getCompressed_page_size();
    int uncompressedSize = pageHeader.getUncompressed_page_size();
    pageDataBuf = allocateTemporaryBuffer(uncompressedSize);
    try {
      timer.start();
      CompressionCodecName codecName = parentColumnReader.columnChunkMetaData.getCodec();
      ByteBuffer input = compressedData.nioBuffer(0, compressedSize);
      ByteBuffer output = pageDataBuf.nioBuffer(0, uncompressedSize);
      DecompressionHelper decompressionHelper = new DecompressionHelper(codecName);
      decompressionHelper.decompress(input, compressedSize, output, uncompressedSize);
      pageDataBuf.writerIndex(uncompressedSize);
      timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
      this.updateStats(pageHeader, "Decompress", 0, timeToRead, compressedSize, uncompressedSize);
    } catch (IOException e) {
      handleAndThrowException(e, "Error decompressing data.");
    }
    return pageDataBuf;
  }

  @Override
  protected void nextInternal() throws IOException {
    ReadStatus readStatus = null;
    String name = parentColumnReader.columnChunkMetaData.toString();
    try {
      Stopwatch timer = Stopwatch.createStarted();
      parentColumnReader.parentReader.getOperatorContext().getStats().startWait();
      asyncPageRead.poll().get(); // get the result of execution
      synchronized (pageQueue) {
        boolean pageQueueFull = pageQueue.remainingCapacity() == 0;
        readStatus = pageQueue.take(); // get the data if no exception has been thrown
        if (readStatus.pageData == null || readStatus ==
            ReadStatus.EMPTY) {
          throw new DrillRuntimeException("Unexpected end of data");
        }
        // If the queue was full before we took a page out, then there would
        // have been no new read tasks scheduled. In that case, schedule a new read.
        if (pageQueueFull) {
          asyncPageRead.offer(threadPool.submit(new AsyncPageReaderTask(debugName, pageQueue)));
        }
      }
      long timeBlocked = timer.elapsed(TimeUnit.NANOSECONDS);
      parentColumnReader.parentReader.getOperatorContext().getStats().stopWait();
      stats.timeDiskScanWait.addAndGet(timeBlocked);
      stats.timeDiskScan.addAndGet(readStatus.getDiskScanTime());
      if (readStatus.isDictionaryPage) {
        stats.numDictPageLoads.incrementAndGet();
        stats.timeDictPageLoads.addAndGet(timeBlocked + readStatus.getDiskScanTime());
      } else {
        stats.numDataPageLoads.incrementAndGet();
        stats.timeDataPageLoads.addAndGet(timeBlocked + readStatus.getDiskScanTime());
      }
      pageHeader = readStatus.getPageHeader();
      // TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one
      // I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary
      do {
        if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
          readDictionaryPageData(readStatus, parentColumnReader);
          asyncPageRead.poll().get(); // get the result of execution
          synchronized (pageQueue) {
            boolean pageQueueFull = pageQueue.remainingCapacity() == 0;
            readStatus = pageQueue.take(); // get the data if no exception has been thrown
            if (readStatus.pageData == null || readStatus == ReadStatus.EMPTY) {
              break;
            }
            // If the queue was full before we took a page out, then there would
            // have been no new read tasks scheduled. In that case, schedule a new read.
            if (pageQueueFull) {
              asyncPageRead.offer(threadPool.submit(new AsyncPageReaderTask(debugName, pageQueue)));
            }
          }
          assert (readStatus.pageData != null);
          pageHeader = readStatus.getPageHeader();
        }
      } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);
      pageHeader = readStatus.getPageHeader();
      pageData = getDecompressedPageData(readStatus);
      assert (pageData != null);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
    } catch (Exception e) {
      handleAndThrowException(e, "Error reading page data");
    }
  }

  @Override
  public void clear() {
    while (asyncPageRead != null && !asyncPageRead.isEmpty()) {
      try {
        Future<Void> f = asyncPageRead.poll();
        if (!f.isDone() && !f.isCancelled()) {
          f.cancel(true);
        } else {
          f.get(1, TimeUnit.MILLISECONDS);
        }
      } catch (Exception e) {
        // Do nothing.
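        // Any exception here just means the read task already completed, failed, or was cancelled;
        // any pages it produced are released below when the page queue is drained.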
      }
    }

    // Empty the page queue
    String name = parentColumnReader.columnChunkMetaData.toString();
    ReadStatus r;
    while (!pageQueue.isEmpty()) {
      r = null;
      try {
        r = pageQueue.take();
        if (r == ReadStatus.EMPTY) {
          break;
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      } finally {
        if (r != null && r.pageData != null) {
          r.pageData.release();
        }
      }
    }
    super.clear();
  }

  public static class ReadStatus {
    private PageHeader pageHeader;
    private DrillBuf pageData;
    private boolean isDictionaryPage = false;
    private long bytesRead = 0;
    private long valuesRead = 0;
    private long diskScanTime = 0;

    public static final ReadStatus EMPTY = new ReadStatus();

    public synchronized PageHeader getPageHeader() {
      return pageHeader;
    }

    public synchronized void setPageHeader(PageHeader pageHeader) {
      this.pageHeader = pageHeader;
    }

    public synchronized DrillBuf getPageData() {
      return pageData;
    }

    public synchronized void setPageData(DrillBuf pageData) {
      this.pageData = pageData;
    }

    public synchronized boolean isDictionaryPage() {
      return isDictionaryPage;
    }

    public synchronized void setIsDictionaryPage(boolean isDictionaryPage) {
      this.isDictionaryPage = isDictionaryPage;
    }

    public synchronized long getBytesRead() {
      return bytesRead;
    }

    public synchronized void setBytesRead(long bytesRead) {
      this.bytesRead = bytesRead;
    }

    public synchronized long getValuesRead() {
      return valuesRead;
    }

    public synchronized void setValuesRead(long valuesRead) {
      this.valuesRead = valuesRead;
    }

    public synchronized long getDiskScanTime() {
      return diskScanTime;
    }

    public synchronized void setDiskScanTime(long diskScanTime) {
      this.diskScanTime = diskScanTime;
    }
  }

  private class AsyncPageReaderTask implements Callable<Void> {
    private final AsyncPageReader parent = AsyncPageReader.this;
    private final LinkedBlockingQueue<ReadStatus> queue;
    private final String name;

    public AsyncPageReaderTask(String name, LinkedBlockingQueue<ReadStatus> queue) {
      this.name = name;
      this.queue = queue;
    }

    @Override
    public Void call() throws IOException {
      ReadStatus readStatus = new ReadStatus();
      long bytesRead = 0;
      long valuesRead = 0;
      final long totalValuesRead = parent.totalPageValuesRead;
      Stopwatch timer = Stopwatch.createStarted();
      final long totalValuesCount = parent.parentColumnReader.columnChunkMetaData.getValueCount();
      // If we are done, just put a marker object in the queue and return.
      logger.trace("[{}]: Total Values COUNT {} Total Values READ {} ", name, totalValuesCount, totalValuesRead);
      if (totalValuesRead >= totalValuesCount) {
        try {
          queue.put(ReadStatus.EMPTY);
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
          // Do nothing.
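          // The EMPTY marker may not have been enqueued in this case; the reader's clear() drains
          // and releases anything left in the page queue at close.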
        }
        return null;
      }
      DrillBuf pageData = null;
      timer.reset();
      try {
        long s = parent.dataReader.getPos();
        PageHeader pageHeader = Util.readPageHeader(parent.dataReader);
        //long e = parent.dataReader.getPos();
        //if (logger.isTraceEnabled()) {
        //  logger.trace("[{}]: Read Page Header : ReadPos = {} : Bytes Read = {} ", name, s, e - s);
        //}
        int compressedSize = pageHeader.getCompressed_page_size();
        s = parent.dataReader.getPos();
        pageData = parent.dataReader.getNext(compressedSize);
        bytesRead = compressedSize;
        //e = parent.dataReader.getPos();
        //if (logger.isTraceEnabled()) {
        //  DrillBuf bufStart = pageData.slice(0, compressedSize > 100 ? 100 : compressedSize);
        //  int endOffset = compressedSize > 100 ? compressedSize - 100 : 0;
        //  DrillBuf bufEnd = pageData.slice(endOffset, compressedSize - endOffset);
        //  logger
        //      .trace("[{}]: Read Page Data : ReadPos = {} : Bytes Read = {} : Buf Start = {} : Buf End = {} ",
        //          name, s, e - s, ByteBufUtil.hexDump(bufStart), ByteBufUtil.hexDump(bufEnd));
        //}
        synchronized (parent) {
          if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
            readStatus.setIsDictionaryPage(true);
            valuesRead += pageHeader.getDictionary_page_header().getNum_values();
          } else {
            valuesRead += pageHeader.getData_page_header().getNum_values();
            parent.totalPageValuesRead += valuesRead;
          }
          long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
          readStatus.setPageHeader(pageHeader);
          readStatus.setPageData(pageData);
          readStatus.setBytesRead(bytesRead);
          readStatus.setValuesRead(valuesRead);
          readStatus.setDiskScanTime(timeToRead);
          assert (totalValuesRead <= totalValuesCount);
        }
        synchronized (queue) {
          queue.put(readStatus);
          // If the queue is not full, schedule another read task immediately. If it is, the consumer
          // will schedule a new read task as soon as it removes a page from the queue.
          if (queue.remainingCapacity() > 0) {
            asyncPageRead.offer(parent.threadPool.submit(new AsyncPageReaderTask(debugName, queue)));
          }
        }
      } catch (InterruptedException e) {
        if (pageData != null) {
          pageData.release();
        }
        Thread.currentThread().interrupt();
      } catch (Exception e) {
        if (pageData != null) {
          pageData.release();
        }
        parent.handleAndThrowException(e, "Exception occurred while reading from disk.");
      }
      return null;
    }
  }

  private class DecompressionHelper {
    final CompressionCodecName codecName;

    public DecompressionHelper(CompressionCodecName codecName) {
      this.codecName = codecName;
    }

    public void decompress(ByteBuffer input, int compressedSize, ByteBuffer output, int uncompressedSize)
        throws IOException {
      // The GZip codec is not thread safe, so we go off and do our own thing.
      // The Hadoop interface does not support ByteBuffer, so we incur some
      // expensive copying.
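      // The GZIP branch below prefers the ByteBuffer-based DirectDecompressor when the codec
      // provides one and falls back to copying through heap byte arrays otherwise; Snappy is
      // decompressed directly, and all remaining codecs go through the Parquet CodecFactory.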
      if (codecName == CompressionCodecName.GZIP) {
        GzipCodec codec = new GzipCodec();
        // DirectDecompressor: @see https://hadoop.apache.org/docs/r2.7.2/api/org/apache/hadoop/io/compress/DirectDecompressor.html
        DirectDecompressor directDecompressor = codec.createDirectDecompressor();
        if (directDecompressor != null) {
          logger.debug("Using GZIP direct decompressor.");
          directDecompressor.decompress(input, output);
        } else {
          logger.debug("Using GZIP (in)direct decompressor.");
          Decompressor decompressor = codec.createDecompressor();
          decompressor.reset();
          byte[] inputBytes = new byte[compressedSize];
          input.position(0);
          input.get(inputBytes);
          decompressor.setInput(inputBytes, 0, inputBytes.length);
          byte[] outputBytes = new byte[uncompressedSize];
          decompressor.decompress(outputBytes, 0, uncompressedSize);
          output.clear();
          output.put(outputBytes);
        }
      } else if (codecName == CompressionCodecName.SNAPPY) {
        // For Snappy, call the Snappy decompressor directly instead
        // of going through the DirectDecompressor class.
        // The Snappy codec is itself thread safe, while going through the DirectDecompressor path
        // seems to have concurrency issues.
        output.clear();
        int size = Snappy.uncompress(input, output);
        output.limit(size);
      } else {
        CodecFactory.BytesDecompressor decompressor =
            codecFactory.getDecompressor(parentColumnReader.columnChunkMetaData.getCodec());
        decompressor.decompress(input, compressedSize, output, uncompressedSize);
      }
    }
  }
}