/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.io.orc.encoded; import java.io.IOException; import java.lang.reflect.Field; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.IdentityHashMap; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.common.Pool; import org.apache.hadoop.hive.common.Pool.PoolObjectHelper; import org.apache.hadoop.hive.common.io.DataCache; import org.apache.hadoop.hive.common.io.DiskRange; import org.apache.hadoop.hive.common.io.DiskRangeList; import org.apache.hadoop.hive.common.io.DataCache.BooleanRef; import org.apache.hadoop.hive.common.io.DataCache.DiskRangeListFactory; import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper; import org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData; import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer; import org.apache.orc.CompressionCodec; import org.apache.orc.DataReader; import org.apache.orc.OrcConf; import org.apache.orc.impl.OutStream; import org.apache.orc.impl.RecordReaderUtils; import org.apache.orc.impl.StreamName; import org.apache.orc.StripeInformation; 
import org.apache.orc.impl.BufferChunk; import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch; import org.apache.hadoop.hive.ql.io.orc.encoded.Reader.PoolFactory; import org.apache.orc.OrcProto; import com.google.common.annotations.VisibleForTesting; import sun.misc.Cleaner; /** * Encoded reader implementation. * * Note about refcounts on cache blocks. * When we get or put blocks into cache, they are "locked" (refcount++), so they cannot be evicted. * We send the MemoryBuffer-s to caller as part of RG data; one MemoryBuffer can be used for many * RGs (e.g. a dictionary, or multiple RGs per block). Also, we want to "unlock" MemoryBuffer-s in * cache as soon as possible. This is how we deal with this: * * For dictionary case: * 1) There's a separate refcount on the ColumnStreamData object we send to the caller. In the * dictionary case, it's increased per RG, and callers don't release MBs if the containing * ColumnStreamData is not ready to be released. This is done because dictionary can have many * buffers; decrefing all of them for all RGs is more expensive; plus, decrefing in cache * may be more expensive due to cache policy/etc. * * For non-dictionary case: * 1) All the ColumnStreamData-s for normal data always have refcount 1; we return them once. * 2) At all times, every MB in such cases has +1 refcount for each time we return it as part of CSD. * 3) When caller is done, it therefore decrefs SB to 0, and decrefs all the MBs involved. * 4) Additionally, we keep an extra +1 refcount "for the fetching thread". That way, if we return * the MB to caller, and he decrefs it, the MB can't be evicted and will be there if we want to * reuse it for some other RG. * 5) As we read (we always read RGs in order and forward in each stream; we assume they are stored * physically in order in the file; AND that CBs are not shared between streams), we note which * MBs cannot possibly be reused anymore (next RG starts in the next CB). 
We decref the refcount * from (4) in such case. * 6) Given that RG end boundaries in ORC are estimates, we can request data from cache and then * not use it; thus, at the end we go thru all the MBs, and release those not released by (5). */ class EncodedReaderImpl implements EncodedReader { public static final Logger LOG = LoggerFactory.getLogger(EncodedReaderImpl.class); private static Field cleanerField; static { try { // TODO: To make it work for JDK9 use CleanerUtil from https://issues.apache.org/jira/browse/HADOOP-12760 final Class<?> dbClazz = Class.forName("java.nio.DirectByteBuffer"); cleanerField = dbClazz.getDeclaredField("cleaner"); cleanerField.setAccessible(true); } catch (Throwable t) { cleanerField = null; } } private static final Object POOLS_CREATION_LOCK = new Object(); private static Pools POOLS; private static class Pools { Pool<CacheChunk> tccPool; Pool<ProcCacheChunk> pccPool; Pool<OrcEncodedColumnBatch> ecbPool; Pool<ColumnStreamData> csdPool; } private final static DiskRangeListFactory CC_FACTORY = new DiskRangeListFactory() { @Override public DiskRangeList createCacheChunk(MemoryBuffer buffer, long offset, long end) { CacheChunk tcc = POOLS.tccPool.take(); tcc.init(buffer, offset, end); return tcc; } }; private final Object fileKey; private final DataReader dataReader; private boolean isDataReaderOpen = false; private final CompressionCodec codec; private final int bufferSize; private final List<OrcProto.Type> types; private final long rowIndexStride; private final DataCache cacheWrapper; private boolean isTracingEnabled; public EncodedReaderImpl(Object fileKey, List<OrcProto.Type> types, CompressionCodec codec, int bufferSize, long strideRate, DataCache cacheWrapper, DataReader dataReader, PoolFactory pf) throws IOException { this.fileKey = fileKey; this.codec = codec; this.types = types; this.bufferSize = bufferSize; this.rowIndexStride = strideRate; this.cacheWrapper = cacheWrapper; this.dataReader = dataReader; if (POOLS != null) 
return; if (pf == null) { pf = new NoopPoolFactory(); } Pools pools = createPools(pf); synchronized (POOLS_CREATION_LOCK) { if (POOLS != null) return; POOLS = pools; } } /** Helper context for each column being read */ private static final class ColumnReadContext { public ColumnReadContext(int colIx, OrcProto.ColumnEncoding encoding, OrcProto.RowIndex rowIndex, int colRgIx) { this.encoding = encoding; this.rowIndex = rowIndex; this.colIx = colIx; this.includedIx = colRgIx; streamCount = 0; } public static final int MAX_STREAMS = OrcProto.Stream.Kind.ROW_INDEX_VALUE; /** The number of streams that are part of this column. */ int streamCount = 0; final StreamContext[] streams = new StreamContext[MAX_STREAMS]; /** Column encoding. */ OrcProto.ColumnEncoding encoding; /** Column rowindex. */ OrcProto.RowIndex rowIndex; /** Column index in the file. */ int colIx; /** Column index in the included columns only (for RG masks). */ int includedIx; public void addStream(long offset, OrcProto.Stream stream, int indexIx) { streams[streamCount++] = new StreamContext(stream, offset, indexIx); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append(" column_index: ").append(colIx); sb.append(" included_index: ").append(includedIx); sb.append(" encoding: ").append(encoding); sb.append(" stream_count: ").append(streamCount); int i = 0; for (StreamContext sc : streams) { if (sc != null) { sb.append(" stream_").append(i).append(":").append(sc.toString()); } i++; } return sb.toString(); } } private static final class StreamContext { public StreamContext(OrcProto.Stream stream, long streamOffset, int streamIndexOffset) { this.kind = stream.getKind(); this.length = stream.getLength(); this.offset = streamOffset; this.streamIndexOffset = streamIndexOffset; } /** Offsets of each stream in the column. 
*/
    public long offset, length;
    /** Index position of this stream inside a row index entry (from getIndexPosition). */
    public int streamIndexOffset;
    public OrcProto.Stream.Kind kind;
    /** Iterators for the buffers; used to maintain position in per-rg reading. */
    DiskRangeList bufferIter;
    /** Saved stripe-level stream, to reuse for each RG (e.g. dictionaries). */
    ColumnStreamData stripeLevelStream;

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(" kind: ").append(kind);
      sb.append(" offset: ").append(offset);
      sb.append(" length: ").append(length);
      sb.append(" index_offset: ").append(streamIndexOffset);
      return sb.toString();
    }
  }

  /**
   * Reads the data streams of the included columns of one stripe and passes the encoded data to
   * the consumer, one OrcEncodedColumnBatch per selected RG.
   *
   * Steps: (1) derive the disk ranges to read from the stream list; (2) get them from cache when
   * fileKey is not null and read whatever is still missing via dataReader; (3) when there is no
   * codec, pre-read every stream with preReadUncompressedStream; (4) for each RG and column,
   * build the stream data with readEncodedStream and hand the batch to the consumer; finally
   * release the remaining refcounts and buffers.
   *
   * @param stripeIx Stripe index; stored in the produced batches.
   * @param stripe Stripe information; its offset and row count are used.
   * @param indexes Row indexes, indexed by column.
   * @param encodings Column encodings, indexed by column.
   * @param streamList Streams of the stripe; assumed sorted by column (see comment below).
   * @param included Per-column inclusion flags; column 0 never gets a read context.
   * @param colRgs RG masks indexed by position among included columns; a null mask means all RGs.
   * @param consumer Receiver of the produced batches.
   * @throws IOException On read errors; other exceptions from stream reading are wrapped.
   */
  @Override
  public void readEncodedColumns(int stripeIx, StripeInformation stripe,
      OrcProto.RowIndex[] indexes, List<OrcProto.ColumnEncoding> encodings,
      List<OrcProto.Stream> streamList, boolean[] included, boolean[][] colRgs,
      Consumer<OrcEncodedColumnBatch> consumer) throws IOException {
    // Note: for now we don't have to setError here, caller will setError if we throw.
    // We are also not supposed to call setDone, since we are only part of the operation.
    long stripeOffset = stripe.getOffset();
    // 1. Figure out what we have to read.
    long offset = 0; // Stream offset in relation to the stripe.
    // 1.1. Figure out which columns have a present stream
    boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
    if (isTracingEnabled) {
      LOG.trace("The following columns have PRESENT streams: " + arrayToString(hasNull));
    }

    // We assume stream list is sorted by column and that non-data
    // streams do not interleave data streams for the same column.
    // 1.2. With that in mind, determine disk ranges to read/get from cache (not by stream).
    ColumnReadContext[] colCtxs = new ColumnReadContext[included.length];
    int colRgIx = -1;
    // Don't create context for the 0-s column.
    for (int i = 1; i < included.length; ++i) {
      if (!included[i]) continue;
      colCtxs[i] = new ColumnReadContext(i, encodings.get(i), indexes[i], ++colRgIx);
      if (isTracingEnabled) {
        LOG.trace("Creating context: " + colCtxs[i].toString());
      }
    }
    boolean isCompressed = (codec != null);
    CreateHelper listToRead = new CreateHelper();
    boolean hasIndexOnlyCols = false;
    boolean[] includedRgs = null; // Will always be the same for all cols at the moment.
    for (OrcProto.Stream stream : streamList) {
      long length = stream.getLength();
      int colIx = stream.getColumn();
      OrcProto.Stream.Kind streamKind = stream.getKind();
      if (!included[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) {
        // We have a stream for included column, but in future it might have no data streams.
        // It's more like "has at least one column included that has an index stream".
        hasIndexOnlyCols = hasIndexOnlyCols || included[colIx];
        if (isTracingEnabled) {
          LOG.trace("Skipping stream for column " + colIx + ": "
              + streamKind + " at " + offset + ", " + length);
        }
        // Skipped streams still occupy space in the stripe; keep the running offset correct.
        offset += length;
        continue;
      }
      ColumnReadContext ctx = colCtxs[colIx];
      assert ctx != null;
      includedRgs = colRgs[ctx.includedIx];
      int indexIx = RecordReaderUtils.getIndexPosition(ctx.encoding.getKind(),
          types.get(colIx).getKind(), streamKind, isCompressed, hasNull[colIx]);
      ctx.addStream(offset, stream, indexIx);
      if (isTracingEnabled) {
        LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset
            + ", " + length + ", index position " + indexIx);
      }
      // No RG mask, or a dictionary stream: the whole stream is needed.
      if (includedRgs == null || RecordReaderUtils.isDictionary(streamKind, encodings.get(colIx))) {
        RecordReaderUtils.addEntireStreamToRanges(offset, length, listToRead, true);
        if (isTracingEnabled) {
          LOG.trace("Will read whole stream " + streamKind + "; added to " + listToRead.getTail());
        }
      } else {
        RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRgs,
            codec != null, indexes[colIx], encodings.get(colIx), types.get(colIx),
            bufferSize, hasNull[colIx], offset, length, listToRead, true);
      }
      offset += length;
    }

    boolean hasFileId = this.fileKey != null;
    if (listToRead.get() == null) {
      // No data to read for this stripe. Check if we have some included index-only columns.
      // TODO: there may be a bug here. Could there be partial RG filtering on index-only column?
      if (hasIndexOnlyCols && (includedRgs == null)) {
        OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
        ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, included.length);
        consumer.consumeData(ecb);
      } else {
        LOG.warn("Nothing to read for stripe [" + stripe + "]");
      }
      return;
    }

    // 2. Now, read all of the ranges from cache or disk.
    DiskRangeList.MutateHelper toRead = new DiskRangeList.MutateHelper(listToRead.get());
    if (/*isTracingEnabled && */LOG.isInfoEnabled()) {
      LOG.info("Resulting disk ranges to read (file " + fileKey + "): "
          + RecordReaderUtils.stringifyDiskRanges(toRead.next));
    }
    BooleanRef isAllInCache = new BooleanRef();
    // Without a file key the cache lookup is skipped and everything is read via dataReader.
    if (hasFileId) {
      cacheWrapper.getFileData(fileKey, toRead.next, stripeOffset, CC_FACTORY, isAllInCache);
      if (/*isTracingEnabled && */LOG.isInfoEnabled()) {
        LOG.info("Disk ranges after cache (found everything " + isAllInCache.value + "; file "
            + fileKey + ", base offset " + stripeOffset + "): "
            + RecordReaderUtils.stringifyDiskRanges(toRead.next));
      }
    }

    // TODO: the memory release could be optimized - we could release original buffers after we
    //       are fully done with each original buffer from disk. For now release all at the end;
    //       it doesn't increase the total amount of memory we hold, just the duration a bit.
    //       This is much simpler - we can just remember original ranges after reading them, and
    //       release them at the end. In a few cases where it's easy to determine that a buffer
    //       can be freed in advance, we remove it from the map.
    IdentityHashMap<ByteBuffer, Boolean> toRelease = null;
    if (!isAllInCache.value) {
      if (!isDataReaderOpen) {
        this.dataReader.open();
        isDataReaderOpen = true;
      }
      dataReader.readFileData(toRead.next, stripeOffset, cacheWrapper.getAllocator().isDirectAlloc());
      // Remember the buffers of every BufferChunk now in the list, to release them later.
      toRelease = new IdentityHashMap<>();
      DiskRangeList drl = toRead.next;
      while (drl != null) {
        if (drl instanceof BufferChunk) {
          toRelease.put(drl.getData(), true);
        }
        drl = drl.next;
      }
    }

    // 3. For uncompressed case, we need some special processing before read.
    //    Basically, we are trying to create artificial, consistent ranges to cache, as there are
    //    no CBs in an uncompressed file. At the end of this processing, the list would contain
    //    either cache buffers, or buffers allocated by us and not cached (if we are only reading
    //    parts of the data for some ranges and don't want to cache it). Both are represented by
    //    CacheChunks, so the list is just CacheChunk-s from that point on.
    DiskRangeList iter = toRead.next; // Keep "toRead" list for future use, don't extract().
    if (codec == null) {
      for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
        ColumnReadContext ctx = colCtxs[colIx];
        if (ctx == null) continue; // This column is not included.
        for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
          StreamContext sctx = ctx.streams[streamIx];
          DiskRangeList newIter = preReadUncompressedStream(
              stripeOffset, iter, sctx.offset, sctx.offset + sctx.length);
          // A null result leaves the iterator where it was.
          if (newIter != null) {
            iter = newIter;
          }
        }
      }
      // Release buffers as we are done with all the streams... also see toRelease comment.
      // With uncompressed streams, we know we are done earlier.
      if (toRelease != null) {
        releaseBuffers(toRelease.keySet(), true);
        toRelease = null;
      }
      if (isTracingEnabled) {
        LOG.trace("Disk ranges after pre-read (file " + fileKey + ", base offset "
            + stripeOffset + "): " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
      }
      iter = toRead.next; // Reset the iter to start.
    }
    // 4. Finally, decompress data, map per RG, and return to caller.
    // We go by RG and not by column because that is how data is processed.
    int rgCount = (int)Math.ceil((double)stripe.getNumberOfRows() / rowIndexStride);
    for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
      boolean isLastRg = rgIx == rgCount - 1;
      // Create the batch we will use to return data for this RG.
      // NOTE(review): when the RG ends up not selected, this batch is neither consumed nor
      // returned to the pool in this method - confirm that is intended.
      OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
      ecb.init(fileKey, stripeIx, rgIx, included.length);
      boolean isRGSelected = true;
      for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
        ColumnReadContext ctx = colCtxs[colIx];
        if (ctx == null) continue; // This column is not included.
        if (isTracingEnabled) {
          LOG.trace("ctx: {} rgIx: {} isLastRg: {} rgCount: {}", ctx, rgIx, isLastRg, rgCount);
        }
        // TODO: simplify this now that high-level cache has been removed. Same RGs for all cols.
        if (colRgs[ctx.includedIx] != null && !colRgs[ctx.includedIx][rgIx]) {
          // RG x col filtered.
          isRGSelected = false;
          if (isTracingEnabled) {
            LOG.trace("colIxMod: {} rgIx: {} colRgs[{}]: {} colRgs[{}][{}]: {}", ctx.includedIx, rgIx,
                ctx.includedIx, Arrays.toString(colRgs[ctx.includedIx]), ctx.includedIx, rgIx,
                colRgs[ctx.includedIx][rgIx]);
          }
          continue;
        }
        OrcProto.RowIndexEntry index = ctx.rowIndex.getEntry(rgIx),
            nextIndex = isLastRg ? null : ctx.rowIndex.getEntry(rgIx + 1);
        ecb.initOrcColumn(ctx.colIx);
        for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
          StreamContext sctx = ctx.streams[streamIx];
          ColumnStreamData cb = null;
          try {
            if (RecordReaderUtils.isDictionary(sctx.kind, ctx.encoding)) {
              // This stream is for entire stripe and needed for every RG; uncompress once and reuse.
              if (isTracingEnabled) {
                LOG.trace("Getting stripe-level stream [" + sctx.kind + ", " + ctx.encoding + "] for"
                    + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length);
              }
              if (sctx.stripeLevelStream == null) {
                sctx.stripeLevelStream = POOLS.csdPool.take();
                // We will be using this for each RG while also sending RGs to processing.
                // To avoid buffers being unlocked, run refcount one ahead; so each RG
                // processing will decref once, and the last one will unlock the buffers.
                sctx.stripeLevelStream.incRef();
                // For stripe-level streams we don't need the extra refcount on the block.
                // See class comment about refcounts.
                long unlockUntilCOffset = sctx.offset + sctx.length;
                DiskRangeList lastCached = readEncodedStream(stripeOffset, iter,
                    sctx.offset, sctx.offset + sctx.length, sctx.stripeLevelStream,
                    unlockUntilCOffset, sctx.offset, toRelease);
                if (lastCached != null) {
                  iter = lastCached;
                }
              }
              // One reference per RG that receives this stripe-level stream.
              sctx.stripeLevelStream.incRef();
              cb = sctx.stripeLevelStream;
            } else {
              // This stream can be separated by RG using index. Let's do that.
              // Offset to where this RG begins.
              long cOffset = sctx.offset + index.getPositions(sctx.streamIndexOffset);
              // Offset relative to the beginning of the stream of where this RG ends.
              long nextCOffsetRel = isLastRg ? sctx.length
                  : nextIndex.getPositions(sctx.streamIndexOffset);
              // Offset before which this RG is guaranteed to end. Can only be estimated.
              // We estimate the same way for compressed and uncompressed for now.
              long endCOffset = sctx.offset + RecordReaderUtils.estimateRgEndOffset(
                  isCompressed, isLastRg, nextCOffsetRel, sctx.length, bufferSize);
              // As we read, we can unlock initial refcounts for the buffers that end before
              // the data that we need for this RG.
              long unlockUntilCOffset = sctx.offset + nextCOffsetRel;
              cb = createRgColumnStreamData(
                  rgIx, isLastRg, ctx.colIx, sctx, cOffset, endCOffset, isCompressed);
              // First read of this stream starts from the shared iterator; later reads resume
              // from the position saved for the stream.
              boolean isStartOfStream = sctx.bufferIter == null;
              DiskRangeList lastCached = readEncodedStream(stripeOffset,
                  (isStartOfStream ? iter : sctx.bufferIter), cOffset, endCOffset, cb,
                  unlockUntilCOffset, sctx.offset, toRelease);
              if (lastCached != null) {
                sctx.bufferIter = iter = lastCached;
              }
            }
            ecb.setStreamData(ctx.colIx, sctx.kind.getNumber(), cb);
          } catch (Exception ex) {
            DiskRangeList drl = toRead == null ? null : toRead.next;
            LOG.error("Error getting stream [" + sctx.kind + ", " + ctx.encoding + "] for"
                + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", "
                + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
            // Keep IOExceptions as they are; wrap everything else.
            throw (ex instanceof IOException) ? (IOException)ex : new IOException(ex);
          }
        }
      }
      if (isRGSelected) {
        consumer.consumeData(ecb);
      }
    }

    if (isTracingEnabled) {
      LOG.trace("Disk ranges after preparing all the data "
          + RecordReaderUtils.stringifyDiskRanges(toRead.next));
    }

    // Release the unreleased buffers. See class comment about refcounts.
    for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
      ColumnReadContext ctx = colCtxs[colIx];
      if (ctx == null) continue; // This column is not included.
      for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
        StreamContext sctx = ctx.streams[streamIx];
        if (sctx == null || sctx.stripeLevelStream == null) continue;
        // Drop the "one ahead" reference; unlock buffers only if that was the last one.
        if (0 != sctx.stripeLevelStream.decRef()) continue;
        for (MemoryBuffer buf : sctx.stripeLevelStream.getCacheBuffers()) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Unlocking {} at the end of processing", buf);
          }
          cacheWrapper.releaseBuffer(buf);
        }
      }
    }

    releaseInitialRefcounts(toRead.next);
    // Release buffers as we are done with all the streams... also see toRelease comment.
if (toRelease != null) { releaseBuffers(toRelease.keySet(), true); } releaseCacheChunksIntoObjectPool(toRead.next); } private static String arrayToString(boolean[] a) { StringBuilder b = new StringBuilder(); b.append('['); for (int i = 0; i < a.length; ++i) { b.append(a[i] ? "1" : "0"); } b.append(']'); return b.toString(); } private ColumnStreamData createRgColumnStreamData(int rgIx, boolean isLastRg, int colIx, StreamContext sctx, long cOffset, long endCOffset, boolean isCompressed) { ColumnStreamData cb = POOLS.csdPool.take(); cb.incRef(); if (isTracingEnabled) { LOG.trace("Getting data for column "+ colIx + " " + (isLastRg ? "last " : "") + "RG " + rgIx + " stream " + sctx.kind + " at " + sctx.offset + ", " + sctx.length + " index position " + sctx.streamIndexOffset + ": " + (isCompressed ? "" : "un") + "compressed [" + cOffset + ", " + endCOffset + ")"); } return cb; } private void releaseInitialRefcounts(DiskRangeList current) { while (current != null) { DiskRangeList toFree = current; current = current.next; if (!(toFree instanceof CacheChunk)) continue; CacheChunk cc = (CacheChunk)toFree; if (cc.getBuffer() == null) continue; MemoryBuffer buffer = cc.getBuffer(); cacheWrapper.releaseBuffer(buffer); cc.setBuffer(null); } } @Override public void setTracing(boolean isEnabled) { this.isTracingEnabled = isEnabled; } @Override public void close() throws IOException { dataReader.close(); } /** * Fake cache chunk used for uncompressed data. Used in preRead for uncompressed files. * Makes assumptions about preRead code; for example, we add chunks here when they are * already in the linked list, without unlinking. So, we record the start position in the * original list, and then, when someone adds the next element, we merely increase the number * of elements one has to traverse from that position to get the whole list. 
*/ private static class UncompressedCacheChunk extends CacheChunk { private BufferChunk chunk; private int count; public UncompressedCacheChunk(BufferChunk bc) { super(); init(null, bc.getOffset(), bc.getEnd()); chunk = bc; count = 1; } public void addChunk(BufferChunk bc) { assert bc.getOffset() == this.getEnd(); this.end = bc.getEnd(); ++count; } public BufferChunk getChunk() { return chunk; } public int getCount() { return count; } @Override public void handleCacheCollision(DataCache cacheWrapper, MemoryBuffer replacementBuffer, List<MemoryBuffer> cacheBuffers) { assert cacheBuffers == null; // This is done at pre-read stage where there's nothing special w/refcounts. Just release. cacheWrapper.getAllocator().deallocate(getBuffer()); // Replace the buffer in our big range list, as well as in current results. this.setBuffer(replacementBuffer); } public void clear() { this.chunk = null; this.count = -1; } } /** * CacheChunk that is pre-created for new cache data; initially, it contains an original disk * buffer and an unallocated MemoryBuffer object. Before we expose it, the MB is allocated, * the data is decompressed, and original compressed data is discarded. The chunk lives on in * the DiskRange list created for the request, and everyone treats it like regular CacheChunk. 
*/ private static class ProcCacheChunk extends CacheChunk { public void init(long cbStartOffset, long cbEndOffset, boolean isCompressed, ByteBuffer originalData, MemoryBuffer targetBuffer, int originalCbIndex) { super.init(targetBuffer, cbStartOffset, cbEndOffset); this.isOriginalDataCompressed = isCompressed; this.originalData = originalData; this.originalCbIndex = originalCbIndex; } @Override public void reset() { super.reset(); this.originalData = null; } @Override public String toString() { return super.toString() + ", original is set " + (this.originalData != null) + ", buffer was replaced " + (originalCbIndex == -1); } @Override public void handleCacheCollision(DataCache cacheWrapper, MemoryBuffer replacementBuffer, List<MemoryBuffer> cacheBuffers) { assert originalCbIndex >= 0; // Had the put succeeded for our new buffer, it would have refcount of 2 - 1 from put, // and 1 from notifyReused call above. "Old" buffer now has the 1 from put; new buffer // is not in cache. cacheWrapper.getAllocator().deallocate(getBuffer()); cacheWrapper.reuseBuffer(replacementBuffer); // Replace the buffer in our big range list, as well as in current results. this.buffer = replacementBuffer; cacheBuffers.set(originalCbIndex, replacementBuffer); originalCbIndex = -1; // This can only happen once at decompress time. } /** Original data that will be turned into encoded cache data in this.buffer and reset. */ private ByteBuffer originalData = null; /** Whether originalData is compressed. */ private boolean isOriginalDataCompressed; /** Index of the MemoryBuffer corresponding to this object inside the result list. If we * hit a cache collision, we will replace this memory buffer with the one from cache at * this index, without having to look for it. */ private int originalCbIndex; } /** * Uncompresses part of the stream. RGs can overlap, so we cannot just go and decompress * and remove what we have returned. We will keep iterator as a "hint" point. 
* @param baseOffset Absolute offset of boundaries and ranges relative to file, for cache keys.
   * @param start Ordered ranges containing file data. Helpful if they point close to cOffset.
   * @param cOffset Start offset to decompress.
   * @param endCOffset End offset to decompress; estimate, partial CBs will be ignored.
   * @param csd Stream data, to add the results.
   * @param unlockUntilCOffset The offset until which the buffers can be unlocked in cache, as
   *                           they will not be used in future calls (see the class comment in
   *                           EncodedReaderImpl about refcounts).
   * @param streamOffset Offset of the stream; passed on to the prepare methods and to
   *                     ponderReleaseInitialRefcount.
   * @param toRelease Buffers to release at the end of the read; only passed on to the
   *                  compressed-read preparation. Presumably may be null - verify with callers.
   * @return Last buffer cached during decompression. Cache buffers are never removed from
   *         the master list, so they are safe to keep as iterators for various streams.
   *         Null when cOffset equals endCOffset.
   * @throws IOException On failure; non-IO exceptions from range preparation are wrapped.
   */
  public DiskRangeList readEncodedStream(long baseOffset, DiskRangeList start, long cOffset,
      long endCOffset, ColumnStreamData csd, long unlockUntilCOffset, long streamOffset,
      IdentityHashMap<ByteBuffer, Boolean> toRelease) throws IOException {
    // Start from an empty result list, creating it on first use.
    if (csd.getCacheBuffers() == null) {
      csd.setCacheBuffers(new ArrayList<MemoryBuffer>());
    } else {
      csd.getCacheBuffers().clear();
    }
    if (cOffset == endCOffset) return null;
    boolean isCompressed = codec != null;
    // The work lists below are only needed (and only non-null) for compressed data.
    List<ProcCacheChunk> toDecompress = null;
    List<IncompleteCb> badEstimates = null;
    List<ByteBuffer> toReleaseCopies = null;
    if (isCompressed) {
      toReleaseCopies = new ArrayList<>();
      toDecompress = new ArrayList<>();
      badEstimates = new ArrayList<>();
    }

    // 1. Find our bearings in the stream. Normally, iter will already point either to where we
    //    want to be, or just before. However, RGs can overlap due to encoding, so we may have
    //    to return to a previous block.
    DiskRangeList current = findExactPosition(start, cOffset);
    if (isTracingEnabled) {
      LOG.trace("Starting read for [" + cOffset + "," + endCOffset + ") at " + current);
    }

    CacheChunk lastUncompressed = null;

    // 2. Go thru the blocks; add stuff to results and prepare the decompression work (see below).
    try {
      lastUncompressed = isCompressed ?
          prepareRangesForCompressedRead(cOffset, endCOffset, streamOffset,
              unlockUntilCOffset, current, csd, toRelease, toReleaseCopies,
              toDecompress, badEstimates)
        : prepareRangesForUncompressedRead(
            cOffset, endCOffset, streamOffset, unlockUntilCOffset, current, csd);
    } catch (Exception ex) {
      LOG.error("Failed " + (isCompressed ? "" : "un") + "compressed read; cOffset " + cOffset
          + ", endCOffset " + endCOffset + ", streamOffset " + streamOffset
          + ", unlockUntilCOffset " + unlockUntilCOffset + "; ranges passed in "
          + RecordReaderUtils.stringifyDiskRanges(start) + "; ranges passed to prepare "
          + RecordReaderUtils.stringifyDiskRanges(current)); // Don't log exception here.
      throw (ex instanceof IOException) ? (IOException)ex : new IOException(ex);
    }

    // 2.5. Remember the bad estimates for future reference.
    if (badEstimates != null && !badEstimates.isEmpty()) {
      // Relies on the fact that cache does not actually store these.
      DiskRange[] cacheKeys = badEstimates.toArray(new DiskRange[badEstimates.size()]);
      // NOTE(review): unlike step 6, this put is not guarded by a fileKey null check - confirm.
      long[] result = cacheWrapper.putFileData(fileKey, cacheKeys, null, baseOffset);
      assert result == null; // We don't expect conflicts from bad estimates.
    }

    if (toDecompress == null || toDecompress.isEmpty()) {
      releaseBuffers(toReleaseCopies, false);
      return lastUncompressed; // Nothing to do.
    }

    // 3. Allocate the buffers, prepare cache keys.
    // At this point, we have read all the CBs we need to read. cacheBuffers contains some cache
    // data and some unallocated membufs for decompression. toDecompress contains all the work we
    // need to do, and each item points to one of the membufs in cacheBuffers as target. The iter
    // has also been adjusted to point to these buffers instead of compressed data for the ranges.
    MemoryBuffer[] targetBuffers = new MemoryBuffer[toDecompress.size()];
    DiskRange[] cacheKeys = new DiskRange[toDecompress.size()];
    int ix = 0;
    for (ProcCacheChunk chunk : toDecompress) {
      cacheKeys[ix] = chunk; // Relies on the fact that cache does not actually store these.
      targetBuffers[ix] = chunk.getBuffer();
      ++ix;
    }
    cacheWrapper.getAllocator().allocateMultiple(targetBuffers, bufferSize);

    // 4. Now decompress (or copy) the data into cache buffers.
    for (ProcCacheChunk chunk : toDecompress) {
      ByteBuffer dest = chunk.getBuffer().getByteBufferRaw();
      if (chunk.isOriginalDataCompressed) {
        decompressChunk(chunk.originalData, codec, dest);
      } else {
        copyUncompressedChunk(chunk.originalData, dest);
      }

      // The source bytes are no longer needed by the chunk once the target buffer is filled.
      chunk.originalData = null;
      if (isTracingEnabled) {
        LOG.trace("Locking " + chunk.getBuffer() + " due to reuse (after decompression)");
      }
      cacheWrapper.reuseBuffer(chunk.getBuffer());
    }

    // 5. Release the copies we made directly to the cleaner.
    releaseBuffers(toReleaseCopies, false);

    // 6. Finally, put uncompressed data to cache.
    if (fileKey != null) {
      long[] collisionMask = cacheWrapper.putFileData(
          fileKey, cacheKeys, targetBuffers, baseOffset);
      processCacheCollisions(collisionMask, toDecompress, targetBuffers, csd.getCacheBuffers());
    }

    // 7. It may happen that we know we won't use some cache buffers anymore (the alternative
    //    is that we will use the same buffers for other streams in separate calls).
    //    Release initial refcounts.
    for (ProcCacheChunk chunk : toDecompress) {
      ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, chunk);
    }

    return lastUncompressed;
  }

  /**
   * Subset of readEncodedStream specific to compressed streams, separate to avoid long methods.
   * Walks the ranges starting at current: cached chunks are locked and added to the result,
   * known-incomplete CBs end the walk, and disk chunks are queued for decompression.
   *
   * @return The last chunk added to the stream data, or null if there was none.
   */
  private CacheChunk prepareRangesForCompressedRead(long cOffset, long endCOffset,
      long streamOffset, long unlockUntilCOffset, DiskRangeList current,
      ColumnStreamData columnStreamData, IdentityHashMap<ByteBuffer, Boolean> toRelease,
      List<ByteBuffer> toReleaseCopies, List<ProcCacheChunk> toDecompress,
      List<IncompleteCb> badEstimates) throws IOException {
    if (cOffset > current.getOffset()) {
      // Target compression block is in the middle of the range; slice the range in two.
current = current.split(cOffset).next;
    }
    long currentOffset = cOffset;
    CacheChunk lastUncompressed = null;
    while (true) {
      DiskRangeList next = null;
      if (current instanceof CacheChunk) {
        // 2a. This is a decoded compression buffer, add as is.
        CacheChunk cc = (CacheChunk)current;
        if (isTracingEnabled) {
          LOG.trace("Locking " + cc.getBuffer() + " due to reuse");
        }
        cacheWrapper.reuseBuffer(cc.getBuffer());
        columnStreamData.getCacheBuffers().add(cc.getBuffer());
        currentOffset = cc.getEnd();
        if (isTracingEnabled) {
          LOG.trace("Adding an already-uncompressed buffer " + cc.getBuffer());
        }
        ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, cc);
        lastUncompressed = cc;
        next = current.next;
        // More data is expected before endCOffset, but the next range starts at or after it.
        if (next != null && (endCOffset >= 0 && currentOffset < endCOffset)
            && next.getOffset() >= endCOffset) {
          throw new IOException("Expected data at " + currentOffset + " (reading until "
              + endCOffset + "), but the next buffer starts at " + next.getOffset());
        }
      } else if (current instanceof IncompleteCb) {
        // 2b. This is a known incomplete CB caused by ORC CB end boundaries being estimates.
        if (isTracingEnabled) {
          LOG.trace("Cannot read " + current);
        }
        // A null next ends the loop below.
        next = null;
        currentOffset = -1;
      } else {
        // 2c. This is a compressed buffer. We need to uncompress it; the buffer can comprise
        // several disk ranges, so we might need to combine them.
        if (!(current instanceof BufferChunk)) {
          String msg = "Found an unexpected " + current.getClass().getSimpleName() + ": "
              + current + " while looking at " + currentOffset;
          LOG.error(msg);
          throw new RuntimeException(msg);
        }
        BufferChunk bc = (BufferChunk)current;
        ProcCacheChunk newCached = addOneCompressionBuffer(
            bc, columnStreamData.getCacheBuffers(), toDecompress, toRelease, toReleaseCopies,
            badEstimates);
        // A null result keeps the previous "last" chunk and ends the walk (next stays null).
        lastUncompressed = (newCached == null) ? lastUncompressed : newCached;
        next = (newCached != null) ? newCached.next : null;
        currentOffset = (next != null) ? next.getOffset() : -1;
      }

      // Stop at the end of the list or once the requested end offset has been reached.
      if (next == null || (endCOffset >= 0 && currentOffset >= endCOffset)) {
        break;
      }
      current = next;
    }
    return lastUncompressed;
  }

  /**
   * Subset of readEncodedStream specific to uncompressed streams, separate to avoid long methods.
   * Locks each chunk from current onwards and adds its buffer to the stream data, until the list
   * ends or a chunk ends at or after endCOffset.
   *
   * @return The last chunk added to the stream data.
   */
  private CacheChunk prepareRangesForUncompressedRead(long cOffset, long endCOffset,
      long streamOffset, long unlockUntilCOffset, DiskRangeList current,
      ColumnStreamData columnStreamData) throws IOException {
    // Note: we are called after preReadUncompressedStream, so it doesn't have to do nearly as much
    //       as prepareRangesForCompressedRead does; e.g. every buffer is already a CacheChunk.
    long currentOffset = cOffset;
    CacheChunk lastUncompressed = null;
    boolean isFirst = true;
    while (true) {
      DiskRangeList next = null;
      assert current instanceof CacheChunk;
      lastUncompressed = (CacheChunk)current;
      if (isTracingEnabled) {
        LOG.trace("Locking " + lastUncompressed.getBuffer() + " due to reuse");
      }
      cacheWrapper.reuseBuffer(lastUncompressed.getBuffer());
      if (isFirst) {
        // Record where the first buffer starts relative to the stream start.
        columnStreamData.setIndexBaseOffset((int)(lastUncompressed.getOffset() - streamOffset));
        isFirst = false;
      }
      columnStreamData.getCacheBuffers().add(lastUncompressed.getBuffer());
      currentOffset = lastUncompressed.getEnd();
      if (isTracingEnabled) {
        LOG.trace("Adding an uncompressed buffer " + lastUncompressed.getBuffer());
      }
      ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, lastUncompressed);
      next = current.next;
      if (next == null || (endCOffset >= 0 && currentOffset >= endCOffset)) {
        break;
      }
      current = next;
    }
    return lastUncompressed;
  }

  /**
   * To achieve some sort of consistent cache boundaries, we will cache streams deterministically;
   * in segments starting w/stream start, and going for either stream size or some fixed size.
   * If we are not reading the entire segment's worth of data, then we will not cache the partial
   * RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much PITA
   * to handle just for this case.
* We could avoid copy in non-zcr case and manage the buffer that was not allocated by our * allocator. Uncompressed case is not mainline though so let's not complicate it. */ private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start, long streamOffset, long streamEnd) throws IOException { if (streamOffset == streamEnd) return null; List<UncompressedCacheChunk> toCache = null; // 1. Find our bearings in the stream. DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd); if (isTracingEnabled) { LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current); } if (streamOffset > current.getOffset()) { // Target compression block is in the middle of the range; slice the range in two. current = current.split(streamOffset).next; } // Account for maximum cache buffer size. long streamLen = streamEnd - streamOffset; int partSize = determineUncompressedPartSize(), partCount = (int)(streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0); CacheChunk lastUncompressed = null; MemoryBuffer[] singleAlloc = new MemoryBuffer[1]; for (int i = 0; i < partCount; ++i) { long partOffset = streamOffset + (i * partSize), partEnd = Math.min(partOffset + partSize, streamEnd); long hasEntirePartTo = partOffset; // We have 0 bytes of data for this part, for now. if (current == null) { break; // We have no data from this point on (could be unneeded), skip. } assert partOffset <= current.getOffset(); if (partOffset == current.getOffset() && current instanceof CacheChunk) { // We assume cache chunks would always match the way we read, so check and skip it. assert current.getOffset() == partOffset && current.getEnd() == partEnd; lastUncompressed = (CacheChunk)current; current = current.next; continue; } if (current.getOffset() >= partEnd) { continue; // We have no data at all for this part of the stream (could be unneeded), skip. } // We have some disk buffers... see if we have entire part, etc. 
UncompressedCacheChunk candidateCached = null; // We will cache if we have the entire part. DiskRangeList next = current; while (true) { boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd); if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) { // We are missing a section at the end of the part... copy the start to non-cached. lastUncompressed = copyAndReplaceCandidateToNonCached( candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc); candidateCached = null; } current = next; if (noMoreDataForPart) break; // Done with this part. if (current.getEnd() > partEnd) { // If the current buffer contains multiple parts, split it. current = current.split(partEnd); } if (isTracingEnabled) { LOG.trace("Processing uncompressed file data at [" + current.getOffset() + ", " + current.getEnd() + ")"); } BufferChunk curBc = (BufferChunk)current; // Track if we still have the entire part. long hadEntirePartTo = hasEntirePartTo; // We have data until the end of current block if we had it until the beginning. hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1; if (hasEntirePartTo == -1) { // We don't have the entire part; copy both whatever we intended to cache, and the rest, // to an allocated buffer. We could try to optimize a bit if we have contiguous buffers // with gaps, but it's probably not needed. if (candidateCached != null) { assert hadEntirePartTo != -1; copyAndReplaceCandidateToNonCached( candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc); candidateCached = null; } lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc); next = lastUncompressed.next; // There may be more data after the gap. } else { // So far we have all the data from the beginning of the part. 
if (candidateCached == null) { candidateCached = new UncompressedCacheChunk(curBc); } else { candidateCached.addChunk(curBc); } next = current.next; } } if (candidateCached != null) { if (toCache == null) { toCache = new ArrayList<>(partCount - i); } toCache.add(candidateCached); } } // 3. Allocate the buffers, prepare cache keys. if (toCache == null) return lastUncompressed; // Nothing to copy and cache. MemoryBuffer[] targetBuffers = toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()]; targetBuffers[0] = null; DiskRange[] cacheKeys = new DiskRange[toCache.size()]; int ix = 0; for (UncompressedCacheChunk chunk : toCache) { cacheKeys[ix] = chunk; // Relies on the fact that cache does not actually store these. ++ix; } cacheWrapper.getAllocator().allocateMultiple( targetBuffers, (int)(partCount == 1 ? streamLen : partSize)); // 4. Now copy the data into cache buffers. ix = 0; for (UncompressedCacheChunk candidateCached : toCache) { candidateCached.setBuffer(targetBuffers[ix]); ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw(); copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached); candidateCached.clear(); lastUncompressed = candidateCached; ++ix; } // 5. Put uncompressed data to cache. if (fileKey != null) { long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset); processCacheCollisions(collisionMask, toCache, targetBuffers, null); } return lastUncompressed; } private int determineUncompressedPartSize() { // We will break the uncompressed data in the cache in the chunks that are the size // of the prevalent ORC compression buffer (the default), or maximum allocation (since we // cannot allocate bigger chunks), whichever is less. 
long orcCbSizeDefault = ((Number)OrcConf.BUFFER_SIZE.getDefaultValue()).longValue(); int maxAllocSize = cacheWrapper.getAllocator().getMaxAllocation(); return (int)Math.min(maxAllocSize, orcCbSizeDefault); } private static void copyUncompressedChunk(ByteBuffer src, ByteBuffer dest) { int startPos = dest.position(), startLim = dest.limit(); dest.put(src); // Copy uncompressed data to cache. // Put moves position forward by the size of the data. int newPos = dest.position(); if (newPos > startLim) { throw new AssertionError("After copying, buffer [" + startPos + ", " + startLim + ") became [" + newPos + ", " + dest.limit() + ")"); } dest.position(startPos); dest.limit(newPos); } private static CacheChunk copyAndReplaceCandidateToNonCached( UncompressedCacheChunk candidateCached, long partOffset, long candidateEnd, DataCache cacheWrapper, MemoryBuffer[] singleAlloc) { // We thought we had the entire part to cache, but we don't; convert start to // non-cached. Since we are at the first gap, the previous stuff must be contiguous. 
singleAlloc[0] = null; cacheWrapper.getAllocator().allocateMultiple(singleAlloc, (int)(candidateEnd - partOffset)); MemoryBuffer buffer = singleAlloc[0]; cacheWrapper.reuseBuffer(buffer); ByteBuffer dest = buffer.getByteBufferRaw(); CacheChunk tcc = POOLS.tccPool.take(); tcc.init(buffer, partOffset, candidateEnd); copyAndReplaceUncompressedChunks(candidateCached, dest, tcc); return tcc; } private static CacheChunk copyAndReplaceUncompressedToNonCached( BufferChunk bc, DataCache cacheWrapper, MemoryBuffer[] singleAlloc) { singleAlloc[0] = null; cacheWrapper.getAllocator().allocateMultiple(singleAlloc, bc.getLength()); MemoryBuffer buffer = singleAlloc[0]; cacheWrapper.reuseBuffer(buffer); ByteBuffer dest = buffer.getByteBufferRaw(); CacheChunk tcc = POOLS.tccPool.take(); tcc.init(buffer, bc.getOffset(), bc.getEnd()); copyUncompressedChunk(bc.getChunk(), dest); bc.replaceSelfWith(tcc); return tcc; } private static void copyAndReplaceUncompressedChunks( UncompressedCacheChunk candidateCached, ByteBuffer dest, CacheChunk tcc) { int startPos = dest.position(), startLim = dest.limit(); DiskRangeList next = null; for (int i = 0; i < candidateCached.getCount(); ++i) { BufferChunk chunk = (i == 0) ? candidateCached.getChunk() : (BufferChunk)next; dest.put(chunk.getData()); next = chunk.next; if (i == 0) { chunk.replaceSelfWith(tcc); } else { chunk.removeSelf(); } } int newPos = dest.position(); if (newPos > startLim) { throw new AssertionError("After copying, buffer [" + startPos + ", " + startLim + ") became [" + newPos + ", " + dest.limit() + ")"); } dest.position(startPos); dest.limit(newPos); } private static void decompressChunk( ByteBuffer src, CompressionCodec codec, ByteBuffer dest) throws IOException { int startPos = dest.position(), startLim = dest.limit(); codec.decompress(src, dest); // Codec resets the position to 0 and limit to correct limit. 
dest.position(startPos); int newLim = dest.limit(); if (newLim > startLim) { throw new AssertionError("After codec, buffer [" + startPos + ", " + startLim + ") became [" + dest.position() + ", " + newLim + ")"); } } public static void releaseCacheChunksIntoObjectPool(DiskRangeList current) { while (current != null) { if (current instanceof ProcCacheChunk) { POOLS.pccPool.offer((ProcCacheChunk)current); } else if (current instanceof CacheChunk) { POOLS.tccPool.offer((CacheChunk)current); } current = current.next; } } private void ponderReleaseInitialRefcount( long unlockUntilCOffset, long streamStartOffset, CacheChunk cc) { // Don't release if the buffer contains any data beyond the acceptable boundary. if (cc.getEnd() > unlockUntilCOffset) return; assert cc.getBuffer() != null; try { releaseInitialRefcount(cc, false); } catch (AssertionError e) { LOG.error("BUG: releasing initial refcount; stream start " + streamStartOffset + ", " + "unlocking until " + unlockUntilCOffset + " from [" + cc + "]: " + e.getMessage()); throw e; } // Release all the previous buffers that we may not have been able to release due to reuse, // as long as they are still in the same stream and are not already released. DiskRangeList prev = cc.prev; while (true) { if ((prev == null) || (prev.getEnd() <= streamStartOffset) || !(prev instanceof CacheChunk)) break; CacheChunk prevCc = (CacheChunk)prev; if (prevCc.buffer == null) break; try { releaseInitialRefcount(prevCc, true); } catch (AssertionError e) { LOG.error("BUG: releasing initial refcount; stream start " + streamStartOffset + ", " + "unlocking until " + unlockUntilCOffset + " from [" + cc + "] and backtracked to [" + prevCc + "]: " + e.getMessage()); throw e; } prev = prev.prev; } } private void releaseInitialRefcount(CacheChunk cc, boolean isBacktracking) { // This is the last RG for which this buffer will be used. 
Remove the initial refcount if (isTracingEnabled) { LOG.trace("Unlocking " + cc.getBuffer() + " for the fetching thread" + (isBacktracking ? "; backtracking" : "")); } cacheWrapper.releaseBuffer(cc.getBuffer()); cc.setBuffer(null); } private void processCacheCollisions(long[] collisionMask, List<? extends CacheChunk> toDecompress, MemoryBuffer[] targetBuffers, List<MemoryBuffer> cacheBuffers) { if (collisionMask == null) return; assert collisionMask.length >= (toDecompress.size() >>> 6); // There are some elements that were cached in parallel, take care of them. long maskVal = -1; for (int i = 0; i < toDecompress.size(); ++i) { if ((i & 63) == 0) { maskVal = collisionMask[i >>> 6]; } if ((maskVal & 1) == 1) { // Cache has found an old buffer for the key and put it into array instead of our new one. CacheChunk replacedChunk = toDecompress.get(i); MemoryBuffer replacementBuffer = targetBuffers[i]; if (isTracingEnabled) { LOG.trace("Discarding data due to cache collision: " + replacedChunk.getBuffer() + " replaced with " + replacementBuffer); } assert replacedChunk.getBuffer() != replacementBuffer : i + " was not replaced in the results " + "even though mask is [" + Long.toBinaryString(maskVal) + "]"; replacedChunk.handleCacheCollision(cacheWrapper, replacementBuffer, cacheBuffers); } maskVal >>= 1; } } /** Finds compressed offset in a stream and makes sure iter points to its position. This may be necessary for obscure combinations of compression and encoding boundaries. 
*/ private static DiskRangeList findExactPosition(DiskRangeList ranges, long offset) { if (offset < 0) return ranges; return findIntersectingPosition(ranges, offset, offset); } private static DiskRangeList findIntersectingPosition(DiskRangeList ranges, long offset, long end) { if (offset < 0) return ranges; // We expect the offset to be valid TODO: rather, validate while (ranges.getEnd() <= offset) { ranges = ranges.next; } while (ranges.getOffset() > end) { ranges = ranges.prev; } // We are now on some intersecting buffer, find the first intersecting buffer. while (ranges.prev != null && ranges.prev.getEnd() > offset) { ranges = ranges.prev; } return ranges; } /** * Reads one compression block from the source; handles compression blocks read from * multiple ranges (usually, that would only happen with zcr). * Adds stuff to cachedBuffers, toDecompress and toRelease (see below what each does). * @param current BufferChunk where compression block starts. * @param cacheBuffers The result buffer array to add pre-allocated target cache buffer. * @param toDecompress The list of work to decompress - pairs of compressed buffers and the * target buffers (same as the ones added to cacheBuffers). * @param toRelease The list of buffers to release to zcr because they are no longer in use. * @param badEstimates The list of bad estimates that cannot be decompressed. * @return The resulting cache chunk. */ private ProcCacheChunk addOneCompressionBuffer(BufferChunk current, List<MemoryBuffer> cacheBuffers, List<ProcCacheChunk> toDecompress, IdentityHashMap<ByteBuffer, Boolean> toRelease, List<ByteBuffer> toReleaseCopies, List<IncompleteCb> badEstimates) throws IOException { ByteBuffer slice = null; ByteBuffer compressed = current.getChunk(); long cbStartOffset = current.getOffset(); int b0 = -1, b1 = -1, b2 = -1; // First, read the CB header. Due to ORC estimates, ZCR, etc. this can be complex. if (compressed.remaining() >= 3) { // The overwhelming majority of cases will go here. 
Read 3 bytes. Tada! b0 = compressed.get() & 0xff; b1 = compressed.get() & 0xff; b2 = compressed.get() & 0xff; } else { // Bad luck! Handle the corner cases where 3 bytes are in multiple blocks. int[] bytes = new int[3]; current = readLengthBytesFromSmallBuffers( current, cbStartOffset, bytes, badEstimates, isTracingEnabled); if (current == null) return null; compressed = current.getChunk(); b0 = bytes[0]; b1 = bytes[1]; b2 = bytes[2]; } int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1); if (chunkLength > bufferSize) { throw new IllegalArgumentException("Buffer size too small. size = " + bufferSize + " needed = " + chunkLength); } int consumedLength = chunkLength + OutStream.HEADER_SIZE; long cbEndOffset = cbStartOffset + consumedLength; boolean isUncompressed = ((b0 & 0x01) == 1); if (isTracingEnabled) { LOG.trace("Found CB at " + cbStartOffset + ", chunk length " + chunkLength + ", total " + consumedLength + ", " + (isUncompressed ? "not " : "") + "compressed"); } if (compressed.remaining() >= chunkLength) { // Simple case - CB fits entirely in the disk range. slice = compressed.slice(); slice.limit(chunkLength); return addOneCompressionBlockByteBuffer(slice, isUncompressed, cbStartOffset, cbEndOffset, chunkLength, current, toDecompress, cacheBuffers); } if (current.getEnd() < cbEndOffset && !current.hasContiguousNext()) { badEstimates.add(addIncompleteCompressionBuffer( cbStartOffset, current, 0, isTracingEnabled)); return null; // This is impossible to read from this chunk. } // TODO: we could remove extra copy for isUncompressed case by copying directly to cache. // We need to consolidate 2 or more buffers into one to decompress. ByteBuffer copy = allocateBuffer(chunkLength, compressed.isDirect()); toReleaseCopies.add(copy); // We will always release copies at the end. 
int remaining = chunkLength - compressed.remaining(); int originalPos = compressed.position(); copy.put(compressed); if (isTracingEnabled) { LOG.trace("Removing partial CB " + current + " from ranges after copying its contents"); } DiskRangeList next = current.next; current.removeSelf(); if (originalPos == 0 && toRelease.remove(compressed)) { releaseBuffer(compressed, true); } int extraChunkCount = 0; while (true) { if (!(next instanceof BufferChunk)) { throw new IOException("Trying to extend compressed block into uncompressed block " + next); } compressed = next.getData(); ++extraChunkCount; if (compressed.remaining() >= remaining) { // This is the last range for this compression block. Yay! slice = compressed.slice(); slice.limit(remaining); copy.put(slice); ProcCacheChunk cc = addOneCompressionBlockByteBuffer(copy, isUncompressed, cbStartOffset, cbEndOffset, remaining, (BufferChunk)next, toDecompress, cacheBuffers); if (compressed.remaining() <= 0 && toRelease.remove(compressed)) { releaseBuffer(compressed, true); // We copied the entire buffer. } // else there's more data to process; will be handled in next call. return cc; } remaining -= compressed.remaining(); copy.put(compressed); // TODO: move into the if below; account for release call if (toRelease.remove(compressed)) { releaseBuffer(compressed, true); // We copied the entire buffer. } DiskRangeList tmp = next; next = next.hasContiguousNext() ? next.next : null; if (next != null) { if (isTracingEnabled) { LOG.trace("Removing partial CB " + tmp + " from ranges after copying its contents"); } tmp.removeSelf(); } else { badEstimates.add(addIncompleteCompressionBuffer( cbStartOffset, tmp, extraChunkCount, isTracingEnabled)); return null; // This is impossible to read from this chunk. 
} } } @VisibleForTesting static BufferChunk readLengthBytesFromSmallBuffers(BufferChunk first, long cbStartOffset, int[] result, List<IncompleteCb> badEstimates, boolean isTracingEnabled) throws IOException { if (!first.hasContiguousNext()) { badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, first, 0, isTracingEnabled)); return null; // This is impossible to read from this chunk. } int ix = readLengthBytes(first.getChunk(), result, 0); assert ix < 3; // Otherwise we wouldn't be here. DiskRangeList current = first.next; first.removeSelf(); while (true) { if (!(current instanceof BufferChunk)) { throw new IOException( "Trying to extend compressed block into uncompressed block " + current); } BufferChunk currentBc = (BufferChunk) current; ix = readLengthBytes(currentBc.getChunk(), result, ix); if (ix == 3) return currentBc; // Done, we have 3 bytes. Continue reading this buffer. DiskRangeList tmp = current; current = current.hasContiguousNext() ? current.next : null; if (current != null) { if (isTracingEnabled) { LOG.trace("Removing partial CB " + tmp + " from ranges after copying its contents"); } tmp.removeSelf(); } else { badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, tmp, -1, isTracingEnabled)); return null; // This is impossible to read from this chunk. 
} } } private static int readLengthBytes(ByteBuffer compressed, int[] bytes, int ix) { int byteCount = compressed.remaining(); while (byteCount > 0 && ix < 3) { bytes[ix++] = compressed.get() & 0xff; --byteCount; } return ix; } private void releaseBuffers(Collection<ByteBuffer> toRelease, boolean isFromDataReader) { if (toRelease == null) return; for (ByteBuffer buf : toRelease) { releaseBuffer(buf, isFromDataReader); } } private void releaseBuffer(ByteBuffer bb, boolean isFromDataReader) { if (isTracingEnabled) { LOG.trace("Releasing the buffer " + System.identityHashCode(bb)); } if (isFromDataReader && dataReader.isTrackingDiskRanges()) { dataReader.releaseBuffer(bb); return; } Field localCf = cleanerField; if (!bb.isDirect() || localCf == null) return; try { Cleaner cleaner = (Cleaner) localCf.get(bb); if (cleaner != null) { cleaner.clean(); } else { LOG.debug("Unable to clean a buffer using cleaner - no cleaner"); } } catch (Exception e) { // leave it for GC to clean up LOG.warn("Unable to clean direct buffers using Cleaner."); cleanerField = null; } } private static IncompleteCb addIncompleteCompressionBuffer(long cbStartOffset, DiskRangeList target, int extraChunkCountToLog, boolean isTracingEnabled) { IncompleteCb icb = new IncompleteCb(cbStartOffset, target.getEnd()); if (isTracingEnabled) { LOG.trace("Replacing " + target + " (and " + extraChunkCountToLog + " previous chunks) with " + icb + " in the buffers"); } target.replaceSelfWith(icb); return icb; } /** * Add one buffer with compressed data the results for addOneCompressionBuffer (see javadoc). * @param fullCompressionBlock (fCB) Entire compression block, sliced or copied from disk data. * @param isUncompressed Whether the data in the block is uncompressed. * @param cbStartOffset Compressed start offset of the fCB. * @param cbEndOffset Compressed end offset of the fCB. * @param lastChunkLength The number of compressed bytes consumed from last *chunk* into fullCompressionBlock. 
* @param lastChunk * @param toDecompress See addOneCompressionBuffer. * @param cacheBuffers See addOneCompressionBuffer. * @return New cache buffer. */ private ProcCacheChunk addOneCompressionBlockByteBuffer(ByteBuffer fullCompressionBlock, boolean isUncompressed, long cbStartOffset, long cbEndOffset, int lastChunkLength, BufferChunk lastChunk, List<ProcCacheChunk> toDecompress, List<MemoryBuffer> cacheBuffers) { // Prepare future cache buffer. MemoryBuffer futureAlloc = cacheWrapper.getAllocator().createUnallocated(); // Add it to result in order we are processing. cacheBuffers.add(futureAlloc); // Add it to the list of work to decompress. ProcCacheChunk cc = POOLS.pccPool.take(); cc.init(cbStartOffset, cbEndOffset, !isUncompressed, fullCompressionBlock, futureAlloc, cacheBuffers.size() - 1); toDecompress.add(cc); // Adjust the compression block position. if (isTracingEnabled) { LOG.trace("Adjusting " + lastChunk + " to consume " + lastChunkLength + " compressed bytes"); } lastChunk.getChunk().position(lastChunk.getChunk().position() + lastChunkLength); // Finally, put it in the ranges list for future use (if shared between RGs). // Before anyone else accesses it, it would have been allocated and decompressed locally. if (lastChunk.getChunk().remaining() <= 0) { if (isTracingEnabled) { LOG.trace("Replacing " + lastChunk + " with " + cc + " in the buffers"); } lastChunk.replaceSelfWith(cc); } else { if (isTracingEnabled) { LOG.trace("Adding " + cc + " before " + lastChunk + " in the buffers"); } lastChunk.insertPartBefore(cc); } return cc; } private static ByteBuffer allocateBuffer(int size, boolean isDirect) { return isDirect ? 
ByteBuffer.allocateDirect(size) : ByteBuffer.allocate(size); } private static Pools createPools(PoolFactory pf) { Pools pools = new Pools(); pools.pccPool = pf.createPool(1024, new PoolObjectHelper<ProcCacheChunk>() { @Override public ProcCacheChunk create() { return new ProcCacheChunk(); } @Override public void resetBeforeOffer(ProcCacheChunk t) { t.reset(); } }); pools.tccPool = pf.createPool(1024, new PoolObjectHelper<CacheChunk>() { @Override public CacheChunk create() { return new CacheChunk(); } @Override public void resetBeforeOffer(CacheChunk t) { t.reset(); } }); pools.ecbPool = pf.createEncodedColumnBatchPool(); pools.csdPool = pf.createColumnStreamDataPool(); return pools; } /** Pool factory that is used if another one isn't specified - just creates the objects. */ private static class NoopPoolFactory implements PoolFactory { @Override public <T> Pool<T> createPool(final int size, final PoolObjectHelper<T> helper) { return new Pool<T>() { public void offer(T t) { } @Override public int size() { return size; } public T take() { return helper.create(); } }; } @Override public Pool<OrcEncodedColumnBatch> createEncodedColumnBatchPool() { return createPool(0, new PoolObjectHelper<OrcEncodedColumnBatch>() { @Override public OrcEncodedColumnBatch create() { return new OrcEncodedColumnBatch(); } @Override public void resetBeforeOffer(OrcEncodedColumnBatch t) { } }); } @Override public Pool<ColumnStreamData> createColumnStreamDataPool() { return createPool(0, new PoolObjectHelper<ColumnStreamData>() { @Override public ColumnStreamData create() { return new ColumnStreamData(); } @Override public void resetBeforeOffer(ColumnStreamData t) { } }); } } }