/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.llap.cache;

import java.util.concurrent.atomic.AtomicLong;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.FileAttribute;
import java.nio.file.attribute.PosixFilePermission;
import java.nio.file.attribute.PosixFilePermissions;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.io.encoded.MemoryBuffer;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.llap.io.api.impl.LlapIoImpl;
import org.apache.hadoop.hive.llap.metrics.LlapDaemonCacheMetrics;

public final class BuddyAllocator
    implements EvictionAwareAllocator, BuddyAllocatorMXBean, LlapOomDebugDump {
  private final Arena[] arenas;
  private final AtomicInteger allocatedArenas = new AtomicInteger(0);

  private final MemoryManager memoryManager;

  private static final long MAX_DUMP_INTERVAL_NS = 300 * 1000000000L; // 5 minutes.
  private final AtomicLong lastLog = new AtomicLong(-1);

  // Config settings
  private final int minAllocLog2, maxAllocLog2, arenaSizeLog2, maxArenas;
  private final int minAllocation, maxAllocation, arenaSize;
  private final long maxSize;
  private final boolean isDirect;
  private final boolean isMapped;
  private final Path cacheDir;

  private final LlapDaemonCacheMetrics metrics;

  // We don't know the acceptable size for a Java array, so we'll use a 1GB boundary.
  // That is guaranteed to fit any maximum allocation.
  private static final int MAX_ARENA_SIZE = 1024*1024*1024;
  // Don't try to operate with less than MIN_SIZE allocator space, it will just give you grief.
  private static final int MIN_TOTAL_MEMORY_SIZE = 64*1024*1024;

  private static final FileAttribute<Set<PosixFilePermission>> RWX = PosixFilePermissions
      .asFileAttribute(PosixFilePermissions.fromString("rwx------"));
  private static final FileAttribute<Set<PosixFilePermission>> RW_ = PosixFilePermissions
      .asFileAttribute(PosixFilePermissions.fromString("rw-------"));

  public BuddyAllocator(Configuration conf, MemoryManager mm, LlapDaemonCacheMetrics metrics) {
    this(HiveConf.getBoolVar(conf, ConfVars.LLAP_ALLOCATOR_DIRECT),
        HiveConf.getBoolVar(conf, ConfVars.LLAP_ALLOCATOR_MAPPED),
        (int)HiveConf.getSizeVar(conf, ConfVars.LLAP_ALLOCATOR_MIN_ALLOC),
        (int)HiveConf.getSizeVar(conf, ConfVars.LLAP_ALLOCATOR_MAX_ALLOC),
        HiveConf.getIntVar(conf, ConfVars.LLAP_ALLOCATOR_ARENA_COUNT),
        getMaxTotalMemorySize(conf),
        HiveConf.getVar(conf, ConfVars.LLAP_ALLOCATOR_MAPPED_PATH),
        mm, metrics);
  }

  private static long getMaxTotalMemorySize(Configuration conf) {
    long maxSize = HiveConf.getSizeVar(conf, ConfVars.LLAP_IO_MEMORY_MAX_SIZE);
    if (maxSize > MIN_TOTAL_MEMORY_SIZE || HiveConf.getBoolVar(conf, ConfVars.HIVE_IN_TEST)) {
      return maxSize;
    }
    throw new RuntimeException("Allocator space is too small for reasonable operation; "
        + ConfVars.LLAP_IO_MEMORY_MAX_SIZE.varname + "=" + maxSize + ", but at least "
        + MIN_TOTAL_MEMORY_SIZE + " is required. If you cannot spare any memory, you can "
        + "disable LLAP IO entirely via " + ConfVars.LLAP_IO_ENABLED.varname + "; or set "
        + ConfVars.LLAP_IO_MEMORY_MODE.varname + " to 'none'");
  }

  @VisibleForTesting
  public BuddyAllocator(boolean isDirectVal, int minAllocVal, int maxAllocVal, int arenaCount,
      long maxSizeVal, MemoryManager memoryManager, LlapDaemonCacheMetrics metrics) {
    this(isDirectVal, false /*isMapped*/, minAllocVal, maxAllocVal, arenaCount, maxSizeVal,
        null /* mapping path */, memoryManager, metrics);
  }

  @VisibleForTesting
  public BuddyAllocator(boolean isDirectVal, boolean isMappedVal, int minAllocVal,
      int maxAllocVal, int arenaCount, long maxSizeVal, String mapPath,
      MemoryManager memoryManager, LlapDaemonCacheMetrics metrics) {
    isDirect = isDirectVal;
    isMapped = isMappedVal;
    minAllocation = minAllocVal;
    maxAllocation = maxAllocVal;
    if (isMapped) {
      try {
        cacheDir = Files.createTempDirectory(
            FileSystems.getDefault().getPath(mapPath), "llap-", RWX);
      } catch (IOException ioe) {
        // The conf validator already checks this, so it should normally never trigger.
        throw new AssertionError("Configured mmap directory should be writable", ioe);
      }
    } else {
      cacheDir = null;
    }
    long arenaSizeVal = (arenaCount == 0) ? MAX_ARENA_SIZE : maxSizeVal / arenaCount;
    // The Math.min call, and the fact that maxAllocation is an int, ensure we don't overflow.
    arenaSizeVal = Math.max(maxAllocation, Math.min(arenaSizeVal, MAX_ARENA_SIZE));
    if (LlapIoImpl.LOG.isInfoEnabled()) {
      LlapIoImpl.LOG.info("Buddy allocator with " + (isDirect ? "direct" : "byte") + " buffers; "
          + (isMapped ?
("memory mapped off " + cacheDir.toString() + "; ") : "") + "allocation sizes " + minAllocation + " - " + maxAllocation + ", arena size " + arenaSizeVal + ", total size " + maxSizeVal); } String minName = ConfVars.LLAP_ALLOCATOR_MIN_ALLOC.varname, maxName = ConfVars.LLAP_ALLOCATOR_MAX_ALLOC.varname; if (minAllocation < 8) { throw new RuntimeException(minName + " must be at least 8 bytes: " + minAllocation); } if (maxSizeVal < maxAllocation || maxAllocation < minAllocation) { throw new RuntimeException("Inconsistent sizes; expecting " + minName + " <= " + maxName + " <= " + ConfVars.LLAP_IO_MEMORY_MAX_SIZE.varname + "; configured with min=" + minAllocation + ", max=" + maxAllocation + " and total=" + maxSizeVal); } if ((Integer.bitCount(minAllocation) != 1) || (Integer.bitCount(maxAllocation) != 1)) { throw new RuntimeException("Allocation sizes must be powers of two; configured with " + minName + "=" + minAllocation + ", " + maxName + "=" + maxAllocation); } if ((arenaSizeVal % maxAllocation) > 0) { long oldArenaSize = arenaSizeVal; arenaSizeVal = (arenaSizeVal / maxAllocation) * maxAllocation; LlapIoImpl.LOG.warn("Rounding arena size to " + arenaSizeVal + " from " + oldArenaSize + " to be divisible by allocation size " + maxAllocation); } arenaSize = (int)arenaSizeVal; if ((maxSizeVal % arenaSize) > 0) { long oldMaxSize = maxSizeVal; maxSizeVal = (maxSizeVal / arenaSize) * arenaSize; LlapIoImpl.LOG.warn("Rounding cache size to " + maxSizeVal + " from " + oldMaxSize + " to be divisible by arena size " + arenaSize); } if ((maxSizeVal / arenaSize) > Integer.MAX_VALUE) { throw new RuntimeException( "Too many arenas needed to allocate the cache: " + arenaSize + ", " + maxSizeVal); } maxSize = maxSizeVal; memoryManager.updateMaxSize(maxSize); minAllocLog2 = 31 - Integer.numberOfLeadingZeros(minAllocation); maxAllocLog2 = 31 - Integer.numberOfLeadingZeros(maxAllocation); arenaSizeLog2 = 63 - Long.numberOfLeadingZeros(arenaSize); maxArenas = (int)(maxSize / arenaSize); arenas = new Arena[maxArenas]; for (int i = 0; i < maxArenas; ++i) { arenas[i] = new Arena(); } arenas[0].init(); allocatedArenas.set(1); this.memoryManager = memoryManager; this.metrics = metrics; metrics.incrAllocatedArena(); } // TODO: would it make sense to return buffers asynchronously? @Override public void allocateMultiple(MemoryBuffer[] dest, int size) throws AllocatorOutOfMemoryException { assert size > 0 : "size is " + size; if (size > maxAllocation) { throw new RuntimeException("Trying to allocate " + size + "; max is " + maxAllocation); } int freeListIx = 31 - Integer.numberOfLeadingZeros(size); if (size != (1 << freeListIx)) ++freeListIx; // not a power of two, add one more freeListIx = Math.max(freeListIx - minAllocLog2, 0); int allocLog2 = freeListIx + minAllocLog2; int allocationSize = 1 << allocLog2; // TODO: reserving the entire thing is not ideal before we alloc anything. Interleave? memoryManager.reserveMemory(dest.length << allocLog2); int destAllocIx = 0; for (int i = 0; i < dest.length; ++i) { if (dest[i] != null) continue; dest[i] = createUnallocated(); // TODO: pool of objects? } // First try to quickly lock some of the correct-sized free lists and allocate from them. int arenaCount = allocatedArenas.get(); if (arenaCount < 0) { arenaCount = -arenaCount - 1; // Next arena is being allocated. } long threadId = arenaCount > 1 ? 
        Thread.currentThread().getId() : 0;
    {
      int startArenaIx = (int)(threadId % arenaCount), index = startArenaIx;
      do {
        int newDestIx = arenas[index].allocateFast(
            index, freeListIx, dest, destAllocIx, allocationSize);
        if (newDestIx == dest.length) return;
        assert newDestIx != -1;
        destAllocIx = newDestIx;
        if ((++index) == arenaCount) {
          index = 0;
        }
      } while (index != startArenaIx);
    }
    // 1) We can get fragmented on large blocks of uncompressed data. The memory might be
    //    in there, but it might be in separate small blocks. This is a complicated problem, and
    //    several solutions (in order of decreasing ugliness and increasing complexity) are: just
    //    ask to evict the exact-sized block (there may be no such block), evict from a particular
    //    arena (the policy would need to know allocator internals somewhat), store a buffer
    //    mapping and ask to evict a specific choice of blocks next to each other or next to an
    //    already-evicted block, and finally do a compaction (requires a block mapping and complex
    //    sync). For now we just force-evict some memory and avoid both complexity and ugliness,
    //    since large blocks are rare.
    // 2) Fragmentation aside (TODO: and this is a very hacky solution for that),
    //    we called reserveMemory, so we know that there's memory waiting for us somewhere.
    //    However, we have a class of rare race conditions related to the order of
    //    locking/checking of different allocation areas. Simple case: say we have 2 arenas, with
    //    256KB available in arena 2. We look at arena 1; someone deallocates 256KB from arena 1
    //    and allocates the same from arena 2; we look at arena 2 and find no memory. Or, for a
    //    single arena, 2 threads reserve 256KB each, and a single 1MB block is available. When
    //    the 1st thread locks the 1MB free list, the 2nd one might have already examined the
    //    256KB and 512KB lists, finding nothing. Blocks placed by thread (1) into smaller lists
    //    after its split is done will not be found by thread (2); given that free list locks
    //    don't overlap, (2) may even run completely between the time (1) takes out the 1MB block
    //    and the time it returns the remaining 768KB.
    //    Two solutions to this are some form of cross-thread helping (threads putting "demand"
    //    into some sort of queues that deallocating and splitting threads will examine), or
    //    having an "actor" allocator thread (or threads per arena).
    //    The 2nd one is probably much simpler and will allow us to get rid of a lot of sync code.
    //    But for now we will just retry. We will evict more each time.
    long forceReserved = 0;
    int attempt = 0;
    try {
      while (true) {
        // Try to split bigger blocks. TODO: again, ideally we would tryLock at least once
        {
          int startArenaIx = (int)((threadId + attempt) % arenaCount), arenaIx = startArenaIx;
          do {
            int newDestIx = arenas[arenaIx].allocateWithSplit(
                arenaIx, freeListIx, dest, destAllocIx, allocationSize);
            if (newDestIx == dest.length) return;
            assert newDestIx != -1;
            destAllocIx = newDestIx;
            if ((++arenaIx) == arenaCount) {
              arenaIx = 0;
            }
          } while (arenaIx != startArenaIx);
        }
        if (attempt == 0) {
          // Try to allocate more arenas if we haven't allocated all the way to maxSize yet;
          // very rare.
          for (int arenaIx = arenaCount; arenaIx < arenas.length; ++arenaIx) {
            destAllocIx = arenas[arenaIx].allocateWithExpand(
                arenaIx, freeListIx, dest, destAllocIx, allocationSize);
            if (destAllocIx == dest.length) return;
          }
        }
        int numberToForce = (dest.length - destAllocIx) * (attempt + 1);
        long newReserved = memoryManager.forceReservedMemory(allocationSize, numberToForce);
        forceReserved += newReserved;
        if (newReserved == 0) {
          // Cannot force-evict anything, give up.
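          // At this point the tryLock fast path, the split pass over every allocated arena,
          // arena expansion (on the first attempt), and force-eviction have all produced
          // nothing, so there is no realistic way to satisfy the remaining buffers.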
String msg = "Failed to allocate " + size + "; at " + destAllocIx + " out of " + dest.length + " (entire cache is fragmented and locked, or an internal issue)"; logOomErrorMessage(msg); throw new AllocatorOutOfMemoryException(msg); } if (attempt == 0) { LlapIoImpl.LOG.warn("Failed to allocate despite reserved memory; will retry"); } ++attempt; } } finally { if (attempt > 4) { LlapIoImpl.LOG.warn("Allocation of " + dest.length + " buffers of size " + size + " took " + attempt + " attempts to evict enough memory"); } // After we succeed (or fail), release the force-evicted memory to memory manager. We have // previously reserved enough to allocate all we need, so we don't take our allocation out // of this - as per the comment above, we basically just wasted a bunch of cache (and CPU). if (forceReserved > 0) { memoryManager.releaseMemory(forceReserved); } } } private void logOomErrorMessage(String msg) { while (true) { long time = System.nanoTime(); long lastTime = lastLog.get(); // Magic value usage is invalid with nanoTime, so once in a 1000 years we may log extra. boolean shouldLog = (lastTime == -1 || (time - lastTime) > MAX_DUMP_INTERVAL_NS); if (shouldLog && !lastLog.compareAndSet(lastTime, time)) { continue; } if (shouldLog) { LlapIoImpl.LOG.error(msg + debugDumpForOom()); } else { LlapIoImpl.LOG.error(msg); } return; } } /** * Arbitrarily, we start getting the state from Allocator. Allocator calls MM which calls * the policies that call the eviction dispatcher that calls the caches. See init - these all * are connected in a cycle, so we need to make sure the who-calls-whom order is definite. */ @Override public void debugDumpShort(StringBuilder sb) { memoryManager.debugDumpShort(sb); sb.append("\nAllocator state:"); int unallocCount = 0, fullCount = 0; long totalFree = 0; for (Arena arena : arenas) { Integer result = arena.debugDumpShort(sb); if (result == null) { ++unallocCount; } else if (result == 0) { ++fullCount; } else { totalFree += result; } } sb.append("\nTotal available and allocated: ").append(totalFree).append( "; unallocated arenas: ").append(unallocCount).append( "; full arenas ").append(fullCount); sb.append("\n"); } @Override public void deallocate(MemoryBuffer buffer) { deallocateInternal(buffer, true); } @Override public void deallocateEvicted(MemoryBuffer buffer) { deallocateInternal(buffer, false); } private void deallocateInternal(MemoryBuffer buffer, boolean doReleaseMemory) { LlapDataBuffer buf = (LlapDataBuffer)buffer; long memUsage = buf.getMemoryUsage(); arenas[buf.arenaIndex].deallocate(buf); if (doReleaseMemory) { memoryManager.releaseMemory(memUsage); } } @Override public boolean isDirectAlloc() { return isDirect; } public String debugDumpForOomInternal() { StringBuilder result = new StringBuilder( "NOTE: with multiple threads the dump is not guaranteed to be consistent"); for (Arena arena : arenas) { arena.debugDump(result); } return result.toString(); } // BuddyAllocatorMXBean @Override public boolean getIsDirect() { return isDirect; } @Override public int getMinAllocation() { return minAllocation; } @Override public int getMaxAllocation() { return maxAllocation; } @Override public int getArenaSize() { return arenaSize; } @Override public long getMaxCacheSize() { return maxSize; } private ByteBuffer preallocate(int arenaSize) { if (isMapped) { RandomAccessFile rwf = null; File rf = null; Preconditions.checkArgument(isDirect, "All memory mapped allocations have to be direct buffers"); try { rf = File.createTempFile("arena-", ".cache", cacheDir.toFile()); 
        rwf = new RandomAccessFile(rf, "rw");
        rwf.setLength(arenaSize); // truncate (TODO: posix_fallocate?)
        // Use RW, not PRIVATE, because copy-on-write is irrelevant for a deleted file;
        // see YARN-5551 for the memory accounting discussion.
        ByteBuffer rwbuf = rwf.getChannel().map(MapMode.READ_WRITE, 0, arenaSize);
        return rwbuf;
      } catch (IOException ioe) {
        LlapIoImpl.LOG.warn("Failed trying to allocate memory mapped arena", ioe);
        // Fail the same way regular memory allocations fail.
        throw new OutOfMemoryError("Failed trying to allocate memory mapped arena: "
            + ioe.getMessage());
      } finally {
        // A mapping, once established, is not dependent upon the file channel that was used to
        // create it. Delete the file and hold onto the map.
        IOUtils.closeQuietly(rwf);
        if (rf != null) {
          rf.delete();
        }
      }
    }
    return isDirect ? ByteBuffer.allocateDirect(arenaSize) : ByteBuffer.allocate(arenaSize);
  }

  private class Arena {
    private ByteBuffer data;
    // Avoid storing headers with the data since we expect power-of-two-sized allocations.
    // Each headers[i] is a "virtual" byte at i * minAllocation.
    private byte[] headers;
    private FreeList[] freeLists;

    void init() {
      try {
        data = preallocate(arenaSize);
      } catch (OutOfMemoryError oom) {
        throw new OutOfMemoryError("Cannot allocate " + arenaSize + " bytes: " + oom.getMessage()
            + "; make sure your xmx and process size are set correctly.");
      }
      int maxMinAllocs = 1 << (arenaSizeLog2 - minAllocLog2);
      headers = new byte[maxMinAllocs];
      int allocLog2Diff = maxAllocLog2 - minAllocLog2, freeListCount = allocLog2Diff + 1;
      freeLists = new FreeList[freeListCount];
      for (int i = 0; i < freeListCount; ++i) {
        freeLists[i] = new FreeList();
      }
      int maxMaxAllocs = 1 << (arenaSizeLog2 - maxAllocLog2),
          headerIndex = 0, headerStep = 1 << allocLog2Diff;
      freeLists[allocLog2Diff].listHead = 0;
      for (int i = 0, offset = 0; i < maxMaxAllocs; ++i, offset += maxAllocation) {
        // TODO: will this cause bugs on large numbers due to some Java sign bit stupidity?
        headers[headerIndex] = makeHeader(allocLog2Diff, false);
        data.putInt(offset, (i == 0) ? -1 : (headerIndex - headerStep));
        data.putInt(offset + 4, (i == maxMaxAllocs - 1) ? -1 : (headerIndex + headerStep));
        headerIndex += headerStep;
      }
    }

    public Integer debugDumpShort(StringBuilder result) {
      if (data == null) {
        return null;
      }
      int allocSize = minAllocation;
      int total = 0;
      for (int i = 0; i < freeLists.length; ++i, allocSize <<= 1) {
        FreeList freeList = freeLists[i];
        freeList.lock.lock();
        try {
          int nextHeaderIx = freeList.listHead;
          int count = 0;
          while (nextHeaderIx >= 0) {
            ++count;
            nextHeaderIx = getNextFreeListItem(offsetFromHeaderIndex(nextHeaderIx));
          }
          if (count > 0) {
            if (total == 0) {
              result.append("\nArena with free list lengths by size: ");
            }
            total += (allocSize * count);
            result.append(allocSize).append(" => ").append(count).append(", ");
          }
        } finally {
          freeList.lock.unlock();
        }
      }
      return total;
    }

    public void debugDump(StringBuilder result) {
      result.append("\nArena: ");
      if (data == null) {
        result.append(" not allocated");
        return;
      }
      // Try to get as consistent a view as we can; make a copy of the headers.
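      // Note that the copy is still racy with respect to concurrent allocations; it only keeps
      // the header array from changing underneath us while we format this debug output.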
      byte[] headers = new byte[this.headers.length];
      System.arraycopy(this.headers, 0, headers, 0, headers.length);
      int allocSize = minAllocation;
      for (int i = 0; i < freeLists.length; ++i, allocSize <<= 1) {
        result.append("\n free list for size " + allocSize + ": ");
        FreeList freeList = freeLists[i];
        freeList.lock.lock();
        try {
          int nextHeaderIx = freeList.listHead;
          while (nextHeaderIx >= 0) {
            result.append(nextHeaderIx + ", ");
            nextHeaderIx = getNextFreeListItem(offsetFromHeaderIndex(nextHeaderIx));
          }
        } finally {
          freeList.lock.unlock();
        }
      }
      for (int i = 0; i < headers.length; ++i) {
        byte header = headers[i];
        if (header == 0) continue;
        int freeListIx = freeListFromHeader(header), offset = offsetFromHeaderIndex(i);
        boolean isFree = (header & 1) == 0;
        result.append("\n block " + i + " at " + offset + ": size "
            + (1 << (freeListIx + minAllocLog2)) + ", " + (isFree ? "free" : "allocated"));
      }
    }

    private int freeListFromHeader(byte header) {
      return (header >> 1) - 1;
    }

    private int allocateFast(
        int arenaIx, int freeListIx, MemoryBuffer[] dest, int ix, int size) {
      if (data == null) return -1; // not allocated yet
      FreeList freeList = freeLists[freeListIx];
      if (!freeList.lock.tryLock()) return ix;
      try {
        return allocateFromFreeListUnderLock(arenaIx, freeList, freeListIx, dest, ix, size);
      } finally {
        freeList.lock.unlock();
      }
    }

    private int allocateWithSplit(int arenaIx, int freeListIx,
        MemoryBuffer[] dest, int ix, int allocationSize) {
      if (data == null) return -1; // not allocated yet
      FreeList freeList = freeLists[freeListIx];
      int remaining = -1;
      freeList.lock.lock();
      try {
        // Try to allocate from target-sized free list, maybe we'll get lucky.
        ix = allocateFromFreeListUnderLock(
            arenaIx, freeList, freeListIx, dest, ix, allocationSize);
        remaining = dest.length - ix;
        if (remaining == 0) return ix;
      } finally {
        freeList.lock.unlock();
      }
      byte headerData = makeHeader(freeListIx, true); // Header for newly allocated used blocks.
      int headerStep = 1 << freeListIx; // Number of headers (smallest blocks) per target block.
      int splitListIx = freeListIx + 1; // Next free list from which we will be splitting.
      // Each iteration of this loop tries to split blocks from one level of the free list into
      // target size blocks; if we cannot satisfy the allocation from the free list containing
      // the blocks of a particular size, we'll try to split yet larger blocks, until we run out.
      while (remaining > 0 && splitListIx < freeLists.length) {
        int splitWaysLog2 = (splitListIx - freeListIx);
        assert splitWaysLog2 > 0;
        int splitWays = 1 << splitWaysLog2; // How many ways each block splits into target size.
        int lastSplitBlocksRemaining = -1; // How many target-sized blocks remain from last split.
        int lastSplitNextHeader = -1; // The header index for the beginning of the remainder.
        FreeList splitList = freeLists[splitListIx];
        splitList.lock.lock();
        try {
          int headerIx = splitList.listHead; // Index of the next free block to split.
          while (headerIx >= 0 && remaining > 0) {
            int origOffset = offsetFromHeaderIndex(headerIx), offset = origOffset;
            // We will split the block at headerIx [splitWays] ways, and take [toTake] blocks,
            // which will leave [lastSplitBlocksRemaining] free blocks of target size.
            int toTake = Math.min(splitWays, remaining);
            remaining -= toTake;
            lastSplitBlocksRemaining = splitWays - toTake; // Whatever remains.
            // Take toTake blocks by splitting the block at offset.
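            // For example (hypothetical sizes): with a 256KB target and a 1MB source block,
            // splitWays is 4; if exactly 3 buffers are still needed, toTake becomes 3 and the
            // one leftover 256KB block is returned to the 256KB free list further below.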
            for (; toTake > 0; ++ix, --toTake, headerIx += headerStep, offset += allocationSize) {
              headers[headerIx] = headerData;
              // TODO: this could be done out of the lock, we only need to take the blocks out.
              ((LlapDataBuffer)dest[ix]).initialize(arenaIx, data, offset, allocationSize);
            }
            lastSplitNextHeader = headerIx; // If anything remains, this is where it starts.
            headerIx = getNextFreeListItem(origOffset);
          }
          replaceListHeadUnderLock(splitList, headerIx); // In the end, update the free list head.
        } finally {
          splitList.lock.unlock();
        }
        if (remaining == 0) {
          // We have just obtained all we needed by splitting some block; now we need
          // to put the space remaining from that block into lower free lists.
          // We'll put at most one block into each list, since 2 blocks can always be combined
          // to make a larger-level block. Each bit in the remaining target-sized blocks count
          // is one block in a list offset from the target-sized list by the bit index.
          int newListIndex = freeListIx;
          while (lastSplitBlocksRemaining > 0) {
            if ((lastSplitBlocksRemaining & 1) == 1) {
              FreeList newFreeList = freeLists[newListIndex];
              newFreeList.lock.lock();
              headers[lastSplitNextHeader] = makeHeader(newListIndex, false);
              try {
                addBlockToFreeListUnderLock(newFreeList, lastSplitNextHeader);
              } finally {
                newFreeList.lock.unlock();
              }
              lastSplitNextHeader += (1 << newListIndex);
            }
            lastSplitBlocksRemaining >>>= 1;
            ++newListIndex;
          }
        }
        ++splitListIx;
      }
      return ix;
    }

    private void replaceListHeadUnderLock(FreeList freeList, int headerIx) {
      if (headerIx == freeList.listHead) return;
      if (headerIx >= 0) {
        int newHeadOffset = offsetFromHeaderIndex(headerIx);
        data.putInt(newHeadOffset, -1); // Remove the backlink.
      }
      freeList.listHead = headerIx;
    }

    private int allocateWithExpand(
        int arenaIx, int freeListIx, MemoryBuffer[] dest, int ix, int size) {
      while (true) {
        int arenaCount = allocatedArenas.get(), allocArenaCount = arenaCount;
        if (arenaCount < 0) {
          allocArenaCount = -arenaCount - 1; // Someone is allocating an arena.
        }
        if (allocArenaCount > arenaIx) {
          // Someone already allocated this arena; just do the usual thing.
          return allocateWithSplit(arenaIx, freeListIx, dest, ix, size);
        }
        if ((arenaIx + 1) == -arenaCount) {
          // Someone is allocating this arena. Wait a bit and recheck.
          try {
            synchronized (this) {
              this.wait(100);
            }
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // Restore the interrupt; we won't handle it here.
          }
          continue;
        }
        // Either this arena is being allocated, or it is already allocated, or it is next. The
        // caller should not try to allocate another arena before waiting for the previous one.
        assert arenaCount == arenaIx : "Arena count " + arenaCount + " but " + arenaIx
            + " is not being allocated";
        if (!allocatedArenas.compareAndSet(arenaCount, -arenaCount - 1)) {
          continue; // CAS race, look again.
        }
        assert data == null;
        init();
        boolean isCommitted = allocatedArenas.compareAndSet(-arenaCount - 1, arenaCount + 1);
        assert isCommitted;
        synchronized (this) {
          this.notifyAll();
        }
        metrics.incrAllocatedArena();
        return allocateWithSplit(arenaIx, freeListIx, dest, ix, size);
      }
    }

    public int offsetFromHeaderIndex(int lastSplitNextHeader) {
      return lastSplitNextHeader << minAllocLog2;
    }

    public int allocateFromFreeListUnderLock(int arenaIx, FreeList freeList, int freeListIx,
        MemoryBuffer[] dest, int ix, int size) {
      int current = freeList.listHead;
      while (current >= 0 && ix < dest.length) {
        int offset = offsetFromHeaderIndex(current);
        // No one else can have this block either allocated or in a different free list;
        // no sync needed.
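        // Header encoding (see makeHeader below): bits 7..1 hold (freeListIx + 1) and bit 0 is
        // the in-use flag, so e.g. freeListIx 2 in use encodes as ((2 + 1) << 1) | 1 = 7;
        // a header of 0 means no block starts at that minAllocation slot.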
        headers[current] = makeHeader(freeListIx, true);
        current = getNextFreeListItem(offset);
        ((LlapDataBuffer)dest[ix]).initialize(arenaIx, data, offset, size);
        ++ix;
      }
      replaceListHeadUnderLock(freeList, current);
      return ix;
    }

    private int getPrevFreeListItem(int offset) {
      return data.getInt(offset);
    }

    private int getNextFreeListItem(int offset) {
      return data.getInt(offset + 4);
    }

    private byte makeHeader(int freeListIx, boolean isInUse) {
      return (byte)(((freeListIx + 1) << 1) | (isInUse ? 1 : 0));
    }

    public void deallocate(LlapDataBuffer buffer) {
      assert data != null;
      int headerIx = buffer.byteBuffer.position() >>> minAllocLog2,
          freeListIx = freeListFromHeader(headers[headerIx]);
      assert freeListIx == (31 - Integer.numberOfLeadingZeros(buffer.allocSize) - minAllocLog2)
          : buffer.allocSize + " " + freeListIx;
      while (true) {
        FreeList freeList = freeLists[freeListIx];
        // The buddy of this block is the block at the same offset with the bit for this
        // block size flipped.
        int bHeaderIx = headerIx ^ (1 << freeListIx);
        freeList.lock.lock();
        try {
          if ((freeListIx == freeLists.length - 1)
              || headers[bHeaderIx] != makeHeader(freeListIx, false)) {
            // The buddy block is allocated, or it is on a higher level of allocation than we
            // are, or we have reached the top level. Add whatever we have got to the current
            // free list.
            addBlockToFreeListUnderLock(freeList, headerIx);
            headers[headerIx] = makeHeader(freeListIx, false);
            break;
          }
          // The buddy block is free and in the same free list we have locked. Take it out
          // for the merge.
          removeBlockFromFreeList(freeList, bHeaderIx);
          headers[bHeaderIx] = headers[headerIx] = 0; // Erase both headers of the blocks to merge.
        } finally {
          freeList.lock.unlock();
        }
        ++freeListIx;
        headerIx = Math.min(headerIx, bHeaderIx);
      }
    }

    private void addBlockToFreeListUnderLock(FreeList freeList, int headerIx) {
      if (freeList.listHead >= 0) {
        int oldHeadOffset = offsetFromHeaderIndex(freeList.listHead);
        assert getPrevFreeListItem(oldHeadOffset) == -1;
        data.putInt(oldHeadOffset, headerIx);
      }
      int offset = offsetFromHeaderIndex(headerIx);
      data.putInt(offset, -1);
      data.putInt(offset + 4, freeList.listHead);
      freeList.listHead = headerIx;
    }

    private void removeBlockFromFreeList(FreeList freeList, int headerIx) {
      int bOffset = offsetFromHeaderIndex(headerIx),
          bpHeaderIx = getPrevFreeListItem(bOffset),
          bnHeaderIx = getNextFreeListItem(bOffset);
      if (freeList.listHead == headerIx) {
        assert bpHeaderIx == -1;
        freeList.listHead = bnHeaderIx;
      }
      if (bpHeaderIx != -1) {
        data.putInt(offsetFromHeaderIndex(bpHeaderIx) + 4, bnHeaderIx);
      }
      if (bnHeaderIx != -1) {
        data.putInt(offsetFromHeaderIndex(bnHeaderIx), bpHeaderIx);
      }
    }
  }

  private static class FreeList {
    ReentrantLock lock = new ReentrantLock(false);
    int listHead = -1; // Header index of the first free block; in minAllocation units.
    // TODO: One possible improvement - store blocks arriving left over from splits, and
    //       blocks requested, to be able to wait for pending splits and reduce fragmentation.
    //       However, we are trying to increase fragmentation now, since we cater to single-size.
  }

  @Override
  public MemoryBuffer createUnallocated() {
    return new LlapDataBuffer();
  }

  @Override
  public String debugDumpForOom() {
    return "\nALLOCATOR STATE:\n" + debugDumpForOomInternal()
        + "\nPARENT STATE:\n" + memoryManager.debugDumpForOom();
  }
}