/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.db;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import com.google.common.base.Preconditions;
import org.apache.cassandra.utils.ByteBufferUtil;
/**
 * A memtable-local allocation buffer.
* <p/>
 * The MemtableAllocator is a bump-the-pointer allocator that allocates
 * large (2MB) regions and then doles them out to threads that request
 * slices into the array.
* <p/>
* The purpose of this class is to combat heap fragmentation in the
* memtable: by ensuring that all name and value column data in a given memtable refer
* only to large regions of contiguous memory, we ensure that large blocks
* get freed up when the memtable is flushed.
* <p/>
 * Otherwise, the byte arrays allocated during insertion end up
 * interleaved throughout the heap, and the old generation gets progressively
* more fragmented until a stop-the-world compacting collection occurs.
* <p/>
* TODO: we should probably benchmark whether word-aligning the allocations
* would provide a performance improvement - probably would speed up the
* Bytes.toLong/Bytes.toInt calls in KeyValue, but some of those are cached
* anyway
*/
public class MemtableAllocator
{
    /** Size of each allocation region: big enough to defeat fragmentation, small enough to waste little on retire. */
    private static final int REGION_SIZE = 2 * 1024 * 1024;

    /** Allocations bigger than this bypass the regions and are cloned straight onto the JVM heap. */
    private static final int MAX_CLONED_SIZE = 256 * 1024; // bigger than this don't go in the region

    /** The region we are currently carving allocations out of; null when none is active. */
    private final AtomicReference<Region> currentRegion = new AtomicReference<Region>();

    /**
     * Regions that filled up and were retired. Keeping them referenced ensures the slices
     * handed out from them stay reachable for the lifetime of this allocator.
     */
    private final Collection<Region> filledRegions = new LinkedBlockingQueue<Region>();

    /**
     * Copy the remaining bytes of the given buffer into memory managed by this allocator.
     * <p/>
     * Buffers no larger than MAX_CLONED_SIZE are packed into the shared regions. Larger
     * buffers are copied into a dedicated heap buffer instead, since they would fill our
     * regions quickly and don't cause fragmentation as badly.
     *
     * @param buffer source data; its position and limit are left untouched
     * @return a new buffer whose remaining bytes are exactly the source's remaining bytes
     */
    public ByteBuffer clone(ByteBuffer buffer)
    {
        assert buffer != null;

        // satisfy large allocations directly from the JVM since they don't cause fragmentation
        // as badly, and would fill up our regions quickly
        if (buffer.remaining() > MAX_CLONED_SIZE)
        {
            // plain heap copy: fresh buffer with position 0 and limit == buffer.remaining()
            ByteBuffer copied = ByteBuffer.allocate(buffer.remaining());
            copied.put(buffer.duplicate());
            copied.flip();
            return copied;
        }

        while (true)
        {
            Region region = getRegion();

            // Try to allocate a slice of the current region
            ByteBuffer cloned = region.allocate(buffer.remaining());
            if (cloned != null)
            {
                // fill the slice, then rewind its position to the start of the copied data
                cloned.mark();
                cloned.put(buffer.duplicate());
                cloned.reset();
                return cloned;
            }

            // not enough space left in this region: retire it and loop for a fresh one
            tryRetireRegion(region);
        }
    }

    /**
     * Try to retire the current region if it is still <code>region</code>.
     * Postcondition is that currentRegion.get() != region.
     */
    private void tryRetireRegion(Region region)
    {
        if (currentRegion.compareAndSet(region, null))
        {
            filledRegions.add(region);
        }
    }

    /**
     * Get the current region, or, if there is no current region, allocate a new one.
     */
    private Region getRegion()
    {
        while (true)
        {
            // Try to get the region
            Region region = currentRegion.get();
            if (region != null)
                return region;

            // No current region, so we want to allocate one. We race
            // against other allocators to CAS in an uninitialized region
            // (which is cheap to allocate)
            region = new Region(REGION_SIZE);
            if (currentRegion.compareAndSet(null, region))
            {
                // we won the race - now we need to actually do the expensive allocation step
                region.init();
                return region;
            }
            // someone else won the race - that's fine, we'll try to grab theirs
            // in the next iteration of the loop.
        }
    }

    /**
     * A region of memory out of which allocations are sliced.
     *
     * This serves two purposes:
     * - to provide a step between initialization and allocation, so that racing to CAS a
     *   new region in is harmless
     * - encapsulates the allocation offset
     */
    private static class Region
    {
        /** Sentinel for nextFreeOffset meaning the backing buffer has not been allocated yet. */
        private static final int UNINITIALIZED = -1;

        /** Actual underlying data; assigned exactly once, by init(). */
        private ByteBuffer data;

        /**
         * Offset of the next allocation, or UNINITIALIZED while the region's memory has
         * not been claimed yet. Doubles as the initialization latch: allocate() spins
         * until init() CASes this to 0, which also publishes {@link #data}.
         */
        private final AtomicInteger nextFreeOffset = new AtomicInteger(UNINITIALIZED);

        /** Total number of allocations satisfied from this region (statistics only). */
        private final AtomicInteger allocCount = new AtomicInteger();

        /** Size of the region in bytes. */
        private final int size;

        /**
         * Create an uninitialized region. Note that memory is not allocated yet, so
         * this is cheap.
         *
         * @param size in bytes
         */
        private Region(int size)
        {
            this.size = size;
        }

        /**
         * Actually claim the memory for this region. This should only be called from
         * the thread that constructed the region. It is thread-safe against other
         * threads calling allocate(), who will spin until the allocation is complete.
         *
         * @throws IllegalStateException if init() is called more than once
         */
        public void init()
        {
            assert nextFreeOffset.get() == UNINITIALIZED;
            data = ByteBuffer.allocate(size);
            assert data.remaining() == data.capacity();

            // Mark it ready for use; the CAS also publishes `data` to threads
            // spinning in allocate(). Only one thread may ever call init().
            if (!nextFreeOffset.compareAndSet(UNINITIALIZED, 0))
                throw new IllegalStateException("Multiple threads tried to init same region");
        }

        /**
         * Try to allocate <code>size</code> bytes from the region.
         *
         * @return a slice bounded to exactly <code>size</code> bytes (position and limit
         *         set around the claimed range), or null to indicate not-enough-space
         */
        public ByteBuffer allocate(int size)
        {
            while (true)
            {
                int oldOffset = nextFreeOffset.get();
                if (oldOffset == UNINITIALIZED)
                {
                    // The region doesn't have its data allocated yet.
                    // Since we found this in currentRegion, we know that whoever
                    // CAS-ed it there is allocating it right now. So spin-loop
                    // shouldn't spin long!
                    Thread.yield();
                    continue;
                }

                if (oldOffset + size > data.capacity())
                    return null;

                // Try to atomically claim this chunk of the region
                if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size))
                {
                    // we got the alloc
                    allocCount.incrementAndGet();
                    ByteBuffer slice = data.duplicate();
                    slice.position(oldOffset);
                    // bound the slice so remaining() == size; without the limit the
                    // caller would see everything up to the end of the region
                    slice.limit(oldOffset + size);
                    return slice;
                }
                // we raced and lost alloc, try again
            }
        }

        @Override
        public String toString()
        {
            return "Region@" + System.identityHashCode(this) +
                   " allocs=" + allocCount.get() + " waste=" +
                   (data.capacity() - nextFreeOffset.get());
        }
    }
}