/*
* JBoss, Home of Professional Open Source.
* See the COPYRIGHT.txt file distributed with this work for information
* regarding copyright ownership. Some portions may be licensed
* to Red Hat, Inc. under one or more contributor license agreements.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301 USA.
*/
package org.teiid.common.buffer.impl;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.lang.ref.WeakReference;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.teiid.common.buffer.AutoCleanupUtil;
import org.teiid.common.buffer.Cache;
import org.teiid.common.buffer.CacheEntry;
import org.teiid.common.buffer.CacheKey;
import org.teiid.common.buffer.ExtensibleBufferedInputStream;
import org.teiid.common.buffer.FileStore;
import org.teiid.common.buffer.Serializer;
import org.teiid.common.buffer.StorageManager;
import org.teiid.core.TeiidComponentException;
import org.teiid.core.TeiidRuntimeException;
import org.teiid.core.util.ExecutorUtils;
import org.teiid.core.util.PropertiesUtils;
import org.teiid.logging.LogConstants;
import org.teiid.logging.LogManager;
import org.teiid.logging.MessageLevel;
import org.teiid.query.QueryPlugin;
/**
* Implements storage against a {@link FileStore} abstraction using a fronting
* memory buffer with a filesystem paradigm. All objects must go through the
* memory (typically off-heap) buffer so that they can be put into their appropriately
* sized storage bucket.
*
* The memory uses a 31bit address space on top of 2^13 byte blocks.
*
* Therefore there is 2^31*2^13 = 2^44 or 16 terabytes max of addressable space.
* This is well beyond any current needs.
*
* The 64 byte inode format is:
* 14 32 bit direct block pointers
* 1 32 bit block indirect pointer
* 1 32 bit block doubly indirect pointer (should be rarely used)
*
* This means that the maximum number of blocks available to an object is
* 14 + (2^13)/4 + ((2^13)/4)^2 ~= 2^22
*
* Thus the max serialized object size is: 2^22*(2^13) ~= 32GB.
*
* Typically the max object size will be much smaller, such as 8MB.
*
* Inodes are held separately from the data/index blocks, and introduce an overhead
* that is ~ 1/128th the size of memory buffer.
*
* The filesystem stores are broken up into block specific sizes starting with 8KB.
*
* The root directory "physicalMapping" is held in memory for performance. It will grow in
* proportion to the number of tables/tuplebuffers in use.
*
* The locking is as fine grained as possible to prevent contention. See {@link PhysicalInfo} for
* flags that are used when it is used as a lock. It is important to not access the
* group maps when a {@link PhysicalInfo} lock is held.
*/
public class BufferFrontedFileStoreCache implements Cache<PhysicalInfo> {
// How long (ms) after a full defrag/truncate before another full run is allowed.
private static final int FULL_DEFRAG_TRUNCATE_TIMEOUT = 10000;
// General wait timeout used elsewhere in the class (120s in nanos).
private static final long TIMEOUT_NANOS = TimeUnit.SECONDS.toNanos(120);
// Minimum reclaimable space (64MB) before a non-forced truncate will shrink a store.
private static final int DEFAULT_MIN_DEFRAG = 1 << 26;
// Each persisted object is prefixed with two longs: gid then oid (2*8 bytes).
private static final int HEADER_BYTES = 16;
private static final int EVICTION_SCANS = 2;
// Max serialized object size default: 8MB. NOTE: the lowercase 'u' typo is
// retained intentionally - this constant is public API and renaming it would
// break external callers.
public static final int DEFAuLT_MAX_OBJECT_SIZE = 1 << 23;
// 31-bit block address space; see the class javadoc for the layout math.
static final int ADDRESS_BITS = 31;
static final int SYSTEM_MASK = 1<<ADDRESS_BITS;
static final int BYTES_PER_BLOCK_ADDRESS = 4;
// Inode = 16 4-byte pointers = 64 bytes: 14 direct + 1 indirect + 1 doubly indirect.
static final int INODE_BYTES = 16*BYTES_PER_BLOCK_ADDRESS;
static final int LOG_INODE_SIZE = 6;
static final int DIRECT_POINTERS = 14;
// Sentinel pointer values stored in inode/index blocks.
static final int EMPTY_ADDRESS = -1;
static final int FREED = -2;
//TODO allow the block size to be configurable. 8k is a reasonable default up to a gig, but we could be more efficient with larger blocks from there.
//the rationale for a smaller block size is to reduce internal fragmentation, which is critical when maintaining a relatively small buffer < 256MB
static final int LOG_BLOCK_SIZE = 13;
public static final long MAX_ADDRESSABLE_MEMORY = 1l<<(ADDRESS_BITS+LOG_BLOCK_SIZE);
// 8KB blocks; BLOCK_MASK extracts the intra-block offset.
static final int BLOCK_SIZE = 1 << LOG_BLOCK_SIZE;
static final int BLOCK_MASK = BLOCK_SIZE - 1;
// 2048 4-byte pointers fit in one index block.
static final int ADDRESSES_PER_BLOCK = BLOCK_SIZE/BYTES_PER_BLOCK_ADDRESS;
// Highest block index reachable via the single-indirect pointer.
static final int MAX_INDIRECT = DIRECT_POINTERS + ADDRESSES_PER_BLOCK;
// Highest block index reachable at all (direct + indirect + doubly indirect).
static final int MAX_DOUBLE_INDIRECT = MAX_INDIRECT + ADDRESSES_PER_BLOCK * ADDRESSES_PER_BLOCK;
/**
 * Access mode for {@code getOrUpdateDataBlockIndex}: read an existing pointer,
 * overwrite a pointer, or allocate a new data block and record its pointer.
 */
private enum Mode {
    GET,
    UPDATE,
    ALLOCATE
}
/**
 * {@link BlockManager} over a single inode. Translates logical block indexes
 * into physical memory-buffer blocks via the inode's direct / indirect /
 * doubly-indirect pointers, allocating or freeing blocks as needed.
 * Not thread-safe; each instance duplicates the shared {@link BlockByteBuffer}s
 * so its buffer positions are private to this instance.
 */
private final class InodeBlockManager implements BlockManager {
    private int inode;
    // lazily-created slice over this inode's 64 bytes
    private ByteBuffer inodeBuffer;
    private final long gid;
    private final long oid;
    // preferred segment of the block bitset to allocate from
    private int blockSegment;
    private BlockByteBuffer blockByteBufferCopy = BufferFrontedFileStoreCache.this.blockByteBuffer.duplicate();
    private BlockByteBuffer inodeByteBufferCopy = BufferFrontedFileStoreCache.this.inodeByteBuffer.duplicate();

    InodeBlockManager(long gid, long oid, int inode) {
        this.inode = inode;
        this.gid = gid;
        this.oid = oid;
        this.blockSegment = blocksInuse.getNextSegment();
    }

    @Override
    public int getInode() {
        return inode;
    }

    @Override
    public ByteBuffer getBlock(int index) {
        // resolve the logical index to a physical data block and expose it
        int dataBlock = getOrUpdateDataBlockIndex(index, EMPTY_ADDRESS, Mode.GET);
        return blockByteBufferCopy.getByteBuffer(dataBlock);
    }

    /**
     * Core pointer walk: resolves logical block {@code index} through the
     * inode's direct / indirect / doubly-indirect pointers. Depending on
     * {@code mode}: GET returns the current pointer, UPDATE stores
     * {@code value} (returning the prior pointer), ALLOCATE grabs a fresh
     * data block and records it.
     */
    private int getOrUpdateDataBlockIndex(int index, int value, Mode mode) {
        if (index >= MAX_DOUBLE_INDIRECT) {
            throw new TeiidRuntimeException(QueryPlugin.Event.TEIID30045, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30045));
        }
        int dataBlock = 0;
        int position = 0;
        ByteBuffer info = getInodeBlock();
        if (index >= MAX_INDIRECT) {
            // doubly indirect: hop through two levels of index blocks
            position = BYTES_PER_BLOCK_ADDRESS*(DIRECT_POINTERS+1);
            ByteBuffer next = updateIndirectBlockInfo(info, index, position, MAX_INDIRECT, value, mode);
            if (next != null) {
                info = next;
                //should have traversed to the secondary
                int indirectAddressBlock = (index - MAX_INDIRECT) / ADDRESSES_PER_BLOCK;
                position = info.position() + indirectAddressBlock * BYTES_PER_BLOCK_ADDRESS;
                if (mode == Mode.ALLOCATE && position + BYTES_PER_BLOCK_ADDRESS < info.limit()) {
                    // keep the invariant that the pointer after the last used one is EMPTY
                    info.putInt(position + BYTES_PER_BLOCK_ADDRESS, EMPTY_ADDRESS);
                }
                next = updateIndirectBlockInfo(info, index, position, MAX_INDIRECT + indirectAddressBlock * ADDRESSES_PER_BLOCK, value, mode);
                if (next != null) {
                    info = next;
                    position = info.position() + ((index - MAX_INDIRECT)%ADDRESSES_PER_BLOCK) * BYTES_PER_BLOCK_ADDRESS;
                }
            }
        } else if (index >= DIRECT_POINTERS) {
            //indirect
            position = BYTES_PER_BLOCK_ADDRESS*DIRECT_POINTERS;
            ByteBuffer next = updateIndirectBlockInfo(info, index, position, DIRECT_POINTERS, value, mode);
            if (next != null) {
                info = next;
                position = next.position() + (index - DIRECT_POINTERS) * BYTES_PER_BLOCK_ADDRESS;
            }
        } else {
            // direct pointer lives in the inode itself
            position = BYTES_PER_BLOCK_ADDRESS*index;
        }
        if (mode == Mode.ALLOCATE) {
            dataBlock = nextBlock(info, true);
            info.putInt(position, dataBlock);
            if (mode == Mode.ALLOCATE && position + BYTES_PER_BLOCK_ADDRESS < info.limit()) {
                //maintain the invariant that the next pointer is empty
                info.putInt(position + BYTES_PER_BLOCK_ADDRESS, EMPTY_ADDRESS);
            }
        } else {
            dataBlock = info.getInt(position);
            if (mode == Mode.UPDATE) {
                info.putInt(position, value);
            }
        }
        return dataBlock;
    }

    /**
     * Handles the pointer at {@code position} when it refers to an index
     * (indirect) block. At the {@code cutOff} boundary an ALLOCATE creates the
     * index block, while an UPDATE-to-EMPTY frees it (returning null to stop
     * the traversal).
     */
    private ByteBuffer updateIndirectBlockInfo(ByteBuffer buf, int index, int position, int cutOff, int value, Mode mode) {
        int sib_index = buf.getInt(position);
        if (index == cutOff) {
            if (mode == Mode.ALLOCATE) {
                sib_index = nextBlock(buf, false);
                buf.putInt(position, sib_index);
            } else if (mode == Mode.UPDATE && value == EMPTY_ADDRESS) {
                freeDataBlock(sib_index);
                return null;
            }
        }
        return blockByteBufferCopy.getByteBuffer(sib_index);
    }

    /**
     * Get the next dataBlock. When the memory buffer is full we have some
     * book keeping to do.
     * @param reading
     * @return
     */
    private int nextBlock(ByteBuffer reading, boolean data) {
        int limit = reading.limit();
        int position = reading.position();
        int next = EMPTY_ADDRESS;
        memoryEvictionLock.readLock().lock();
        boolean readLocked = true;
        try {
            if ((next = blocksInuse.getAndSetNextClearBit(blockSegment)) == EMPTY_ADDRESS) {
                // no free block: release the read lock before evicting, since
                // eviction may need the write lock
                memoryEvictionLock.readLock().unlock();
                readLocked = false;
                next = evictFromMemoryBuffer(true);
            }
        } finally {
            if (readLocked) {
                memoryEvictionLock.readLock().unlock();
            }
        }
        if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.TRACE)) {
            LogManager.logTrace(LogConstants.CTX_BUFFER_MGR, "Allocating", data?"data":"index", "block", next, "to", gid, oid); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$
        }
        //restore the reading buffer
        if (reading.limit() != limit) {
            reading.rewind();
            reading.limit(limit);
            reading.position(position);
        }
        return next;
    }

    @Override
    public void freeBlock(int index) {
        // clear the pointer in the inode, then release the physical block
        int dataBlock = getOrUpdateDataBlockIndex(index, EMPTY_ADDRESS, Mode.UPDATE);
        freeDataBlock(dataBlock);
    }

    private void freeDataBlock(int dataBlock) {
        if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.TRACE)) {
            LogManager.logTrace(LogConstants.CTX_BUFFER_MGR, "freeing data block", dataBlock, "for", gid, oid); //$NON-NLS-1$ //$NON-NLS-2$
        }
        blocksInuse.clear(dataBlock);
    }

    /**
     * Lazily resolves (allocating on first use) the ByteBuffer slice over this
     * manager's inode. On first allocation the recursive call obtains the
     * freshly-assigned inode's buffer so the first pointer can be initialized
     * to EMPTY_ADDRESS.
     */
    private ByteBuffer getInodeBlock() {
        if (inodeBuffer == null) {
            if (inode == EMPTY_ADDRESS) {
                this.inode = inodesInuse.getAndSetNextClearBit();
                if (this.inode == -1) {
                    throw new AssertionError("Out of inodes"); //$NON-NLS-1$
                }
                if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                    LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Allocating inode", this.inode, "to", gid, oid); //$NON-NLS-1$ //$NON-NLS-2$
                }
                ByteBuffer bb = getInodeBlock();
                bb.putInt(EMPTY_ADDRESS);
            }
            inodeBuffer = inodeByteBufferCopy.getByteBuffer(inode).slice();
        }
        return inodeBuffer;
    }

    /**
     * Releases the inode and every block it references. When {@code acquire}
     * is true the first data block is kept (returned to the caller) rather
     * than freed; otherwise FREED is returned.
     */
    @Override
    public int free(boolean acquire) {
        if (this.inode == EMPTY_ADDRESS) {
            return EMPTY_ADDRESS;
        }
        ByteBuffer bb = getInodeBlock();
        int dataBlockToAcquire = bb.getInt(0);
        int indirectIndexBlock = bb.getInt(BYTES_PER_BLOCK_ADDRESS*DIRECT_POINTERS);
        int doublyIndirectIndexBlock = bb.getInt(BYTES_PER_BLOCK_ADDRESS*(DIRECT_POINTERS+1));
        // free the direct pointers (skipping the first one if it is being acquired)
        boolean freedAll = freeBlock(acquire?BYTES_PER_BLOCK_ADDRESS:0, bb, DIRECT_POINTERS-(acquire?1:0), true);
        if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
            LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "freeing inode", inode, "for", gid, oid); //$NON-NLS-1$ //$NON-NLS-2$
        }
        inodesInuse.clear(inode);
        if (!freedAll || indirectIndexBlock == EMPTY_ADDRESS) {
            return acquire?dataBlockToAcquire:FREED;
        }
        freedAll = freeIndirectBlock(indirectIndexBlock);
        if (!freedAll || doublyIndirectIndexBlock == EMPTY_ADDRESS) {
            return acquire?dataBlockToAcquire:FREED;
        }
        // second-level index block: each entry is itself an indirect block
        bb = blockByteBufferCopy.getByteBuffer(doublyIndirectIndexBlock).slice();
        freeBlock(0, bb, ADDRESSES_PER_BLOCK, false);
        freeDataBlock(doublyIndirectIndexBlock);
        return acquire?dataBlockToAcquire:FREED;
    }

    private boolean freeIndirectBlock(int indirectIndexBlock) {
        ByteBuffer bb = blockByteBufferCopy.getByteBuffer(indirectIndexBlock);
        boolean freedAll = freeBlock(bb.position(), bb, ADDRESSES_PER_BLOCK, true);
        freeDataBlock(indirectIndexBlock);
        return freedAll;
    }

    /**
     * Frees up to {@code numPointers} pointers starting at {@code startPosition}.
     * Returns false on the first EMPTY pointer (pointers are written densely,
     * so EMPTY marks the end). {@code primary} distinguishes data blocks from
     * indirect index blocks.
     */
    private boolean freeBlock(int startPosition, ByteBuffer ib, int numPointers, boolean primary) {
        ib.position(startPosition);
        for (int i = 0; i < numPointers; i++) {
            int dataBlock = ib.getInt();
            if (dataBlock == EMPTY_ADDRESS) {
                return false;
            }
            if (primary) {
                freeDataBlock(dataBlock);
            } else {
                freeIndirectBlock(dataBlock);
            }
        }
        return true;
    }

    @Override
    public ByteBuffer allocateBlock(int blockNum) {
        int dataBlock = getOrUpdateDataBlockIndex(blockNum, EMPTY_ADDRESS, Mode.ALLOCATE);
        return blockByteBufferCopy.getByteBuffer(dataBlock);
    }
}
// Backing filesystem storage for blocks evicted from the memory buffer.
private StorageManager storageManager;
private int maxStorageObjectSize = DEFAuLT_MAX_OBJECT_SIZE;
private long memoryBufferSpace = 1 << 26; //64MB
// whether to allocate the memory buffer off-heap (direct ByteBuffers)
private boolean direct;
// max blocks a single object may occupy in the memory buffer
private int maxMemoryBlocks;
private AtomicLong readAttempts = new AtomicLong();
// LRFU ordering of entries currently resident in the memory buffer
LrfuEvictionQueue<PhysicalInfo> memoryBufferEntries = new LrfuEvictionQueue<PhysicalInfo>(readAttempts);
private Semaphore memoryWritePermits; //prevents deadlock waiting for free blocks
// read side taken during allocation, write side during mass eviction
private ReentrantReadWriteLock memoryEvictionLock = new ReentrantReadWriteLock(true);
private ReentrantLock freedLock = new ReentrantLock();
private Condition blocksFreed = freedLock.newCondition();
// total number of data blocks in the memory buffer
private int blocks;
private ConcurrentBitSet blocksInuse;
private BlockByteBuffer blockByteBuffer;
private ConcurrentBitSet inodesInuse;
private BlockByteBuffer inodeByteBuffer;
//root directory
private ConcurrentHashMap<Long, Map<Long, PhysicalInfo>> physicalMapping = new ConcurrentHashMap<Long, Map<Long, PhysicalInfo>>(16, .75f, BufferManagerImpl.CONCURRENCY_LEVEL);
// one store per block size; index i holds blocks of size BLOCK_SIZE << i
private BlockStore[] sizeBasedStores;
private ExecutorService asynchPool = ExecutorUtils.newFixedThreadPool(2, "FileStore Worker"); //$NON-NLS-1$
private AtomicBoolean defragRunning = new AtomicBoolean();
private AtomicInteger freedCounter = new AtomicInteger();
private boolean compactBufferFiles = PropertiesUtils.getBooleanProperty(System.getProperties(), "org.teiid.compactBufferFiles", false); //$NON-NLS-1$
// run a truncate every N defrag runs
private int truncateInterval = 4;
//defrag to release freespace held by storage files
/**
 * Background task that compacts the size-based storage files by moving the
 * highest in-use block of each segment toward the front, and periodically
 * truncates files to release free space back to the filesystem.
 */
final class DefragTask implements Runnable {
    private AtomicInteger runs = new AtomicInteger();

    @Override
    public void run() {
        int count = runs.incrementAndGet();
        try {
            defrag(false);
            // every truncateInterval-th run also shrinks the files
            if ((count%truncateInterval)==0) {
                truncate(false);
            }
        } catch (Throwable t) {
            LogManager.logWarning(LogConstants.CTX_BUFFER_MGR, t, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30022));
        } finally {
            defragRunning.set(false);
        }
    }

    /**
     * Truncates every segment of every store.
     * @param anySpace if true (or compaction is forced), reclaim any trailing
     *        free space; otherwise only shrink by large amounts
     * @return total bytes reclaimed
     */
    private long truncate(boolean anySpace) {
        anySpace |= compactBufferFiles;
        long freed = 0;
        for (int i = 0; i < sizeBasedStores.length; i++) {
            BlockStore blockStore = sizeBasedStores[i];
            for (int segment = 0; segment < blockStore.stores.length; segment++) {
                freed += truncate(blockStore, segment, anySpace);
            }
        }
        if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
            LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Finished truncate reclaimed", freed); //$NON-NLS-1$
        }
        return freed;
    }

    /**
     * Moves trailing in-use blocks toward the front of each segment so that a
     * subsequent truncate can reclaim the tail. Each move is retried against
     * concurrent writers via the PhysicalInfo lock.
     */
    private void defrag(boolean all) {
        all |= compactBufferFiles;
        if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
            LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Running defrag"); //$NON-NLS-1$
        }
        for (int i = 0; i < sizeBasedStores.length; i++) {
            BlockStore blockStore = sizeBasedStores[i];
            for (int segment = 0; segment < blockStore.stores.length; segment++) {
                if (!shouldDefrag(blockStore, segment, all)) {
                    continue;
                }
                if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                    LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Defraging store", i, "segment", segment, "length", blockStore.stores[segment].getLength()); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
                }
                try {
                    for (int retries = 0; retries < 10; retries++) {
                        int relativeBlockToMove = blockStore.blocksInUse.compactHighestBitSet(segment);
                        if (!shouldDefrag(blockStore, segment, all) || relativeBlockToMove == -1) {
                            break;
                        }
                        //move the block if possible
                        //widen to long before multiplying - the byte offset can exceed Integer.MAX_VALUE
                        InputStream is = blockStore.stores[segment].createInputStream(relativeBlockToMove * (long)blockStore.blockSize, blockStore.blockSize);
                        Long gid = null;
                        Long oid = null;
                        try {
                            gid = readLong(is);
                            oid = readLong(is);
                        } catch (IOException e) {
                            continue; //can happen the bit was set and no data exists
                        }
                        is.reset(); //move back to the beginning
                        Map<Long, PhysicalInfo> map = physicalMapping.get(gid);
                        if (map == null) {
                            continue;
                        }
                        PhysicalInfo info = map.get(oid);
                        if (info == null) {
                            continue;
                        }
                        int bitIndex = relativeBlockToMove + (segment * blockStore.blocksInUse.getBitsPerSegment());
                        synchronized (info) {
                            info.await(true, false);
                            if (info.block == EMPTY_ADDRESS) {
                                continue;
                            }
                            if (info.block != bitIndex) {
                                //we've marked a bit in use, but haven't yet written new data
                                continue;
                            }
                        }
                        int newBlock = blockStore.writeToStorageBlock(info, is);
                        synchronized (info) {
                            info.await(true, true);
                            if (info.block == EMPTY_ADDRESS) {
                                //already removed;
                                if (newBlock != EMPTY_ADDRESS) {
                                    blockStore.blocksInUse.clear(newBlock);
                                }
                                continue;
                            }
                            info.block = newBlock;
                            blockStore.blocksInUse.clear(bitIndex);
                        }
                    }
                } catch (IOException e) {
                    LogManager.logWarning(LogConstants.CTX_BUFFER_MGR, e, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30022));
                }
            }
        }
    }

    /**
     * Reads a big-endian long (the gid/oid header values) from the stream.
     */
    private long readLong(InputStream is) throws IOException {
        long val = 0;
        for (int k = 0; k < 8; k++) {
            // The cast to long is required: without it this is an int shift,
            // and int shift distances are taken mod 32 (JLS 15.19), so shifts
            // of 56/48/40/32 would silently corrupt the high four bytes.
            val += ((long)(is.read() & 255) << (56-k*8));
        }
        return val;
    }

    /**
     * Truncates a single segment under its write lock. With {@code anySpace}
     * any trailing free space is released; otherwise the file is only halved,
     * and only when at least 2*minDefrag would be reclaimed.
     */
    private long truncate(BlockStore blockStore, int segment, boolean anySpace) {
        //truncate the file
        blockStore.locks[segment].writeLock().lock();
        try {
            int endBlock = blockStore.blocksInUse.compactHighestBitSet(segment);
            //widen to long before multiplying to avoid int overflow for large files
            long newLength = (endBlock + 1) * (long)blockStore.blockSize;
            long oldLength = blockStore.stores[segment].getLength();
            if (anySpace) {
                if (newLength < oldLength) {
                    blockStore.stores[segment].setLength(newLength);
                    if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                        LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Truncating segment", segment, "to", newLength); //$NON-NLS-1$ //$NON-NLS-2$
                    }
                    return oldLength - newLength;
                }
            } else {
                long desiredLength = ((oldLength/blockStore.blockSize)/2)*blockStore.blockSize;
                if (newLength < oldLength && newLength <= desiredLength && oldLength - desiredLength >= 2*minDefrag) {
                    blockStore.stores[segment].setLength(desiredLength);
                    if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                        LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Truncating segment", segment, "to", desiredLength); //$NON-NLS-1$ //$NON-NLS-2$
                    }
                }
                return oldLength - desiredLength;
            }
        } catch (IOException e) {
            LogManager.logWarning(LogConstants.CTX_BUFFER_MGR, e, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30023));
        } finally {
            blockStore.locks[segment].writeLock().unlock();
        }
        return 0;
    }
};
final DefragTask defragTask = new DefragTask();
// timestamp of the last forced defrag/truncate (guarded by synchronized(this))
private long lastFullRun;
// true while the async cleaningTask is queued/running (CAS guarded)
AtomicBoolean cleanerRunning = new AtomicBoolean();
// Evicts entries from the memory buffer until free space is above the
// cleaning threshold or nothing more can be evicted.
private final Runnable cleaningTask = new Runnable() {
    @Override
    public void run() {
        try {
            while (lowBlocks(false)) {
                if (evictFromMemoryBuffer(false) == EMPTY_ADDRESS) {
                    break;
                }
            }
        } finally {
            cleanerRunning.set(false);
        }
    }
};
// free-block watermarks computed in initialize()
private int cleaningThreshold;
private int criticalCleaningThreshold;
// statistics counters
private AtomicLong storageWrites = new AtomicLong();
private AtomicLong storageReads = new AtomicLong();
private long minDefrag = DEFAULT_MIN_DEFRAG;
private BufferManagerImpl bufferManager;
@Override
public void initialize() throws TeiidComponentException {
    // delegate to the full variant with memory allocation enabled
    initialize(true);
}
/**
 * Sizes and allocates the memory buffer, bitsets, permits and the size-based
 * storage tiers.
 * NOTE(review): the {@code allocateMemory} flag is not referenced in this
 * body - confirm whether {@link BlockByteBuffer} was meant to receive it.
 */
void initialize(boolean allocateMemory) throws TeiidComponentException {
    storageManager.initialize();
    // the buffer must be able to hold at least one max-sized object
    memoryBufferSpace = Math.max(memoryBufferSpace, maxStorageObjectSize);
    // reserve 1/(ADDRESSES_PER_BLOCK+1) of the space for inode overhead
    blocks = (int) Math.min(Integer.MAX_VALUE, (memoryBufferSpace>>LOG_BLOCK_SIZE)*ADDRESSES_PER_BLOCK/(ADDRESSES_PER_BLOCK+1));
    inodesInuse = new ConcurrentBitSet(blocks+1, BufferManagerImpl.CONCURRENCY_LEVEL);
    blocksInuse = new ConcurrentBitSet(blocks, BufferManagerImpl.CONCURRENCY_LEVEL);
    this.blockByteBuffer = new BlockByteBuffer(30, blocks, LOG_BLOCK_SIZE, direct);
    //ensure that we'll run out of blocks first
    this.inodeByteBuffer = new BlockByteBuffer(30, blocks+1, LOG_INODE_SIZE, direct);
    memoryWritePermits = new Semaphore(blocks);
    // blocks needed for the largest storable object, rounded up
    maxMemoryBlocks = Math.min(MAX_DOUBLE_INDIRECT, blocks);
    maxMemoryBlocks = Math.min(maxMemoryBlocks, (maxStorageObjectSize>>LOG_BLOCK_SIZE) + ((maxStorageObjectSize&BufferFrontedFileStoreCache.BLOCK_MASK)>0?1:0));
    //try to maintain enough freespace so that writers don't block in cleaning
    cleaningThreshold = Math.min(maxMemoryBlocks<<4, blocks>>1);
    criticalCleaningThreshold = Math.min(maxMemoryBlocks<<2, blocks>>2);
    //account for index pointer block overhead
    if (maxMemoryBlocks > DIRECT_POINTERS) {
        maxMemoryBlocks--;
    }
    if (maxMemoryBlocks > MAX_INDIRECT) {
        int indirect = maxMemoryBlocks-MAX_INDIRECT;
        maxMemoryBlocks -= (indirect/ADDRESSES_PER_BLOCK + (indirect%ADDRESSES_PER_BLOCK>0?1:0) + 1);
    }
    // create doubling block-size tiers until one can hold the max object
    List<BlockStore> stores = new ArrayList<BlockStore>();
    long size = BLOCK_SIZE;
    int files = 32; //this allows us to have 64 terabytes of smaller block sizes
    do {
        stores.add(new BlockStore(this.storageManager, (int)size, 30, files));
        size <<=1;
        if (files > 1) {
            files >>= 1;
        }
    } while ((size>>1) < maxStorageObjectSize);
    this.sizeBasedStores = stores.toArray(new BlockStore[stores.size()]);
    this.truncateInterval = compactBufferFiles?1:8;
}
/**
 * True when the memory buffer is running low on free blocks and there is at
 * least one entry that could be evicted to make room.
 * @param critical use the smaller critical watermark instead of the normal one
 */
boolean lowBlocks(boolean critical) {
    int used = blocksInuse.getBitsSet();
    if (used <= 0) {
        // nothing allocated yet - cleaning would accomplish nothing
        return false;
    }
    int watermark = critical ? criticalCleaningThreshold : cleaningThreshold;
    if (blocks - used >= watermark) {
        return false;
    }
    return memoryBufferEntries.firstEntry(false) != null;
}
/**
 * Factory for a per-object block manager bound to the given group/object id
 * and existing inode (EMPTY_ADDRESS to allocate lazily).
 */
InodeBlockManager getBlockManager(long gid, long oid, int inode) {
    return new InodeBlockManager(gid, oid, inode);
}
/**
 * Serializes the entry into the memory buffer. Returns true when the caller
 * may drop the tier-1 reference (added, already present, or not worth
 * buffering); false when another thread is concurrently adding the same entry.
 * Never throws - serialization failures are logged and the partial state is
 * cleaned up in the finally block.
 */
@SuppressWarnings("unchecked")
@Override
public boolean add(CacheEntry entry, Serializer s) {
    if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
        LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "adding object", s.getId(), entry.getId()); //$NON-NLS-1$
    }
    boolean newEntry = false;
    InodeBlockManager blockManager = null;
    boolean hasPermit = false;
    PhysicalInfo info = null;
    boolean success = false;
    int memoryBlocks = this.maxMemoryBlocks;
    try {
        Map<Long, PhysicalInfo> map = physicalMapping.get(s.getId());
        if (map == null) {
            return true; //already removed
        }
        // double-checked creation of the PhysicalInfo under the map's monitor
        info = map.get(entry.getId());
        if (info == null) {
            synchronized (map) {
                info = map.get(entry.getId());
                if (info == null) {
                    newEntry = true;
                    // a null placeholder is put by addToCacheGroup; absence means removal raced us
                    if (!map.containsKey(entry.getId())) {
                        return true; //already removed
                    }
                    info = new PhysicalInfo(s.getId(), entry.getId(), EMPTY_ADDRESS, readAttempts.get(), entry.getSizeEstimate());
                    info.adding = true;
                    map.put(entry.getId(), info);
                }
            }
        }
        if (!newEntry) {
            synchronized (info) {
                if (info.adding) {
                    return false; //someone else is responsible for adding this cache entry
                }
                if (info.evicting || info.inode != EMPTY_ADDRESS
                        || !shouldPlaceInMemoryBuffer(0, info)) {
                    return true; //safe to remove from tier 1
                }
                info.adding = true;
                //second chance re-add to the cache, we assume that serialization would be faster than a disk read
                memoryBlocks = info.memoryBlockCount;
            }
        }
        checkForLowMemory();
        // hold enough permits for the worst-case block usage to avoid deadlock
        memoryWritePermits.acquire(memoryBlocks);
        hasPermit = true;
        blockManager = getBlockManager(s.getId(), entry.getId(), EMPTY_ADDRESS);
        BlockOutputStream bos = new BlockOutputStream(blockManager, memoryBlocks);
        // gid/oid header precedes the serialized payload (see HEADER_BYTES)
        bos.writeLong(s.getId());
        bos.writeLong(entry.getId());
        ObjectOutput dos = new ObjectOutputStream(bos);
        s.serialize(entry.getObject(), dos);
        dos.close();
        //synchronized to ensure proper cleanup from a concurrent removal
        synchronized (map) {
            if (physicalMapping.containsKey(s.getId()) && map.containsKey(entry.getId())) {
                synchronized (info) {
                    //set the size first, since it may raise an exceptional condition
                    info.setSize(bos.getBytesWritten());
                    info.inode = blockManager.getInode();
                    memoryBufferEntries.add(info);
                }
                success = true;
            }
        }
    } catch (Throwable e) {
        if (e == PhysicalInfo.sizeChanged) {
            //entries are mutable after adding, the original should be removed shortly so just ignore
            LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Object ", entry.getId(), " changed size since first persistence, keeping the original."); //$NON-NLS-1$ //$NON-NLS-2$
        } else if (e == BlockOutputStream.exceededMax){
            // re-serialize into a counting sink just to report the real size
            final long[] size = new long[1];
            try {
                ObjectOutput dos = new ObjectOutputStream(new OutputStream() {
                    @Override
                    public void write(int b) throws IOException {
                        size[0]++;
                    }
                });
                s.serialize(entry.getObject(), dos);
            } catch (IOException e1) {
            }
            LogManager.logError(LogConstants.CTX_BUFFER_MGR,
                    QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30001,s.getId(), entry.getId(), entry.getSizeEstimate(), size[0], s.describe(entry.getObject())));
        } else {
            LogManager.logError(LogConstants.CTX_BUFFER_MGR, e, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30002,s.getId(), entry.getId()));
        }
    } finally {
        if (hasPermit) {
            memoryWritePermits.release(memoryBlocks);
        }
        if (info != null) {
            synchronized (info) {
                info.adding = false;
                if (!success && blockManager != null) {
                    //invalidate for safety
                    info.inode = EMPTY_ADDRESS;
                }
            }
        }
        if (!success && blockManager != null) {
            blockManager.free(false);
        }
    }
    return true;
}
/**
 * Proactively frees memory-buffer space: queues the async cleaner (at most
 * one instance, guaranteed by the CAS) and, when critically low, performs a
 * single non-blocking eviction on the calling thread.
 */
private void checkForLowMemory() {
    //proactively create freespace
    if (!cleanerRunning.get() && lowBlocks(false) && cleanerRunning.compareAndSet(false, true)) {
        LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Starting memory buffer cleaner"); //$NON-NLS-1$
        asynchPool.execute(cleaningTask);
    }
    if (lowBlocks(true)) {
        //do a non-blocking removal before we're forced to block
        evictFromMemoryBuffer(false);
    }
}
/**
 * Looks up the entry and acquires its load lock so that a subsequent
 * {@link #get} is the only loader. Returns null when the group or entry has
 * already been removed.
 */
@Override
public PhysicalInfo lockForLoad(Long oid, Serializer<?> serializer) {
    Map<Long, PhysicalInfo> group = physicalMapping.get(serializer.getId());
    if (group == null) {
        return null;
    }
    PhysicalInfo entry = group.get(oid);
    if (entry != null) {
        entry.lockForLoad();
    }
    return entry;
}
/**
 * Releases the load lock taken by {@link #lockForLoad}; a null argument
 * (nothing was locked) is a no-op.
 */
@Override
public void unlockForLoad(PhysicalInfo info) {
    if (info != null) {
        info.unlockForLoad();
    }
}
/**
 * Deserializes an entry, preferring the memory buffer; otherwise the entry is
 * first transferred from its storage block back into memory (via
 * {@link #readIntoMemory}) so file locks are held as briefly as possible.
 * The caller must hold the load lock ({@link #lockForLoad}).
 * Returns null when the entry no longer exists anywhere.
 */
@Override
public CacheEntry get(PhysicalInfo info, Long oid,
        WeakReference<? extends Serializer<?>> ref)
        throws TeiidComponentException {
    if (info == null) {
        return null;
    }
    Serializer<?> serializer = ref.get();
    if (serializer == null) {
        return null;
    }
    readAttempts.incrementAndGet();
    InputStream is = null;
    Lock lock = null;
    ExtensibleBufferedInputStream eis = null;
    int memoryBlocks = 0;
    try {
        synchronized (info) {
            assert !info.pinned && info.loading; //load should be locked
            info.await(true, false); //not necessary, but should make things safer
            if (info.inode != EMPTY_ADDRESS) {
                // still resident in the memory buffer - pin and read directly
                info.pinned = true;
                memoryBufferEntries.touch(info);
                if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                    LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Getting object at inode", info.inode, serializer.getId(), oid); //$NON-NLS-1$
                }
                BlockManager manager = getBlockManager(serializer.getId(), oid, info.inode);
                is = new BlockInputStream(manager, info.memoryBlockCount);
            } else if (info.block != EMPTY_ADDRESS) {
                // only on disk - set up the storage stream and segment lock
                info.pinned = true;
                memoryBufferEntries.recordAccess(info);
                storageReads.incrementAndGet();
                if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                    LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Getting object at block", info.block, info.sizeIndex, serializer.getId(), oid); //$NON-NLS-1$
                }
                BlockStore blockStore = sizeBasedStores[info.sizeIndex];
                int segment = info.block/blockStore.blocksInUse.getBitsPerSegment();
                FileStore fs = blockStore.stores[segment];
                // NOTE(review): int*int product assigned to long - could overflow
                // for very large segments; confirm against BlockStore sizing.
                long blockOffset = (info.block%blockStore.blocksInUse.getBitsPerSegment())*blockStore.blockSize;
                eis = fs.createInputStream(blockOffset, info.memoryBlockCount<<LOG_BLOCK_SIZE);
                lock = blockStore.locks[segment].writeLock();
                memoryBlocks = info.memoryBlockCount;
            } else {
                return null;
            }
        }
        if (lock != null) {
            is = readIntoMemory(info, eis, lock, memoryBlocks);
        }
        // skip the gid/oid header written by add()
        for (int i = 0; i < HEADER_BYTES; i++) {
            is.read();
        }
        ObjectInput dis = new ObjectInputStream(is);
        CacheEntry ce = new CacheEntry(new CacheKey(oid, 1, 1), info.sizeEstimate, serializer.deserialize(dis), ref, true);
        return ce;
    } catch(IOException e) {
        throw new TeiidComponentException(QueryPlugin.Event.TEIID30048, e, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30048, info.gid, oid));
    } catch (ClassNotFoundException e) {
        throw new TeiidComponentException(QueryPlugin.Event.TEIID30048, e, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30048, info.gid, oid));
    } catch (InterruptedException e) {
        throw new TeiidRuntimeException(QueryPlugin.Event.TEIID30049, e);
    } finally {
        synchronized (info) {
            info.pinned = false;
            info.notifyAll();
        }
    }
}
/**
 * Transfers the object's bytes from its storage block into freshly allocated
 * memory-buffer blocks so the file lock (and eventually the storage block)
 * can be released before deserialization.
 *
 * @param info the entry being loaded (load lock held by the caller)
 * @param is buffered stream over the storage block
 * @param fileLock the owning segment's write lock, locked only around the copy
 * @param memoryBlocks number of buffer blocks the object occupies
 * @return a {@link BlockInputStream} over the in-memory copy
 */
private InputStream readIntoMemory(PhysicalInfo info, ExtensibleBufferedInputStream is,
        Lock fileLock, int memoryBlocks) throws InterruptedException,
        IOException {
    checkForLowMemory();
    this.memoryWritePermits.acquire(memoryBlocks);
    BlockManager manager = null;
    boolean success = false;
    boolean locked = false;
    try {
        manager = getBlockManager(info.gid, info.getId(), EMPTY_ADDRESS);
        //preallocate the memory area, to ensure we won't exhaust memory while holding
        //the file lock
        for (int i = 0; i < memoryBlocks; i++) {
            manager.allocateBlock(i);
        }
        fileLock.lock();
        locked = true;
        ExtensibleBufferedOutputStream os = new BlockOutputStream(manager, -1);
        //TODO: there is still an extra buffer being created here, we could FileChannels to do better
        ByteBuffer bb = null;
        while ((bb = is.getBuffer()) != null) {
            byte[] array = bb.array();
            os.write(array, bb.position() + bb.arrayOffset(), bb.remaining());
            bb.position(bb.position()+bb.remaining());
        }
        fileLock.unlock();
        //clear the flag immediately after releasing the lock: if os.close()
        //throws with locked still true, the finally block would unlock a lock
        //we no longer hold, throwing IllegalMonitorStateException and masking
        //the original failure
        locked = false;
        os.close();
        synchronized (info) {
            info.inode = manager.getInode();
            memoryBufferEntries.add(info);
            is = new BlockInputStream(manager, info.memoryBlockCount);
        }
        success = true;
    } finally {
        try {
            if (locked) {
                fileLock.unlock();
            }
            if (!success && manager != null) {
                manager.free(false);
            }
        } finally {
            this.memoryWritePermits.release(memoryBlocks);
        }
    }
    return is;
}
/**
 * Determine if an object should be in the memory buffer.
 * Adds are indicated by a current time of 0.
 * @param currentTime
 * @param info
 * @return
 */
private boolean shouldPlaceInMemoryBuffer(long currentTime, PhysicalInfo info) {
    PhysicalInfo coldest = memoryBufferEntries.firstEntry(false);
    CacheKey key = info.getKey();
    // Admit unconditionally while there is comfortable headroom above the
    // cleaning threshold.
    int freeBlocks = blocksInuse.getTotalBits() - blocksInuse.getBitsSet();
    if (freeBlocks > cleaningThreshold + info.memoryBlockCount) {
        return true;
    }
    // Otherwise admit only when the coldest resident entry already has a
    // disk copy and this entry would outrank it in the eviction ordering.
    if (coldest == null || coldest.block == EMPTY_ADDRESS) {
        return false;
    }
    long ordering = currentTime > 0
            ? memoryBufferEntries.computeNextOrderingValue(currentTime, key.getLastAccess(), key.getOrderingValue())
            : key.getOrderingValue();
    return coldest.getKey().getOrderingValue() < ordering;
}
/**
 * Plain file store creation is delegated directly to the backing storage
 * manager (no memory-buffer involvement).
 */
@Override
public FileStore createFileStore(String name) {
    return storageManager.createFileStore(name);
}
// Configures off-heap (direct) allocation; must be set before initialize().
public void setDirect(boolean direct) {
    this.direct = direct;
}
/**
 * Registers {@code oid} under its group with a null placeholder; the real
 * {@link PhysicalInfo} is created when the entry is first persisted by add().
 * @return false when the group has already been removed
 */
@Override
public boolean addToCacheGroup(Long gid, Long oid) {
    Map<Long, PhysicalInfo> group = physicalMapping.get(gid);
    if (group == null) {
        return false;
    }
    group.put(oid, null);
    return true;
}
/**
 * Creates an empty cache group in the root directory. A synchronized map is
 * used since multi-step group mutations elsewhere synchronize on the map
 * instance itself.
 */
@Override
public void createCacheGroup(Long gid) {
    Map<Long, PhysicalInfo> group = new HashMap<Long, PhysicalInfo>();
    physicalMapping.put(gid, Collections.synchronizedMap(group));
}
/**
 * Removes a single entry, returning its size estimate or null if it was not
 * present. The entry is detached under the group's monitor, but its space is
 * released outside the lock to minimize contention.
 */
@Override
public Integer remove(Long gid, Long id) {
    Map<Long, PhysicalInfo> group = physicalMapping.get(gid);
    if (group == null) {
        return null;
    }
    PhysicalInfo detached;
    Integer sizeEstimate = null;
    synchronized (group) {
        detached = group.remove(id);
        if (detached != null) {
            sizeEstimate = detached.sizeEstimate;
        }
    }
    if (detached != null) {
        free(detached, false, false);
    }
    return sizeEstimate;
}
/**
 * Atomically detaches a whole group from the root directory and frees every
 * entry in it. Null values (placeholders from addToCacheGroup) are passed to
 * free, which tolerates them.
 * @return the ids that belonged to the group (empty if already removed)
 */
@Override
public Collection<Long> removeCacheGroup(Long gid) {
    Map<Long, PhysicalInfo> group = physicalMapping.remove(gid);
    if (group == null) {
        return Collections.emptySet();
    }
    synchronized (group) {
        for (PhysicalInfo info : group.values()) {
            free(info, false, false);
        }
        return group.keySet();
    }
}
/**
 * Multi-purpose method to free memory.  Modes are:
 * demote && !acquireDataBlock -> push out of memory buffer onto disk
 * demote && acquireDataBlock -> push out of memory and reuse a datablock
 * !demote -> full removal from memory and disk
 *
 * @param info the entry to free; null is tolerated and treated as a no-op
 * @param demote true when the entry is only being pushed out of the memory buffer
 * @param acquireDataBlock true to reclaim one of the entry's data blocks for the caller
 * @return the value of {@code bm.free(acquireDataBlock)} when the entry held an
 *         inode, otherwise the FREED/EMPTY_ADDRESS sentinel.
 *         NOTE(review): exact sentinel semantics depend on BlockManager.free,
 *         which is not visible here - confirm against its implementation.
 */
int free(PhysicalInfo info, boolean demote, boolean acquireDataBlock) {
    if (info == null) {
        return EMPTY_ADDRESS;
    }
    Long oid = info.getId();
    int result = FREED;
    BlockManager bm = null;
    int block = EMPTY_ADDRESS;
    int memoryBlockCount;
    int sizeIndex;
    synchronized (info) {
        //if we're a demotion then the free flag was already checked and set
        if (!demote) {
            //let any pending finish - it would be nice if we could pre-empt
            //since we can save some work, but this should be rare enough
            //to just block
            info.await(true, true);
            info.evicting = true;
        } else {
            assert info.evicting;
        }
        //snapshot the state needed for the transfer while holding the monitor
        block = info.block;
        memoryBlockCount = info.memoryBlockCount;
        sizeIndex = info.sizeIndex;
        if (info.inode != EMPTY_ADDRESS) {
            bm = getBlockManager(info.gid, oid, info.inode);
        } else if (demote) {
            //nothing resident in the memory buffer - a demotion is a no-op
            return EMPTY_ADDRESS;
        }
        //release the lock to perform the transfer
        //for straight removals this is a little wasteful
    }
    try {
        //block == EMPTY_ADDRESS means there is no storage block yet, so a
        //demotion must first write the memory image out to disk
        if (demote && block == EMPTY_ADDRESS) {
            storageWrites.getAndIncrement();
            BlockInputStream is = new BlockInputStream(bm, memoryBlockCount);
            BlockStore blockStore = sizeBasedStores[sizeIndex];
            //up to three attempts: trim files, rate-limited full defrag + trim,
            //then give up and propagate the out-of-disk condition
            for (int i = 0; i < 3; i++) {
                try {
                    block = blockStore.writeToStorageBlock(info, is);
                    break;
                } catch (OutOfDiskException e) {
                    switch (i) {
                    case 0:
                        //the first attempt is to trim the existing files
                        defragTask.truncate(true);
                        break;
                    case 1:
                        //full defrag is expensive, so only run it if enough time
                        //has passed since the last full run
                        synchronized (this) {
                            if (System.currentTimeMillis() - lastFullRun > FULL_DEFRAG_TRUNCATE_TIMEOUT) {
                                defragTask.defrag(true);
                                defragTask.truncate(true);
                                lastFullRun = System.currentTimeMillis();
                            }
                        }
                        break;
                    case 2:
                        //give up, there isn't enough memory available
                        throw e;
                    }
                }
            }
        }
    } catch (IOException e) {
        //the write to storage failed; log it and fall through to the cleanup
        //below, which will invalidate the cache group for a failed demotion
        if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
            LogManager.logError(LogConstants.CTX_BUFFER_MGR, e, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30016, oid, info.gid));
        } else {
            LogManager.logError(LogConstants.CTX_BUFFER_MGR, QueryPlugin.Util.gs(QueryPlugin.Event.TEIID30016, oid, info.gid) + " " + e.getMessage()); //$NON-NLS-1$
        }
    } finally {
        //ensure post conditions
        synchronized (info) {
            //it is possible for a read to happen while evicting.
            //that's ok, we'll just wait for it to finish
            assert info.evicting;
            info.await(true, false);
            info.evicting = false;
            info.notifyAll();
            assert bm == null || info.inode != EMPTY_ADDRESS;
            if (info.inode != EMPTY_ADDRESS) {
                //detach the entry from the memory buffer
                info.inode = EMPTY_ADDRESS;
                memoryBufferEntries.remove(info);
            }
            if (block != EMPTY_ADDRESS) {
                if (demote) {
                    if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                        LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Assigning storage data block", block, "of size", sizeBasedStores[info.sizeIndex].blockSize); //$NON-NLS-1$ //$NON-NLS-2$
                    }
                    info.block = block;
                } else {
                    //full removal - return the storage block to its store
                    BlockStore blockStore = sizeBasedStores[info.sizeIndex];
                    blockStore.blocksInUse.clear(info.block);
                    if (LogManager.isMessageToBeRecorded(LogConstants.CTX_BUFFER_MGR, MessageLevel.DETAIL)) {
                        LogManager.logDetail(LogConstants.CTX_BUFFER_MGR, "Freed storage data block", info.block, "of size", blockStore.blockSize); //$NON-NLS-1$ //$NON-NLS-2$
                    }
                    //periodically kick off an asynch defrag after enough frees;
                    //compareAndSet guards against scheduling it twice
                    if (!defragRunning.get()
                            && (freedCounter.getAndIncrement()&0x3fff)==0x3fff //should be several hundred megs of turn over
                            && defragRunning.compareAndSet(false, true)) {
                        this.asynchPool.execute(defragTask);
                    }
                    info.block = EMPTY_ADDRESS;
                }
            }
            if (bm != null) {
                result = bm.free(acquireDataBlock);
                //wake any threads blocked in evictFromMemoryBuffer waiting for
                //a block to be freed
                freedLock.lock();
                try {
                    blocksFreed.signalAll();
                } finally {
                    freedLock.unlock();
                }
            }
            if (block == EMPTY_ADDRESS && demote && this.bufferManager != null) {
                //failed to demote
                this.bufferManager.invalidCacheGroup(info.gid);
            }
        }
    }
    return result;
}
/**
 * Determine whether a segment of the given block store is fragmented enough
 * to be worth defragmenting.
 * @param blockStore the store to inspect
 * @param segment index of the segment within the store
 * @param all true for a full (more aggressive) defrag pass
 * @return true when the free space below the high-water mark justifies a defrag
 */
boolean shouldDefrag(BlockStore blockStore, int segment, boolean all) {
    int highestBitSet = blockStore.blocksInUse.getHighestBitSet(segment);
    int bitsSet = blockStore.blocksInUse.getBitsSet(segment);
    //clamp the high-water mark: never below 0 (presumably getHighestBitSet
    //returns a negative value for an empty segment - TODO confirm) and never
    //below the in-use count
    highestBitSet = Math.max(bitsSet, Math.max(0, highestBitSet));
    if (highestBitSet == 0) {
        return false;
    }
    int freeBlocks = highestBitSet-bitsSet;
    //a full pass triggers at >1/8 free below the high-water mark, a normal
    //pass at >1/2 - and the reclaimable bytes must also exceed minDefrag
    return freeBlocks > (highestBitSet>>(all?3:1)) && freeBlocks*blockStore.blockSize > minDefrag;
}
/**
 * Eviction routine.  When space is exhausted data blocks are acquired from
 * memory entries.
 * @param acquire true to also claim a free data block for the caller's use
 * @return the acquired block number, or EMPTY_ADDRESS when not acquiring
 * @throws TeiidRuntimeException if interrupted while waiting for a free block
 * @throws AssertionError if no space could be freed for a pending write
 */
int evictFromMemoryBuffer(boolean acquire) {
    boolean writeLocked = false;
    int next = EMPTY_ADDRESS;
    try {
        for (int i = 0; i < EVICTION_SCANS && next == EMPTY_ADDRESS; i++) {
            //doing a cleanup may trigger the purging of resources
            AutoCleanupUtil.doCleanup(true);
            //scan the eviction queue looking for a victim
            Iterator<PhysicalInfo> iter = memoryBufferEntries.getEvictionQueue().iterator();
            while (((!acquire && lowBlocks(false)) || (acquire && (next = blocksInuse.getAndSetNextClearBit()) == EMPTY_ADDRESS)) && iter.hasNext()) {
                PhysicalInfo info = iter.next();
                synchronized (info) {
                    if (info.inode == EMPTY_ADDRESS) {
                        //no longer resident in the memory buffer
                        continue;
                    }
                    if (info.pinned || info.evicting) {
                        //skip busy entries, except on the final acquiring scan
                        //where we must wait instead of giving up
                        if (!acquire || i != EVICTION_SCANS - 1) {
                            continue;
                        }
                        if (acquire && !writeLocked) {
                            //stop the world - prevent any other thread from taking a free block
                            //until this one is satisfied
                            memoryEvictionLock.writeLock().lock();
                            writeLocked = true;
                        }
                        //wait for the read/eviction to be over
                        info.await(true, true);
                        if (info.inode == EMPTY_ADDRESS) {
                            continue;
                        }
                    }
                    //mark as evicting early so that other evictFromMemoryCalls don't select this same entry
                    info.evicting = true;
                }
                next = free(info, true, acquire);
                break;
            }
        }
        if (acquire && next == EMPTY_ADDRESS) {
            if (!writeLocked) {
                memoryEvictionLock.writeLock().lock();
                writeLocked = true;
            }
            //block until another thread frees a block or the timeout elapses
            freedLock.lock();
            try {
                long waitTime = TIMEOUT_NANOS;
                while (true) {
                    next = blocksInuse.getAndSetNextClearBit();
                    if (next != EMPTY_ADDRESS) {
                        return next;
                    }
                    waitTime = blocksFreed.awaitNanos(waitTime);
                    if (waitTime <= 0) {
                        break;
                    }
                }
            } finally {
                freedLock.unlock();
            }
            //one last attempt after the timeout
            next = blocksInuse.getAndSetNextClearBit();
            if (next == EMPTY_ADDRESS) {
                throw new AssertionError("Could not free space for pending write"); //$NON-NLS-1$
            }
        }
    } catch (InterruptedException e) {
        //restore the interrupt status before wrapping, so callers up the
        //stack can still observe that this thread was interrupted
        Thread.currentThread().interrupt();
        throw new TeiidRuntimeException(QueryPlugin.Event.TEIID30050, e);
    } finally {
        if (writeLocked) {
            memoryEvictionLock.writeLock().unlock();
        }
    }
    return next;
}
/**
 * Set the backing storage manager used for file store creation.
 */
public void setStorageManager(StorageManager storageManager) {
    this.storageManager = storageManager;
}
/**
 * @return the backing storage manager
 */
public StorageManager getStorageManager() {
    return storageManager;
}
/**
 * Set the memory buffer size in bytes, capped at MAX_ADDRESSABLE_MEMORY.
 */
public void setMemoryBufferSpace(long maxBufferSpace) {
    this.memoryBufferSpace = Math.min(maxBufferSpace, MAX_ADDRESSABLE_MEMORY);
}
/**
 * @return the number of inodes currently marked in use
 */
public int getInodesInUse() {
    return this.inodesInuse.getBitsSet();
}
/**
 * @return the number of data blocks currently marked in use
 */
public int getDataBlocksInUse() {
    return this.blocksInuse.getBitsSet();
}
/**
 * Set the maximum size in bytes of a single storage object.
 * @throws TeiidRuntimeException if the size exceeds 1 GB (1 &lt;&lt; 30 bytes)
 */
public void setMaxStorageObjectSize(int maxStorageBlockSize) {
    if (maxStorageBlockSize > (1 << 30)) {
        throw new TeiidRuntimeException("max storage block size cannot exceed 1 GB"); //$NON-NLS-1$
    }
    this.maxStorageObjectSize = maxStorageBlockSize;
}
/**
 * @return the number of reads performed against backing storage
 */
public long getStorageReads() {
    return storageReads.get();
}
/**
 * @return the number of writes performed against backing storage
 */
public long getStorageWrites() {
    return storageWrites.get();
}
/**
 * @return the configured memory buffer size in bytes
 */
public long getMemoryBufferSpace() {
    return memoryBufferSpace;
}
/**
 * Set the minimum number of reclaimable bytes required before a defrag
 * is considered worthwhile (see shouldDefrag).
 */
public void setMinDefrag(long minDefrag) {
    this.minDefrag = minDefrag;
}
/**
 * @return the maximum number of memory blocks a single object may occupy
 *         -- NOTE(review): inferred from the field name; confirm against the setter
 */
public int getMaxMemoryBlocks() {
    return maxMemoryBlocks;
}
/**
 * @return an estimate of the bytes held in the memory buffer: in-use data
 *         blocks plus in-use inodes
 */
public long getMemoryInUseBytes() {
    //widen to long before multiplying - the int products could overflow
    //once the in-use block count times the block size approaches 2^31
    return ((long)this.blocksInuse.getBitsSet()) * BLOCK_SIZE + ((long)this.inodesInuse.getBitsSet()) * (1 << LOG_INODE_SIZE);
}
/**
 * Set the owning buffer manager, which is notified (invalidCacheGroup)
 * when a demotion to disk fails.
 */
public void setBufferManager(BufferManagerImpl bufferManager) {
    this.bufferManager = bufferManager;
}
/**
 * Set the truncate interval.
 * NOTE(review): consumption of this value is not visible in this chunk -
 * presumably it throttles file truncation in the defrag task; confirm.
 */
public void setTruncateInterval(int truncateInterval) {
    this.truncateInterval = truncateInterval;
}
/**
 * Compute the total on-disk footprint by summing the lengths of every
 * segment file across all size-based block stores.
 * @return total disk usage in bytes
 */
public long getDiskUsage() {
    long total = 0;
    for (BlockStore store : sizeBasedStores) {
        for (int seg = 0; seg < store.stores.length; seg++) {
            total += store.stores[seg].getLength();
        }
    }
    return total;
}
@Override
public void shutdown() {
    //immediately stop background work (such as the defrag task) submitted
    //to the asynch pool; shutdownNow interrupts in-flight tasks
    this.asynchPool.shutdownNow();
}
/**
 * Set whether buffer files should be kept compact.
 * NOTE(review): the flag's consumer is outside this chunk - confirm its
 * effect on defrag/truncate behavior.
 */
public void setCompactBufferFiles(boolean compactBufferFiles) {
    this.compactBufferFiles = compactBufferFiles;
}
@Override
public long getMaxStorageSpace() {
    //delegated to the backing storage manager
    return this.storageManager.getMaxStorageSpace();
}
}