/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.store.skiplist;
import com.addthis.basis.util.Parameter;
import com.addthis.codec.codables.BytesCodable;
import com.addthis.hydra.store.common.AbstractPage;
import com.addthis.hydra.store.common.AbstractPageCache;
import com.addthis.hydra.store.common.ExternalMode;
import com.addthis.hydra.store.common.Page;
import com.addthis.hydra.store.common.PageFactory;
import com.addthis.hydra.store.db.CloseOperation;
import com.addthis.hydra.store.kv.ByteStore;
import com.addthis.hydra.store.kv.KeyCoder;
import com.addthis.hydra.store.util.MetricsUtil;
import com.addthis.hydra.store.util.NamedThreadFactory;
import com.google.common.annotations.VisibleForTesting;
import io.netty.buffer.ByteBufOutputStream;
import io.netty.buffer.PooledByteBufAllocator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
* ::TWO INVARIANTS TO AVOID DEADLOCK AND MAINTAIN CONSISTENCY::
* <p/>
* Invariant #1:
* When locking two pages always lock the lower page before locking the higher page.
* <p/>
* Invariant #2:
* To read a consistent snapshot of a page in the external storage you must
* be holding a lock on the lower page in memory.
* <p/>
* The left sentinel page is the lowest page in storage. It is constructed with
* a special first key with value negative infinity. No key may be smaller than
* negative infinity. The left sentinel page may be neither <i>purged</i>
* nor <i>deleted</i> (see below).
* <p/>
* A page is <i>evicted</i> when the contents of the page are transferred from
* the JVM heap into the external storage. When a page is evicted a page stub
* remains in memory that contains the minimal information needed to restore the
* page into memory.
* <p/>
* A page is <i>purged</i> when a page stub is deleted from memory. The most
* recent copy of this page still resides in the external storage. The left
* sentinel page may not be purged.
* <p/>
* A page is <i>deleted</i> when it is removed from both memory and the external storage.
* Only pages with 0 keys may be deleted. The left sentinel page may not be deleted.
*
* @param <K> the type of keys maintained by this cache
* @param <V> the type of mapped values; must implement {@link BytesCodable}
*/
public class SkipListCache<K, V extends BytesCodable> extends AbstractPageCache<K, V> {
private static final Logger log = LoggerFactory.getLogger(SkipListCache.class);
private static final int defaultEvictionThreads = Parameter.intValue("cache.threadcount.eviction", 1);
/**
* Used as an absolute delta from maxPages when using that upper bound.
* Otherwise it's treated as a percentage of maxTotalMemory.
*/
private static final int shouldEvictDelta = Parameter.intValue("eps.cache.evict.delta", 20);
final BlockingQueue<Page<K, V>> evictionQueue;
final ConcurrentSkipListSet<K> purgeSet;
final AtomicInteger purgeSetSize = new AtomicInteger(0);
/**
* Used to schedule synchronous page eviction in the
* {@link #put(Object, BytesCodable)} and {@link #remove(Object)}
* methods when the background eviction threads are behind schedule.
*/
private final LinkedBlockingQueue<BackgroundEvictionTask> evictionTaskQueue;
private final ScheduledExecutorService evictionThreadPool, purgeThreadPool;
private static final int evictionThreadSleepMillis = 10;
private static final int threadPoolWaitShutdownSeconds = 10;
/**
* The Builder pattern allows many different variations of a class to
* be instantiated without the pitfalls of complex constructors. See
* ''Effective Java, Second Edition.'' Item 2 - "Consider a builder when
* faced with many constructor parameters."
*/
public static class Builder<K, V extends BytesCodable> {
// Required parameters
protected final int maxPageSize;
protected final ByteStore externalStore;
protected final KeyCoder<K, V> keyCoder;
// Optional parameters - initialized to default values;
protected int numEvictionThreads = defaultEvictionThreads;
protected int maxPages = defaultMaxPages;
@SuppressWarnings("unchecked")
protected PageFactory<K, V> pageFactory = ConcurrentPage.ConcurrentPageFactory.singleton;
public Builder(KeyCoder<K, V> keyCoder, ByteStore store, int maxPageSize) {
this.externalStore = store;
this.maxPageSize = maxPageSize;
this.keyCoder = keyCoder;
}
@SuppressWarnings("unused")
public Builder<K, V> numEvictionThreads(int val) {
numEvictionThreads = val;
return this;
}
@SuppressWarnings("unused")
public Builder<K, V> maxPages(int val) {
maxPages = val;
return this;
}
@SuppressWarnings("unused")
public Builder<K, V> pageFactory(PageFactory<K, V> factory) {
pageFactory = factory;
return this;
}
public SkipListCache<K, V> build() {
return new SkipListCache<>(keyCoder, externalStore, maxPageSize,
maxPages, numEvictionThreads, pageFactory);
}
}
/**
 * Construct a cache backed by {@code externalStore} and start the
 * background machinery: {@code numEvictionThreads} scheduled purge tasks
 * plus the same number of scheduled eviction tasks, each firing every
 * {@link #evictionThreadSleepMillis} milliseconds.
 *
 * @param keyCoder           encoder/decoder for keys and values
 * @param externalStore      backing store that holds evicted pages
 * @param maxPageSize        maximum number of keys per page
 * @param maxPages           upper bound on pages kept in memory
 * @param numEvictionThreads size of each of the two scheduled pools
 * @param pageFactory        factory used to create pages
 */
public SkipListCache(KeyCoder<K, V> keyCoder, ByteStore externalStore, int maxPageSize,
int maxPages, int numEvictionThreads, PageFactory<K, V> pageFactory) {
super(keyCoder, externalStore, pageFactory, maxPageSize, maxPages, true);
this.evictionTaskQueue = new LinkedBlockingQueue<>();
this.purgeSet = new ConcurrentSkipListSet<>();
this.evictionQueue = new LinkedBlockingQueue<>();
// NamedThreadFactory(..., true) creates daemon threads so these pools
// cannot keep the JVM alive on exit.
evictionThreadPool = Executors.newScheduledThreadPool(numEvictionThreads,
new NamedThreadFactory(scope + "-eviction-", true));
purgeThreadPool = Executors.newScheduledThreadPool(numEvictionThreads,
new NamedThreadFactory(scope + "-purge-", true));
// Initial delay of i milliseconds staggers the periodic tasks so they
// do not all fire in lockstep.
for (int i = 0; i < numEvictionThreads; i++) {
purgeThreadPool.scheduleAtFixedRate(new BackgroundPurgeTask(),
i,
evictionThreadSleepMillis,
TimeUnit.MILLISECONDS);
// Eviction tasks constructed with 0 run in the open-ended
// "evict while needed" mode (see BackgroundEvictionTask).
evictionThreadPool.scheduleAtFixedRate(new BackgroundEvictionTask(0),
i,
evictionThreadSleepMillis,
TimeUnit.MILLISECONDS);
}
log.info("[init] ro=" + isReadOnly() + " maxPageSize=" + maxPageSize +
" maxPages=" + maxPages + " gztype=" + AbstractPage.gztype + " gzlevel=" +
AbstractPage.gzlevel + " gzbuf=" + AbstractPage.gzbuf + " mem[page=" + mem_page + " type=SkipListCache]");
}
/**
 * Periodically scheduled task that purges page stubs from memory.
 * A stub may be dropped only when it has already been evicted (its keys()
 * are null) and its disk copy is current (DISK_MEMORY_IDENTICAL); the
 * left sentinel (negInf) page is never purged. Lock acquisition follows
 * Invariant #1 from the class javadoc: lower page first.
 */
class BackgroundPurgeTask implements Runnable {
// Iterator over purge candidates; re-created lazily whenever a full
// sweep is exhausted (see doPurgePage()).
Iterator<K> targetKeys;
BackgroundPurgeTask() {
targetKeys = purgeSet.iterator();
}
@Override
public void run() {
try {
backgroundPurge();
} catch (Exception ex) {
logException("Uncaught exception in skiplist concurrent cache purge thread", ex);
}
}
// Purge pages until shutdown is requested, the purge set shrinks below
// its threshold, or a sweep of the purge set finds nothing removable.
private void backgroundPurge() {
while (!shutdownEvictionThreads.get() && shouldPurgePage() && doPurgePage()) ;
}
/**
 * Try to remove the page stub whose first key is {@code targetKey}
 * from the in-memory cache. Returns SUCCESS only when an evicted stub
 * with an up-to-date disk copy was found at exactly {@code targetKey};
 * the other statuses report why nothing was removed (lock contention,
 * transient predecessor, or the page no longer being purgeable).
 */
private EvictionStatus removePageFromCache(K targetKey) {
assert (!targetKey.equals(negInf));
Page<K, V> prevPage = null, currentPage = null;
try {
// We must acquire the locks on the pages from lowest to highest.
// This is inefficient but it avoids deadlock.
Map.Entry<K, Page<K, V>> prevEntry, currentEntry;
// targetKey != negInf and the negInf sentinel is never purged, so
// a lower entry should exist -- see the class javadoc.
prevEntry = getCache().lowerEntry(targetKey);
prevPage = prevEntry.getValue();
if (!prevPage.writeTryLock()) {
// Null out so the finally block does not unlock a lock we never held.
prevPage = null;
return EvictionStatus.TRYLOCK_FAIL;
}
if (prevPage.inTransientState()) {
return EvictionStatus.TRANSIENT_PAGE;
}
currentEntry = getCache().higherEntry(prevEntry.getKey());
if (currentEntry != null) {
currentPage = currentEntry.getValue();
if (!currentPage.writeTryLock()) {
currentPage = null;
return EvictionStatus.TRYLOCK_FAIL;
}
int compareKeys = compareKeys(targetKey, currentPage.getFirstKey());
if (compareKeys < 0) {
// The successor now starts above targetKey: the target page
// is no longer in the cache under this key.
return EvictionStatus.NO_STATUS;
} else if (compareKeys == 0 && currentPage.keys() == null &&
currentPage.getState() == ExternalMode.DISK_MEMORY_IDENTICAL) {
// Evicted stub whose disk copy is current: safe to drop.
currentPage.setState(ExternalMode.MEMORY_EVICTED);
getCache().remove(targetKey);
cacheSize.getAndDecrement();
return EvictionStatus.SUCCESS;
}
}
return EvictionStatus.EVICTED_PAGE;
} finally {
writeUnlockAndNull(currentPage);
writeUnlockAndNull(prevPage);
}
}
/**
 * Returns <code>true</code> if a page is purged and
 * false otherwise.
 */
private boolean doPurgePage() {
if (targetKeys == null) {
targetKeys = purgeSet.iterator();
}
while (targetKeys.hasNext()) {
K minKey = targetKeys.next();
EvictionStatus status = removePageFromCache(minKey);
// Some statuses allow the key to leave the purge set; others keep
// it so a later sweep can retry (see EvictionStatus.removePurgeSet).
if (status.removePurgeSet()) {
if (purgeSet.remove(minKey)) {
purgeSetSize.getAndDecrement();
return true;
}
}
}
// Sweep exhausted with no purge; start a fresh iterator next time.
targetKeys = null;
return false;
}
}
/**
 * A purge pass is warranted while the purge set holds more page stubs
 * than there are pages currently resident in memory.
 */
public boolean shouldPurgePage() {
    int stubCount = purgeSetSize.get();
    return stubCount > getNumPagesInMemory();
}
/**
 * Insert or replace the mapping for {@code key} and return the previous
 * value (or null). Marks the page dirty when its disk copy was previously
 * identical, and splits the page when it has grown past its threshold.
 */
@Override
protected V doPut(K key, V value) {
    /**
     * If the background eviction threads are behind schedule,
     * then synchronously perform a page eviction. The
     * {@link #getEvictionTask()} and {@link #putEvictionTask(BackgroundEvictionTask)}
     * method are for re-using BackgroundEvictionTask object.
     */
    if (mustEvictPage()) {
        BackgroundEvictionTask evictor = getEvictionTask();
        evictor.run();
        putEvictionTask(evictor);
    }
    Page<K, V> target = locatePage(key, LockMode.WRITEMODE);
    V previous;
    try {
        previous = putIntoPage(target, key, value);
        // Fold the change in this page's heap footprint into the
        // cache-wide memory estimate.
        int memBefore = target.getMemoryEstimate();
        target.updateMemoryEstimate();
        updateMemoryEstimate(target.getMemoryEstimate() - memBefore);
        if (target.splitCondition()) {
            splitPage(target);
        } else if (target.getState() == ExternalMode.DISK_MEMORY_IDENTICAL) {
            // The in-memory copy now differs from the copy on disk.
            target.setState(ExternalMode.DISK_MEMORY_DIRTY);
        }
    } finally {
        writeUnlockAndNull(target);
    }
    return previous;
}
/**
 * Remove every key in the half-open interval [start, end): the elements
 * between the two binary-search offsets are deleted, so {@code end}
 * itself is retained. When the interval extends past the current page the
 * loop continues from the first key of the next page in the external
 * store. Pages left empty are deleted, except the left sentinel page.
 *
 * @param start lowest key to remove (inclusive)
 * @param end   upper bound of the removal (exclusive)
 */
@Override
protected void doRemove(K start, K end) {
while (true) {
// Synchronously evict a page if the background threads are behind
// schedule (see doPut for the task-recycling pattern).
if (mustEvictPage()) {
BackgroundEvictionTask task = getEvictionTask();
task.run();
putEvictionTask(task);
}
Page<K, V> page = locatePage(start, LockMode.WRITEMODE);
try {
int startOffset = binarySearch(page.keys(), start, comparator);
int endOffset = binarySearch(page.keys(), end, comparator);
int pageSize = page.size();
// A negative binarySearch result encodes the insertion point as
// the one's complement; ~offset recovers it.
if (startOffset < 0) {
startOffset = ~startOffset;
}
if (endOffset < 0) {
endOffset = ~endOffset;
}
if (startOffset < endOffset) {
int memEstimate = page.getMemoryEstimate();
int length = (endOffset - startOffset);
// Removing repeatedly at startOffset shifts the remainder left,
// deleting exactly the elements in [startOffset, endOffset).
for (int i = 0; i < length; i++) {
page.keys().remove(startOffset);
page.values().remove(startOffset);
page.rawValues().remove(startOffset);
}
page.setSize(page.size() - length);
if (page.getState() == ExternalMode.DISK_MEMORY_IDENTICAL) {
page.setState(ExternalMode.DISK_MEMORY_DIRTY);
}
page.updateMemoryEstimate();
updateMemoryEstimate(page.getMemoryEstimate() - memEstimate);
}
if (page.size() == 0 && !page.getFirstKey().equals(negInf)) {
// Page emptied and not the sentinel: release our write lock
// before deletePage, then retry the whole removal loop.
K targetKey = page.getFirstKey();
page = writeUnlockAndNull(page);
deletePage(targetKey);
continue;
} else if (endOffset == pageSize) {
// The interval may spill into the next page on disk; restart
// from that page's first key.
byte[] higherKeyEncoded = externalStore.higherKey(keyCoder.keyEncode(page.getFirstKey()));
if (higherKeyEncoded != null) {
start = keyCoder.keyDecode(higherKeyEncoded);
continue;
}
}
} finally {
writeUnlockAndNull(page);
}
break;
}
}
/**
 * Remove the mapping for a single key.
 *
 * @param key key whose mapping is removed
 * @return the value previously associated with {@code key}, or null when
 *         no mapping existed. A page left (or found) empty is deleted
 *         from memory and the external store, unless it is the sentinel.
 */
@Override
protected V doRemove(K key) {
// Synchronously evict a page if the background eviction threads are
// behind schedule (see doPut for the task-recycling pattern).
if (mustEvictPage()) {
BackgroundEvictionTask task = getEvictionTask();
task.run();
putEvictionTask(task);
}
Page<K, V> page = locatePage(key, LockMode.WRITEMODE);
try {
if (page.size() == 0) {
// Empty non-sentinel page: release our lock before deleting it.
if (!page.getFirstKey().equals(negInf)) {
K targetKey = page.getFirstKey();
page = writeUnlockAndNull(page);
deletePage(targetKey);
}
return null;
}
int offset = binarySearch(page.keys(), key, comparator);
// An existing (key, value) pair is found.
if (offset >= 0) {
int memEstimate = page.getMemoryEstimate();
// fetchValue(offset) presumably materializes the decoded value so
// values().remove(offset) below returns a real object -- confirm
// against Page.fetchValue.
page.fetchValue(offset);
page.keys().remove(offset);
page.rawValues().remove(offset);
V prev = page.values().remove(offset);
page.setSize(page.size() - 1);
if (page.getState() == ExternalMode.DISK_MEMORY_IDENTICAL) {
page.setState(ExternalMode.DISK_MEMORY_DIRTY);
}
page.updateMemoryEstimate();
updateMemoryEstimate(page.getMemoryEstimate() - memEstimate);
// Deleting the last key may leave an empty, deletable page.
if (page.size() == 0 && !page.getFirstKey().equals(negInf)) {
K targetKey = page.getFirstKey();
page = writeUnlockAndNull(page);
deletePage(targetKey);
}
return prev;
} else {
return null;
}
} finally {
writeUnlockAndNull(page);
}
}
/**
 * Return the largest key stored in the cache, or null when the cache
 * holds no keys at all. May walk pages backwards from the end of the
 * external store, skipping empty pages, while detecting concurrent page
 * splits.
 */
@Override
public K getLastKey() {
// Fast path: the last page in cache happens to be the last page on disk.
K fastPath = lastKeyFastPath();
if (fastPath != null) return fastPath;
K currentKey;
byte[] currentKeyEncoded;
Page<K, V> currentPage = null, prevPage = null;
// Slow path: we load each page from disk searching for the last key.
// This is slower than getFirstKey() due to our locking convention.
try {
// Load the high page into memory. Retry until the observed page is
// stable: not transient and with no successor (nextFirstKey == null).
while (true) {
currentKeyEncoded = externalStore.lastKey();
currentKey = keyCoder.keyDecode(currentKeyEncoded);
currentPage = loadPage(currentKey, null);
if (!currentPage.inTransientState() && currentPage.getNextFirstKey() == null) {
break;
}
}
// Find that last key! Walk backwards until a non-empty page appears.
while (true) {
K prevKey, verifyKey;
byte[] prevKeyEncoded, verifyKeyEncoded;
assert (!currentPage.inTransientState());
// An evicted stub has null keys(); restore its contents from disk.
if (currentPage.keys() == null) {
pullPageFromDisk(currentPage, LockMode.WRITEMODE);
}
if (currentPage.size() > 0) {
return currentPage.keys().get(currentPage.size() - 1);
}
// This loop is needed to detect concurrent page split operations:
// after loading the predecessor, verify that its on-disk successor
// is still currentKey; otherwise re-read the predecessor.
do {
prevPage = writeUnlockAndNull(prevPage);
prevKeyEncoded = externalStore.lowerKey(currentKeyEncoded);
if (prevKeyEncoded == null) {
// No predecessor: every page was empty, so there is no last key.
return null;
}
prevKey = keyCoder.keyDecode(prevKeyEncoded);
currentPage = writeUnlockAndNull(currentPage);
prevPage = loadPage(prevKey, null);
verifyKeyEncoded = externalStore.higherKey(prevKeyEncoded);
if (verifyKeyEncoded == null) {
assert (prevPage.getNextFirstKey() == null);
break;
}
verifyKey = keyCoder.keyDecode(verifyKeyEncoded);
}
while (!currentKey.equals(verifyKey));
// Step backwards: the predecessor becomes the current page.
currentPage = prevPage;
currentKey = prevKey;
currentKeyEncoded = prevKeyEncoded;
prevPage = null;
}
} finally {
writeUnlockAndNull(prevPage);
writeUnlockAndNull(currentPage);
}
}
/**
 * Close without scheduling any unfinished background tasks.
 * The background eviction thread(s) are shut down regardless of
 * whether the skiplist exceeds its heap capacity. Equivalent to
 * {@code doClose(false, false, CloseOperation.NONE)}; any returned
 * status code is discarded.
 */
@Override
public void close() {
doClose(false, false, CloseOperation.NONE);
}
/**
 * Close the cache, optionally testing or repairing the external store.
 *
 * @param cleanLog if true then wait for the BerkeleyDB clean thread to finish.
 * @param operation optionally test or repair the BerkeleyDB.
 * @return status code. A status code of 0 indicates success.
 */
@Override
public int close(boolean cleanLog, CloseOperation operation) {
return doClose(cleanLog, false, operation);
}
/**
 * Wait for all background tasks to complete.
 * Wait for the background eviction threads to complete
 * purging all necessary pages. This method is intended
 * for JUnit testing. If it is being used in other instances,
 * then perhaps a new method should be introduced instead.
 * Shuts the cache down afterwards ({@code doClose} with wait=true).
 */
@VisibleForTesting
void waitForShutdown() {
doClose(false, true, CloseOperation.NONE);
}
/**
 * Shared shutdown path behind {@link #close()},
 * {@link #close(boolean, CloseOperation)} and {@link #waitForShutdown()}.
 * {@code shutdownGuard} ensures the sequence runs at most once; any
 * subsequent call returns 0 without doing work.
 *
 * @param cleanLog  if true then wait for the BerkeleyDB clean thread to finish.
 * @param wait      if true, block until page eviction completes before
 *                  stopping the background thread pools.
 * @param operation optionally test or repair the external store.
 * @return 0 on success, 1 when the integrity test reported failed pages.
 */
private int doClose(boolean cleanLog, boolean wait, CloseOperation operation) {
int status = 0;
if (!shutdownGuard.getAndSet(true)) {
if (wait) {
waitForPageEviction();
}
shutdownEvictionThreads.set(true);
waitForEvictionThreads();
pushAllPagesToDisk();
if (operation != null && operation.testIntegrity()) {
int failedPages = testIntegrity(operation.repairIntegrity());
status = (failedPages > 0) ? 1 : 0;
}
closeExternalStore(cleanLog);
// NOTE(review): with assertions enabled (-ea) this aborts before a
// non-zero status can be returned to the caller, contradicting the
// documented return contract -- confirm this is intended.
assert(status == 0);
log.info("pages: encoded=" + numPagesEncoded.get() +
" decoded=" + numPagesDecoded.get() +
" split=" + numPagesSplit.get());
if (trackEncodingByteUsage) {
log.info(MetricsUtil.histogramToString("encodeFirstKeySize", metrics.encodeFirstKeySize));
log.info(MetricsUtil.histogramToString("encodeNextFirstKeySize", metrics.encodeNextFirstKeySize));
log.info(MetricsUtil.histogramToString("encodeKeySize", metrics.encodeKeySize));
log.info(MetricsUtil.histogramToString("encodeValueSize", metrics.encodeValueSize));
log.info(MetricsUtil.histogramToString("encodePageSize (final)",
metrics.encodePageSize));
log.info(MetricsUtil.histogramToString("numberKeysPerPage",
metrics.numberKeysPerPage));
}
}
return status;
}
/**
 * Obtain a {@link BackgroundEvictionTask}, preferring a recycled instance
 * from {@link #evictionTaskQueue}; a fresh task (with the configured
 * fixed eviction count) is created only when the queue is empty.
 */
private BackgroundEvictionTask getEvictionTask() {
    BackgroundEvictionTask recycled = evictionTaskQueue.poll();
    return (recycled != null) ? recycled : new BackgroundEvictionTask(fixedNumberEvictions);
}
/**
 * Place a BackgroundEvictionTask object onto
 * the shared queue so that other threads may
 * re-use this object. The queue is constructed without a capacity bound
 * (see the constructor), so this add cannot fail.
 */
private void putEvictionTask(BackgroundEvictionTask task) {
evictionTaskQueue.add(task);
}
/**
 * Initiate an orderly shutdown of the purge and eviction thread pools and
 * block until each terminates or {@link #threadPoolWaitShutdownSeconds}
 * elapses per pool.
 *
 * Fix: the original swallowed InterruptedException without restoring the
 * thread's interrupt status, hiding the interruption from callers.
 */
private void waitForEvictionThreads() {
    purgeThreadPool.shutdown();
    evictionThreadPool.shutdown();
    try {
        purgeThreadPool.awaitTermination(threadPoolWaitShutdownSeconds, TimeUnit.SECONDS);
        evictionThreadPool.awaitTermination(threadPoolWaitShutdownSeconds, TimeUnit.SECONDS);
    } catch (InterruptedException ex) {
        // Re-assert the interrupt so code further up the stack can
        // observe that the wait was interrupted.
        Thread.currentThread().interrupt();
    }
}
/**
 * Evict pages while eviction is warranted, reusing a single pooled byte
 * buffer for page serialization; the buffer is released even on error.
 *
 * Bug fix: the loop condition previously tested
 * {@code shutdownEvictionThreads.get()} without negation, so eviction
 * only ran once shutdown had been requested -- the opposite of the
 * intent. The purge counterpart (backgroundPurge) shows the correct
 * polarity: loop while shutdown has NOT been requested.
 */
public void backgroundEviction() {
    ByteBufOutputStream byteStream = new ByteBufOutputStream(PooledByteBufAllocator.DEFAULT.buffer());
    try {
        while (!shutdownEvictionThreads.get() && shouldEvictPage() && doEvictPage(byteStream)) ;
    } finally {
        // Balance the buffer() allocation above (reference-counted).
        byteStream.buffer().release();
    }
}
/**
 * Runnable that performs page eviction. A positive eviction budget evicts
 * that fixed number of pages per run; a non-positive budget selects the
 * open-ended mode that keeps evicting while eviction is warranted (see
 * {@link #backgroundEviction()}). Any exception is logged rather than
 * allowed to kill the scheduled executor's periodic task.
 */
public class BackgroundEvictionTask implements Runnable {

    /** Unique identifier drawn from the shared evictionId counter. */
    private final int id;

    /** Fixed eviction budget; values <= 0 select the open-ended mode. */
    private final int maxEvictions;

    public BackgroundEvictionTask(int evictions) {
        this.id = evictionId.getAndIncrement();
        this.maxEvictions = evictions;
    }

    @Override
    public void run() {
        try {
            if (maxEvictions > 0) {
                fixedNumberEviction(maxEvictions);
            } else {
                backgroundEviction();
            }
        } catch (Exception ex) {
            logException("Uncaught exception in eviction task", ex);
        }
    }
}
/**
 * Register a page as a purge candidate by adding its first key to the
 * purge set. The left sentinel page (first key negInf) is never eligible,
 * and the size counter is only bumped when the key was newly added.
 */
@Override
protected void addToPurgeSet(Page<K, V> page) {
    K firstKey = page.getFirstKey();
    if (firstKey.equals(negInf)) {
        return; // the left sentinel page may never be purged
    }
    if (purgeSet.add(firstKey)) {
        purgeSetSize.getAndIncrement();
    }
}
}