/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.store.kv;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ExecutionException;
import java.util.zip.GZIPInputStream;
import com.addthis.basis.util.ClosableIterator;
import com.addthis.basis.util.Parameter;
import com.addthis.codec.codables.BytesCodable;
import com.addthis.hydra.store.db.IReadWeighable;
import com.addthis.hydra.store.db.ReadDBKeyCoder;
import com.addthis.hydra.store.kv.metrics.ExternalPagedStoreMetrics;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.cache.Weigher;
import com.jcraft.jzlib.InflaterInputStream;
import com.ning.compress.lzf.LZFInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xerial.snappy.SnappyInputStream;
/**
* read only caching page store intended to play nice with the query system
* <p/>
* interacts with an external k-(page of v) store which presumably keeps pages (grouped lists)
* of the type v given to us
* <p/>
* performs three functions:
* exposes methods to get individual vs (eg tree nodes) from the backing store
* provides iterators to allow for efficient traversal of the store by individual k/v
* caches pages to make both kinds of access more efficient for repeated reads
* <p/>
* TODO:
* toString
* <p/>
* support reading of either pages or efficient 'page size 1' (individual nodes) from the backing store
* to support mixing the two if desired (eg a tree branch optimized for random access)
*
* @param <K> - key for individual values, also used for pages
* @param <V> - type of object stored (the backing store will have type Page<V>)
*/
public class ReadExternalPagedStore<K extends Comparable<K>, V extends IReadWeighable & BytesCodable> {
// JVM-wide switch ("eps.debug.collect", default false) that turns metrics collection on for every store
private static final boolean collectMetricsParameter = Parameter.boolValue("eps.debug.collect", false);
private static final Logger log = LoggerFactory.getLogger(ReadExternalPagedStore.class);
// when true ("eps.keys.debug", default false) TreePage.checkKey verifies that looked-up keys fall inside the page's key range
private final boolean checkKeyRange = Parameter.boolValue("eps.keys.debug", false);
// metrics sink; null whenever collectMetrics is false, so every use must be guarded by collectMetrics
private final ExternalPagedStoreMetrics metrics;
// true when either the global parameter or the constructor argument requested metrics
private final boolean collectMetrics;
// number of bits the page flags byte is shifted right by to obtain the page encoding type (see pageDecode)
protected static final int TYPE_BIT_OFFSET = 5;
/**
* guava loading cache for storing pages. Get method takes the exact page key, so finding the
* page key must be done first.
*/
private final LoadingCache<K, TreePage> loadingPageCache;
// backing byte store; maps encoded page keys to encoded pages
private final ByteStore pages;
// encodes/decodes keys and decodes values; package-private
final KeyCoder<K, V> keyCoder;
/**
* Creates a store that collects metrics only if the global "eps.debug.collect" parameter is set.
* Delegates to the five-argument constructor with {@code collect = false}.
*
* @param keyCoder  coder used to encode/decode keys and decode values
* @param pages     backing byte store holding the encoded pages
* @param maxSize   maximum number of cached pages; only used when {@code maxWeight} is 0
* @param maxWeight maximum total cache weight; 0 selects count-based eviction instead
*/
public ReadExternalPagedStore(KeyCoder<K, V> keyCoder, final ByteStore pages,
int maxSize, int maxWeight) {
this(keyCoder, pages, maxSize, maxWeight, false);
}
/**
 * Creates a read-only paged store on top of the given byte store and sets up the page cache.
 * <p/>
 * The cache evicts by weight (sum of each page's {@code originalByteSize}) when
 * {@code maxWeight} is non-zero, and by page count ({@code maxSize}) otherwise. Both
 * variants share the same loader, which fetches the encoded page for the exact page key
 * and decodes it; a missing page makes the load fail with an {@link ExecutionException}.
 *
 * @param keyCoder  coder used to encode/decode keys and decode values
 * @param pages     backing byte store holding the encoded pages
 * @param maxSize   maximum number of cached pages; only used when {@code maxWeight} is 0
 * @param maxWeight maximum total cache weight; 0 selects count-based eviction instead
 * @param collect   true to collect metrics even if the global parameter is off
 */
public ReadExternalPagedStore(final KeyCoder<K, V> keyCoder, final ByteStore pages,
        int maxSize, int maxWeight, boolean collect) {
    this.keyCoder = keyCoder;
    this.pages = pages;
    log.info("[init] maxSize=" + maxSize + " maxWeight=" + maxWeight);
    collectMetrics = collectMetricsParameter || collect;
    metrics = collectMetrics ? new ExternalPagedStoreMetrics() : null;

    // One loader serves both cache flavours: fetch the encoded page, then decode it.
    final CacheLoader<K, TreePage> pageLoader = new CacheLoader<K, TreePage>() {
        @Override
        public TreePage load(K key) throws Exception {
            byte[] encodedPage = pages.get(keyCoder.keyEncode(key));
            if (encodedPage == null) {
                throw new ExecutionException("Source did not have page",
                        new NullPointerException());
            }
            return pageDecode(encodedPage);
        }
    };

    if (maxWeight == 0) {
        // No weight limit configured: fall back to bounding the number of cached pages.
        loadingPageCache = CacheBuilder.newBuilder()
                .maximumSize(maxSize)
                .build(pageLoader);
    } else {
        // Prefer evicting on weight instead of page count
        Weigher<K, TreePage> byOriginalSize = new Weigher<K, TreePage>() {
            @Override
            public int weigh(K key, TreePage value) {
                return value.originalByteSize;
            }
        };
        loadingPageCache = CacheBuilder.newBuilder()
                .weigher(byOriginalSize)
                .maximumWeight(maxWeight)
                .build(pageLoader);
    }
}
/**
 * Returns the key coder viewed as a {@link ReadDBKeyCoder}.
 * <p/>
 * The cast is unchecked: it fails with a {@link ClassCastException} if the store was
 * constructed with a coder that is not a {@code ReadDBKeyCoder}.
 *
 * @return the key coder supplied at construction time
 */
@SuppressWarnings({"unchecked", "rawtypes"})
public ReadDBKeyCoder<V> getKeyCoder() {
    ReadDBKeyCoder rawCoder = (ReadDBKeyCoder) keyCoder;
    return rawCoder;
}
/**
 * Returns the decoded form of the first key reported by the backing store.
 *
 * @return whatever the key coder produces for the backing store's first key
 */
public K getFirstKey() {
    byte[] encodedFirstKey = pages.firstKey();
    return keyCoder.keyDecode(encodedFirstKey);
}
/**
 * Finds the encoded key of the page that would contain {@code key}: the result of the
 * backing store's {@code floorKey} lookup on the encoded key.
 *
 * @param key individual (non-page) key to locate
 * @return the encoded page key as returned by the backing store's floor lookup
 */
public byte[] getPageKeyForKey(K key) {
    return pages.floorKey(keyCoder.keyEncode(key));
}
/**
 * Returns the page whose key range covers {@code key}, loading and caching it if needed.
 * <p/>
 * The page key is found with {@link #getPageKeyForKey(Object)} and then decoded; the page
 * itself comes from the loading cache. A checked load failure (for example the backing
 * store no longer having the page) is logged and reported to the caller as {@code null};
 * unchecked failures thrown by the cache still propagate.
 *
 * @param key individual (non-page) key to locate
 * @return the covering page, or {@code null} if no page key was found or the load failed
 */
public KeyValuePage<K, V> getOrLoadPageForKey(K key) {
    // NOTE(review): relies on keyDecode returning null for a null floor key - confirm.
    K pageKey = keyCoder.keyDecode(getPageKeyForKey(key));
    if (pageKey == null) {
        return null;
    }
    try {
        return loadingPageCache.get(pageKey);
    } catch (ExecutionException e) {
        // Callers still get null (best effort), but the failure is no longer invisible.
        log.warn("Failed to load page " + pageKey + " while looking up key " + key, e);
        return null;
    }
}
/**
 * Looks up the value stored under {@code key}.
 * <p/>
 * Returns {@code null} when no page covers the key (or the page could not be loaded) and
 * when the covering page has no entry for the key. The result, including {@code null}, is
 * reported to the metrics collector when metrics are enabled.
 * <p/>
 * TODO: Might as well store TreePage keys as undecoded bytes if we only use this method?
 *
 * @param key individual (non-page) key to look up
 * @return the decoded value, or {@code null} if absent
 */
public V getValue(K key) {
    KeyValuePage<K, V> page = getOrLoadPageForKey(key);
    // getOrLoadPageForKey yields null for an uncovered key or a failed load; treat as a miss.
    V value = (page != null) ? page.getValue(key) : null;
    if (collectMetrics) {
        metrics.updateGetValue(value);
    }
    return value;
}
/**
* Closes the backing byte store. The page cache is left untouched.
*/
public void close() {
pages.close();
}
/**
 * Decodes the bytes returned by the backing store into a {@link TreePage}.
 * <p/>
 * The first byte is a flags byte: its low four bits select the decompression wrapper
 * (see {@link #openPageStream(InputStream, int)}) and the bits from
 * {@link #TYPE_BIT_OFFSET} upwards select the page encoding (0 legacy, 1 sparse,
 * 2 long ids). The remainder holds the entry count, the first key, the next page's
 * first key and then the key/value byte pairs. Values are kept as raw bytes and decoded
 * lazily by {@link PageValue}.
 * <p/>
 * The (possibly decompressing) stream is closed on every path, including failures.
 *
 * @param page encoded page bytes
 * @return the decoded page with {@code originalByteSize} set to the summed lengths read
 * @throws IllegalStateException if the page encoding type is unknown
 * @throws RuntimeException      wrapping any checked exception raised while decoding
 */
private TreePage pageDecode(byte[] page) {
    ByteArrayInputStream rawBytes = new ByteArrayInputStream(page);
    int flags = rawBytes.read() & 0xff;
    int gztype = flags & 0x0f;
    int pageType = flags >>> TYPE_BIT_OFFSET;
    try (InputStream in = openPageStream(rawBytes, gztype)) {
        PageEncodeType pageEncodeType;
        DataInputStream dis = null;
        switch (pageType) {
            case 0:
                pageEncodeType = PageEncodeType.LEGACY;
                break;
            case 1:
                pageEncodeType = PageEncodeType.SPARSE;
                dis = new DataInputStream(in);
                break;
            case 2:
                pageEncodeType = PageEncodeType.LONGIDS;
                dis = new DataInputStream(in);
                break;
            default:
                throw new IllegalStateException("unknown page type " + pageType);
        }
        int entries = pageEncodeType.readInt(in, dis);
        if (collectMetrics) {
            metrics.updatePageSize(entries);
        }
        byte[] firstKeyBytes = pageEncodeType.readBytes(in, dis);
        K firstKey = keyCoder.keyDecode(firstKeyBytes);
        byte[] nextFirstKeyBytes = pageEncodeType.nextFirstKey(in, dis);
        K nextFirstKey = keyCoder.keyDecode(nextFirstKeyBytes);
        TreePage decode = new TreePage(firstKey).setNextFirstKey(nextFirstKey);
        // 4 accounts for the entry count; the rest is the sum of key/value byte lengths
        decode.originalByteSize = 4 + firstKeyBytes.length;
        if (nextFirstKeyBytes != null) {
            decode.originalByteSize += nextFirstKeyBytes.length;
        }
        for (int i = 0; i < entries; i++) {
            byte[] kb = pageEncodeType.readBytes(in, dis);
            decode.originalByteSize += kb.length;
            byte[] vb = pageEncodeType.readBytes(in, dis);
            decode.originalByteSize += vb.length;
            K key = keyCoder.keyDecode(kb, firstKey, pageEncodeType);
            decode.map.put(key, new PageValue(vb, pageEncodeType));
        }
        //ignoring memory data
        log.debug("decoded {}", decode);
        return decode;
    } catch (RuntimeException ex) {
        throw ex;
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
}

/**
 * Wraps {@code source} in the decompressing stream selected by the page's compression code:
 * 1 jzlib inflater, 2 gzip, 3 LZF, 4 Snappy. Any other code returns the stream unwrapped.
 */
private static InputStream openPageStream(InputStream source, int gztype) throws IOException {
    switch (gztype) {
        case 1:
            return new InflaterInputStream(source);
        case 2:
            return new GZIPInputStream(source);
        case 3:
            return new LZFInputStream(source);
        case 4:
            return new SnappyInputStream(source);
        default:
            return source;
    }
}
/**
 * wrapper around an individual (non-paged) value V that allows for selective
 * decoding of tree nodes from a page. Pages start off with a bunch of these.
 * <p/>
 * Decoding is lazy and lock-free. Both fields are volatile: the decoded value is always
 * published before the raw bytes are cleared, so a reader that observes cleared raw
 * bytes is guaranteed to observe the decoded value as well. Two threads racing on the
 * first access may each decode the bytes once; whichever writes last wins.
 */
private final class PageValue {
    // encoded value; cleared once decoded so the bytes can be collected
    private volatile byte[] raw;
    // decoded value; null until the first successful decode
    private volatile V realValue;
    private final PageEncodeType encodeType;

    PageValue(byte[] raw, PageEncodeType encodeType) {
        this.raw = raw;
        this.encodeType = encodeType;
    }

    @Override
    public String toString() {
        // snapshot both fields so a concurrent decode cannot null raw between check and use
        V decoded = realValue;
        byte[] r = raw;
        return "PV:" + (decoded != null ? decoded : r != null ? "{raw:" + r.length + "}" : "null");
    }

    /**
     * Returns the decoded value, decoding the raw bytes on first use.
     *
     * @return the decoded value; null only if the coder decoded the bytes to null or
     *         the wrapper was built without raw bytes
     */
    public V value() {
        V decoded = realValue;
        if (decoded != null) {
            return decoded;
        }
        byte[] r = raw;
        if (r == null) {
            // raw is cleared only after realValue was written, so this re-read sees it
            return realValue;
        }
        decoded = keyCoder.valueDecode(r, encodeType);
        realValue = decoded;
        raw = null;
        return decoded;
    }
}
/**
 * Read-only, in-memory form of one page. Instances are built by pageDecode, which fills
 * {@code map} and {@code originalByteSize} directly. All mutating operations of
 * {@link KeyValuePage} are unsupported. The page also serves as the comparator of its
 * own map, delegating to the enclosing store's {@code compareKeys}.
 */
private final class TreePage implements KeyValuePage<K, V>, Comparator<K> {

    // ordered mapping of individual keys to lazily decoded values
    private final TreeMap<K, PageValue> map;
    // first key of this page; also the key of the page in the backing store
    private final K firstKey;
    // first key of the following page as recorded in this page; null if none was recorded
    private K nextFirstKey;
    // sum of the byte lengths read while decoding; used as the cache weight
    private int originalByteSize;

    TreePage(K firstKey) {
        this.firstKey = firstKey;
        this.map = new TreeMap<>(this);
    }

    TreePage setNextFirstKey(K nextFirstKey) {
        this.nextFirstKey = nextFirstKey;
        return this;
    }

    @Override
    public String toString() {
        return "tp[" + map.size() + "," + firstKey + "," + nextFirstKey + "]";
    }

    /**
     * When key-range checking is enabled, rejects keys outside
     * [firstKey, nextFirstKey); otherwise does nothing.
     */
    void checkKey(K key) {
        if (!checkKeyRange) {
            return;
        }
        boolean outOfRange = key.compareTo(firstKey) < 0;
        if (!outOfRange && nextFirstKey != null) {
            outOfRange = key.compareTo(nextFirstKey) >= 0;
        }
        if (outOfRange) {
            throw new RuntimeException("getPut out of range " + key + " compared to " + firstKey + " - " + nextFirstKey);
        }
    }

    @Override
    public boolean containsKey(K key) {
        return map.containsKey(key);
    }

    @Override
    public K getFirstKey() {
        return firstKey;
    }

    /** Returns the largest key held by this page, or null if the page is empty. */
    @Override
    public K getLastKey() {
        return map.isEmpty() ? null : map.lastKey();
    }

    /** Returns the decoded value for the key, or null if the page has no such entry. */
    @Override
    public V getValue(K key) {
        checkKey(key);
        PageValue wrapper = map.get(key);
        return (wrapper == null) ? null : wrapper.value();
    }

    @Override
    public V getPutValue(K key, V val) {
        throw new UnsupportedOperationException();
    }

    @Override
    public V getRemoveValue(K key) {
        throw new UnsupportedOperationException();
    }

    @Override
    public void putValue(K key, V val) {
        throw new UnsupportedOperationException();
    }

    @Override
    public void removeValue(K key) {
        throw new UnsupportedOperationException();
    }

    @Override
    public void removeValues(K start, K end) {
        throw new UnsupportedOperationException();
    }

    /**
     * Iterates the page's entries starting at {@code start} (inclusive), or over the whole
     * page when {@code start} is null.
     */
    @Override
    public Iterator<Map.Entry<K, V>> range(K start) {
        SortedMap<K, PageValue> view = map;
        if (start != null) {
            view = map.tailMap(start);
        }
        if (log.isDebugEnabled()) {
            log.debug("range start=" + start + " tailMap=" + view + " map=" + map);
        }
        return new TreePageIterator(view);
    }

    @Override
    public K getNextFirstKey() {
        return nextFirstKey;
    }

    @Override
    public int compareKeys(K k1, K k2) {
        return ReadExternalPagedStore.this.compareKeys(k1, k2);
    }

    @Override
    public int compare(K o1, K o2) {
        return ReadExternalPagedStore.this.compareKeys(o1, o2);
    }
}
/**
 * allows iterators to preserve deferred decoding
 * <p/>
 * iterates over K,V pairs (non-page Ks to non-paged Vs)
 * <p/>
 * used by TreePage to provide an iterator. Entries decode their value only when
 * {@code getValue()} is called. The iterator is read-only: pages live in a shared cache
 * and every other mutator of the page is unsupported, so {@code remove()} is too.
 * <p/>
 * TODO delete class
 */
private final class TreePageIterator implements Iterator<Map.Entry<K, V>> {

    private final Iterator<Map.Entry<K, PageValue>> iter;

    /** Read-only entry view that defers value decoding until the value is requested. */
    private final class TreePageIterEntry implements Map.Entry<K, V> {

        private final Map.Entry<K, PageValue> entry;

        TreePageIterEntry(Map.Entry<K, PageValue> entry) {
            this.entry = entry;
        }

        @Override
        public K getKey() {
            return entry.getKey();
        }

        @Override
        public V getValue() {
            return entry.getValue().value();
        }

        @Override
        public V setValue(V value) {
            throw new UnsupportedOperationException();
        }
    }

    //Is handed a portion of a TreePage's TreeMap (as a sorted map named tailmap).
    public TreePageIterator(SortedMap<K, PageValue> tailMap) {
        iter = tailMap.entrySet().iterator();
    }

    @Override
    public String toString() {
        return "TPI:" + iter;
    }

    @Override
    public boolean hasNext() {
        return iter.hasNext();
    }

    @Override
    public Map.Entry<K, V> next() {
        return new TreePageIterEntry(iter.next());
    }

    /**
     * Always fails: removing through the iterator would silently alter a cached page
     * without touching the backing store.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
* since we have the nextFirstKey pointer in the TreePage class, we use our own iterator instead of one
* from the backing page store.
* <p/>
* this would be less efficient if there is a significant cost to using store.get(key) over the pages
* returned by the store's iterator. I do not believe that to be the case.
*
* @param start key whose covering page is returned first; null starts at the store's first key
* @return an iterator over pages, beginning with the page covering {@code start}
*/
public Iterator<KeyValuePage<K, V>> getPageIterator(final K start) {
return new PageIterator(start);
}
/**
 * Iterates over whole pages, beginning with the page that covers the start key and then
 * following each page's {@code nextFirstKey} pointer. Pages are obtained through
 * {@code getOrLoadPageForKey}, so they are served from (and loaded into) the page cache.
 * Iteration ends when a page has no next-first-key or the following page cannot be
 * obtained.
 * <p/>
 * TODO: optionally(?) prebuffer the next page or delegate that to a sub-iterator
 * TODO: keep pointer to page and next key instead of two pages
 */
private final class PageIterator implements Iterator<KeyValuePage<K, V>> {

    // page to hand out on the next call to next(); null until looked up
    private KeyValuePage<K, V> nextPage;
    // page most recently handed out; source of the pointer to the following page
    private KeyValuePage<K, V> page;

    public PageIterator(K start) {
        K effectiveStart = (start != null) ? start : getFirstKey();
        nextPage = getOrLoadPageForKey(effectiveStart);
    }

    @Override
    public String toString() {
        return "PI:" + page;
    }

    @Override
    public boolean hasNext() {
        fillNext();
        return nextPage != null;
    }

    /** Looks up the page that follows the current one, unless one is already pending. */
    private void fillNext() {
        if (nextPage != null || page == null) {
            return;
        }
        K followingKey = page.getNextFirstKey();
        if (followingKey != null) {
            nextPage = getOrLoadPageForKey(followingKey);
        }
    }

    @Override
    public KeyValuePage<K, V> next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        page = nextPage;
        nextPage = null;
        return page;
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
 * Iterates over individual key/value pairs from {@code start} (inclusive) to the end of
 * the store, crossing page boundaries as needed.
 *
 * @param start first key to return; null starts at the beginning of the store
 * @return an iterator bounded on the left side only
 */
public Iterator<Map.Entry<K, V>> range(K start) {
    // a stream of pages beginning with the one that covers start...
    Iterator<KeyValuePage<K, V>> pageSource = getPageIterator(start);
    // ...flattened into entries, skipping those before start
    return new BoundedIterator(pageSource, start);
}
/**
 * Flattens a stream of pages into a stream of key/value pairs (similar in spirit to
 * TreePageIterator, not to be confused with PageIterator).
 * <p/>
 * "Bounded" means bounded on the LEFT side only: every page is asked for its entries from
 * {@code firstKey} onwards, and iteration continues to the end of the page stream.
 * <p/>
 * TODO ----
 * Probably should try to buffer next page in many cases.
 */
private class BoundedIterator implements ClosableIterator<Map.Entry<K, V>> {

    // lower bound passed to every page's range() call
    private K firstKey;
    // supplies pages
    private Iterator<KeyValuePage<K, V>> pageIterator;
    // entries of the current page; null when the current page is used up
    private Iterator<Map.Entry<K, V>> valueIterator;
    // page that valueIterator was created from
    private KeyValuePage<K, V> nextPage;
    // entry most recently returned by next()
    private Map.Entry<K, V> lastEntry;
    // entry fetched ahead of time for the upcoming next() call
    private Map.Entry<K, V> nextEntry;

    BoundedIterator(Iterator<KeyValuePage<K, V>> iterator, K firstKey) {
        this.pageIterator = iterator;
        this.firstKey = firstKey;
    }

    @Override
    public String toString() {
        return new StringBuilder("BI:")
                .append(firstKey).append(",")
                .append(pageIterator).append(",")
                .append(valueIterator).append(",")
                .append(nextPage).append(",")
                .append(lastEntry).append(",")
                .append(nextEntry)
                .toString();
    }

    /** Ensures a pending entry is available if any entries remain. */
    private void fillNext() {
        /* first: advance through pages until one yields at least one entry */
        while (valueIterator == null && pageIterator != null && pageIterator.hasNext()) {
            nextPage = pageIterator.next();
            Iterator<Map.Entry<K, V>> candidate = nextPage.range(firstKey);
            valueIterator = candidate.hasNext() ? candidate : null;
        }
        /* second: pull one entry unless one is already pending */
        if (nextEntry != null || valueIterator == null || !valueIterator.hasNext()) {
            return;
        }
        nextEntry = valueIterator.next();
        if (!valueIterator.hasNext()) {
            // page used up: drop it so the following call moves on to the next page
            valueIterator = null;
            nextPage = null;
        }
    }

    /** Nothing to release; present to satisfy ClosableIterator. */
    @Override
    public void close() {
    }

    @Override
    public boolean hasNext() {
        fillNext();
        return nextEntry != null;
    }

    @Override
    public Map.Entry<K, V> next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        Map.Entry<K, V> current = nextEntry;
        lastEntry = current;
        nextEntry = null;
        return current;
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
* Orders two keys by their natural ordering. Used by TreePage as the comparator of its map.
*
* @return the result of {@code k1.compareTo(k2)}
*/
public int compareKeys(K k1, K k2) {
return k1.compareTo(k2);
}
/**
* Returns the metrics collector.
*
* @return the collector, or null when metrics collection is disabled for this store
*/
public ExternalPagedStoreMetrics getMetrics() {
return metrics;
}
/**
 * Scans every page of the backing store in key order and checks that the pages chain
 * together consistently. For each page that has a successor in the backing store it
 * verifies that:
 * the page records a next-first-key,
 * that key equals the successor's key, and
 * the page's largest key is strictly smaller than the successor's key.
 * <p/>
 * Each violation is logged and counted (and trips an {@code assert} when assertions are
 * enabled). Progress is logged every 10000 pages and a summary at the end. Pages are
 * loaded through the page cache. A checked load failure stops the scan and is logged
 * with its stack trace. If the backing store reports no first key there is nothing to scan.
 */
public void testIntegrity() {
    int counter = 0;
    int failedPages = 0;
    try {
        byte[] encodedKey = pages.firstKey();
        // Guard: a null first key must not reach the cache, which rejects null keys.
        K key = (encodedKey != null) ? keyCoder.keyDecode(encodedKey) : null;
        while (encodedKey != null) {
            KeyValuePage<K, V> newPage = loadingPageCache.get(key);
            byte[] encodedNextKey = pages.higherKey(encodedKey);
            if (encodedNextKey != null) {
                K nextKey = keyCoder.keyDecode(encodedNextKey);
                K nextFirstKey = newPage.getNextFirstKey();
                K firstKey = newPage.getFirstKey();
                K lastKey = newPage.getLastKey();
                if (nextFirstKey == null) {
                    failedPages++;
                    log.warn("On page " + counter + " the firstKey is " +
                             firstKey +
                             " the nextFirstKey is null" +
                             " and the next page is associated with key " + nextKey);
                    assert(false);
                } else if (!nextFirstKey.equals(nextKey)) {
                    failedPages++;
                    int compareTo = compareKeys(nextFirstKey, nextKey);
                    char direction = compareTo > 0 ? '>' : '<';
                    log.warn("On page " + counter + " the firstKey is " +
                             firstKey +
                             " the nextFirstKey is " + nextFirstKey +
                             " which is " + direction + " the next page is associated with key " + nextKey);
                    assert(false);
                } else if (lastKey != null && compareKeys(lastKey, nextKey) >= 0) {
                    failedPages++;
                    log.warn("On page " + counter + " the firstKey is " +
                             firstKey + " the largest key is " + lastKey +
                             " the next key is " + nextKey +
                             " which is less than or equal to the largest key.");
                    assert(false);
                }
                // the decoded successor key is reused for the next cache lookup
                key = nextKey;
            }
            encodedKey = encodedNextKey;
            counter++;
            if (counter % 10000 == 0) {
                log.info("Scanned " + counter + " pages. Detected " + failedPages + " failed pages.");
            }
        }
    } catch (ExecutionException ex) {
        // keep the original message but attach the exception so the cause chain is logged
        log.error(ex.toString(), ex);
    }
    log.info("Scan complete. Scanned " + counter + " pages. Detected " + failedPages + " failed pages.");
}
}