package com.jivesoftware.os.amza.service.storage.delta;
import com.google.common.collect.Iterators;
import com.google.common.collect.Maps;
import com.google.common.primitives.Longs;
import com.jivesoftware.os.amza.api.IoStats;
import com.jivesoftware.os.amza.api.partition.PartitionProperties;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.scan.RowStream;
import com.jivesoftware.os.amza.api.stream.FpKeyValueStream;
import com.jivesoftware.os.amza.api.stream.KeyValuePointerStream;
import com.jivesoftware.os.amza.api.stream.KeyValues;
import com.jivesoftware.os.amza.api.stream.UnprefixedWALKeys;
import com.jivesoftware.os.amza.api.stream.WALKeyPointerStream;
import com.jivesoftware.os.amza.api.wal.KeyUtil;
import com.jivesoftware.os.amza.api.wal.WALIndex;
import com.jivesoftware.os.amza.api.wal.WALKey;
import com.jivesoftware.os.amza.api.wal.WALPointer;
import com.jivesoftware.os.amza.api.wal.WALPrefix;
import com.jivesoftware.os.amza.service.storage.PartitionIndex;
import com.jivesoftware.os.amza.service.storage.PartitionStore;
import com.jivesoftware.os.amza.service.take.HighwaterStorage;
import com.jivesoftware.os.jive.utils.collections.bah.ConcurrentBAHash;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang.mutable.MutableBoolean;
/**
* @author jonathan.colt
*/
class PartitionDelta {
// Logger obtained from MetricLoggerFactory; used by merge() for progress messages.
public static final MetricLogger LOG = MetricLoggerFactory.getLogger();
// Partition (name + version) this delta holds pending updates for.
private final VersionedPartitionName versionedPartitionName;
// Delta WAL this delta delegates acquire/release, hydration and row taking to.
private final DeltaWAL deltaWAL;
// Values whose length is at most this many bytes are stored inline in the WALPointer; a negative value disables inlining (see put).
private final int maxValueSizeInIndex;
// The delta currently being merged, or null. Lookups fall back to it and merge() clears it. Also used as the monitor guarding acquire/clear.
private final AtomicReference<PartitionDelta> mergingDelta;
// Hash lookup from composed (prefix + key) bytes to the most recently put pointer.
private final ConcurrentBAHash<WALPointer> pointerIndex = new ConcurrentBAHash<>(3, true, 4);
// Same entries as pointerIndex (put writes both), ordered by KeyUtil::compare for scans.
private final ConcurrentSkipListMap<byte[], WALPointer> orderedIndex = new ConcurrentSkipListMap<>(KeyUtil::compare);
// Per-prefix lists of TxFps, appended in txId order.
private final Map<WALPrefix, AppendOnlyConcurrentArrayList> prefixTxFpIndex = Maps.newConcurrentMap();
// List of TxFps across all prefixes, appended in txId order.
private final AppendOnlyConcurrentArrayList txIdWAL = new AppendOnlyConcurrentArrayList(11); //TODO expose to config
// Number of row fps appended via appendTxFps since shouldWriteHighwater() last reset it.
private final AtomicLong updatesSinceLastHighwaterFlush = new AtomicLong();
/**
 * Creates a delta for a single versioned partition.
 *
 * @param versionedPartitionName partition this delta belongs to
 * @param deltaWAL delta WAL that backs the pointers held by this delta
 * @param maxValueSizeInIndex largest value length kept inline in the index; a negative value disables inlining
 * @param merging delta that is still being merged, or {@code null} when there is none
 */
PartitionDelta(VersionedPartitionName versionedPartitionName,
    DeltaWAL deltaWAL,
    int maxValueSizeInIndex,
    PartitionDelta merging) {
    this.mergingDelta = new AtomicReference<>(merging);
    this.maxValueSizeInIndex = maxValueSizeInIndex;
    this.deltaWAL = deltaWAL;
    this.versionedPartitionName = versionedPartitionName;
}
/**
 * Acquires the backing delta WAL by delegating to {@code DeltaWAL.acquire()}.
 * Pair every call with {@link #release()}.
 */
void acquire() {
    this.deltaWAL.acquire();
}
/**
 * Releases the backing delta WAL by delegating to {@code DeltaWAL.release()}.
 */
void release() {
    this.deltaWAL.release();
}
/**
 * @return the id reported by the backing delta WAL
 */
private long getDeltaWALId() {
    long id = deltaWAL.getId();
    return id;
}
/**
 * @return the previous id reported by the backing delta WAL
 */
private long getPrevDeltaWALId() {
    long prevId = deltaWAL.getPrevId();
    return prevId;
}
/**
 * @return {@code true} while a merging delta is still attached to this delta
 */
boolean isMerging() {
    PartitionDelta current = mergingDelta.get();
    return current != null;
}
/**
 * @return {@code true} when at least one transaction has been recorded in this delta
 */
boolean needsToMerge() {
    boolean nothingRecorded = txIdWAL.isEmpty();
    return !nothingRecorded;
}
/**
 * @return the number of entries in this delta's pointer index (the merging delta is not counted)
 */
public long size() {
    long entryCount = pointerIndex.size();
    return entryCount;
}
/*public long mergedSize() {
PartitionDelta merge = mergingDelta.get();
return pointerIndex.size() + (merge != null ? merge.size() : 0);
}*/
/**
 * Reads the current merging delta and, if there is one, acquires it. Both steps happen while
 * holding the {@code mergingDelta} monitor, the same monitor {@code merge} holds when it clears
 * the reference.
 *
 * @return the acquired merging delta, or {@code null} when none is attached; a non-null result
 *         must be handed back to {@link #releaseMerging(PartitionDelta)}
 */
private PartitionDelta acquireMerging() {
    synchronized (mergingDelta) {
        PartitionDelta current = mergingDelta.get();
        if (current == null) {
            return null;
        }
        current.acquire();
        return current;
    }
}
/**
 * Releases a delta previously returned by {@link #acquireMerging()}.
 *
 * @param partitionDelta the acquired merging delta; must not be {@code null}
 */
private void releaseMerging(PartitionDelta partitionDelta) {
    PartitionDelta acquired = partitionDelta;
    acquired.release();
}
/**
 * Looks up each requested key in this delta and streams what it finds.
 * <ul>
 * <li>A pointer with an inline value is emitted directly to {@code fpKeyValueStream}.</li>
 * <li>A pointer without an inline value has its fp passed to the fp stream supplied by
 * {@code deltaWAL.hydrate} (presumably so the WAL reads the row — confirm against DeltaWAL).</li>
 * <li>A key unknown to this delta is forwarded to the merging delta when one is attached;
 * otherwise a placeholder (fp -1, null value, timestamp -1, version -1) is emitted.</li>
 * </ul>
 *
 * @param ioStats passed through to the delta WAL
 * @param prefix prefix shared by all requested keys
 * @param keys source of the unprefixed keys to look up
 * @param fpKeyValueStream receiver of the results
 * @return whatever {@code deltaWAL.hydrate} returns
 * @throws Exception propagated from the streams or the WAL
 */
private boolean streamRawValues(IoStats ioStats, byte[] prefix, UnprefixedWALKeys keys, FpKeyValueStream fpKeyValueStream) throws Exception {
    return deltaWAL.hydrate(ioStats, fpStream -> {
        PartitionDelta fallback = acquireMerging();
        if (fallback == null) {
            // No merging delta: keys unknown here are reported as missing.
            return keys.consume(key -> {
                WALPointer pointer = pointerIndex.get(WALKey.compose(prefix, key));
                if (pointer == null) {
                    return fpKeyValueStream.stream(-1, null, prefix, key, null, -1, false, -1);
                }
                if (!pointer.getHasValue()) {
                    return fpStream.stream(pointer.getFp());
                }
                return fpKeyValueStream.stream(pointer.getFp(),
                    null,
                    prefix,
                    key,
                    pointer.getValue(),
                    pointer.getTimestampId(),
                    pointer.getTombstoned(),
                    pointer.getVersion());
            });
        }
        try {
            // Keys unknown here are delegated to the merging delta through mergingKeyStream.
            return fallback.streamRawValues(ioStats,
                prefix,
                mergingKeyStream -> keys.consume(key -> {
                    WALPointer pointer = pointerIndex.get(WALKey.compose(prefix, key));
                    if (pointer == null) {
                        return mergingKeyStream.stream(key);
                    }
                    if (!pointer.getHasValue()) {
                        return fpStream.stream(pointer.getFp());
                    }
                    return fpKeyValueStream.stream(pointer.getFp(),
                        null,
                        prefix,
                        key,
                        pointer.getValue(),
                        pointer.getTimestampId(),
                        pointer.getTombstoned(),
                        pointer.getVersion());
                }),
                fpKeyValueStream);
        } finally {
            releaseMerging(fallback);
        }
    }, fpKeyValueStream);
}
/**
 * Streams the values for the given keys; a thin entry point over {@code streamRawValues}.
 *
 * @param ioStats passed through to the delta WAL
 * @param prefix prefix shared by all requested keys
 * @param keys source of the unprefixed keys to look up
 * @param fpKeyValueStream receiver of the results
 * @return the result of {@code streamRawValues}
 * @throws Exception propagated from the underlying streams
 */
boolean get(IoStats ioStats, byte[] prefix, UnprefixedWALKeys keys, FpKeyValueStream fpKeyValueStream) throws Exception {
    boolean result = streamRawValues(ioStats, prefix, keys, fpKeyValueStream);
    return result;
}
/**
 * Finds the pointer for a key, checking this delta first and then the merging delta.
 *
 * @param prefix key prefix
 * @param key unprefixed key
 * @return the pointer, or {@code null} when neither this delta nor the merging delta knows the key
 * @throws Exception propagated from the merging delta lookup
 */
WALPointer getPointer(byte[] prefix, byte[] key) throws Exception {
    WALPointer own = pointerIndex.get(WALKey.compose(prefix, key));
    if (own != null) {
        return own;
    }
    PartitionDelta fallback = acquireMerging();
    if (fallback == null) {
        return null;
    }
    try {
        return fallback.getPointer(prefix, key);
    } finally {
        releaseMerging(fallback);
    }
}
/**
 * For every supplied key/value, emits the supplied fields together with the fields of the pointer
 * currently known for that key. When no pointer exists the pointer fields are emitted as
 * timestamp -1, not tombstoned, version -1, fp -1, no value.
 *
 * @param keyValues source of the key/values to look up
 * @param stream receiver of the combined records
 * @return the result of {@code keyValues.consume}
 * @throws Exception propagated from the streams
 */
boolean getPointers(KeyValues keyValues, KeyValuePointerStream stream) throws Exception {
    return keyValues.consume((prefix, key, value, valueTimestamp, valueTombstone, valueVersion) -> {
        WALPointer found = getPointer(prefix, key);
        if (found == null) {
            return stream.stream(prefix, key, value, valueTimestamp, valueTombstone, valueVersion, -1, false, -1, -1, false, null);
        }
        return stream.stream(prefix, key, value, valueTimestamp, valueTombstone, valueVersion,
            found.getTimestampId(), found.getTombstoned(), found.getVersion(), found.getFp(), found.getHasValue(), found.getValue());
    });
}
/**
 * Reports, for every requested key, whether a pointer exists and, if so, its timestamp, tombstone
 * flag and version. Unknown keys are reported with timestamp -1, not tombstoned, version -1 and
 * {@code exists == false}.
 *
 * @param prefix prefix shared by all requested keys
 * @param keys source of the unprefixed keys to check
 * @param stream receiver of the per-key result
 * @return the result of {@code keys.consume}
 * @throws Exception propagated from the streams
 */
boolean containsKeys(byte[] prefix, UnprefixedWALKeys keys, KeyTombstoneExistsStream stream) throws Exception {
    return keys.consume(key -> {
        WALPointer found = getPointer(prefix, key);
        if (found == null) {
            return stream.stream(prefix, key, -1, false, -1, false);
        }
        return stream.stream(prefix, key, found.getTimestampId(), found.getTombstoned(), found.getVersion(), true);
    });
}
/**
 * Callback used by {@code containsKeys} to report the state of one key.
 */
interface KeyTombstoneExistsStream {
    /**
     * @param prefix key prefix
     * @param key unprefixed key
     * @param timestamp pointer timestamp, or -1 when the key is unknown
     * @param tombstoned pointer tombstone flag, {@code false} when the key is unknown
     * @param version pointer version, or -1 when the key is unknown
     * @param exists whether a pointer was found for the key
     * @return value handed back to the key source (presumably {@code false} stops iteration — confirm against UnprefixedWALKeys)
     */
boolean stream(byte[] prefix, byte[] key, long timestamp, boolean tombstoned, long version, boolean exists) throws Exception;
}
/**
 * Records the pointer for a key in both the hash index and the ordered index, replacing any
 * earlier pointer. The value is kept inline in the pointer only when {@code maxValueSizeInIndex}
 * is non-negative and the value length (0 for {@code null}) does not exceed it.
 *
 * @param fp file pointer of the row in the delta WAL
 * @param prefix key prefix
 * @param key unprefixed key
 * @param value row value, may be {@code null}
 * @param valueTimestamp timestamp of the value
 * @param valueTombstone whether the value is a tombstone
 * @param valueVersion version of the value
 * @throws InterruptedException declared for callers; nothing in this body throws it directly
 */
void put(long fp,
    byte[] prefix,
    byte[] key,
    byte[] value,
    long valueTimestamp,
    boolean valueTombstone,
    long valueVersion) throws InterruptedException {
    int valueLength = (value == null) ? 0 : value.length;
    boolean storeInline = maxValueSizeInIndex >= 0 && valueLength <= maxValueSizeInIndex;
    WALPointer pointer = storeInline
        ? new WALPointer(fp, valueTimestamp, valueTombstone, valueVersion, true, value)
        : new WALPointer(fp, valueTimestamp, valueTombstone, valueVersion, false, null);
    byte[] composedKey = WALKey.compose(prefix, key);
    pointerIndex.put(composedKey, pointer);
    orderedIndex.put(composedKey, pointer);
}
// True until the first call to shouldWriteHighwater() that takes the below-threshold path.
private final AtomicBoolean firstAndOnlyOnce = new AtomicBoolean(true);

/**
 * Decides whether a highwater should be written now.
 *
 * @return {@code true} when more than 1000 updates have accumulated (the counter is then reset to 0);
 *         otherwise {@code true} only for the first such call on this delta
 */
boolean shouldWriteHighwater() {
    long pending = updatesSinceLastHighwaterFlush.get();
    if (pending <= 1000) { // TODO expose to partition config
        return firstAndOnlyOnce.compareAndSet(true, false);
    }
    updatesSinceLastHighwaterFlush.set(0);
    return true;
}
/**
 * Streams every entry of this delta's ordered index, in index order, as decomposed
 * (prefix, key) pointers. Entries of the merging delta are not included.
 *
 * @param keyPointerStream receiver of the decomposed key and pointer fields
 * @return the result of {@code WALKey.decompose}
 * @throws Exception propagated from the streams
 */
boolean keys(WALKeyPointerStream keyPointerStream) throws Exception {
    return WALKey.decompose(
        rawEntryStream -> {
            Iterator<Map.Entry<byte[], WALPointer>> entries = orderedIndex.entrySet().iterator();
            while (entries.hasNext()) {
                Map.Entry<byte[], WALPointer> indexed = entries.next();
                WALPointer pointer = indexed.getValue();
                boolean keepGoing = rawEntryStream.stream(-1,
                    pointer.getFp(),
                    null,
                    indexed.getKey(),
                    pointer.getHasValue(),
                    pointer.getValue(),
                    pointer.getTimestampId(),
                    pointer.getTombstoned(),
                    pointer.getVersion(),
                    null);
                if (!keepGoing) {
                    return false;
                }
            }
            return true;
        },
        (txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, entry)
            -> keyPointerStream.stream(prefix, key, valueTimestamp, valueTombstoned, valueVersion, fp, hasValue, value));
}
/**
 * Builds an iterator over the entries between two composed keys, drawing from this delta and,
 * when one is attached, from the merging delta.
 * <p>
 * NOTE(review): the merging delta is acquired here and not released in this method — confirm
 * that the returned iterator (or its consumer) performs the matching release.
 *
 * @param fromPrefix prefix of the start key
 * @param fromKey start key, or {@code null} for an open start
 * @param toPrefix prefix of the end key
 * @param toKey end key, or {@code null} for an open end
 * @param hydrateValues passed through to the iterator
 * @return iterator combining this delta's range with the merging delta's range
 */
DeltaPeekableElmoIterator rangeScanIterator(byte[] fromPrefix, byte[] fromKey, byte[] toPrefix, byte[] toKey, boolean hydrateValues) {
    byte[] start = (fromKey == null) ? null : WALKey.compose(fromPrefix, fromKey);
    byte[] end = (toKey == null) ? null : WALKey.compose(toPrefix, toKey);
    Iterator<Map.Entry<byte[], WALPointer>> ownEntries = subMap(orderedIndex, start, end).entrySet().iterator();

    Iterator<Map.Entry<byte[], WALPointer>> fallbackEntries;
    DeltaWAL fallbackWAL;
    PartitionDelta fallback = acquireMerging();
    if (fallback == null) {
        fallbackEntries = Iterators.emptyIterator();
        fallbackWAL = null;
    } else {
        fallbackEntries = subMap(fallback.orderedIndex, start, end).entrySet().iterator();
        fallbackWAL = fallback.deltaWAL;
    }
    return new DeltaPeekableElmoIterator(ownEntries, fallbackEntries, deltaWAL, fallbackWAL, hydrateValues);
}
/**
 * Returns the view of {@code index} bounded by {@code from} (inclusive) and {@code to} (exclusive).
 * <ul>
 * <li>Both bounds set and {@code from <= to}: ascending view of {@code [from, to)}.</li>
 * <li>Both bounds set and {@code from > to}: descending view that starts at {@code from}
 * (inclusive) and stops before {@code to} (exclusive).</li>
 * <li>Only {@code from}: everything from {@code from} (inclusive) onward.</li>
 * <li>Only {@code to}: everything before {@code to} (exclusive).</li>
 * <li>Neither: the whole index.</li>
 * </ul>
 *
 * @param index ordered index to slice
 * @param from composed start key, or {@code null}
 * @param to composed end key, or {@code null}
 * @return a live view of the requested range
 */
private static ConcurrentNavigableMap<byte[], WALPointer> subMap(ConcurrentSkipListMap<byte[], WALPointer> index, byte[] from, byte[] to) {
    if (from != null && to != null) {
        if (KeyUtil.compare(from, to) <= 0) {
            return index.subMap(from, to);
        } else {
            // subMap rejects a low bound greater than the high bound with IllegalArgumentException,
            // so the bounds are passed in ascending order and the view is reversed afterwards.
            // Inclusivity mirrors the ascending case: start (from) inclusive, end (to) exclusive.
            return index.subMap(to, false, from, true).descendingMap();
        }
    } else if (from != null) {
        return index.tailMap(from, true);
    } else if (to != null) {
        return index.headMap(to, false);
    } else {
        return index;
    }
}
/**
 * Builds an iterator over all entries of this delta and, when one is attached, of the merging delta.
 * <p>
 * NOTE(review): the merging delta is acquired here and not released in this method — confirm
 * that the returned iterator (or its consumer) performs the matching release.
 *
 * @param hydrateValues passed through to the iterator
 * @return iterator combining this delta's entries with the merging delta's entries
 */
DeltaPeekableElmoIterator rowScanIterator(boolean hydrateValues) {
    Iterator<Map.Entry<byte[], WALPointer>> ownEntries = orderedIndex.entrySet().iterator();

    Iterator<Map.Entry<byte[], WALPointer>> fallbackEntries;
    DeltaWAL fallbackWAL;
    PartitionDelta fallback = acquireMerging();
    if (fallback == null) {
        fallbackEntries = Iterators.emptyIterator();
        fallbackWAL = null;
    } else {
        fallbackEntries = fallback.orderedIndex.entrySet().iterator();
        fallbackWAL = fallback.deltaWAL;
    }
    return new DeltaPeekableElmoIterator(ownEntries, fallbackEntries, deltaWAL, fallbackWAL, hydrateValues);
}
/**
 * @return the txId of the last transaction recorded in this delta; when this delta has none,
 *         the merging delta's highest txId; {@code -1} when neither has any
 */
long highestTxId() {
    if (!txIdWAL.isEmpty()) {
        return txIdWAL.last().txId;
    }
    PartitionDelta fallback = acquireMerging();
    if (fallback == null) {
        return -1;
    }
    try {
        return fallback.highestTxId();
    } finally {
        releaseMerging(fallback);
    }
}
/*long highestTxId(byte[] prefix) {
AppendOnlyConcurrentArrayList prefixTxFps = prefixTxFpIndex.get(new WALPrefix(prefix));
if (prefixTxFps == null || prefixTxFps.isEmpty()) {
PartitionDelta partitionDelta = merging.get();
return (partitionDelta != null) ? partitionDelta.highestTxId(prefix) : -1;
}
return prefixTxFps.last().txId;
}*/
/**
 * @return the merging delta's lowest txId when it has a non-negative one; otherwise the txId of
 *         the first transaction recorded in this delta, or {@code -1} when there is none
 */
long lowestTxId() {
    PartitionDelta fallback = acquireMerging();
    if (fallback != null) {
        try {
            long fromMerging = fallback.lowestTxId();
            if (fromMerging >= 0) {
                return fromMerging;
            }
        } finally {
            releaseMerging(fallback);
        }
    }
    return txIdWAL.isEmpty() ? -1 : txIdWAL.first().txId;
}
/**
 * Prefix-scoped variant of {@link #lowestTxId()}.
 *
 * @param prefix prefix whose transactions are considered
 * @return the merging delta's lowest txId for the prefix when it has a non-negative one; otherwise
 *         the txId of the first transaction recorded for the prefix in this delta, or {@code -1}
 *         when there is none
 */
long lowestTxId(byte[] prefix) {
    PartitionDelta fallback = acquireMerging();
    if (fallback != null) {
        try {
            long fromMerging = fallback.lowestTxId(prefix);
            if (fromMerging >= 0) {
                return fromMerging;
            }
        } finally {
            releaseMerging(fallback);
        }
    }
    AppendOnlyConcurrentArrayList forPrefix = prefixTxFpIndex.get(new WALPrefix(prefix));
    boolean nothingRecorded = forPrefix == null || forPrefix.isEmpty();
    if (nothingRecorded) {
        return -1;
    }
    return forPrefix.first().txId;
}
/**
 * Load-time registration of a single row fp under its txId, in the global list and, when a prefix
 * is given, in that prefix's list. Consecutive rows with the same txId extend the tail entry
 * instead of adding a new one.
 *
 * @param prefix row prefix, may be {@code null}
 * @param rowTxId transaction id of the row
 * @param rowFP file pointer of the row
 */
void onLoadAppendTxFp(byte[] prefix, long rowTxId, long rowFP) {
    appendOnLoad(txIdWAL, prefix, rowTxId, rowFP);
    if (prefix == null) {
        return;
    }
    AppendOnlyConcurrentArrayList forPrefix = prefixTxFpIndex.computeIfAbsent(new WALPrefix(prefix),
        walPrefix -> new AppendOnlyConcurrentArrayList(8));
    appendOnLoad(forPrefix, prefix, rowTxId, rowFP);
}

// Starts a new TxFps entry when the list is empty or its tail has a different txId; otherwise grows the tail's fps.
private static void appendOnLoad(AppendOnlyConcurrentArrayList list, byte[] prefix, long rowTxId, long rowFP) {
    boolean startsNewTx = list.isEmpty() || list.last().txId != rowTxId;
    if (startsNewTx) {
        list.add(new TxFps(prefix, rowTxId, new long[] { rowFP }));
    } else {
        list.onLoadAddFpToTail(rowFP);
    }
}
/**
 * Registers all row fps of one transaction. The same {@code TxFps} instance is added to the
 * prefix list (when a prefix is given) and then to the global list, and the pending-update
 * counter grows by the number of fps.
 *
 * @param prefix transaction prefix, may be {@code null}
 * @param rowTxId transaction id
 * @param rowFPs file pointers of the transaction's rows
 */
void appendTxFps(byte[] prefix, long rowTxId, long[] rowFPs) {
    TxFps entry = new TxFps(prefix, rowTxId, rowFPs);
    if (prefix != null) {
        prefixTxFpIndex
            .computeIfAbsent(new WALPrefix(prefix), walPrefix -> new AppendOnlyConcurrentArrayList(8))
            .add(entry);
    }
    txIdWAL.add(entry);
    updatesSinceLastHighwaterFlush.addAndGet(rowFPs.length);
}
/**
 * Streams rows of transactions after {@code transactionId}: first from the merging delta (when
 * attached), then from this delta.
 *
 * @param ioStats passed through to the delta WAL
 * @param transactionId transaction id to start after (exclusive)
 * @param rowStream receiver of the rows
 * @return {@code false} when the merging delta's take returned {@code false}; {@code true} when this
 *         delta has nothing at or beyond {@code transactionId}; otherwise the result of
 *         {@code deltaWAL.takeRows}
 * @throws Exception propagated from the WAL or the stream
 */
public boolean takeRowsFromTransactionId(IoStats ioStats, long transactionId, RowStream rowStream) throws Exception {
    PartitionDelta fallback = acquireMerging();
    if (fallback != null) {
        boolean keepGoing;
        try {
            keepGoing = fallback.takeRowsFromTransactionId(ioStats, transactionId, rowStream);
        } finally {
            releaseMerging(fallback);
        }
        if (!keepGoing) {
            return false;
        }
    }
    boolean nothingToTake = txIdWAL.isEmpty() || txIdWAL.last().txId < transactionId;
    if (nothingToTake) {
        return true;
    }
    return deltaWAL.takeRows(ioStats, txFpsStream -> txIdWAL.streamFromTxId(transactionId, false, txFpsStream), rowStream);
}
/**
 * Prefix-scoped variant: streams rows of the prefix's transactions after {@code transactionId},
 * first from the merging delta (when attached), then from this delta.
 *
 * @param ioStats passed through to the delta WAL
 * @param prefix prefix whose transactions are taken
 * @param transactionId transaction id to start after (exclusive)
 * @param rowStream receiver of the rows
 * @return {@code false} when the merging delta's take returned {@code false}; {@code true} when this
 *         delta has nothing for the prefix at or beyond {@code transactionId}; otherwise the result
 *         of {@code deltaWAL.takeRows}
 * @throws Exception propagated from the WAL or the stream
 */
public boolean takeRowsFromTransactionId(IoStats ioStats, byte[] prefix, long transactionId, RowStream rowStream) throws Exception {
    PartitionDelta fallback = acquireMerging();
    if (fallback != null) {
        boolean keepGoing;
        try {
            keepGoing = fallback.takeRowsFromTransactionId(ioStats, prefix, transactionId, rowStream);
        } finally {
            releaseMerging(fallback);
        }
        if (!keepGoing) {
            return false;
        }
    }
    AppendOnlyConcurrentArrayList forPrefix = prefixTxFpIndex.get(new WALPrefix(prefix));
    boolean nothingToTake = forPrefix == null || forPrefix.isEmpty() || forPrefix.last().txId < transactionId;
    if (nothingToTake) {
        return true;
    }
    return deltaWAL.takeRows(ioStats, txFpsStream -> forPrefix.streamFromTxId(transactionId, false, txFpsStream), rowStream);
}
/**
 * Immutable summary returned by {@code merge}.
 */
public static class MergeResult {

    /** Store the delta was merged into; {@code null} when nothing was merged. */
    public final PartitionStore partitionStore;
    /** Partition the result belongs to. */
    public final VersionedPartitionName versionedPartitionName;
    /** Index returned by committing the merge; {@code null} when nothing was merged. */
    public final WALIndex walIndex;
    /** Number of entries in the merged delta. */
    public final long count;
    /** Highest txId of the merged delta. */
    public final long lastTxId;

    public MergeResult(PartitionStore partitionStore,
        VersionedPartitionName versionedPartitionName,
        WALIndex walIndex,
        long count,
        long lastTxId) {
        this.lastTxId = lastTxId;
        this.count = count;
        this.walIndex = walIndex;
        this.versionedPartitionName = versionedPartitionName;
        this.partitionStore = partitionStore;
    }
}
/**
 * Merges the attached merging delta (if any) into its partition store and then detaches it.
 * <p>
 * When the merging delta has recorded transactions, its rows are streamed transaction by
 * transaction, starting (inclusively) at the store's current highest txId, into
 * {@code PartitionStore.merge}. After streaming, an end-of-merge marker is written, the index is
 * committed and the local highwater is set to the merged delta's highest txId. The merging
 * reference is cleared afterwards whether or not anything was merged (but not when an exception
 * propagates).
 *
 * @param ioStats passed through to the WAL operations
 * @param highwaterStorage receives the local highwater for the merged partition
 * @param partitionIndex used to resolve the target partition store
 * @param properties partition properties passed to the store lookup and the merge
 * @param stripe stripe passed to the store lookup
 * @param validate when {@code true} the store is resolved via {@code getAndValidate} using the merging delta's WAL ids
 * @return a result carrying the store, committed index, merged entry count and last txId; store and
 *         index are {@code null} and the counts are 0 when nothing was merged
 * @throws Exception propagated from the store, WAL or streams; a {@code RuntimeException} is thrown
 *         when a hydrated row's key is absent from the merging delta's index
 */
MergeResult merge(IoStats ioStats,
HighwaterStorage highwaterStorage,
PartitionIndex partitionIndex,
PartitionProperties properties,
int stripe,
boolean validate) throws Exception {
long merged = 0;
long lastTxId = 0;
WALIndex walIndex = null;
PartitionStore partitionStore = null;
PartitionDelta merge = acquireMerging();
if (merge != null) {
try {
// Only touch the store when the merging delta actually recorded transactions.
if (!merge.txIdWAL.isEmpty()) {
merged = merge.size();
lastTxId = merge.highestTxId();
if (validate) {
partitionStore = partitionIndex.getAndValidate("merge",
merge.getDeltaWALId(),
merge.getPrevDeltaWALId(),
merge.versionedPartitionName,
properties,
stripe);
} else {
partitionStore = partitionIndex.get("merge", merge.versionedPartitionName, properties, stripe);
}
long highestTxId = partitionStore.highestTxId();
LOG.info("Merging ({}) deltas for partition: {} from tx: {}", merge.pointerIndex.size(), merge.versionedPartitionName, highestTxId);
LOG.debug("Merging keys: {}", merge.orderedIndex.keySet());
// Set when the store's scan callback asks to stop; ends the transaction stream below.
MutableBoolean eos = new MutableBoolean(false);
// Effectively-final alias so the lambda below can capture the store.
PartitionStore mergeToStore = partitionStore;
// Replay transactions starting at the store's highest txId (inclusive).
merge.txIdWAL.streamFromTxId(highestTxId, true, txFps -> {
long txId = txFps.txId;
mergeToStore.merge(false,
properties,
txId,
txFps.prefix,
(highwaters, scan) -> WALKey.decompose(
txFpRawKeyValueStream -> merge.deltaWAL.hydrateKeyValueHighwaters(ioStats,
fpStream -> {
// Feed every fp of this transaction to the WAL for hydration.
for (long fp : txFps.fps) {
if (!fpStream.stream(fp)) {
return false;
}
}
return true;
},
(fp, rowType, prefix, key, value, valueTimestamp, valueTombstone, valueVersion, highwater) -> {
// prefix is the partitionName and is discarded
WALPointer pointer = merge.orderedIndex.get(key);
if (pointer == null) {
throw new RuntimeException("Delta WAL missing"
+ " prefix: " + Arrays.toString(prefix)
+ " key: " + Arrays.toString(key)
+ " for: " + versionedPartitionName);
}
// Forward the row only when it is the one the index currently points at for this key;
// rows whose fp differs from the indexed pointer are skipped.
if (pointer.getFp() == fp) {
if (!txFpRawKeyValueStream.stream(txId, fp, rowType, key, true, value, valueTimestamp, valueTombstone, valueVersion,
null)) {
return false;
}
if (highwater != null) {
highwaters.highwater(highwater);
}
}
return true;
}),
(_txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, row) -> {
if (!scan.row(txId, key, value, valueTimestamp, valueTombstoned, valueVersion)) {
eos.setValue(true);
return false;
}
return true;
}));
return !eos.booleanValue();
});
// Finalize: marker, index commit, then local highwater.
partitionStore.getWalStorage().endOfMergeMarker(ioStats, merge.getDeltaWALId(), lastTxId);
walIndex = partitionStore.getWalStorage().commitIndex(true, lastTxId);
highwaterStorage.setLocal(merge.versionedPartitionName, lastTxId);
LOG.info("Merged deltas for {}", merge.versionedPartitionName);
}
} finally {
releaseMerging(merge);
}
}
// Detach the merging delta under the same monitor acquireMerging() uses.
synchronized (mergingDelta) {
mergingDelta.set(null);
}
return new MergeResult(partitionStore, versionedPartitionName, walIndex, merged, lastTxId);
}
// Orders TxFps by ascending txId; used for the binary search in AppendOnlyConcurrentArrayList.
private static final Comparator<TxFps> txFpsComparator = (left, right) -> Long.compare(left.txId, right.txId);
/**
 * Growable array of {@code TxFps} that only ever appends, and only in strictly increasing txId
 * order. Appends and length/array snapshots are taken under this object's monitor.
 */
private static class AppendOnlyConcurrentArrayList {

    private volatile TxFps[] array;
    private volatile int length;

    /**
     * @param initialCapacity initial backing array size; values below 1 are raised to 1
     */
    public AppendOnlyConcurrentArrayList(int initialCapacity) {
        int capacity = Math.max(initialCapacity, 1);
        this.array = new TxFps[capacity];
    }

    /**
     * Appends one fp to the tail entry's fps. Takes no lock and requires a non-empty list;
     * the name suggests it is meant for load time only.
     */
    public void onLoadAddFpToTail(long fp) {
        TxFps tail = array[length - 1];
        long[] grown = Arrays.copyOf(tail.fps, tail.fps.length + 1);
        grown[grown.length - 1] = fp;
        tail.fps = grown;
    }

    /**
     * Appends an entry, doubling the backing array when it is full.
     *
     * @throws IllegalStateException when the entry's txId is not greater than the current tail's
     */
    public synchronized void add(TxFps txFps) {
        if (length > 0) {
            long tailTxId = array[length - 1].txId;
            if (txFps.txId <= tailTxId) {
                throw new IllegalStateException("Appending txIds out of order: " + txFps.txId + " <= " + tailTxId);
            }
        }
        if (length == array.length) {
            array = Arrays.copyOf(array, array.length * 2);
        }
        array[length] = txFps;
        length++;
    }

    /**
     * Streams entries starting at {@code txId} (or just after it when not inclusive), then keeps
     * picking up entries appended while streaming until no new ones are found.
     *
     * @return {@code false} as soon as the stream returns {@code false}, otherwise {@code true}
     */
    public boolean streamFromTxId(long txId, boolean inclusive, TxFpsStream txFpsStream) throws Exception {
        TxFps[] snapshot;
        int snapshotLength;
        synchronized (this) {
            snapshot = this.array;
            snapshotLength = this.length;
        }
        int found = Arrays.binarySearch(snapshot, 0, snapshotLength, new TxFps(null, txId, null), txFpsComparator);
        int cursor;
        if (found < 0) {
            // Not present: start at the insertion point.
            cursor = -(found + 1);
        } else {
            cursor = inclusive ? found : found + 1;
        }
        for (;;) {
            while (cursor < snapshotLength) {
                if (!txFpsStream.stream(snapshot[cursor])) {
                    return false;
                }
                cursor++;
            }
            int latestLength;
            synchronized (this) {
                latestLength = this.length;
                snapshot = this.array;
            }
            if (latestLength == snapshotLength) {
                return true;
            }
            // New entries arrived while streaming: continue from where the last pass stopped.
            snapshotLength = latestLength;
        }
    }

    public synchronized boolean isEmpty() {
        return length == 0;
    }

    /**
     * @return the first slot of the backing array ({@code null} when nothing has been added)
     */
    public TxFps first() {
        TxFps[] current = array;
        return current[0];
    }

    /**
     * @return the most recently appended entry; the list must not be empty
     */
    public TxFps last() {
        synchronized (this) {
            return array[length - 1];
        }
    }
}
}