package com.jivesoftware.os.amza.lab.pointers;
import com.jivesoftware.os.amza.api.CompareTimestampVersions;
import com.jivesoftware.os.amza.api.filer.UIO;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.scan.CompactionWALIndex;
import com.jivesoftware.os.amza.api.stream.KeyContainedStream;
import com.jivesoftware.os.amza.api.stream.KeyValuePointerStream;
import com.jivesoftware.os.amza.api.stream.KeyValues;
import com.jivesoftware.os.amza.api.stream.MergeTxKeyPointerStream;
import com.jivesoftware.os.amza.api.stream.TxFpStream;
import com.jivesoftware.os.amza.api.stream.TxKeyPointers;
import com.jivesoftware.os.amza.api.stream.UnprefixedWALKeys;
import com.jivesoftware.os.amza.api.stream.WALKeyPointerStream;
import com.jivesoftware.os.amza.api.stream.WALKeyPointers;
import com.jivesoftware.os.amza.api.stream.WALMergeKeyPointerStream;
import com.jivesoftware.os.amza.api.wal.WALIndex;
import com.jivesoftware.os.amza.api.wal.WALKey;
import com.jivesoftware.os.amza.lab.pointers.LABPointerIndexWALIndexName.Type;
import com.jivesoftware.os.lab.LABEnvironment;
import com.jivesoftware.os.lab.api.MemoryRawEntryFormat;
import com.jivesoftware.os.lab.api.NoOpFormatTransformerProvider;
import com.jivesoftware.os.lab.api.ValueIndex;
import com.jivesoftware.os.lab.api.ValueIndexConfig;
import com.jivesoftware.os.lab.api.rawhide.LABRawhide;
import com.jivesoftware.os.lab.guts.IndexUtil;
import com.jivesoftware.os.lab.guts.LABHashIndexType;
import com.jivesoftware.os.lab.io.BolBuffer;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
/**
* @author jonathan.colt
*/
public class LABPointerIndexWALIndex implements WALIndex {
private static final MetricLogger LOG = MetricLoggerFactory.getLogger();
/** Total number of permits in {@link #lock}; holding all of them excludes every other lock holder. */
private static final int numPermits = 1024;
private final String providerName;
/** Values no longer than this are stored inline in the index payload; a negative value disables inlining. */
private final int maxValueSizeInIndex;
private final VersionedPartitionName versionedPartitionName;
private final LABPointerIndexWALIndexName name;
private final LABPointerIndexConfig config;
/** One environment per stripe; indexed by stripe number. */
private final LABEnvironment[] environments;
/** Stripe whose environment currently hosts the indexes; updated when a compaction commits. */
private volatile int currentStripe;
// Both indexes are lazily opened by init() and read there without holding a monitor,
// so they must be volatile for the unsynchronized null checks to be a safe publication.
private volatile ValueIndex<byte[]> primaryDb;
private volatile ValueIndex<byte[]> prefixDb;
/** Set once close() has run; also used as the monitor guarding (re)assignment of the indexes. */
private final AtomicBoolean closed = new AtomicBoolean(false);
/** Fair semaphore: one permit for regular operations, all {@link #numPermits} for exclusive ones. */
private final Semaphore lock = new Semaphore(numPermits, true);
/** Reset to -1 on every commit. */
private final AtomicLong count = new AtomicLong(-1);
/** Number of commits performed; also used as the monitor while {@link #count} is reset. */
private final AtomicInteger commits = new AtomicInteger(0);
/** The in-flight compaction target, or null when no compaction is underway. */
private final AtomicReference<WALIndex> compactingTo = new AtomicReference<>();
/**
 * Creates an index handle. No index is opened here; the underlying indexes are opened
 * lazily on first use.
 *
 * @param providerName name reported by {@link #getProviderName()}
 * @param maxValueSizeInIndex largest value length stored inline in the payload; negative disables inlining
 * @param versionedPartitionName partition this index belongs to
 * @param environments one environment per stripe
 * @param currentStripe stripe whose environment hosts this index
 * @param name naming scheme used to derive the primary and prefix index names
 * @param config tuning parameters applied when the indexes are opened
 */
public LABPointerIndexWALIndex(String providerName,
    int maxValueSizeInIndex,
    VersionedPartitionName versionedPartitionName,
    LABEnvironment[] environments,
    int currentStripe,
    LABPointerIndexWALIndexName name,
    LABPointerIndexConfig config) throws Exception {

    // identity
    this.providerName = providerName;
    this.versionedPartitionName = versionedPartitionName;
    this.name = name;

    // storage location
    this.environments = environments;
    this.currentStripe = currentStripe;

    // tuning
    this.config = config;
    this.maxValueSizeInIndex = maxValueSizeInIndex;
}
/**
 * Lazily opens the primary and prefix indexes on the current stripe.
 * Does nothing once the index has been closed.
 *
 * @throws Exception if either index cannot be opened
 */
private void init() throws Exception {
    // Fast path: only skip the monitor when BOTH indexes are present. Checking primaryDb alone
    // would let a concurrent caller proceed while prefixDb is still being opened (and null).
    // NOTE(review): the unsynchronized read is only a fully safe publication if both fields are volatile.
    if (primaryDb != null && prefixDb != null) {
        return;
    }
    synchronized (closed) {
        if (closed.get()) {
            return;
        }
        // Each index is opened independently so that a failure while opening the prefix index
        // is retried on the next call instead of leaving the instance permanently half-initialised.
        if (primaryDb == null) {
            primaryDb = environments[currentStripe].open(new ValueIndexConfig(name.getPrimaryName(),
                config.getEntriesBetweenLeaps(),
                config.getMaxHeapPressureInBytes(),
                config.getSplitWhenKeysTotalExceedsNBytes(),
                config.getSplitWhenValuesTotalExceedsNBytes(),
                config.getSplitWhenValuesAndKeysTotalExceedsNBytes(),
                NoOpFormatTransformerProvider.NAME,
                LABRawhide.NAME,
                MemoryRawEntryFormat.NAME,
                -1,
                LABHashIndexType.valueOf(config.getHashIndexType()),
                config.getHashIndexLoadFactor(),
                config.getHashIndexEnabled()));
        }
        if (prefixDb == null) {
            prefixDb = environments[currentStripe].open(new ValueIndexConfig(name.getPrefixName(),
                config.getEntriesBetweenLeaps(),
                config.getMaxHeapPressureInBytes(),
                config.getSplitWhenKeysTotalExceedsNBytes(),
                config.getSplitWhenValuesTotalExceedsNBytes(),
                config.getSplitWhenValuesAndKeysTotalExceedsNBytes(),
                NoOpFormatTransformerProvider.NAME,
                LABRawhide.NAME,
                MemoryRawEntryFormat.NAME,
                -1,
                LABHashIndexType.valueOf(config.getHashIndexType()),
                config.getHashIndexLoadFactor(),
                config.getHashIndexEnabled()));
        }
    }
}
/**
 * @return the stripe whose environment currently hosts this index
 */
@Override
public int getStripe() {
    return this.currentStripe;
}
/**
 * @return the provider name supplied at construction
 */
@Override
public String getProviderName() {
    return this.providerName;
}
/**
 * @return the versioned partition name supplied at construction
 */
public VersionedPartitionName getVersionedPartitionName() {
    return this.versionedPartitionName;
}
/**
 * Closes this index, closes any in-flight compaction target, and removes the databases
 * of every {@link Type} from the current stripe. Runs while holding all lock permits.
 *
 * @throws Exception if closing or removal fails
 */
@Override
public void delete() throws Exception {
    close();
    lock.acquire(numPermits);
    try {
        synchronized (compactingTo) {
            WALIndex inFlightCompaction = compactingTo.get();
            if (inFlightCompaction != null) {
                inFlightCompaction.close();
            }
            Type[] allTypes = Type.values();
            for (int i = 0; i < allTypes.length; i++) {
                removeDatabase(currentStripe, allTypes[i]);
            }
        }
    } finally {
        lock.release(numPermits);
    }
}
/**
 * Merges the supplied pointers into the index. For each pointer the current entry for its
 * composed key is looked up; the pointer is written when there is no existing payload
 * ({@code added}) or when the existing timestamp/version compares lower ({@code clobbered}),
 * and skipped otherwise ({@code ignored}). Written pointers that carry a prefix are also
 * recorded in the prefix index under {@code prefix + txId + fp}.
 *
 * @param pointers source of pointers to merge
 * @param stream optional callback told the outcome for every pointer; may be null
 * @return the result of consuming {@code pointers}
 * @throws RuntimeException wrapping {@link InterruptedException} if interrupted while waiting for the lock
 * @throws Exception if the underlying index or a callback fails
 */
@Override
public boolean merge(TxKeyPointers pointers, MergeTxKeyPointerStream stream) throws Exception {
    init();
    try {
        lock.acquire();
    } catch (InterruptedException ie) {
        // Restore the interrupt status so code further up the stack can still observe it;
        // wrapping in an unchecked exception would otherwise swallow the interruption.
        Thread.currentThread().interrupt();
        throw new RuntimeException(ie);
    }
    try {
        // Single-element array so the lambda below can record the outcome.
        byte[] mode = new byte[1];
        // Scratch space for txId (bytes 0-7) followed by fp (bytes 8-15).
        byte[] txFpBytes = new byte[16];
        BolBuffer entryBuffer = new BolBuffer();
        BolBuffer keyBuffer = new BolBuffer();
        return pointers.consume((txId, prefix, key, value, timestamp, tombstoned, version, fp) -> {
            byte[] pk = WALKey.compose(prefix, key);
            return primaryDb.get(
                (stream1) -> stream1.key(0, pk, 0, pk.length),
                (index1, key1, timestamp1, tombstoned1, version1, payload) -> {
                    if (payload != null) {
                        int c = CompareTimestampVersions.compare(timestamp1, version1, timestamp, version);
                        mode[0] = (c < 0) ? WALMergeKeyPointerStream.clobbered : WALMergeKeyPointerStream.ignored;
                    } else {
                        mode[0] = WALMergeKeyPointerStream.added;
                    }

                    if (mode[0] != WALMergeKeyPointerStream.ignored) {
                        byte[] mergePayload = toPayload(fp, value);
                        primaryDb.append((pointerStream) -> {
                            return pointerStream.stream(-1, pk, timestamp, tombstoned, version, mergePayload);
                        }, true, entryBuffer, keyBuffer);

                        if (prefix != null) {
                            UIO.longBytes(txId, txFpBytes, 0);
                            UIO.longBytes(fp, txFpBytes, 8);
                            byte[] prefixTxFp = WALKey.compose(prefix, txFpBytes);
                            prefixDb.append((pointerStream) -> {
                                return pointerStream.stream(-1, prefixTxFp, timestamp, tombstoned, version, mergePayload);
                            }, true, entryBuffer, keyBuffer);
                        }
                    }
                    if (stream != null) {
                        return stream.stream(mode[0], txId, prefix, key, timestamp, tombstoned, version, fp);
                    } else {
                        return true;
                    }
                },
                true);
        });
    } finally {
        lock.release();
    }
}
// Leading payload byte marking an inlined value. A payload that instead holds an fp is written
// only for non-negative fp values, which is what lets negative markers be told apart from it.
/** Marker: the value is stored inline and is null. */
private static final byte PAYLOAD_NULL = -1;
/** Marker: the value is stored inline and is non-null (possibly empty). */
private static final byte PAYLOAD_NONNULL = -2;
/**
 * Encodes what is stored as the payload of an index entry.
 * <p>
 * When inlining is enabled ({@code maxValueSizeInIndex >= 0}) and the value fits, the payload
 * is a marker byte followed by the value bytes. Otherwise the payload is the fp encoded as a long.
 *
 * @param fp file pointer of the row; must not be negative
 * @param value the row value, may be null
 * @return the encoded payload
 * @throws IllegalArgumentException if {@code fp} is negative
 */
private byte[] toPayload(long fp, byte[] value) {
    if (fp < 0) {
        throw new IllegalArgumentException("Negative fp " + fp);
    }

    int valueLength = (value == null) ? 0 : value.length;
    boolean storeInline = maxValueSizeInIndex >= 0 && valueLength <= maxValueSizeInIndex;
    if (!storeInline) {
        return UIO.longBytes(fp);
    }

    // leverage the fact that fp cannot be negative by using a negative leading byte
    byte[] payload = new byte[1 + valueLength];
    if (value == null) {
        payload[0] = PAYLOAD_NULL;
    } else {
        payload[0] = PAYLOAD_NONNULL;
        if (valueLength > 0) {
            System.arraycopy(value, 0, payload, 1, valueLength);
        }
    }
    return payload;
}
/**
 * Decodes a prefix-index payload and forwards it to {@code txFpStream}.
 * An inlined value (negative leading marker byte) is streamed with {@code hasValue=true};
 * any other payload is streamed without a value and with {@code hasValue=!hydrateValues}.
 *
 * @param txId transaction id decoded from the entry key
 * @param fp file pointer decoded from the entry key
 * @param payload raw payload, may be null
 * @param txFpStream receiver of the decoded entry
 * @param hydrateValues whether the caller asked for values
 * @return whatever {@code txFpStream} returns
 */
private boolean fromPayload(long txId,
    long fp,
    byte[] payload,
    TxFpStream txFpStream,
    boolean hydrateValues) throws Exception {

    if (payload != null && payload[0] < 0) {
        byte marker = payload[0];
        if (marker == PAYLOAD_NULL) {
            return txFpStream.stream(txId, fp, true, null);
        }
        if (marker == PAYLOAD_NONNULL) {
            // strip the marker byte
            byte[] inlined = Arrays.copyOfRange(payload, 1, payload.length);
            return txFpStream.stream(txId, fp, true, inlined);
        }
    }
    //TODO split hydrateValues=false into its own method
    boolean hasValue = !hydrateValues;
    return txFpStream.stream(txId, fp, hasValue, null);
}
/**
 * Decodes a primary-index payload and forwards the entry to {@code stream}.
 * <ul>
 * <li>null payload: streamed with fp -1, no value, {@code hasValue=!hydrateValues}</li>
 * <li>inlined null / non-null value: streamed with fp -1 and {@code hasValue=true}</li>
 * <li>anything else: the payload is decoded as the fp, streamed with {@code hasValue=false}</li>
 * </ul>
 *
 * @return whatever {@code stream} returns
 */
private boolean fromPayload(byte[] prefix,
    byte[] key,
    long timestamp,
    boolean tombstoned,
    long version,
    byte[] payload,
    WALKeyPointerStream stream,
    boolean hydrateValues) throws Exception {

    if (payload == null) {
        //TODO split hydrateValues=false into its own method
        return stream.stream(prefix, key, timestamp, tombstoned, version, -1, !hydrateValues, null);
    }

    byte marker = payload[0];
    if (marker == PAYLOAD_NULL) {
        return stream.stream(prefix, key, timestamp, tombstoned, version, -1, true, null);
    }
    if (marker == PAYLOAD_NONNULL) {
        // strip the marker byte
        byte[] inlined = Arrays.copyOfRange(payload, 1, payload.length);
        return stream.stream(prefix, key, timestamp, tombstoned, version, -1, true, inlined);
    }

    long fp = UIO.bytesLong(payload);
    return stream.stream(prefix, key, timestamp, tombstoned, version, fp, false, null);
}
/**
 * Decodes a primary-index payload and forwards it, together with the caller-supplied
 * key/value tuple it was looked up for, to {@code stream}.
 * <ul>
 * <li>null payload: streamed with fp -1, no pointer value, {@code hasValue=!hydrateValues}</li>
 * <li>inlined null / non-null value: streamed with fp -1 and {@code hasValue=true}</li>
 * <li>anything else: the payload is decoded as the fp, streamed with {@code hasValue=false}</li>
 * </ul>
 *
 * @return whatever {@code stream} returns
 */
private boolean fromPayload(byte[] prefix,
    byte[] key,
    byte[] value,
    long valueTimestamp,
    boolean valueTombstoned,
    long valueVersion,
    long timestamp,
    boolean tombstoned,
    long version,
    byte[] payload,
    KeyValuePointerStream stream,
    boolean hydrateValues) throws Exception {

    if (payload == null) {
        //TODO split hydrateValues=false into its own method
        return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion,
            timestamp, tombstoned, version, -1, !hydrateValues, null);
    }

    byte marker = payload[0];
    if (marker == PAYLOAD_NULL) {
        return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion,
            timestamp, tombstoned, version, -1, true, null);
    }
    if (marker == PAYLOAD_NONNULL) {
        // strip the marker byte
        byte[] pointerValue = Arrays.copyOfRange(payload, 1, payload.length);
        return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion,
            timestamp, tombstoned, version, -1, true, pointerValue);
    }

    long fp = UIO.bytesLong(payload);
    return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion,
        timestamp, tombstoned, version, fp, false, null);
}
/**
 * Streams the (txId, fp) entries recorded in the prefix index for {@code prefix}.
 * <p>
 * NOTE(review): {@code sinceTransactionId} is not consulted here; the scan always starts at
 * the beginning of the prefix. Confirm whether callers are expected to filter.
 *
 * @param prefix prefix whose updates are requested
 * @param sinceTransactionId currently unused
 * @param txFpStream receiver of each entry; returning false stops the scan
 * @return the result of the underlying range scan
 */
@Override
public boolean takePrefixUpdatesSince(byte[] prefix, long sinceTransactionId, TxFpStream txFpStream) throws Exception {
    init();
    lock.acquire();
    try {
        byte[] lowerInclusive = WALKey.compose(prefix, new byte[0]);
        byte[] upperExclusive = WALKey.prefixUpperExclusive(lowerInclusive);
        BolBuffer upperBound = new BolBuffer(upperExclusive);
        return prefixDb.rangeScan(lowerInclusive, upperExclusive, (index, rawKey, timestamp, tombstoned, version, payload) -> {
            // stop as soon as an entry at or beyond the exclusive upper bound shows up
            if (IndexUtil.compare(rawKey, upperBound) >= 0) {
                return false;
            }
            // the key part is txId (8 bytes) followed by fp (8 bytes)
            BolBuffer txFp = new BolBuffer(WALKey.rawKeyKey(rawKey.copy()));
            long takeTxId = txFp.getLong(0);
            long takeFp = txFp.getLong(8);
            byte[] payloadBytes = (payload == null) ? null : payload.copy();
            return fromPayload(takeTxId, takeFp, payloadBytes, txFpStream, true);
        }, true);
    } finally {
        lock.release();
    }
}
/**
 * Looks up the pointer stored for {@code prefix}/{@code key} and streams it to {@code stream}.
 *
 * @param prefix key prefix, as accepted by {@code WALKey.compose}
 * @param key unprefixed key
 * @param stream receiver of the decoded pointer
 * @return the result of the underlying lookup
 * @throws RuntimeException wrapping {@link InterruptedException} if interrupted while waiting for the lock
 * @throws Exception if the underlying index or the callback fails
 */
@Override
public boolean getPointer(byte[] prefix, byte[] key, WALKeyPointerStream stream) throws Exception {
    init();
    try {
        lock.acquire();
    } catch (InterruptedException ie) {
        // Restore the interrupt status so code further up the stack can still observe it;
        // wrapping in an unchecked exception would otherwise swallow the interruption.
        Thread.currentThread().interrupt();
        throw new RuntimeException(ie);
    }
    try {
        return getPointerInternal(prefix, key, stream);
    } finally {
        lock.release();
    }
}
/**
 * Performs the primary-index lookup for one key and streams the decoded pointer.
 * Does not take the lock itself.
 *
 * @return the result of the underlying lookup
 */
private boolean getPointerInternal(byte[] prefix, byte[] key, WALKeyPointerStream stream) throws Exception {
    byte[] composedKey = WALKey.compose(prefix, key);
    return primaryDb.get(
        (keyStream) -> keyStream.key(0, composedKey, 0, composedKey.length),
        (index, rawKey, timestamp, tombstoned, version, payload) -> {
            byte[] payloadBytes = (payload == null) ? null : payload.copy();
            return fromPayload(prefix, key, timestamp, tombstoned, version, payloadBytes, stream, true);
        },
        true);
}
/**
 * Looks up the pointer for every key supplied by {@code keys} under {@code prefix}
 * and streams each decoded pointer to {@code stream}.
 *
 * @return the result of consuming {@code keys}
 */
@Override
public boolean getPointers(byte[] prefix, UnprefixedWALKeys keys, WALKeyPointerStream stream) throws Exception {
    init();
    lock.acquire();
    try {
        // one primary-index lookup per supplied key
        return keys.consume((key) -> getPointerInternal(prefix, key, stream));
    } finally {
        lock.release();
    }
}
/**
 * For every key/value tuple supplied by {@code keyValues}, looks up the stored pointer and
 * streams it to {@code stream} alongside the tuple it was looked up for.
 *
 * @return the result of consuming {@code keyValues}
 */
@Override
public boolean getPointers(KeyValues keyValues, KeyValuePointerStream stream) throws Exception {
    init();
    lock.acquire();
    try {
        return keyValues.consume((prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
            byte[] composedKey = WALKey.compose(prefix, key);
            return primaryDb.get(
                (keyStream) -> keyStream.key(0, composedKey, 0, composedKey.length),
                (index, rawKey, timestamp, tombstoned, version, payload) -> {
                    byte[] payloadBytes = (payload == null) ? null : payload.copy();
                    return fromPayload(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion,
                        timestamp, tombstoned, version, payloadBytes, stream, true);
                },
                true);
        });
    } finally {
        lock.release();
    }
}
/**
 * Reports, for every key supplied by {@code keys}, whether the index holds a live entry:
 * one that has an fp or an inlined value and is not tombstoned.
 * <p>
 * NOTE(review): whatever {@code stream.stream(...)} returns is discarded and the scan always
 * continues; confirm this is intended.
 *
 * @return the result of consuming {@code keys}
 */
@Override
public boolean containsKeys(byte[] prefix, UnprefixedWALKeys keys, KeyContainedStream stream) throws Exception {
    init();
    lock.acquire();
    try {
        return keys.consume((key) -> getPointerInternal(prefix, key,
            (ignoredPrefix, ignoredKey, timestamp, tombstoned, version, fp, hasValue, value) -> {
                boolean backed = hasValue || fp != -1;
                boolean contained = !tombstoned && backed;
                stream.stream(prefix, key, contained, timestamp, version);
                return true;
            }));
    } finally {
        lock.release();
    }
}
/**
 * @return whether the current stripe's environment reports the primary index as existing
 */
@Override
public boolean exists() throws Exception {
    lock.acquire();
    try {
        LABEnvironment environment = environments[currentStripe];
        return environment.exists(name.getPrimaryName());
    } finally {
        lock.release();
    }
}
/**
 * Computes how much the live-entry count would change if the supplied pointers were applied,
 * by comparing each requested pointer with what the index currently holds for its key.
 *
 * @param keyPointers the pointers about to be applied
 * @return the net change, or -1 if consumption of {@code keyPointers} did not complete
 */
@Override
public long deltaCount(WALKeyPointers keyPointers) throws Exception {
    init();
    lock.acquire();
    try {
        // single-element array so the nested lambda can accumulate
        long[] delta = new long[1];
        boolean completed = keyPointers.consume(
            (prefix, key, requestTimestamp, requestTombstoned, requestVersion, requestFp, requestIndexValue, requestValue)
                -> getPointerInternal(prefix, key, (ignoredPrefix, ignoredKey, indexTimestamp, indexTombstoned, indexVersion, indexFp, indexHasValue, indexValue) -> {
                    // indexFp, indexHasValue, backingHasValue, indexTombstoned, requestTombstoned, delta
                    // -1       false          false            false            false              1
                    // -1       true           true             false            false              0
                    // -1       false          false            false            true               0
                    // -1       true           true             false            true               -1
                    //  1       false          true             false            false              0
                    //  1       false          true             false            true               -1
                    //  1       false          true             true             false              1
                    //  1       false          true             true             true               0
                    boolean backingHasValue = (indexHasValue || indexFp != -1);
                    if (requestTombstoned) {
                        // removing a live entry
                        if (backingHasValue && !indexTombstoned) {
                            delta[0]--;
                        }
                    } else if (backingHasValue == indexTombstoned) {
                        // adding where nothing live exists: either absent, or present but tombstoned
                        delta[0]++;
                    }
                    return true;
                }));
        return completed ? delta[0] : -1;
    } finally {
        lock.release();
    }
}
/**
 * Commits both indexes (when open), then resets the cached count and bumps the commit counter.
 *
 * @param fsync passed through to the underlying index commits
 */
@Override
public void commit(boolean fsync) throws Exception {
    init();
    lock.acquire();
    try {
        // TODO is this the right thing to do?
        ValueIndex<byte[]> primary = primaryDb;
        if (primary != null) {
            primary.commit(fsync, true);
        }
        ValueIndex<byte[]> prefixIndex = prefixDb;
        if (prefixIndex != null) {
            prefixIndex.commit(fsync, true);
        }

        synchronized (commits) {
            count.set(-1);
            commits.incrementAndGet();
        }
    } finally {
        lock.release();
    }
}
/**
 * Closes both indexes (when open), clears the references and marks this index as closed.
 * Runs while holding all lock permits.
 */
@Override
public void close() throws Exception {
    lock.acquire(numPermits);
    try {
        synchronized (closed) {
            ValueIndex<byte[]> primary = primaryDb;
            if (primary != null) {
                primary.close(true, true);
                primaryDb = null;
            }
            ValueIndex<byte[]> prefixIndex = prefixDb;
            if (prefixIndex != null) {
                prefixIndex.close(true, true);
                prefixDb = null;
            }
            closed.set(true);
        }
    } finally {
        lock.release(numPermits);
    }
}
/**
 * Scans every entry of the primary index and streams the decoded pointers.
 *
 * @param stream receiver of each decoded pointer
 * @param hydrateValues passed to the underlying scan and to payload decoding
 * @return the result of the underlying row scan
 */
@Override
public boolean rowScan(final WALKeyPointerStream stream, boolean hydrateValues) throws Exception {
    init();
    lock.acquire();
    try {
        return primaryDb.rowScan(
            (index, rawKey, timestamp, tombstoned, version, payload) -> {
                // split the composed key back into its prefix and key parts
                byte[] composedKey = rawKey.copy();
                byte[] entryPrefix = WALKey.rawKeyPrefix(composedKey);
                byte[] entryKey = WALKey.rawKeyKey(composedKey);
                byte[] payloadBytes = (payload == null) ? null : payload.copy();
                return fromPayload(entryPrefix, entryKey, timestamp, tombstoned, version, payloadBytes, stream, hydrateValues);
            },
            hydrateValues);
    } finally {
        lock.release();
    }
}
/**
 * Scans the primary index between two composed keys and streams the decoded pointers.
 * A null {@code fromKey} or {@code toKey} results in a null bound being handed to the
 * underlying scan.
 *
 * @param stream receiver of each decoded pointer
 * @param hydrateValues passed to the underlying scan and to payload decoding
 * @return the result of the underlying range scan
 */
@Override
public boolean rangeScan(byte[] fromPrefix,
    byte[] fromKey,
    byte[] toPrefix,
    byte[] toKey,
    WALKeyPointerStream stream,
    boolean hydrateValues) throws Exception {

    init();
    lock.acquire();
    try {
        final byte[] lowerBound;
        if (fromKey == null) {
            lowerBound = null;
        } else {
            lowerBound = WALKey.compose(fromPrefix, fromKey);
        }
        final byte[] upperBound;
        if (toKey == null) {
            upperBound = null;
        } else {
            upperBound = WALKey.compose(toPrefix, toKey);
        }
        return primaryDb.rangeScan(lowerBound,
            upperBound,
            (index, rawKey, timestamp, tombstoned, version, payload) -> {
                // split the composed key back into its prefix and key parts
                byte[] composedKey = rawKey.copy();
                byte[] entryPrefix = WALKey.rawKeyPrefix(composedKey);
                byte[] entryKey = WALKey.rawKeyKey(composedKey);
                byte[] payloadBytes = (payload == null) ? null : payload.copy();
                return fromPayload(entryPrefix, entryKey, timestamp, tombstoned, version, payloadBytes, stream, hydrateValues);
            },
            hydrateValues);
    } finally {
        lock.release();
    }
}
/**
 * Begins a compaction of this index.
 * <p>
 * Creates a sibling index of type {@code compacting} on {@code compactionStripe}, registers it
 * in {@code compactingTo}, and returns a handle through which the caller merges pointers into
 * it and finally either commits (swapping it in as the active index) or aborts.
 *
 * @param hasActive on commit, whether the current active databases are renamed to backup (true)
 * or simply removed (false)
 * @param compactionStripe stripe on which the compacted index is built and on which this index
 * lives after a successful commit
 * @return handle used to drive the compaction
 * @throws IllegalStateException if another compaction is registered, or if the indexes are not open
 */
@Override
public CompactionWALIndex startCompaction(boolean hasActive, int compactionStripe) throws Exception {
init();
synchronized (compactingTo) {
WALIndex got = compactingTo.get();
if (got != null) {
throw new IllegalStateException("Tried to compact while another compaction is already underway: " + name);
}
if (primaryDb == null || prefixDb == null) {
throw new IllegalStateException("Tried to compact a index that has been expunged: " + name);
}
// Clear out leftovers: compacting/compacted on the target stripe, backup on the current stripe.
removeDatabase(compactionStripe, Type.compacting);
removeDatabase(compactionStripe, Type.compacted);
removeDatabase(currentStripe, Type.backup);
final LABPointerIndexWALIndex compactingWALIndex = new LABPointerIndexWALIndex(providerName,
maxValueSizeInIndex,
versionedPartitionName,
environments,
compactionStripe,
name.typeName(Type.compacting),
config);
compactingTo.set(compactingWALIndex);
return new CompactionWALIndex() {
// Merges into the compacting index; no per-pointer callback is supplied.
@Override
public boolean merge(TxKeyPointers pointers) throws Exception {
return compactingWALIndex.merge(pointers, null);
}
// Swaps the compacted index in as the active one while holding all permits of the
// enclosing index's lock.
@Override
public void commit(boolean fsync, Callable<Void> commit) throws Exception {
lock.acquire(numPermits);
try {
compactingWALIndex.commit(fsync);
compactingWALIndex.close();
// Only the compaction that is currently registered may commit.
if (!compactingTo.compareAndSet(compactingWALIndex, null)) {
throw new IllegalStateException("Tried to commit a stale compaction index");
}
if (primaryDb == null || prefixDb == null) {
LOG.warn("Was not commited because index has been closed.");
} else {
LOG.debug("Committing before swap: {}", name.getPrimaryName());
// compacting -> compacted; not required, so the result tells whether anything was renamed.
boolean compactedNonEmpty = rename(compactionStripe, Type.compacting, Type.compacted, false);
synchronized (closed) {
// NOTE(review): from here until the indexes are reopened below, primaryDb and prefixDb
// are null; an exception thrown in between leaves them null.
primaryDb.close(true, true);
primaryDb = null;
prefixDb.close(true, true);
prefixDb = null;
if (hasActive) {
// active -> backup on the current stripe; required only if there is a compacted index to swap in.
rename(currentStripe, Type.active, Type.backup, compactedNonEmpty);
} else {
removeDatabase(currentStripe, Type.active);
}
// Caller-supplied step, run after the old active is moved aside and before the new one is moved in.
if (commit != null) {
commit.call();
}
if (compactedNonEmpty) {
// compacted -> active on the compaction stripe; must succeed.
rename(compactionStripe, Type.compacted, Type.active, true);
}
removeDatabase(currentStripe, Type.backup);
// Reopen both indexes, now from the compaction stripe's environment.
primaryDb = environments[compactionStripe].open(new ValueIndexConfig(name.getPrimaryName(),
config.getEntriesBetweenLeaps(),
config.getMaxHeapPressureInBytes(),
config.getSplitWhenKeysTotalExceedsNBytes(),
config.getSplitWhenValuesTotalExceedsNBytes(),
config.getSplitWhenValuesAndKeysTotalExceedsNBytes(),
NoOpFormatTransformerProvider.NAME,
LABRawhide.NAME,
MemoryRawEntryFormat.NAME,
-1,
LABHashIndexType.valueOf(config.getHashIndexType()),
config.getHashIndexLoadFactor(),
config.getHashIndexEnabled()));
prefixDb = environments[compactionStripe].open(new ValueIndexConfig(name.getPrefixName(),
config.getEntriesBetweenLeaps(),
config.getMaxHeapPressureInBytes(),
config.getSplitWhenKeysTotalExceedsNBytes(),
config.getSplitWhenValuesTotalExceedsNBytes(),
config.getSplitWhenValuesAndKeysTotalExceedsNBytes(),
NoOpFormatTransformerProvider.NAME,
LABRawhide.NAME,
MemoryRawEntryFormat.NAME,
-1,
LABHashIndexType.valueOf(config.getHashIndexType()),
config.getHashIndexLoadFactor(),
config.getHashIndexEnabled()));
}
// The index now lives on the compaction stripe.
currentStripe = compactionStripe;
LOG.debug("Committing after swap: {}", name.getPrimaryName());
}
} finally {
lock.release(numPermits);
}
}
// Closes the compacting index and, if it is still the registered compaction, removes its databases.
@Override
public void abort() throws Exception {
compactingWALIndex.close();
if (compactingTo.compareAndSet(compactingWALIndex, null)) {
removeDatabase(compactionStripe, Type.compacting);
}
}
};
}
}
/**
 * Renames the primary and prefix databases of one type to another type on the given stripe.
 *
 * @param stripe stripe whose environment performs the rename
 * @param fromType current type of the databases
 * @param toType type to rename them to
 * @param required whether failing to rename the primary database is an error
 * @return whether the primary database was renamed
 * @throws IOException if the primary was not renamed although it was required, or although
 * the prefix database was renamed
 */
private boolean rename(int stripe, Type fromType, Type toType, boolean required) throws Exception {
    LABEnvironment environment = environments[stripe];
    LABPointerIndexWALIndexName source = name.typeName(fromType);
    LABPointerIndexWALIndexName target = name.typeName(toType);

    boolean primaryRenamed = environment.rename(source.getPrimaryName(), target.getPrimaryName(), true);
    boolean prefixRenamed = environment.rename(source.getPrefixName(), target.getPrefixName(), true);

    if (primaryRenamed) {
        return true;
    }
    if (required || prefixRenamed) {
        throw new IOException("Failed to rename"
            + " from:" + source.getPrimaryName()
            + " to:" + target.getPrimaryName()
            + " required:" + required
            + " prefix:" + prefixRenamed);
    }
    return false;
}
/**
 * Removes the primary and prefix databases of the given type from the given stripe.
 *
 * @param stripe stripe whose environment performs the removal
 * @param type type of the databases to remove
 */
private void removeDatabase(int stripe, Type type) throws Exception {
    LABEnvironment environment = environments[stripe];
    LABPointerIndexWALIndexName typed = name.typeName(type);
    environment.remove(typed.getPrimaryName(), true);
    environment.remove(typed.getPrefixName(), true);
}
/**
 * Commits whichever indexes are currently open. Does not open them if they are not.
 *
 * @param fsync passed through to the underlying index commits
 */
public void flush(boolean fsync) throws Exception {
    lock.acquire();
    try {
        ValueIndex<byte[]> primary = primaryDb;
        if (primary != null) {
            primary.commit(fsync, true);
        }
        ValueIndex<byte[]> prefixIndex = prefixDb;
        if (prefixIndex != null) {
            prefixIndex.commit(fsync, true);
        }
    } finally {
        lock.release();
    }
}
/**
 * Intentionally does nothing: the supplied properties are ignored.
 */
@Override
public void updatedProperties(Map<String, String> properties) {
    // no-op
}
/**
 * @return a diagnostic description listing the name, environments, indexes, lock and counters
 */
@Override
public String toString() {
    StringBuilder description = new StringBuilder("LABPointerIndexWALIndex{");
    description.append("name=").append(name);
    description.append(", environments=").append(Arrays.toString(environments));
    description.append(", primaryDb=").append(primaryDb);
    description.append(", prefixDb=").append(prefixDb);
    description.append(", lock=").append(lock);
    description.append(", count=").append(count);
    description.append(", commits=").append(commits);
    description.append(", compactingTo=").append(compactingTo);
    description.append('}');
    return description.toString();
}
}