package com.jivesoftware.os.amza.berkeleydb; import com.jivesoftware.os.amza.api.CompareTimestampVersions; import com.jivesoftware.os.amza.api.filer.UIO; import com.jivesoftware.os.amza.api.partition.VersionedPartitionName; import com.jivesoftware.os.amza.api.scan.CompactionWALIndex; import com.jivesoftware.os.amza.api.stream.KeyContainedStream; import com.jivesoftware.os.amza.api.stream.KeyValuePointerStream; import com.jivesoftware.os.amza.api.stream.KeyValues; import com.jivesoftware.os.amza.api.stream.MergeTxKeyPointerStream; import com.jivesoftware.os.amza.api.stream.TxFpStream; import com.jivesoftware.os.amza.api.stream.TxKeyPointers; import com.jivesoftware.os.amza.api.stream.UnprefixedWALKeys; import com.jivesoftware.os.amza.api.stream.WALKeyPointerStream; import com.jivesoftware.os.amza.api.stream.WALKeyPointers; import com.jivesoftware.os.amza.api.stream.WALMergeKeyPointerStream; import com.jivesoftware.os.amza.api.wal.KeyUtil; import com.jivesoftware.os.amza.api.wal.WALIndex; import com.jivesoftware.os.amza.api.wal.WALKey; import com.jivesoftware.os.amza.berkeleydb.BerkeleyDBWALIndexName.Type; import com.jivesoftware.os.mlogger.core.MetricLogger; import com.jivesoftware.os.mlogger.core.MetricLoggerFactory; import com.sleepycat.je.Cursor; import com.sleepycat.je.Database; import com.sleepycat.je.DatabaseConfig; import com.sleepycat.je.DatabaseEntry; import com.sleepycat.je.DatabaseNotFoundException; import com.sleepycat.je.DiskOrderedCursor; import com.sleepycat.je.DiskOrderedCursorConfig; import com.sleepycat.je.Environment; import com.sleepycat.je.LockMode; import com.sleepycat.je.OperationStatus; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; /** * @author jonathan.colt */ public class BerkeleyDBWALIndex implements WALIndex { private static final MetricLogger LOG = MetricLoggerFactory.getLogger(); private static final int numPermits = 1024; private final String providerName; private final VersionedPartitionName versionedPartitionName; private final Environment[] environments; private final BerkeleyDBWALIndexName name; private volatile int currentStripe; private final DatabaseConfig primaryDbConfig; private final DatabaseConfig prefixDbConfig; private Database primaryDb; private Database prefixDb; private final Semaphore lock = new Semaphore(numPermits, true); private final AtomicLong count = new AtomicLong(-1); private final AtomicInteger commits = new AtomicInteger(0); private final AtomicReference<WALIndex> compactingTo = new AtomicReference<>(); public BerkeleyDBWALIndex(String providerName, VersionedPartitionName versionedPartitionName, Environment[] environments, BerkeleyDBWALIndexName name, int currentStripe) throws Exception { this.providerName = providerName; this.versionedPartitionName = versionedPartitionName; this.environments = environments; this.name = name; this.currentStripe = currentStripe; // Open the database, creating one if it does not exist this.primaryDbConfig = new DatabaseConfig() .setAllowCreate(true) .setBtreeComparator(KeyUtil.lexicographicalComparator()) .setOverrideBtreeComparator(true); this.primaryDb = environments[currentStripe].openDatabase(null, name.getPrimaryName(), primaryDbConfig); // Open the database, creating one if it does not exist this.prefixDbConfig = new DatabaseConfig() .setAllowCreate(true) .setBtreeComparator(KeyUtil.lexicographicalComparator()) .setOverrideBtreeComparator(true); this.prefixDb = environments[currentStripe].openDatabase(null, name.getPrefixName(), prefixDbConfig); } private void walPointerToEntry(long fp, long timestamp, boolean tombstoned, long version, DatabaseEntry dbValue) { byte[] valueBytes = UIO.longBytes(fp); byte[] entryBytes = new byte[valueBytes.length + 8 + 1 + 8]; System.arraycopy(valueBytes, 0, entryBytes, 0, valueBytes.length); UIO.longBytes(timestamp, entryBytes, valueBytes.length); entryBytes[valueBytes.length + 8] = tombstoned ? (byte) 1 : (byte) 0; UIO.longBytes(version, entryBytes, valueBytes.length + 8 + 1); dbValue.setData(entryBytes); } private boolean entryToWALPointer(byte[] prefix, byte[] key, byte[] entryBytes, WALKeyPointerStream stream, boolean hydrateValues) throws Exception { byte[] valueBytes = new byte[entryBytes.length - 8 - 1 - 8]; System.arraycopy(entryBytes, 0, valueBytes, 0, valueBytes.length); long timestamp = UIO.bytesLong(entryBytes, valueBytes.length); boolean tombstoned = (entryBytes[valueBytes.length + 8] == (byte) 1); long version = UIO.bytesLong(entryBytes, valueBytes.length + 8 + 1); return stream.stream(prefix, key, timestamp, tombstoned, version, UIO.bytesLong(valueBytes), !hydrateValues, null); } private boolean entryToWALPointer(byte[] prefix, byte[] key, byte[] value, long valueTimestamp, boolean valueTombstoned, long valueVersion, byte[] entryBytes, KeyValuePointerStream stream) throws Exception { byte[] valueBytes = new byte[entryBytes.length - 8 - 1 - 8]; System.arraycopy(entryBytes, 0, valueBytes, 0, valueBytes.length); long timestamp = UIO.bytesLong(entryBytes, valueBytes.length); boolean tombstoned = (entryBytes[valueBytes.length + 8] == (byte) 1); long version = UIO.bytesLong(entryBytes, valueBytes.length + 8 + 1); return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, timestamp, tombstoned, version, UIO.bytesLong(valueBytes), false, null); } private long entryToTimestamp(byte[] entryBytes) throws Exception { int valueLength = entryBytes.length - 8 - 1 - 8; return UIO.bytesLong(entryBytes, valueLength); } private long entryToVersion(byte[] entryBytes) throws Exception { int valueLength = entryBytes.length - 8 - 1 - 8; return UIO.bytesLong(entryBytes, valueLength + 1 + 8); } @Override public String getProviderName() { return providerName; } public VersionedPartitionName getVersionedPartitionName() { return versionedPartitionName; } @Override public void delete() throws Exception { close(); lock.acquire(numPermits); try { synchronized (compactingTo) { WALIndex wali = compactingTo.get(); if (wali != null) { wali.close(); } for (Type type : Type.values()) { removeDatabase(currentStripe, type); } } } finally { lock.release(numPermits); } } @Override public boolean merge(TxKeyPointers pointers, MergeTxKeyPointerStream stream) throws Exception { try { lock.acquire(); } catch (InterruptedException ie) { throw new RuntimeException(ie); } try { DatabaseEntry dbKey = new DatabaseEntry(); DatabaseEntry dbValue = new DatabaseEntry(); byte[] txFpBytes = new byte[16]; byte[] emptyValue = new byte[0]; return pointers.consume((txId, prefix, key, value, timestamp, tombstoned, version, fp) -> { byte[] pk = WALKey.compose(prefix, key); dbKey.setData(pk); OperationStatus status = primaryDb.get(null, dbKey, dbValue, LockMode.READ_UNCOMMITTED); byte mode; if (status == OperationStatus.SUCCESS) { int c = CompareTimestampVersions.compare(entryToTimestamp(dbValue.getData()), entryToVersion(dbValue.getData()), timestamp, version); mode = (c < 0) ? WALMergeKeyPointerStream.clobbered : WALMergeKeyPointerStream.ignored; } else { mode = WALMergeKeyPointerStream.added; } if (mode != WALMergeKeyPointerStream.ignored) { walPointerToEntry(fp, timestamp, tombstoned, version, dbValue); primaryDb.put(null, dbKey, dbValue); if (prefix != null) { UIO.longBytes(txId, txFpBytes, 0); UIO.longBytes(fp, txFpBytes, 8); byte[] prefixTxFp = WALKey.compose(prefix, txFpBytes); dbKey.setData(prefixTxFp); dbValue.setData(emptyValue); prefixDb.put(null, dbKey, dbValue); } } if (stream != null) { return stream.stream(mode, txId, prefix, key, timestamp, tombstoned, version, fp); } else { return true; } }); } finally { lock.release(); } } @Override public boolean takePrefixUpdatesSince(byte[] prefix, long sinceTransactionId, TxFpStream txFpStream) throws Exception { lock.acquire(); try (Cursor cursor = prefixDb.openCursor(null, null)) { byte[] fromFpPk = WALKey.compose(prefix, new byte[0]); byte[] toFpPk = WALKey.prefixUpperExclusive(fromFpPk); DatabaseEntry keyEntry = new DatabaseEntry(fromFpPk); DatabaseEntry valueEntry = new DatabaseEntry(); valueEntry.setPartial(true); return WALKey.decompose(txFpRawKeyValueEntryStream -> { if (cursor.getSearchKeyRange(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { do { if (KeyUtil.compare(keyEntry.getData(), toFpPk) >= 0) { return false; } if (!txFpRawKeyValueEntryStream.stream(-1, -1, null, keyEntry.getData(), false, null, -1, false, -1, null)) { return false; } } while (cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS); } return true; }, (txId, fp, rowType, _prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, entry) -> { long takeTxId = UIO.bytesLong(key, 0); long takeFp = UIO.bytesLong(key, 8); return txFpStream.stream(takeTxId, takeFp, hasValue, value); }); } finally { lock.release(); } } @Override public boolean getPointer(byte[] prefix, byte[] key, WALKeyPointerStream stream) throws Exception { try { lock.acquire(); } catch (InterruptedException ie) { throw new RuntimeException(ie); } try { DatabaseEntry dbValue = new DatabaseEntry(); byte[] pk = WALKey.compose(prefix, key); OperationStatus status = primaryDb.get(null, new DatabaseEntry(pk), dbValue, LockMode.READ_UNCOMMITTED); if (status == OperationStatus.SUCCESS) { return entryToWALPointer(prefix, key, dbValue.getData(), stream, true); } else { return stream.stream(prefix, key, -1, false, -1, -1, false, null); } } finally { lock.release(); } } @Override public boolean getPointers(byte[] prefix, UnprefixedWALKeys keys, WALKeyPointerStream stream) throws Exception { lock.acquire(); try { DatabaseEntry dbKey = new DatabaseEntry(); DatabaseEntry dpPointerValue = new DatabaseEntry(); return keys.consume((key) -> { dbKey.setData(WALKey.compose(prefix, key)); OperationStatus status = primaryDb.get(null, dbKey, dpPointerValue, LockMode.READ_UNCOMMITTED); if (status == OperationStatus.SUCCESS) { return entryToWALPointer(prefix, key, dpPointerValue.getData(), stream, true); } else { return stream.stream(prefix, key, -1, false, -1, -1, false, null); } }); } finally { lock.release(); } } @Override public boolean getPointers(KeyValues keyValues, KeyValuePointerStream stream) throws Exception { lock.acquire(); try { DatabaseEntry dbKey = new DatabaseEntry(); DatabaseEntry dpPointerValue = new DatabaseEntry(); return keyValues.consume((prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> { byte[] pk = WALKey.compose(prefix, key); dbKey.setData(pk); OperationStatus status = primaryDb.get(null, dbKey, dpPointerValue, LockMode.READ_UNCOMMITTED); if (status == OperationStatus.SUCCESS) { return entryToWALPointer(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, dpPointerValue.getData(), stream); } else { return stream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion, -1, false, -1, -1, false, null); } }); } finally { lock.release(); } } @Override public boolean containsKeys(byte[] prefix, UnprefixedWALKeys keys, KeyContainedStream stream) throws Exception { lock.acquire(); try { return keys.consume((key) -> getPointer(prefix, key, (_prefix, _key, timestamp, tombstoned, version, fp, hasValue, value) -> { boolean contained = (fp != -1 || hasValue) && !tombstoned; stream.stream(prefix, key, contained, timestamp, version); return true; })); } finally { lock.release(); } } @Override public boolean exists() throws Exception { lock.acquire(); DiskOrderedCursor cursor = null; try { cursor = primaryDb.openCursor(new DiskOrderedCursorConfig().setKeysOnly(true).setQueueSize(1).setLSNBatchSize(1)); DatabaseEntry value = new DatabaseEntry(); value.setPartial(true); return (cursor.getNext(new DatabaseEntry(), value, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS); } finally { lock.release(); if (cursor != null) { cursor.close(); } } } @Override public long deltaCount(WALKeyPointers keyPointers) throws Exception { lock.acquire(); try { long[] delta = new long[1]; boolean completed = keyPointers.consume((prefix, key, requestTimestamp, requestTombstoned, requestVersion, fp, hasValue, value) -> getPointer(prefix, key, (_prefix, _key, indexTimestamp, indexTombstoned, indexVersion, indexFp, indexHasValue, indexValue) -> { // indexFp, indexHasValue, backingHasValue, indexTombstoned, requestTombstoned, delta // -1 false false false false 1 // -1 true true false false 0 // -1 false false false true 0 // -1 true true false true -1 // 1 false true false false 0 // 1 false true false true -1 // 1 false true true false 1 // 1 false true true true 0 boolean backingHasValue = (indexFp != -1 || indexHasValue); if (!requestTombstoned && (!backingHasValue && !indexTombstoned || backingHasValue && indexTombstoned)) { delta[0]++; } else if (backingHasValue && !indexTombstoned && requestTombstoned) { delta[0]--; } return true; })); if (!completed) { return -1; } return delta[0]; } finally { lock.release(); } } // // @Override // public long size() throws Exception { // lock.acquire(); // try { // long size = count.get(); // if (size >= 0) { // return size; // } // int numCommits = commits.get(); // size = primaryDb.count(); // synchronized (commits) { // if (numCommits == commits.get()) { // count.set(size); // } // } // return size; // } finally { // lock.release(); // } // } @Override public void commit(boolean fsync) throws Exception { lock.acquire(); try { environments[currentStripe].flushLog(false); // Hmm synchronized (commits) { count.set(-1); commits.incrementAndGet(); } } finally { lock.release(); } } @Override public void close() throws Exception { lock.acquire(numPermits); try { primaryDb.close(); primaryDb = null; prefixDb.close(); prefixDb = null; } finally { lock.release(numPermits); } } @Override public boolean rowScan(final WALKeyPointerStream stream, boolean hydrateValues) throws Exception { lock.acquire(); try (Cursor cursor = primaryDb.openCursor(null, null)) { DatabaseEntry keyEntry = new DatabaseEntry(); DatabaseEntry valueEntry = new DatabaseEntry(); return WALKey.decompose( (WALKey.TxFpRawKeyValueEntries<byte[]>) txFpRawKeyValueEntryStream -> { while (cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { if (!txFpRawKeyValueEntryStream.stream(-1, -1, null, keyEntry.getData(), false, null, -1, false, -1, valueEntry.getData())) { return false; } } return true; }, (txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, entry) -> { return entryToWALPointer(prefix, key, entry, stream, hydrateValues); }); } finally { lock.release(); } } @Override public boolean rangeScan(byte[] fromPrefix, byte[] fromKey, byte[] toPrefix, byte[] toKey, WALKeyPointerStream stream, boolean hydrateValues) throws Exception { lock.acquire(); try (Cursor cursor = primaryDb.openCursor(null, null)) { byte[] fromPk = fromKey != null ? WALKey.compose(fromPrefix, fromKey) : null; byte[] toPk = toKey != null ? WALKey.compose(toPrefix, toKey) : null; if (fromPk != null && toPk != null && KeyUtil.compare(fromPk, toPk) > 0) { // reverse scan DatabaseEntry keyEntry = new DatabaseEntry(toPk); DatabaseEntry valueEntry = new DatabaseEntry(); return WALKey.decompose( (WALKey.TxFpRawKeyValueEntries<byte[]>) txFpRawKeyValueEntryStream -> { if (cursor.getSearchKeyRange(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { if (cursor.getPrev(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS) { do { byte[] key = keyEntry.getData(); if (KeyUtil.compare(key, fromPk) < 0) { return false; } byte[] entry = valueEntry.getData(); if (!txFpRawKeyValueEntryStream.stream(-1, -1, null, key, false, null, -1, false, -1, entry)) { return false; } } while (cursor.getPrev(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS); } } return true; }, (txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, entry) -> { return entryToWALPointer(prefix, key, entry, stream, hydrateValues); }); } else { DatabaseEntry keyEntry = new DatabaseEntry(fromPk); DatabaseEntry valueEntry = new DatabaseEntry(); return WALKey.decompose( (WALKey.TxFpRawKeyValueEntries<byte[]>) txFpRawKeyValueEntryStream -> { OperationStatus status; if (fromPk == null) { status = cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED); } else { status = cursor.getSearchKeyRange(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED); } if (status == OperationStatus.SUCCESS) { do { if (toPk != null && KeyUtil.compare(keyEntry.getData(), toPk) >= 0) { return false; } if (!txFpRawKeyValueEntryStream.stream(-1, -1, null, keyEntry.getData(), false, null, -1, false, -1, valueEntry.getData())) { return false; } } while (cursor.getNext(keyEntry, valueEntry, LockMode.READ_UNCOMMITTED) == OperationStatus.SUCCESS); } return true; }, (txId, fp, rowType, prefix, key, hasValue, value, valueTimestamp, valueTombstoned, valueVersion, entry) -> { return entryToWALPointer(prefix, key, entry, stream, hydrateValues); }); } } finally { lock.release(); } } @Override public CompactionWALIndex startCompaction(boolean hasActive, int compactionStripe) throws Exception { synchronized (compactingTo) { WALIndex got = compactingTo.get(); if (got != null) { throw new IllegalStateException("Tried to compact while another compaction is already underway: " + name); } if (primaryDb == null || prefixDb == null) { throw new IllegalStateException("Tried to compact a index that has been expunged: " + name); } removeDatabase(compactionStripe, Type.compacting); removeDatabase(compactionStripe, Type.compacted); removeDatabase(currentStripe, Type.backup); final BerkeleyDBWALIndex compactingWALIndex = new BerkeleyDBWALIndex(providerName, versionedPartitionName, environments, name.typeName(Type.compacting), compactionStripe); compactingTo.set(compactingWALIndex); return new CompactionWALIndex() { @Override public boolean merge(TxKeyPointers pointers) throws Exception { return compactingWALIndex.merge(pointers, null); } @Override public void commit(boolean fsync, Callable<Void> commit) throws Exception { lock.acquire(numPermits); try { environments[compactionStripe].flushLog(fsync); compactingWALIndex.close(); if (!compactingTo.compareAndSet(compactingWALIndex, null)) { throw new IllegalStateException("Tried to commit a stale compaction index"); } if (primaryDb == null || prefixDb == null) { LOG.warn("Was not commited because index has been closed."); } else { LOG.debug("Committing before swap: {}", name.getPrimaryName()); renameDatabase(compactionStripe, Type.compacting, Type.compacted); primaryDb.close(); primaryDb = null; prefixDb.close(); prefixDb = null; if (hasActive) { renameDatabase(currentStripe, Type.active, Type.backup); } else { removeDatabase(currentStripe, Type.active); } if (commit != null) { commit.call(); } renameDatabase(compactionStripe, Type.compacted, Type.active); removeDatabase(currentStripe, Type.backup); primaryDb = environments[compactionStripe].openDatabase(null, name.getPrimaryName(), primaryDbConfig); prefixDb = environments[compactionStripe].openDatabase(null, name.getPrefixName(), prefixDbConfig); environments[compactionStripe].flushLog(true); currentStripe = compactionStripe; LOG.debug("Committing after swap: {}", name.getPrimaryName()); } } finally { lock.release(numPermits); } } @Override public void abort() throws Exception { compactingWALIndex.close(); if (compactingTo.compareAndSet(compactingWALIndex, null)) { removeDatabase(compactionStripe, Type.compacting); } } }; } } private void renameDatabase(int stripe, Type fromType, Type toType) { environments[stripe].renameDatabase(null, name.typeName(fromType).getPrimaryName(), name.typeName(toType).getPrimaryName()); environments[stripe].renameDatabase(null, name.typeName(fromType).getPrefixName(), name.typeName(toType).getPrefixName()); } private void removeDatabase(int stripe, Type type) { try { environments[stripe].removeDatabase(null, name.typeName(type).getPrimaryName()); } catch (DatabaseNotFoundException e) { // yummm } try { environments[stripe].removeDatabase(null, name.typeName(type).getPrefixName()); } catch (DatabaseNotFoundException e) { // yummm } } @Override public void updatedProperties(Map<String, String> properties) { } @Override public int getStripe() { return currentStripe; } }