package com.jivesoftware.os.amza.service.replication;

import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.jivesoftware.os.amza.api.IoStats;
import com.jivesoftware.os.amza.api.partition.VersionedAquarium;
import com.jivesoftware.os.amza.api.partition.VersionedPartitionName;
import com.jivesoftware.os.amza.api.scan.RowChanges;
import com.jivesoftware.os.amza.api.scan.RowStream;
import com.jivesoftware.os.amza.api.scan.RowsChanged;
import com.jivesoftware.os.amza.api.stream.Commitable;
import com.jivesoftware.os.amza.api.stream.KeyContainedStream;
import com.jivesoftware.os.amza.api.stream.KeyValueStream;
import com.jivesoftware.os.amza.api.stream.RowType;
import com.jivesoftware.os.amza.api.stream.TxKeyValueStream;
import com.jivesoftware.os.amza.api.stream.TxKeyValueStream.TxResult;
import com.jivesoftware.os.amza.api.stream.UnprefixedWALKeys;
import com.jivesoftware.os.amza.api.take.Highwaters;
import com.jivesoftware.os.amza.api.wal.PrimaryRowMarshaller;
import com.jivesoftware.os.amza.api.wal.WALHighwater;
import com.jivesoftware.os.amza.api.wal.WALUpdated;
import com.jivesoftware.os.amza.service.stats.AmzaStats;
import com.jivesoftware.os.amza.service.storage.HighwaterRowMarshaller;
import com.jivesoftware.os.amza.service.storage.PartitionCreator;
import com.jivesoftware.os.amza.service.storage.PartitionStore;
import com.jivesoftware.os.amza.service.storage.delta.DeltaStripeWALStorage;
import com.jivesoftware.os.amza.service.take.HighwaterStorage;
import com.jivesoftware.os.aquarium.LivelyEndState;
import com.jivesoftware.os.aquarium.State;
import com.jivesoftware.os.mlogger.core.MetricLogger;
import com.jivesoftware.os.mlogger.core.MetricLoggerFactory;
import java.util.Objects;

/**
 * Routes partition-level operations (commit, get, scan, take, count) for one named stripe to its
 * {@link DeltaStripeWALStorage}, resolving the backing {@link PartitionStore} through the
 * {@link PartitionCreator} on every call.
 *
 * <p>Most read paths compare each value's version against the partition's disposal version
 * (see {@link #isDisposed(long, long)}) and hide or skip values that fall below it.
 *
 * <p>Identity ({@link #equals(Object)} / {@link #hashCode()}) is based solely on the stripe name.
 *
 * @author jonathan.colt
 */
public class PartitionStripe {

    private static final MetricLogger LOG = MetricLoggerFactory.getLogger();

    private final AmzaStats amzaStats;
    private final String name;
    private final int stripeIndex;
    private final PartitionCreator partitionCreator;
    private final DeltaStripeWALStorage storage;
    private final RowChanges allRowChanges;
    private final PrimaryRowMarshaller primaryRowMarshaller;
    private final HighwaterRowMarshaller<byte[]> highwaterRowMarshaller;

    /**
     * Creates a stripe. All arguments are stored as-is; nothing is validated here.
     *
     * @param allRowChanges optional listener notified after non-empty commits; may be {@code null}
     */
    public PartitionStripe(AmzaStats amzaStats,
        String name,
        int stripeIndex,
        PartitionCreator partitionCreator,
        DeltaStripeWALStorage storage,
        RowChanges allRowChanges,
        PrimaryRowMarshaller primaryRowMarshaller,
        HighwaterRowMarshaller<byte[]> highwaterRowMarshaller) {
        this.amzaStats = amzaStats;
        this.name = name;
        this.stripeIndex = stripeIndex;
        this.partitionCreator = partitionCreator;
        this.storage = storage;
        this.allRowChanges = allRowChanges;
        this.primaryRowMarshaller = primaryRowMarshaller;
        this.highwaterRowMarshaller = highwaterRowMarshaller;
    }

    /**
     * @return the stripe name supplied at construction
     */
    public String getName() {
        return name;
    }

    /**
     * Delegates to {@code storage.delete} for the given partition.
     */
    void deleteDelta(VersionedPartitionName versionedPartitionName) throws Exception {
        storage.delete(versionedPartitionName);
    }

    /**
     * Reports whether the partition has any highest transaction id other than
     * {@link HighwaterStorage#LOCAL_NONE}.
     *
     * @return {@code true} when the resolved highest txId differs from {@code LOCAL_NONE}
     */
    boolean exists(VersionedPartitionName versionedPartitionName) throws Exception {
        // The callback supplies the partition store's value (or LOCAL_NONE when there is no store).
        // NOTE(review): presumably storage only consults it when it has no answer itself - confirm.
        long highestTxId = storage.getHighestTxId(versionedPartitionName, () -> {
            PartitionStore partitionStore = partitionCreator.get("stripe>exists", versionedPartitionName, stripeIndex);
            return partitionStore == null ? HighwaterStorage.LOCAL_NONE : partitionStore.highestTxId();
        });
        return highestTxId != HighwaterStorage.LOCAL_NONE;
    }

    /**
     * Resolves the highest transaction id for the partition via {@code storage.getHighestTxId},
     * supplying the partition store's value (or {@link HighwaterStorage#LOCAL_NONE} when no store
     * exists) as the callback.
     */
    public long highestTxId(VersionedPartitionName versionedPartitionName) throws Exception {
        return storage.getHighestTxId(versionedPartitionName, () -> {
            PartitionStore partitionStore = partitionCreator.get("stripe>highestTxId", versionedPartitionName, stripeIndex);
            return partitionStore == null ? HighwaterStorage.LOCAL_NONE : partitionStore.highestTxId();
        });
    }

    /**
     * Applies {@code updates} to the partition and notifies {@code allRowChanges} (when configured)
     * if the resulting change set is not empty.
     *
     * @param specificVersion when present, the commit only proceeds if it equals the partition's version
     * @param requiresOnline  when {@code true}, the partition must be online
     * @return the changes reported by storage, or {@code null} when {@code specificVersion} is present
     *         and does not match the partition version
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public RowsChanged commit(HighwaterStorage highwaterStorage,
        VersionedAquarium versionedAquarium,
        boolean directApply,
        Optional<Long> specificVersion,
        boolean requiresOnline,
        byte[] prefix,
        Commitable updates,
        WALUpdated updated) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);

        // A caller pinned to a different partition version gets null rather than an exception.
        if (specificVersion.isPresent() && versionedPartitionName.getPartitionVersion() != specificVersion.get()) {
            return null;
        }

        PartitionStore partitionStore = requirePartitionStore("stripe>commit", versionedPartitionName);
        RowsChanged changes = storage.update(
            amzaStats.updateIoStats,
            directApply,
            partitionStore.getProperties().rowType,
            highwaterStorage,
            versionedPartitionName,
            partitionStore,
            prefix,
            updates,
            updated);
        if (allRowChanges != null && !changes.isEmpty()) {
            allRowChanges.changes(changes);
        }
        return changes;
    }

    /**
     * Delegates to {@code storage.flush}.
     */
    public void flush(boolean fsync) throws Exception {
        storage.flush(fsync);
    }

    /**
     * Looks up a single key and streams the result to {@code keyValueStream}. A value whose version
     * is below the partition's disposal version is streamed as absent
     * ({@code null} value, timestamp {@code -1}, not tombstoned, version {@code -1}).
     *
     * @return whatever {@code storage.get} returns
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public boolean get(VersionedAquarium versionedAquarium,
        byte[] prefix,
        byte[] key,
        boolean requiresOnline,
        KeyValueStream keyValueStream) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);
        PartitionStore partitionStore = requirePartitionStore("stripe>get", versionedPartitionName);

        long start = System.currentTimeMillis();
        long disposalVersion = partitionCreator.getPartitionDisposal(versionedPartitionName.getPartitionName());
        boolean got = storage.get(amzaStats.getIoStats,
            versionedPartitionName,
            partitionStore.getWalStorage(),
            prefix,
            (stream) -> stream.stream(key),
            (prefix1, key1, value, valueTimestamp, valueTombstoned, valueVersion) -> {
                if (isDisposed(valueVersion, disposalVersion)) {
                    return keyValueStream.stream(prefix1, key1, null, -1, false, -1);
                }
                return keyValueStream.stream(prefix1, key1, value, valueTimestamp, valueTombstoned, valueVersion);
            });
        amzaStats.gets(versionedPartitionName.getPartitionName(), 1, System.currentTimeMillis() - start);
        return got;
    }

    /**
     * Looks up the supplied keys and streams each result to {@code stream}. Values whose version is
     * below the partition's disposal version are streamed as absent
     * ({@code null} value, timestamp {@code -1}, not tombstoned, version {@code -1}).
     *
     * @return whatever {@code storage.get} returns
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public boolean get(VersionedAquarium versionedAquarium,
        byte[] prefix,
        boolean requiresOnline,
        UnprefixedWALKeys keys,
        KeyValueStream stream) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);
        PartitionStore partitionStore = requirePartitionStore("stripe>getPrefix", versionedPartitionName);

        long start = System.currentTimeMillis();
        long disposalVersion = partitionCreator.getPartitionDisposal(versionedPartitionName.getPartitionName());
        boolean got = storage.get(amzaStats.getIoStats,
            versionedPartitionName,
            partitionStore.getWalStorage(),
            prefix,
            keys,
            (prefix1, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
                if (isDisposed(valueVersion, disposalVersion)) {
                    return stream.stream(prefix1, key, null, -1, false, -1);
                }
                return stream.stream(prefix1, key, value, valueTimestamp, valueTombstoned, valueVersion);
            });
        amzaStats.gets(versionedPartitionName.getPartitionName(), 1, System.currentTimeMillis() - start);
        return got;
    }

    /**
     * Scans the partition via {@code storage.rowScan}, forwarding entries to {@code keyValueStream}.
     * Entries whose version is below the partition's disposal version are skipped (the scan
     * continues). Elapsed time is recorded as a scan or a key-scan depending on {@code hydrateValues}.
     *
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public void rowScan(VersionedAquarium versionedAquarium,
        KeyValueStream keyValueStream,
        boolean hydrateValues,
        boolean requiresOnline) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);
        PartitionStore partitionStore = requirePartitionStore("stripe>rowScan", versionedPartitionName);

        long start = System.currentTimeMillis();
        long disposalVersion = partitionCreator.getPartitionDisposal(versionedPartitionName.getPartitionName());
        storage.rowScan(versionedPartitionName,
            partitionStore,
            (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
                if (isDisposed(valueVersion, disposalVersion)) {
                    return true; // skip the disposed entry but keep scanning
                }
                return keyValueStream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion);
            },
            hydrateValues);
        recordScan(versionedPartitionName, hydrateValues, start);
    }

    /**
     * Scans a key range via {@code storage.rangeScan}, forwarding entries to {@code keyValueStream}.
     * Entries whose version is below the partition's disposal version are skipped (the scan
     * continues). Elapsed time is recorded as a scan or a key-scan depending on {@code hydrateValues}.
     *
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public void rangeScan(VersionedAquarium versionedAquarium,
        byte[] fromPrefix,
        byte[] fromKey,
        byte[] toPrefix,
        byte[] toKey,
        boolean hydrateValues,
        boolean requiresOnline,
        KeyValueStream keyValueStream) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);
        PartitionStore partitionStore = requirePartitionStore("stripe>rangeScan", versionedPartitionName);

        long start = System.currentTimeMillis();
        long disposalVersion = partitionCreator.getPartitionDisposal(versionedPartitionName.getPartitionName());
        storage.rangeScan(versionedPartitionName,
            partitionStore,
            fromPrefix,
            fromKey,
            toPrefix,
            toKey,
            (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
                if (isDisposed(valueVersion, disposalVersion)) {
                    return true; // skip the disposed entry but keep scanning
                }
                return keyValueStream.stream(prefix, key, value, valueTimestamp, valueTombstoned, valueVersion);
            },
            hydrateValues);
        recordScan(versionedPartitionName, hydrateValues, start);
    }

    /**
     * Callback used by {@link #takeRowUpdatesSince}; every argument may be {@code null}
     * (see that method for when).
     */
    public interface TakeRowUpdates<R> {

        R give(VersionedPartitionName versionedPartitionName, LivelyEndState livelyEndState, RowStreamer streamer) throws Exception;
    }

    /**
     * Pushes rows into the supplied {@link RowStream}.
     */
    public interface RowStreamer {

        boolean stream(RowStream rowStream) throws Exception;
    }

    /**
     * Streams every row of the partition to {@code rowStream}. Silently does nothing when the
     * versioned partition name, the lively end state, or the partition store is {@code null}.
     */
    public void takeAllRows(VersionedAquarium versionedAquarium, RowStream rowStream) throws Exception {
        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        LivelyEndState livelyEndState = versionedAquarium.getLivelyEndState();
        if (versionedPartitionName == null || livelyEndState == null) {
            return;
        }
        PartitionStore partitionStore = partitionCreator.get("stripe>takeAllRows", versionedPartitionName, stripeIndex);
        if (partitionStore == null) {
            return;
        }
        storage.takeAllRows(amzaStats.takeIoStats, versionedPartitionName, partitionStore.getWalStorage(), rowStream);
    }

    /**
     * Hands {@code takeRowUpdates} a streamer over rows starting from {@code transactionId}.
     *
     * <ul>
     * <li>If the partition name, lively end state, its current state, or the partition store is
     * {@code null}, the callback receives {@code (null, null, null)}.</li>
     * <li>If the current state is {@link State#expunged}, the callback receives a {@code null} streamer.</li>
     * </ul>
     *
     * @return whatever the callback returns
     */
    public <R> R takeRowUpdatesSince(IoStats ioStats,
        VersionedAquarium versionedAquarium,
        long transactionId,
        TakeRowUpdates<R> takeRowUpdates) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        LivelyEndState livelyEndState = versionedAquarium.getLivelyEndState();
        if (versionedPartitionName == null || livelyEndState == null || livelyEndState.getCurrentState() == null) {
            return takeRowUpdates.give(null, null, null);
        }

        PartitionStore partitionStore = partitionCreator.get("stripe>takeRowUpdatesSince", versionedPartitionName, stripeIndex);
        if (partitionStore == null) {
            return takeRowUpdates.give(null, null, null);
        }

        RowStreamer streamer = (livelyEndState.getCurrentState() != State.expunged)
            ? rowStream -> storage.takeRowsFromTransactionId(ioStats,
                versionedPartitionName,
                partitionStore.getWalStorage(),
                transactionId,
                rowStream)
            : null;
        return takeRowUpdates.give(versionedPartitionName, livelyEndState, streamer);
    }

    /**
     * Streams key/values from rows starting at {@code transactionId}. Primary rows are decoded and
     * forwarded to {@code txKeyValueStream}; highwater rows are forwarded to {@code highwaters}.
     * Values whose version is below the partition's disposal version are skipped
     * ({@link TxResult#MORE}).
     *
     * @return the partition highwater (read before streaming starts) when
     *         {@code storage.takeRowsFromTransactionId} returns {@code true}; otherwise {@code null}
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public WALHighwater takeFromTransactionId(IoStats ioStats,
        VersionedAquarium versionedAquarium,
        long transactionId,
        boolean requiresOnline,
        HighwaterStorage highwaterStorage,
        Highwaters highwaters,
        TxKeyValueStream txKeyValueStream) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        // Captured up front, before the online check and before any rows are read.
        WALHighwater partitionHighwater = highwaterStorage.getPartitionHighwater(versionedPartitionName, true);
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);
        PartitionStore partitionStore = requirePartitionStore("stripe>takeFromTransactionId", versionedPartitionName);

        // Single-element array so the lambda below can publish its result.
        WALHighwater[] highwater = new WALHighwater[1];
        long disposalVersion = partitionCreator.getPartitionDisposal(versionedPartitionName.getPartitionName());
        primaryRowMarshaller.fromRows(txFpRowStream -> {
            RowStream stream = (rowFP, rowTxId, rowType, row) -> {
                if (rowType.isPrimary()) {
                    return txFpRowStream.stream(rowTxId, rowFP, rowType, row);
                } else if (rowType == RowType.highwater) {
                    highwaters.highwater(highwaterRowMarshaller.fromBytes(row));
                }
                return true;
            };
            if (storage.takeRowsFromTransactionId(ioStats,
                versionedPartitionName,
                partitionStore.getWalStorage(),
                transactionId,
                stream)) {
                highwater[0] = partitionHighwater;
            }
            return true;
        }, (rowTxId, prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
            if (isDisposed(valueVersion, disposalVersion)) {
                return TxResult.MORE;
            }
            return txKeyValueStream.stream(rowTxId, prefix, key, value, valueTimestamp, valueTombstoned, valueVersion);
        });
        return highwater[0];
    }

    /**
     * Prefix-restricted variant of
     * {@link #takeFromTransactionId(IoStats, VersionedAquarium, long, boolean, HighwaterStorage, Highwaters, TxKeyValueStream)}.
     * Unlike that overload it always uses {@code amzaStats.takeIoStats} for IO accounting.
     *
     * @return the partition highwater (read before streaming starts) when
     *         {@code storage.takeRowsFromTransactionId} returns {@code true}; otherwise {@code null}
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public WALHighwater takeFromTransactionId(VersionedAquarium versionedAquarium,
        byte[] prefix,
        long transactionId,
        boolean requiresOnline,
        HighwaterStorage highwaterStorage,
        Highwaters highwaters,
        TxKeyValueStream txKeyValueStream) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        // Captured up front, before the online check and before any rows are read.
        WALHighwater partitionHighwater = highwaterStorage.getPartitionHighwater(versionedPartitionName, true);
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);
        PartitionStore partitionStore = requirePartitionStore("stripe>takeFromTransactionIdPrefix", versionedPartitionName);

        // Single-element array so the lambda below can publish its result.
        WALHighwater[] highwater = new WALHighwater[1];
        long disposalVersion = partitionCreator.getPartitionDisposal(versionedPartitionName.getPartitionName());
        primaryRowMarshaller.fromRows(txFpRowStream -> {
            RowStream stream = (rowFP, rowTxId, rowType, row) -> {
                if (rowType.isPrimary()) {
                    return txFpRowStream.stream(rowTxId, rowFP, rowType, row);
                } else if (rowType == RowType.highwater) {
                    highwaters.highwater(highwaterRowMarshaller.fromBytes(row));
                }
                return true;
            };
            if (storage.takeRowsFromTransactionId(amzaStats.takeIoStats,
                versionedPartitionName,
                partitionStore.getWalStorage(),
                prefix,
                transactionId,
                stream)) {
                highwater[0] = partitionHighwater;
            }
            return true;
        }, (rowTxId, prefix1, key, value, valueTimestamp, valueTombstoned, valueVersion) -> {
            if (isDisposed(valueVersion, disposalVersion)) {
                return TxResult.MORE;
            }
            return txKeyValueStream.stream(rowTxId, prefix1, key, value, valueTimestamp, valueTombstoned, valueVersion);
        });
        return highwater[0];
    }

    /**
     * Returns {@code storage.count} for the partition. No online check is performed.
     *
     * @throws IllegalStateException if no partition store is defined
     */
    public long count(VersionedAquarium versionedAquarium) throws Exception {
        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        // any state is OK!
        PartitionStore partitionStore = requirePartitionStore("stripe>count", versionedPartitionName);
        return storage.count(versionedPartitionName, partitionStore.getWalStorage());
    }

    /**
     * Returns the key count reported by the partition store's WAL storage. No online check is performed.
     *
     * @throws IllegalStateException if no partition store is defined
     */
    public long keyCount(VersionedAquarium versionedAquarium) throws Exception {
        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        // any state is OK!
        PartitionStore partitionStore = requirePartitionStore("stripe>keyCount", versionedPartitionName);
        return partitionStore.getWalStorage().keyCount();
    }

    /**
     * Returns the clobber count reported by the partition store's WAL storage. No online check is performed.
     *
     * @throws IllegalStateException if no partition store is defined
     */
    public long clobberCount(VersionedAquarium versionedAquarium) throws Exception {
        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        // any state is OK!
        PartitionStore partitionStore = requirePartitionStore("stripe>clobberCount", versionedPartitionName);
        return partitionStore.getWalStorage().clobberCount();
    }

    /**
     * Returns {@code storage.approximateCount} for the partition. No online check is performed.
     *
     * @throws IllegalStateException if no partition store is defined
     */
    public long approximateCount(VersionedAquarium versionedAquarium) throws Exception {
        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        // any state is OK!
        PartitionStore partitionStore = requirePartitionStore("stripe>approximateCount", versionedPartitionName);
        return storage.approximateCount(versionedPartitionName, partitionStore.getWalStorage());
    }

    /**
     * Checks the supplied keys for presence and streams each result to {@code stream}. Keys whose
     * version is below the partition's disposal version are reported as not contained
     * (timestamp {@code -1}, version {@code -1}).
     *
     * @return whatever {@code storage.containsKeys} returns
     * @throws IllegalStateException if {@code requiresOnline} and the partition is not online, or if
     *                               no partition store is defined
     */
    public boolean containsKeys(VersionedAquarium versionedAquarium,
        byte[] prefix,
        boolean requiresOnline,
        UnprefixedWALKeys keys,
        KeyContainedStream stream) throws Exception {

        VersionedPartitionName versionedPartitionName = versionedAquarium.getVersionedPartitionName();
        checkOnline(versionedAquarium, versionedPartitionName, requiresOnline);
        PartitionStore partitionStore = requirePartitionStore("stripe>containsKey", versionedPartitionName);

        long start = System.currentTimeMillis();
        long disposalVersion = partitionCreator.getPartitionDisposal(versionedPartitionName.getPartitionName());
        boolean contained = storage.containsKeys(versionedPartitionName,
            partitionStore.getWalStorage(),
            prefix,
            keys,
            (prefix1, key, contained1, timestamp, version) -> {
                if (isDisposed(version, disposalVersion)) {
                    return stream.stream(prefix1, key, false, -1, -1);
                }
                return stream.stream(prefix1, key, contained1, timestamp, version);
            });
        // NOTE(review): recorded under the "scans" stat rather than "gets" - confirm this is intended.
        amzaStats.scans(versionedPartitionName.getPartitionName(), 1, System.currentTimeMillis() - start);
        return contained;
    }

    /**
     * Enforces the online precondition when requested.
     *
     * @throws IllegalStateException if {@code requiresOnline} and the lively end state is not online
     */
    private static void checkOnline(VersionedAquarium versionedAquarium,
        VersionedPartitionName versionedPartitionName,
        boolean requiresOnline) throws Exception {
        if (requiresOnline) {
            LivelyEndState livelyEndState = versionedAquarium.getLivelyEndState();
            Preconditions.checkState(livelyEndState.isOnline(), "Partition:%s state:%s is not online.", versionedPartitionName, livelyEndState);
        }
    }

    /**
     * Resolves the partition store for this stripe, failing when none is defined.
     *
     * @param context label passed through to {@code partitionCreator.get}
     * @throws IllegalStateException if the creator returns {@code null}
     */
    private PartitionStore requirePartitionStore(String context, VersionedPartitionName versionedPartitionName) throws Exception {
        PartitionStore partitionStore = partitionCreator.get(context, versionedPartitionName, stripeIndex);
        if (partitionStore == null) {
            throw new IllegalStateException("No partition defined for " + versionedPartitionName);
        }
        return partitionStore;
    }

    /**
     * Tells whether a value version falls below the disposal version; {@code -1} is never disposed.
     */
    private static boolean isDisposed(long valueVersion, long disposalVersion) {
        return valueVersion != -1 && valueVersion < disposalVersion;
    }

    /**
     * Records the elapsed time since {@code start} as a scan (values hydrated) or a key-only scan.
     */
    private void recordScan(VersionedPartitionName versionedPartitionName, boolean hydrateValues, long start) {
        if (hydrateValues) {
            amzaStats.scans(versionedPartitionName.getPartitionName(), 1, System.currentTimeMillis() - start);
        } else {
            amzaStats.scanKeys(versionedPartitionName.getPartitionName(), 1, System.currentTimeMillis() - start);
        }
    }

    @Override
    public String toString() {
        return "PartitionStripe{" + "name='" + name + '\'' + '}';
    }

    @Override
    public int hashCode() {
        int hash = 3;
        hash = 19 * hash + Objects.hashCode(this.name);
        return hash;
    }

    /**
     * Two stripes are equal when they are of the same class and have equal names.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null) {
            return false;
        }
        if (getClass() != obj.getClass()) {
            return false;
        }
        final PartitionStripe other = (PartitionStripe) obj;
        return Objects.equals(this.name, other.name);
    }
}