package com.jivesoftware.os.amza.service.replication; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import com.jivesoftware.os.amza.api.AmzaInterner; import com.jivesoftware.os.amza.api.TimestampedValue; import com.jivesoftware.os.amza.api.filer.UIO; import com.jivesoftware.os.amza.api.partition.PartitionName; import com.jivesoftware.os.amza.api.partition.RingMembership; import com.jivesoftware.os.amza.api.partition.StorageVersion; import com.jivesoftware.os.amza.api.partition.VersionedPartitionName; import com.jivesoftware.os.amza.api.ring.RingMember; import com.jivesoftware.os.amza.api.scan.RowChanges; import com.jivesoftware.os.amza.api.scan.RowsChanged; import com.jivesoftware.os.amza.api.wal.WALKey; import com.jivesoftware.os.amza.api.wal.WALUpdated; import com.jivesoftware.os.amza.api.wal.WALValue; import com.jivesoftware.os.amza.service.AwaitNotify; import com.jivesoftware.os.amza.service.NotARingMemberException; import com.jivesoftware.os.amza.service.PartitionIsDisposedException; import com.jivesoftware.os.amza.service.PropertiesNotPresentException; import com.jivesoftware.os.amza.service.partition.VersionedPartitionProvider; import com.jivesoftware.os.amza.service.storage.PartitionCreator; import com.jivesoftware.os.amza.service.storage.SystemWALStorage; import com.jivesoftware.os.amza.service.storage.delta.DeltaStripeWALStorage; import com.jivesoftware.os.jive.utils.collections.lh.LHMapState; import com.jivesoftware.os.jive.utils.collections.lh.LHash; import com.jivesoftware.os.jive.utils.ordered.id.OrderIdProvider; import com.jivesoftware.os.mlogger.core.MetricLogger; import com.jivesoftware.os.mlogger.core.MetricLoggerFactory; import java.io.File; import java.io.IOException; import java.nio.channels.FileLock; import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Random; import java.util.concurrent.Callable; import java.util.concurrent.Semaphore; import java.util.concurrent.atomic.AtomicLong; /** * */ public class StorageVersionProvider implements CurrentVersionProvider, RowChanges, SystemStriper { private static final MetricLogger LOG = MetricLoggerFactory.getLogger(); private final AmzaInterner amzaInterner; private final OrderIdProvider orderIdProvider; private final RingMember rootRingMember; private final SystemWALStorage systemWALStorage; private final VersionedPartitionProvider versionedPartitionProvider; private final RingMembership ringMembership; private final File[] workingIndexDirectories; private final FileLock[] stripeLocks; private final long[] stripeVersions; private final long stripeMaxFreeWithinNBytes; private final DeltaStripeWALStorage[] deltaStripeWALStorages; private final WALUpdated walUpdated; private final AwaitNotify<PartitionName> awaitNotify; private final Map<PartitionName, StickyStorage> partitionStorage = Maps.newConcurrentMap(); private final Map<RingMemberAndPartitionName, StorageVersion> remoteVersionCache = Maps.newConcurrentMap(); public StorageVersionProvider(AmzaInterner amzaInterner, OrderIdProvider orderIdProvider, RingMember rootRingMember, SystemWALStorage systemWALStorage, VersionedPartitionProvider versionedPartitionProvider, RingMembership ringMembership, File[] workingIndexDirectories, long[] stripeVersions, FileLock[] stripeLocks, long stripeMaxFreeWithinNBytes, DeltaStripeWALStorage[] deltaStripeWALStorages, WALUpdated walUpdated, AwaitNotify<PartitionName> awaitNotify) { this.amzaInterner = amzaInterner; this.orderIdProvider = orderIdProvider; this.rootRingMember = rootRingMember; this.systemWALStorage = systemWALStorage; this.versionedPartitionProvider = versionedPartitionProvider; this.ringMembership = ringMembership; this.workingIndexDirectories = workingIndexDirectories; this.stripeVersions = stripeVersions; this.stripeLocks = stripeLocks; this.stripeMaxFreeWithinNBytes = stripeMaxFreeWithinNBytes; this.deltaStripeWALStorages = deltaStripeWALStorages; this.walUpdated = walUpdated; this.awaitNotify = awaitNotify; } public void start() { for (int i = 0; i < stripeLocks.length; i++) { Preconditions.checkState(stripeLocks[i].isValid() && !stripeLocks[i].isShared()); } } public void stop() { for (int i = 0; i < stripeLocks.length; i++) { try { stripeLocks[i].release(); } catch (IOException x) { LOG.error("Failed to release stripe lock {} for {}", new Object[] { i, workingIndexDirectories[i] }, x); } } } private static byte[] walKey(RingMember member, PartitionName partitionName) throws Exception { byte[] memberBytes = member.toBytes(); if (partitionName != null) { byte[] partitionNameBytes = partitionName.toBytes(); byte[] asBytes = new byte[1 + 4 + memberBytes.length + 4 + partitionNameBytes.length]; asBytes[0] = 0; // version UIO.intBytes(memberBytes.length, asBytes, 1); System.arraycopy(memberBytes, 0, asBytes, 1 + 4, memberBytes.length); UIO.intBytes(partitionNameBytes.length, asBytes, 1 + 4 + memberBytes.length); System.arraycopy(partitionNameBytes, 0, asBytes, 1 + 4 + memberBytes.length + 4, partitionNameBytes.length); return asBytes; } else { byte[] asBytes = new byte[1 + 4 + memberBytes.length]; asBytes[0] = 0; // version UIO.intBytes(memberBytes.length, asBytes, 1); System.arraycopy(memberBytes, 0, asBytes, 1 + 4, memberBytes.length); return asBytes; } } private StorageVersionProvider.StickyStorage getStickyStorage(PartitionName partitionName) { return partitionStorage.computeIfAbsent(partitionName, key -> { try { return new StorageVersionProvider.StickyStorage(getRawStorageVersion(partitionName)); } catch (Exception e) { throw new RuntimeException(e); } }); } private StorageVersion getRawStorageVersion(PartitionName partitionName) throws Exception { TimestampedValue rawState = systemWALStorage.getTimestampedValue(PartitionCreator.PARTITION_VERSION_INDEX, null, walKey(rootRingMember, partitionName)); if (rawState != null) { return StorageVersion.fromBytes(rawState.getValue()); } else { return null; } } public StorageVersion createIfAbsent(PartitionName partitionName) throws Exception { if (partitionName.isSystemPartition()) { return new StorageVersion(0, 0); } StickyStorage stickyStorage = getStickyStorage(partitionName); int stripeIndex = getCurrentStripe(stickyStorage.storageVersion); if (stripeIndex == -1) { stickyStorage.semaphore.acquire(Short.MAX_VALUE); try { stickyStorage = getStickyStorage(partitionName); stripeIndex = (stickyStorage.storageVersion == null) ? -1 : getStripeIndex(stickyStorage.storageVersion.stripeVersion); if (stripeIndex == -1) { if (!versionedPartitionProvider.hasPartition(partitionName)) { throw new PropertiesNotPresentException("Properties missing for " + partitionName); } if (!ringMembership.isMemberOfRing(partitionName.getRingName(), 0)) { throw new NotARingMemberException("Not a member of ring for " + partitionName); } long maxFree = 0; long[] free = new long[workingIndexDirectories.length]; for (int i = 0; i < workingIndexDirectories.length; i++) { free[i] = workingIndexDirectories[i].getFreeSpace(); if (free[i] > maxFree) { maxFree = free[i]; } } List<Integer> eligible = Lists.newArrayList(); for (int i = 0; i < workingIndexDirectories.length; i++) { long nearMaxFree = maxFree - free[i]; if (nearMaxFree <= stripeMaxFreeWithinNBytes) { eligible.add(i); } } if (eligible.isEmpty()) { throw new IllegalStateException("No disk free"); } Random r = new Random(); stripeIndex = eligible.get(r.nextInt(eligible.size())); updateStickyStorage(partitionName, stickyStorage, orderIdProvider.nextId(), stripeIndex); } } finally { stickyStorage.semaphore.release(Short.MAX_VALUE); } } return stickyStorage.storageVersion; } @Override public <R> R tx(PartitionName partitionName, StorageVersion requireStorageVersion, StripeIndexs<R> tx) throws Exception { if (partitionName.isSystemPartition()) { return tx.tx(-1, getSystemStripe(partitionName), new StorageVersion(0, 0)); } StorageVersionProvider.StickyStorage stickyStorage = getStickyStorage(partitionName); stickyStorage.semaphore.acquire(); try { StorageVersion currentStorageVersion = stickyStorage.storageVersion; if (currentStorageVersion == null && requireStorageVersion == null) { return tx.tx(-1, -1, null); } if (currentStorageVersion == null) { throw new PartitionIsDisposedException("Partition " + partitionName + " is disposed"); } if (requireStorageVersion != null && currentStorageVersion.partitionVersion != requireStorageVersion.partitionVersion) { throw new IllegalArgumentException("Partition version has changed" + " got:" + currentStorageVersion.partitionVersion + " required:" + requireStorageVersion.partitionVersion); } int stripeIndex = getCurrentStripe(currentStorageVersion); if (stripeIndex == -1) { throw new IllegalArgumentException("Missing stripe index for:" + partitionName + " with stripe version:" + currentStorageVersion.stripeVersion); } StickyStripe stickyStripe; synchronized (stickyStorage.stripeCache) { stickyStripe = stickyStorage.stripeCache.get(currentStorageVersion.partitionVersion); if (stickyStripe == null) { VersionedPartitionName versionedPartitionName = new VersionedPartitionName(partitionName, currentStorageVersion.partitionVersion); int s = stripeIndex; for (int i = 0; i < deltaStripeWALStorages.length; i++) { DeltaStripeWALStorage deltaStripeWALStorage = deltaStripeWALStorages[i]; if (deltaStripeWALStorage.hasChangesFor(versionedPartitionName)) { s = i; } } stickyStripe = new StickyStripe(s); stickyStorage.stripeCache.put(currentStorageVersion.partitionVersion, stickyStripe); } stickyStripe.acquired.incrementAndGet(); } try { return tx.tx(stickyStripe.stripeIndex, stripeIndex, currentStorageVersion); } finally { stickyStripe.acquired.decrementAndGet(); } } finally { stickyStorage.semaphore.release(); } } @Override public void invalidateDeltaIndexCache(VersionedPartitionName versionedPartitionName) throws Exception { StickyStorage stickyStorage = getStickyStorage(versionedPartitionName.getPartitionName()); stickyStorage.semaphore.acquire(); try { synchronized (stickyStorage.stripeCache) { StickyStripe stickyStripe = stickyStorage.stripeCache.get(versionedPartitionName.getPartitionVersion()); if (stickyStripe != null && stickyStripe.acquired.get() == 0) { stickyStorage.stripeCache.remove(versionedPartitionName.getPartitionVersion()); } } } finally { stickyStorage.semaphore.release(); } } // Sucks but its our legacy @Override public int getSystemStripe(PartitionName partitionName) { return Math.abs(partitionName.hashCode() % stripeVersions.length); } private int getCurrentStripe(StorageVersion storageVersion) { return (storageVersion == null) ? -1 : getStripeIndex(storageVersion.stripeVersion); } @Override public boolean isCurrentVersion(VersionedPartitionName versionedPartitionName) { PartitionName partitionName = versionedPartitionName.getPartitionName(); if (partitionName.isSystemPartition()) { return true; } StorageVersion storageVersion = getStickyStorage(partitionName).storageVersion; return storageVersion != null && storageVersion.partitionVersion == versionedPartitionName.getPartitionVersion(); } @Override public void abandonVersion(VersionedPartitionName versionedPartitionName) throws Exception { PartitionName partitionName = versionedPartitionName.getPartitionName(); StickyStorage stickyStorage = getStickyStorage(partitionName); stickyStorage.semaphore.acquire(Short.MAX_VALUE); try { StorageVersion storageVersion = getStickyStorage(partitionName).storageVersion; int stripe = (storageVersion == null) ? -1 : getStripeIndex(storageVersion.stripeVersion); if (stripe != -1 && storageVersion.partitionVersion <= versionedPartitionName.getPartitionVersion()) { updateStickyStorage(partitionName, stickyStorage, orderIdProvider.nextId(), stripe); } } finally { stickyStorage.semaphore.release(Short.MAX_VALUE); } } // call with all semaphores for partition void transitionStripe(VersionedPartitionName versionedPartitionName, StorageVersion requireStorageVersion, int rebalanceToStripe) throws Exception { PartitionName partitionName = versionedPartitionName.getPartitionName(); StickyStorage stickyStorage = getStickyStorage(partitionName); StorageVersion currentStorageVersion = stickyStorage.storageVersion; if (requireStorageVersion.equals(currentStorageVersion)) { updateStickyStorage(partitionName, stickyStorage, requireStorageVersion.partitionVersion, rebalanceToStripe); } else { throw new IllegalStateException( "Failed to transition to versionedPartitionName:" + versionedPartitionName + " stripe:" + rebalanceToStripe + " from " + currentStorageVersion + " to " + requireStorageVersion); } } <V> V replaceOneWithAll(PartitionName partitionName, Callable<V> callable) throws Exception { StickyStorage stickyStorage = getStickyStorage(partitionName); stickyStorage.semaphore.release(); try { stickyStorage.semaphore.acquire(Short.MAX_VALUE); try { return callable.call(); } finally { stickyStorage.semaphore.release(Short.MAX_VALUE); } } finally { stickyStorage.semaphore.acquire(); } } public interface PartitionMemberStorageVersionStream { boolean stream(PartitionName partitionName, RingMember ringMember, StorageVersion storageVersion) throws Exception; } public void streamLocal(PartitionMemberStorageVersionStream stream) throws Exception { byte[] fromKey = walKey(rootRingMember, null); byte[] toKey = WALKey.prefixUpperExclusive(fromKey); systemWALStorage.rangeScan(PartitionCreator.PARTITION_VERSION_INDEX, null, fromKey, null, toKey, (prefix, key, value, valueTimestamp, valueTombstoned, valueVersion) -> { if (valueTimestamp != -1 && !valueTombstoned) { int o = 0; o++; //serializationVersion int ringMemberLength = UIO.bytesInt(key, o); o += 4; RingMember ringMember = amzaInterner.internRingMember(key, o, ringMemberLength); o += ringMemberLength; int partitionNameBytesLength = UIO.bytesInt(key, o); o += 4; // partitionNameLength PartitionName partitionName = amzaInterner.internPartitionName(key, o, partitionNameBytesLength); StorageVersion storageVersion = StorageVersion.fromBytes(value); int stripe = getStripeIndex(storageVersion.stripeVersion); if (stripe != -1) { return stream.stream(partitionName, ringMember, storageVersion); } } return true; }, true); } private int getStripeIndex(long stripeVersion) { for (int i = 0; i < stripeVersions.length; i++) { if (stripeVersions[i] == stripeVersion) { return i; } } return -1; } public PartitionName partitionNameFromKey(byte[] key) throws Exception { int o = 0; o++; //serializationVersion int ringMemberLength = UIO.bytesInt(key, o); o += 4; o += ringMemberLength; int partitionNameBytesLength = UIO.bytesInt(key, o); o += 4; // partitionNameLength return amzaInterner.internPartitionName(key, o, partitionNameBytesLength); } public StorageVersion getRemote(RingMember ringMember, PartitionName partitionName) throws Exception { return remoteVersionCache.computeIfAbsent(new RingMemberAndPartitionName(ringMember, partitionName), key -> { try { TimestampedValue rawState = systemWALStorage.getTimestampedValue(PartitionCreator.PARTITION_VERSION_INDEX, null, walKey(ringMember, partitionName)); if (rawState == null) { return null; } return StorageVersion.fromBytes(rawState.getValue()); } catch (Exception e) { throw new RuntimeException(e); } }); } private void updateStickyStorage(PartitionName partitionName, StickyStorage stickyStorage, long partitionVersion, int stripe) throws Exception { StorageVersion storageVersion = new StorageVersion(partitionVersion, stripeVersions[stripe]); VersionedPartitionName versionedPartitionName = new VersionedPartitionName(partitionName, partitionVersion); StorageVersion cachedVersion = stickyStorage.storageVersion; if (cachedVersion != null && cachedVersion.equals(storageVersion)) { return; } byte[] versionedStateBytes = storageVersion.toBytes(); awaitNotify.notifyChange(partitionName, () -> { long timestampAndVersion = orderIdProvider.nextId(); RowsChanged rowsChanged = systemWALStorage.update(PartitionCreator.PARTITION_VERSION_INDEX, null, (highwaters, scan) -> scan.row(orderIdProvider.nextId(), walKey(rootRingMember, partitionName), versionedStateBytes, timestampAndVersion, false, timestampAndVersion), walUpdated); return !rowsChanged.isEmpty(); }); LOG.info("Storage version: {} {} was updated to {}", rootRingMember, versionedPartitionName, partitionVersion); stickyStorage.storageVersion = storageVersion; //TODO anything to notify? //takeCoordinator.stateChanged(amzaRingReader, versionedPartitionName, commitableStorageVersion.state); //takeCoordinator.awakeCya(); } public boolean remove(RingMember rootRingMember, VersionedPartitionName versionedPartitionName) throws Exception { StickyStorage stickyStorage = getStickyStorage(versionedPartitionName.getPartitionName()); stickyStorage.semaphore.acquire(Short.MAX_VALUE); try { long timestampAndVersion = orderIdProvider.nextId(); RowsChanged rowsChanged = systemWALStorage.update(PartitionCreator.PARTITION_VERSION_INDEX, null, (highwaters, scan) -> scan.row(orderIdProvider.nextId(), walKey(rootRingMember, versionedPartitionName.getPartitionName()), null, timestampAndVersion, true, timestampAndVersion), walUpdated); LOG.info("Storage version: {} {} was removed: {}", rootRingMember, versionedPartitionName, rowsChanged); stickyStorage.storageVersion = null; stickyStorage.stripeCache.remove(versionedPartitionName.getPartitionVersion()); return !rowsChanged.isEmpty(); } finally { stickyStorage.semaphore.release(Short.MAX_VALUE); } } @Override public void changes(RowsChanged changes) throws Exception { if (PartitionCreator.PARTITION_VERSION_INDEX.equals(changes.getVersionedPartitionName())) { for (Map.Entry<WALKey, WALValue> change : changes.getApply().entrySet()) { clearCache(change.getKey().key, change.getValue().getValue()); } } } private void invalidateRemoteVersionCache(RingMember ringMember, PartitionName partitionName) { remoteVersionCache.remove(new RingMemberAndPartitionName(ringMember, partitionName)); } void clearCache(byte[] walKey, byte[] walValue) throws Exception { int o = 0; o++; // serializationVersion int ringMemberLength = UIO.bytesInt(walKey, o); o += 4; RingMember ringMember = amzaInterner.internRingMember(walKey, o, ringMemberLength); o += ringMemberLength; if (ringMember != null) { int partitionNameBytesLength = UIO.bytesInt(walKey, o); o += 4; // partitionNameLength PartitionName partitionName = amzaInterner.internPartitionName(walKey, o, partitionNameBytesLength); if (ringMember.equals(rootRingMember)) { if (walValue != null) { StorageVersion storageVersion = StorageVersion.fromBytes(walValue); LOG.warn("Received external row changes for partition {} version {}", partitionName, storageVersion); } else { LOG.warn("Received external row changes for partition {} with no version", partitionName); } } else { invalidateRemoteVersionCache(ringMember, partitionName); } } } private static class StickyStorage { private final Semaphore semaphore = new Semaphore(Short.MAX_VALUE, true); private final LHash<StickyStripe> stripeCache = new LHash<>(new LHMapState<>(3, -1, -2)); private volatile StorageVersion storageVersion; private StickyStorage(StorageVersion storageVersion) { this.storageVersion = storageVersion; } } private static class StickyStripe { private final int stripeIndex; private final AtomicLong acquired = new AtomicLong(); private StickyStripe(int stripeIndex) { this.stripeIndex = stripeIndex; } } private static class RingMemberAndPartitionName { private final byte[] ringMemberBytes; private final boolean systemPartition; private final byte[] ringNameBytes; private final byte[] partitionNameBytes; private final int hash; public RingMemberAndPartitionName(RingMember ringMember, PartitionName partitionName) { this.ringMemberBytes = ringMember.leakBytes(); this.systemPartition = partitionName.isSystemPartition(); this.ringNameBytes = partitionName.getRingName(); this.partitionNameBytes = partitionName.getName(); this.hash = ringMember.hashCode() + 31 * partitionName.hashCode(); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } RingMemberAndPartitionName that = (RingMemberAndPartitionName) o; if (systemPartition != that.systemPartition) { return false; } if (!Arrays.equals(ringMemberBytes, that.ringMemberBytes)) { return false; } if (!Arrays.equals(ringNameBytes, that.ringNameBytes)) { return false; } return Arrays.equals(partitionNameBytes, that.partitionNameBytes); } @Override public int hashCode() { return hash; } } }