package org.infinispan.query.indexmanager;

import java.util.List;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicInteger;

import org.hibernate.search.backend.BackendFactory;
import org.infinispan.factories.ComponentRegistry;
import org.infinispan.notifications.Listener;
import org.infinispan.notifications.cachemanagerlistener.annotation.ViewChanged;
import org.infinispan.notifications.cachemanagerlistener.event.ViewChangedEvent;
import org.infinispan.query.logging.Log;
import org.infinispan.remoting.rpc.RpcManager;
import org.infinispan.remoting.transport.Address;
import org.infinispan.util.logging.LogFactory;

import net.jcip.annotations.GuardedBy;

/**
 * Manages the current state of being a "master" node or a node delegating index
 * update operations to other nodes.
 * In a static cluster this would have been a boolean state, but a state machine
 * is modelled here to cope with transitions between:
 *
 * Initialization of a node - still not having enough information on the cluster
 * Becoming a master because of the previous master's failure / shutdown
 * Forfeiting the master role (useful for cluster merges)
 *
 * The transition to become a master goes through different phases, and at each state
 * the process is reversible. So for example if operations have been put on hold
 * while the node is being upgraded, but the master election is then moved to a
 * different node quickly (cluster startup scenario), the buffered operations
 * will be forwarded to the last backend.
 * A node that is forwarded update operations but is no longer the master
 * will re-forward the payload to the new master: stability by induction.
 *
 * The solution is rather poor at managing cluster merge operations, but we
 * need to build on upcoming functionality from Infinispan core for that; for
 * example the index content wouldn't be consistent either, so one would likely
 * need to wipe the index and rebuild it.
 * We're also dealing with the inherent limitation of a "cluster wide lock"
 * concept not being compatible with sub-groups of nodes in which a new leader
 * might be elected and a lock per group might have been created.
 *
 * Lock cleanup is not too aggressive: in case a stale lock is detected,
 * scheduled work is postponed. This implies that in situations in which
 * a stale lock needs to be cleaned up, index operations might not be visible
 * to the transaction committer.
 * I've chosen this option as the lesser evil vs. blocking incoming RPCs,
 * although if the buffer for postponed operations fills up too quickly,
 * we'll both speed up the lock acquisition and apply backpressure to the clients.
 *
 * @author Sanne Grinovero <sanne@hibernate.org> (C) 2014 Red Hat Inc.
 * @since 7.0
 */
@Listener
final class ClusteredSwitchingBackend implements LazyInitializableBackend {

   private static final Log log = LogFactory.getLog(ClusteredSwitchingBackend.class, Log.class);

   /**
    * Each attempt introduces approximately 10 seconds of delay, and waiting
    * longer should never be reasonable as it would mean we're handling a cluster merge.
    * Infinispan doesn't currently handle merges, so in that case the index
    * is probably corrupted: no point in keeping the lock either.
    * The only reason to wait for it is to handle very brief merges caused by
    * occasional high load, or in case users are writing directly to the index.
    * Sustained direct writes to the index should not be done either, at least
    * not without disabling index exclusivity, which implies the lock will be
    * available in a shorter time.
    */
   private static final int MAX_LOCK_ACQUISITION_ATTEMPTS = 2;

   private final Address localAddress;
   private final RpcManager rpcManager;
   private final LocalBackendFactory factory;
   private final IndexLockController indexlock;
   private final boolean async;
   private final String indexName;
   private final String cacheName;

   /**
    * Monotonically increasing view identification sequence:
    * we use it to ignore stale events.
    * FIXME: why are ids just an int? Is that going to be enough?
    */
   private final AtomicInteger lastSeenViewId = new AtomicInteger(-1);

   private volatile Address currentMaster;
   private volatile IndexingBackend currentBackend;

   @GuardedBy("this")
   private boolean initialized = false;

   @GuardedBy("this")
   private int masterLockAcquisitionAttempts = 0;

   ClusteredSwitchingBackend(Properties props, ComponentRegistry componentsRegistry, String indexName,
                             LocalBackendFactory factory, IndexLockController indexlock) {
      this.indexName = indexName;
      this.factory = factory;
      this.indexlock = indexlock;
      this.rpcManager = componentsRegistry.getComponent(RpcManager.class);
      this.cacheName = componentsRegistry.getCacheName();
      if (rpcManager == null) {
         throw new IllegalStateException("This Cache is not clustered! The switching backend should not be used for local caches");
      }
      this.localAddress = rpcManager.getAddress();
      this.currentBackend = new LazyInitializingBackend(this);
      this.async = !BackendFactory.isConfiguredAsSync(props);
   }

   @ViewChanged
   public void viewChanged(final ViewChangedEvent e) {
      final int currentViewId = lastSeenViewId.get();
      final int viewId = e.getViewId();
      if (viewId > currentViewId) {
         // Only the first thread to see the newer view applies it; stale or duplicate events are ignored.
         if (lastSeenViewId.compareAndSet(currentViewId, viewId)) {
            applyViewChangedEvent(e);
         }
      }
   }

   @Override
   public void initialize() {
      // We use lazyInitialize() to postpone operations to the last minute:
      // this avoids unnecessary elections while the initial cluster is being formed.
   }

   @Override
   public synchronized void lazyInitialize() {
      if (initialized) {
         return;
      }
      this.initialized = true;
      final List<Address> members = rpcManager.getMembers();
      assert members != null;
      assert members.size() > 0;
      assert members.get(0) != null;
      final Address initialMaster = members.get(0);
      lastSeenViewId.set(rpcManager.getTransport().getViewId());
      if (thisIsNewMaster(initialMaster)) {
         acquireControlStart();
      }
      else {
         updateRoutingToNewRemote(initialMaster);
      }
   }

   private synchronized void applyViewChangedEvent(ViewChangedEvent e) {
      List<Address> newMembers = e.getNewMembers();
      if (log.isDebugEnabled()) {
         log.debug("Notified of new View! Members: " + newMembers);
      }
      handleTopologyChange(newMembers);
   }

   private synchronized void handleTopologyChange(List<Address> newMembers) {
      assert newMembers != null;
      assert newMembers.size() > 0;
      assert newMembers.get(0) != null;
      final Address newmaster = newMembers.get(0);
      if (masterDidChange(newmaster)) {
         if (thisIsMaster()) {
            if (log.isDebugEnabled()) {
               log.debug("No longer a MASTER node, releasing the index lock.");
            }
            forfeitControl(newmaster);
         }
         else if (thisIsNewMaster(newmaster)) {
            log.debug("Electing SELF as MASTER!");
            acquireControlStart();
         }
         else {
            updateRoutingToNewRemote(newmaster);
            if (log.isDebugEnabled()) {
               log.debug("New master elected, now routing updates to node " + newmaster);
            }
         }
      }
   }

   private boolean thisIsNewMaster(Address newmaster) {
      return localAddress.equals(newmaster);
   }

   private boolean thisIsMaster() {
      return localAddress.equals(currentMaster);
   }

   private boolean masterDidChange(final Address newmaster) {
      if (newmaster == null) {
         return false;
      }
      else {
         return !newmaster.equals(currentMaster);
      }
   }

   private void updateRoutingToNewRemote(final Address newMaster) {
      final IndexingBackend newBackend = new RemoteIndexingBackend(cacheName, rpcManager, indexName, newMaster, async);
      swapNewBackendIn(newBackend, newMaster);
   }

   private void acquireControlStart() {
      final IndexingBackend backend = new LockAcquiringBackend(this);
      this.masterLockAcquisitionAttempts = 0;
      swapNewBackendIn(backend, localAddress);
   }

   private void forfeitControl(Address newMasterAddress) {
      final IndexingBackend newBackend = new RemoteIndexingBackend(cacheName, rpcManager, indexName, newMasterAddress, async);
      swapNewBackendIn(newBackend, newMasterAddress);
   }

   private void swapNewBackendIn(IndexingBackend newBackend, Address newMasterAddress) {
      final IndexingBackend oldBackend = currentBackend;
      log.debugv("Swapping from backend {0} to {1}", oldBackend, newBackend);
      this.currentBackend = newBackend;
      this.currentMaster = newMasterAddress;
      closeBackend(oldBackend, currentBackend);
   }

   @Override
   public void shutdown() {
      closeBackend(currentBackend, null);
      this.currentBackend = null;
   }

   @Override
   public IndexingBackend getCurrentIndexingBackend() {
      return currentBackend;
   }

   @Override
   public void refresh() {
      handleTopologyChange(rpcManager.getMembers());
   }

   private static void closeBackend(final IndexingBackend oldOne, final IndexingBackend replacement) {
      if (oldOne != null) {
         oldOne.flushAndClose(replacement);
      }
   }

   @Override
   public synchronized boolean attemptUpgrade(IndexingBackend expectedBackend) {
      log.trace("owning lock for attemptUpgrade(IndexingBackend)");
      if (currentBackend != expectedBackend) {
         // This needs to be checked while holding the lock
         return true;
      }
      if (masterLockAcquisitionAttempts >= MAX_LOCK_ACQUISITION_ATTEMPTS) {
         // Too many failed attempts: treat the lock as stale, force-clear it and take over locally.
         indexlock.forceLockClear();
         swapNewBackendIn(factory.createLocalIndexingBackend(), localAddress);
         return true;
      }
      else {
         masterLockAcquisitionAttempts++;
      }
      if (indexlock.waitForAvailability()) {
         swapNewBackendIn(factory.createLocalIndexingBackend(), localAddress);
         return true;
      }
      else {
         log.trace("Index lock not available: index update operations postponed.");
         return false;
      }
   }

}