package org.infinispan.tx;

import static org.infinispan.test.TestingUtil.waitForNoRebalance;
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertTrue;

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import org.infinispan.commands.control.LockControlCommand;
import org.infinispan.configuration.cache.CacheMode;
import org.infinispan.configuration.cache.ConfigurationBuilder;
import org.infinispan.configuration.global.GlobalConfigurationBuilder;
import org.infinispan.context.impl.TxInvocationContext;
import org.infinispan.distribution.MagicKey;
import org.infinispan.interceptors.BaseCustomAsyncInterceptor;
import org.infinispan.remoting.transport.jgroups.JGroupsTransport;
import org.infinispan.test.MultipleCacheManagersTest;
import org.infinispan.transaction.LockingMode;
import org.infinispan.transaction.lookup.EmbeddedTransactionManagerLookup;
import org.infinispan.util.concurrent.IsolationLevel;
import org.jgroups.View;
import org.testng.annotations.Test;

/**
 * This test reproduces the following scenario in Infinispan:
 * <pre>
 * NODE-A                        NODE-B                  NODE-C
 *
 * 1:start-tx
 * 1:replace key X
 * 1:lock X (A is owner)
 * 1:put key Z
 *                               1:lock Z (B is owner)
 *                                                       kill node C
 * 1:get response from lock Z
 * 1:release ALL locks (!)
 * new view is received
 * 1:retry lock Z
 *                               1:lock Z (B is owner)
 *                               2:start-tx
 *                               2:replace key X
 * 2:lock X (A is owner)
 *                               2:commit-tx
 * 1:commit-tx
 * </pre>
 *
 * The problematic part is marked with an exclamation mark: PessimisticLockingInterceptor releases ALL locks and
 * retries only the last command, which puts the current transaction in an invalid state. The client believes the
 * first operation still protects the second operation with a lock, but this is not the case.
 *
 * @since 9.0
 */
@Test(groups = "functional", testName = "tx.InfinispanNodeFailureTest")
public class InfinispanNodeFailureTest extends MultipleCacheManagersTest {

   private static final Integer INITIAL_VALUE = 0;
   private static final Integer REPLACING_VALUE = 1;
   private static final String TEST_CACHE = "test_cache";

   private CountDownLatch viewLatch;

   public void killedNodeDoesNotBreakReplaceCommand() throws Exception {
      defineConfigurationOnAllManagers(TEST_CACHE,
            new ConfigurationBuilder().read(manager(0).getDefaultCacheConfiguration()));
      waitForClusterToForm(TEST_CACHE);
      waitForNoRebalance(caches(TEST_CACHE));

      final Object replaceKey = new MagicKey("X", cache(0, TEST_CACHE));
      final Object putKey = new MagicKey("Z", cache(1, TEST_CACHE));
      cache(0, TEST_CACHE).put(replaceKey, INITIAL_VALUE);

      // prepare the second node to notify us when the lock command for putKey is in progress,
      // so we can kill the third node at that point
      final CountDownLatch beforeKill = new CountDownLatch(1);
      final CountDownLatch afterKill = new CountDownLatch(1);

      advancedCache(1, TEST_CACHE).getAsyncInterceptorChain().addInterceptor(new BaseCustomAsyncInterceptor() {
         @Override
         public Object visitLockControlCommand(TxInvocationContext ctx, LockControlCommand command) throws Throwable {
            return invokeNextAndFinally(ctx, command, (rCtx, rCommand, rv, t) -> {
               LockControlCommand cmd = (LockControlCommand) rCommand;
               if (putKey.equals(cmd.getSingleKey())) {
                  // notify the main thread that it can start killing the third node
                  beforeKill.countDown();
                  // wait for completion and proceed
                  afterKill.await(10, TimeUnit.SECONDS);
               }
            });
         }
      }, 1);

      // execute the replace command in a separate thread so we can do something else meanwhile
      Future<Boolean> firstResult = fork(() -> {
         try {
            tm(0, TEST_CACHE).begin();
            // this should replace and lock replaceKey so other transactions can't pass this barrier
            boolean result = cache(0, TEST_CACHE).replace(replaceKey, INITIAL_VALUE, REPLACING_VALUE);
            // issue the put command so it is retried while node-c is being killed
            cache(0, TEST_CACHE).put(putKey, "some-value");
            // apply the new view
            viewLatch.countDown();
            tm(0, TEST_CACHE).commit();
            return result;
         } catch (Throwable t) {
            return null;
         }
      });

      // wait for the second node to reach the lock command, then kill the third node
      assertTrue(beforeKill.await(10, TimeUnit.SECONDS));

      // kill node-c without waiting for rehash; it is important to continue with the put retry
      // before the new view is received
      killMember(2, TEST_CACHE, false);
      afterKill.countDown();

      tm(1, TEST_CACHE).begin();
      // this replace should never succeed because the first node has already replaced and locked the value,
      // but during the put command the replace lock is lost, so the same value can be replaced again, which is the bug
      boolean secondResult = cache(1, TEST_CACHE).replace(replaceKey, INITIAL_VALUE, REPLACING_VALUE);
      tm(1, TEST_CACHE).commit();

      // check that the first node did not fail
      assertEquals(Boolean.TRUE, firstResult.get());
      assertEquals(REPLACING_VALUE, cache(0, TEST_CACHE).get(replaceKey));
      assertEquals(REPLACING_VALUE, cache(1, TEST_CACHE).get(replaceKey));

      // check that the second node's state stays consistent: the second result must be FALSE
      // in a read-committed pessimistic cache (this assertion fails while the bug is present)
      assertEquals(false, secondResult);
   }

   @Override
   protected void createCacheManagers() throws Throwable {
      ConfigurationBuilder configuration = getDefaultClusteredCacheConfig(CacheMode.REPL_SYNC, true);
      configuration.locking()
            .useLockStriping(false)
            .isolationLevel(IsolationLevel.READ_COMMITTED)
            .lockAcquisitionTimeout(20000);
      configuration.transaction()
            .transactionManagerLookup(new EmbeddedTransactionManagerLookup())
            .lockingMode(LockingMode.PESSIMISTIC)
            .useSynchronization(false)
            .recovery()
            .disable();
      configuration.clustering()
            .hash()
            .numSegments(60)
            .stateTransfer()
            .fetchInMemoryState(false);

      viewLatch = new CountDownLatch(1);
      GlobalConfigurationBuilder global = new GlobalConfigurationBuilder();
      global.transport().transport(new DelayedViewJGroupsTransport(viewLatch));

      addClusterEnabledCacheManager(global, configuration);
      addClusterEnabledCacheManager(configuration);
      addClusterEnabledCacheManager(configuration);
   }

   private static final class DelayedViewJGroupsTransport extends JGroupsTransport {

      private final CountDownLatch waitLatch;

      DelayedViewJGroupsTransport(CountDownLatch waitLatch) {
         this.waitLatch = waitLatch;
      }

      @Override
      public void viewAccepted(View newView) {
         // check whether this is a node-leaving event, and if so wait for a signal to apply the new view
         if (waitLatch != null && getMembers().size() > newView.getMembers().size()) {
            try {
               waitLatch.await(10, TimeUnit.SECONDS);
            } catch (InterruptedException e) {
               Thread.currentThread().interrupt();
            }
         }
         super.viewAccepted(newView);
      }
   }
}