package org.infinispan.distribution.rehash;

import static java.lang.String.format;
import static org.infinispan.test.TestingUtil.sleepRandom;
import static org.infinispan.test.fwk.TestCacheManagerFactory.createClusteredCacheManager;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;

import javax.transaction.HeuristicMixedException;
import javax.transaction.HeuristicRollbackException;
import javax.transaction.NotSupportedException;
import javax.transaction.RollbackException;
import javax.transaction.SystemException;
import javax.transaction.TransactionManager;

import org.infinispan.Cache;
import org.infinispan.configuration.cache.CacheMode;
import org.infinispan.configuration.cache.ConfigurationBuilder;
import org.infinispan.configuration.global.GlobalConfigurationBuilder;
import org.infinispan.context.Flag;
import org.infinispan.distribution.DistributionTestHelper;
import org.infinispan.distribution.LocalizedCacheTopology;
import org.infinispan.manager.EmbeddedCacheManager;
import org.infinispan.remoting.transport.Address;
import org.infinispan.test.MultipleCacheManagersTest;
import org.infinispan.test.TestingUtil;
import org.infinispan.transaction.LockingMode;
import org.infinispan.transaction.lookup.EmbeddedTransactionManagerLookup;
import org.infinispan.util.concurrent.IsolationLevel;
import org.infinispan.util.concurrent.TimeoutException;
import org.infinispan.util.logging.Log;
import org.infinispan.util.logging.LogFactory;
import org.testng.annotations.Test;

// As this is a SLOW stress test, leave it disabled by default. Only run it manually.
@Test(groups = "stress", testName = "distribution.rehash.ConsistencyStressTest", timeOut = 15*60*1000) public class ConsistencyStressTest extends MultipleCacheManagersTest { private static final int NUM_NODES = 10; private static final int WORKERS_PER_NODE = 2; private static final int NUM_ITERATIONS = 5000; private static final boolean IGNORE_TX_FAILURES = true; private static final Log log = LogFactory.getLog(ConsistencyStressTest.class); @Override protected void createCacheManagers() throws Throwable { ConfigurationBuilder c = new ConfigurationBuilder(); c .locking() .isolationLevel(IsolationLevel.READ_COMMITTED) .lockAcquisitionTimeout(60000) .useLockStriping(false) .clustering() .cacheMode(CacheMode.DIST_SYNC) .remoteTimeout(30000) .l1().disable() .transaction() .lockingMode(LockingMode.PESSIMISTIC) .transactionManagerLookup(new EmbeddedTransactionManagerLookup()); GlobalConfigurationBuilder gc = GlobalConfigurationBuilder.defaultClusteredBuilder(); gc.transport().distributedSyncTimeout(60000); List<EmbeddedCacheManager> cacheManagers = new LinkedList<>(); for (int i = 0; i < NUM_NODES; i++) cacheManagers.add(createClusteredCacheManager(gc, c)); registerCacheManager(cacheManagers.toArray(new EmbeddedCacheManager[NUM_NODES])); } public void testConsistency() throws Throwable { Set<Future<Void>> futures = new HashSet<>(NUM_NODES * WORKERS_PER_NODE); Set<String> keysToIgnore = new HashSet<>(); for (int i = 0; i < NUM_NODES; i++) { Cache<String, String> c = cache(i); for (int j = 0; j < WORKERS_PER_NODE; j++) { Future<Void> f = fork(new Stressor(c, i, j, keysToIgnore)); futures.add(f); sleepRandom(500); } } // stressors are now running, generating a lot of data. // wait for all stressors to finish. log.info("Waiting for stressors to finish"); for (Future<Void> f : futures) f.get(); // Now shut down a node: TestingUtil.killCacheManagers(cacheManagers.get(0)); // ... and ensure no data is lost. 
// Stressors encode data in the format nodeNumber|workerNumber|iterationNumber, and all have the value "value". Map<Address, Cache<Object, Object>> cacheMap = new HashMap<>(); for (int i = 1; i < NUM_NODES; i++) { Cache<Object, Object> c = cache(i); cacheMap.put(address(c), c); } // Let's enforce a quiet period to allow queued up transactions to complete. Thread.sleep(25000); // lets make sure any rehashing work has completed TestingUtil.blockUntilViewsReceived(60000, false, cacheMap.values()); TestingUtil.waitForNoRebalance(cacheMap.values()); LocalizedCacheTopology cacheTopology = cache(1).getAdvancedCache().getDistributionManager().getCacheTopology(); for (int i = 0; i < NUM_NODES; i++) { for (int j = 0; j < WORKERS_PER_NODE; j++) { for (int k = 0; k < NUM_ITERATIONS; k++) { String key = keyFor(i, j, k); if (keysToIgnore.contains(key)) { log.infof("Skipping test on failing key %s", key); } else { Collection<Address> owners = cacheTopology.getWriteOwners(key); for (Map.Entry<Address, Cache<Object, Object>> e : cacheMap.entrySet()) { try { if (owners.contains(e.getKey())) { DistributionTestHelper.assertIsInContainerImmortal(e.getValue(), key); } // Don't bother testing non-owners since invalidations caused by rehashing are async! 
} catch (Throwable th) { log.fatalf("Key %s (segment %s) should be on owners %s according to %s", key, cacheTopology.getSegment(key), owners, cacheTopology); throw th; } } } } } } } private static String keyFor(int nodeId, int workerId, int iterationId) { return format("__%s_%s_%s__", nodeId, workerId, iterationId); } private static class Stressor implements Callable<Void> { private final Cache<String, String> cache; private final TransactionManager tm; private final int cacheId, workerId; private final Set<String> keysToIgnore; private Stressor(Cache<String, String> cache, int cacheId, int workerId, Set<String> keysToIgnore) { this.cache = cache; tm = TestingUtil.getTransactionManager(cache); this.cacheId = cacheId; this.workerId = workerId; this.keysToIgnore = keysToIgnore; } @Override public Void call() throws TimeoutException { for (int iterationId = 0; iterationId < NUM_ITERATIONS; iterationId++) { if (iterationId % 500 == 0) log.infof(" >> Stressor %s Worker %s Iteration %s", cacheId, workerId, iterationId); boolean txError = false; Exception exception = null; String key = keyFor(cacheId, workerId, iterationId); try { tm.begin(); cache.getAdvancedCache().withFlags(Flag.SKIP_REMOTE_LOOKUP).put(key, "value"); tm.commit(); } catch (HeuristicRollbackException | RollbackException | SystemException | HeuristicMixedException | NotSupportedException | TimeoutException e) { txError = true; exception = e; } if (txError) { //first try and roll back the tx try { tm.rollback(); } catch (Exception exc) { // rollback failed? log.error(" >> Rollback failed"); } if (IGNORE_TX_FAILURES) { keysToIgnore.add(key); log.errorf(" >> Saw a %s when trying to process key %s", exception.getClass().getSimpleName(), key); } else { throw new RuntimeException(exception); } } } return null; } } }