/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.test;

import com.carrotsearch.randomizedtesting.RandomizedTest;
import com.carrotsearch.randomizedtesting.SeedUtils;
import com.carrotsearch.randomizedtesting.SysGlobals;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
import org.elasticsearch.action.admin.cluster.node.tasks.list.ListTasksResponse;
import org.elasticsearch.action.admin.indices.stats.CommonStatsFlags;
import org.elasticsearch.action.admin.indices.stats.CommonStatsFlags.Flag;
import org.elasticsearch.action.support.replication.ReplicationTask;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNode.Role;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.OperationRouting;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings;
import org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.breaker.CircuitBreaker;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.network.NetworkModule;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.Settings.Builder;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.PageCacheRecycler;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.discovery.DiscoveryModule;
import org.elasticsearch.discovery.zen.ElectMasterService;
import org.elasticsearch.discovery.zen.ZenDiscovery;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.env.ShardLockObtainFailedException;
import org.elasticsearch.http.HttpServerTransport;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.engine.CommitStats;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.indices.breaker.HierarchyCircuitBreakerService;
import org.elasticsearch.indices.fielddata.cache.IndicesFieldDataCache;
import org.elasticsearch.indices.recovery.RecoverySettings;
import org.elasticsearch.node.MockNode;
import org.elasticsearch.node.Node;
import org.elasticsearch.node.NodeService;
import org.elasticsearch.node.NodeValidationException;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.search.SearchService;
import org.elasticsearch.tasks.TaskInfo;
import org.elasticsearch.tasks.TaskManager;
import org.elasticsearch.test.disruption.ServiceDisruptionScheme;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.MockTransportClient;
import org.elasticsearch.transport.TcpTransport;
import org.elasticsearch.transport.Transport;
import org.elasticsearch.transport.TransportService;
import org.elasticsearch.transport.TransportSettings;
import org.junit.Assert;

import java.io.Closeable;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.lucene.util.LuceneTestCase.TEST_NIGHTLY;
import static org.apache.lucene.util.LuceneTestCase.rarely;
import static org.elasticsearch.discovery.DiscoverySettings.INITIAL_STATE_TIMEOUT_SETTING;
import static org.elasticsearch.discovery.zen.ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING;
import static org.elasticsearch.test.ESTestCase.assertBusy;
import static org.elasticsearch.test.ESTestCase.awaitBusy;
import static org.elasticsearch.test.ESTestCase.randomFrom;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail;

/**
 * InternalTestCluster manages a set of JVM private nodes and allows convenient access to them.
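 * <p>
 * A minimal lifecycle sketch (illustrative only; construction arguments are elided and
 * {@link ESIntegTestCase} normally drives these calls):
 * <pre>{@code
 * InternalTestCluster cluster = ...;
 * cluster.beforeTest(random, 0.0); // the second argument is the transport client ratio
 * try {
 *     Client client = cluster.client();
 *     // run requests and assertions against the shared cluster
 * } finally {
 *     cluster.afterTest();
 * }
 * }</pre>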
 * The cluster supports randomized configuration such that nodes started in the cluster will
 * automatically load asserting services tracking resources like file handles or open searchers.
 * <p>
 * The Cluster is bound to a test lifecycle where tests must call {@link #beforeTest(java.util.Random, double)} and
 * {@link #afterTest()} to initialize and reset the cluster in order to be more reproducible. The term "more" relates
 * to the async nature of Elasticsearch in combination with randomized testing. Once Threads and asynchronous calls
 * are involved reproducibility is very limited. This class should only be used through {@link ESIntegTestCase}.
 * </p>
 */
public final class InternalTestCluster extends TestCluster {

    private final Logger logger = Loggers.getLogger(getClass());

    /** The number of ports in the range used for this JVM */
    public static final int PORTS_PER_JVM = 100;

    /** The number of ports in the range used for this cluster */
    public static final int PORTS_PER_CLUSTER = 20;

    private static final int GLOBAL_TRANSPORT_BASE_PORT = 9300;
    private static final int GLOBAL_HTTP_BASE_PORT = 19200;

    private static final int JVM_ORDINAL = Integer.parseInt(System.getProperty(SysGlobals.CHILDVM_SYSPROP_JVM_ID, "0"));

    /** a per-JVM unique offset to be used for calculating unique port ranges. */
    public static final int JVM_BASE_PORT_OFFSET = PORTS_PER_JVM * (JVM_ORDINAL + 1);

    private static final AtomicInteger clusterOrdinal = new AtomicInteger();

    private final int CLUSTER_BASE_PORT_OFFSET = JVM_BASE_PORT_OFFSET + (clusterOrdinal.getAndIncrement() * PORTS_PER_CLUSTER) % PORTS_PER_JVM;

    public final int TRANSPORT_BASE_PORT = GLOBAL_TRANSPORT_BASE_PORT + CLUSTER_BASE_PORT_OFFSET;
    public final int HTTP_BASE_PORT = GLOBAL_HTTP_BASE_PORT + CLUSTER_BASE_PORT_OFFSET;

    public static final int DEFAULT_LOW_NUM_MASTER_NODES = 1;
    public static final int DEFAULT_HIGH_NUM_MASTER_NODES = 3;

    static final int DEFAULT_MIN_NUM_DATA_NODES = 1;
    static final int DEFAULT_MAX_NUM_DATA_NODES = TEST_NIGHTLY ? 6 : 3;

    static final int DEFAULT_NUM_CLIENT_NODES = -1;
    static final int DEFAULT_MIN_NUM_CLIENT_NODES = 0;
    static final int DEFAULT_MAX_NUM_CLIENT_NODES = 1;

    static final boolean DEFAULT_ENABLE_HTTP_PIPELINING = true;

    /* sorted map to make traverse order reproducible, concurrent since we do checks on it not within a sync block */
    private final NavigableMap<String, NodeAndClient> nodes = new TreeMap<>();

    private final Set<Path> dataDirToClean = new HashSet<>();

    private final String clusterName;

    private final AtomicBoolean open = new AtomicBoolean(true);

    private final Settings defaultSettings;

    private AtomicInteger nextNodeId = new AtomicInteger(0);

    /* Each shared node has a node seed that is used to start up the node and get default settings
     * this is important if a node is randomly shut down in a test since the next test relies on a
     * fully shared cluster to be more reproducible */
    private final long[] sharedNodesSeeds;

    // if set to 0, data nodes will also assume the master role
    private final int numSharedDedicatedMasterNodes;

    private final int numSharedDataNodes;

    private final int numSharedCoordOnlyNodes;

    private final NodeConfigurationSource nodeConfigurationSource;

    private final ExecutorService executor;

    private final boolean autoManageMinMasterNodes;

    private final Collection<Class<? extends Plugin>> mockPlugins;
    /** All nodes started by the cluster will have their name set to nodePrefix followed by a positive number */
    private final String nodePrefix;

    private final Path baseDir;

    private ServiceDisruptionScheme activeDisruptionScheme;
    private Function<Client, Client> clientWrapper;

    public InternalTestCluster(long clusterSeed, Path baseDir,
                               boolean randomlyAddDedicatedMasters, boolean autoManageMinMasterNodes,
                               int minNumDataNodes, int maxNumDataNodes, String clusterName,
                               NodeConfigurationSource nodeConfigurationSource, int numClientNodes,
                               boolean enableHttpPipelining, String nodePrefix,
                               Collection<Class<? extends Plugin>> mockPlugins, Function<Client, Client> clientWrapper) {
        super(clusterSeed);
        this.autoManageMinMasterNodes = autoManageMinMasterNodes;
        this.clientWrapper = clientWrapper;
        this.baseDir = baseDir;
        this.clusterName = clusterName;
        if (minNumDataNodes < 0 || maxNumDataNodes < 0) {
            throw new IllegalArgumentException("minimum and maximum number of data nodes must be >= 0");
        }
        if (maxNumDataNodes < minNumDataNodes) {
            throw new IllegalArgumentException("maximum number of data nodes must be >= minimum number of data nodes");
        }
        Random random = new Random(clusterSeed);
        boolean useDedicatedMasterNodes = randomlyAddDedicatedMasters ? random.nextBoolean() : false;

        this.numSharedDataNodes = RandomNumbers.randomIntBetween(random, minNumDataNodes, maxNumDataNodes);
        assert this.numSharedDataNodes >= 0;

        if (numSharedDataNodes == 0) {
            this.numSharedCoordOnlyNodes = 0;
            this.numSharedDedicatedMasterNodes = 0;
        } else {
            if (useDedicatedMasterNodes) {
                if (random.nextBoolean()) {
                    // use dedicated masters, but only a low number, to reduce overhead in tests
                    this.numSharedDedicatedMasterNodes = DEFAULT_LOW_NUM_MASTER_NODES;
                } else {
                    this.numSharedDedicatedMasterNodes = DEFAULT_HIGH_NUM_MASTER_NODES;
                }
            } else {
                this.numSharedDedicatedMasterNodes = 0;
            }
            if (numClientNodes < 0) {
                this.numSharedCoordOnlyNodes = RandomNumbers.randomIntBetween(random,
                    DEFAULT_MIN_NUM_CLIENT_NODES, DEFAULT_MAX_NUM_CLIENT_NODES);
            } else {
                this.numSharedCoordOnlyNodes = numClientNodes;
            }
        }
        assert this.numSharedCoordOnlyNodes >= 0;

        this.nodePrefix = nodePrefix;
        assert nodePrefix != null;

        this.mockPlugins = mockPlugins;

        sharedNodesSeeds = new long[numSharedDedicatedMasterNodes + numSharedDataNodes + numSharedCoordOnlyNodes];
        for (int i = 0; i < sharedNodesSeeds.length; i++) {
            sharedNodesSeeds[i] = random.nextLong();
        }

        logger.info("Setup InternalTestCluster [{}] with seed [{}] using [{}] dedicated masters, " +
                "[{}] (data) nodes and [{}] coord only nodes (min_master_nodes are [{}])",
            clusterName, SeedUtils.formatSeed(clusterSeed),
            numSharedDedicatedMasterNodes, numSharedDataNodes, numSharedCoordOnlyNodes,
            autoManageMinMasterNodes ? "auto-managed" : "manual");
        this.nodeConfigurationSource = nodeConfigurationSource;
        Builder builder = Settings.builder();
        if (random.nextInt(5) == 0) { // sometimes set this
            // randomize (multi/single) data path, special case for 0, don't set it at all...
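            // (illustrative) with numOfDataPaths == 2 the loop below builds a value like
            //   path.data: <baseDir>/d0,<baseDir>/d1,  (including the trailing comma the loop appends)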
            final int numOfDataPaths = random.nextInt(5);
            if (numOfDataPaths > 0) {
                StringBuilder dataPath = new StringBuilder();
                for (int i = 0; i < numOfDataPaths; i++) {
                    dataPath.append(baseDir.resolve("d" + i).toAbsolutePath()).append(',');
                }
                builder.put(Environment.PATH_DATA_SETTING.getKey(), dataPath.toString());
            }
        }
        builder.put(NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.getKey(), Integer.MAX_VALUE);
        builder.put(Environment.PATH_SHARED_DATA_SETTING.getKey(), baseDir.resolve("custom"));
        builder.put(Environment.PATH_HOME_SETTING.getKey(), baseDir);
        builder.put(Environment.PATH_REPO_SETTING.getKey(), baseDir.resolve("repos"));
        builder.put(TransportSettings.PORT.getKey(), TRANSPORT_BASE_PORT + "-" + (TRANSPORT_BASE_PORT + PORTS_PER_CLUSTER));
        builder.put("http.port", HTTP_BASE_PORT + "-" + (HTTP_BASE_PORT + PORTS_PER_CLUSTER));
        builder.put("http.pipelining", enableHttpPipelining);
        if (Strings.hasLength(System.getProperty("tests.es.logger.level"))) {
            builder.put("logger.level", System.getProperty("tests.es.logger.level"));
        }
        if (Strings.hasLength(System.getProperty("es.logger.prefix"))) {
            builder.put("logger.prefix", System.getProperty("es.logger.prefix"));
        }
        // Default the watermarks to absurdly low values to prevent the tests
        // from failing on nodes without enough disk space
        builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "1b");
        builder.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "1b");
        // Some tests make use of scripting quite a bit, so increase the limit for integration tests
        builder.put(ScriptService.SCRIPT_MAX_COMPILATIONS_PER_MINUTE.getKey(), 1000);
        if (TEST_NIGHTLY) {
            builder.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(),
                RandomNumbers.randomIntBetween(random, 5, 10));
            builder.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(),
                RandomNumbers.randomIntBetween(random, 5, 10));
        } else if (random.nextInt(100) <= 90) {
            builder.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING.getKey(),
                RandomNumbers.randomIntBetween(random, 2, 5));
            builder.put(ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(),
                RandomNumbers.randomIntBetween(random, 2, 5));
        }
        // always reduce this - it can make tests really slow
        builder.put(RecoverySettings.INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING.getKey(),
            TimeValue.timeValueMillis(RandomNumbers.randomIntBetween(random, 20, 50)));
        defaultSettings = builder.build();
        executor = EsExecutors.newScaling("test runner", 0, Integer.MAX_VALUE, 0, TimeUnit.SECONDS,
            EsExecutors.daemonThreadFactory("test_" + clusterName), new ThreadContext(Settings.EMPTY));
    }

    @Override
    public String getClusterName() {
        return clusterName;
    }

    /** returns true if the {@link ElectMasterService#DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING} setting is auto managed by this cluster */
    public boolean getAutoManageMinMasterNode() {
        return autoManageMinMasterNodes;
    }

    public String[] getNodeNames() {
        return nodes.keySet().toArray(Strings.EMPTY_ARRAY);
    }

    private Settings getSettings(int nodeOrdinal, long nodeSeed, Settings others) {
        Builder builder = Settings.builder().put(defaultSettings)
            .put(getRandomNodeSettings(nodeSeed));
        Settings settings = nodeConfigurationSource.nodeSettings(nodeOrdinal);
        if (settings != null) {
            if (settings.get(ClusterName.CLUSTER_NAME_SETTING.getKey()) != null) {
throw new IllegalStateException("Tests must not set a '" + ClusterName.CLUSTER_NAME_SETTING.getKey() + "' as a node setting set '" + ClusterName.CLUSTER_NAME_SETTING.getKey() + "': [" + settings.get(ClusterName.CLUSTER_NAME_SETTING.getKey()) + "]"); } builder.put(settings); } if (others != null) { builder.put(others); } builder.put(ClusterName.CLUSTER_NAME_SETTING.getKey(), clusterName); return builder.build(); } private Collection<Class<? extends Plugin>> getPlugins() { Set<Class<? extends Plugin>> plugins = new HashSet<>(nodeConfigurationSource.nodePlugins()); plugins.addAll(mockPlugins); return plugins; } private Settings getRandomNodeSettings(long seed) { Random random = new Random(seed); Builder builder = Settings.builder(); builder.put(Transport.TRANSPORT_TCP_COMPRESS.getKey(), rarely(random)); if (random.nextBoolean()) { builder.put("cache.recycler.page.type", RandomPicks.randomFrom(random, PageCacheRecycler.Type.values())); } if (random.nextInt(10) == 0) { // 10% of the nodes have a very frequent check interval builder.put(SearchService.KEEPALIVE_INTERVAL_SETTING.getKey(), TimeValue.timeValueMillis(10 + random.nextInt(2000)).getStringRep()); } else if (random.nextInt(10) != 0) { // 90% of the time - 10% of the time we don't set anything builder.put(SearchService.KEEPALIVE_INTERVAL_SETTING.getKey(), TimeValue.timeValueSeconds(10 + random.nextInt(5 * 60)).getStringRep()); } if (random.nextBoolean()) { // sometimes set a builder.put(SearchService.DEFAULT_KEEPALIVE_SETTING.getKey(), TimeValue.timeValueSeconds(100 + random.nextInt(5 * 60)).getStringRep()); } builder.put(EsExecutors.PROCESSORS_SETTING.getKey(), 1 + random.nextInt(3)); if (random.nextBoolean()) { if (random.nextBoolean()) { builder.put("indices.fielddata.cache.size", 1 + random.nextInt(1000), ByteSizeUnit.MB); } } // randomize tcp settings if (random.nextBoolean()) { builder.put(TcpTransport.CONNECTIONS_PER_NODE_RECOVERY.getKey(), random.nextInt(2) + 1); builder.put(TcpTransport.CONNECTIONS_PER_NODE_BULK.getKey(), random.nextInt(3) + 1); builder.put(TcpTransport.CONNECTIONS_PER_NODE_REG.getKey(), random.nextInt(6) + 1); } if (random.nextBoolean()) { builder.put(MappingUpdatedAction.INDICES_MAPPING_DYNAMIC_TIMEOUT_SETTING.getKey(), new TimeValue(RandomNumbers.randomIntBetween(random, 10, 30), TimeUnit.SECONDS)); } if (random.nextInt(10) == 0) { builder.put(HierarchyCircuitBreakerService.REQUEST_CIRCUIT_BREAKER_TYPE_SETTING.getKey(), "noop"); builder.put(HierarchyCircuitBreakerService.FIELDDATA_CIRCUIT_BREAKER_TYPE_SETTING.getKey(), "noop"); } if (random.nextBoolean()) { if (random.nextInt(10) == 0) { // do something crazy slow here builder.put(RecoverySettings.INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING.getKey(), new ByteSizeValue(RandomNumbers.randomIntBetween(random, 1, 10), ByteSizeUnit.MB)); } else { builder.put(RecoverySettings.INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING.getKey(), new ByteSizeValue(RandomNumbers.randomIntBetween(random, 10, 200), ByteSizeUnit.MB)); } } if (random.nextBoolean()) { builder.put(TcpTransport.PING_SCHEDULE.getKey(), RandomNumbers.randomIntBetween(random, 100, 2000) + "ms"); } if (random.nextBoolean()) { builder.put(ScriptService.SCRIPT_CACHE_SIZE_SETTING.getKey(), RandomNumbers.randomIntBetween(random, 0, 2000)); } if (random.nextBoolean()) { builder.put(ScriptService.SCRIPT_CACHE_EXPIRE_SETTING.getKey(), TimeValue.timeValueMillis(RandomNumbers.randomIntBetween(random, 750, 10000000)).getStringRep()); } return builder.build(); } public static String clusterName(String prefix, long 
        StringBuilder builder = new StringBuilder(prefix);
        final int childVM = RandomizedTest.systemPropertyAsInt(SysGlobals.CHILDVM_SYSPROP_JVM_ID, 0);
        builder.append("-CHILD_VM=[").append(childVM).append(']');
        builder.append("-CLUSTER_SEED=[").append(clusterSeed).append(']');
        // if multiple maven tasks run on a single host we better have an identifier that doesn't rely on input params
        builder.append("-HASH=[").append(SeedUtils.formatSeed(System.nanoTime())).append(']');
        return builder.toString();
    }

    private void ensureOpen() {
        if (!open.get()) {
            throw new RuntimeException("Cluster is already closed");
        }
    }

    private synchronized NodeAndClient getOrBuildRandomNode() {
        ensureOpen();
        NodeAndClient randomNodeAndClient = getRandomNodeAndClient();
        if (randomNodeAndClient != null) {
            return randomNodeAndClient;
        }
        NodeAndClient buildNode = buildNode(1);
        buildNode.startNode();
        publishNode(buildNode);
        return buildNode;
    }

    private synchronized NodeAndClient getRandomNodeAndClient() {
        return getRandomNodeAndClient(nc -> true);
    }

    private synchronized NodeAndClient getRandomNodeAndClient(Predicate<NodeAndClient> predicate) {
        ensureOpen();
        Collection<NodeAndClient> values = nodes.values().stream().filter(predicate).collect(Collectors.toCollection(ArrayList::new));
        if (!values.isEmpty()) {
            int whichOne = random.nextInt(values.size());
            for (NodeAndClient nodeAndClient : values) {
                if (whichOne-- == 0) {
                    return nodeAndClient;
                }
            }
        }
        return null;
    }

    /**
     * Ensures that at least <code>n</code> data nodes are present in the cluster.
     * If more nodes than <code>n</code> are present, this method will not
     * stop any of the running nodes.
     */
    public synchronized void ensureAtLeastNumDataNodes(int n) {
        boolean added = false;
        int size = numDataNodes();
        for (int i = size; i < n; i++) {
            logger.info("increasing cluster size from {} to {}", size, n);
            added = true;
            if (numSharedDedicatedMasterNodes > 0) {
                startDataOnlyNode(Settings.EMPTY);
            } else {
                startNode(Settings.EMPTY);
            }
        }
        if (added) {
            validateClusterFormed();
        }
    }

    /**
     * Ensures that at most <code>n</code> data nodes are up and running.
     * If fewer than <code>n</code> nodes are running, this method
     * will not start any additional nodes.
     */
    public synchronized void ensureAtMostNumDataNodes(int n) throws IOException {
        int size = numDataNodes();
        if (size <= n) {
            return;
        }
        // prevent killing the master if possible and client nodes
        final Stream<NodeAndClient> collection = n == 0 ? nodes.values().stream()
            : nodes.values().stream().filter(new DataNodePredicate().and(new MasterNodePredicate(getMasterName()).negate()));
        final Iterator<NodeAndClient> values = collection.iterator();

        logger.info("changing cluster size from {} data nodes to {}", size, n);
        Set<NodeAndClient> nodesToRemove = new HashSet<>();
        int numNodesAndClients = 0;
        while (values.hasNext() && numNodesAndClients++ < size - n) {
            NodeAndClient next = values.next();
            nodesToRemove.add(next);
        }

        stopNodesAndClients(nodesToRemove);
        if (!nodesToRemove.isEmpty() && size() > 0) {
            validateClusterFormed();
        }
    }

    /**
     * builds a new node given the settings.
     *
     * @param settings              the settings to use
     * @param defaultMinMasterNodes min_master_nodes value to use if min_master_nodes is auto managed
     */
    private NodeAndClient buildNode(Settings settings, int defaultMinMasterNodes) {
        int ord = nextNodeId.getAndIncrement();
        return buildNode(ord, random.nextLong(), settings, false, defaultMinMasterNodes);
    }

    /**
     * builds a new node with default settings
     *
     * @param defaultMinMasterNodes min_master_nodes value to use if min_master_nodes is auto managed
     */
    private NodeAndClient buildNode(int defaultMinMasterNodes) {
        int ord = nextNodeId.getAndIncrement();
        return buildNode(ord, random.nextLong(), null, false, defaultMinMasterNodes);
    }

    /**
     * builds a new node
     *
     * @param nodeId                the node internal id (see {@link NodeAndClient#nodeAndClientId()})
     * @param seed                  the node's random seed
     * @param settings              the settings to use
     * @param reuseExisting         if a node with the same name is already part of {@link #nodes}, no new node will be built and
     *                              the method will return the existing one
     * @param defaultMinMasterNodes min_master_nodes value to use if min_master_nodes is auto managed
     */
    private NodeAndClient buildNode(int nodeId, long seed, Settings settings,
                                    boolean reuseExisting, int defaultMinMasterNodes) {
        assert Thread.holdsLock(this);
        ensureOpen();
        settings = getSettings(nodeId, seed, settings);
        Collection<Class<? extends Plugin>> plugins = getPlugins();
        String name = buildNodeName(nodeId, settings);
        if (reuseExisting && nodes.containsKey(name)) {
            return nodes.get(name);
        } else {
            assert reuseExisting == true || nodes.containsKey(name) == false :
                "node name [" + name + "] already exists but not allowed to use it";
        }
        Settings.Builder finalSettings = Settings.builder()
            .put(Environment.PATH_HOME_SETTING.getKey(), baseDir) // allow overriding path.home
            .put(settings)
            .put("node.name", name)
            .put(NodeEnvironment.NODE_ID_SEED_SETTING.getKey(), seed);

        final boolean usingSingleNodeDiscovery = DiscoveryModule.DISCOVERY_TYPE_SETTING.get(finalSettings.build()).equals("single-node");
        if (!usingSingleNodeDiscovery && autoManageMinMasterNodes) {
            assert finalSettings.get(DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey()) == null :
                "min master nodes may not be set when auto managed";
            assert finalSettings.get(INITIAL_STATE_TIMEOUT_SETTING.getKey()) == null :
                "automatically managing min master nodes require nodes to complete a join cycle when starting";
            finalSettings
                // don't wait too long not to slow down tests
                .put(ZenDiscovery.MASTER_ELECTION_WAIT_FOR_JOINS_TIMEOUT_SETTING.getKey(), "5s")
                .put(DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), defaultMinMasterNodes);
        } else if (!usingSingleNodeDiscovery && finalSettings.get(DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey()) == null) {
            throw new IllegalArgumentException(DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey() + " must be configured");
        }
        MockNode node = new MockNode(finalSettings.build(), plugins);
        return new NodeAndClient(name, node, nodeId);
    }

    private String buildNodeName(int id, Settings settings) {
        String prefix = nodePrefix;
        prefix = prefix + getRoleSuffix(settings);
        return prefix + id;
    }

    /**
     * returns a suffix string based on the node role. If no explicit role is defined, the suffix will be empty.
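     * For example (illustrative, with a hypothetical node prefix of "node_"), {@link #buildNodeName(int, Settings)}
     * would then produce names such as "node_m0" (dedicated master), "node_d1" (data only) or "node_c2"
     * (coordinating only) when the roles are set explicitly in the node's settings.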
     */
    private String getRoleSuffix(Settings settings) {
        String suffix = "";
        if (Node.NODE_MASTER_SETTING.exists(settings) && Node.NODE_MASTER_SETTING.get(settings)) {
            suffix = suffix + Role.MASTER.getAbbreviation();
        }
        if (Node.NODE_DATA_SETTING.exists(settings) && Node.NODE_DATA_SETTING.get(settings)) {
            suffix = suffix + Role.DATA.getAbbreviation();
        }
        if (Node.NODE_MASTER_SETTING.exists(settings) && Node.NODE_MASTER_SETTING.get(settings) == false &&
            Node.NODE_DATA_SETTING.exists(settings) && Node.NODE_DATA_SETTING.get(settings) == false) {
            suffix = suffix + "c";
        }
        return suffix;
    }

    /**
     * Returns the common node name prefix for this test cluster.
     */
    public String nodePrefix() {
        return nodePrefix;
    }

    @Override
    public synchronized Client client() {
        ensureOpen();
        /* Randomly return a client to one of the nodes in the cluster */
        return getOrBuildRandomNode().client(random);
    }

    /**
     * Returns a node client to a data node in the cluster.
     * Note: use this with care; tests should not rely on a certain node's client.
     */
    public synchronized Client dataNodeClient() {
        ensureOpen();
        /* Randomly return a client to one of the nodes in the cluster */
        return getRandomNodeAndClient(new DataNodePredicate()).client(random);
    }

    /**
     * Returns a node client to the current master node.
     * Note: use this with care; tests should not rely on a certain node's client.
     */
    public synchronized Client masterClient() {
        ensureOpen();
        NodeAndClient randomNodeAndClient = getRandomNodeAndClient(new MasterNodePredicate(getMasterName()));
        if (randomNodeAndClient != null) {
            return randomNodeAndClient.nodeClient(); // ensure node client master is requested
        }
        Assert.fail("No master client found");
        return null; // can't happen
    }

    /**
     * Returns a node client to a random node but not the master. This method will fail if no non-master client is available.
     */
    public synchronized Client nonMasterClient() {
        ensureOpen();
        NodeAndClient randomNodeAndClient = getRandomNodeAndClient(new MasterNodePredicate(getMasterName()).negate());
        if (randomNodeAndClient != null) {
            return randomNodeAndClient.nodeClient(); // ensure node client non-master is requested
        }
        Assert.fail("No non-master client found");
        return null; // can't happen
    }

    /**
     * Returns a client to a coordinating only node
     */
    public synchronized Client coordOnlyNodeClient() {
        ensureOpen();
        NodeAndClient randomNodeAndClient = getRandomNodeAndClient(new NoDataNoMasterNodePredicate());
        if (randomNodeAndClient != null) {
            return randomNodeAndClient.client(random);
        }
        int nodeId = nextNodeId.getAndIncrement();
        Settings settings = getSettings(nodeId, random.nextLong(), Settings.EMPTY);
        startCoordinatingOnlyNode(settings);
        return getRandomNodeAndClient(new NoDataNoMasterNodePredicate()).client(random);
    }

    public synchronized String startCoordinatingOnlyNode(Settings settings) {
        ensureOpen(); // currently unused
        Builder builder = Settings.builder().put(settings).put(Node.NODE_MASTER_SETTING.getKey(), false)
            .put(Node.NODE_DATA_SETTING.getKey(), false).put(Node.NODE_INGEST_SETTING.getKey(), false);
        return startNode(builder);
    }

    /**
     * Returns a transport client
     */
    public synchronized Client transportClient() {
        ensureOpen();
        // randomly return a transport client going to one of the nodes in the cluster
        return getOrBuildRandomNode().transportClient();
    }

    /**
     * Returns a node client to a given node.
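     * An illustrative use (the {@code cluster} variable and the node name are hypothetical):
     * <pre>{@code
     * Client client = cluster.client("node_m0"); // fails the test if no such node exists
     * }</pre>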
     */
    public synchronized Client client(String nodeName) {
        ensureOpen();
        NodeAndClient nodeAndClient = nodes.get(nodeName);
        if (nodeAndClient != null) {
            return nodeAndClient.client(random);
        }
        Assert.fail("No node found with name: [" + nodeName + "]");
        return null; // can't happen
    }

    /**
     * Returns a "smart" node client to a random node in the cluster
     */
    public synchronized Client smartClient() {
        NodeAndClient randomNodeAndClient = getRandomNodeAndClient();
        if (randomNodeAndClient != null) {
            return randomNodeAndClient.nodeClient();
        }
        Assert.fail("No smart client found");
        return null; // can't happen
    }

    /**
     * Returns a client to a random node that matches the given predicate.
     * The predicate can filter nodes based on the node's settings.
     * If all nodes are filtered out, this method will return <code>null</code>.
     */
    public synchronized Client client(final Predicate<Settings> filterPredicate) {
        ensureOpen();
        final NodeAndClient randomNodeAndClient =
            getRandomNodeAndClient(nodeAndClient -> filterPredicate.test(nodeAndClient.node.settings()));
        if (randomNodeAndClient != null) {
            return randomNodeAndClient.client(random);
        }
        return null;
    }

    @Override
    public synchronized void close() {
        if (this.open.compareAndSet(true, false)) {
            if (activeDisruptionScheme != null) {
                activeDisruptionScheme.testClusterClosed();
                activeDisruptionScheme = null;
            }
            IOUtils.closeWhileHandlingException(nodes.values());
            nodes.clear();
            executor.shutdownNow();
        }
    }

    private final class NodeAndClient implements Closeable {
        private MockNode node;
        private Client nodeClient;
        private Client transportClient;
        private final AtomicBoolean closed = new AtomicBoolean(false);
        private final String name;
        private final int nodeAndClientId;

        NodeAndClient(String name, MockNode node, int nodeAndClientId) {
            this.node = node;
            this.name = name;
            this.nodeAndClientId = nodeAndClientId;
            markNodeDataDirsAsNotEligableForWipe(node);
        }

        Node node() {
            if (closed.get()) {
                throw new RuntimeException("already closed");
            }
            return node;
        }

        public int nodeAndClientId() {
            return nodeAndClientId;
        }

        public String getName() {
            return name;
        }

        public boolean isMasterEligible() {
            return Node.NODE_MASTER_SETTING.get(node.settings());
        }

        Client client(Random random) {
            if (closed.get()) {
                throw new RuntimeException("already closed");
            }
            double nextDouble = random.nextDouble();
            if (nextDouble < transportClientRatio) {
                if (logger.isTraceEnabled()) {
                    logger.trace("Using transport client for node [{}] sniff: [{}]", node.settings().get("node.name"), false);
                }
                return getOrBuildTransportClient();
            } else {
                return getOrBuildNodeClient();
            }
        }

        Client nodeClient() {
            if (closed.get()) {
                throw new RuntimeException("already closed");
            }
            return getOrBuildNodeClient();
        }

        Client transportClient() {
            if (closed.get()) {
                throw new RuntimeException("already closed");
            }
            return getOrBuildTransportClient();
        }

        private Client getOrBuildNodeClient() {
            if (nodeClient == null) {
                nodeClient = node.client();
            }
            return clientWrapper.apply(nodeClient);
        }

        private Client getOrBuildTransportClient() {
            if (transportClient == null) {
                /* no sniff client for now - doesn't work with all tests since it might throw NoNodeAvailableException if nodes are shut down.
                 * we first need support of transportClientRatio as annotations or so */
                transportClient = new TransportClientFactory(false, nodeConfigurationSource.transportClientSettings(),
                    baseDir, nodeConfigurationSource.transportClientPlugins()).client(node, clusterName);
            }
            return clientWrapper.apply(transportClient);
        }

        void resetClient() throws IOException {
            if (closed.get() == false) {
                Releasables.close(nodeClient, transportClient);
                nodeClient = null;
                transportClient = null;
            }
        }

        void startNode() {
            try {
                node.start();
            } catch (NodeValidationException e) {
                throw new RuntimeException(e);
            }
        }

        void closeNode() throws IOException {
            markNodeDataDirsAsPendingForWipe(node);
            node.close();
        }

        /**
         * closes the current node if not already closed, builds a new node object using the current node settings and starts it
         */
        void restart(RestartCallback callback, boolean clearDataIfNeeded, int minMasterNodes) throws Exception {
            if (!node.isClosed()) {
                closeNode();
            }
            recreateNodeOnRestart(callback, clearDataIfNeeded, minMasterNodes);
            startNode();
        }

        /**
         * rebuilds a new node object using the current node settings and starts it
         */
        void recreateNodeOnRestart(RestartCallback callback, boolean clearDataIfNeeded, int minMasterNodes) throws Exception {
            assert callback != null;
            Settings callbackSettings = callback.onNodeStopped(name);
            Settings.Builder newSettings = Settings.builder();
            if (callbackSettings != null) {
                newSettings.put(callbackSettings);
            }
            if (minMasterNodes >= 0) {
                assert DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.exists(newSettings.build()) == false : "min master nodes is auto managed";
                newSettings.put(DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), minMasterNodes).build();
            }
            if (clearDataIfNeeded) {
                clearDataIfNeeded(callback);
            }
            createNewNode(newSettings.build());
            // make sure the cached client points to the new node
            resetClient();
        }

        private void clearDataIfNeeded(RestartCallback callback) throws IOException {
            if (callback.clearData(name)) {
                NodeEnvironment nodeEnv = node.getNodeEnvironment();
                if (nodeEnv.hasNodeFile()) {
                    final Path[] locations = nodeEnv.nodeDataPaths();
                    logger.debug("removing node data paths: [{}]", Arrays.toString(locations));
                    IOUtils.rm(locations);
                }
            }
        }

        private void createNewNode(final Settings newSettings) {
            final long newIdSeed = NodeEnvironment.NODE_ID_SEED_SETTING.get(node.settings()) + 1; // use a new seed to make sure we have a new node id
            Settings finalSettings = Settings.builder().put(node.settings()).put(newSettings)
                .put(NodeEnvironment.NODE_ID_SEED_SETTING.getKey(), newIdSeed).build();
            if (DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.exists(finalSettings) == false) {
                throw new IllegalStateException(DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey() +
                    " is not configured after restart of [" + name + "]");
            }
            Collection<Class<? extends Plugin>> plugins = node.getClasspathPlugins();
            node = new MockNode(finalSettings, plugins);
            markNodeDataDirsAsNotEligableForWipe(node);
        }

        @Override
        public void close() throws IOException {
            try {
                resetClient();
            } finally {
                closed.set(true);
                closeNode();
            }
        }
    }

    public static final String TRANSPORT_CLIENT_PREFIX = "transport_client_";

    static class TransportClientFactory {
        private final boolean sniff;
        private final Settings settings;
        private final Path baseDir;
        private final Collection<Class<? extends Plugin>> plugins;

        TransportClientFactory(boolean sniff, Settings settings, Path baseDir, Collection<Class<? extends Plugin>> plugins) {
            this.sniff = sniff;
            this.settings = settings != null ? settings : Settings.EMPTY;
            this.baseDir = baseDir;
            this.plugins = plugins;
        }

        public Client client(Node node, String clusterName) {
            TransportAddress addr = node.injector().getInstance(TransportService.class).boundAddress().publishAddress();
            Settings nodeSettings = node.settings();
            Builder builder = Settings.builder()
                .put("client.transport.nodes_sampler_interval", "1s")
                .put(Environment.PATH_HOME_SETTING.getKey(), baseDir)
                .put("node.name", TRANSPORT_CLIENT_PREFIX + node.settings().get("node.name"))
                .put(ClusterName.CLUSTER_NAME_SETTING.getKey(), clusterName).put("client.transport.sniff", sniff)
                .put("logger.prefix", nodeSettings.get("logger.prefix", ""))
                .put("logger.level", nodeSettings.get("logger.level", "INFO"))
                .put(settings);
            if (NetworkModule.TRANSPORT_TYPE_SETTING.exists(settings)) {
                builder.put(NetworkModule.TRANSPORT_TYPE_KEY, NetworkModule.TRANSPORT_TYPE_SETTING.get(settings));
            }
            TransportClient client = new MockTransportClient(builder.build(), plugins);
            client.addTransportAddress(addr);
            return client;
        }
    }

    @Override
    public synchronized void beforeTest(Random random, double transportClientRatio) throws IOException, InterruptedException {
        super.beforeTest(random, transportClientRatio);
        reset(true);
    }

    private synchronized void reset(boolean wipeData) throws IOException {
        // clear all rules for mock transport services
        for (NodeAndClient nodeAndClient : nodes.values()) {
            TransportService transportService = nodeAndClient.node.injector().getInstance(TransportService.class);
            if (transportService instanceof MockTransportService) {
                final MockTransportService mockTransportService = (MockTransportService) transportService;
                mockTransportService.clearAllRules();
                mockTransportService.clearTracers();
            }
        }
        randomlyResetClients();
        final int newSize = sharedNodesSeeds.length;
        if (nextNodeId.get() == newSize && nodes.size() == newSize) {
            if (wipeData) {
                wipePendingDataDirectories();
            }
            if (nodes.size() > 0 && autoManageMinMasterNodes) {
                updateMinMasterNodes(getMasterNodesCount());
            }
            logger.debug("Cluster hasn't changed - moving out - nodes: [{}] nextNodeId: [{}] numSharedNodes: [{}]",
                nodes.keySet(), nextNodeId.get(), newSize);
            return;
        }
        logger.debug("Cluster is NOT consistent - restarting shared nodes - nodes: [{}] nextNodeId: [{}] numSharedNodes: [{}]",
            nodes.keySet(), nextNodeId.get(), newSize);

        // trash all nodes with id >= sharedNodesSeeds.length - they are non shared
        final List<NodeAndClient> toClose = new ArrayList<>();
        for (Iterator<NodeAndClient> iterator = nodes.values().iterator(); iterator.hasNext(); ) {
            NodeAndClient nodeAndClient = iterator.next();
            if (nodeAndClient.nodeAndClientId() >= sharedNodesSeeds.length) {
                logger.debug("Close Node [{}] not shared", nodeAndClient.name);
                toClose.add(nodeAndClient);
            }
        }
        stopNodesAndClients(toClose);

        // clean up what the nodes left that is unused
        if (wipeData) {
            wipePendingDataDirectories();
        }

        // start any missing node
        assert newSize == numSharedDedicatedMasterNodes + numSharedDataNodes + numSharedCoordOnlyNodes;
        final int numberOfMasterNodes = numSharedDedicatedMasterNodes > 0 ? numSharedDedicatedMasterNodes : numSharedDataNodes;
        final int defaultMinMasterNodes = (numberOfMasterNodes / 2) + 1;
        final List<NodeAndClient> toStartAndPublish = new ArrayList<>(); // we want to start nodes in one go due to min master nodes
        for (int i = 0; i < numSharedDedicatedMasterNodes; i++) {
            final Settings.Builder settings = Settings.builder();
            settings.put(Node.NODE_MASTER_SETTING.getKey(), true);
            settings.put(Node.NODE_DATA_SETTING.getKey(), false);
            NodeAndClient nodeAndClient = buildNode(i, sharedNodesSeeds[i], settings.build(), true, defaultMinMasterNodes);
            toStartAndPublish.add(nodeAndClient);
        }
        for (int i = numSharedDedicatedMasterNodes; i < numSharedDedicatedMasterNodes + numSharedDataNodes; i++) {
            final Settings.Builder settings = Settings.builder();
            if (numSharedDedicatedMasterNodes > 0) {
                // if we have dedicated master nodes, the shared data nodes are not master-eligible; otherwise keep the defaults
                settings.put(Node.NODE_MASTER_SETTING.getKey(), false);
                settings.put(Node.NODE_DATA_SETTING.getKey(), true);
            }
            NodeAndClient nodeAndClient = buildNode(i, sharedNodesSeeds[i], settings.build(), true, defaultMinMasterNodes);
            toStartAndPublish.add(nodeAndClient);
        }
        for (int i = numSharedDedicatedMasterNodes + numSharedDataNodes;
             i < numSharedDedicatedMasterNodes + numSharedDataNodes + numSharedCoordOnlyNodes; i++) {
            final Builder settings = Settings.builder().put(Node.NODE_MASTER_SETTING.getKey(), false)
                .put(Node.NODE_DATA_SETTING.getKey(), false).put(Node.NODE_INGEST_SETTING.getKey(), false);
            NodeAndClient nodeAndClient = buildNode(i, sharedNodesSeeds[i], settings.build(), true, defaultMinMasterNodes);
            toStartAndPublish.add(nodeAndClient);
        }

        startAndPublishNodesAndClients(toStartAndPublish);

        nextNodeId.set(newSize);
        assert size() == newSize;
        if (newSize > 0) {
            validateClusterFormed();
        }
        logger.debug("Cluster is consistent again - nodes: [{}] nextNodeId: [{}] numSharedNodes: [{}]",
            nodes.keySet(), nextNodeId.get(), newSize);
    }

    /** ensure a cluster is formed with all published nodes. */
    public synchronized void validateClusterFormed() {
        String name = randomFrom(random, getNodeNames());
        validateClusterFormed(name);
    }

    /** ensure a cluster is formed with all published nodes, but do so by using the client of the specified node */
    public synchronized void validateClusterFormed(String viaNode) {
        Set<DiscoveryNode> expectedNodes = new HashSet<>();
        for (NodeAndClient nodeAndClient : nodes.values()) {
            expectedNodes.add(getInstanceFromNode(ClusterService.class, nodeAndClient.node()).localNode());
        }
        logger.trace("validating cluster formed via [{}], expecting {}", viaNode, expectedNodes);
        final Client client = client(viaNode);
        try {
            if (awaitBusy(() -> {
                DiscoveryNodes discoveryNodes = client.admin().cluster().prepareState().get().getState().nodes();
                if (discoveryNodes.getSize() != expectedNodes.size()) {
                    return false;
                }
                for (DiscoveryNode expectedNode : expectedNodes) {
                    if (discoveryNodes.nodeExists(expectedNode) == false) {
                        return false;
                    }
                }
                return true;
            }, 30, TimeUnit.SECONDS) == false) {
                throw new IllegalStateException("cluster failed to form with expected nodes " + expectedNodes + " and actual nodes " +
                    client.admin().cluster().prepareState().get().getState().nodes());
            }
        } catch (InterruptedException e) {
            throw new IllegalStateException(e);
        }
    }

    @Override
    public synchronized void afterTest() throws IOException {
        wipePendingDataDirectories();
        // reset all clients - each test gets its own client based on the Random instance created above
        randomlyResetClients();
    }

    @Override
    public void beforeIndexDeletion() throws Exception {
        // Check that the operations counter on index shard has reached 0.
        // The assumption here is that after a test there are no ongoing write operations.
        // Tests that have ongoing write operations after the test (for example because ttl is used
        // and not all docs have been purged after the test) and that inherit from
        // ElasticsearchIntegrationTest must override beforeIndexDeletion() to avoid failures.
        assertShardIndexCounter();
        // check that shards that have the same sync id also contain the same number of documents
        assertSameSyncIdSameDocs();
    }

    private void assertSameSyncIdSameDocs() {
        Map<String, Long> docsOnShards = new HashMap<>();
        final Collection<NodeAndClient> nodesAndClients = nodes.values();
        for (NodeAndClient nodeAndClient : nodesAndClients) {
            IndicesService indexServices = getInstance(IndicesService.class, nodeAndClient.name);
            for (IndexService indexService : indexServices) {
                for (IndexShard indexShard : indexService) {
                    CommitStats commitStats = indexShard.commitStats();
                    if (commitStats != null) { // null if the engine is closed or if the shard is recovering
                        String syncId = commitStats.getUserData().get(Engine.SYNC_COMMIT_ID);
                        if (syncId != null) {
                            long liveDocsOnShard = commitStats.getNumDocs();
                            if (docsOnShards.get(syncId) != null) {
                                assertThat("sync id is equal but number of docs does not match on node " + nodeAndClient.name +
                                        ". expected " + docsOnShards.get(syncId) + " but got " + liveDocsOnShard,
                                    docsOnShards.get(syncId), equalTo(liveDocsOnShard));
                            } else {
                                docsOnShards.put(syncId, liveDocsOnShard);
                            }
                        }
                    }
                }
            }
        }
    }

    private void assertShardIndexCounter() throws Exception {
        assertBusy(() -> {
            final Collection<NodeAndClient> nodesAndClients = nodes.values();
            for (NodeAndClient nodeAndClient : nodesAndClients) {
                IndicesService indexServices = getInstance(IndicesService.class, nodeAndClient.name);
                for (IndexService indexService : indexServices) {
                    for (IndexShard indexShard : indexService) {
                        int activeOperationsCount = indexShard.getActiveOperationsCount();
                        if (activeOperationsCount > 0) {
                            TaskManager taskManager = getInstance(TransportService.class, nodeAndClient.name).getTaskManager();
                            DiscoveryNode localNode = getInstance(ClusterService.class, nodeAndClient.name).localNode();
                            List<TaskInfo> taskInfos = taskManager.getTasks().values().stream()
                                .filter(task -> task instanceof ReplicationTask)
                                .map(task -> task.taskInfo(localNode.getId(), true))
                                .collect(Collectors.toList());
                            ListTasksResponse response = new ListTasksResponse(taskInfos, Collections.emptyList(), Collections.emptyList());
                            try {
                                XContentBuilder builder = XContentFactory.jsonBuilder().prettyPrint().value(response);
                                throw new AssertionError("expected index shard counter on shard " + indexShard.shardId() + " on node " +
                                    nodeAndClient.name + " to be 0 but was " + activeOperationsCount +
                                    ". Current replication tasks on node:\n" + builder.string());
                            } catch (IOException e) {
                                throw new RuntimeException("caught exception while building response [" + response + "]", e);
                            }
                        }
                    }
                }
            }
        });
    }

    private void randomlyResetClients() throws IOException {
        // only reset the clients on nightly tests, it causes heavy load...
        if (RandomizedTest.isNightly() && rarely(random)) {
            final Collection<NodeAndClient> nodesAndClients = nodes.values();
            for (NodeAndClient nodeAndClient : nodesAndClients) {
                nodeAndClient.resetClient();
            }
        }
    }

    private void wipePendingDataDirectories() {
        assert Thread.holdsLock(this);
        if (!dataDirToClean.isEmpty()) {
            try {
                for (Path path : dataDirToClean) {
                    try {
                        FileSystemUtils.deleteSubDirectories(path);
                        logger.info("Successfully wiped data directory for node location: {}", path);
                    } catch (IOException e) {
                        logger.info("Failed to wipe data directory for node location: {}", path);
                    }
                }
            } finally {
                dataDirToClean.clear();
            }
        }
    }

    private void markNodeDataDirsAsPendingForWipe(Node node) {
        assert Thread.holdsLock(this);
        NodeEnvironment nodeEnv = node.getNodeEnvironment();
        if (nodeEnv.hasNodeFile()) {
            dataDirToClean.addAll(Arrays.asList(nodeEnv.nodeDataPaths()));
        }
    }

    private void markNodeDataDirsAsNotEligableForWipe(Node node) {
        assert Thread.holdsLock(this);
        NodeEnvironment nodeEnv = node.getNodeEnvironment();
        if (nodeEnv.hasNodeFile()) {
            dataDirToClean.removeAll(Arrays.asList(nodeEnv.nodeDataPaths()));
        }
    }

    /**
     * Returns a reference to a random node's {@link ClusterService}
     */
    public ClusterService clusterService() {
        return clusterService(null);
    }

    /**
     * Returns a reference to a node's {@link ClusterService}. If the given node is null, a random node will be selected.
     */
    public synchronized ClusterService clusterService(@Nullable String node) {
        return getInstance(ClusterService.class, node);
    }

    /**
     * Returns an Iterable to all instances for the given class {@code T} across all nodes in the cluster.
     */
    public synchronized <T> Iterable<T> getInstances(Class<T> clazz) {
        List<T> instances = new ArrayList<>(nodes.size());
        for (NodeAndClient nodeAndClient : nodes.values()) {
            instances.add(getInstanceFromNode(clazz, nodeAndClient.node));
        }
        return instances;
    }

    /**
     * Returns an Iterable to all instances for the given class {@code T} across all data nodes in the cluster.
     */
    public synchronized <T> Iterable<T> getDataNodeInstances(Class<T> clazz) {
        return getInstances(clazz, new DataNodePredicate());
    }

    /**
     * Returns an Iterable to all instances for the given class {@code T} across all data and master nodes
     * in the cluster.
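     * An illustrative use (the {@code cluster} variable stands for this test cluster; {@link IndicesService}
     * is just one example of a node-level service):
     * <pre>{@code
     * for (IndicesService indicesService : cluster.getDataOrMasterNodeInstances(IndicesService.class)) {
     *     // inspect the per-node service here
     * }
     * }</pre>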
     */
    public synchronized <T> Iterable<T> getDataOrMasterNodeInstances(Class<T> clazz) {
        return getInstances(clazz, new DataOrMasterNodePredicate());
    }

    private synchronized <T> Iterable<T> getInstances(Class<T> clazz, Predicate<NodeAndClient> predicate) {
        Iterable<NodeAndClient> filteredNodes = nodes.values().stream().filter(predicate)::iterator;
        List<T> instances = new ArrayList<>();
        for (NodeAndClient nodeAndClient : filteredNodes) {
            instances.add(getInstanceFromNode(clazz, nodeAndClient.node));
        }
        return instances;
    }

    /**
     * Returns a reference to the given node's instance of the given class {@code T}
     */
    public synchronized <T> T getInstance(Class<T> clazz, final String node) {
        return getInstance(clazz, nc -> node == null || node.equals(nc.name));
    }

    public synchronized <T> T getDataNodeInstance(Class<T> clazz) {
        return getInstance(clazz, new DataNodePredicate());
    }

    private synchronized <T> T getInstance(Class<T> clazz, Predicate<NodeAndClient> predicate) {
        NodeAndClient randomNodeAndClient = getRandomNodeAndClient(predicate);
        assert randomNodeAndClient != null;
        return getInstanceFromNode(clazz, randomNodeAndClient.node);
    }

    /**
     * Returns a reference to a random node's instance of the given class {@code T}
     */
    public synchronized <T> T getInstance(Class<T> clazz) {
        return getInstance(clazz, nc -> true);
    }

    private synchronized <T> T getInstanceFromNode(Class<T> clazz, Node node) {
        return node.injector().getInstance(clazz);
    }

    @Override
    public synchronized int size() {
        return this.nodes.size();
    }

    @Override
    public InetSocketAddress[] httpAddresses() {
        List<InetSocketAddress> addresses = new ArrayList<>();
        for (HttpServerTransport httpServerTransport : getInstances(HttpServerTransport.class)) {
            addresses.add(httpServerTransport.boundAddress().publishAddress().address());
        }
        return addresses.toArray(new InetSocketAddress[addresses.size()]);
    }

    /**
     * Stops a random data node in the cluster. Returns true if a node was found to stop, false otherwise.
     */
    public synchronized boolean stopRandomDataNode() throws IOException {
        ensureOpen();
        NodeAndClient nodeAndClient = getRandomNodeAndClient(new DataNodePredicate());
        if (nodeAndClient != null) {
            logger.info("Closing random node [{}] ", nodeAndClient.name);
            stopNodesAndClient(nodeAndClient);
            return true;
        }
        return false;
    }

    /**
     * Stops a random node in the cluster that matches the given filter, or none if none of the nodes matches the filter.
     */
    public synchronized void stopRandomNode(final Predicate<Settings> filter) throws IOException {
        ensureOpen();
        NodeAndClient nodeAndClient = getRandomNodeAndClient(nc -> filter.test(nc.node.settings()));
        if (nodeAndClient != null) {
            logger.info("Closing filtered random node [{}] ", nodeAndClient.name);
            stopNodesAndClient(nodeAndClient);
        }
    }

    /**
     * Stops the current master node forcefully
     */
    public synchronized void stopCurrentMasterNode() throws IOException {
        ensureOpen();
        assert size() > 0;
        String masterNodeName = getMasterName();
        assert nodes.containsKey(masterNodeName);
        logger.info("Closing master node [{}] ", masterNodeName);
        stopNodesAndClient(nodes.get(masterNodeName));
    }

    /**
     * Stops any of the current nodes but not the master node.
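     * An illustrative use (the {@code cluster} variable stands for this test cluster):
     * <pre>{@code
     * cluster.stopRandomNonMasterNode(); // the elected master, as reported by getMasterName(), keeps running
     * }</pre>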
     */
    public synchronized void stopRandomNonMasterNode() throws IOException {
        NodeAndClient nodeAndClient = getRandomNodeAndClient(new MasterNodePredicate(getMasterName()).negate());
        if (nodeAndClient != null) {
            logger.info("Closing random non master node [{}] current master [{}] ", nodeAndClient.name, getMasterName());
            stopNodesAndClient(nodeAndClient);
        }
    }

    private synchronized void startAndPublishNodesAndClients(List<NodeAndClient> nodeAndClients) {
        if (nodeAndClients.size() > 0) {
            final int newMasters = (int) nodeAndClients.stream().filter(NodeAndClient::isMasterEligible)
                .filter(nac -> nodes.containsKey(nac.name) == false) // filter out old masters
                .count();
            final int currentMasters = getMasterNodesCount();
            if (autoManageMinMasterNodes && currentMasters > 1 && newMasters > 0) {
                // special case for 1 node master - we can't update the min master nodes before we add more nodes.
                updateMinMasterNodes(currentMasters + newMasters);
            }
            List<Future<?>> futures = nodeAndClients.stream().map(node -> executor.submit(node::startNode)).collect(Collectors.toList());
            try {
                for (Future<?> future : futures) {
                    future.get();
                }
            } catch (InterruptedException e) {
                throw new AssertionError("interrupted while starting nodes", e);
            } catch (ExecutionException e) {
                throw new RuntimeException("failed to start nodes", e);
            }
            nodeAndClients.forEach(this::publishNode);
            if (autoManageMinMasterNodes && currentMasters == 1 && newMasters > 0) {
                // update once masters have joined
                validateClusterFormed();
                updateMinMasterNodes(currentMasters + newMasters);
            }
        }
    }

    private synchronized void stopNodesAndClient(NodeAndClient nodeAndClient) throws IOException {
        stopNodesAndClients(Collections.singleton(nodeAndClient));
    }

    private synchronized void stopNodesAndClients(Collection<NodeAndClient> nodeAndClients) throws IOException {
        if (autoManageMinMasterNodes && nodeAndClients.size() > 0) {
            int masters = (int) nodeAndClients.stream().filter(NodeAndClient::isMasterEligible).count();
            if (masters > 0) {
                updateMinMasterNodes(getMasterNodesCount() - masters);
            }
        }
        for (NodeAndClient nodeAndClient : nodeAndClients) {
            removeDisruptionSchemeFromNode(nodeAndClient);
            NodeAndClient previous = nodes.remove(nodeAndClient.name);
            assert previous == nodeAndClient;
            nodeAndClient.close();
        }
    }

    /**
     * Restarts a random node in the cluster
     */
    public void restartRandomNode() throws Exception {
        restartRandomNode(EMPTY_CALLBACK);
    }

    /**
     * Restarts a random node in the cluster and calls the callback during restart.
     */
    public void restartRandomNode(RestartCallback callback) throws Exception {
        restartRandomNode(nc -> true, callback);
    }

    /**
     * Restarts a random data node in the cluster
     */
    public void restartRandomDataNode() throws Exception {
        restartRandomDataNode(EMPTY_CALLBACK);
    }

    /**
     * Restarts a random data node in the cluster and calls the callback during restart.
     */
    public void restartRandomDataNode(RestartCallback callback) throws Exception {
        restartRandomNode(new DataNodePredicate(), callback);
    }

    /**
     * Restarts a random node in the cluster and calls the callback during restart.
     */
    private synchronized void restartRandomNode(Predicate<NodeAndClient> predicate, RestartCallback callback) throws Exception {
        ensureOpen();
        NodeAndClient nodeAndClient = getRandomNodeAndClient(predicate);
        if (nodeAndClient != null) {
            restartNode(nodeAndClient, callback);
        }
    }

    /**
     * Restarts a node and calls the callback during restart.
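     * An illustrative callback sketch (the node name and setting are hypothetical):
     * <pre>{@code
     * cluster.restartNode("node_m0", new RestartCallback() {
     *     @Override
     *     public Settings onNodeStopped(String nodeName) {
     *         // runs while the node is down; the returned settings are applied when it comes back up
     *         return Settings.builder().put("some.setting", "some-value").build();
     *     }
     * });
     * }</pre>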
     */
    public synchronized void restartNode(String nodeName, RestartCallback callback) throws Exception {
        ensureOpen();
        NodeAndClient nodeAndClient = nodes.get(nodeName);
        if (nodeAndClient != null) {
            restartNode(nodeAndClient, callback);
        }
    }

    public static final RestartCallback EMPTY_CALLBACK = new RestartCallback() {
        @Override
        public Settings onNodeStopped(String node) {
            return null;
        }
    };

    /**
     * Restarts all nodes in the cluster. It first stops all nodes and then restarts all the nodes again.
     */
    public void fullRestart() throws Exception {
        fullRestart(EMPTY_CALLBACK);
    }

    /**
     * Restarts all nodes in a rolling restart fashion, i.e. only restarts one node at a time.
     */
    public void rollingRestart() throws Exception {
        rollingRestart(EMPTY_CALLBACK);
    }

    /**
     * Restarts all nodes in a rolling restart fashion, i.e. only restarts one node at a time.
     */
    public synchronized void rollingRestart(RestartCallback callback) throws Exception {
        int numNodesRestarted = 0;
        for (NodeAndClient nodeAndClient : nodes.values()) {
            callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
            restartNode(nodeAndClient, callback);
        }
    }

    private void restartNode(NodeAndClient nodeAndClient, RestartCallback callback) throws Exception {
        logger.info("Restarting node [{}] ", nodeAndClient.name);
        if (activeDisruptionScheme != null) {
            activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
        }
        final int masterNodesCount = getMasterNodesCount();
        // special case to allow stopping one node in a two node cluster and keep it functional
        final boolean updateMinMaster = nodeAndClient.isMasterEligible() && masterNodesCount == 2 && autoManageMinMasterNodes;
        if (updateMinMaster) {
            updateMinMasterNodes(masterNodesCount - 1);
        }
        nodeAndClient.restart(callback, true, autoManageMinMasterNodes ? getMinMasterNodes(masterNodesCount) : -1);
        if (activeDisruptionScheme != null) {
            activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
        }
        if (callback.validateClusterForming() || updateMinMaster) {
            // we have to validate cluster size if updateMinMaster == true, because we need the
            // second node to join in order to increment min_master_nodes back to 2.
            // we also have to do this via the node that was just restarted as it may be that the master didn't yet process
            // the fact that it left
            validateClusterFormed(nodeAndClient.name);
        }
        if (updateMinMaster) {
            updateMinMasterNodes(masterNodesCount);
        }
    }

    /**
     * Restarts all nodes in the cluster. It first stops all nodes and then restarts all the nodes again.
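     * An illustrative sketch: {@code doAfterNodes} is invoked once per node, with the number of nodes
     * processed so far and that node's client, before the node is stopped.
     * <pre>{@code
     * cluster.fullRestart(new RestartCallback() {
     *     @Override
     *     public void doAfterNodes(int numNodesStopped, Client client) {
     *         // e.g. flush indices through the given client before the next node goes down
     *     }
     * });
     * }</pre>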
     */
    public synchronized void fullRestart(RestartCallback callback) throws Exception {
        int numNodesRestarted = 0;
        Map<Set<Role>, List<NodeAndClient>> nodesByRoles = new HashMap<>();
        Set[] rolesOrderedByOriginalStartupOrder = new Set[nextNodeId.get()];
        for (NodeAndClient nodeAndClient : nodes.values()) {
            callback.doAfterNodes(numNodesRestarted++, nodeAndClient.nodeClient());
            logger.info("Stopping node [{}] ", nodeAndClient.name);
            if (activeDisruptionScheme != null) {
                activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
            }
            nodeAndClient.closeNode();
            // delete data folders now, before we start other nodes that may claim them
            nodeAndClient.clearDataIfNeeded(callback);
            DiscoveryNode discoveryNode = getInstanceFromNode(ClusterService.class, nodeAndClient.node()).localNode();
            rolesOrderedByOriginalStartupOrder[nodeAndClient.nodeAndClientId] = discoveryNode.getRoles();
            nodesByRoles.computeIfAbsent(discoveryNode.getRoles(), k -> new ArrayList<>()).add(nodeAndClient);
        }

        assert nodesByRoles.values().stream().collect(Collectors.summingInt(List::size)) == nodes.size();

        // randomize the startup order, while making sure that:
        // 1) a data folder that was assigned to a data node will stay so
        // 2) data nodes will get the same node lock ordinal range, so custom index paths (where the ordinal is used)
        //    will still belong to data nodes
        for (List<NodeAndClient> sameRoleNodes : nodesByRoles.values()) {
            Collections.shuffle(sameRoleNodes, random);
        }
        List<NodeAndClient> startUpOrder = new ArrayList<>();
        for (Set roles : rolesOrderedByOriginalStartupOrder) {
            if (roles == null) {
                // if some nodes were stopped, we won't have a role for that ordinal
                continue;
            }
            final List<NodeAndClient> nodesByRole = nodesByRoles.get(roles);
            startUpOrder.add(nodesByRole.remove(0));
        }
        assert nodesByRoles.values().stream().collect(Collectors.summingInt(List::size)) == 0;

        // do two rounds to minimize pinging (the mock zen ping pings with no delay and can create a lot of logs)
        for (NodeAndClient nodeAndClient : startUpOrder) {
            logger.info("resetting node [{}] ", nodeAndClient.name);
            // we already cleared the data folders, before starting nodes back up
            nodeAndClient.recreateNodeOnRestart(callback, false, autoManageMinMasterNodes ? getMinMasterNodes(getMasterNodesCount()) : -1);
        }

        startAndPublishNodesAndClients(startUpOrder);

        if (callback.validateClusterForming()) {
            validateClusterFormed();
        }
    }

    /**
     * Returns the name of the current master node in the cluster.
     */
    public String getMasterName() {
        return getMasterName(null);
    }

    /**
     * Returns the name of the current master node in the cluster and executes the request via the node specified
     * in the viaNode parameter. If viaNode isn't specified, a random node will be picked to send the request to.
     */
    public String getMasterName(@Nullable String viaNode) {
        try {
            Client client = viaNode != null ?
                client(viaNode) : client();
            ClusterState state = client.admin().cluster().prepareState().execute().actionGet().getState();
            return state.nodes().getMasterNode().getName();
        } catch (Exception e) {
            logger.warn("Can't fetch cluster state", e);
            throw new RuntimeException("Can't get master node " + e.getMessage(), e);
        }
    }

    synchronized Set<String> allDataNodesButN(int numNodes) {
        return nRandomDataNodes(numDataNodes() - numNodes);
    }

    private synchronized Set<String> nRandomDataNodes(int numNodes) {
        assert size() >= numNodes;
        Map<String, NodeAndClient> dataNodes = nodes
            .entrySet()
            .stream()
            .filter(new EntryNodePredicate(new DataNodePredicate()))
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
        final HashSet<String> set = new HashSet<>();
        final Iterator<String> iterator = dataNodes.keySet().iterator();
        for (int i = 0; i < numNodes; i++) {
            assert iterator.hasNext();
            set.add(iterator.next());
        }
        return set;
    }

    /**
     * Returns a set of nodes that have at least one shard of the given index.
     */
    public synchronized Set<String> nodesInclude(String index) {
        if (clusterService().state().routingTable().hasIndex(index)) {
            List<ShardRouting> allShards = clusterService().state().routingTable().allShards(index);
            DiscoveryNodes discoveryNodes = clusterService().state().getNodes();
            Set<String> nodes = new HashSet<>();
            for (ShardRouting shardRouting : allShards) {
                if (shardRouting.assignedToNode()) {
                    DiscoveryNode discoveryNode = discoveryNodes.get(shardRouting.currentNodeId());
                    nodes.add(discoveryNode.getName());
                }
            }
            return nodes;
        }
        return Collections.emptySet();
    }

    /**
     * Starts a node with default settings and returns its name.
     */
    public synchronized String startNode() {
        return startNode(Settings.EMPTY);
    }

    /**
     * Starts a node with the given settings builder and returns its name.
     */
    public synchronized String startNode(Settings.Builder settings) {
        return startNode(settings.build());
    }

    /**
     * Starts a node with the given settings and returns its name.
     */
    public synchronized String startNode(Settings settings) {
        final int defaultMinMasterNodes = getMinMasterNodes(getMasterNodesCount() + (Node.NODE_MASTER_SETTING.get(settings) ? 1 : 0));
        NodeAndClient buildNode = buildNode(settings, defaultMinMasterNodes);
        startAndPublishNodesAndClients(Collections.singletonList(buildNode));
        return buildNode.name;
    }

    /**
     * Starts multiple nodes with default settings and returns their names.
     */
    public synchronized List<String> startNodes(int numOfNodes) {
        return startNodes(numOfNodes, Settings.EMPTY);
    }

    /**
     * Starts multiple nodes with the given settings and returns their names.
     */
    public synchronized List<String> startNodes(int numOfNodes, Settings settings) {
        return startNodes(Collections.nCopies(numOfNodes, settings).stream().toArray(Settings[]::new));
    }

    /**
     * Starts multiple nodes with the given settings and returns their names.
     */
    public synchronized List<String> startNodes(Settings...
            settings) {
        final int defaultMinMasterNodes;
        if (autoManageMinMasterNodes) {
            int mastersDelta = (int) Stream.of(settings).filter(Node.NODE_MASTER_SETTING::get).count();
            defaultMinMasterNodes = getMinMasterNodes(getMasterNodesCount() + mastersDelta);
        } else {
            defaultMinMasterNodes = -1;
        }
        List<NodeAndClient> nodes = new ArrayList<>();
        for (Settings nodeSettings : settings) {
            nodes.add(buildNode(nodeSettings, defaultMinMasterNodes));
        }
        startAndPublishNodesAndClients(nodes);
        if (autoManageMinMasterNodes) {
            validateClusterFormed();
        }
        return nodes.stream().map(NodeAndClient::getName).collect(Collectors.toList());
    }

    public synchronized List<String> startMasterOnlyNodes(int numNodes) {
        return startMasterOnlyNodes(numNodes, Settings.EMPTY);
    }

    public synchronized List<String> startMasterOnlyNodes(int numNodes, Settings settings) {
        Settings settings1 = Settings.builder()
            .put(settings)
            .put(Node.NODE_MASTER_SETTING.getKey(), true)
            .put(Node.NODE_DATA_SETTING.getKey(), false)
            .build();
        return startNodes(numNodes, settings1);
    }

    public synchronized List<String> startDataOnlyNodes(int numNodes) {
        return startDataOnlyNodes(numNodes, Settings.EMPTY);
    }

    public synchronized List<String> startDataOnlyNodes(int numNodes, Settings settings) {
        Settings settings1 = Settings.builder()
            .put(settings)
            .put(Node.NODE_MASTER_SETTING.getKey(), false)
            .put(Node.NODE_DATA_SETTING.getKey(), true)
            .build();
        return startNodes(numNodes, settings1);
    }

    /**
     * Updates the min master nodes setting in the currently running cluster.
     *
     * @param eligibleMasterNodeCount the number of master-eligible nodes to use as the basis for the min master nodes setting
     */
    private int updateMinMasterNodes(int eligibleMasterNodeCount) {
        assert autoManageMinMasterNodes;
        final int minMasterNodes = getMinMasterNodes(eligibleMasterNodeCount);
        if (getMasterNodesCount() > 0) {
            // there should be at least one master to update
            logger.debug("updating min_master_nodes to [{}]", minMasterNodes);
            try {
                assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(
                    Settings.builder().put(DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), minMasterNodes)
                ));
            } catch (Exception e) {
                throw new ElasticsearchException("failed to update minimum master nodes to [{}] (current masters [{}])", e,
                    minMasterNodes, getMasterNodesCount());
            }
        }
        return minMasterNodes;
    }

    /** Calculates a min master nodes value based on the given number of master-eligible nodes. */
    private int getMinMasterNodes(int eligibleMasterNodes) {
        return eligibleMasterNodes / 2 + 1;
    }

    private int getMasterNodesCount() {
        return (int) nodes.values().stream().filter(n -> Node.NODE_MASTER_SETTING.get(n.node().settings())).count();
    }

    public synchronized String startMasterOnlyNode() {
        return startMasterOnlyNode(Settings.EMPTY);
    }

    public synchronized String startMasterOnlyNode(Settings settings) {
        Settings settings1 = Settings.builder()
            .put(settings)
            .put(Node.NODE_MASTER_SETTING.getKey(), true)
            .put(Node.NODE_DATA_SETTING.getKey(), false)
            .build();
        return startNode(settings1);
    }

    public synchronized String startDataOnlyNode() {
        return startDataOnlyNode(Settings.EMPTY);
    }

    public synchronized String startDataOnlyNode(Settings settings) {
        Settings settings1 = Settings.builder()
            .put(settings)
            .put(Node.NODE_MASTER_SETTING.getKey(), false)
            .put(Node.NODE_DATA_SETTING.getKey(), true)
            .build();
        return startNode(settings1);
    }

    private synchronized void publishNode(NodeAndClient nodeAndClient) {
        assert !nodeAndClient.node().isClosed();
        nodes.put(nodeAndClient.name, nodeAndClient);
        applyDisruptionSchemeToNode(nodeAndClient);
    }
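
    // Worked example of the quorum arithmetic in getMinMasterNodes(..), using integer division:
    //   1 master-eligible node  -> 1 / 2 + 1 = 1
    //   2 master-eligible nodes -> 2 / 2 + 1 = 2
    //   3 master-eligible nodes -> 3 / 2 + 1 = 2
    //   5 master-eligible nodes -> 5 / 2 + 1 = 3
    // i.e. a strict majority of the master-eligible nodes. This is also why restartNode(..) special-cases a
    // two-node cluster: with min_master_nodes at 2, stopping either node would leave the survivor unable to
    // elect a master until the setting is temporarily lowered to 1.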
    public void closeNonSharedNodes(boolean wipeData) throws IOException {
        reset(wipeData);
    }

    @Override
    public int numDataNodes() {
        return dataNodeAndClients().size();
    }

    @Override
    public int numDataAndMasterNodes() {
        return dataAndMasterNodes().size();
    }

    public synchronized int numMasterNodes() {
        return filterNodes(nodes, NodeAndClient::isMasterEligible).size();
    }

    public void setDisruptionScheme(ServiceDisruptionScheme scheme) {
        assert activeDisruptionScheme == null :
            "there is already an active disruption [" + activeDisruptionScheme + "]. call clearDisruptionScheme first";
        scheme.applyToCluster(this);
        activeDisruptionScheme = scheme;
    }

    public void clearDisruptionScheme() {
        clearDisruptionScheme(true);
    }

    public void clearDisruptionScheme(boolean ensureHealthyCluster) {
        if (activeDisruptionScheme != null) {
            TimeValue expectedHealingTime = activeDisruptionScheme.expectedTimeToHeal();
            logger.info("Clearing active scheme {}, expected healing time {}", activeDisruptionScheme, expectedHealingTime);
            if (ensureHealthyCluster) {
                activeDisruptionScheme.removeAndEnsureHealthy(this);
            } else {
                activeDisruptionScheme.removeFromCluster(this);
            }
        }
        activeDisruptionScheme = null;
    }

    private void applyDisruptionSchemeToNode(NodeAndClient nodeAndClient) {
        if (activeDisruptionScheme != null) {
            assert nodes.containsKey(nodeAndClient.name);
            activeDisruptionScheme.applyToNode(nodeAndClient.name, this);
        }
    }

    private void removeDisruptionSchemeFromNode(NodeAndClient nodeAndClient) {
        if (activeDisruptionScheme != null) {
            assert nodes.containsKey(nodeAndClient.name);
            activeDisruptionScheme.removeFromNode(nodeAndClient.name, this);
        }
    }

    private synchronized Collection<NodeAndClient> dataNodeAndClients() {
        return filterNodes(nodes, new DataNodePredicate());
    }

    private synchronized Collection<NodeAndClient> dataAndMasterNodes() {
        return filterNodes(nodes, new DataOrMasterNodePredicate());
    }

    private synchronized Collection<NodeAndClient> filterNodes(Map<String, InternalTestCluster.NodeAndClient> map,
                                                               Predicate<NodeAndClient> predicate) {
        return map
            .values()
            .stream()
            .filter(predicate)
            .collect(Collectors.toCollection(ArrayList::new));
    }

    private static final class DataNodePredicate implements Predicate<NodeAndClient> {
        @Override
        public boolean test(NodeAndClient nodeAndClient) {
            return DiscoveryNode.isDataNode(nodeAndClient.node.settings());
        }
    }

    private static final class DataOrMasterNodePredicate implements Predicate<NodeAndClient> {
        @Override
        public boolean test(NodeAndClient nodeAndClient) {
            return DiscoveryNode.isDataNode(nodeAndClient.node.settings()) ||
                DiscoveryNode.isMasterNode(nodeAndClient.node.settings());
        }
    }

    private static final class MasterNodePredicate implements Predicate<NodeAndClient> {
        private final String masterNodeName;

        MasterNodePredicate(String masterNodeName) {
            this.masterNodeName = masterNodeName;
        }

        @Override
        public boolean test(NodeAndClient nodeAndClient) {
            return masterNodeName.equals(nodeAndClient.name);
        }
    }

    private static final class NoDataNoMasterNodePredicate implements Predicate<NodeAndClient> {
        @Override
        public boolean test(NodeAndClient nodeAndClient) {
            return DiscoveryNode.isMasterNode(nodeAndClient.node.settings()) == false &&
                DiscoveryNode.isDataNode(nodeAndClient.node.settings()) == false;
        }
    }

    private static final class EntryNodePredicate implements Predicate<Map.Entry<String, NodeAndClient>> {
        private final Predicate<NodeAndClient> delegateNodePredicate;

        EntryNodePredicate(Predicate<NodeAndClient> delegateNodePredicate) {
            this.delegateNodePredicate = delegateNodePredicate;
        }

        @Override
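        // delegates to the wrapped NodeAndClient predicate; the map key (the node name) is not consulted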
        public boolean test(Map.Entry<String, NodeAndClient> entry) {
            return delegateNodePredicate.test(entry.getValue());
        }
    }

    synchronized String routingKeyForShard(Index index, int shard, Random random) {
        assertThat(shard, greaterThanOrEqualTo(0));
        for (NodeAndClient n : nodes.values()) {
            Node node = n.node;
            IndicesService indicesService = getInstanceFromNode(IndicesService.class, node);
            ClusterService clusterService = getInstanceFromNode(ClusterService.class, node);
            IndexService indexService = indicesService.indexService(index);
            if (indexService != null) {
                assertThat(indexService.getIndexSettings().getSettings().getAsInt(IndexMetaData.SETTING_NUMBER_OF_SHARDS, -1),
                    greaterThan(shard));
                OperationRouting operationRouting = clusterService.operationRouting();
                while (true) {
                    String routing = RandomStrings.randomAsciiOfLength(random, 10);
                    final int targetShard = operationRouting
                        .indexShards(clusterService.state(), index.getName(), null, routing)
                        .shardId().getId();
                    if (shard == targetShard) {
                        return routing;
                    }
                }
            }
        }
        fail("Could not find a node that holds " + index);
        return null;
    }

    public synchronized Iterable<Client> getClients() {
        ensureOpen();
        return () -> {
            ensureOpen();
            final Iterator<NodeAndClient> iterator = nodes.values().iterator();
            return new Iterator<Client>() {
                @Override
                public boolean hasNext() {
                    return iterator.hasNext();
                }

                @Override
                public Client next() {
                    return iterator.next().client(random);
                }

                @Override
                public void remove() {
                    throw new UnsupportedOperationException("");
                }
            };
        };
    }

    /**
     * Returns a predicate that only accepts settings of nodes with one of the given names.
     */
    public static Predicate<Settings> nameFilter(String... nodeName) {
        return new NodeNamePredicate(new HashSet<>(Arrays.asList(nodeName)));
    }

    private static final class NodeNamePredicate implements Predicate<Settings> {
        private final HashSet<String> nodeNames;

        NodeNamePredicate(HashSet<String> nodeNames) {
            this.nodeNames = nodeNames;
        }

        @Override
        public boolean test(Settings settings) {
            return nodeNames.contains(settings.get("node.name"));
        }
    }

    /**
     * A callback that is invoked during {@link #rollingRestart(InternalTestCluster.RestartCallback)}
     * and / or {@link #fullRestart(InternalTestCluster.RestartCallback)} to execute actions at certain
     * stages of the restart.
     */
    public static class RestartCallback {

        /**
         * Executed once the node with the given name has been stopped.
         */
        public Settings onNodeStopped(String nodeName) throws Exception {
            return Settings.EMPTY;
        }

        /**
         * Executed for each node before the <tt>n+1</tt>th node is restarted. The given client is
         * an active client to the node that will be restarted next.
         */
        public void doAfterNodes(int n, Client client) throws Exception {
        }

        /**
         * If this returns <code>true</code>, all data for the node with the given node name will be cleared, including
         * gateways and all index data. Returns <code>false</code> by default.
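         * <p>For example, a full restart that wipes all node data could look like this (a minimal sketch;
         * {@code cluster} is assumed to be the test's InternalTestCluster instance):
         * <pre>{@code
         * cluster.fullRestart(new RestartCallback() {
         *     @Override
         *     public boolean clearData(String nodeName) {
         *         return true; // wipe every node's data folders while the nodes are down
         *     }
         * });
         * }</pre>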
         */
        public boolean clearData(String nodeName) {
            return false;
        }

        /** Returns true if the restart should also validate that the cluster has re-formed. */
        public boolean validateClusterForming() {
            return true;
        }
    }

    public Settings getDefaultSettings() {
        return defaultSettings;
    }

    @Override
    public void ensureEstimatedStats() {
        if (size() > 0) {
            // Checks that the breakers have been reset without incurring a
            // network request, because a network request can increment one
            // of the breakers
            for (NodeAndClient nodeAndClient : nodes.values()) {
                final IndicesFieldDataCache fdCache =
                    getInstanceFromNode(IndicesService.class, nodeAndClient.node).getIndicesFieldDataCache();
                // Clean up the cache, ensuring that entries' listeners have been called
                fdCache.getCache().refresh();

                final String name = nodeAndClient.name;
                final CircuitBreakerService breakerService = getInstanceFromNode(CircuitBreakerService.class, nodeAndClient.node);
                CircuitBreaker fdBreaker = breakerService.getBreaker(CircuitBreaker.FIELDDATA);
                assertThat("Fielddata breaker not reset to 0 on node: " + name, fdBreaker.getUsed(), equalTo(0L));

                // Anything that uses transport or HTTP can increment the request breaker (because they use bigarrays);
                // as a result the breaker can sometimes be incremented by ping requests from other clusters, because
                // Jenkins runs multiple ES test jobs in parallel on the same machine. To combat this we check whether
                // the breaker has returned to 0 in an assertBusy loop, so it will retry for 10 seconds and fail if the
                // breaker never reaches 0
                try {
                    assertBusy(new Runnable() {
                        @Override
                        public void run() {
                            CircuitBreaker reqBreaker = breakerService.getBreaker(CircuitBreaker.REQUEST);
                            assertThat("Request breaker not reset to 0 on node: " + name, reqBreaker.getUsed(), equalTo(0L));
                        }
                    });
                } catch (Exception e) {
                    fail("Exception during check for request breaker reset to 0: " + e);
                }

                NodeService nodeService = getInstanceFromNode(NodeService.class, nodeAndClient.node);
                CommonStatsFlags flags = new CommonStatsFlags(Flag.FieldData, Flag.QueryCache, Flag.Segments);
                NodeStats stats = nodeService.stats(flags, false, false, false, false, false, false, false, false, false, false, false);
                assertThat("Fielddata size must be 0 on node: " + stats.getNode(),
                    stats.getIndices().getFieldData().getMemorySizeInBytes(), equalTo(0L));
                assertThat("Query cache size must be 0 on node: " + stats.getNode(),
                    stats.getIndices().getQueryCache().getMemorySizeInBytes(), equalTo(0L));
                assertThat("FixedBitSet cache size must be 0 on node: " + stats.getNode(),
                    stats.getIndices().getSegments().getBitsetMemoryInBytes(), equalTo(0L));
            }
        }
    }

    @Override
    public void assertAfterTest() throws IOException {
        super.assertAfterTest();
        assertRequestsFinished();
        for (NodeAndClient nodeAndClient : nodes.values()) {
            NodeEnvironment env = nodeAndClient.node().getNodeEnvironment();
            Set<ShardId> shardIds = env.lockedShards();
            for (ShardId id : shardIds) {
                try {
                    env.shardLock(id, TimeUnit.SECONDS.toMillis(5)).close();
                } catch (ShardLockObtainFailedException ex) {
                    fail("Shard " + id + " is still locked after 5 sec waiting");
                }
            }
        }
    }

    private void assertRequestsFinished() {
        if (size() > 0) {
            for (NodeAndClient nodeAndClient : nodes.values()) {
                CircuitBreaker inFlightRequestsBreaker = getInstance(CircuitBreakerService.class, nodeAndClient.name)
                    .getBreaker(CircuitBreaker.IN_FLIGHT_REQUESTS);
                try {
                    // see #ensureEstimatedStats()
                    assertBusy(() -> {
                        // ensure that our size accounting on the transport level is reset properly
                        long bytesUsed = inFlightRequestsBreaker.getUsed();
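                        // the in-flight requests breaker accounts for the bytes of every transport request that
                        // has been received but not yet fully processed; a non-zero value means some request never
                        // released its accounting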
                        assertThat("All incoming requests on node [" + nodeAndClient.name + "] should have finished. Expected 0 but got " + bytesUsed,
                            bytesUsed, equalTo(0L));
                    });
                } catch (Exception e) {
                    logger.error("Could not assert finished requests within timeout", e);
                    fail("Could not assert finished requests within timeout on node [" + nodeAndClient.name + "]");
                }
            }
        }
    }
}
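
// Illustrative usage from a test (a sketch; "cluster" is assumed to be the suite's InternalTestCluster):
//   String master = cluster.startMasterOnlyNode();          // dedicated master-only node
//   List<String> dataNodes = cluster.startDataOnlyNodes(2); // two data-only nodes
//   assert master.equals(cluster.getMasterName());
//   cluster.restartRandomDataNode();                        // bounce one data node; min_master_nodes is managed automatically when enabled
//   cluster.rollingRestart();                               // restart every node, one at a time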