/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.indices.store;

import com.google.common.base.Predicate;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateUpdateTask;
import org.elasticsearch.cluster.health.ClusterHealthStatus;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.TestShardRouting;
import org.elasticsearch.cluster.routing.allocation.command.MoveAllocationCommand;
import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.FilterAllocationDecider;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.indices.recovery.RecoverySource;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.disruption.BlockClusterStateProcessing;
import org.elasticsearch.test.disruption.SingleNodeDisruption;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportService;
import org.junit.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

import static java.lang.Thread.sleep;
import static org.elasticsearch.common.settings.Settings.settingsBuilder;
import static org.elasticsearch.test.ESIntegTestCase.Scope;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.hamcrest.Matchers.equalTo;

@ClusterScope(scope = Scope.TEST, numDataNodes = 0)
public class IndicesStoreIntegrationIT extends ESIntegTestCase {

    @Override
    protected Settings nodeSettings(int nodeOrdinal) {
        // simplify this and only use a single data path
        return Settings.settingsBuilder().put(super.nodeSettings(nodeOrdinal)).put("path.data", "")
                // by default this value is 1 sec in tests (30 sec in practice), but the disruption we add here
                // lasts between 1 and 2 sec and can cause each of the shard deletion requests to time out.
                // To prevent this we set the timeout to something high, i.e. the default used in practice.
                .put(IndicesStore.INDICES_STORE_DELETE_SHARD_TIMEOUT, new TimeValue(30, TimeUnit.SECONDS))
                .build();
    }

    @Override
    protected Collection<Class<? extends Plugin>> nodePlugins() {
        return pluginList(MockTransportService.TestPlugin.class);
    }

    @Override
    protected void ensureClusterStateConsistency() throws IOException {
        // testShardActiveElseWhere might change the state of a non-master node,
        // so we cannot check state consistency of this cluster
    }

    @Test
    public void indexCleanup() throws Exception {
        final String masterNode = internalCluster().startNode(Settings.builder().put("node.data", false));
        final String node_1 = internalCluster().startNode(Settings.builder().put("node.master", false));
        final String node_2 = internalCluster().startNode(Settings.builder().put("node.master", false));

        logger.info("--> creating index [test] with one shard and one replica");
        assertAcked(prepareCreate("test").setSettings(
                Settings.builder().put(indexSettings())
                        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
        );
        ensureGreen("test");

        logger.info("--> making sure that shard and its replica are allocated on node_1 and node_2");
        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_1, "test")), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_2, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_2, "test")), equalTo(true));

        logger.info("--> starting node server3");
        final String node_3 = internalCluster().startNode(Settings.builder().put("node.master", false));
        logger.info("--> running cluster_health");
        ClusterHealthResponse clusterHealth = client().admin().cluster().prepareHealth()
                .setWaitForNodes("4")
                .setWaitForRelocatingShards(0)
                .get();
        assertThat(clusterHealth.isTimedOut(), equalTo(false));

        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_1, "test")), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_2, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_2, "test")), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_3, "test", 0)), equalTo(false));
        assertThat(Files.exists(indexDirectory(node_3, "test")), equalTo(false));

        logger.info("--> move shard from node_1 to node_3, and wait for relocation to finish");

        if (randomBoolean()) { // sometimes add cluster-state delay to trigger observers in IndicesStore.ShardActiveRequestHandler
            SingleNodeDisruption disruption = new BlockClusterStateProcessing(node_3, getRandom());
            internalCluster().setDisruptionScheme(disruption);
            MockTransportService transportServiceNode3 = (MockTransportService) internalCluster().getInstance(TransportService.class,
                    node_3);
            CountDownLatch beginRelocationLatch = new CountDownLatch(1);
            CountDownLatch endRelocationLatch = new CountDownLatch(1);
            transportServiceNode3.addTracer(new RelocationStartEndTracer(logger, beginRelocationLatch, endRelocationLatch));
            internalCluster().client().admin().cluster().prepareReroute().add(new MoveAllocationCommand(new ShardId("test", 0), node_1, node_3)).get();
            // wait for relocation to start
            beginRelocationLatch.await();
            disruption.startDisrupting();
            // wait for relocation to finish
            endRelocationLatch.await();
            // wait a little so that the cluster state observer is registered
            sleep(50);
            disruption.stopDisrupting();
        } else {
            internalCluster().client().admin().cluster().prepareReroute().add(new MoveAllocationCommand(new ShardId("test", 0), node_1, node_3)).get();
        }
        clusterHealth = client().admin().cluster().prepareHealth()
                .setWaitForRelocatingShards(0)
                .get();
        assertThat(clusterHealth.isTimedOut(), equalTo(false));

        assertThat(waitForShardDeletion(node_1, "test", 0), equalTo(false));
        assertThat(waitForIndexDeletion(node_1, "test"), equalTo(false));
        assertThat(Files.exists(shardDirectory(node_2, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_2, "test")), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_3, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_3, "test")), equalTo(true));
    }

    /* Tests that the shard is deleted even when the ShardActiveRequest fails after relocation
       and the next incoming cluster state is an index delete. */
    @Test
    public void shardCleanupIfShardDeletionAfterRelocationFailedAndIndexDeleted() throws Exception {
        final String node_1 = internalCluster().startNode();
        logger.info("--> creating index [test] with one shard and zero replicas");
        assertAcked(prepareCreate("test").setSettings(
                Settings.builder().put(indexSettings())
                        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0))
        );
        ensureGreen("test");
        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_1, "test")), equalTo(true));

        final String node_2 = internalCluster().startDataOnlyNode(Settings.builder().build());
        assertFalse(client().admin().cluster().prepareHealth().setWaitForNodes("2").get().isTimedOut());

        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(true));
        assertThat(Files.exists(indexDirectory(node_1, "test")), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_2, "test", 0)), equalTo(false));
        assertThat(Files.exists(indexDirectory(node_2, "test")), equalTo(false));

        // add a transport delegate that will prevent the shard active request from succeeding the first time after relocation has finished.
        // node_1 will then wait for the next cluster state change before it makes the next attempt to delete the shard.
        MockTransportService transportServiceNode_1 = (MockTransportService) internalCluster().getInstance(TransportService.class, node_1);
        TransportService transportServiceNode_2 = internalCluster().getInstance(TransportService.class, node_2);
        final CountDownLatch shardActiveRequestSent = new CountDownLatch(1);
        transportServiceNode_1.addDelegate(transportServiceNode_2, new MockTransportService.DelegateTransport(transportServiceNode_1.original()) {
            @Override
            public void sendRequest(DiscoveryNode node, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException, TransportException {
                if (action.equals("internal:index/shard/exists") && shardActiveRequestSent.getCount() > 0) {
                    shardActiveRequestSent.countDown();
                    logger.info("prevent shard active request from being sent");
                    throw new ConnectTransportException(node, "DISCONNECT: simulated");
                }
                super.sendRequest(node, requestId, action, request, options);
            }
        });

        logger.info("--> move shard from {} to {}, and wait for relocation to finish", node_1, node_2);
        internalCluster().client().admin().cluster().prepareReroute().add(new MoveAllocationCommand(new ShardId("test", 0), node_1, node_2)).get();
        shardActiveRequestSent.await();
        ClusterHealthResponse clusterHealth = client().admin().cluster().prepareHealth()
                .setWaitForRelocatingShards(0)
                .get();
        assertThat(clusterHealth.isTimedOut(), equalTo(false));
        logClusterState();

        // delete the index. node_1, which is still waiting for the next cluster state update, will then get the index delete next.
        // it must still delete the shard, even if it cannot find it anymore in IndicesService.
        client().admin().indices().prepareDelete("test").get();

        assertThat(waitForShardDeletion(node_1, "test", 0), equalTo(false));
        assertThat(waitForIndexDeletion(node_1, "test"), equalTo(false));
        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(false));
        assertThat(Files.exists(indexDirectory(node_1, "test")), equalTo(false));

        assertThat(waitForShardDeletion(node_2, "test", 0), equalTo(false));
        assertThat(waitForIndexDeletion(node_2, "test"), equalTo(false));
        assertThat(Files.exists(shardDirectory(node_2, "test", 0)), equalTo(false));
        assertThat(Files.exists(indexDirectory(node_2, "test")), equalTo(false));
    }

    @Test
    public void shardsCleanup() throws Exception {
        final String node_1 = internalCluster().startNode();
        final String node_2 = internalCluster().startNode();

        logger.info("--> creating index [test] with one shard and one replica");
        assertAcked(prepareCreate("test").setSettings(
                Settings.builder().put(indexSettings())
                        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1))
        );
        ensureGreen("test");

        logger.info("--> making sure that shard and its replica are allocated on node_1 and node_2");
        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_2, "test", 0)), equalTo(true));

        logger.info("--> starting node server3");
        String node_3 = internalCluster().startNode();
        logger.info("--> running cluster_health");
        ClusterHealthResponse clusterHealth = client().admin().cluster().prepareHealth()
                .setWaitForNodes("3")
                .setWaitForRelocatingShards(0)
                .get();
        assertThat(clusterHealth.isTimedOut(), equalTo(false));

        logger.info("--> making sure that shard is not allocated on server3");
        assertThat(waitForShardDeletion(node_3, "test", 0), equalTo(false));

        Path server2Shard = shardDirectory(node_2, "test", 0);
        logger.info("--> stopping node " + node_2);
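        // note: the shard path for node_2 was resolved above while the node was still running;
        // once the node is stopped we can no longer look it up through its NodeEnvironment,
        // but the directory itself must survive the shutdown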
        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(node_2));

        logger.info("--> running cluster_health");
        clusterHealth = client().admin().cluster().prepareHealth()
                .setWaitForGreenStatus()
                .setWaitForNodes("2")
                .setWaitForRelocatingShards(0)
                .get();
        assertThat(clusterHealth.isTimedOut(), equalTo(false));
        logger.info("--> done cluster_health, status " + clusterHealth.getStatus());

        assertThat(Files.exists(server2Shard), equalTo(true));

        logger.info("--> making sure that shard and its replica exist on server1, server2 and server3");
        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(true));
        assertThat(Files.exists(server2Shard), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_3, "test", 0)), equalTo(true));

        logger.info("--> starting node node_4");
        final String node_4 = internalCluster().startNode();

        logger.info("--> running cluster_health");
        ensureGreen();

        logger.info("--> making sure that shard and its replica are allocated on server1 and server3 but not on server2");
        assertThat(Files.exists(shardDirectory(node_1, "test", 0)), equalTo(true));
        assertThat(Files.exists(shardDirectory(node_3, "test", 0)), equalTo(true));
        assertThat(waitForShardDeletion(node_4, "test", 0), equalTo(false));
    }

    @Test
    public void testShardActiveElsewhereDoesNotDeleteAnother() throws Exception {
        Future<String> masterFuture = internalCluster().startNodeAsync(
                Settings.builder().put("node.master", true, "node.data", false).build());
        Future<List<String>> nodesFutures = internalCluster().startNodesAsync(4,
                Settings.builder().put("node.master", false, "node.data", true).build());

        final String masterNode = masterFuture.get();
        final String node1 = nodesFutures.get().get(0);
        final String node2 = nodesFutures.get().get(1);
        final String node3 = nodesFutures.get().get(2);
        // we will use this node later on; it is handy to start it now so that it gets a different data folder than nodes 1, 2 & 3
        final String node4 = nodesFutures.get().get(3);

        assertAcked(prepareCreate("test").setSettings(Settings.builder()
                        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 3)
                        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1)
                        .put(FilterAllocationDecider.INDEX_ROUTING_EXCLUDE_GROUP + "_name", node4)
        ));
        assertFalse(client().admin().cluster().prepareHealth().setWaitForRelocatingShards(0).setWaitForGreenStatus().setWaitForNodes("5").get().isTimedOut());

        // disable allocation to control the situation more easily
        assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder()
                .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "none")));

        logger.debug("--> shutting down two random nodes");
        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(node1, node2, node3));
        internalCluster().stopRandomNode(InternalTestCluster.nameFilter(node1, node2, node3));

        logger.debug("--> verifying index is red");
        ClusterHealthResponse health = client().admin().cluster().prepareHealth().setWaitForNodes("3").get();
        if (health.getStatus() != ClusterHealthStatus.RED) {
            logClusterState();
            fail("cluster didn't become red despite shutting down 2 of 3 nodes");
        }

        logger.debug("--> allowing index to be assigned to node [{}]", node4);
        assertAcked(client().admin().indices().prepareUpdateSettings("test").setSettings(
                Settings.builder()
                        .put(FilterAllocationDecider.INDEX_ROUTING_EXCLUDE_GROUP + "_name", "NONE")));

        assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder()
                .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "all")));

        logger.debug("--> waiting for shards to recover on [{}]", node4);
        // we have to do this in two steps as we now do async shard fetching before assigning, so the change to the
        // allocation filtering may not have an immediate effect
        // TODO: we should add an easier way to do this. It's too much of a song and dance...
        assertBusy(new Runnable() {
            @Override
            public void run() {
                assertTrue(internalCluster().getInstance(IndicesService.class, node4).hasIndex("test"));
            }
        });

        // wait for 4 active shards - we should have lost one shard
        assertFalse(client().admin().cluster().prepareHealth().setWaitForActiveShards(4).get().isTimedOut());

        // disable allocation again to control concurrency a bit and allow shard active to kick in before allocation
        assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder()
                .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "none")));

        logger.debug("--> starting the two old nodes back");
        internalCluster().startNodesAsync(2,
                Settings.builder().put("node.master", false, "node.data", true).build());

        assertFalse(client().admin().cluster().prepareHealth().setWaitForNodes("5").get().isTimedOut());

        assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder()
                .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE, "all")));

        logger.debug("--> waiting for the lost shard to be recovered");
        ensureGreen("test");
    }

    @Test
    public void testShardActiveElseWhere() throws Exception {
        List<String> nodes = internalCluster().startNodesAsync(2).get();

        final String masterNode = internalCluster().getMasterName();
        final String nonMasterNode = nodes.get(0).equals(masterNode) ? nodes.get(1) : nodes.get(0);

        final String masterId = internalCluster().clusterService(masterNode).localNode().id();
        final String nonMasterId = internalCluster().clusterService(nonMasterNode).localNode().id();

        final int numShards = scaledRandomIntBetween(2, 10);
        assertAcked(prepareCreate("test")
                        .setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0).put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, numShards))
        );
        ensureGreen("test");
        waitNoPendingTasksOnAll();
        ClusterStateResponse stateResponse = client().admin().cluster().prepareState().get();

        RoutingNode routingNode = stateResponse.getState().getRoutingNodes().node(nonMasterId);
        final int[] node2Shards = new int[routingNode.numberOfOwningShards()];
        int i = 0;
        for (ShardRouting shardRouting : routingNode) {
            node2Shards[i] = shardRouting.shardId().id();
            i++;
        }
        logger.info("Node [{}] has shards: {}", nonMasterNode, Arrays.toString(node2Shards));
        final long shardVersions[] = new long[numShards];
        final int shardIds[] = new int[numShards];
        i = 0;
        for (ShardRouting shardRouting : stateResponse.getState().getRoutingTable().allShards("test")) {
            shardVersions[i] = shardRouting.version();
            shardIds[i] = shardRouting.getId();
            i++;
        }

        // disable relocations while we do this, to make sure the shards are not relocated away from node2
        // due to rebalancing, which would delete its content
        client().admin().cluster().prepareUpdateSettings().setTransientSettings(settingsBuilder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE, EnableAllocationDecider.Rebalance.NONE)).get();

        internalCluster().getInstance(ClusterService.class, nonMasterNode).submitStateUpdateTask("test", new ClusterStateUpdateTask(Priority.IMMEDIATE) {
            @Override
            public ClusterState execute(ClusterState currentState) throws Exception {
                IndexRoutingTable.Builder indexRoutingTableBuilder = IndexRoutingTable.builder("test");
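                // fake a cluster state in which every shard of [test] is STARTED on the master node
                // only, so the shard copies held locally by the non-master node are no longer
                // referenced; the test then verifies that this alone does not wipe the local data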
                for (int i = 0; i < numShards; i++) {
                    indexRoutingTableBuilder.addIndexShard(
                            new IndexShardRoutingTable.Builder(new ShardId("test", i))
                                    .addShard(TestShardRouting.newShardRouting("test", i, masterId, true, ShardRoutingState.STARTED, shardVersions[shardIds[i]]))
                                    .build()
                    );
                }
                return ClusterState.builder(currentState)
                        .routingTable(RoutingTable.builder().add(indexRoutingTableBuilder).build())
                        .build();
            }

            @Override
            public boolean runOnlyOnMaster() {
                return false;
            }

            @Override
            public void onFailure(String source, Throwable t) {
            }
        });
        waitNoPendingTasksOnAll();
        logger.info("Checking that shards are not removed");
        for (int shard : node2Shards) {
            assertTrue(waitForShardDeletion(nonMasterNode, "test", shard));
        }
    }

    private Path indexDirectory(String server, String index) {
        NodeEnvironment env = internalCluster().getInstance(NodeEnvironment.class, server);
        final Path[] paths = env.indexPaths(new Index(index));
        assert paths.length == 1;
        return paths[0];
    }

    private Path shardDirectory(String server, String index, int shard) {
        NodeEnvironment env = internalCluster().getInstance(NodeEnvironment.class, server);
        final Path[] paths = env.availableShardPaths(new ShardId(index, shard));
        assert paths.length == 1;
        return paths[0];
    }

    // waits until the shard directory is gone and returns whether it still exists,
    // i.e. a return value of false means the shard was deleted within the wait window
    private boolean waitForShardDeletion(final String server, final String index, final int shard) throws InterruptedException {
        awaitBusy(new Predicate<Object>() {
            @Override
            public boolean apply(Object o) {
                return !Files.exists(shardDirectory(server, index, shard));
            }
        });
        return Files.exists(shardDirectory(server, index, shard));
    }

    // waits until the index directory is gone and returns whether it still exists,
    // i.e. a return value of false means the index was deleted within the wait window
    private boolean waitForIndexDeletion(final String server, final String index) throws InterruptedException {
        awaitBusy(new Predicate<Object>() {
            @Override
            public boolean apply(Object o) {
                return !Files.exists(indexDirectory(server, index));
            }
        });
        return Files.exists(indexDirectory(server, index));
    }

    /**
     * This tracer can be used to signal the start and end of a recovery.
     * It is used to test the following:
     * Whenever a node deletes a shard because it was relocated somewhere else, it first
     * checks if enough other copies are started somewhere else. The node sends a ShardActiveRequest
     * to the other nodes that should have a copy according to the cluster state.
     * The nodes that receive this request check if the shard is in state STARTED, in which case they
     * respond with "true". If they have the shard in POST_RECOVERY they register a cluster state
     * observer that checks at each update if the shard has moved to STARTED.
     * To test that this mechanism actually works, we block cluster state processing when a recovery
     * starts and only unblock it shortly after the node receives the ShardActiveRequest.
     */
    public static class RelocationStartEndTracer extends MockTransportService.Tracer {
        private final ESLogger logger;
        private final CountDownLatch beginRelocationLatch;
        private final CountDownLatch receivedShardExistsRequestLatch;

        public RelocationStartEndTracer(ESLogger logger, CountDownLatch beginRelocationLatch, CountDownLatch receivedShardExistsRequestLatch) {
            this.logger = logger;
            this.beginRelocationLatch = beginRelocationLatch;
            this.receivedShardExistsRequestLatch = receivedShardExistsRequestLatch;
        }

        @Override
        public void receivedRequest(long requestId, String action) {
            if (action.equals(IndicesStore.ACTION_SHARD_EXISTS)) {
                receivedShardExistsRequestLatch.countDown();
                logger.info("received: {}, relocation done", action);
            }
        }

        @Override
        public void requestSent(DiscoveryNode node, long requestId, String action, TransportRequestOptions options) {
            if (action.equals(RecoverySource.Actions.START_RECOVERY)) {
                logger.info("sent: {}, relocation starts", action);
                beginRelocationLatch.countDown();
            }
        }
    }
}