package org.elasticsearch.cluster.routing.allocation; /* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ import org.elasticsearch.Version; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ESAllocationTestCase; import org.elasticsearch.cluster.action.shard.ShardStateAction; import org.elasticsearch.cluster.action.shard.ShardStateAction.ShardEntry; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.cluster.metadata.MetaData; import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.cluster.routing.IndexShardRoutingTable; import org.elasticsearch.cluster.routing.RoutingTable; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.allocation.command.AllocateEmptyPrimaryAllocationCommand; import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.util.set.Sets; import org.junit.Before; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Set; import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING; import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.hasItem; public class InSyncAllocationIdTests extends ESAllocationTestCase { private AllocationService allocation; private ShardStateAction.ShardFailedClusterStateTaskExecutor failedClusterStateTaskExecutor; @Before public void setupAllocationService() { allocation = createAllocationService(); failedClusterStateTaskExecutor = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocation, null, logger); } public void testInSyncAllocationIdsUpdated() { logger.info("creating an index with 1 shard, 2 replicas"); MetaData metaData = MetaData.builder() .put(IndexMetaData.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(2)) // add index metadata where we have no routing nodes to check that allocation ids are not removed .put(IndexMetaData.builder("test-old").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(2) .putInSyncAllocationIds(0, new HashSet<>(Arrays.asList("x", "y")))) .build(); RoutingTable routingTable = RoutingTable.builder() .addAsNew(metaData.index("test")) .build(); ClusterState clusterState = ClusterState.builder(org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING .getDefault(Settings.EMPTY)).metaData(metaData).routingTable(routingTable).build(); logger.info("adding three nodes and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add( newNode("node1")).add(newNode("node2")).add(newNode("node3"))).build(); clusterState = allocation.reroute(clusterState, "reroute"); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(0)); assertThat(clusterState.metaData().index("test-old").inSyncAllocationIds(0), equalTo(new HashSet<>(Arrays.asList("x", "y")))); logger.info("start primary shard"); clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(1)); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(1)); assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).get(0).allocationId().getId(), equalTo(clusterState.metaData().index("test").inSyncAllocationIds(0).iterator().next())); assertThat(clusterState.metaData().index("test-old").inSyncAllocationIds(0), equalTo(new HashSet<>(Arrays.asList("x", "y")))); logger.info("start replica shards"); clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(3)); logger.info("remove a node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove("node1")) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); // in-sync allocation ids should not be updated assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(3)); logger.info("remove all remaining nodes"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove("node2").remove("node3")) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); // in-sync allocation ids should not be updated assertThat(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).size(), equalTo(3)); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(3)); // force empty primary clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .add(newNode("node1"))) .build(); clusterState = allocation.reroute(clusterState, new AllocationCommands(new AllocateEmptyPrimaryAllocationCommand("test", 0, "node1", true)), false, false) .getClusterState(); // check that in-sync allocation ids are reset by forcing an empty primary assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(0)); logger.info("start primary shard"); clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(1)); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(1)); logger.info("fail primary shard"); ShardRouting startedPrimary = clusterState.getRoutingNodes().shardsWithState(STARTED).get(0); clusterState = allocation.applyFailedShard(clusterState, startedPrimary); assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(0)); assertEquals(Collections.singleton(startedPrimary.allocationId().getId()), clusterState.metaData().index("test").inSyncAllocationIds(0)); } /** * Assume following scenario: indexing request is written to primary, but fails to be replicated to active replica. * The primary instructs master to fail replica before acknowledging write to client. In the meanwhile, the node of the replica was * removed from the cluster (deassociateDeadNodes). This means that the ShardRouting of the replica was failed, but it's allocation * id is still part of the in-sync set. We have to make sure that the failShard request from the primary removes the allocation id * from the in-sync set. */ public void testDeadNodesBeforeReplicaFailed() throws Exception { ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation); logger.info("remove replica node"); IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0); ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove(replicaShard.currentNodeId())) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(2)); logger.info("fail replica (for which there is no shard routing in the CS anymore)"); assertNull(clusterState.getRoutingNodes().getByAllocationId(replicaShard.shardId(), replicaShard.allocationId().getId())); ShardStateAction.ShardFailedClusterStateTaskExecutor failedClusterStateTaskExecutor = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocation, null, logger); long primaryTerm = clusterState.metaData().index("test").primaryTerm(0); clusterState = failedClusterStateTaskExecutor.execute(clusterState, Arrays.asList( new ShardEntry(shardRoutingTable.shardId(), replicaShard.allocationId().getId(), primaryTerm, "dummy", null)) ).resultingState; assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(1)); } /** * Assume following scenario: indexing request is written to primary, but fails to be replicated to active replica. * The primary instructs master to fail replica before acknowledging write to client. In the meanwhile, primary fails for an unrelated * reason. Master now batches both requests to fail primary and replica. We have to make sure that only the allocation id of the primary * is kept in the in-sync allocation set before we acknowledge request to client. Otherwise we would acknowledge a write that made it * into the primary but not the replica but the replica is still considered non-stale. */ public void testPrimaryFailureBatchedWithReplicaFailure() throws Exception { ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation); IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0); ShardRouting primaryShard = shardRoutingTable.primaryShard(); ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0); long primaryTerm = clusterState.metaData().index("test").primaryTerm(0); List<ShardEntry> failureEntries = new ArrayList<>(); failureEntries.add(new ShardEntry( shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null)); failureEntries.add(new ShardEntry( shardRoutingTable.shardId(), replicaShard.allocationId().getId(), primaryTerm, "dummy", null)); Collections.shuffle(failureEntries, random()); logger.info("Failing {}", failureEntries); clusterState = failedClusterStateTaskExecutor.execute(clusterState, failureEntries).resultingState; assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0), equalTo(Collections.singleton(primaryShard.allocationId().getId()))); // resend shard failures to check if they are ignored clusterState = failedClusterStateTaskExecutor.execute(clusterState, failureEntries).resultingState; assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0), equalTo(Collections.singleton(primaryShard.allocationId().getId()))); } /** * Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary * but repeatedly shut down nodes that have active replicas. * We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set */ public void testInSyncIdsNotGrowingWithoutBounds() throws Exception { ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation); Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0); assertThat(inSyncSet.size(), equalTo(2)); IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0); ShardRouting primaryShard = shardRoutingTable.primaryShard(); ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0); logger.info("remove a node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove(replicaShard.currentNodeId())) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); // in-sync allocation ids should not be updated assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); // check that inSyncAllocationIds can not grow without bounds for (int i = 0; i < 5; i++) { logger.info("add back node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .add(newNode(replicaShard.currentNodeId()))) .build(); clusterState = allocation.reroute(clusterState, "reroute"); logger.info("start replica shards"); clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); logger.info("remove the node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove(replicaShard.currentNodeId())) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); } // in-sync allocation set is bounded Set<String> newInSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0); assertThat(newInSyncSet.size(), equalTo(2)); // only allocation id of replica was changed assertFalse(Sets.haveEmptyIntersection(inSyncSet, newInSyncSet)); assertThat(newInSyncSet, hasItem(primaryShard.allocationId().getId())); } /** * Only trim set of allocation ids when the set grows */ public void testInSyncIdsNotTrimmedWhenNotGrowing() throws Exception { ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation); Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0); assertThat(inSyncSet.size(), equalTo(2)); IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0); ShardRouting primaryShard = shardRoutingTable.primaryShard(); ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0); logger.info("remove replica node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove(replicaShard.currentNodeId())) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); // in-sync allocation ids should not be updated assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); logger.info("remove primary node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove(primaryShard.currentNodeId())) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); // in-sync allocation ids should not be updated assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); logger.info("decrease number of replicas to 0"); clusterState = ClusterState.builder(clusterState) .routingTable(RoutingTable.builder(clusterState.routingTable()).updateNumberOfReplicas(0, "test").build()) .metaData(MetaData.builder(clusterState.metaData()).updateNumberOfReplicas(0, "test")).build(); logger.info("add back node 1"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add( newNode("node1"))).build(); clusterState = allocation.reroute(clusterState, "reroute"); assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(1)); // in-sync allocation ids should not be updated assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); logger.info("start primary shard"); clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); // in-sync allocation ids should not be updated assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); } /** * Don't remove allocation id of failed active primary if there is no replica to promote as primary. */ public void testPrimaryAllocationIdNotRemovedFromInSyncSetWhenNoFailOver() throws Exception { ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation); Set<String> inSyncSet = clusterState.metaData().index("test").inSyncAllocationIds(0); assertThat(inSyncSet.size(), equalTo(2)); IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0); ShardRouting primaryShard = shardRoutingTable.primaryShard(); ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0); logger.info("remove replica node"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()) .remove(replicaShard.currentNodeId())) .build(); clusterState = allocation.deassociateDeadNodes(clusterState, true, "reroute"); // in-sync allocation ids should not be updated assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); logger.info("fail primary shard"); clusterState = failedClusterStateTaskExecutor.execute(clusterState, Collections.singletonList(new ShardEntry( shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null))).resultingState; assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(0)); // in-sync allocation ids should not be updated assertEquals(inSyncSet, clusterState.metaData().index("test").inSyncAllocationIds(0)); } private ClusterState createOnePrimaryOneReplicaClusterState(AllocationService allocation) { logger.info("creating an index with 1 shard, 1 replica"); MetaData metaData = MetaData.builder() .put(IndexMetaData.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1)) .build(); RoutingTable routingTable = RoutingTable.builder() .addAsNew(metaData.index("test")) .build(); ClusterState clusterState = ClusterState.builder(org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING .getDefault(Settings.EMPTY)).metaData(metaData).routingTable(routingTable).build(); logger.info("adding two nodes and performing rerouting"); clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add( newNode("node1")).add(newNode("node2"))).build(); clusterState = allocation.reroute(clusterState, "reroute"); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(0)); logger.info("start primary shard"); clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(1)); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(1)); assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).get(0).allocationId().getId(), equalTo(clusterState.metaData().index("test").inSyncAllocationIds(0).iterator().next())); logger.info("start replica shard"); clusterState = allocation.applyStartedShards(clusterState, clusterState.getRoutingNodes().shardsWithState(INITIALIZING)); assertThat(clusterState.metaData().index("test").inSyncAllocationIds(0).size(), equalTo(2)); return clusterState; } }