package org.elasticsearch.cluster.routing.allocation;
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ESAllocationTestCase;
import org.elasticsearch.cluster.action.shard.ShardStateAction;
import org.elasticsearch.cluster.action.shard.ShardStateAction.ShardEntry;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.command.AllocateEmptyPrimaryAllocationCommand;
import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.set.Sets;
import org.junit.Before;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED;
import static org.elasticsearch.cluster.routing.ShardRoutingState.UNASSIGNED;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
/**
 * Tests how the in-sync allocation id set stored in the index metadata evolves while shards are
 * started and failed and while nodes leave and rejoin the cluster.
 */
public class InSyncAllocationIdTests extends ESAllocationTestCase {

    private AllocationService allocation;
    private ShardStateAction.ShardFailedClusterStateTaskExecutor failedClusterStateTaskExecutor;

    /**
     * Creates a fresh allocation service, and a shard-failed task executor backed by it, before each test.
     */
    @Before
    public void setupAllocationService() {
        allocation = createAllocationService();
        failedClusterStateTaskExecutor = new ShardStateAction.ShardFailedClusterStateTaskExecutor(allocation, null, logger);
    }

    /**
     * Walks an index with one shard and two replicas through initial allocation, shard starts, node removals,
     * a forced empty primary allocation and a primary failure, checking the in-sync allocation id set after each step.
     */
    public void testInSyncAllocationIdsUpdated() {
        logger.info("creating an index with 1 shard, 2 replicas");
        // expected in-sync set of the "test-old" index, which must stay untouched throughout
        Set<String> expectedOldIndexIds = new HashSet<>(Arrays.asList("x", "y"));
        MetaData metaData = MetaData.builder()
            .put(IndexMetaData.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(2))
            // add index metadata where we have no routing nodes to check that allocation ids are not removed
            .put(IndexMetaData.builder("test-old").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(2)
                .putInSyncAllocationIds(0, new HashSet<>(Arrays.asList("x", "y"))))
            .build();
        RoutingTable routingTable = RoutingTable.builder()
            .addAsNew(metaData.index("test"))
            .build();
        ClusterState clusterState = initialClusterState(metaData, routingTable);

        logger.info("adding three nodes and performing rerouting");
        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder()
            .add(newNode("node1")).add(newNode("node2")).add(newNode("node3"))).build();
        clusterState = allocation.reroute(clusterState, "reroute");

        assertThat(inSyncIds(clusterState).size(), equalTo(0));
        assertThat(clusterState.metaData().index("test-old").inSyncAllocationIds(0), equalTo(expectedOldIndexIds));

        logger.info("start primary shard");
        clusterState = startAllInitializing(allocation, clusterState);

        assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(1));
        assertThat(inSyncIds(clusterState).size(), equalTo(1));
        assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).get(0).allocationId().getId(),
            equalTo(inSyncIds(clusterState).iterator().next()));
        assertThat(clusterState.metaData().index("test-old").inSyncAllocationIds(0), equalTo(expectedOldIndexIds));

        logger.info("start replica shards");
        clusterState = startAllInitializing(allocation, clusterState);

        assertThat(inSyncIds(clusterState).size(), equalTo(3));

        logger.info("remove a node");
        clusterState = removeNodesAndDeassociate(allocation, clusterState, "node1");

        // in-sync allocation ids should not be updated
        assertThat(inSyncIds(clusterState).size(), equalTo(3));

        logger.info("remove all remaining nodes");
        clusterState = removeNodesAndDeassociate(allocation, clusterState, "node2", "node3");

        // in-sync allocation ids should not be updated
        assertThat(clusterState.getRoutingTable().shardsWithState(UNASSIGNED).size(), equalTo(3));
        assertThat(inSyncIds(clusterState).size(), equalTo(3));

        // force empty primary
        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
            .add(newNode("node1")))
            .build();
        clusterState = allocation.reroute(clusterState,
            new AllocationCommands(new AllocateEmptyPrimaryAllocationCommand("test", 0, "node1", true)), false, false)
            .getClusterState();

        // check that in-sync allocation ids are reset by forcing an empty primary
        assertThat(inSyncIds(clusterState).size(), equalTo(0));

        logger.info("start primary shard");
        clusterState = startAllInitializing(allocation, clusterState);

        assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(1));
        assertThat(inSyncIds(clusterState).size(), equalTo(1));

        logger.info("fail primary shard");
        ShardRouting startedPrimary = clusterState.getRoutingNodes().shardsWithState(STARTED).get(0);
        clusterState = allocation.applyFailedShard(clusterState, startedPrimary);

        assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(0));
        assertEquals(Collections.singleton(startedPrimary.allocationId().getId()), inSyncIds(clusterState));
    }

    /**
     * Assume following scenario: indexing request is written to primary, but fails to be replicated to active replica.
     * The primary instructs master to fail replica before acknowledging write to client. In the meanwhile, the node of the replica was
     * removed from the cluster (deassociateDeadNodes). This means that the ShardRouting of the replica was failed, but its allocation
     * id is still part of the in-sync set. We have to make sure that the failShard request from the primary removes the allocation id
     * from the in-sync set.
     */
    public void testDeadNodesBeforeReplicaFailed() throws Exception {
        ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);

        logger.info("remove replica node");
        IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
        ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);
        clusterState = removeNodesAndDeassociate(allocation, clusterState, replicaShard.currentNodeId());
        assertThat(inSyncIds(clusterState).size(), equalTo(2));

        logger.info("fail replica (for which there is no shard routing in the CS anymore)");
        assertNull(clusterState.getRoutingNodes().getByAllocationId(replicaShard.shardId(), replicaShard.allocationId().getId()));
        long primaryTerm = clusterState.metaData().index("test").primaryTerm(0);
        // uses the executor created in setupAllocationService(), like the other tests in this class
        clusterState = failedClusterStateTaskExecutor.execute(clusterState, Collections.singletonList(
            new ShardEntry(shardRoutingTable.shardId(), replicaShard.allocationId().getId(), primaryTerm, "dummy", null))
        ).resultingState;

        assertThat(inSyncIds(clusterState).size(), equalTo(1));
    }

    /**
     * Assume following scenario: indexing request is written to primary, but fails to be replicated to active replica.
     * The primary instructs master to fail replica before acknowledging write to client. In the meanwhile, primary fails for an unrelated
     * reason. Master now batches both requests to fail primary and replica. We have to make sure that only the allocation id of the primary
     * is kept in the in-sync allocation set before we acknowledge request to client. Otherwise we would acknowledge a write that made it
     * into the primary but not the replica but the replica is still considered non-stale.
     */
    public void testPrimaryFailureBatchedWithReplicaFailure() throws Exception {
        ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);

        IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
        ShardRouting primaryShard = shardRoutingTable.primaryShard();
        ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);

        long primaryTerm = clusterState.metaData().index("test").primaryTerm(0);

        List<ShardEntry> failureEntries = new ArrayList<>();
        failureEntries.add(new ShardEntry(
            shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null));
        failureEntries.add(new ShardEntry(
            shardRoutingTable.shardId(), replicaShard.allocationId().getId(), primaryTerm, "dummy", null));
        // the two failures are submitted in random order
        Collections.shuffle(failureEntries, random());
        logger.info("Failing {}", failureEntries);

        clusterState = failedClusterStateTaskExecutor.execute(clusterState, failureEntries).resultingState;

        assertThat(inSyncIds(clusterState), equalTo(Collections.singleton(primaryShard.allocationId().getId())));

        // resend shard failures to check if they are ignored
        clusterState = failedClusterStateTaskExecutor.execute(clusterState, failureEntries).resultingState;

        assertThat(inSyncIds(clusterState), equalTo(Collections.singleton(primaryShard.allocationId().getId())));
    }

    /**
     * Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
     * but repeatedly shut down nodes that have active replicas.
     * We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
     */
    public void testInSyncIdsNotGrowingWithoutBounds() throws Exception {
        ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);

        Set<String> inSyncSet = inSyncIds(clusterState);
        assertThat(inSyncSet.size(), equalTo(2));

        IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
        ShardRouting primaryShard = shardRoutingTable.primaryShard();
        ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);

        logger.info("remove a node");
        clusterState = removeNodesAndDeassociate(allocation, clusterState, replicaShard.currentNodeId());

        // in-sync allocation ids should not be updated
        assertEquals(inSyncSet, inSyncIds(clusterState));

        // check that inSyncAllocationIds can not grow without bounds
        for (int i = 0; i < 5; i++) {
            logger.info("add back node");
            clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes())
                .add(newNode(replicaShard.currentNodeId())))
                .build();
            clusterState = allocation.reroute(clusterState, "reroute");

            logger.info("start replica shards");
            clusterState = startAllInitializing(allocation, clusterState);

            logger.info("remove the node");
            clusterState = removeNodesAndDeassociate(allocation, clusterState, replicaShard.currentNodeId());
        }

        // in-sync allocation set is bounded
        Set<String> newInSyncSet = inSyncIds(clusterState);
        assertThat(newInSyncSet.size(), equalTo(2));
        // only allocation id of replica was changed
        assertFalse(Sets.haveEmptyIntersection(inSyncSet, newInSyncSet));
        assertThat(newInSyncSet, hasItem(primaryShard.allocationId().getId()));
    }

    /**
     * Only trim set of allocation ids when the set grows
     */
    public void testInSyncIdsNotTrimmedWhenNotGrowing() throws Exception {
        ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);

        Set<String> inSyncSet = inSyncIds(clusterState);
        assertThat(inSyncSet.size(), equalTo(2));

        IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
        ShardRouting primaryShard = shardRoutingTable.primaryShard();
        ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);

        logger.info("remove replica node");
        clusterState = removeNodesAndDeassociate(allocation, clusterState, replicaShard.currentNodeId());

        // in-sync allocation ids should not be updated
        assertEquals(inSyncSet, inSyncIds(clusterState));

        logger.info("remove primary node");
        clusterState = removeNodesAndDeassociate(allocation, clusterState, primaryShard.currentNodeId());

        // in-sync allocation ids should not be updated
        assertEquals(inSyncSet, inSyncIds(clusterState));

        logger.info("decrease number of replicas to 0");
        clusterState = ClusterState.builder(clusterState)
            .routingTable(RoutingTable.builder(clusterState.routingTable()).updateNumberOfReplicas(0, "test").build())
            .metaData(MetaData.builder(clusterState.metaData()).updateNumberOfReplicas(0, "test")).build();

        logger.info("add back node 1");
        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(
            newNode("node1"))).build();
        clusterState = allocation.reroute(clusterState, "reroute");

        assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(1));
        // in-sync allocation ids should not be updated
        assertEquals(inSyncSet, inSyncIds(clusterState));

        logger.info("start primary shard");
        clusterState = startAllInitializing(allocation, clusterState);
        // in-sync allocation ids should not be updated
        assertEquals(inSyncSet, inSyncIds(clusterState));
    }

    /**
     * Don't remove allocation id of failed active primary if there is no replica to promote as primary.
     */
    public void testPrimaryAllocationIdNotRemovedFromInSyncSetWhenNoFailOver() throws Exception {
        ClusterState clusterState = createOnePrimaryOneReplicaClusterState(allocation);

        Set<String> inSyncSet = inSyncIds(clusterState);
        assertThat(inSyncSet.size(), equalTo(2));

        IndexShardRoutingTable shardRoutingTable = clusterState.routingTable().index("test").shard(0);
        ShardRouting primaryShard = shardRoutingTable.primaryShard();
        ShardRouting replicaShard = shardRoutingTable.replicaShards().get(0);

        logger.info("remove replica node");
        clusterState = removeNodesAndDeassociate(allocation, clusterState, replicaShard.currentNodeId());

        // in-sync allocation ids should not be updated
        assertEquals(inSyncSet, inSyncIds(clusterState));

        logger.info("fail primary shard");
        clusterState = failedClusterStateTaskExecutor.execute(clusterState, Collections.singletonList(new ShardEntry(
            shardRoutingTable.shardId(), primaryShard.allocationId().getId(), 0L, "dummy", null))).resultingState;

        assertThat(clusterState.routingTable().index("test").shard(0).assignedShards().size(), equalTo(0));

        // in-sync allocation ids should not be updated
        assertEquals(inSyncSet, inSyncIds(clusterState));
    }

    /**
     * Builds a cluster state for index "test" with one shard and one replica on nodes "node1" and "node2",
     * starts the primary and then the replica, and asserts the in-sync set size after each step.
     *
     * @param allocation the allocation service used for rerouting and for starting shards
     * @return the cluster state after both shard copies have been started
     */
    private ClusterState createOnePrimaryOneReplicaClusterState(AllocationService allocation) {
        logger.info("creating an index with 1 shard, 1 replica");
        MetaData metaData = MetaData.builder()
            .put(IndexMetaData.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1))
            .build();
        RoutingTable routingTable = RoutingTable.builder()
            .addAsNew(metaData.index("test"))
            .build();
        ClusterState clusterState = initialClusterState(metaData, routingTable);

        logger.info("adding two nodes and performing rerouting");
        clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(
            newNode("node1")).add(newNode("node2"))).build();
        clusterState = allocation.reroute(clusterState, "reroute");

        assertThat(inSyncIds(clusterState).size(), equalTo(0));

        logger.info("start primary shard");
        clusterState = startAllInitializing(allocation, clusterState);

        assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).size(), equalTo(1));
        assertThat(inSyncIds(clusterState).size(), equalTo(1));
        assertThat(clusterState.getRoutingTable().shardsWithState(STARTED).get(0).allocationId().getId(),
            equalTo(inSyncIds(clusterState).iterator().next()));

        logger.info("start replica shard");
        clusterState = startAllInitializing(allocation, clusterState);

        assertThat(inSyncIds(clusterState).size(), equalTo(2));
        return clusterState;
    }

    /**
     * Builds a node-less cluster state, named with the default cluster name, from the given metadata and routing table.
     */
    private static ClusterState initialClusterState(MetaData metaData, RoutingTable routingTable) {
        return ClusterState.builder(org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
            .metaData(metaData)
            .routingTable(routingTable)
            .build();
    }

    /**
     * Returns the in-sync allocation id set recorded in the metadata for shard 0 of index "test".
     */
    private static Set<String> inSyncIds(ClusterState state) {
        return state.metaData().index("test").inSyncAllocationIds(0);
    }

    /**
     * Applies "shard started" to every shard that is currently in the INITIALIZING state.
     */
    private static ClusterState startAllInitializing(AllocationService service, ClusterState state) {
        return service.applyStartedShards(state, state.getRoutingNodes().shardsWithState(INITIALIZING));
    }

    /**
     * Drops the given nodes from the cluster state and then lets the allocation service
     * deassociate the shards that were assigned to them.
     *
     * @param service the allocation service to call {@code deassociateDeadNodes} on
     * @param state   the cluster state to remove the nodes from
     * @param nodeIds ids of the nodes to remove
     * @return the cluster state returned by {@code deassociateDeadNodes}
     */
    private static ClusterState removeNodesAndDeassociate(AllocationService service, ClusterState state, String... nodeIds) {
        DiscoveryNodes.Builder remainingNodes = DiscoveryNodes.builder(state.nodes());
        for (String nodeId : nodeIds) {
            remainingNodes.remove(nodeId);
        }
        ClusterState withoutNodes = ClusterState.builder(state).nodes(remainingNodes).build();
        return service.deassociateDeadNodes(withoutNodes, true, "reroute");
    }
}