package org.elasticsearch.cluster.routing;
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import com.carrotsearch.hppc.cursors.IntObjectCursor;
import org.elasticsearch.action.admin.cluster.reroute.ClusterRerouteRequestBuilder;
import org.elasticsearch.action.admin.indices.shards.IndicesShardStoresResponse;
import org.elasticsearch.action.support.ActiveShardCount;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.allocation.command.AllocateEmptyPrimaryAllocationCommand;
import org.elasticsearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand;
import org.elasticsearch.common.collect.ImmutableOpenIntMap;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.gateway.GatewayAllocator;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.InternalTestCluster;
import org.elasticsearch.test.discovery.TestZenDiscovery;
import org.elasticsearch.test.disruption.NetworkDisruption;
import org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect;
import org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions;
import org.elasticsearch.test.transport.MockTransportService;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutionException;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount;
import static org.hamcrest.Matchers.empty;
import static org.hamcrest.Matchers.equalTo;
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
public class PrimaryAllocationIT extends ESIntegTestCase {
@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
// disruption tests need MockTransportService
return Arrays.asList(MockTransportService.TestPlugin.class);
}
@Override
protected Settings nodeSettings(int nodeOrdinal) {
return Settings.builder().put(super.nodeSettings(nodeOrdinal))
.put(TestZenDiscovery.USE_MOCK_PINGS.getKey(), false).build();
}
private void createStaleReplicaScenario() throws Exception {
logger.info("--> starting 3 nodes, 1 master, 2 data");
String master = internalCluster().startMasterOnlyNode(Settings.EMPTY);
internalCluster().startDataOnlyNodes(2);
assertAcked(client().admin().indices().prepareCreate("test").setSettings(Settings.builder()
.put("index.number_of_shards", 1).put("index.number_of_replicas", 1)).get());
ensureGreen();
logger.info("--> indexing...");
client().prepareIndex("test", "type1").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
refresh();
ClusterState state = client().admin().cluster().prepareState().all().get().getState();
List<ShardRouting> shards = state.routingTable().allShards("test");
assertThat(shards.size(), equalTo(2));
final String primaryNode;
final String replicaNode;
if (shards.get(0).primary()) {
primaryNode = state.getRoutingNodes().node(shards.get(0).currentNodeId()).node().getName();
replicaNode = state.getRoutingNodes().node(shards.get(1).currentNodeId()).node().getName();
} else {
primaryNode = state.getRoutingNodes().node(shards.get(1).currentNodeId()).node().getName();
replicaNode = state.getRoutingNodes().node(shards.get(0).currentNodeId()).node().getName();
}
NetworkDisruption partition = new NetworkDisruption(
new TwoPartitions(Sets.newHashSet(master, replicaNode), Collections.singleton(primaryNode)),
new NetworkDisconnect());
internalCluster().setDisruptionScheme(partition);
logger.info("--> partitioning node with primary shard from rest of cluster");
partition.startDisrupting();
ensureStableCluster(2, master);
logger.info("--> index a document into previous replica shard (that is now primary)");
client(replicaNode).prepareIndex("test", "type1").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
logger.info("--> shut down node that has new acknowledged document");
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
ensureStableCluster(1, master);
partition.stopDisrupting();
logger.info("--> waiting for node with old primary shard to rejoin the cluster");
ensureStableCluster(2, master);
logger.info("--> check that old primary shard does not get promoted to primary again");
// kick reroute and wait for all shard states to be fetched
client(master).admin().cluster().prepareReroute().get();
assertBusy(() -> assertThat(internalCluster().getInstance(GatewayAllocator.class, master).getNumberOfInFlightFetch(), equalTo(0)));
// kick reroute a second time and check that all shards are unassigned
assertThat(client(master).admin().cluster().prepareReroute().get().getState().getRoutingNodes().unassigned().size(), equalTo(2));
}
public void testDoNotAllowStaleReplicasToBePromotedToPrimary() throws Exception {
createStaleReplicaScenario();
logger.info("--> starting node that reuses data folder with the up-to-date primary shard");
internalCluster().startDataOnlyNode(Settings.EMPTY);
logger.info("--> check that the up-to-date primary shard gets promoted and that documents are available");
ensureYellow("test");
assertHitCount(client().prepareSearch().setSize(0).setQuery(matchAllQuery()).get(), 2L);
}
public void testFailedAllocationOfStalePrimaryToDataNodeWithNoData() throws Exception {
String dataNodeWithShardCopy = internalCluster().startNode();
logger.info("--> create single shard index");
assertAcked(client().admin().indices().prepareCreate("test").setSettings(Settings.builder()
.put("index.number_of_shards", 1).put("index.number_of_replicas", 0)).get());
ensureGreen("test");
String dataNodeWithNoShardCopy = internalCluster().startNode();
ensureStableCluster(2);
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(dataNodeWithShardCopy));
ensureStableCluster(1);
assertThat(client().admin().cluster().prepareState().get().getState().getRoutingTable().index("test").getShards().get(0).primaryShard().unassignedInfo().getReason(), equalTo(UnassignedInfo.Reason.NODE_LEFT));
logger.info("--> force allocation of stale copy to node that does not have shard copy");
client().admin().cluster().prepareReroute().add(new AllocateStalePrimaryAllocationCommand("test", 0, dataNodeWithNoShardCopy, true)).get();
logger.info("--> wait until shard is failed and becomes unassigned again");
assertBusy(() ->
assertTrue(client().admin().cluster().prepareState().get().getState().toString(),
client().admin().cluster().prepareState().get().getState().getRoutingTable().index("test").allPrimaryShardsUnassigned()));
assertThat(client().admin().cluster().prepareState().get().getState().getRoutingTable().index("test").getShards().get(0).primaryShard().unassignedInfo().getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
}
public void testForceStaleReplicaToBePromotedToPrimary() throws Exception {
boolean useStaleReplica = randomBoolean(); // if true, use stale replica, otherwise a completely empty copy
createStaleReplicaScenario();
logger.info("--> explicitly promote old primary shard");
final String idxName = "test";
ImmutableOpenIntMap<List<IndicesShardStoresResponse.StoreStatus>> storeStatuses = client().admin().indices().prepareShardStores(idxName).get().getStoreStatuses().get(idxName);
ClusterRerouteRequestBuilder rerouteBuilder = client().admin().cluster().prepareReroute();
for (IntObjectCursor<List<IndicesShardStoresResponse.StoreStatus>> shardStoreStatuses : storeStatuses) {
int shardId = shardStoreStatuses.key;
IndicesShardStoresResponse.StoreStatus storeStatus = randomFrom(shardStoreStatuses.value);
logger.info("--> adding allocation command for shard {}", shardId);
// force allocation based on node id
if (useStaleReplica) {
rerouteBuilder.add(new AllocateStalePrimaryAllocationCommand(idxName, shardId, storeStatus.getNode().getId(), true));
} else {
rerouteBuilder.add(new AllocateEmptyPrimaryAllocationCommand(idxName, shardId, storeStatus.getNode().getId(), true));
}
}
rerouteBuilder.get();
logger.info("--> check that the stale primary shard gets allocated and that documents are available");
ensureYellow(idxName);
if (useStaleReplica == false) {
// When invoking AllocateEmptyPrimaryAllocationCommand, due to the UnassignedInfo.Reason being changed to INDEX_CREATION,
// its possible that the shard has not completed initialization, even though the cluster health is yellow, so the
// search can throw an "all shards failed" exception. We will wait until the shard initialization has completed before
// verifying the search hit count.
assertBusy(() -> assertTrue(client().admin().cluster().prepareState().get()
.getState().routingTable().index(idxName).allPrimaryShardsActive()));
}
assertHitCount(client().prepareSearch(idxName).setSize(0).setQuery(matchAllQuery()).get(), useStaleReplica ? 1L : 0L);
// allocation id of old primary was cleaned from the in-sync set
ClusterState state = client().admin().cluster().prepareState().get().getState();
assertEquals(Collections.singleton(state.routingTable().index(idxName).shard(0).primary.allocationId().getId()),
state.metaData().index(idxName).inSyncAllocationIds(0));
}
public void testForcePrimaryShardIfAllocationDecidersSayNoAfterIndexCreation() throws ExecutionException, InterruptedException {
String node = internalCluster().startNode();
client().admin().indices().prepareCreate("test").setWaitForActiveShards(ActiveShardCount.NONE).setSettings(Settings.builder()
.put("index.routing.allocation.exclude._name", node)
.put("index.number_of_shards", 1).put("index.number_of_replicas", 0)).get();
assertThat(client().admin().cluster().prepareState().get().getState().getRoutingTable().shardRoutingTable("test", 0).assignedShards(), empty());
client().admin().cluster().prepareReroute().add(new AllocateEmptyPrimaryAllocationCommand("test", 0, node, true)).get();
ensureGreen("test");
}
public void testDoNotRemoveAllocationIdOnNodeLeave() throws Exception {
internalCluster().startMasterOnlyNode(Settings.EMPTY);
internalCluster().startDataOnlyNode(Settings.EMPTY);
assertAcked(client().admin().indices().prepareCreate("test").setSettings(Settings.builder()
.put("index.number_of_shards", 1).put("index.number_of_replicas", 1).put("index.unassigned.node_left.delayed_timeout", "0ms")).get());
String replicaNode = internalCluster().startDataOnlyNode(Settings.EMPTY);
ensureGreen("test");
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
ensureYellow("test");
assertEquals(2, client().admin().cluster().prepareState().get().getState().metaData().index("test").inSyncAllocationIds(0).size());
internalCluster().restartRandomDataNode(new InternalTestCluster.RestartCallback() {
@Override
public boolean clearData(String nodeName) {
return true;
}
});
logger.info("--> wait until shard is failed and becomes unassigned again");
assertBusy(() -> assertTrue(client().admin().cluster().prepareState().get().getState().getRoutingTable().index("test").allPrimaryShardsUnassigned()));
assertEquals(2, client().admin().cluster().prepareState().get().getState().metaData().index("test").inSyncAllocationIds(0).size());
logger.info("--> starting node that reuses data folder with the up-to-date shard");
internalCluster().startDataOnlyNode(Settings.EMPTY);
ensureGreen("test");
}
public void testRemoveAllocationIdOnWriteAfterNodeLeave() throws Exception {
internalCluster().startMasterOnlyNode(Settings.EMPTY);
internalCluster().startDataOnlyNode(Settings.EMPTY);
assertAcked(client().admin().indices().prepareCreate("test").setSettings(Settings.builder()
.put("index.number_of_shards", 1).put("index.number_of_replicas", 1).put("index.unassigned.node_left.delayed_timeout", "0ms")).get());
String replicaNode = internalCluster().startDataOnlyNode(Settings.EMPTY);
ensureGreen("test");
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
ensureYellow("test");
assertEquals(2, client().admin().cluster().prepareState().get().getState().metaData().index("test").inSyncAllocationIds(0).size());
logger.info("--> indexing...");
client().prepareIndex("test", "type1").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
assertEquals(1, client().admin().cluster().prepareState().get().getState().metaData().index("test").inSyncAllocationIds(0).size());
internalCluster().restartRandomDataNode(new InternalTestCluster.RestartCallback() {
@Override
public boolean clearData(String nodeName) {
return true;
}
});
logger.info("--> wait until shard is failed and becomes unassigned again");
assertBusy(() -> assertTrue(client().admin().cluster().prepareState().get().getState().getRoutingTable().index("test").allPrimaryShardsUnassigned()));
assertEquals(1, client().admin().cluster().prepareState().get().getState().metaData().index("test").inSyncAllocationIds(0).size());
logger.info("--> starting node that reuses data folder with the up-to-date shard");
internalCluster().startDataOnlyNode(Settings.EMPTY);
assertBusy(() -> assertTrue(client().admin().cluster().prepareState().get().getState().getRoutingTable().index("test").allPrimaryShardsUnassigned()));
}
public void testNotWaitForQuorumCopies() throws Exception {
logger.info("--> starting 3 nodes");
internalCluster().startNodes(3);
logger.info("--> creating index with 1 primary and 2 replicas");
assertAcked(client().admin().indices().prepareCreate("test").setSettings(Settings.builder()
.put("index.number_of_shards", randomIntBetween(1, 3)).put("index.number_of_replicas", 2)).get());
ensureGreen("test");
client().prepareIndex("test", "type1").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
logger.info("--> removing 2 nodes from cluster");
internalCluster().stopRandomDataNode();
internalCluster().stopRandomDataNode();
internalCluster().fullRestart();
logger.info("--> checking that index still gets allocated with only 1 shard copy being available");
ensureYellow("test");
assertHitCount(client().prepareSearch().setSize(0).setQuery(matchAllQuery()).get(), 1L);
}
/**
* This test ensures that for an unassigned primary shard that has a valid shard copy on at least one node,
* we will force allocate the primary shard to one of those nodes, even if the allocation deciders all return
* a NO decision to allocate.
*/
public void testForceAllocatePrimaryOnNoDecision() throws Exception {
logger.info("--> starting 1 node");
final String node = internalCluster().startNode();
logger.info("--> creating index with 1 primary and 0 replicas");
final String indexName = "test-idx";
assertAcked(client().admin().indices()
.prepareCreate(indexName)
.setSettings(Settings.builder().put(IndexMetaData.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1)
.put(IndexMetaData.INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0))
.get());
logger.info("--> update the settings to prevent allocation to the data node");
assertTrue(client().admin().indices().prepareUpdateSettings(indexName)
.setSettings(Settings.builder().put(IndexMetaData.INDEX_ROUTING_EXCLUDE_GROUP_SETTING.getKey() + "_name", node))
.get()
.isAcknowledged());
logger.info("--> full cluster restart");
internalCluster().fullRestart();
logger.info("--> checking that the primary shard is force allocated to the data node despite being blocked by the exclude filter");
ensureGreen(indexName);
assertEquals(1, client().admin().cluster().prepareState().get().getState()
.routingTable().index(indexName).shardsWithState(ShardRoutingState.STARTED).size());
}
}