/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.routing.allocation;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.elasticsearch.cluster.ClusterInfoService;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.health.ClusterHealthStatus;
import org.elasticsearch.cluster.health.ClusterStateHealth;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.RoutingNodes;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands;
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import com.carrotsearch.hppc.cursors.ObjectCursor;
/**
* This service manages the node allocation of a cluster. The allocation logic
* that relies on {@link AllocationDeciders} to choose nodes for shard
* allocation, and that handles new nodes joining the cluster and rerouting of
* shards, is largely commented out or unreferenced in this class; the active
* code paths rebuild the routing table through the {@link ClusterService}.
*/
public class AllocationService extends AbstractComponent {
// Injected cluster info service; within this class it is only referenced from commented-out allocation code.
private final ClusterInfoService clusterInfoService;
// Injected cluster service; used to resolve the local node, publish shard routing state and build routing tables.
private final ClusterService clusterService;
/**
 * Creates the allocation service and stores its collaborators.
 *
 * @param settings           component settings, passed on to {@link AbstractComponent}
 * @param clusterService     cluster service retained for local node lookup and routing table construction
 * @param clusterInfoService cluster info service retained as a field
 */
@Inject
public AllocationService(Settings settings, ClusterService clusterService, ClusterInfoService clusterInfoService) {
    super(settings);
    this.clusterInfoService = clusterInfoService;
    this.clusterService = clusterService;
}
/**
 * Applies the started shards. The same shard may be listed more than once in {@code startedShards}.
 * <p>
 * Delegates to {@link #applyStartedShards(ClusterState, List, boolean)} with {@code withReroute} set to
 * {@code true}. When nothing changed, the returned result carries the routing table instance of
 * {@code clusterState}.
 *
 * @param clusterState  the cluster state to apply the started shards to
 * @param startedShards the shards reported as started
 * @return the allocation result
 */
public RoutingAllocation.Result applyStartedShards(ClusterState clusterState, List<? extends ShardRouting> startedShards) {
    final boolean withReroute = true;
    return applyStartedShards(clusterState, startedShards, withReroute);
}
/**
 * Applies the started shards to a mutable copy of the routing nodes and, if anything changed, publishes the
 * state of every shard located on the local node through the {@link ClusterService}.
 * <p>
 * The allocator-based handling is kept as commented-out reference code and is not executed.
 *
 * @param clusterState  the cluster state to apply the started shards to
 * @param startedShards the shards reported as started; the same shard may be listed several times
 * @param withReroute   not read by the active code (only the commented-out reroute step used it)
 * @return an unchanged result carrying {@code clusterState.routingTable()} when no shard was started;
 *         otherwise a changed result whose routing table is rebuilt via {@link RoutingTable.Builder} and
 *         validated against the cluster metadata
 */
public RoutingAllocation.Result applyStartedShards(ClusterState clusterState, List<? extends ShardRouting> startedShards, boolean withReroute) {
    RoutingNodes routingNodes = getMutableRoutingNodes(clusterState);
    // disabled allocation logic, kept for reference:
    // shuffle the unassigned nodes, just so we won't have things like poison failed shards
    /*
    routingNodes.unassigned().shuffle();
    StartedRerouteAllocation allocation = new StartedRerouteAllocation(allocationDeciders, routingNodes, clusterState.nodes(), startedShards, clusterInfoService.getClusterInfo());
    */
    boolean changed = applyStartedShards(routingNodes, startedShards);
    if (!changed) {
        return new RoutingAllocation.Result(false, clusterState.routingTable());
    }
    /*
    shardsAllocators.applyStartedShards(allocation);
    if (withReroute) {
        reroute(allocation);
    }
    */
    // update local gossip state
    for (RoutingNode routingNode : routingNodes) {
        for (ShardRouting shard : routingNode) {
            if (this.clusterService.localNode().getId().equals(shard.currentNodeId())) {
                try {
                    clusterService.putShardRoutingState(shard.index(), shard.state());
                    logger.debug("gossip index shard state updated index={} state={}", shard.index(), shard.state());
                } catch (Exception e) {
                    // best effort: keep going with the remaining shards, but do not lose the failure cause
                    logger.warn("Failed to set gossip index shard state index={} state={}", e, shard.index(), shard.state());
                }
            }
        }
    }
    return new RoutingAllocation.Result(true, new RoutingTable.Builder(this.clusterService, clusterState).build().validateRaiseException(clusterState.metaData()));
}
/**
 * Applies a single failed shard by wrapping it (with neither message nor failure) in a one-element list and
 * delegating to {@link #applyFailedShards(ClusterState, List)}.
 *
 * @param clusterState the cluster state the failure applies to
 * @param failedShard  the shard that failed
 * @return the allocation result
 */
public RoutingAllocation.Result applyFailedShard(ClusterState clusterState, ShardRouting failedShard) {
    final FailedRerouteAllocation.FailedShard wrapped = new FailedRerouteAllocation.FailedShard(failedShard, null, null);
    final List<FailedRerouteAllocation.FailedShard> onlyThisShard = Collections.singletonList(wrapped);
    return applyFailedShards(clusterState, onlyThisShard);
}
/**
 * Applies the failed shards.
 * <p>
 * The active code does not read {@code failedShards}: it rebuilds the routing table via
 * {@link RoutingTable.Builder}, validates it against the cluster metadata, logs a cluster health transition
 * if there is one, and always reports a change. The per-shard failure handling is kept as commented-out
 * reference code.
 *
 * @param clusterState the cluster state the failures apply to
 * @param failedShards the shards that failed (currently not inspected)
 * @return a changed result carrying the rebuilt, validated routing table
 */
public RoutingAllocation.Result applyFailedShards(ClusterState clusterState, List<FailedRerouteAllocation.FailedShard> failedShards) {
    // disabled allocation logic, kept for reference:
    // shuffle the unassigned nodes, just so we won't have things like poison failed shards
    /*
    routingNodes.unassigned().shuffle();
    FailedRerouteAllocation allocation = new FailedRerouteAllocation(allocationDeciders, routingNodes, clusterState.nodes(), failedShards, clusterInfoService.getClusterInfo());
    boolean changed = false;
    for (FailedRerouteAllocation.FailedShard failedShard : failedShards) {
        changed |= applyFailedShard(allocation, failedShard.shard, true, new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, failedShard.message, failedShard.failure));
    }
    if (!changed) {
        return new RoutingAllocation.Result(false, clusterState.routingTable());
    }
    shardsAllocators.applyFailedShards(allocation);
    reroute(allocation);
    */
    final RoutingTable rebuilt = new RoutingTable.Builder(this.clusterService, clusterState).build();
    final RoutingAllocation.Result outcome =
            new RoutingAllocation.Result(true, rebuilt.validateRaiseException(clusterState.metaData()));

    final ClusterStateHealth healthBefore = new ClusterStateHealth(clusterState);
    final ClusterStateHealth healthAfter = new ClusterStateHealth(clusterState.getMetaData(), rebuilt);
    logClusterHealthStateChange(healthBefore, healthAfter, "shards failed ...");
    return outcome;
}
/**
 * Reroutes using the given commands without requesting an explanation; delegates to
 * {@link #reroute(ClusterState, AllocationCommands, boolean)} with {@code explain} set to {@code false}.
 *
 * @param clusterState the current cluster state
 * @param commands     the allocation commands
 * @return the allocation result
 */
public RoutingAllocation.Result reroute(ClusterState clusterState, AllocationCommands commands) {
    final boolean explain = false;
    return reroute(clusterState, commands, explain);
}
/**
 * Reroutes in response to explicit allocation commands.
 * <p>
 * The active code reads neither {@code commands} nor {@code explain}: it rebuilds the routing table via
 * {@link RoutingTable.Builder}, validates it against the cluster metadata, logs a cluster health transition
 * if there is one, and always reports a change. The command execution is kept as commented-out reference
 * code.
 *
 * @param clusterState the current cluster state
 * @param commands     the allocation commands (currently not executed)
 * @param explain      whether explanations were requested (currently not used)
 * @return a changed result carrying the rebuilt, validated routing table
 */
public RoutingAllocation.Result reroute(ClusterState clusterState, AllocationCommands commands, boolean explain) {
    // disabled allocation logic, kept for reference:
    /*
    RoutingNodes routingNodes = getMutableRoutingNodes(clusterState);
    // we don't shuffle the unassigned shards here, to try and get as close as possible to
    // a consistent result of the effect the commands have on the routing
    // this allows systems to dry run the commands, see the resulting cluster state, and act on it
    RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState.nodes(), clusterInfoService.getClusterInfo());
    // don't short circuit deciders, we want a full explanation
    allocation.debugDecision(true);
    // we ignore disable allocation, because commands are explicit
    allocation.ignoreDisable(true);
    RoutingExplanations explanations = commands.execute(allocation, explain);
    // we revert the ignore disable flag, since when rerouting, we want the original setting to take place
    allocation.ignoreDisable(false);
    // the assumption is that commands will move / act on shards (or fail through exceptions)
    // so, there will always be shard "movements", so no need to check on reroute
    reroute(allocation);
    */
    final RoutingTable rebuilt = new RoutingTable.Builder(this.clusterService, clusterState).build();
    final RoutingAllocation.Result outcome =
            new RoutingAllocation.Result(true, rebuilt.validateRaiseException(clusterState.metaData()));

    final ClusterStateHealth healthBefore = new ClusterStateHealth(clusterState);
    final ClusterStateHealth healthAfter = new ClusterStateHealth(clusterState.getMetaData(), rebuilt);
    logClusterHealthStateChange(healthBefore, healthAfter, "reroute commands");
    return outcome;
}
/**
 * Reroutes the routing table with decision debugging switched off; delegates to
 * {@link #reroute(ClusterState, String, boolean)} with {@code debug} set to {@code false}.
 * <p>
 * If the result carries the same routing table instance as {@code clusterState}, no change has been made.
 *
 * @param clusterState the current cluster state
 * @param reason       the reason for the reroute, used when logging a health change
 * @return the allocation result
 */
public RoutingAllocation.Result reroute(ClusterState clusterState, String reason) {
    final boolean debug = false;
    return reroute(clusterState, reason, debug);
}
/**
 * Reroutes the routing table.
 * <p>
 * The active code performs no rerouting: it returns an unchanged result carrying the routing table instance
 * of {@code clusterState}, after passing the health computed from the cluster state and the health computed
 * from its metadata plus that same routing table to {@link #logClusterHealthStateChange}. The node-based
 * rerouting is kept as commented-out reference code, and {@code debug} is not read.
 *
 * @param clusterState the current cluster state
 * @param reason       the reason for the reroute, used when logging a health change
 * @param debug        whether decision debugging was requested (currently not used)
 * @return an unchanged result carrying {@code clusterState.routingTable()}
 */
public RoutingAllocation.Result reroute(ClusterState clusterState, String reason, boolean debug) {
    // disabled allocation logic, kept for reference:
    /*
    RoutingNodes routingNodes = getMutableRoutingNodes(clusterState);
    // shuffle the unassigned nodes, just so we won't have things like poison failed shards
    routingNodes.unassigned().shuffle();
    RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState.nodes(), clusterInfoService.getClusterInfo());
    allocation.debugDecision(debug);
    if (!reroute(allocation)) {
        return new RoutingAllocation.Result(false, clusterState.routingTable());
    }
    */
    // reroute nothing !!!
    final RoutingTable current = clusterState.routingTable();
    final RoutingAllocation.Result unchanged = new RoutingAllocation.Result(false, current);

    final ClusterStateHealth healthBefore = new ClusterStateHealth(clusterState);
    final ClusterStateHealth healthAfter = new ClusterStateHealth(clusterState.getMetaData(), current);
    logClusterHealthStateChange(healthBefore, healthAfter, reason);
    return unchanged;
}
/**
 * Logs an info message when the cluster health status differs between the two given health snapshots.
 *
 * @param previousStateHealth health before the operation
 * @param newStateHealth      health after the operation
 * @param reason              text included in the log message
 */
private void logClusterHealthStateChange(ClusterStateHealth previousStateHealth, ClusterStateHealth newStateHealth, String reason) {
    final ClusterHealthStatus before = previousStateHealth.getStatus();
    final ClusterHealthStatus after = newStateHealth.getStatus();
    if (before.equals(after)) {
        // same status, nothing to report
        return;
    }
    logger.info("Cluster health status changed from [{}] to [{}] (reason: [{}]).", before, after, reason);
}
/**
 * Internal reroute step. Every action it used to perform is commented out, so it never modifies the
 * allocation and always returns {@code false}.
 *
 * @param allocation the allocation to reroute (currently not read)
 * @return {@code false}
 */
private boolean reroute(RoutingAllocation allocation) {
    // disabled allocation logic, kept for reference:
    /*
    // first, clear from the shards any node id they used to belong to that is now dead
    changed |= deassociateDeadNodes(allocation);
    // create a sorted list of from nodes with least number of shards to the maximum ones
    applyNewNodes(allocation);
    // elect primaries *before* allocating unassigned, so backups of primaries that failed
    // will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*)
    changed |= electPrimariesAndUnassignedDanglingReplicas(allocation);
    // now allocate all the unassigned to available nodes
    if (allocation.routingNodes().unassigned().size() > 0) {
        changed |= shardsAllocators.allocateUnassigned(allocation);
    }
    // move shards that no longer can be allocated
    changed |= moveShards(allocation);
    // rebalance
    changed |= shardsAllocators.rebalance(allocation);
    assert RoutingNodes.assertShardStats(allocation.routingNodes());
    */
    return false;
}
/**
 * Checks every started shard against the deciders' {@code canRemain} decision and logs those that may no
 * longer stay on their node.
 * <p>
 * The actual move through the shards allocators is commented out, so no shard is ever moved and the method
 * always returns {@code false}.
 *
 * @param allocation the allocation providing routing nodes and deciders
 * @return {@code true} if a shard was moved (never the case while the move call is disabled)
 */
private boolean moveShards(RoutingAllocation allocation) {
    final RoutingNodes nodes = allocation.routingNodes();

    // snapshot the shards round-robin across the nodes: slot 0 of every node, then slot 1, and so on
    final List<ShardRouting> interleaved = new ArrayList<>();
    int slot = 0;
    boolean anyAtSlot;
    do {
        anyAtSlot = false;
        for (RoutingNode node : nodes) {
            if (slot < node.size()) {
                anyAtSlot = true;
                interleaved.add(node.get(slot));
            }
        }
        slot++;
    } while (anyAtSlot);

    boolean changed = false;
    for (ShardRouting shard : interleaved) {
        // we can only move started shards...
        if (!shard.started()) {
            continue;
        }
        final RoutingNode host = nodes.node(shard.currentNodeId());
        final Decision verdict = allocation.deciders().canRemain(shard, host, allocation);
        if (verdict.type() != Decision.Type.NO) {
            continue;
        }
        logger.debug("[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...", shard.index(), shard.id(), host.node());
        // disabled: boolean moved = shardsAllocators.move(shardRouting, routingNode, allocation);
        final boolean moved = false;
        if (moved) {
            changed = true;
        } else {
            logger.debug("[{}][{}] can't move", shard.index(), shard.id());
        }
    }
    return changed;
}
/**
* Handles unassigned primaries in two passes: first fails replicas that are still initializing for a shard
* whose primary is unassigned, then promotes an active replica to primary where one exists.
* <p>
* Returns immediately when there are no unassigned primaries.
*
* @param allocation the allocation whose routing nodes are inspected and modified
* @return {@code true} if a dangling replica was failed, if a promoted candidate was relocating, or if a
*         shadow-replica primary was re-initialized
*/
// NOTE(review): promoting a non-relocating candidate on an index without shadow replicas does not set
// `changed`, although swapPrimaryFlag was called — confirm whether this is intended.
private boolean electPrimariesAndUnassignedDanglingReplicas(RoutingAllocation allocation) {
boolean changed = false;
RoutingNodes routingNodes = allocation.routingNodes();
if (routingNodes.unassigned().getNumPrimaries() == 0) {
// move out if we don't have unassigned primaries
return changed;
}
// go over and remove dangling replicas that are initializing for primary shards
// (collected first and failed afterwards, outside the iteration over the unassigned shards)
List<ShardRouting> shardsToFail = new ArrayList<>();
for (ShardRouting shardEntry : routingNodes.unassigned()) {
if (shardEntry.primary()) {
for (ShardRouting routing : routingNodes.assignedShards(shardEntry)) {
if (!routing.primary() && routing.initializing()) {
shardsToFail.add(routing);
}
}
}
}
for (ShardRouting shardToFail : shardsToFail) {
changed |= applyFailedShard(allocation, shardToFail, false, new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, "primary failed while replica initializing"));
}
// now, go over and elect a new primary if possible. Note: from this code block on, if one is elected,
// routingNodes.hasUnassignedPrimaries() will potentially be false
for (ShardRouting shardEntry : routingNodes.unassigned()) {
if (shardEntry.primary()) {
ShardRouting candidate = allocation.routingNodes().activeReplica(shardEntry);
if (candidate != null) {
// index metadata is looked up before the primary flag is swapped; it is read further below
IndexMetaData index = allocation.metaData().index(candidate.index());
routingNodes.swapPrimaryFlag(shardEntry, candidate);
if (candidate.relocatingNodeId() != null) {
changed = true;
// it's also relocating, make sure to move the other routing to primary
RoutingNode node = routingNodes.node(candidate.relocatingNodeId());
if (node != null) {
for (ShardRouting shardRouting : node) {
if (shardRouting.shardId().equals(candidate.shardId()) && !shardRouting.primary()) {
routingNodes.swapPrimaryFlag(shardRouting);
break;
}
}
}
}
if (IndexMetaData.isIndexUsingShadowReplicas(index.getSettings())) {
routingNodes.reinitShadowPrimary(candidate);
changed = true;
}
}
}
}
return changed;
}
/**
 * Adds to the routing nodes every data node of the allocation that the routing nodes do not know yet.
 * Nothing is returned; the routing nodes are modified in place.
 *
 * @param allocation the allocation providing the data nodes and the routing nodes
 */
private void applyNewNodes(RoutingAllocation allocation) {
    final RoutingNodes known = allocation.routingNodes();
    for (ObjectCursor<DiscoveryNode> entry : allocation.nodes().dataNodes().values()) {
        final DiscoveryNode dataNode = entry.value;
        if (known.isKnown(dataNode)) {
            continue;
        }
        known.addNode(dataNode);
    }
}
/**
 * Removes routing nodes that are no longer among the allocation's data nodes, failing every shard on such a
 * node with reason {@code NODE_LEFT} before the node itself is removed.
 *
 * @param allocation the allocation whose routing nodes are cleaned up
 * @return {@code true} if at least one dead node was found
 */
private boolean deassociateDeadNodes(RoutingAllocation allocation) {
    boolean foundDeadNode = false;
    final RoutingNodes.RoutingNodesIterator nodeIt = allocation.routingNodes().nodes();
    while (nodeIt.hasNext()) {
        final RoutingNode candidate = nodeIt.next();
        final boolean alive = allocation.nodes().dataNodes().containsKey(candidate.nodeId());
        if (alive) {
            // it's a live node, nothing to do
            continue;
        }
        foundDeadNode = true;
        // fail all the shards routed on the dead node
        final UnassignedInfo nodeLeft = new UnassignedInfo(UnassignedInfo.Reason.NODE_LEFT, "node_left[" + candidate.nodeId() + "]");
        for (ShardRouting orphan : candidate.copyShards()) {
            applyFailedShard(allocation, orphan, false, nodeLeft);
        }
        // the node must be removed only *after* its shards were failed, since failing a shard
        // relies on the RoutingNode still being present in the list of nodes
        nodeIt.remove();
    }
    return foundDeadNode;
}
/**
* Marks the given shards as started inside {@code routingNodes}.
* <p>
* For every entry, the matching allocation on the shard's current node is looked up and started if it is not
* yet active. If the (possibly refreshed) entry carries a relocating node id, the relocation source on that
* node is removed as well.
*
* @param routingNodes        the routing nodes to modify
* @param startedShardEntries the shards reported as started; the same shard may be listed several times
* @return {@code true} if at least one shard was started or one relocation source was removed
*/
private boolean applyStartedShards(RoutingNodes routingNodes, Iterable<? extends ShardRouting> startedShardEntries) {
boolean dirty = false;
// apply shards might be called several times with the same shard, ignore it
for (ShardRouting startedShard : startedShardEntries) {
assert startedShard.initializing();
// validate index still exists. strictly speaking this is not needed but it gives clearer logs
if (routingNodes.routingTable().index(startedShard.index()) == null) {
logger.debug("{} ignoring shard started, unknown index (routing: {})", startedShard.shardId(), startedShard);
continue;
}
RoutingNodes.RoutingNodeIterator currentRoutingNode = routingNodes.routingNodeIter(startedShard.currentNodeId());
if (currentRoutingNode == null) {
logger.debug("{} failed to find shard in order to start it [failed to find node], ignoring (routing: {})", startedShard.shardId(), startedShard);
continue;
}
// look for the routing entry on the current node that is the same allocation as the reported shard
for (ShardRouting shard : currentRoutingNode) {
if (shard.isSameAllocation(startedShard)) {
if (shard.active()) {
logger.trace("{} shard is already started, ignoring (routing: {})", startedShard.shardId(), startedShard);
} else {
dirty = true;
// override started shard with the latest copy. Capture it now, before starting the shard destroys it...
startedShard = new ShardRouting(shard);
routingNodes.started(shard);
logger.trace("{} marked shard as started (routing: {})", startedShard.shardId(), startedShard);
}
break;
}
}
// startedShard is the current state of the shard (post relocation for example)
// this means that after relocation, the state will be started and the currentNodeId will be
// the node we relocated to
if (startedShard.relocatingNodeId() == null) {
continue;
}
// the shard was relocating: drop the source copy from the node it relocated from
RoutingNodes.RoutingNodeIterator sourceRoutingNode = routingNodes.routingNodeIter(startedShard.relocatingNodeId());
if (sourceRoutingNode != null) {
while (sourceRoutingNode.hasNext()) {
ShardRouting shard = sourceRoutingNode.next();
if (shard.isRelocationSourceOf(startedShard)) {
dirty = true;
sourceRoutingNode.remove();
break;
}
}
}
}
return dirty;
}
/**
* Applies the relevant logic to handle a failed shard. Returns {@code true} if changes happened that
* require relocation.
* <p>
* The failure is ignored (returning {@code false}) when the shard's index is not in the routing table, when
* its current node is unknown, or when no routing entry with the same allocation is found on that node.
* Otherwise the matching entry is removed from its node. If it was the initializing target of a relocation,
* the relocation of the source shard is cancelled; in all other cases any relocation target is removed and
* the shard is moved to unassigned with the given {@code unassignedInfo}.
*
* @param allocation      the allocation providing the routing table and routing nodes
* @param failedShard     the shard that failed
* @param addToIgnoreList whether to register the shard as ignored for the node it failed on
* @param unassignedInfo  the reason attached when the shard becomes unassigned; also used in log messages
* @return {@code true} if a matching shard was found and failed, {@code false} if the failure was ignored
*/
private boolean applyFailedShard(RoutingAllocation allocation, ShardRouting failedShard, boolean addToIgnoreList, UnassignedInfo unassignedInfo) {
IndexRoutingTable indexRoutingTable = allocation.routingTable().index(failedShard.index());
if (indexRoutingTable == null) {
logger.debug("{} ignoring shard failure, unknown index in {} ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
return false;
}
RoutingNodes routingNodes = allocation.routingNodes();
RoutingNodes.RoutingNodeIterator matchedNode = routingNodes.routingNodeIter(failedShard.currentNodeId());
if (matchedNode == null) {
logger.debug("{} ignoring shard failure, unknown node in {} ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
return false;
}
// advance the iterator until it is positioned on the routing entry of the failed allocation;
// the iterator position is used below via current(), remove() and moveToUnassigned()
boolean matchedShard = false;
while (matchedNode.hasNext()) {
ShardRouting routing = matchedNode.next();
if (routing.isSameAllocation(failedShard)) {
matchedShard = true;
logger.debug("{} failed shard {} found in routingNodes, failing it ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
break;
}
}
if (matchedShard == false) {
logger.debug("{} ignoring shard failure, unknown allocation id in {} ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
return false;
}
// replace incoming instance to make sure we work on the latest one. Copy it to maintain information during modifications.
failedShard = new ShardRouting(matchedNode.current());
// remove the current copy of the shard
matchedNode.remove();
if (addToIgnoreList) {
// make sure we ignore this shard on the relevant node
allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
}
if (failedShard.relocatingNodeId() != null && failedShard.initializing()) {
// The shard is a target of a relocating shard. In that case we only
// need to remove the target shard and cancel the source relocation.
// No shard is left unassigned
logger.trace("{} is a relocation target, resolving source to cancel relocation ({})", failedShard, unassignedInfo.shortSummary());
RoutingNode relocatingFromNode = routingNodes.node(failedShard.relocatingNodeId());
if (relocatingFromNode != null) {
for (ShardRouting shardRouting : relocatingFromNode) {
if (shardRouting.isRelocationSourceOf(failedShard)) {
logger.trace("{}, resolved source to [{}]. canceling relocation ... ({})", failedShard.shardId(), shardRouting, unassignedInfo.shortSummary());
routingNodes.cancelRelocation(shardRouting);
break;
}
}
}
} else {
// The failed shard is the main copy of the current shard routing. Any
// relocation will be cancelled (and the target shard removed as well)
// and the shard copy needs to be marked as unassigned
if (failedShard.relocatingNodeId() != null) {
// handle relocation source shards. we need to find the target initializing shard that is recovering, and remove it...
assert failedShard.initializing() == false; // should have been dealt with and returned
assert failedShard.relocating();
RoutingNodes.RoutingNodeIterator initializingNode = routingNodes.routingNodeIter(failedShard.relocatingNodeId());
if (initializingNode != null) {
// no break here: every entry on the target node that is a relocation target of the failed shard is removed
while (initializingNode.hasNext()) {
ShardRouting shardRouting = initializingNode.next();
if (shardRouting.isRelocationTargetOf(failedShard)) {
logger.trace("{} is removed due to the failure of the source shard", shardRouting);
initializingNode.remove();
}
}
}
}
matchedNode.moveToUnassigned(unassignedInfo);
}
assert matchedNode.isRemoved() : "failedShard " + failedShard + " was matched but wasn't removed";
return true;
}
/**
 * Builds a fresh {@link RoutingNodes} from the cluster state, passing {@code false} as the second
 * constructor argument.
 *
 * @param clusterState the cluster state to derive the routing nodes from
 * @return the newly created routing nodes
 */
private RoutingNodes getMutableRoutingNodes(ClusterState clusterState) {
    // this is a costly operation - only call this once!
    return new RoutingNodes(clusterState, false);
}
}