/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.cluster.routing.allocation;

import com.google.common.collect.Lists;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.ElasticSearchIllegalStateException;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.ImmutableShardRouting;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.MutableShardRouting;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.RoutingNodes;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocators;
import org.elasticsearch.cluster.routing.allocation.command.AllocationCommands;
import org.elasticsearch.cluster.routing.allocation.decider.ServerAllocationDeciders;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.node.settings.NodeSettingsService;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;

/**
 * Entry point for shard (re)allocation decisions: applies started/failed shard
 * events and explicit allocation commands to a cluster state's routing table,
 * delegating the actual placement decisions to the configured
 * {@link ServerAllocationDeciders} and {@link ShardsAllocators}.
 */
public class AllocationService extends AbstractComponent {

    private final ServerAllocationDeciders allocationDeciders;

    private final ShardsAllocators shardsAllocators;

    public AllocationService() {
        this(ImmutableSettings.Builder.EMPTY_SETTINGS);
    }

    public AllocationService(Settings settings) {
        this(settings,
                new ServerAllocationDeciders(settings, new NodeSettingsService(ImmutableSettings.Builder.EMPTY_SETTINGS)),
                new ShardsAllocators(settings));
    }

    @Inject
    public AllocationService(Settings settings, ServerAllocationDeciders allocationDeciders, ShardsAllocators shardsAllocators) {
        super(settings);
        this.allocationDeciders = allocationDeciders;
        this.shardsAllocators = shardsAllocators;
    }

    /**
     * Applies the started shards. Note, shards can be called several times within this method.
     * <p/>
     * <p>If the same instance of the routing table is returned, then no change has been made.
     *
     * @param clusterState  current cluster state
     * @param startedShards shards reported as started (expected to be in {@code INITIALIZING} state)
     * @return the (possibly unchanged) routing result plus an allocation explanation
     */
    public RoutingAllocationResult applyStartedShards(ClusterState clusterState, List<? extends ShardRouting> startedShards) {
        RoutingNodes routingNodes = clusterState.routingNodes();
        // shuffle the unassigned nodes, just so we won't have things like poison failed shards
        Collections.shuffle(routingNodes.unassigned());
        StartedRerouteAllocation allocation = new StartedRerouteAllocation(allocationDeciders, routingNodes, clusterState.nodes(), startedShards);
        boolean changed = applyStartedShards(routingNodes, startedShards);
        if (!changed) {
            // nothing moved to STARTED — hand back the original routing table unchanged
            return new RoutingAllocationResult(false, clusterState.routingTable(), allocation.explanation());
        }
        shardsAllocators.applyStartedShards(allocation);
        reroute(allocation);
        return new RoutingAllocationResult(true, new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()), allocation.explanation());
    }

    /**
     * Applies the failed shards. Note, shards can be called several times within this method.
     * <p/>
     * <p>If the same instance of the routing table is returned, then no change has been made.
     *
     * @param clusterState current cluster state
     * @param failedShard  the shard that failed
     * @return the (possibly unchanged) routing result plus an allocation explanation
     */
    public RoutingAllocationResult applyFailedShard(ClusterState clusterState, ShardRouting failedShard) {
        RoutingNodes routingNodes = clusterState.routingNodes();
        // shuffle the unassigned nodes, just so we won't have things like poison failed shards
        Collections.shuffle(routingNodes.unassigned());
        FailedRerouteAllocation allocation = new FailedRerouteAllocation(allocationDeciders, routingNodes, clusterState.nodes(), failedShard);
        boolean changed = applyFailedShard(allocation, failedShard);
        if (!changed) {
            return new RoutingAllocationResult(false, clusterState.routingTable(), allocation.explanation());
        }
        shardsAllocators.applyFailedShards(allocation);
        reroute(allocation);
        return new RoutingAllocationResult(true, new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()), allocation.explanation());
    }

    /**
     * Executes explicit allocation commands (e.g. move/cancel/allocate) and then reroutes.
     * The unassigned list is deliberately NOT shuffled here so that a dry run of the same
     * commands produces a reproducible result.
     *
     * @param clusterState current cluster state
     * @param commands     the explicit commands to execute
     * @return the routing result; always marked as changed (commands either act or throw)
     * @throws ElasticSearchException if a command cannot be executed
     */
    public RoutingAllocationResult reroute(ClusterState clusterState, AllocationCommands commands) throws ElasticSearchException {
        RoutingNodes routingNodes = clusterState.routingNodes();
        // we don't shuffle the unassigned shards here, to try and get as close as possible to
        // a consistent result of the effect the commands have on the routing
        // this allows systems to dry run the commands, see the resulting cluster state, and act on it
        RoutingAllocation allocation = new SimpleRoutingAllocation(allocationDeciders, routingNodes, clusterState.nodes());
        // we ignore disable allocation, because commands are explicit
        allocation.ignoreDisable(true);
        commands.execute(allocation);
        // we revert the ignore disable flag, since when rerouting, we want the original setting to take place
        allocation.ignoreDisable(false);
        // the assumption is that commands will move / act on shards (or fail through exceptions)
        // so, there will always be shard "movements", so no need to check on reroute
        reroute(allocation);
        return new RoutingAllocationResult(true, new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()), allocation.explanation());
    }

    /**
     * Reroutes the routing table based on the live nodes.
     * <p/>
     * <p>If the same instance of the routing table is returned, then no change has been made.
     *
     * @param clusterState current cluster state
     * @return the (possibly unchanged) routing result plus an allocation explanation
     */
    public RoutingAllocationResult reroute(ClusterState clusterState) {
        RoutingNodes routingNodes = clusterState.routingNodes();
        // shuffle the unassigned nodes, just so we won't have things like poison failed shards
        Collections.shuffle(routingNodes.unassigned());
        RoutingAllocation allocation = new SimpleRoutingAllocation(allocationDeciders, routingNodes, clusterState.nodes());
        if (!reroute(allocation)) {
            return new RoutingAllocationResult(false, clusterState.routingTable(), allocation.explanation());
        }
        return new RoutingAllocationResult(true, new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()), allocation.explanation());
    }

    /**
     * Only handles reroute but *without* any reassignment of unassigned shards or rebalancing. Does
     * make sure to handle removed nodes, but only moved the shards to UNASSIGNED, does not reassign
     * them.
     *
     * @param clusterState current cluster state
     * @return the (possibly unchanged) routing result plus an allocation explanation
     */
    public RoutingAllocationResult rerouteWithNoReassign(ClusterState clusterState) {
        RoutingNodes routingNodes = clusterState.routingNodes();
        // shuffle the unassigned nodes, just so we won't have things like poison failed shards
        Collections.shuffle(routingNodes.unassigned());
        RoutingAllocation allocation = new SimpleRoutingAllocation(allocationDeciders, routingNodes, clusterState.nodes());
        boolean changed = false;
        // first, clear from the shards any node id they used to belong to that is now dead
        changed |= deassociateDeadNodes(allocation);

        // create a sorted list of from nodes with least number of shards to the maximum ones
        applyNewNodes(allocation);

        // elect primaries *before* allocating unassigned, so backups of primaries that failed
        // will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*)
        changed |= electPrimaries(allocation.routingNodes());

        if (!changed) {
            return new RoutingAllocationResult(false, clusterState.routingTable(), allocation.explanation());
        }
        return new RoutingAllocationResult(true, new RoutingTable.Builder().updateNodes(routingNodes).build().validateRaiseException(clusterState.metaData()), allocation.explanation());
    }

    /**
     * Core reroute pipeline: dead-node cleanup, new-node registration, primary election,
     * unassigned allocation, forced moves, and rebalancing.
     *
     * @return {@code true} if the routing nodes were changed in any way
     */
    private boolean reroute(RoutingAllocation allocation) {
        boolean changed = false;
        // first, clear from the shards any node id they used to belong to that is now dead
        changed |= deassociateDeadNodes(allocation);

        // create a sorted list of from nodes with least number of shards to the maximum ones
        applyNewNodes(allocation);

        // elect primaries *before* allocating unassigned, so backups of primaries that failed
        // will be moved to primary state and not wait for primaries to be allocated and recovered (*from gateway*)
        changed |= electPrimaries(allocation.routingNodes());

        // now allocate all the unassigned to available nodes
        if (allocation.routingNodes().hasUnassigned()) {
            changed |= shardsAllocators.allocateUnassigned(allocation);
            // elect primaries again, in case this is needed with unassigned allocation
            changed |= electPrimaries(allocation.routingNodes());
        }

        // move shards that no longer can be allocated
        changed |= moveShards(allocation);

        // rebalance
        changed |= shardsAllocators.rebalance(allocation);

        return changed;
    }

    /**
     * Moves started shards off nodes where the deciders say they can no longer remain.
     * Shards are visited in a round-robin interleave across nodes so no single node's
     * shards monopolize the move budget.
     *
     * @return {@code true} if at least one shard was moved
     */
    private boolean moveShards(RoutingAllocation allocation) {
        boolean changed = false;

        // create a copy of the shards interleaving between nodes, and check if they can remain
        List<MutableShardRouting> shards = new ArrayList<MutableShardRouting>();
        int index = 0;
        boolean found = true;
        while (found) {
            found = false;
            for (RoutingNode routingNode : allocation.routingNodes()) {
                if (index >= routingNode.shards().size()) {
                    continue;
                }
                found = true;
                shards.add(routingNode.shards().get(index));
            }
            index++;
        }
        for (int i = 0; i < shards.size(); i++) {
            MutableShardRouting shardRouting = shards.get(i);
            // we can only move started shards...
            if (!shardRouting.started()) {
                continue;
            }
            RoutingNode routingNode = allocation.routingNodes().node(shardRouting.currentNodeId());
            if (!allocation.deciders().canRemain(shardRouting, routingNode, allocation)) {
                logger.debug("[{}][{}] allocated on [{}], but can no longer be allocated on it, moving...", shardRouting.index(), shardRouting.id(), routingNode.node());
                boolean moved = shardsAllocators.move(shardRouting, routingNode, allocation);
                if (!moved) {
                    logger.debug("[{}][{}] can't move", shardRouting.index(), shardRouting.id());
                } else {
                    changed = true;
                }
            }
        }
        return changed;
    }

    /**
     * For every unassigned primary, promotes an active replica of the same shard to primary
     * (including the relocation-target copy if the replica is currently relocating).
     *
     * @return {@code true} if any promotion took place
     */
    private boolean electPrimaries(RoutingNodes routingNodes) {
        boolean changed = false;
        for (MutableShardRouting shardEntry : routingNodes.unassigned()) {
            if (shardEntry.primary() && !shardEntry.assignedToNode()) {
                boolean elected = false;
                // primary and not assigned, go over and find a replica that is assigned and active (since it might be relocating)
                for (RoutingNode routingNode : routingNodes.nodesToShards().values()) {

                    for (MutableShardRouting shardEntry2 : routingNode.shards()) {
                        if (shardEntry.shardId().equals(shardEntry2.shardId()) && shardEntry2.active()) {
                            assert shardEntry2.assignedToNode();
                            assert !shardEntry2.primary();

                            changed = true;
                            shardEntry.moveFromPrimary();
                            shardEntry2.moveToPrimary();

                            if (shardEntry2.relocatingNodeId() != null) {
                                // its also relocating, make sure to move the other routing to primary
                                RoutingNode node = routingNodes.node(shardEntry2.relocatingNodeId());
                                if (node != null) {
                                    for (MutableShardRouting shardRouting : node) {
                                        if (shardRouting.shardId().equals(shardEntry2.shardId()) && !shardRouting.primary()) {
                                            shardRouting.moveToPrimary();
                                            break;
                                        }
                                    }
                                }
                            }

                            elected = true;
                            break;
                        }
                    }

                    if (elected) {
                        break;
                    }
                }
            }
        }
        return changed;
    }

    /**
     * Applies the new nodes to the routing nodes and returns them (just the
     * new nodes);
     */
    private void applyNewNodes(RoutingAllocation allocation) {
        for (DiscoveryNode node : allocation.nodes().dataNodes().values()) {
            if (!allocation.routingNodes().nodesToShards().containsKey(node.id())) {
                RoutingNode routingNode = new RoutingNode(node.id(), node);
                allocation.routingNodes().nodesToShards().put(node.id(), routingNode);
            }
        }
    }

    /**
     * Fails all shards residing on nodes that are no longer part of the cluster's data nodes,
     * then removes the dead routing nodes.
     *
     * @return {@code true} if any node was removed (and its shards failed)
     */
    private boolean deassociateDeadNodes(RoutingAllocation allocation) {
        boolean changed = false;
        for (Iterator<RoutingNode> it = allocation.routingNodes().nodesToShards().values().iterator(); it.hasNext(); ) {
            RoutingNode node = it.next();
            if (allocation.nodes().dataNodes().containsKey(node.nodeId())) {
                // its a live node, continue
                continue;
            }
            changed = true;
            // now, go over all the shards routing on the node, and fail them
            for (MutableShardRouting shardRouting : new ArrayList<MutableShardRouting>(node.shards())) {
                // we create a copy of the shard routing, since applyFailedShard assumes its a new copy
                applyFailedShard(allocation, shardRouting);
            }
            // its a dead node, remove it, note, its important to remove it *after* we apply failed shard
            // since it relies on the fact that the RoutingNode exists in the list of nodes
            it.remove();
        }
        return changed;
    }

    /**
     * Marks the matching routing entries as started and, for relocations, removes the
     * now-obsolete RELOCATING entry from the source node.
     *
     * @return {@code true} if any routing entry actually changed state
     */
    private boolean applyStartedShards(RoutingNodes routingNodes, Iterable<? extends ShardRouting> startedShardEntries) {
        boolean dirty = false;
        // apply shards might be called several times with the same shard, ignore it
        for (ShardRouting startedShard : startedShardEntries) {
            assert startedShard.state() == INITIALIZING;

            // retrieve the relocating node id before calling moveToStarted().
            String relocatingNodeId = null;

            RoutingNode currentRoutingNode = routingNodes.nodesToShards().get(startedShard.currentNodeId());
            if (currentRoutingNode != null) {
                for (MutableShardRouting shard : currentRoutingNode) {
                    if (shard.shardId().equals(startedShard.shardId())) {
                        relocatingNodeId = shard.relocatingNodeId();
                        if (!shard.started()) {
                            dirty = true;
                            shard.moveToStarted();
                        }
                        break;
                    }
                }
            }

            // startedShard is the current state of the shard (post relocation for example)
            // this means that after relocation, the state will be started and the currentNodeId will be
            // the node we relocated to

            if (relocatingNodeId == null)
                continue;

            RoutingNode sourceRoutingNode = routingNodes.nodesToShards().get(relocatingNodeId);
            if (sourceRoutingNode != null) {
                Iterator<MutableShardRouting> shardsIter = sourceRoutingNode.iterator();
                while (shardsIter.hasNext()) {
                    MutableShardRouting shard = shardsIter.next();
                    if (shard.shardId().equals(startedShard.shardId())) {
                        if (shard.relocating()) {
                            dirty = true;
                            shardsIter.remove();
                            break;
                        }
                    }
                }
            }
        }
        return dirty;
    }

    /**
     * Applies the relevant logic to handle a failed shard. Returns <tt>true</tt> if changes happened that
     * require relocation.
     */
    private boolean applyFailedShard(RoutingAllocation allocation, ShardRouting failedShard) {
        // create a copy of the failed shard, since we assume we can change possible references to it without
        // changing the state of failed shard
        failedShard = new ImmutableShardRouting(failedShard);

        IndexRoutingTable indexRoutingTable = allocation.routingTable().index(failedShard.index());
        if (indexRoutingTable == null) {
            return false;
        }

        if (failedShard.relocatingNodeId() != null) {
            // the shard is relocating, either in initializing (recovery from another node) or relocating (moving to another node)
            if (failedShard.state() == INITIALIZING) {
                // the shard is initializing and recovering from another node
                boolean dirty = false;
                // first, we need to cancel the current node that is being initialized
                RoutingNode initializingNode = allocation.routingNodes().node(failedShard.currentNodeId());
                if (initializingNode != null) {
                    for (Iterator<MutableShardRouting> it = initializingNode.iterator(); it.hasNext(); ) {
                        MutableShardRouting shardRouting = it.next();
                        if (shardRouting.equals(failedShard)) {
                            dirty = true;
                            it.remove();
                            shardRouting.deassignNode();

                            // make sure we ignore this shard on the relevant node
                            allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());

                            break;
                        }
                    }
                }
                if (dirty) {
                    // now, find the node that we are relocating *from*, and cancel its relocation
                    RoutingNode relocatingFromNode = allocation.routingNodes().node(failedShard.relocatingNodeId());
                    if (relocatingFromNode != null) {
                        for (Iterator<MutableShardRouting> it = relocatingFromNode.iterator(); it.hasNext(); ) {
                            MutableShardRouting shardRouting = it.next();
                            if (shardRouting.shardId().equals(failedShard.shardId()) && shardRouting.state() == RELOCATING) {
                                dirty = true;
                                shardRouting.cancelRelocation();
                                break;
                            }
                        }
                    }
                }
                return dirty;
            } else if (failedShard.state() == RELOCATING) {
                boolean dirty = false;
                // the shard is relocating, meaning its the source the shard is relocating from
                // first, we need to cancel the current relocation from the current node
                // now, find the node that we are recovering from, cancel the relocation, remove it from the node
                // and add it to the unassigned shards list...
                RoutingNode relocatingFromNode = allocation.routingNodes().node(failedShard.currentNodeId());
                if (relocatingFromNode != null) {
                    for (Iterator<MutableShardRouting> it = relocatingFromNode.iterator(); it.hasNext(); ) {
                        MutableShardRouting shardRouting = it.next();
                        if (shardRouting.equals(failedShard)) {
                            dirty = true;
                            shardRouting.cancelRelocation();
                            it.remove();

                            // make sure we ignore this shard on the relevant node
                            allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());

                            // re-queue the shard as unassigned with a bumped version so it can be allocated again
                            allocation.routingNodes().unassigned().add(new MutableShardRouting(failedShard.index(), failedShard.id(), null, failedShard.primary(), ShardRoutingState.UNASSIGNED, failedShard.version() + 1));

                            break;
                        }
                    }
                }
                if (dirty) {
                    // next, we need to find the target initializing shard that is recovering from, and remove it...
                    RoutingNode initializingNode = allocation.routingNodes().node(failedShard.relocatingNodeId());
                    if (initializingNode != null) {
                        for (Iterator<MutableShardRouting> it = initializingNode.iterator(); it.hasNext(); ) {
                            MutableShardRouting shardRouting = it.next();
                            if (shardRouting.shardId().equals(failedShard.shardId()) && shardRouting.state() == INITIALIZING) {
                                dirty = true;
                                shardRouting.deassignNode();
                                it.remove();
                            }
                        }
                    }
                }
                return dirty;
            } else {
                throw new ElasticSearchIllegalStateException("illegal state for a failed shard, relocating node id is set, but state does not match: " + failedShard);
            }
        } else {
            // the shard is not relocating, its either started, or initializing, just cancel it and move on...
            boolean dirty = false;
            RoutingNode node = allocation.routingNodes().node(failedShard.currentNodeId());
            if (node != null) {
                for (Iterator<MutableShardRouting> it = node.iterator(); it.hasNext(); ) {
                    MutableShardRouting shardRouting = it.next();
                    if (shardRouting.equals(failedShard)) {
                        dirty = true;
                        // make sure we ignore this shard on the relevant node
                        allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());

                        it.remove();

                        // move all the shards matching the failed shard to the end of the unassigned list
                        // so we give a chance for other allocations and won't create poison failed allocations
                        // that can keep other shards from being allocated (because of limits applied on how many
                        // shards we can start per node)
                        List<MutableShardRouting> shardsToMove = Lists.newArrayList();
                        for (Iterator<MutableShardRouting> unassignedIt = allocation.routingNodes().unassigned().iterator(); unassignedIt.hasNext(); ) {
                            MutableShardRouting unassignedShardRouting = unassignedIt.next();
                            if (unassignedShardRouting.shardId().equals(failedShard.shardId())) {
                                unassignedIt.remove();
                                shardsToMove.add(unassignedShardRouting);
                            }
                        }
                        if (!shardsToMove.isEmpty()) {
                            allocation.routingNodes().unassigned().addAll(shardsToMove);
                        }

                        allocation.routingNodes().unassigned().add(new MutableShardRouting(failedShard.index(), failedShard.id(), null, failedShard.primary(), ShardRoutingState.UNASSIGNED, failedShard.version() + 1));

                        break;
                    }
                }
            }
            return dirty;
        }
    }
}