/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.routing.allocation;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.RecoverySource;
import org.elasticsearch.cluster.routing.RoutingChangesObserver;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.UnassignedInfo;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Observer that tracks changes made to RoutingNodes in order to update the primary terms and in-sync allocation ids in
* {@link IndexMetaData} once the allocation round has completed.
*
* Primary terms are updated on primary initialization or when an active primary fails.
*
* Allocation ids are added for shards that become active and removed for shards that stop being active.
*/
public class IndexMetaDataUpdater extends RoutingChangesObserver.AbstractRoutingChangesObserver {
private final Map<ShardId, Updates> shardChanges = new HashMap<>();
@Override
public void shardInitialized(ShardRouting unassignedShard, ShardRouting initializedShard) {
assert initializedShard.isRelocationTarget() == false : "shardInitialized is not called on relocation target: " + initializedShard;
if (initializedShard.primary()) {
increasePrimaryTerm(initializedShard.shardId());
Updates updates = changes(initializedShard.shardId());
assert updates.initializedPrimary == null : "Primary cannot be initialized more than once in same allocation round: " +
"(previous: " + updates.initializedPrimary + ", next: " + initializedShard + ")";
updates.initializedPrimary = initializedShard;
}
}
@Override
public void shardStarted(ShardRouting initializingShard, ShardRouting startedShard) {
addAllocationId(startedShard);
}
@Override
public void shardFailed(ShardRouting failedShard, UnassignedInfo unassignedInfo) {
if (failedShard.active() && unassignedInfo.getReason() != UnassignedInfo.Reason.NODE_LEFT) {
removeAllocationId(failedShard);
if (failedShard.primary()) {
Updates updates = changes(failedShard.shardId());
if (updates.firstFailedPrimary == null) {
// more than one primary can be failed (because of batching, primary can be failed, replica promoted and then failed...)
updates.firstFailedPrimary = failedShard;
}
}
}
if (failedShard.active() && failedShard.primary()) {
increasePrimaryTerm(failedShard.shardId());
}
}
@Override
public void relocationCompleted(ShardRouting removedRelocationSource) {
removeAllocationId(removedRelocationSource);
}
/**
* Updates the current {@link MetaData} based on the changes of this RoutingChangesObserver. Specifically
* we update {@link IndexMetaData#getInSyncAllocationIds()} and {@link IndexMetaData#primaryTerm(int)} based on
* the changes made during this allocation.
*
* @param oldMetaData {@link MetaData} object from before the routing nodes was changed.
* @param newRoutingTable {@link RoutingTable} object after routing changes were applied.
* @return adapted {@link MetaData}, potentially the original one if no change was needed.
*/
public MetaData applyChanges(MetaData oldMetaData, RoutingTable newRoutingTable) {
Map<Index, List<Map.Entry<ShardId, Updates>>> changesGroupedByIndex =
shardChanges.entrySet().stream().collect(Collectors.groupingBy(e -> e.getKey().getIndex()));
MetaData.Builder metaDataBuilder = null;
for (Map.Entry<Index, List<Map.Entry<ShardId, Updates>>> indexChanges : changesGroupedByIndex.entrySet()) {
Index index = indexChanges.getKey();
final IndexMetaData oldIndexMetaData = oldMetaData.getIndexSafe(index);
IndexMetaData.Builder indexMetaDataBuilder = null;
for (Map.Entry<ShardId, Updates> shardEntry : indexChanges.getValue()) {
ShardId shardId = shardEntry.getKey();
Updates updates = shardEntry.getValue();
indexMetaDataBuilder = updateInSyncAllocations(newRoutingTable, oldIndexMetaData, indexMetaDataBuilder, shardId, updates);
indexMetaDataBuilder = updatePrimaryTerm(oldIndexMetaData, indexMetaDataBuilder, shardId, updates);
}
if (indexMetaDataBuilder != null) {
if (metaDataBuilder == null) {
metaDataBuilder = MetaData.builder(oldMetaData);
}
metaDataBuilder.put(indexMetaDataBuilder);
}
}
if (metaDataBuilder != null) {
return metaDataBuilder.build();
} else {
return oldMetaData;
}
}
/**
* Updates in-sync allocations with routing changes that were made to the routing table.
*/
private IndexMetaData.Builder updateInSyncAllocations(RoutingTable newRoutingTable, IndexMetaData oldIndexMetaData,
IndexMetaData.Builder indexMetaDataBuilder, ShardId shardId, Updates updates) {
assert Sets.haveEmptyIntersection(updates.addedAllocationIds, updates.removedAllocationIds) :
"allocation ids cannot be both added and removed in the same allocation round, added ids: " +
updates.addedAllocationIds + ", removed ids: " + updates.removedAllocationIds;
Set<String> oldInSyncAllocationIds = oldIndexMetaData.inSyncAllocationIds(shardId.id());
// check if we have been force-initializing an empty primary or a stale primary
if (updates.initializedPrimary != null && oldInSyncAllocationIds.isEmpty() == false &&
oldInSyncAllocationIds.contains(updates.initializedPrimary.allocationId().getId()) == false) {
// we're not reusing an existing in-sync allocation id to initialize a primary, which means that we're either force-allocating
// an empty or a stale primary (see AllocateEmptyPrimaryAllocationCommand or AllocateStalePrimaryAllocationCommand).
RecoverySource.Type recoverySourceType = updates.initializedPrimary.recoverySource().getType();
boolean emptyPrimary = recoverySourceType == RecoverySource.Type.EMPTY_STORE;
assert updates.addedAllocationIds.isEmpty() : (emptyPrimary ? "empty" : "stale") +
" primary is not force-initialized in same allocation round where shards are started";
if (indexMetaDataBuilder == null) {
indexMetaDataBuilder = IndexMetaData.builder(oldIndexMetaData);
}
if (emptyPrimary) {
// forcing an empty primary resets the in-sync allocations to the empty set (ShardRouting.allocatedPostIndexCreate)
indexMetaDataBuilder.putInSyncAllocationIds(shardId.id(), Collections.emptySet());
} else {
// forcing a stale primary resets the in-sync allocations to the singleton set with the stale id
indexMetaDataBuilder.putInSyncAllocationIds(shardId.id(),
Collections.singleton(updates.initializedPrimary.allocationId().getId()));
}
} else {
// standard path for updating in-sync ids
Set<String> inSyncAllocationIds = new HashSet<>(oldInSyncAllocationIds);
inSyncAllocationIds.addAll(updates.addedAllocationIds);
inSyncAllocationIds.removeAll(updates.removedAllocationIds);
// Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
// but repeatedly shut down nodes that have active replicas.
// We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
// Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
// of replicas was decreased while shards were unassigned.
int maxActiveShards = oldIndexMetaData.getNumberOfReplicas() + 1; // +1 for the primary
IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
// trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards();
assert assignedShards.size() <= maxActiveShards :
"cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
inSyncAllocationIds = inSyncAllocationIds.stream()
.sorted(Comparator.comparing(assignedAllocations::contains).reversed()) // values with routing entries first
.limit(maxActiveShards)
.collect(Collectors.toSet());
}
// only remove allocation id of failed active primary if there is at least one active shard remaining. Assume for example that
// the primary fails but there is no new primary to fail over to. If we were to remove the allocation id of the primary from the
// in-sync set, this could create an empty primary on the next allocation.
if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
// add back allocation id of failed primary
inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
}
assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() :
"in-sync allocations cannot become empty after they have been non-empty: " + oldInSyncAllocationIds;
// be extra safe here and only update in-sync set if it is non-empty
if (inSyncAllocationIds.isEmpty() == false) {
if (indexMetaDataBuilder == null) {
indexMetaDataBuilder = IndexMetaData.builder(oldIndexMetaData);
}
indexMetaDataBuilder.putInSyncAllocationIds(shardId.id(), inSyncAllocationIds);
}
}
return indexMetaDataBuilder;
}
/**
* Removes allocation ids from the in-sync set for shard copies for which there is no routing entries in the routing table.
* This method is called in AllocationService before any changes to the routing table are made.
*/
public static ClusterState removeStaleIdsWithoutRoutings(ClusterState clusterState, List<StaleShard> staleShards) {
MetaData oldMetaData = clusterState.metaData();
RoutingTable oldRoutingTable = clusterState.routingTable();
MetaData.Builder metaDataBuilder = null;
// group staleShards entries by index
for (Map.Entry<Index, List<StaleShard>> indexEntry : staleShards.stream().collect(
Collectors.groupingBy(fs -> fs.getShardId().getIndex())).entrySet()) {
final IndexMetaData oldIndexMetaData = oldMetaData.getIndexSafe(indexEntry.getKey());
IndexMetaData.Builder indexMetaDataBuilder = null;
// group staleShards entries by shard id
for (Map.Entry<ShardId, List<StaleShard>> shardEntry : indexEntry.getValue().stream().collect(
Collectors.groupingBy(staleShard -> staleShard.getShardId())).entrySet()) {
int shardNumber = shardEntry.getKey().getId();
Set<String> oldInSyncAllocations = oldIndexMetaData.inSyncAllocationIds(shardNumber);
Set<String> idsToRemove = shardEntry.getValue().stream().map(e -> e.getAllocationId()).collect(Collectors.toSet());
assert idsToRemove.stream().allMatch(id -> oldRoutingTable.getByAllocationId(shardEntry.getKey(), id) == null) :
"removing stale ids: " + idsToRemove + ", some of which have still a routing entry: " + oldRoutingTable;
Set<String> remainingInSyncAllocations = Sets.difference(oldInSyncAllocations, idsToRemove);
assert remainingInSyncAllocations.isEmpty() == false : "Set of in-sync ids cannot become empty for shard " +
shardEntry.getKey() + " (before: " + oldInSyncAllocations + ", ids to remove: " + idsToRemove + ")";
// be extra safe here: if the in-sync set were to become empty, this would create an empty primary on the next allocation
// (see ShardRouting#allocatedPostIndexCreate)
if (remainingInSyncAllocations.isEmpty() == false) {
if (indexMetaDataBuilder == null) {
indexMetaDataBuilder = IndexMetaData.builder(oldIndexMetaData);
}
indexMetaDataBuilder.putInSyncAllocationIds(shardNumber, remainingInSyncAllocations);
}
}
if (indexMetaDataBuilder != null) {
if (metaDataBuilder == null) {
metaDataBuilder = MetaData.builder(oldMetaData);
}
metaDataBuilder.put(indexMetaDataBuilder);
}
}
if (metaDataBuilder != null) {
return ClusterState.builder(clusterState).metaData(metaDataBuilder).build();
} else {
return clusterState;
}
}
/**
* Increases the primary term if {@link #increasePrimaryTerm} was called for this shard id.
*/
private IndexMetaData.Builder updatePrimaryTerm(IndexMetaData oldIndexMetaData, IndexMetaData.Builder indexMetaDataBuilder,
ShardId shardId, Updates updates) {
if (updates.increaseTerm) {
if (indexMetaDataBuilder == null) {
indexMetaDataBuilder = IndexMetaData.builder(oldIndexMetaData);
}
indexMetaDataBuilder.primaryTerm(shardId.id(), oldIndexMetaData.primaryTerm(shardId.id()) + 1);
}
return indexMetaDataBuilder;
}
/**
* Helper method that creates update entry for the given shard id if such an entry does not exist yet.
*/
private Updates changes(ShardId shardId) {
return shardChanges.computeIfAbsent(shardId, k -> new Updates());
}
/**
* Remove allocation id of this shard from the set of in-sync shard copies
*/
private void removeAllocationId(ShardRouting shardRouting) {
changes(shardRouting.shardId()).removedAllocationIds.add(shardRouting.allocationId().getId());
}
/**
* Add allocation id of this shard to the set of in-sync shard copies
*/
private void addAllocationId(ShardRouting shardRouting) {
changes(shardRouting.shardId()).addedAllocationIds.add(shardRouting.allocationId().getId());
}
/**
* Increase primary term for this shard id
*/
private void increasePrimaryTerm(ShardId shardId) {
changes(shardId).increaseTerm = true;
}
private static class Updates {
private boolean increaseTerm; // whether primary term should be increased
private Set<String> addedAllocationIds = new HashSet<>(); // allocation ids that should be added to the in-sync set
private Set<String> removedAllocationIds = new HashSet<>(); // allocation ids that should be removed from the in-sync set
private ShardRouting initializedPrimary = null; // primary that was initialized from unassigned
private ShardRouting firstFailedPrimary = null; // first active primary that was failed
}
}