/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.index.seqno; import com.carrotsearch.hppc.ObjectLongHashMap; import com.carrotsearch.hppc.ObjectLongMap; import com.carrotsearch.hppc.cursors.ObjectLongCursor; import org.elasticsearch.common.SuppressForbidden; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.shard.AbstractIndexShardComponent; import org.elasticsearch.index.shard.ShardId; import java.util.HashSet; import java.util.Locale; import java.util.Set; /** * This class is responsible of tracking the global checkpoint. The global checkpoint is the highest sequence number for which all lower (or * equal) sequence number have been processed on all shards that are currently active. Since shards count as "active" when the master starts * them, and before this primary shard has been notified of this fact, we also include shards that have completed recovery. These shards * have received all old operations via the recovery mechanism and are kept up to date by the various replications actions. The set of * shards that are taken into account for the global checkpoint calculation are called the "in-sync shards". * <p> * The global checkpoint is maintained by the primary shard and is replicated to all the replicas (via {@link GlobalCheckpointSyncAction}). */ public class GlobalCheckpointTracker extends AbstractIndexShardComponent { /* * This map holds the last known local checkpoint for every active shard and initializing shard copies that has been brought up to speed * through recovery. These shards are treated as valid copies and participate in determining the global checkpoint. This map is keyed by * allocation IDs. All accesses to this set are guarded by a lock on this. */ final ObjectLongMap<String> inSyncLocalCheckpoints; /* * This map holds the last known local checkpoint for initializing shards that are undergoing recovery. Such shards do not participate * in determining the global checkpoint. We must track these local checkpoints so that when a shard is activated we use the highest * known checkpoint. */ final ObjectLongMap<String> trackingLocalCheckpoints; /* * This set contains allocation IDs for which there is a thread actively waiting for the local checkpoint to advance to at least the * current global checkpoint. */ final Set<String> pendingInSync; /* * The current global checkpoint for this shard. Note that this field is guarded by a lock on this and thus this field does not need to * be volatile. */ private long globalCheckpoint; /** * Initialize the global checkpoint service. The specified global checkpoint should be set to the last known global checkpoint, or * {@link SequenceNumbersService#UNASSIGNED_SEQ_NO}. * * @param shardId the shard ID * @param indexSettings the index settings * @param globalCheckpoint the last known global checkpoint for this shard, or {@link SequenceNumbersService#UNASSIGNED_SEQ_NO} */ GlobalCheckpointTracker(final ShardId shardId, final IndexSettings indexSettings, final long globalCheckpoint) { super(shardId, indexSettings); assert globalCheckpoint >= SequenceNumbersService.UNASSIGNED_SEQ_NO : "illegal initial global checkpoint: " + globalCheckpoint; this.inSyncLocalCheckpoints = new ObjectLongHashMap<>(1 + indexSettings.getNumberOfReplicas()); this.trackingLocalCheckpoints = new ObjectLongHashMap<>(indexSettings.getNumberOfReplicas()); this.globalCheckpoint = globalCheckpoint; this.pendingInSync = new HashSet<>(); } /** * Notifies the service to update the local checkpoint for the shard with the provided allocation ID. If the checkpoint is lower than * the currently known one, this is a no-op. If the allocation ID is not tracked, it is ignored. This is to prevent late arrivals from * shards that are removed to be re-added. * * @param allocationId the allocation ID of the shard to update the local checkpoint for * @param localCheckpoint the local checkpoint for the shard */ public synchronized void updateLocalCheckpoint(final String allocationId, final long localCheckpoint) { final boolean updated; if (updateLocalCheckpoint(allocationId, localCheckpoint, inSyncLocalCheckpoints, "in-sync")) { updated = true; updateGlobalCheckpointOnPrimary(); } else if (updateLocalCheckpoint(allocationId, localCheckpoint, trackingLocalCheckpoints, "tracking")) { updated = true; } else { logger.trace("ignored local checkpoint [{}] of [{}], allocation ID is not tracked", localCheckpoint, allocationId); updated = false; } if (updated) { notifyAllWaiters(); } } /** * Notify all threads waiting on the monitor on this tracker. These threads should be waiting for the local checkpoint on a specific * allocation ID to catch up to the global checkpoint. */ @SuppressForbidden(reason = "Object#notifyAll waiters for local checkpoint advancement") private synchronized void notifyAllWaiters() { this.notifyAll(); } /** * Update the local checkpoint for the specified allocation ID in the specified tracking map. If the checkpoint is lower than the * currently known one, this is a no-op. If the allocation ID is not tracked, it is ignored. * * @param allocationId the allocation ID of the shard to update the local checkpoint for * @param localCheckpoint the local checkpoint for the shard * @param map the tracking map * @param reason the reason for the update (used for logging) * @return {@code true} if the local checkpoint was updated, otherwise {@code false} if this was a no-op */ private boolean updateLocalCheckpoint( final String allocationId, final long localCheckpoint, ObjectLongMap<String> map, final String reason) { final int index = map.indexOf(allocationId); if (index >= 0) { final long current = map.indexGet(index); if (current < localCheckpoint) { map.indexReplace(index, localCheckpoint); logger.trace("updated local checkpoint of [{}] in [{}] from [{}] to [{}]", allocationId, reason, current, localCheckpoint); } else { logger.trace( "skipped updating local checkpoint of [{}] in [{}] from [{}] to [{}], current checkpoint is higher", allocationId, reason, current, localCheckpoint); } return true; } else { return false; } } /** * Scans through the currently known local checkpoint and updates the global checkpoint accordingly. */ private synchronized void updateGlobalCheckpointOnPrimary() { long minLocalCheckpoint = Long.MAX_VALUE; if (inSyncLocalCheckpoints.isEmpty() || !pendingInSync.isEmpty()) { return; } for (final ObjectLongCursor<String> localCheckpoint : inSyncLocalCheckpoints) { if (localCheckpoint.value == SequenceNumbersService.UNASSIGNED_SEQ_NO) { logger.trace("unknown local checkpoint for active allocation ID [{}], requesting a sync", localCheckpoint.key); return; } minLocalCheckpoint = Math.min(localCheckpoint.value, minLocalCheckpoint); } assert minLocalCheckpoint != SequenceNumbersService.UNASSIGNED_SEQ_NO : "new global checkpoint must be assigned"; if (minLocalCheckpoint < globalCheckpoint) { final String message = String.format( Locale.ROOT, "new global checkpoint [%d] is lower than previous one [%d]", minLocalCheckpoint, globalCheckpoint); throw new IllegalStateException(message); } if (globalCheckpoint != minLocalCheckpoint) { logger.trace("global checkpoint updated to [{}]", minLocalCheckpoint); globalCheckpoint = minLocalCheckpoint; } } /** * Returns the global checkpoint for the shard. * * @return the global checkpoint */ public synchronized long getGlobalCheckpoint() { return globalCheckpoint; } /** * Updates the global checkpoint on a replica shard after it has been updated by the primary. * * @param globalCheckpoint the global checkpoint */ synchronized void updateGlobalCheckpointOnReplica(final long globalCheckpoint) { /* * The global checkpoint here is a local knowledge which is updated under the mandate of the primary. It can happen that the primary * information is lagging compared to a replica (e.g., if a replica is promoted to primary but has stale info relative to other * replica shards). In these cases, the local knowledge of the global checkpoint could be higher than sync from the lagging primary. */ if (this.globalCheckpoint <= globalCheckpoint) { this.globalCheckpoint = globalCheckpoint; logger.trace("global checkpoint updated from primary to [{}]", globalCheckpoint); } } /** * Notifies the service of the current allocation ids in the cluster state. This method trims any shards that have been removed. * * @param activeAllocationIds the allocation IDs of the currently active shard copies * @param initializingAllocationIds the allocation IDs of the currently initializing shard copies */ public synchronized void updateAllocationIdsFromMaster( final Set<String> activeAllocationIds, final Set<String> initializingAllocationIds) { // remove shards whose allocation ID no longer exists inSyncLocalCheckpoints.removeAll(a -> !activeAllocationIds.contains(a) && !initializingAllocationIds.contains(a)); // add any new active allocation IDs for (final String a : activeAllocationIds) { if (!inSyncLocalCheckpoints.containsKey(a)) { final long localCheckpoint = trackingLocalCheckpoints.getOrDefault(a, SequenceNumbersService.UNASSIGNED_SEQ_NO); inSyncLocalCheckpoints.put(a, localCheckpoint); logger.trace("marked [{}] as in-sync with local checkpoint [{}] via cluster state update from master", a, localCheckpoint); } } trackingLocalCheckpoints.removeAll(a -> !initializingAllocationIds.contains(a)); for (final String a : initializingAllocationIds) { if (inSyncLocalCheckpoints.containsKey(a)) { /* * This can happen if we mark the allocation ID as in sync at the end of recovery before seeing a cluster state update from * marking the shard as active. */ continue; } if (trackingLocalCheckpoints.containsKey(a)) { // we are already tracking this allocation ID continue; } // this is a new allocation ID trackingLocalCheckpoints.put(a, SequenceNumbersService.UNASSIGNED_SEQ_NO); logger.trace("tracking [{}] via cluster state update from master", a); } updateGlobalCheckpointOnPrimary(); } /** * Marks the shard with the provided allocation ID as in-sync with the primary shard. This method will block until the local checkpoint * on the specified shard advances above the current global checkpoint. * * @param allocationId the allocation ID of the shard to mark as in-sync * @param localCheckpoint the current local checkpoint on the shard * * @throws InterruptedException if the thread is interrupted waiting for the local checkpoint on the shard to advance */ public synchronized void markAllocationIdAsInSync(final String allocationId, final long localCheckpoint) throws InterruptedException { if (!trackingLocalCheckpoints.containsKey(allocationId)) { /* * This can happen if the recovery target has been failed and the cluster state update from the master has triggered removing * this allocation ID from the tracking map but this recovery thread has not yet been made aware that the recovery is * cancelled. */ return; } updateLocalCheckpoint(allocationId, localCheckpoint, trackingLocalCheckpoints, "tracking"); if (!pendingInSync.add(allocationId)) { throw new IllegalStateException("there is already a pending sync in progress for allocation ID [" + allocationId + "]"); } try { waitForAllocationIdToBeInSync(allocationId); } finally { pendingInSync.remove(allocationId); updateGlobalCheckpointOnPrimary(); } } /** * Wait for knowledge of the local checkpoint for the specified allocation ID to advance to the global checkpoint. Global checkpoint * advancement is blocked while there are any allocation IDs waiting to catch up to the global checkpoint. * * @param allocationId the allocation ID * @throws InterruptedException if this thread was interrupted before of during waiting */ private synchronized void waitForAllocationIdToBeInSync(final String allocationId) throws InterruptedException { while (true) { /* * If the allocation has been cancelled and so removed from the tracking map from a cluster state update from the master it * means that this recovery will be cancelled; we are here on a cancellable recovery thread and so this thread will throw an * interrupted exception as soon as it tries to wait on the monitor. */ final long current = trackingLocalCheckpoints.getOrDefault(allocationId, Long.MIN_VALUE); if (current >= globalCheckpoint) { logger.trace("marked [{}] as in-sync with local checkpoint [{}]", allocationId, current); trackingLocalCheckpoints.remove(allocationId); /* * This is prematurely adding the allocation ID to the in-sync map as at this point recovery is not yet finished and could * still abort. At this point we will end up with a shard in the in-sync map holding back the global checkpoint because the * shard never recovered and we would have to wait until either the recovery retries and completes successfully, or the * master fails the shard and issues a cluster state update that removes the shard from the set of active allocation IDs. */ inSyncLocalCheckpoints.put(allocationId, current); break; } else { waitForLocalCheckpointToAdvance(); } } } /** * Wait for the local checkpoint to advance to the global checkpoint. * * @throws InterruptedException if this thread was interrupted before of during waiting */ @SuppressForbidden(reason = "Object#wait for local checkpoint advancement") private synchronized void waitForLocalCheckpointToAdvance() throws InterruptedException { this.wait(); } /** * Check if there are any recoveries pending in-sync. * * @return {@code true} if there is at least one shard pending in-sync, otherwise false */ public boolean pendingInSync() { return !pendingInSync.isEmpty(); } /** * Returns the local checkpoint for the shard with the specified allocation ID, or {@link SequenceNumbersService#UNASSIGNED_SEQ_NO} if * the shard is not in-sync. * * @param allocationId the allocation ID of the shard to obtain the local checkpoint for * @return the local checkpoint, or {@link SequenceNumbersService#UNASSIGNED_SEQ_NO} */ synchronized long getLocalCheckpointForAllocationId(final String allocationId) { if (inSyncLocalCheckpoints.containsKey(allocationId)) { return inSyncLocalCheckpoints.get(allocationId); } return SequenceNumbersService.UNASSIGNED_SEQ_NO; } }