/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.cluster.routing; import org.elasticsearch.cluster.ClusterChangedEvent; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterStateListener; import org.elasticsearch.cluster.ClusterStateUpdateTask; import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.component.AbstractLifecycleComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.AbstractRunnable; import org.elasticsearch.common.util.concurrent.FutureUtils; import org.elasticsearch.threadpool.ThreadPool; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; /** * The {@link DelayedAllocationService} listens to cluster state changes and checks * if there are unassigned shards with delayed allocation (unassigned shards that have * the delay marker). These are shards that have become unassigned due to a node leaving * and which were assigned the delay marker based on the index delay setting * {@link UnassignedInfo#INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING} * (see {@link AllocationService#deassociateDeadNodes(RoutingAllocation)}). * This class is responsible for choosing the next (closest) delay expiration of a * delayed shard to schedule a reroute to remove the delay marker. * The actual removal of the delay marker happens in * {@link AllocationService#removeDelayMarkers(RoutingAllocation)}, triggering yet * another cluster change event. */ public class DelayedAllocationService extends AbstractLifecycleComponent implements ClusterStateListener { static final String CLUSTER_UPDATE_TASK_SOURCE = "delayed_allocation_reroute"; final ThreadPool threadPool; private final ClusterService clusterService; private final AllocationService allocationService; AtomicReference<DelayedRerouteTask> delayedRerouteTask = new AtomicReference<>(); // package private to access from tests /** * represents a delayed scheduling of the reroute action that can be cancelled. */ class DelayedRerouteTask extends ClusterStateUpdateTask { final TimeValue nextDelay; // delay until submitting the reroute command final long baseTimestampNanos; // timestamp (in nanos) upon which delay was calculated volatile ScheduledFuture future; final AtomicBoolean cancelScheduling = new AtomicBoolean(); DelayedRerouteTask(TimeValue nextDelay, long baseTimestampNanos) { this.nextDelay = nextDelay; this.baseTimestampNanos = baseTimestampNanos; } public long scheduledTimeToRunInNanos() { return baseTimestampNanos + nextDelay.nanos(); } public void cancelScheduling() { cancelScheduling.set(true); FutureUtils.cancel(future); removeIfSameTask(this); } public void schedule() { future = threadPool.schedule(nextDelay, ThreadPool.Names.SAME, new AbstractRunnable() { @Override protected void doRun() throws Exception { if (cancelScheduling.get()) { return; } clusterService.submitStateUpdateTask(CLUSTER_UPDATE_TASK_SOURCE, DelayedRerouteTask.this); } @Override public void onFailure(Exception e) { logger.warn("failed to submit schedule/execute reroute post unassigned shard", e); removeIfSameTask(DelayedRerouteTask.this); } }); } @Override public ClusterState execute(ClusterState currentState) throws Exception { removeIfSameTask(this); return allocationService.reroute(currentState, "assign delayed unassigned shards"); } @Override public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { if (oldState == newState) { // no state changed, check when we should remove the delay flag from the shards the next time. // if cluster state changed, we can leave the scheduling of the next delay up to the clusterChangedEvent // this should not be needed, but we want to be extra safe here scheduleIfNeeded(currentNanoTime(), newState); } } @Override public void onFailure(String source, Exception e) { removeIfSameTask(this); logger.warn("failed to schedule/execute reroute post unassigned shard", e); } } @Inject public DelayedAllocationService(Settings settings, ThreadPool threadPool, ClusterService clusterService, AllocationService allocationService) { super(settings); this.threadPool = threadPool; this.clusterService = clusterService; this.allocationService = allocationService; clusterService.addListener(this); } @Override protected void doStart() { } @Override protected void doStop() { } @Override protected void doClose() { clusterService.removeListener(this); removeTaskAndCancel(); } /** override this to control time based decisions during delayed allocation */ protected long currentNanoTime() { return System.nanoTime(); } @Override public void clusterChanged(ClusterChangedEvent event) { long currentNanoTime = currentNanoTime(); if (event.state().nodes().isLocalNodeElectedMaster()) { scheduleIfNeeded(currentNanoTime, event.state()); } } private void removeTaskAndCancel() { DelayedRerouteTask existingTask = delayedRerouteTask.getAndSet(null); if (existingTask != null) { logger.trace("cancelling existing delayed reroute task"); existingTask.cancelScheduling(); } } private void removeIfSameTask(DelayedRerouteTask expectedTask) { delayedRerouteTask.compareAndSet(expectedTask, null); } /** * Figure out if an existing scheduled reroute is good enough or whether we need to cancel and reschedule. */ private synchronized void scheduleIfNeeded(long currentNanoTime, ClusterState state) { assertClusterOrMasterStateThread(); long nextDelayNanos = UnassignedInfo.findNextDelayedAllocation(currentNanoTime, state); if (nextDelayNanos < 0) { logger.trace("no need to schedule reroute - no delayed unassigned shards"); removeTaskAndCancel(); } else { TimeValue nextDelay = TimeValue.timeValueNanos(nextDelayNanos); final boolean earlierRerouteNeeded; DelayedRerouteTask existingTask = delayedRerouteTask.get(); DelayedRerouteTask newTask = new DelayedRerouteTask(nextDelay, currentNanoTime); if (existingTask == null) { earlierRerouteNeeded = true; } else if (newTask.scheduledTimeToRunInNanos() < existingTask.scheduledTimeToRunInNanos()) { // we need an earlier delayed reroute logger.trace("cancelling existing delayed reroute task as delayed reroute has to happen [{}] earlier", TimeValue.timeValueNanos(existingTask.scheduledTimeToRunInNanos() - newTask.scheduledTimeToRunInNanos())); existingTask.cancelScheduling(); earlierRerouteNeeded = true; } else { earlierRerouteNeeded = false; } if (earlierRerouteNeeded) { logger.info("scheduling reroute for delayed shards in [{}] ({} delayed shards)", nextDelay, UnassignedInfo.getNumberOfDelayedUnassigned(state)); DelayedRerouteTask currentTask = delayedRerouteTask.getAndSet(newTask); assert existingTask == currentTask || currentTask == null; newTask.schedule(); } else { logger.trace("no need to reschedule delayed reroute - currently scheduled delayed reroute in [{}] is enough", nextDelay); } } } // protected so that it can be overridden (and disabled) by unit tests protected void assertClusterOrMasterStateThread() { assert ClusterService.assertClusterOrMasterStateThread(); } }