/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.cluster.service; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.logging.log4j.util.Supplier; import org.elasticsearch.cluster.ClusterChangedEvent; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterStateApplier; import org.elasticsearch.cluster.ClusterStateListener; import org.elasticsearch.cluster.ClusterStateObserver; import org.elasticsearch.cluster.ClusterStateTaskConfig; import org.elasticsearch.cluster.ClusterStateTaskListener; import org.elasticsearch.cluster.LocalNodeMasterListener; import org.elasticsearch.cluster.NodeConnectionsService; import org.elasticsearch.cluster.TimeoutClusterStateListener; import org.elasticsearch.cluster.metadata.ProcessClusterEventTimeoutException; import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.Priority; import org.elasticsearch.common.component.AbstractLifecycleComponent; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.EsExecutors; import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException; import org.elasticsearch.common.util.concurrent.FutureUtils; import org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor; import org.elasticsearch.common.util.iterable.Iterables; import org.elasticsearch.threadpool.ThreadPool; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Objects; import java.util.Queue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Stream; import static org.elasticsearch.cluster.service.ClusterService.CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING; import static org.elasticsearch.common.util.concurrent.EsExecutors.daemonThreadFactory; public class ClusterApplierService extends AbstractLifecycleComponent implements ClusterApplier { public static final String CLUSTER_UPDATE_THREAD_NAME = "clusterApplierService#updateTask"; private final ClusterSettings clusterSettings; protected final ThreadPool threadPool; private volatile TimeValue slowTaskLoggingThreshold; private volatile PrioritizedEsThreadPoolExecutor threadPoolExecutor; /** * Those 3 state listeners are changing infrequently - CopyOnWriteArrayList is just fine */ private final Collection<ClusterStateApplier> highPriorityStateAppliers = new CopyOnWriteArrayList<>(); private final Collection<ClusterStateApplier> normalPriorityStateAppliers = new CopyOnWriteArrayList<>(); private final Collection<ClusterStateApplier> lowPriorityStateAppliers = new CopyOnWriteArrayList<>(); private final Iterable<ClusterStateApplier> clusterStateAppliers = Iterables.concat(highPriorityStateAppliers, normalPriorityStateAppliers, lowPriorityStateAppliers); private final Collection<ClusterStateListener> clusterStateListeners = new CopyOnWriteArrayList<>(); private final Collection<TimeoutClusterStateListener> timeoutClusterStateListeners = Collections.newSetFromMap(new ConcurrentHashMap<TimeoutClusterStateListener, Boolean>()); private final LocalNodeMasterListeners localNodeMasterListeners; private final Queue<NotifyTimeout> onGoingTimeouts = ConcurrentCollections.newQueue(); private final AtomicReference<ClusterState> state; // last applied state private NodeConnectionsService nodeConnectionsService; public ClusterApplierService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) { super(settings); this.clusterSettings = clusterSettings; this.threadPool = threadPool; this.state = new AtomicReference<>(); this.slowTaskLoggingThreshold = CLUSTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING.get(settings); this.localNodeMasterListeners = new LocalNodeMasterListeners(threadPool); } public void setSlowTaskLoggingThreshold(TimeValue slowTaskLoggingThreshold) { this.slowTaskLoggingThreshold = slowTaskLoggingThreshold; } public synchronized void setNodeConnectionsService(NodeConnectionsService nodeConnectionsService) { assert this.nodeConnectionsService == null : "nodeConnectionsService is already set"; this.nodeConnectionsService = nodeConnectionsService; } @Override public void setInitialState(ClusterState initialState) { if (lifecycle.started()) { throw new IllegalStateException("can't set initial state when started"); } assert state.get() == null : "state is already set"; state.set(initialState); } @Override protected synchronized void doStart() { Objects.requireNonNull(nodeConnectionsService, "please set the node connection service before starting"); Objects.requireNonNull(state.get(), "please set initial state before starting"); addListener(localNodeMasterListeners); threadPoolExecutor = EsExecutors.newSinglePrioritizing(CLUSTER_UPDATE_THREAD_NAME, daemonThreadFactory(settings, CLUSTER_UPDATE_THREAD_NAME), threadPool.getThreadContext(), threadPool.scheduler()); } class UpdateTask extends SourcePrioritizedRunnable implements Function<ClusterState, ClusterState> { final ClusterStateTaskListener listener; final Function<ClusterState, ClusterState> updateFunction; UpdateTask(Priority priority, String source, ClusterStateTaskListener listener, Function<ClusterState, ClusterState> updateFunction) { super(priority, source); this.listener = listener; this.updateFunction = updateFunction; } @Override public ClusterState apply(ClusterState clusterState) { return updateFunction.apply(clusterState); } @Override public void run() { runTask(this); } } @Override protected synchronized void doStop() { for (NotifyTimeout onGoingTimeout : onGoingTimeouts) { onGoingTimeout.cancel(); try { onGoingTimeout.cancel(); onGoingTimeout.listener.onClose(); } catch (Exception ex) { logger.debug("failed to notify listeners on shutdown", ex); } } ThreadPool.terminate(threadPoolExecutor, 10, TimeUnit.SECONDS); // close timeout listeners that did not have an ongoing timeout timeoutClusterStateListeners.forEach(TimeoutClusterStateListener::onClose); removeListener(localNodeMasterListeners); } @Override protected synchronized void doClose() { } /** * The current cluster state. * Should be renamed to appliedClusterState */ public ClusterState state() { assert assertNotCalledFromClusterStateApplier("the applied cluster state is not yet available"); ClusterState clusterState = this.state.get(); assert clusterState != null : "initial cluster state not set yet"; return clusterState; } /** * Adds a high priority applier of updated cluster states. */ public void addHighPriorityApplier(ClusterStateApplier applier) { highPriorityStateAppliers.add(applier); } /** * Adds an applier which will be called after all high priority and normal appliers have been called. */ public void addLowPriorityApplier(ClusterStateApplier applier) { lowPriorityStateAppliers.add(applier); } /** * Adds a applier of updated cluster states. */ public void addStateApplier(ClusterStateApplier applier) { normalPriorityStateAppliers.add(applier); } /** * Removes an applier of updated cluster states. */ public void removeApplier(ClusterStateApplier applier) { normalPriorityStateAppliers.remove(applier); highPriorityStateAppliers.remove(applier); lowPriorityStateAppliers.remove(applier); } /** * Add a listener for updated cluster states */ public void addListener(ClusterStateListener listener) { clusterStateListeners.add(listener); } /** * Removes a listener for updated cluster states. */ public void removeListener(ClusterStateListener listener) { clusterStateListeners.remove(listener); } /** * Removes a timeout listener for updated cluster states. */ public void removeTimeoutListener(TimeoutClusterStateListener listener) { timeoutClusterStateListeners.remove(listener); for (Iterator<NotifyTimeout> it = onGoingTimeouts.iterator(); it.hasNext(); ) { NotifyTimeout timeout = it.next(); if (timeout.listener.equals(listener)) { timeout.cancel(); it.remove(); } } } /** * Add a listener for on/off local node master events */ public void addLocalNodeMasterListener(LocalNodeMasterListener listener) { localNodeMasterListeners.add(listener); } /** * Remove the given listener for on/off local master events */ public void removeLocalNodeMasterListener(LocalNodeMasterListener listener) { localNodeMasterListeners.remove(listener); } /** * Adds a cluster state listener that is expected to be removed during a short period of time. * If provided, the listener will be notified once a specific time has elapsed. * * NOTE: the listener is not removed on timeout. This is the responsibility of the caller. */ public void addTimeoutListener(@Nullable final TimeValue timeout, final TimeoutClusterStateListener listener) { if (lifecycle.stoppedOrClosed()) { listener.onClose(); return; } // call the post added notification on the same event thread try { threadPoolExecutor.execute(new SourcePrioritizedRunnable(Priority.HIGH, "_add_listener_") { @Override public void run() { if (timeout != null) { NotifyTimeout notifyTimeout = new NotifyTimeout(listener, timeout); notifyTimeout.future = threadPool.schedule(timeout, ThreadPool.Names.GENERIC, notifyTimeout); onGoingTimeouts.add(notifyTimeout); } timeoutClusterStateListeners.add(listener); listener.postAdded(); } }); } catch (EsRejectedExecutionException e) { if (lifecycle.stoppedOrClosed()) { listener.onClose(); } else { throw e; } } } public void runOnApplierThread(final String source, Consumer<ClusterState> clusterStateConsumer, final ClusterStateTaskListener listener, Priority priority) { submitStateUpdateTask(source, ClusterStateTaskConfig.build(priority), (clusterState) -> { clusterStateConsumer.accept(clusterState); return clusterState; }, listener); } public void runOnApplierThread(final String source, Consumer<ClusterState> clusterStateConsumer, final ClusterStateTaskListener listener) { runOnApplierThread(source, clusterStateConsumer, listener, Priority.HIGH); } @Override public void onNewClusterState(final String source, final java.util.function.Supplier<ClusterState> clusterStateSupplier, final ClusterStateTaskListener listener) { Function<ClusterState, ClusterState> applyFunction = currentState -> { ClusterState nextState = clusterStateSupplier.get(); if (nextState != null) { return nextState; } else { return currentState; } }; submitStateUpdateTask(source, ClusterStateTaskConfig.build(Priority.HIGH), applyFunction, listener); } private void submitStateUpdateTask(final String source, final ClusterStateTaskConfig config, final Function<ClusterState, ClusterState> executor, final ClusterStateTaskListener listener) { if (!lifecycle.started()) { return; } try { UpdateTask updateTask = new UpdateTask(config.priority(), source, new SafeClusterStateTaskListener(listener, logger), executor); if (config.timeout() != null) { threadPoolExecutor.execute(updateTask, config.timeout(), () -> threadPool.generic().execute( () -> listener.onFailure(source, new ProcessClusterEventTimeoutException(config.timeout(), source)))); } else { threadPoolExecutor.execute(updateTask); } } catch (EsRejectedExecutionException e) { // ignore cases where we are shutting down..., there is really nothing interesting // to be done here... if (!lifecycle.stoppedOrClosed()) { throw e; } } } /** asserts that the current thread is the cluster state update thread */ public static boolean assertClusterStateUpdateThread() { assert Thread.currentThread().getName().contains(ClusterApplierService.CLUSTER_UPDATE_THREAD_NAME) : "not called from the cluster state update thread"; return true; } /** asserts that the current thread is <b>NOT</b> the cluster state update thread */ public static boolean assertNotClusterStateUpdateThread(String reason) { assert Thread.currentThread().getName().contains(CLUSTER_UPDATE_THREAD_NAME) == false : "Expected current thread [" + Thread.currentThread() + "] to not be the cluster state update thread. Reason: [" + reason + "]"; return true; } /** asserts that the current stack trace does <b>NOT</b> involve a cluster state applier */ private static boolean assertNotCalledFromClusterStateApplier(String reason) { if (Thread.currentThread().getName().contains(CLUSTER_UPDATE_THREAD_NAME)) { for (StackTraceElement element : Thread.currentThread().getStackTrace()) { final String className = element.getClassName(); final String methodName = element.getMethodName(); if (className.equals(ClusterStateObserver.class.getName())) { // people may start an observer from an applier return true; } else if (className.equals(ClusterApplierService.class.getName()) && methodName.equals("callClusterStateAppliers")) { throw new AssertionError("should not be called by a cluster state applier. reason [" + reason + "]"); } } } return true; } protected void runTask(UpdateTask task) { if (!lifecycle.started()) { logger.debug("processing [{}]: ignoring, cluster applier service not started", task.source); return; } logger.debug("processing [{}]: execute", task.source); final ClusterState previousClusterState = state.get(); long startTimeNS = currentTimeInNanos(); final ClusterState newClusterState; try { newClusterState = task.apply(previousClusterState); } catch (Exception e) { TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); if (logger.isTraceEnabled()) { logger.trace( (Supplier<?>) () -> new ParameterizedMessage( "failed to execute cluster state applier in [{}], state:\nversion [{}], source [{}]\n{}{}{}", executionTime, previousClusterState.version(), task.source, previousClusterState.nodes(), previousClusterState.routingTable(), previousClusterState.getRoutingNodes()), e); } warnAboutSlowTaskIfNeeded(executionTime, task.source); task.listener.onFailure(task.source, e); return; } if (previousClusterState == newClusterState) { task.listener.clusterStateProcessed(task.source, newClusterState, newClusterState); TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); logger.debug("processing [{}]: took [{}] no change in cluster state", task.source, executionTime); warnAboutSlowTaskIfNeeded(executionTime, task.source); } else { if (logger.isTraceEnabled()) { logger.trace("cluster state updated, source [{}]\n{}", task.source, newClusterState); } else if (logger.isDebugEnabled()) { logger.debug("cluster state updated, version [{}], source [{}]", newClusterState.version(), task.source); } try { applyChanges(task, previousClusterState, newClusterState); TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); logger.debug("processing [{}]: took [{}] done applying updated cluster state (version: {}, uuid: {})", task.source, executionTime, newClusterState.version(), newClusterState.stateUUID()); warnAboutSlowTaskIfNeeded(executionTime, task.source); } catch (Exception e) { TimeValue executionTime = TimeValue.timeValueMillis(Math.max(0, TimeValue.nsecToMSec(currentTimeInNanos() - startTimeNS))); final long version = newClusterState.version(); final String stateUUID = newClusterState.stateUUID(); final String fullState = newClusterState.toString(); logger.warn( (Supplier<?>) () -> new ParameterizedMessage( "failed to apply updated cluster state in [{}]:\nversion [{}], uuid [{}], source [{}]\n{}", executionTime, version, stateUUID, task.source, fullState), e); // TODO: do we want to call updateTask.onFailure here? } } } private void applyChanges(UpdateTask task, ClusterState previousClusterState, ClusterState newClusterState) { ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(task.source, newClusterState, previousClusterState); // new cluster state, notify all listeners final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta(); if (nodesDelta.hasChanges() && logger.isInfoEnabled()) { String summary = nodesDelta.shortSummary(); if (summary.length() > 0) { logger.info("{}, reason: {}", summary, task.source); } } nodeConnectionsService.connectToNodes(newClusterState.nodes()); logger.debug("applying cluster state version {}", newClusterState.version()); try { // nothing to do until we actually recover from the gateway or any other block indicates we need to disable persistency if (clusterChangedEvent.state().blocks().disableStatePersistence() == false && clusterChangedEvent.metaDataChanged()) { final Settings incomingSettings = clusterChangedEvent.state().metaData().settings(); clusterSettings.applySettings(incomingSettings); } } catch (Exception ex) { logger.warn("failed to apply cluster settings", ex); } logger.debug("apply cluster state with version {}", newClusterState.version()); callClusterStateAppliers(clusterChangedEvent); nodeConnectionsService.disconnectFromNodesExcept(newClusterState.nodes()); logger.debug("set locally applied cluster state to version {}", newClusterState.version()); state.set(newClusterState); callClusterStateListeners(clusterChangedEvent); task.listener.clusterStateProcessed(task.source, previousClusterState, newClusterState); } private void callClusterStateAppliers(ClusterChangedEvent clusterChangedEvent) { clusterStateAppliers.forEach(applier -> { try { logger.trace("calling [{}] with change to version [{}]", applier, clusterChangedEvent.state().version()); applier.applyClusterState(clusterChangedEvent); } catch (Exception ex) { logger.warn("failed to notify ClusterStateApplier", ex); } }); } private void callClusterStateListeners(ClusterChangedEvent clusterChangedEvent) { Stream.concat(clusterStateListeners.stream(), timeoutClusterStateListeners.stream()).forEach(listener -> { try { logger.trace("calling [{}] with change to version [{}]", listener, clusterChangedEvent.state().version()); listener.clusterChanged(clusterChangedEvent); } catch (Exception ex) { logger.warn("failed to notify ClusterStateListener", ex); } }); } private static class SafeClusterStateTaskListener implements ClusterStateTaskListener { private final ClusterStateTaskListener listener; private final Logger logger; SafeClusterStateTaskListener(ClusterStateTaskListener listener, Logger logger) { this.listener = listener; this.logger = logger; } @Override public void onFailure(String source, Exception e) { try { listener.onFailure(source, e); } catch (Exception inner) { inner.addSuppressed(e); logger.error( (Supplier<?>) () -> new ParameterizedMessage( "exception thrown by listener notifying of failure from [{}]", source), inner); } } @Override public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) { try { listener.clusterStateProcessed(source, oldState, newState); } catch (Exception e) { logger.error( (Supplier<?>) () -> new ParameterizedMessage( "exception thrown by listener while notifying of cluster state processed from [{}], old cluster state:\n" + "{}\nnew cluster state:\n{}", source, oldState, newState), e); } } } protected void warnAboutSlowTaskIfNeeded(TimeValue executionTime, String source) { if (executionTime.getMillis() > slowTaskLoggingThreshold.getMillis()) { logger.warn("cluster state applier task [{}] took [{}] above the warn threshold of {}", source, executionTime, slowTaskLoggingThreshold); } } class NotifyTimeout implements Runnable { final TimeoutClusterStateListener listener; final TimeValue timeout; volatile ScheduledFuture future; NotifyTimeout(TimeoutClusterStateListener listener, TimeValue timeout) { this.listener = listener; this.timeout = timeout; } public void cancel() { FutureUtils.cancel(future); } @Override public void run() { if (future != null && future.isCancelled()) { return; } if (lifecycle.stoppedOrClosed()) { listener.onClose(); } else { listener.onTimeout(this.timeout); } // note, we rely on the listener to remove itself in case of timeout if needed } } private static class LocalNodeMasterListeners implements ClusterStateListener { private final List<LocalNodeMasterListener> listeners = new CopyOnWriteArrayList<>(); private final ThreadPool threadPool; private volatile boolean master = false; private LocalNodeMasterListeners(ThreadPool threadPool) { this.threadPool = threadPool; } @Override public void clusterChanged(ClusterChangedEvent event) { if (!master && event.localNodeMaster()) { master = true; for (LocalNodeMasterListener listener : listeners) { java.util.concurrent.Executor executor = threadPool.executor(listener.executorName()); executor.execute(new OnMasterRunnable(listener)); } return; } if (master && !event.localNodeMaster()) { master = false; for (LocalNodeMasterListener listener : listeners) { java.util.concurrent.Executor executor = threadPool.executor(listener.executorName()); executor.execute(new OffMasterRunnable(listener)); } } } private void add(LocalNodeMasterListener listener) { listeners.add(listener); } private void remove(LocalNodeMasterListener listener) { listeners.remove(listener); } private void clear() { listeners.clear(); } } private static class OnMasterRunnable implements Runnable { private final LocalNodeMasterListener listener; private OnMasterRunnable(LocalNodeMasterListener listener) { this.listener = listener; } @Override public void run() { listener.onMaster(); } } private static class OffMasterRunnable implements Runnable { private final LocalNodeMasterListener listener; private OffMasterRunnable(LocalNodeMasterListener listener) { this.listener = listener; } @Override public void run() { listener.offMaster(); } } // this one is overridden in tests so we can control time protected long currentTimeInNanos() { return System.nanoTime(); } }