/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.service;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.ElasticSearchIllegalStateException;
import org.elasticsearch.cluster.*;
import org.elasticsearch.cluster.block.ClusterBlock;
import org.elasticsearch.cluster.block.ClusterBlocks;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.operation.OperationRouting;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.discovery.DiscoveryService;
import org.elasticsearch.node.settings.NodeSettingsService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportService;
import java.util.Iterator;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.*;
import static java.util.concurrent.Executors.newSingleThreadExecutor;
import static org.elasticsearch.cluster.ClusterState.Builder;
import static org.elasticsearch.cluster.ClusterState.newClusterStateBuilder;
import static org.elasticsearch.common.util.concurrent.EsExecutors.daemonThreadFactory;
/**
*
*/
public class InternalClusterService extends AbstractLifecycleComponent<ClusterService> implements ClusterService {
private final ThreadPool threadPool;
private final DiscoveryService discoveryService;
private final OperationRouting operationRouting;
private final TransportService transportService;
private final NodeSettingsService nodeSettingsService;
private final TimeValue reconnectInterval;
private volatile ExecutorService updateTasksExecutor;
private final List<ClusterStateListener> priorityClusterStateListeners = new CopyOnWriteArrayList<ClusterStateListener>();
private final List<ClusterStateListener> clusterStateListeners = new CopyOnWriteArrayList<ClusterStateListener>();
private final List<ClusterStateListener> lastClusterStateListeners = new CopyOnWriteArrayList<ClusterStateListener>();
private final LocalNodeMasterListeners localNodeMasterListeners;
private final Queue<NotifyTimeout> onGoingTimeouts = ConcurrentCollections.newQueue();
private volatile ClusterState clusterState = newClusterStateBuilder().build();
private final ClusterBlocks.Builder initialBlocks = ClusterBlocks.builder().addGlobalBlock(Discovery.NO_MASTER_BLOCK);
private volatile ScheduledFuture reconnectToNodes;
@Inject
public InternalClusterService(Settings settings, DiscoveryService discoveryService, OperationRouting operationRouting, TransportService transportService,
NodeSettingsService nodeSettingsService, ThreadPool threadPool) {
super(settings);
this.operationRouting = operationRouting;
this.transportService = transportService;
this.discoveryService = discoveryService;
this.threadPool = threadPool;
this.nodeSettingsService = nodeSettingsService;
this.nodeSettingsService.setClusterService(this);
this.reconnectInterval = componentSettings.getAsTime("reconnect_interval", TimeValue.timeValueSeconds(10));
localNodeMasterListeners = new LocalNodeMasterListeners(threadPool);
}
public NodeSettingsService settingsService() {
return this.nodeSettingsService;
}
public void addInitialStateBlock(ClusterBlock block) throws ElasticSearchIllegalStateException {
if (lifecycle.started()) {
throw new ElasticSearchIllegalStateException("can't set initial block when started");
}
initialBlocks.addGlobalBlock(block);
}
@Override
protected void doStart() throws ElasticSearchException {
add(localNodeMasterListeners);
this.clusterState = newClusterStateBuilder().blocks(initialBlocks).build();
this.updateTasksExecutor = newSingleThreadExecutor(daemonThreadFactory(settings, "clusterService#updateTask"));
this.reconnectToNodes = threadPool.schedule(reconnectInterval, ThreadPool.Names.GENERIC, new ReconnectToNodes());
}
@Override
protected void doStop() throws ElasticSearchException {
this.reconnectToNodes.cancel(true);
for (NotifyTimeout onGoingTimeout : onGoingTimeouts) {
onGoingTimeout.cancel();
onGoingTimeout.listener.onClose();
}
updateTasksExecutor.shutdown();
try {
updateTasksExecutor.awaitTermination(10, TimeUnit.SECONDS);
} catch (InterruptedException e) {
// ignore
}
remove(localNodeMasterListeners);
}
@Override
protected void doClose() throws ElasticSearchException {
}
@Override
public DiscoveryNode localNode() {
return discoveryService.localNode();
}
@Override
public OperationRouting operationRouting() {
return operationRouting;
}
public ClusterState state() {
return this.clusterState;
}
public void addFirst(ClusterStateListener listener) {
priorityClusterStateListeners.add(listener);
}
public void addLast(ClusterStateListener listener) {
lastClusterStateListeners.add(listener);
}
public void add(ClusterStateListener listener) {
clusterStateListeners.add(listener);
}
public void remove(ClusterStateListener listener) {
clusterStateListeners.remove(listener);
priorityClusterStateListeners.remove(listener);
lastClusterStateListeners.remove(listener);
for (Iterator<NotifyTimeout> it = onGoingTimeouts.iterator(); it.hasNext(); ) {
NotifyTimeout timeout = it.next();
if (timeout.listener.equals(listener)) {
timeout.cancel();
it.remove();
}
}
}
@Override
public void add(LocalNodeMasterListener listener) {
localNodeMasterListeners.add(listener);
}
@Override
public void remove(LocalNodeMasterListener listener) {
localNodeMasterListeners.remove(listener);
}
public void add(TimeValue timeout, final TimeoutClusterStateListener listener) {
if (lifecycle.stoppedOrClosed()) {
listener.onClose();
return;
}
NotifyTimeout notifyTimeout = new NotifyTimeout(listener, timeout);
notifyTimeout.future = threadPool.schedule(timeout, ThreadPool.Names.GENERIC, notifyTimeout);
onGoingTimeouts.add(notifyTimeout);
clusterStateListeners.add(listener);
// call the post added notification on the same event thread
updateTasksExecutor.execute(new Runnable() {
@Override
public void run() {
listener.postAdded();
}
});
}
public void submitStateUpdateTask(final String source, final ClusterStateUpdateTask updateTask) {
if (!lifecycle.started()) {
return;
}
updateTasksExecutor.execute(new Runnable() {
@Override
public void run() {
if (!lifecycle.started()) {
logger.debug("processing [{}]: ignoring, cluster_service not started", source);
return;
}
logger.debug("processing [{}]: execute", source);
ClusterState previousClusterState = clusterState;
ClusterState newClusterState;
try {
newClusterState = updateTask.execute(previousClusterState);
} catch (Exception e) {
StringBuilder sb = new StringBuilder("failed to execute cluster state update, state:\nversion [").append(previousClusterState.version()).append("], source [").append(source).append("]\n");
sb.append(previousClusterState.nodes().prettyPrint());
sb.append(previousClusterState.routingTable().prettyPrint());
sb.append(previousClusterState.readOnlyRoutingNodes().prettyPrint());
logger.warn(sb.toString(), e);
return;
}
if (previousClusterState == newClusterState) {
logger.debug("processing [{}]: no change in cluster_state", source);
if (updateTask instanceof ProcessedClusterStateUpdateTask) {
((ProcessedClusterStateUpdateTask) updateTask).clusterStateProcessed(newClusterState);
}
return;
}
try {
if (newClusterState.nodes().localNodeMaster()) {
// only the master controls the version numbers
Builder builder = ClusterState.builder().state(newClusterState).version(newClusterState.version() + 1);
if (previousClusterState.routingTable() != newClusterState.routingTable()) {
builder.routingTable(RoutingTable.builder().routingTable(newClusterState.routingTable()).version(newClusterState.routingTable().version() + 1));
}
if (previousClusterState.metaData() != newClusterState.metaData()) {
builder.metaData(MetaData.builder().metaData(newClusterState.metaData()).version(newClusterState.metaData().version() + 1));
}
newClusterState = builder.build();
} else {
if (previousClusterState.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK) && !newClusterState.blocks().hasGlobalBlock(Discovery.NO_MASTER_BLOCK)) {
// force an update, its a fresh update from the master as we transition from a start of not having a master to having one
// have a fresh instances of routing and metadata to remove the chance that version might be the same
Builder builder = ClusterState.builder().state(newClusterState);
builder.routingTable(RoutingTable.builder().routingTable(newClusterState.routingTable()));
builder.metaData(MetaData.builder().metaData(newClusterState.metaData()));
newClusterState = builder.build();
logger.debug("got first state from fresh master [{}]", newClusterState.nodes().masterNodeId());
} else if (newClusterState.version() < previousClusterState.version()) {
// we got this cluster state from the master, filter out based on versions (don't call listeners)
logger.debug("got old cluster state [" + newClusterState.version() + "<" + previousClusterState.version() + "] from source [" + source + "], ignoring");
return;
}
}
if (logger.isTraceEnabled()) {
StringBuilder sb = new StringBuilder("cluster state updated:\nversion [").append(newClusterState.version()).append("], source [").append(source).append("]\n");
sb.append(newClusterState.nodes().prettyPrint());
sb.append(newClusterState.routingTable().prettyPrint());
sb.append(newClusterState.readOnlyRoutingNodes().prettyPrint());
logger.trace(sb.toString());
} else if (logger.isDebugEnabled()) {
logger.debug("cluster state updated, version [{}], source [{}]", newClusterState.version(), source);
}
ClusterChangedEvent clusterChangedEvent = new ClusterChangedEvent(source, newClusterState, previousClusterState);
// new cluster state, notify all listeners
final DiscoveryNodes.Delta nodesDelta = clusterChangedEvent.nodesDelta();
if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
String summary = nodesDelta.shortSummary();
if (summary.length() > 0) {
logger.info("{}, reason: {}", summary, source);
}
}
// TODO, do this in parallel (and wait)
for (DiscoveryNode node : nodesDelta.addedNodes()) {
if (!nodeRequiresConnection(node)) {
continue;
}
try {
transportService.connectToNode(node);
} catch (Exception e) {
// the fault detection will detect it as failed as well
logger.warn("failed to connect to node [" + node + "]", e);
}
}
// if we are the master, publish the new state to all nodes
// we publish here before we send a notification to all the listeners, since if it fails
// we don't want to notify
if (newClusterState.nodes().localNodeMaster()) {
discoveryService.publish(newClusterState);
}
// update the current cluster state
clusterState = newClusterState;
for (ClusterStateListener listener : priorityClusterStateListeners) {
listener.clusterChanged(clusterChangedEvent);
}
for (ClusterStateListener listener : clusterStateListeners) {
listener.clusterChanged(clusterChangedEvent);
}
for (ClusterStateListener listener : lastClusterStateListeners) {
listener.clusterChanged(clusterChangedEvent);
}
if (!nodesDelta.removedNodes().isEmpty()) {
threadPool.generic().execute(new Runnable() {
@Override
public void run() {
for (DiscoveryNode node : nodesDelta.removedNodes()) {
transportService.disconnectFromNode(node);
}
}
});
}
if (updateTask instanceof ProcessedClusterStateUpdateTask) {
((ProcessedClusterStateUpdateTask) updateTask).clusterStateProcessed(newClusterState);
}
logger.debug("processing [{}]: done applying updated cluster_state", source);
} catch (Exception e) {
StringBuilder sb = new StringBuilder("failed to apply updated cluster state:\nversion [").append(newClusterState.version()).append("], source [").append(source).append("]\n");
sb.append(newClusterState.nodes().prettyPrint());
sb.append(newClusterState.routingTable().prettyPrint());
sb.append(newClusterState.readOnlyRoutingNodes().prettyPrint());
logger.warn(sb.toString(), e);
}
}
});
}
class NotifyTimeout implements Runnable {
final TimeoutClusterStateListener listener;
final TimeValue timeout;
ScheduledFuture future;
NotifyTimeout(TimeoutClusterStateListener listener, TimeValue timeout) {
this.listener = listener;
this.timeout = timeout;
}
public void cancel() {
future.cancel(false);
}
@Override
public void run() {
if (future.isCancelled()) {
return;
}
if (lifecycle.stoppedOrClosed()) {
listener.onClose();
} else {
listener.onTimeout(this.timeout);
}
// note, we rely on the listener to remove itself in case of timeout if needed
}
}
private class ReconnectToNodes implements Runnable {
private ConcurrentMap<DiscoveryNode, Integer> failureCount = ConcurrentCollections.newConcurrentMap();
@Override
public void run() {
// master node will check against all nodes if its alive with certain discoveries implementations,
// but we can't rely on that, so we check on it as well
for (DiscoveryNode node : clusterState.nodes()) {
if (lifecycle.stoppedOrClosed()) {
return;
}
if (!nodeRequiresConnection(node)) {
continue;
}
if (clusterState.nodes().nodeExists(node.id())) { // we double check existence of node since connectToNode might take time...
if (!transportService.nodeConnected(node)) {
try {
transportService.connectToNode(node);
} catch (Exception e) {
if (lifecycle.stoppedOrClosed()) {
return;
}
if (clusterState.nodes().nodeExists(node.id())) { // double check here as well, maybe its gone?
Integer nodeFailureCount = failureCount.get(node);
if (nodeFailureCount == null) {
nodeFailureCount = 1;
} else {
nodeFailureCount = nodeFailureCount + 1;
}
// log every 6th failure
if ((nodeFailureCount % 6) == 0) {
// reset the failure count...
nodeFailureCount = 0;
logger.warn("failed to reconnect to node {}", e, node);
}
failureCount.put(node, nodeFailureCount);
}
}
}
}
}
// go over and remove failed nodes that have been removed
DiscoveryNodes nodes = clusterState.nodes();
for (Iterator<DiscoveryNode> failedNodesIt = failureCount.keySet().iterator(); failedNodesIt.hasNext(); ) {
DiscoveryNode failedNode = failedNodesIt.next();
if (!nodes.nodeExists(failedNode.id())) {
failedNodesIt.remove();
}
}
if (lifecycle.started()) {
reconnectToNodes = threadPool.schedule(reconnectInterval, ThreadPool.Names.GENERIC, this);
}
}
}
private boolean nodeRequiresConnection(DiscoveryNode node) {
return localNode().shouldConnectTo(node);
}
private static class LocalNodeMasterListeners implements ClusterStateListener {
private final List<LocalNodeMasterListener> listeners = new CopyOnWriteArrayList<LocalNodeMasterListener>();
private final ThreadPool threadPool;
private volatile boolean master = false;
private LocalNodeMasterListeners(ThreadPool threadPool) {
this.threadPool = threadPool;
}
@Override
public void clusterChanged(ClusterChangedEvent event) {
if (!master && event.localNodeMaster()) {
master = true;
for (LocalNodeMasterListener listener : listeners) {
Executor executor = threadPool.executor(listener.executorName());
executor.execute(new OnMasterRunnable(listener));
}
return;
}
if (master && !event.localNodeMaster()) {
master = false;
for (LocalNodeMasterListener listener : listeners) {
Executor executor = threadPool.executor(listener.executorName());
executor.execute(new OffMasterRunnable(listener));
}
}
}
private void add(LocalNodeMasterListener listener) {
listeners.add(listener);
}
private void remove(LocalNodeMasterListener listener) {
listeners.remove(listener);
}
private void clear() {
listeners.clear();
}
}
private static class OnMasterRunnable implements Runnable {
private final LocalNodeMasterListener listener;
private OnMasterRunnable(LocalNodeMasterListener listener) {
this.listener = listener;
}
@Override
public void run() {
listener.onMaster();
}
}
private static class OffMasterRunnable implements Runnable {
private final LocalNodeMasterListener listener;
private OffMasterRunnable(LocalNodeMasterListener listener) {
this.listener = listener;
}
@Override
public void run() {
listener.offMaster();
}
}
}