/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.cluster;

import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.LatchedActionListener;
import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsRequest;
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse;
import org.elasticsearch.action.admin.cluster.node.stats.TransportNodesStatsAction;
import org.elasticsearch.action.admin.indices.stats.IndicesStatsRequest;
import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
import org.elasticsearch.action.admin.indices.stats.ShardStats;
import org.elasticsearch.action.admin.indices.stats.TransportIndicesStatsAction;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.monitor.fs.FsInfo;
import org.elasticsearch.node.settings.NodeSettingsService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ReceiveTimeoutTransportException;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

/**
 * InternalClusterInfoService provides the ClusterInfoService interface,
 * refreshing the cluster information on a timer. The refresh interval can be
 * changed dynamically through the <code>cluster.info.update.interval</code>
 * setting (defaulting to 30 seconds). The InternalClusterInfoService only runs
 * on the master node. It listens for changes in the number of data nodes and
 * immediately submits a ClusterInfoUpdateJob if a data node has been added.
 *
 * Every time the timer runs, it gathers information about the disk usage and
 * shard sizes across the cluster.
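 *
 * Refreshes are skipped entirely while the
 * <code>cluster.routing.allocation.disk.threshold_enabled</code> setting is
 * disabled, since this service is only enabled together with the
 * {@link DiskThresholdDecider}.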
 */
public class InternalClusterInfoService extends AbstractComponent
        implements ClusterInfoService, LocalNodeMasterListener, ClusterStateListener {

    public static final String INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL = "cluster.info.update.interval";
    public static final String INTERNAL_CLUSTER_INFO_TIMEOUT = "cluster.info.update.timeout";

    private volatile TimeValue updateFrequency;

    private volatile Map<String, DiskUsage> leastAvailableSpaceUsages;
    private volatile Map<String, DiskUsage> mostAvailableSpaceUsages;
    private volatile Map<ShardRouting, String> shardRoutingToDataPath;
    private volatile Map<String, Long> shardSizes;
    private volatile boolean isMaster = false;
    private volatile boolean enabled;
    private volatile TimeValue fetchTimeout;
    private final TransportNodesStatsAction transportNodesStatsAction;
    private final TransportIndicesStatsAction transportIndicesStatsAction;
    private final ClusterService clusterService;
    private final ThreadPool threadPool;
    private final List<Listener> listeners = new CopyOnWriteArrayList<>();

    @Inject
    public InternalClusterInfoService(Settings settings, NodeSettingsService nodeSettingsService,
                                      TransportNodesStatsAction transportNodesStatsAction,
                                      TransportIndicesStatsAction transportIndicesStatsAction,
                                      ClusterService clusterService, ThreadPool threadPool) {
        super(settings);
        this.leastAvailableSpaceUsages = Collections.emptyMap();
        this.mostAvailableSpaceUsages = Collections.emptyMap();
        this.shardRoutingToDataPath = Collections.emptyMap();
        this.shardSizes = Collections.emptyMap();
        this.transportNodesStatsAction = transportNodesStatsAction;
        this.transportIndicesStatsAction = transportIndicesStatsAction;
        this.clusterService = clusterService;
        this.threadPool = threadPool;
        this.updateFrequency = settings.getAsTime(INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, TimeValue.timeValueSeconds(30));
        this.fetchTimeout = settings.getAsTime(INTERNAL_CLUSTER_INFO_TIMEOUT, TimeValue.timeValueSeconds(15));
        this.enabled = settings.getAsBoolean(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, true);
        nodeSettingsService.addListener(new ApplySettings());

        // Add InternalClusterInfoService to listen for Master changes
        this.clusterService.add((LocalNodeMasterListener) this);
        // Add to listen for state changes (when nodes are added)
        this.clusterService.add((ClusterStateListener) this);
    }

    class ApplySettings implements NodeSettingsService.Listener {
        @Override
        public void onRefreshSettings(Settings settings) {
            TimeValue newUpdateFrequency = settings.getAsTime(INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, null);
            // ClusterInfoService is only enabled if the DiskThresholdDecider is enabled
            Boolean newEnabled = settings.getAsBoolean(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, null);

            if (newUpdateFrequency != null) {
                if (newUpdateFrequency.getMillis() < TimeValue.timeValueSeconds(10).getMillis()) {
                    logger.warn("[{}] set too low [{}] (< 10s)", INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, newUpdateFrequency);
                    throw new IllegalStateException("Unable to set " + INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL + " less than 10 seconds");
                } else {
                    logger.info("updating [{}] from [{}] to [{}]", INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, updateFrequency, newUpdateFrequency);
                    InternalClusterInfoService.this.updateFrequency = newUpdateFrequency;
                }
            }

            TimeValue newFetchTimeout = settings.getAsTime(INTERNAL_CLUSTER_INFO_TIMEOUT, null);
            if (newFetchTimeout != null) {
                logger.info("updating fetch timeout [{}] from [{}] to [{}]", INTERNAL_CLUSTER_INFO_TIMEOUT, fetchTimeout, newFetchTimeout);
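                // fetchTimeout bounds how long refresh() waits for the node stats and indices stats responses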
                InternalClusterInfoService.this.fetchTimeout = newFetchTimeout;
            }

            // We don't log about enabling it here, because the DiskThresholdDecider will already be logging about enable/disable
            if (newEnabled != null) {
                InternalClusterInfoService.this.enabled = newEnabled;
            }
        }
    }

    @Override
    public void onMaster() {
        this.isMaster = true;
        if (logger.isTraceEnabled()) {
            logger.trace("I have been elected master, scheduling a ClusterInfoUpdateJob");
        }
        try {
            // Submit a job that will start after the update interval has elapsed, and reschedule itself after running
            threadPool.schedule(updateFrequency, executorName(), new SubmitReschedulingClusterInfoUpdatedJob());
            if (clusterService.state().getNodes().getDataNodes().size() > 1) {
                // Submit an info update job to be run immediately
                threadPool.executor(executorName()).execute(new Runnable() {
                    @Override
                    public void run() {
                        maybeRefresh();
                    }
                });
            }
        } catch (EsRejectedExecutionException ex) {
            if (logger.isDebugEnabled()) {
                logger.debug("Couldn't schedule cluster info update task - node might be shutting down", ex);
            }
        }
    }

    @Override
    public void offMaster() {
        this.isMaster = false;
    }

    @Override
    public String executorName() {
        return ThreadPool.Names.MANAGEMENT;
    }

    @Override
    public void clusterChanged(ClusterChangedEvent event) {
        if (!this.enabled) {
            return;
        }

        // Check whether it was a data node that was added
        boolean dataNodeAdded = false;
        for (DiscoveryNode addedNode : event.nodesDelta().addedNodes()) {
            if (addedNode.dataNode()) {
                dataNodeAdded = true;
                break;
            }
        }

        if (this.isMaster && dataNodeAdded && clusterService.state().getNodes().getDataNodes().size() > 1) {
            if (logger.isDebugEnabled()) {
                logger.debug("data node was added, retrieving new cluster info");
            }
            threadPool.executor(executorName()).execute(new Runnable() {
                @Override
                public void run() {
                    maybeRefresh();
                }
            });
        }

        if (this.isMaster && event.nodesRemoved()) {
            for (DiscoveryNode removedNode : event.nodesDelta().removedNodes()) {
                if (removedNode.dataNode()) {
                    if (logger.isTraceEnabled()) {
                        logger.trace("Removing node from cluster info: {}", removedNode.getId());
                    }
                    if (leastAvailableSpaceUsages.containsKey(removedNode.getId())) {
                        Map<String, DiskUsage> newMaxUsages = new HashMap<>(leastAvailableSpaceUsages);
                        newMaxUsages.remove(removedNode.getId());
                        leastAvailableSpaceUsages = Collections.unmodifiableMap(newMaxUsages);
                    }
                    if (mostAvailableSpaceUsages.containsKey(removedNode.getId())) {
                        Map<String, DiskUsage> newMinUsages = new HashMap<>(mostAvailableSpaceUsages);
                        newMinUsages.remove(removedNode.getId());
                        mostAvailableSpaceUsages = Collections.unmodifiableMap(newMinUsages);
                    }
                }
            }
        }
    }

    @Override
    public ClusterInfo getClusterInfo() {
        return new ClusterInfo(leastAvailableSpaceUsages, mostAvailableSpaceUsages, shardSizes, shardRoutingToDataPath);
    }

    @Override
    public void addListener(Listener listener) {
        this.listeners.add(listener);
    }

    /**
     * Class used to submit {@link #maybeRefresh()} on the
     * {@link InternalClusterInfoService} threadpool. These jobs will
     * reschedule themselves by placing a new instance of this class onto the
     * scheduled threadpool.
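     *
     * Rescheduling only happens while {@code isMaster} is true, so the chain
     * stops once {@link InternalClusterInfoService#offMaster()} has been called.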
     */
    public class SubmitReschedulingClusterInfoUpdatedJob implements Runnable {
        @Override
        public void run() {
            if (logger.isTraceEnabled()) {
                logger.trace("Submitting new rescheduling cluster info update job");
            }
            try {
                threadPool.executor(executorName()).execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            maybeRefresh();
                        } finally { // schedule again after we refreshed
                            if (isMaster) {
                                if (logger.isTraceEnabled()) {
                                    logger.trace("Scheduling next run for updating cluster info in: {}", updateFrequency.toString());
                                }
                                try {
                                    threadPool.schedule(updateFrequency, executorName(), SubmitReschedulingClusterInfoUpdatedJob.this);
                                } catch (EsRejectedExecutionException ex) {
                                    logger.debug("Reschedule cluster info service was rejected", ex);
                                }
                            }
                        }
                    }
                });
            } catch (EsRejectedExecutionException ex) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Couldn't re-schedule cluster info update task - node might be shutting down", ex);
                }
            }
        }
    }

    /**
     * Retrieve the latest nodes stats, calling the listener when complete
     * @return a latch that can be used to wait for the nodes stats to complete if desired
     */
    protected CountDownLatch updateNodeStats(final ActionListener<NodesStatsResponse> listener) {
        final CountDownLatch latch = new CountDownLatch(1);
        final NodesStatsRequest nodesStatsRequest = new NodesStatsRequest("data:true");
        nodesStatsRequest.clear();
        nodesStatsRequest.fs(true);
        nodesStatsRequest.timeout(fetchTimeout);

        transportNodesStatsAction.execute(nodesStatsRequest, new LatchedActionListener<>(listener, latch));
        return latch;
    }

    /**
     * Retrieve the latest indices stats, calling the listener when complete
     * @return a latch that can be used to wait for the indices stats to complete if desired
     */
    protected CountDownLatch updateIndicesStats(final ActionListener<IndicesStatsResponse> listener) {
        final CountDownLatch latch = new CountDownLatch(1);
        final IndicesStatsRequest indicesStatsRequest = new IndicesStatsRequest();
        indicesStatsRequest.clear();
        indicesStatsRequest.store(true);
        transportIndicesStatsAction.execute(indicesStatsRequest, new LatchedActionListener<>(listener, latch));

        return latch;
    }

    private final void maybeRefresh() {
        // Short-circuit if not enabled
        if (enabled) {
            refresh();
        } else {
            if (logger.isTraceEnabled()) {
                logger.trace("Skipping ClusterInfoUpdatedJob since it is disabled");
            }
        }
    }

    /**
     * Refreshes the ClusterInfo in a blocking fashion
     */
    public final ClusterInfo refresh() {
        if (logger.isTraceEnabled()) {
            logger.trace("Performing ClusterInfoUpdateJob");
        }
        final CountDownLatch nodeLatch = updateNodeStats(new ActionListener<NodesStatsResponse>() {
            @Override
            public void onResponse(NodesStatsResponse nodeStatses) {
                Map<String, DiskUsage> newLeastAvailableUsages = new HashMap<>();
                Map<String, DiskUsage> newMostAvailableUsages = new HashMap<>();
                fillDiskUsagePerNode(logger, nodeStatses.getNodes(), newLeastAvailableUsages, newMostAvailableUsages);
                leastAvailableSpaceUsages = Collections.unmodifiableMap(newLeastAvailableUsages);
                mostAvailableSpaceUsages = Collections.unmodifiableMap(newMostAvailableUsages);
            }

            @Override
            public void onFailure(Throwable e) {
                if (e instanceof ReceiveTimeoutTransportException) {
                    logger.error("NodeStatsAction timed out for ClusterInfoUpdateJob (reason [{}])", e.getMessage());
                } else {
                    if (e instanceof ClusterBlockException) {
                        if (logger.isTraceEnabled()) {
                            logger.trace("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
                        }
                    } else {
                        logger.warn("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
                    }
                    // we empty the usages list, to be safe - we don't know what's going on.
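                    // the next successful refresh will repopulate these maps from fresh node stats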
                    leastAvailableSpaceUsages = Collections.emptyMap();
                    mostAvailableSpaceUsages = Collections.emptyMap();
                }
            }
        });

        final CountDownLatch indicesLatch = updateIndicesStats(new ActionListener<IndicesStatsResponse>() {
            @Override
            public void onResponse(IndicesStatsResponse indicesStatsResponse) {
                ShardStats[] stats = indicesStatsResponse.getShards();
                final HashMap<String, Long> newShardSizes = new HashMap<>();
                final HashMap<ShardRouting, String> newShardRoutingToDataPath = new HashMap<>();
                buildShardLevelInfo(logger, stats, newShardSizes, newShardRoutingToDataPath, clusterService.state());
                shardSizes = Collections.unmodifiableMap(newShardSizes);
                shardRoutingToDataPath = Collections.unmodifiableMap(newShardRoutingToDataPath);
            }

            @Override
            public void onFailure(Throwable e) {
                if (e instanceof ReceiveTimeoutTransportException) {
                    logger.error("IndicesStatsAction timed out for ClusterInfoUpdateJob (reason [{}])", e.getMessage());
                } else {
                    if (e instanceof ClusterBlockException) {
                        if (logger.isTraceEnabled()) {
                            logger.trace("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
                        }
                    } else {
                        logger.warn("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
                    }
                    // we empty the shard size maps, to be safe - we don't know what's going on.
                    shardSizes = Collections.emptyMap();
                    shardRoutingToDataPath = Collections.emptyMap();
                }
            }
        });

        try {
            nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore interrupt status
            logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
        }

        try {
            indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore interrupt status
            logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
        }

        ClusterInfo clusterInfo = getClusterInfo();
        for (Listener l : listeners) {
            try {
                l.onNewInfo(clusterInfo);
            } catch (Exception e) {
                logger.info("Failed executing ClusterInfoService listener", e);
            }
        }
        return clusterInfo;
    }

    static void buildShardLevelInfo(ESLogger logger, ShardStats[] stats, HashMap<String, Long> newShardSizes,
                                    HashMap<ShardRouting, String> newShardRoutingToDataPath, ClusterState state) {
        MetaData meta = state.getMetaData();
        for (ShardStats s : stats) {
            IndexMetaData indexMeta = meta.index(s.getShardRouting().index());
            Settings indexSettings = indexMeta == null ? null : indexMeta.getSettings();
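            // record the data path this shard copy lives on and its on-disk store size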
            newShardRoutingToDataPath.put(s.getShardRouting(), s.getDataPath());
            long size = s.getStats().getStore().sizeInBytes();
            String sid = ClusterInfo.shardIdentifierFromRouting(s.getShardRouting());
            if (logger.isTraceEnabled()) {
                logger.trace("shard: {} size: {}", sid, size);
            }
            if (indexSettings != null && IndexMetaData.isIndexUsingShadowReplicas(indexSettings)) {
                // Shards on a shared filesystem should be considered of size 0
                if (logger.isTraceEnabled()) {
                    logger.trace("shard: {} is using shadow replicas and will be treated as size 0", sid);
                }
                size = 0;
            }
            newShardSizes.put(sid, size);
        }
    }

    static void fillDiskUsagePerNode(ESLogger logger, NodeStats[] nodeStatsArray,
                                     Map<String, DiskUsage> newLeastAvailableUsages, Map<String, DiskUsage> newMostAvailableUsages) {
        for (NodeStats nodeStats : nodeStatsArray) {
            if (nodeStats.getFs() == null) {
                logger.warn("Unable to retrieve node FS stats for {}", nodeStats.getNode().name());
            } else {
                FsInfo.Path leastAvailablePath = null;
                FsInfo.Path mostAvailablePath = null;
                for (FsInfo.Path info : nodeStats.getFs()) {
                    if (leastAvailablePath == null) {
                        assert mostAvailablePath == null;
                        mostAvailablePath = leastAvailablePath = info;
                    } else if (leastAvailablePath.getAvailable().bytes() > info.getAvailable().bytes()) {
                        leastAvailablePath = info;
                    } else if (mostAvailablePath.getAvailable().bytes() < info.getAvailable().bytes()) {
                        mostAvailablePath = info;
                    }
                }
                String nodeId = nodeStats.getNode().id();
                String nodeName = nodeStats.getNode().getName();
                if (logger.isTraceEnabled()) {
                    logger.trace("node: [{}], most available: total disk: {}, available disk: {} / least available: total disk: {}, available disk: {}",
                            nodeId, mostAvailablePath.getTotal(), mostAvailablePath.getAvailable(),
                            leastAvailablePath.getTotal(), leastAvailablePath.getAvailable());
                }
                if (leastAvailablePath.getTotal().bytes() < 0) {
                    if (logger.isTraceEnabled()) {
                        logger.trace("node: [{}] least available path has less than 0 total bytes of disk [{}], skipping",
                                nodeId, leastAvailablePath.getTotal().bytes());
                    }
                } else {
                    newLeastAvailableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, leastAvailablePath.getPath(),
                            leastAvailablePath.getTotal().bytes(), leastAvailablePath.getAvailable().bytes()));
                }
                if (mostAvailablePath.getTotal().bytes() < 0) {
                    if (logger.isTraceEnabled()) {
                        logger.trace("node: [{}] most available path has less than 0 total bytes of disk [{}], skipping",
                                nodeId, mostAvailablePath.getTotal().bytes());
                    }
                } else {
                    newMostAvailableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, mostAvailablePath.getPath(),
                            mostAvailablePath.getTotal().bytes(), mostAvailablePath.getAvailable().bytes()));
                }
            }
        }
    }
}