/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.routing.allocation.decider;
import com.google.common.collect.Sets;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterInfo;
import org.elasticsearch.cluster.ClusterInfoService;
import org.elasticsearch.cluster.DiskUsage;
import org.elasticsearch.cluster.EmptyClusterInfoService;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.RatioValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.node.settings.NodeSettingsService;
import java.util.Map;
import java.util.Set;
/**
* The {@link DiskThresholdDecider} checks that the node a shard is potentially
* being allocated to has enough disk space.
*
* It has three configurable settings, all of which can be changed dynamically:
*
* <code>cluster.routing.allocation.disk.watermark.low</code> is the low disk
* watermark. New shards will not allocated to a node with usage higher than this,
* although this watermark may be passed by allocating a shard. It defaults to
* 0.85 (85.0%).
*
* <code>cluster.routing.allocation.disk.watermark.high</code> is the high disk
* watermark. If a node has usage higher than this, shards are not allowed to
* remain on the node. In addition, if allocating a shard to a node causes the
* node to pass this watermark, it will not be allowed. It defaults to
* 0.90 (90.0%).
*
* Both watermark settings are expressed in terms of used disk percentage, or
* exact byte values for free space (like "500mb")
*
* <code>cluster.routing.allocation.disk.threshold_enabled</code> is used to
* enable or disable this decider. It defaults to false (disabled).
*/
public class DiskThresholdDecider extends AllocationDecider {
public static final String NAME = "disk_threshold";
private volatile Double freeDiskThresholdLow;
private volatile Double freeDiskThresholdHigh;
private volatile ByteSizeValue freeBytesThresholdLow;
private volatile ByteSizeValue freeBytesThresholdHigh;
private volatile boolean includeRelocations;
private volatile boolean enabled;
private volatile TimeValue rerouteInterval;
public static final String CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED = "cluster.routing.allocation.disk.threshold_enabled";
public static final String CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK = "cluster.routing.allocation.disk.watermark.low";
public static final String CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK = "cluster.routing.allocation.disk.watermark.high";
public static final String CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS = "cluster.routing.allocation.disk.include_relocations";
public static final String CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL = "cluster.routing.allocation.disk.reroute_interval";
class ApplySettings implements NodeSettingsService.Listener {
@Override
public void onRefreshSettings(Settings settings) {
String newLowWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK, null);
String newHighWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, null);
Boolean newRelocationsSetting = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS, null);
Boolean newEnableSetting = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, null);
TimeValue newRerouteInterval = settings.getAsTime(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL, null);
if (newEnableSetting != null) {
logger.info("updating [{}] from [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED,
DiskThresholdDecider.this.enabled, newEnableSetting);
DiskThresholdDecider.this.enabled = newEnableSetting;
}
if (newRelocationsSetting != null) {
logger.info("updating [{}] from [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS,
DiskThresholdDecider.this.includeRelocations, newRelocationsSetting);
DiskThresholdDecider.this.includeRelocations = newRelocationsSetting;
}
if (newLowWatermark != null) {
if (!validWatermarkSetting(newLowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse low watermark [{}]", newLowWatermark);
}
logger.info("updating [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK, newLowWatermark);
DiskThresholdDecider.this.freeDiskThresholdLow = 100.0 - thresholdPercentageFromWatermark(newLowWatermark);
DiskThresholdDecider.this.freeBytesThresholdLow = thresholdBytesFromWatermark(newLowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK);
}
if (newHighWatermark != null) {
if (!validWatermarkSetting(newHighWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse high watermark [{}]", newHighWatermark);
}
logger.info("updating [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, newHighWatermark);
DiskThresholdDecider.this.freeDiskThresholdHigh = 100.0 - thresholdPercentageFromWatermark(newHighWatermark);
DiskThresholdDecider.this.freeBytesThresholdHigh = thresholdBytesFromWatermark(newHighWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK);
}
if (newRerouteInterval != null) {
logger.info("updating [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL, newRerouteInterval);
DiskThresholdDecider.this.rerouteInterval = newRerouteInterval;
}
}
}
/**
* Listens for a node to go over the high watermark and kicks off an empty
* reroute if it does. Also responsible for logging about nodes that have
* passed the disk watermarks
*/
class DiskListener implements ClusterInfoService.Listener {
private final Client client;
private final Set<String> nodeHasPassedWatermark = Sets.newConcurrentHashSet();
private long lastRunNS;
DiskListener(Client client) {
this.client = client;
}
/**
* Warn about the given disk usage if the low or high watermark has been passed
*/
private void warnAboutDiskIfNeeded(DiskUsage usage) {
// Check absolute disk values
if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdHigh.bytes()) {
logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
DiskThresholdDecider.this.freeBytesThresholdHigh, usage);
} else if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdLow.bytes()) {
logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
DiskThresholdDecider.this.freeBytesThresholdLow, usage);
}
// Check percentage disk values
if (usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdHigh) {
logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
Strings.format1Decimals(100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh, "%"), usage);
} else if (usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdLow) {
logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
Strings.format1Decimals(100.0 - DiskThresholdDecider.this.freeDiskThresholdLow, "%"), usage);
}
}
@Override
public void onNewInfo(ClusterInfo info) {
Map<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
if (usages != null) {
boolean reroute = false;
String explanation = "";
// Garbage collect nodes that have been removed from the cluster
// from the map that tracks watermark crossing
Set<String> nodes = usages.keySet();
for (String node : nodeHasPassedWatermark) {
if (nodes.contains(node) == false) {
nodeHasPassedWatermark.remove(node);
}
}
for (Map.Entry<String, DiskUsage> entry : usages.entrySet()) {
String node = entry.getKey();
DiskUsage usage = entry.getValue();
warnAboutDiskIfNeeded(usage);
if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdHigh.bytes() ||
usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdHigh) {
if ((System.nanoTime() - lastRunNS) > DiskThresholdDecider.this.rerouteInterval.nanos()) {
lastRunNS = System.nanoTime();
reroute = true;
explanation = "high disk watermark exceeded on one or more nodes";
} else {
logger.debug("high disk watermark exceeded on {} but an automatic reroute has occurred in the last [{}], skipping reroute",
node, DiskThresholdDecider.this.rerouteInterval);
}
nodeHasPassedWatermark.add(node);
} else if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdLow.bytes() ||
usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdLow) {
nodeHasPassedWatermark.add(node);
} else {
if (nodeHasPassedWatermark.contains(node)) {
// The node has previously been over the high or
// low watermark, but is no longer, so we should
// reroute so any unassigned shards can be allocated
// if they are able to be
if ((System.nanoTime() - lastRunNS) > DiskThresholdDecider.this.rerouteInterval.nanos()) {
lastRunNS = System.nanoTime();
reroute = true;
explanation = "one or more nodes has gone under the high or low watermark";
nodeHasPassedWatermark.remove(node);
} else {
logger.debug("{} has gone below a disk threshold, but an automatic reroute has occurred in the last [{}], skipping reroute",
node, DiskThresholdDecider.this.rerouteInterval);
}
}
}
}
if (reroute) {
logger.info("rerouting shards: [{}]", explanation);
// Execute an empty reroute, but don't block on the response
client.admin().cluster().prepareReroute().execute();
}
}
}
}
public DiskThresholdDecider(Settings settings) {
// It's okay the Client is null here, because the empty cluster info
// service will never actually call the listener where the client is
// needed. Also this constructor is only used for tests
this(settings, new NodeSettingsService(settings), EmptyClusterInfoService.INSTANCE, null);
}
@Inject
public DiskThresholdDecider(Settings settings, NodeSettingsService nodeSettingsService, ClusterInfoService infoService, Client client) {
super(settings);
String lowWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK, "85%");
String highWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, "90%");
if (!validWatermarkSetting(lowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse low watermark [{}]", lowWatermark);
}
if (!validWatermarkSetting(highWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse high watermark [{}]", highWatermark);
}
// Watermark is expressed in terms of used data, but we need "free" data watermark
this.freeDiskThresholdLow = 100.0 - thresholdPercentageFromWatermark(lowWatermark);
this.freeDiskThresholdHigh = 100.0 - thresholdPercentageFromWatermark(highWatermark);
this.freeBytesThresholdLow = thresholdBytesFromWatermark(lowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK);
this.freeBytesThresholdHigh = thresholdBytesFromWatermark(highWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK);
this.includeRelocations = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS, true);
this.rerouteInterval = settings.getAsTime(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL, TimeValue.timeValueSeconds(60));
this.enabled = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, true);
nodeSettingsService.addListener(new ApplySettings());
infoService.addListener(new DiskListener(client));
}
// For Testing
ApplySettings newApplySettings() {
return new ApplySettings();
}
// For Testing
public Double getFreeDiskThresholdLow() {
return freeDiskThresholdLow;
}
// For Testing
public Double getFreeDiskThresholdHigh() {
return freeDiskThresholdHigh;
}
// For Testing
public Double getUsedDiskThresholdLow() {
return 100.0 - freeDiskThresholdLow;
}
// For Testing
public Double getUsedDiskThresholdHigh() {
return 100.0 - freeDiskThresholdHigh;
}
// For Testing
public ByteSizeValue getFreeBytesThresholdLow() {
return freeBytesThresholdLow;
}
// For Testing
public ByteSizeValue getFreeBytesThresholdHigh() {
return freeBytesThresholdHigh;
}
// For Testing
public boolean isIncludeRelocations() {
return includeRelocations;
}
// For Testing
public boolean isEnabled() {
return enabled;
}
// For Testing
public TimeValue getRerouteInterval() {
return rerouteInterval;
}
/**
* Returns the size of all shards that are currently being relocated to
* the node, but may not be finished transfering yet.
*
* If subtractShardsMovingAway is true then the size of shards moving away is subtracted from the total size of all shards
*/
static long sizeOfRelocatingShards(RoutingNode node, ClusterInfo clusterInfo,
boolean subtractShardsMovingAway, String dataPath) {
long totalSize = 0;
for (ShardRouting routing : node.shardsWithState(ShardRoutingState.RELOCATING, ShardRoutingState.INITIALIZING)) {
String actualPath = clusterInfo.getDataPath(routing);
if (dataPath.equals(actualPath)) {
if (routing.initializing() && routing.relocatingNodeId() != null) {
totalSize += getShardSize(routing, clusterInfo);
} else if (subtractShardsMovingAway && routing.relocating()) {
totalSize -= getShardSize(routing, clusterInfo);
}
}
}
return totalSize;
}
static long getShardSize(ShardRouting routing, ClusterInfo clusterInfo) {
Long shardSize = clusterInfo.getShardSize(routing);
return shardSize == null ? 0 : shardSize;
}
@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
ClusterInfo clusterInfo = allocation.clusterInfo();
Map<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages();
final Decision decision = earlyTerminate(allocation, usages);
if (decision != null) {
return decision;
}
final double usedDiskThresholdLow = 100.0 - DiskThresholdDecider.this.freeDiskThresholdLow;
final double usedDiskThresholdHigh = 100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh;
// subtractLeavingShards is passed as false here, because they still use disk space, and therefore should we should be extra careful
// and take the size into account
DiskUsage usage = getDiskUsage(node, allocation, usages, false);
// First, check that the node currently over the low watermark
double freeDiskPercentage = usage.getFreeDiskAsPercentage();
// Cache the used disk percentage for displaying disk percentages consistent with documentation
double usedDiskPercentage = usage.getUsedDiskAsPercentage();
long freeBytes = usage.getFreeBytes();
if (logger.isTraceEnabled()) {
logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage);
}
// a flag for whether the primary shard has been previously allocated
boolean primaryHasBeenAllocated = shardRouting.primary() && shardRouting.allocatedPostIndexCreate();
// checks for exact byte comparisons
if (freeBytes < freeBytesThresholdLow.bytes()) {
// If the shard is a replica or has a primary that has already been allocated before, check the low threshold
if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
freeBytesThresholdLow, freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "less than required [%s] free on node, free: [%s]",
freeBytesThresholdLow, new ByteSizeValue(freeBytes));
} else if (freeBytes > freeBytesThresholdHigh.bytes()) {
// Allow the shard to be allocated because it is primary that
// has never been allocated if it's under the high watermark
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"but allowing allocation because primary has never been allocated",
freeBytesThresholdLow, freeBytes, node.nodeId());
}
return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
} else {
// Even though the primary has never been allocated, the node is
// above the high watermark, so don't allow allocating the shard
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"preventing allocation even though primary has never been allocated",
freeBytesThresholdHigh, freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "less than required [%s] free on node, free: [%s]",
freeBytesThresholdHigh, new ByteSizeValue(freeBytes));
}
}
// checks for percentage comparisons
if (freeDiskPercentage < freeDiskThresholdLow) {
// If the shard is a replica or has a primary that has already been allocated before, check the low threshold
if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
if (logger.isDebugEnabled()) {
logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation",
Strings.format1Decimals(usedDiskThresholdLow, "%"),
Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]",
usedDiskThresholdLow, freeDiskPercentage);
} else if (freeDiskPercentage > freeDiskThresholdHigh) {
// Allow the shard to be allocated because it is primary that
// has never been allocated if it's under the high watermark
if (logger.isDebugEnabled()) {
logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], " +
"but allowing allocation because primary has never been allocated",
Strings.format1Decimals(usedDiskThresholdLow, "%"),
Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
} else {
// Even though the primary has never been allocated, the node is
// above the high watermark, so don't allow allocating the shard
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"preventing allocation even though primary has never been allocated",
Strings.format1Decimals(freeDiskThresholdHigh, "%"),
Strings.format1Decimals(freeDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]",
usedDiskThresholdHigh, freeDiskPercentage);
}
}
// Secondly, check that allocating the shard to this node doesn't put it above the high watermark
final long shardSize = getShardSize(shardRouting, allocation.clusterInfo());
double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize);
long freeBytesAfterShard = freeBytes - shardSize;
if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) {
logger.warn("after allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation",
node.nodeId(), freeBytesThresholdHigh, freeBytesAfterShard);
return allocation.decision(Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]",
freeBytesThresholdLow, new ByteSizeValue(freeBytesAfterShard));
}
if (freeSpaceAfterShard < freeDiskThresholdHigh) {
logger.warn("after allocating, node [{}] would have more than the allowed {} free disk threshold ({} free), preventing allocation",
node.nodeId(), Strings.format1Decimals(freeDiskThresholdHigh, "%"), Strings.format1Decimals(freeSpaceAfterShard, "%"));
return allocation.decision(Decision.NO, NAME, "after allocation more than allowed [%s%%] used disk on node, free: [%s%%]",
usedDiskThresholdLow, freeSpaceAfterShard);
}
return allocation.decision(Decision.YES, NAME, "enough disk for shard on node, free: [%s]", new ByteSizeValue(freeBytes));
}
@Override
public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
if (shardRouting.currentNodeId().equals(node.nodeId()) == false) {
throw new IllegalArgumentException("Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]");
}
final ClusterInfo clusterInfo = allocation.clusterInfo();
final Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages();
final Decision decision = earlyTerminate(allocation, usages);
if (decision != null) {
return decision;
}
// subtractLeavingShards is passed as true here, since this is only for shards remaining, we will *eventually* have enough disk
// since shards are moving away. No new shards will be incoming since in canAllocate we pass false for this check.
final DiskUsage usage = getDiskUsage(node, allocation, usages, true);
final String dataPath = clusterInfo.getDataPath(shardRouting);
// If this node is already above the high threshold, the shard cannot remain (get it off!)
final double freeDiskPercentage = usage.getFreeDiskAsPercentage();
final long freeBytes = usage.getFreeBytes();
if (logger.isTraceEnabled()) {
logger.trace("node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
}
if (dataPath == null || usage.getPath().equals(dataPath) == false) {
return allocation.decision(Decision.YES, NAME, "shard is not allocated on the most utilized disk");
}
if (freeBytes < freeBytesThresholdHigh.bytes()) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
freeBytesThresholdHigh, freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]",
freeBytesThresholdHigh, new ByteSizeValue(freeBytes));
}
if (freeDiskPercentage < freeDiskThresholdHigh) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
freeDiskThresholdHigh, freeDiskPercentage, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
freeDiskThresholdHigh, freeDiskPercentage);
}
return allocation.decision(Decision.YES, NAME, "enough disk for shard to remain on node, free: [%s]", new ByteSizeValue(freeBytes));
}
private DiskUsage getDiskUsage(RoutingNode node, RoutingAllocation allocation,
Map<String, DiskUsage> usages, boolean subtractLeavingShards) {
DiskUsage usage = usages.get(node.nodeId());
if (usage == null) {
// If there is no usage, and we have other nodes in the cluster,
// use the average usage for all nodes as the usage for this node
usage = averageUsage(node, usages);
if (logger.isDebugEnabled()) {
logger.debug("unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage());
}
}
if (includeRelocations) {
long relocatingShardsSize = sizeOfRelocatingShards(node, allocation.clusterInfo(), subtractLeavingShards, usage.getPath());
DiskUsage usageIncludingRelocations = new DiskUsage(node.nodeId(), node.node().name(), usage.getPath(),
usage.getTotalBytes(), usage.getFreeBytes() - relocatingShardsSize);
if (logger.isTraceEnabled()) {
logger.trace("usage without relocations: {}", usage);
logger.trace("usage with relocations: [{} bytes] {}", relocatingShardsSize, usageIncludingRelocations);
}
usage = usageIncludingRelocations;
}
return usage;
}
/**
* Returns a {@link DiskUsage} for the {@link RoutingNode} using the
* average usage of other nodes in the disk usage map.
* @param node Node to return an averaged DiskUsage object for
* @param usages Map of nodeId to DiskUsage for all known nodes
* @return DiskUsage representing given node using the average disk usage
*/
public DiskUsage averageUsage(RoutingNode node, Map<String, DiskUsage> usages) {
if (usages.size() == 0) {
return new DiskUsage(node.nodeId(), node.node().name(), "_na_", 0, 0);
}
long totalBytes = 0;
long freeBytes = 0;
for (DiskUsage du : usages.values()) {
totalBytes += du.getTotalBytes();
freeBytes += du.getFreeBytes();
}
return new DiskUsage(node.nodeId(), node.node().name(), "_na_", totalBytes / usages.size(), freeBytes / usages.size());
}
/**
* Given the DiskUsage for a node and the size of the shard, return the
* percentage of free disk if the shard were to be allocated to the node.
* @param usage A DiskUsage for the node to have space computed for
* @param shardSize Size in bytes of the shard
* @return Percentage of free space after the shard is assigned to the node
*/
public double freeDiskPercentageAfterShardAssigned(DiskUsage usage, Long shardSize) {
shardSize = (shardSize == null) ? 0 : shardSize;
DiskUsage newUsage = new DiskUsage(usage.getNodeId(), usage.getNodeName(), usage.getPath(),
usage.getTotalBytes(), usage.getFreeBytes() - shardSize);
return newUsage.getFreeDiskAsPercentage();
}
/**
* Attempts to parse the watermark into a percentage, returning 100.0% if
* it cannot be parsed.
*/
public double thresholdPercentageFromWatermark(String watermark) {
try {
return RatioValue.parseRatioValue(watermark).getAsPercent();
} catch (ElasticsearchParseException ex) {
// NOTE: this is not end-user leniency, since up above we check that it's a valid byte or percentage, and then store the two cases separately
return 100.0;
}
}
/**
* Attempts to parse the watermark into a {@link ByteSizeValue}, returning
* a ByteSizeValue of 0 bytes if the value cannot be parsed.
*/
public ByteSizeValue thresholdBytesFromWatermark(String watermark, String settingName) {
try {
return ByteSizeValue.parseBytesSizeValue(watermark, settingName);
} catch (ElasticsearchParseException ex) {
// NOTE: this is not end-user leniency, since up above we check that it's a valid byte or percentage, and then store the two cases separately
return ByteSizeValue.parseBytesSizeValue("0b", settingName);
}
}
/**
* Checks if a watermark string is a valid percentage or byte size value,
* returning true if valid, false if invalid.
*/
public boolean validWatermarkSetting(String watermark, String settingName) {
try {
RatioValue.parseRatioValue(watermark);
return true;
} catch (ElasticsearchParseException e) {
try {
ByteSizeValue.parseBytesSizeValue(watermark, settingName);
return true;
} catch (ElasticsearchParseException ex) {
return false;
}
}
}
private Decision earlyTerminate(RoutingAllocation allocation, final Map<String, DiskUsage> usages) {
// Always allow allocation if the decider is disabled
if (!enabled) {
return allocation.decision(Decision.YES, NAME, "disk threshold decider disabled");
}
// Allow allocation regardless if only a single data node is available
if (allocation.nodes().dataNodes().size() <= 1) {
if (logger.isTraceEnabled()) {
logger.trace("only a single data node is present, allowing allocation");
}
return allocation.decision(Decision.YES, NAME, "only a single data node is present");
}
// Fail open there is no info available
final ClusterInfo clusterInfo = allocation.clusterInfo();
if (clusterInfo == null) {
if (logger.isTraceEnabled()) {
logger.trace("cluster info unavailable for disk threshold decider, allowing allocation.");
}
return allocation.decision(Decision.YES, NAME, "cluster info unavailable");
}
// Fail open if there are no disk usages available
if (usages.isEmpty()) {
if (logger.isTraceEnabled()) {
logger.trace("unable to determine disk usages for disk-aware allocation, allowing allocation");
}
return allocation.decision(Decision.YES, NAME, "disk usages unavailable");
}
return null;
}
}