/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.job.spawn.balancer;

import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Supplier;
import com.addthis.basis.util.JitterClock;
import com.addthis.basis.util.Parameter;
import com.addthis.codec.annotations.FieldConfig;
import com.addthis.codec.codables.Codable;
import com.addthis.codec.config.Configs;
import com.addthis.codec.json.CodecJSON;
import com.addthis.hydra.job.HostFailWorker;
import com.addthis.hydra.job.IJob;
import com.addthis.hydra.job.Job;
import com.addthis.hydra.job.JobState;
import com.addthis.hydra.job.JobTask;
import com.addthis.hydra.job.JobTaskMoveAssignment;
import com.addthis.hydra.job.JobTaskReplica;
import com.addthis.hydra.job.JobTaskState;
import com.addthis.hydra.job.mq.CommandTaskStop;
import com.addthis.hydra.job.mq.CoreMessage;
import com.addthis.hydra.job.mq.HostState;
import com.addthis.hydra.job.mq.JobKey;
import com.addthis.hydra.job.spawn.HostManager;
import com.addthis.hydra.job.spawn.Spawn;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Gauge;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static com.addthis.hydra.job.JobTaskState.IDLE;
import static com.addthis.hydra.job.JobTaskState.QUEUED;
import static com.addthis.hydra.job.JobTaskState.QUEUED_HOST_UNAVAIL;
import static com.addthis.hydra.job.JobTaskState.QUEUED_NO_SLOT;
import static com.addthis.hydra.job.store.SpawnDataStoreKeys.SPAWN_BALANCE_PARAM_PATH;

/**
* A class in charge of balancing load among spawn's hosts.
 * <p>General assumptions:
 * <ul>
 * <li>The boxes are not so asymmetrical that running a job on three boxes is slower than running it on a single
 * box.</li>
 * <li>Jobs run faster when they have as few tasks grouped together on individual boxes as possible.</li>
 * </ul>
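 * <p>A minimal usage sketch (illustrative only; assumes a fully wired {@link Spawn} and
 * {@link HostManager}):
 * <pre>{@code
 * SpawnBalancer balancer = new SpawnBalancer(spawn, hostManager);
 * balancer.startTaskSizePolling();   // begin sampling task sizes in the background
 * balancer.startAutobalanceTask();   // schedule periodic autobalancing
 * }</pre>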
*/
@JsonAutoDetect(getterVisibility = JsonAutoDetect.Visibility.NONE,
isGetterVisibility = JsonAutoDetect.Visibility.NONE,
setterVisibility = JsonAutoDetect.Visibility.NONE)
public class SpawnBalancer implements Codable, AutoCloseable {
private static final Logger log = LoggerFactory.getLogger(SpawnBalancer.class);
private static final Set<JobTaskState> movableTaskStates = ImmutableSet.of(
IDLE, QUEUED, QUEUED_HOST_UNAVAIL, QUEUED_NO_SLOT);
// How often to update aggregate host statistics
static final long AGGREGATE_STAT_UPDATE_INTERVAL = Parameter.intValue("spawnbalance.stat.update", 15 * 1000);
// metrics
private volatile double avgDiskPercentUsedDiff = 0;
private volatile double minDiskPercentUsedDiff = 0;
private volatile double maxDiskPercentUsedDiff = 0;
private volatile double avgTaskPercentDiff = 0;
private volatile double minTaskPercentDiff = 0;
private volatile double maxTaskPercentDiff = 0;
private final ConcurrentHashMap<String, HostScore> cachedHostScores;
private final ReentrantLock aggregateStatisticsLock;
private final AtomicBoolean autobalanceStarted;
private final Cache<String, Boolean> recentlyAutobalancedJobs;
private final Cache<String, Boolean> recentlyBalancedHosts;
private final Cache<String, Boolean> recentlyReplicatedToHosts;
private final SpawnBalancerTaskSizer taskSizer;
private final Comparator<HostAndScore> hostAndScoreComparator;
private final Comparator<HostState> hostStateScoreComparator;
private final Comparator<Job> jobAverageTaskSizeComparator;
private final Comparator<HostState> hostStateReplicationSuitabilityComparator;
private final ScheduledExecutorService taskExecutor;
final Spawn spawn;
final HostManager hostManager;
private volatile Set<String> activeJobIds;
@FieldConfig
private volatile SpawnBalancerConfig config;
public SpawnBalancer(Spawn spawn, HostManager hostManager) {
this.spawn = spawn;
this.hostManager = hostManager;
config = loadConfigFromDataStore(new SpawnBalancerConfig());
taskExecutor = new ScheduledThreadPoolExecutor(
2, new ThreadFactoryBuilder().setNameFormat("spawnBalancer-%d").build());
taskExecutor.scheduleAtFixedRate(new AggregateStatUpdaterTask(this), AGGREGATE_STAT_UPDATE_INTERVAL,
AGGREGATE_STAT_UPDATE_INTERVAL, TimeUnit.MILLISECONDS);
taskSizer = new SpawnBalancerTaskSizer(spawn, hostManager);
cachedHostScores = new ConcurrentHashMap<>();
aggregateStatisticsLock = new ReentrantLock();
autobalanceStarted = new AtomicBoolean(false);
recentlyAutobalancedJobs = CacheBuilder.newBuilder().expireAfterWrite(
Parameter.intValue("spawnbalance.job.autobalance.interval.mins", 60 * 12), TimeUnit.MINUTES
).build();
recentlyBalancedHosts = CacheBuilder.newBuilder().expireAfterWrite(
Parameter.intValue("spawnbalance.host.balance.interval.mins", 3), TimeUnit.MINUTES
).build();
recentlyReplicatedToHosts = CacheBuilder.newBuilder().expireAfterWrite(
Parameter.intValue("spawnbalance.host.replicate.interval.mins", 15), TimeUnit.MINUTES
).build();
hostAndScoreComparator = Comparator.comparingDouble(has -> has.score);
hostStateScoreComparator = Comparator.comparingDouble(hostState -> this.getHostScoreCached(hostState.getHostUuid()));
jobAverageTaskSizeComparator = (job, job1) -> {
if ((job == null) || (job1 == null)) {
return 0;
} else {
return Double.compare(job.calcAverageTaskSizeBytes(), job1.calcAverageTaskSizeBytes());
}
};
hostStateReplicationSuitabilityComparator = (hostState, hostState1) -> {
// Treat recently-replicated-to hosts as having fewer than their reported available bytes
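            // e.g. (illustrative) hosts reporting 100GB and 80GB free would normally sort in that
            // order, but if the first was recently replicated to it counts as 50GB and sorts second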
long availBytes = hostState.getAvailDiskBytes();
long availBytes1 = hostState1.getAvailDiskBytes();
if (recentlyReplicatedToHosts.getIfPresent(hostState.getHostUuid()) != null) {
availBytes /= 2;
}
if (recentlyReplicatedToHosts.getIfPresent(hostState1.getHostUuid()) != null) {
availBytes1 /= 2;
}
            return Long.compare(availBytes1, availBytes); // descending: most available disk first
};
activeJobIds = new HashSet<>();
this.initMetrics();
}
private void initMetrics() {
SpawnBalancer.makeGauge("minDiskPercentUsedDiff", () -> minDiskPercentUsedDiff);
SpawnBalancer.makeGauge("maxDiskPercentUsedDiff", () -> maxDiskPercentUsedDiff);
SpawnBalancer.makeGauge("avgDiskPercentUsedDiff", () -> avgDiskPercentUsedDiff);
SpawnBalancer.makeGauge("minTaskPercentDiff", () -> minTaskPercentDiff);
SpawnBalancer.makeGauge("maxTaskPercentDiff", () -> maxTaskPercentDiff);
SpawnBalancer.makeGauge("avgTaskPercentDiff", () -> avgTaskPercentDiff);
}
private static <T> void makeGauge(String name, Supplier<T> value) {
Gauge<T> gauge = new Gauge<T>() {
@Override public T value() {
return value.get();
}
};
Metrics.newGauge(SpawnBalancer.class, name, gauge);
}
/** Loads SpawnBalancerConfig from data store; if no data or failed, returns the default. */
@VisibleForTesting
protected SpawnBalancerConfig loadConfigFromDataStore(SpawnBalancerConfig defaultValue) {
String configString = spawn.getSpawnDataStore().get(SPAWN_BALANCE_PARAM_PATH);
if (!Strings.isNullOrEmpty(configString)) {
try {
return Configs.decodeObject(SpawnBalancerConfig.class, configString);
} catch (Exception e) {
log.warn("Failed to decode SpawnBalancerConfig", e);
}
}
return defaultValue;
}
/**
* A cached version of getHostScore that will keep host score calculation outside the main spawn thread
*
* @param hostId The host to measure
* @return A non-negative number representing load.
*/
public double getHostScoreCached(String hostId) {
if (hostId == null) {
return config.getDefaultHostScore();
}
HostScore score = cachedHostScores.get(hostId);
if (score != null) {
return score.getOverallScore();
} else {
return config.getDefaultHostScore();
}
}
/** Start a thread that will perform autobalancing in the background if appropriate to do so */
public synchronized void startAutobalanceTask() {
if (autobalanceStarted.compareAndSet(false, true)) {
taskExecutor.scheduleWithFixedDelay(new AutobalanceTask(this), config.getAutobalanceCheckInterval(),
config.getAutobalanceCheckInterval(), TimeUnit.MILLISECONDS);
}
}
/**
* Is this job unbalanced enough to warrant a rebalance?
*
* @param job The job to check
* @param hosts The hosts in the cluster
* @return True if it is appropriate to rebalance the job
*/
private synchronized boolean shouldAutobalanceJob(Job job, List<HostState> hosts) {
if ((job == null)
|| (recentlyAutobalancedJobs.getIfPresent(job.getId()) != null)
|| (JobState.IDLE != job.getState())
|| job.getDontAutoBalanceMe()
|| (job.getRunCount() < 1)) {
return false;
}
if (config.getAutoBalanceLevel() >= 2) {
// At level 2, rebalance every job that hasn't already been rebalanced recently
return true;
}
JobTaskItemByHostMap tasksByHost = generateTaskCountByHost(hosts, job.getCopyOfTasks());
int maxPerHost = maxTasksPerHost(job, hosts.size());
// If any host has sufficiently more or less than the expected fair share, this job is unbalanced.
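        // e.g. with maxPerHost = 4, a host holding 5 or more copies of the job's tasks, or one
        // holding 2 or fewer, marks the job as a rebalance candidate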
return (tasksByHost.findLeastTasksOnHost() <= (maxPerHost - 2)) ||
(tasksByHost.findMostTasksOnHost() >= (maxPerHost + 1));
}
@Override public void close() {
MoreExecutors.shutdownAndAwaitTermination(taskExecutor, 120, TimeUnit.SECONDS);
}
public void saveConfigToDataStore() {
try {
spawn.getSpawnDataStore().put(SPAWN_BALANCE_PARAM_PATH, CodecJSON.encodeString(config));
} catch (Exception e) {
log.warn("Failed to save SpawnBalancerConfig to data store", e);
}
}
public void startTaskSizePolling() {
taskSizer.startPolling(taskExecutor);
}
/**
     * Takes an arbitrary list of tasks and finds the best hosts to house all of them.
*
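     * <p>A single host-score map is shared across the per-job placement calls, so hosts chosen for
     * one job's tasks weigh more heavily when the next job's tasks are placed.
     *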
* @param tasks The tasks to assign to hosts
* @param hosts All available hosts in the cluster
* @return A map describing where to send each task
*/
public Map<JobTask, String> assignTasksFromMultipleJobsToHosts(Collection<JobTask> tasks,
Collection<HostState> hosts) {
// Populate a map grouping the tasks by job ID.
Map<String, List<JobTask>> tasksByJobID = new HashMap<>();
for (JobTask task : tasks) {
if (task.getJobUUID() != null) {
Job job = spawn.getJob(task.getJobUUID());
if (job != null) {
List<JobTask> taskList = tasksByJobID.computeIfAbsent(job.getId(), key -> new ArrayList<>());
taskList.add(task);
}
}
}
// This map of hosts to scores will be updated by every call to assignTasksFromSingleJobToHosts.
Map<HostState, Double> hostScoreMap = generateHostStateScoreMap(hosts, null);
// This map stores where to send each task.
Map<JobTask, String> hostAssignments = new HashMap<>(tasks.size());
for (Map.Entry<String, List<JobTask>> entry : tasksByJobID.entrySet()) {
Map<JobTask, String> singleHostAssignments =
assignTasksFromSingleJobToHosts(entry.getValue(), hostScoreMap);
hostAssignments.putAll(singleHostAssignments);
}
return hostAssignments;
}
/**
* Internal function that creates a map of HostStates to their hostScores.
*
* @param hosts The hosts from spawn
     * @param jobID If specified, adds a factor that scales with the number of siblings on each host from that job.
* @return A map taking each HostState to its score
*/
private Map<HostState, Double> generateHostStateScoreMap(Collection<HostState> hosts, @Nullable String jobID) {
final Map<HostState, Double> hostScores = new HashMap<>(hosts.size());
for (HostState host : hosts) {
if ((host != null) && host.isUp() && !host.isDead()) {
int siblingScore = (jobID != null) ? (host.getTaskCount(jobID) * config.getSiblingWeight()) : 0;
double score = getHostScoreCached(host.getHostUuid()) + siblingScore;
hostScores.put(host, score);
}
}
return hostScores;
}
/**
* Take a list of tasks belonging to a single job and find the best hosts to house these tasks.
* A map of host scores is passed as a parameter so that it can be persisted across multiple calls to this function
* by, for example, assignTasksFromMultipleJobsToHosts.
*/
private Map<JobTask, String> assignTasksFromSingleJobToHosts(List<JobTask> tasks,
Map<HostState, Double> storedHostScores) {
if ((tasks == null) || tasks.isEmpty()) {
return new HashMap<>();
}
// Make a heap of hosts based on the storedHostScores, from lightest to heaviest.
PriorityQueue<HostAndScore> hostScores =
new PriorityQueue<>(1 + storedHostScores.size(), hostAndScoreComparator);
for (Map.Entry<HostState, Double> hostStateDoubleEntry : storedHostScores.entrySet()) {
if (canReceiveNewTasks(hostStateDoubleEntry.getKey())) {
hostScores.add(new HostAndScore(hostStateDoubleEntry.getKey(), hostStateDoubleEntry.getValue()));
}
}
if (hostScores.isEmpty()) {
log.warn("[spawn.balancer] found no hosts eligible to receive new tasks");
throw new RuntimeException("no eligible hosts for new task");
}
// Make a list of hosts as big as the list of tasks. This list may repeat hosts if it is necessary to do so.
Collection<String> hostsToAssign = new ArrayList<>(tasks.size());
for (JobTask task : tasks) {
if (task == null) {
continue;
}
// Pick the lightest host.
HostAndScore h = hostScores.poll();
HostState host = h.host;
hostsToAssign.add(host.getHostUuid());
// Then moderately weight that host in the heap so we won't pick it again immediately.
hostScores.add(new HostAndScore(host, h.score + config.getSiblingWeight()));
// Lightly weight that host in storedHostScores so we won't pick the same hosts over and over if we call
// this method repeatedly
storedHostScores.put(host, h.score + config.getActiveTaskWeight());
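            // Net effect: the in-heap score grows by the sibling weight (spreading this job's
            // tasks apart), while the shared map grows by the active-task weight (spreading load
            // across jobs placed with the same map on later calls)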
}
return pairTasksAndHosts(tasks, hostsToAssign);
}
public boolean canReceiveNewTasks(HostState host) {
if (host == null) {
return false;
}
if (spawn.getHostFailWorker().getFailureState(host.getHostUuid()) != HostFailWorker.FailState.ALIVE) {
return false;
}
return host.canMirrorTasks() && (host.getAvailDiskBytes() > config.getMinFreeDiskSpaceToRecieveNewTasks());
}
/**
* Given a list of tasks and a list of potential hosts of the same size, pair them up.
* Keep tasks on their existing hosts as much as possible.
*/
private static Map<JobTask, String> pairTasksAndHosts(List<JobTask> tasks, Collection<String> hosts) {
if ((tasks == null) || (hosts == null) || (tasks.size() != hosts.size())) {
log.warn("[spawn.balancer] invalid call to pairTasksAndHosts: tasks={} hosts={}", tasks, hosts);
return new HashMap<>(0);
}
Map<JobTask, String> rv = new HashMap<>(tasks.size());
// For any task already living on a host in our hosts list, keep that task where it is.
Collection<JobTask> unassignedTasks = new ArrayList<>();
String jobID = tasks.get(0).getJobUUID();
for (JobTask task : tasks) {
if (!task.getJobUUID().equals(jobID)) {
                throw new RuntimeException(
                        "Illegal call to pairTasksAndHosts: not all tasks came from the same job");
}
String hostID = task.getHostUUID();
if ((hostID != null) && hosts.contains(hostID)) {
hosts.remove(hostID);
} else {
unassignedTasks.add(task);
}
}
// Assign the remaining tasks by iterating down the remaining hosts.
Iterator<String> hostIterator = hosts.iterator();
for (JobTask task : unassignedTasks) {
rv.put(task, hostIterator.next());
}
return rv;
}
public List<JobTaskMoveAssignment> pushTasksOffDiskForFilesystemOkayFailure(HostState host, int moveLimit) {
List<HostState> hosts = hostManager.listHostStatus(null);
return pushTasksOffHost(host, hosts, false, 1, moveLimit, false);
}
/* Push/pull the tasks on a host to balance its disk, obeying an overall limit on the number of tasks/bytes to
move */
private List<JobTaskMoveAssignment> pushTasksOffHost(HostState host,
Collection<HostState> otherHosts,
boolean limitBytes,
double byteLimitFactor,
int moveLimit,
boolean obeyDontAutobalanceMe) {
List<JobTaskMoveAssignment> rv = purgeMisplacedTasks(host, moveLimit);
if (rv.size() <= moveLimit) {
long byteLimit = (long) (byteLimitFactor * config.getBytesMovedFullRebalance());
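            // e.g. (illustrative) byteLimitFactor = 0.5 against a 100GB full-rebalance budget caps this pass at 50GB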
List<HostState> hostsSorted = sortHostsByDiskSpace(otherHosts);
for (JobTask task : findTasksToMove(host, obeyDontAutobalanceMe)) {
long taskTrueSize = getTaskTrueSize(task);
if (limitBytes && (taskTrueSize > byteLimit)) {
continue;
}
JobTaskMoveAssignment assignment = moveTask(task, host.getHostUuid(), hostsSorted);
// we don't want to take up one of the limited rebalance slots
// with an assignment that we know has no chance of happening
// because either the assignment is null or the target host
// for the assignment is null
if (assignment != null && assignment.getTargetUUID() != null) {
markRecentlyReplicatedTo(assignment.getTargetUUID());
rv.add(assignment);
byteLimit -= taskTrueSize;
}
if (rv.size() >= moveLimit) {
break;
}
}
}
return rv;
}
    /* Look through a HostState to find tasks that don't correspond to an actual job or are on the wrong host */
private List<JobTaskMoveAssignment> purgeMisplacedTasks(HostState host, int deleteLimit) {
List<JobTaskMoveAssignment> rv = new ArrayList<>();
for (JobKey key : host.allJobKeys()) {
if (spawn.getJob(key) == null) {
// Nonexistent job
rv.add(new JobTaskMoveAssignment(key, host.getHostUuid(), null, false, true));
} else {
// Task has a copy on the wrong host. Do a fixTaskDir to ensure we aren't deleting the only remaining
// copy
JobTask task = spawn.getTask(key);
if (!host.getHostUuid().equals(task.getHostUUID()) && !task.hasReplicaOnHost(host.getHostUuid())) {
spawn.fixTaskDir(key.getJobUuid(), key.getNodeNumber(), false, false);
deleteLimit -= 1;
}
}
if (rv.size() >= deleteLimit) {
break;
}
}
return rv;
}
/**
     * Sorts the hosts by their available disk space, from most space available to least
*
* @param hosts - the hosts to sort
* @return the sorted list of hosts
*/
@VisibleForTesting
protected List<HostState> sortHostsByDiskSpace(Collection<HostState> hosts) {
List<HostState> hostList = new ArrayList<>(hosts);
removeDownHosts(hostList);
Collections.sort(hostList, hostStateReplicationSuitabilityComparator);
return hostList;
}
/**
* Find suitable tasks to move off a host.
*
     * @param host The host to move tasks off of
     * @param obeyDontAutobalanceMe If true, obey the job parameter dontAutobalanceMe. See notes below.
* @return A list of tasks that are suitable to be moved
*/
private Iterable<JobTask> findTasksToMove(HostState host, boolean obeyDontAutobalanceMe) {
Collection<JobTask> rv = new ArrayList<>();
if (host != null) {
String hostId = host.getHostUuid();
for (JobKey jobKey : host.allJobKeys()) {
Job job = spawn.getJob(jobKey);
JobTask task = spawn.getTask(jobKey);
                // Only consider tasks that exist, are in a movable (idle/queued) state, and are
                // supposed to live on this host as either the live copy or a replica
                if ((job != null) && (task != null) && isInMovableState(task)
                    && (hostId.equals(task.getHostUUID()) || task.hasReplicaOnHost(hostId))) {
if (obeyDontAutobalanceMe && job.getDontAutoBalanceMe()) {
// obeyDontAutobalanceMe is set to false when spawn is doing a filesystem-okay host failure.
// In this case, spawn needs to move the task even if the job owner specified no swapping,
// because the box is likely to be ailing/scheduled for decommission.
// All rebalancing actions use obeyDontAutobalanceMe=true and will conform to the job owner's
// wishes.
continue;
}
rv.add(task);
}
}
}
return rv;
}
public long getTaskTrueSize(JobTask task) {
return taskSizer.estimateTrueSize(task);
}
/**
* Given an ordered list of hosts, move the task to a suitable host, then move that host to the end of the list
*
* @param task The task to move
     * @param fromHostId The host to move the task from
* @param otherHosts The potential target hosts
* @return An assignment describing how to move the task
*/
@Nullable private JobTaskMoveAssignment moveTask(JobTask task,
String fromHostId,
Collection<HostState> otherHosts) {
Iterator<HostState> hostStateIterator = otherHosts.iterator();
String taskHost = task.getHostUUID();
boolean live = task.getHostUUID().equals(fromHostId);
if (!live && !task.hasReplicaOnHost(fromHostId)) {
return null;
}
while (hostStateIterator.hasNext()) {
HostState next = hostStateIterator.next();
String nextId = next.getHostUuid();
if (!taskHost.equals(nextId) && !task.hasReplicaOnHost(nextId) && next.canMirrorTasks() &&
okToPutReplicaOnHost(next, task)) {
hostStateIterator.remove();
otherHosts.add(next);
return new JobTaskMoveAssignment(task.getJobKey(), fromHostId, nextId, !live, false);
}
}
return null;
}
@VisibleForTesting
protected void markRecentlyReplicatedTo(String hostId) {
if (hostId != null) {
recentlyReplicatedToHosts.put(hostId, true);
}
}
private static void removeDownHosts(Iterable<HostState> hosts) {
Iterator<HostState> hostIter = hosts.iterator();
while (hostIter.hasNext()) {
HostState host = hostIter.next();
if (host.isDead() || !host.isUp()) {
hostIter.remove();
}
}
}
public static boolean isInMovableState(JobTask task) {
return (task != null) && movableTaskStates.contains(task.getState());
}
/**
* Is it acceptable to put a replica of this task on this host? (task can't have a live and replica version on
* host)
*
* @param hostCandidate The host that is being considered to house the new replica
* @param task The task that might be replicated to that host
* @return True if it is okay to put a replica on the host
*/
private boolean okToPutReplicaOnHost(HostState hostCandidate, JobTask task) {
Job job;
String hostId;
if ((hostCandidate == null) || ((hostId = hostCandidate.getHostUuid()) == null) ||
!canReceiveNewTasks(hostCandidate) || ((job = spawn.getJob(task.getJobKey())) == null) ||
!hostCandidate.getMinionTypes().contains(job.getMinionType())) {
return false;
}
if (spawn.getHostFailWorker().getFailureState(hostId) != HostFailWorker.FailState.ALIVE) {
return false;
}
HostState taskHost = hostManager.getHostState(task.getHostUUID());
/* Protect against npe in case the existing host has disappeared somehow */
String existingHost = (taskHost != null) ? taskHost.getHost() : null;
        /* in non-local-stack setups, prevent replicas on the same physical host (multi-minion-per-host setup) */
if (!config.allowSameHostReplica() && hostCandidate.getHost().equals(existingHost)) {
return false;
}
/* don't let the same minion have duplicate tasks */
if (task.getHostUUID().equals(hostCandidate.getHostUuid()) ||
task.hasReplicaOnHost(hostCandidate.getHostUuid())) {
return false;
}
/* try not to put a task on a host if it would almost fill the host */
if (taskSizer.estimateTrueSize(task) > (config.getHostDiskFactor() * hostCandidate.getAvailDiskBytes())) {
return false;
}
return true;
}
/**
* Decide if spawn is in a good state to perform an autobalance.
*
* @return True if it is okay to autobalance
*/
public boolean okayToAutobalance() {
// Don't autobalance if it is disabled, spawn is quiesced, or the failure queue is non-empty
if ((config.getAutoBalanceLevel() == 0) ||
spawn.getSystemManager().isQuiesced() ||
            !spawn.getHostFailWorker().queuedHosts().isEmpty()) {
return false;
}
// Don't autobalance if there are still jobs in rebalance state
for (Job job : spawn.listJobs()) {
if (JobState.REBALANCE.equals(job.getState())) {
log.warn("Auto rebalance blocked by job (enabled = {}) in rebalance: {}", job.isEnabled(), job.getId());
return false;
}
}
return true;
}
/**
* Find some task move assignments to autobalance the cluster
*
* @param type Whether to balance hosts or jobs
* @param weight Whether to balance light, medium, or heavy items of the chosen type
* @return A list of assignments to perform the specified balancing operation
*/
@Nullable public List<JobTaskMoveAssignment> getAssignmentsForAutoBalance(RebalanceType type,
RebalanceWeight weight) {
List<HostState> hosts = hostManager.getLiveHosts(null);
switch (type) {
case HOST:
if (hosts.isEmpty()) {
return null;
}
List<HostState> hostsSorted = new ArrayList<>(hosts);
Collections.sort(hostsSorted, hostStateScoreComparator);
HostState hostToBalance = hostsSorted.get(getWeightedElementIndex(hostsSorted.size(), weight));
return getAssignmentsToBalanceHost(hostToBalance,
hostManager.listHostStatus(hostToBalance.getMinionTypes()));
case JOB:
List<Job> autobalanceJobs = getJobsToAutobalance(hosts);
if ((autobalanceJobs == null) || autobalanceJobs.isEmpty()) {
return null;
}
Job jobToBalance = autobalanceJobs.get(getWeightedElementIndex(autobalanceJobs.size(), weight));
recentlyAutobalancedJobs.put(jobToBalance.getId(), true);
return getAssignmentsForJobReallocation(jobToBalance, -1,
hostManager.listHostStatus(jobToBalance.getMinionType()));
default:
throw new IllegalArgumentException("unknown rebalance type " + type);
}
}
public List<Job> getJobsToAutobalance(List<HostState> hosts) {
List<Job> autobalanceJobs = new ArrayList<>();
for (Job job : spawn.listJobs()) {
if (shouldAutobalanceJob(job, hosts)) {
autobalanceJobs.add(job);
}
}
Collections.sort(autobalanceJobs, jobAverageTaskSizeComparator);
return autobalanceJobs;
}
public Map<Integer, List<String>> getAssignmentsForNewReplicas(IJob job) {
return getAssignmentsForNewReplicas(job, -1);
}
/**
* Given a job, decide where to put additional replicas so that every task will have its full quantity of replicas.
*
* @param job The job in question (not altered)
* @param taskID The task that needs replicas, or -1 for all tasks
* @return Map sending nodeid => list of host IDs for which to make new replicas
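     *         e.g. {0=[hostA, hostB], 3=[hostC]} requests two new replicas of task 0 and one of
     *         task 3 (host names are illustrative)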
*/
public Map<Integer, List<String>> getAssignmentsForNewReplicas(IJob job, int taskID) {
Map<Integer, List<String>> rv = new HashMap<>();
if (job == null) {
return rv;
}
int replicaCount = job.getReplicas();
Map<String, Double> scoreMap = generateTaskCountHostScoreMap(job);
PriorityQueue<HostAndScore> scoreHeap = new PriorityQueue<>(1, hostAndScoreComparator);
for (Map.Entry<String, Double> entry : scoreMap.entrySet()) {
scoreHeap.add(new HostAndScore(hostManager.getHostState(entry.getKey()), entry.getValue()));
}
Map<String, Integer> allocationMap = new HashMap<>();
        // taskID >= 0 selects that single task; a negative taskID (the documented -1) means all tasks
        List<JobTask> tasks = (taskID >= 0) ? Collections.singletonList(job.getTask(taskID)) : job.getCopyOfTasks();
for (JobTask task : tasks) {
int numExistingReplicas = task.getReplicas() != null ? task.getReplicas().size() : 0;
List<String> hostIDsToAdd = new ArrayList<>(replicaCount);
// Add new replicas as long as the task needs them & there are remaining hosts
for (int i = 0; i < (replicaCount - numExistingReplicas); i++) {
                // PriorityQueue iteration order is unspecified, so sort a snapshot to genuinely
                // consider the lightest hosts first
                List<HostAndScore> sortedScores = new ArrayList<>(scoreHeap);
                sortedScores.sort(hostAndScoreComparator);
                for (HostAndScore hostAndScore : sortedScores) {
HostState candidateHost = hostAndScore.host;
if ((candidateHost == null) || !candidateHost.canMirrorTasks()) {
continue;
}
                    int currentCount = allocationMap.getOrDefault(candidateHost.getHostUuid(), 0);
if (okToPutReplicaOnHost(candidateHost, task)) {
hostIDsToAdd.add(candidateHost.getHostUuid());
scoreHeap.remove(hostAndScore);
scoreHeap.add(new HostAndScore(candidateHost, hostAndScore.score + 1));
allocationMap.put(candidateHost.getHostUuid(), currentCount + 1);
break;
}
}
}
if (!hostIDsToAdd.isEmpty()) {
rv.put(task.getTaskID(), hostIDsToAdd);
}
}
return rv;
}
/**
     * Count the number of live and replica tasks per host for a single job, then add in a small
     * factor based on how much disk space each host has available
*
* @param job The job to count
* @return A map describing how heavily a job is assigned to each of its hosts
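     *         e.g. a host holding two live tasks and one replica from the job, with 40% of its
     *         disk free, maps to 1 + 1 + 1 + 0.4 = 3.4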
*/
private Map<String, Double> generateTaskCountHostScoreMap(IJob job) {
Map<String, Double> rv = new HashMap<>();
if (job != null) {
List<JobTask> tasks = job.getCopyOfTasks();
for (JobTask task : tasks) {
rv.put(task.getHostUUID(), addOrIncrement(rv.get(task.getHostUUID()), 1d));
if (task.getReplicas() == null) {
continue;
}
for (JobTaskReplica replica : task.getReplicas()) {
rv.put(replica.getHostUUID(), addOrIncrement(rv.get(replica.getHostUUID()), 1d));
}
}
for (HostState host : hostManager.listHostStatus(job.getMinionType())) {
if (host.isUp() && !host.isDead()) {
double availDisk = 1 - host.getDiskUsedPercent();
rv.put(host.getHostUuid(), addOrIncrement(rv.get(host.getHostUuid()), availDisk));
}
}
}
return rv;
}
private static double addOrIncrement(Double currentValue, Double value) {
if (currentValue != null) {
return currentValue + value;
} else {
return value;
}
}
public void requestJobSizeUpdate(String jobId, int taskId) {
taskSizer.requestJobSizeFetch(jobId, taskId);
}
public SpawnBalancerConfig getConfig() {
return config;
}
public void setConfig(SpawnBalancerConfig config) {
this.config = config;
}
public void clearRecentlyRebalancedHosts() {
recentlyBalancedHosts.invalidateAll();
}
/**
* Given a list of tasks with lives/replicas on a failed host, fix all of the tasks
*
* @param hosts All available hosts in the cluster of the appropriate type
* @param failedHost The id of the host being failed
*/
public void fixTasksForFailedHost(List<HostState> hosts, String failedHost) {
List<JobTask> tasks = findAllTasksAssignedToHost(failedHost);
List<JobTask> sortedTasks = new ArrayList<>(tasks);
Collections.sort(sortedTasks,
(o1, o2) -> Long.compare(taskSizer.estimateTrueSize(o1), taskSizer.estimateTrueSize(o2)));
hosts = sortHostsByDiskSpace(hosts);
Collection<String> modifiedJobIds = new HashSet<>();
for (JobTask task : sortedTasks) {
modifiedJobIds.add(task.getJobUUID());
try {
attemptFixTaskForFailedHost(task, hosts, failedHost);
} catch (Exception ex) {
log.warn("Warning: failed to recover task {}", task.getJobKey(), ex);
}
}
for (String jobId : modifiedJobIds) {
try {
spawn.updateJob(spawn.getJob(jobId));
} catch (Exception e) {
log.warn("Warning: failed to update job: {}", jobId, e);
}
}
}
private List<JobTask> findAllTasksAssignedToHost(String failedHostUUID) {
List<JobTask> rv = new ArrayList<>();
spawn.acquireJobLock();
try {
for (Job job : spawn.listJobs()) {
if (job != null) {
for (JobTask task : job.getCopyOfTasks()) {
if ((task != null) &&
(task.getHostUUID().equals(failedHostUUID) || task.hasReplicaOnHost(failedHostUUID))) {
rv.add(task);
}
}
}
}
return rv;
} finally {
spawn.releaseJobLock();
}
}
/**
* For a particular task with a copy on a failed host, attempt to find a suitable replica; mark it degraded
* otherwise
*
* @param task The task to modify
* @param hosts A list of available hosts
* @param failedHostUuid The host being failed
*/
private void attemptFixTaskForFailedHost(JobTask task, Collection<HostState> hosts, String failedHostUuid) {
Iterator<HostState> hostIterator = hosts.iterator();
Job job;
if ((task == null) || (task.getJobUUID() == null) || ((job = spawn.getJob(task.getJobUUID())) == null)) {
log.warn("Skipping nonexistent job for task {} during host fail.", task);
return;
}
if (!task.getHostUUID().equals(failedHostUuid) && !task.hasReplicaOnHost(failedHostUuid)) {
// This task was not actually assigned to the failed host. Nothing to do.
return;
}
if (!spawn.isNewTask(task) && ((task.getReplicas() == null) || task.getReplicas().isEmpty())) {
log.warn("Found no replica for task {}", task.getJobKey());
job.setState(JobState.DEGRADED, true);
return;
}
while (hostIterator.hasNext()) {
HostState host = hostIterator.next();
if (host.getHostUuid().equals(failedHostUuid)) {
continue;
}
if (host.canMirrorTasks() && okToPutReplicaOnHost(host, task)) {
// Host found! Move this host to the end of the host list so we don't immediately pick it again
hostIterator.remove();
hosts.add(host);
executeHostFailureRecovery(task, failedHostUuid, host);
return;
}
}
log.warn("Failed to find a host that could hold {} after host failure", task.getJobKey());
job.setState(JobState.DEGRADED, true);
}
/**
* Modify the live/replica copies of a task to handle a failed host
*
* @param task The task to be modified
* @param failedHostUuid The host being failed
* @param newReplicaHost A host that will receive a new copy of the data
*/
private void executeHostFailureRecovery(JobTask task, String failedHostUuid, CoreMessage newReplicaHost) {
boolean liveOnFailedHost = task.getHostUUID().equals(failedHostUuid);
String newReplicaUuid = newReplicaHost.getHostUuid();
if (liveOnFailedHost) {
if (spawn.isNewTask(task)) {
// Task has never run before. Just switch to the new host.
task.setHostUUID(newReplicaHost.getHostUuid());
} else {
// Send a kill message if the task is running on the failed host
spawn.sendControlMessage(
new CommandTaskStop(failedHostUuid, task.getJobUUID(), task.getTaskID(), 0, true, false));
// Find a replica, promote it, and tell it to replicate to the new replica on completion
String chosenReplica = task.getReplicas().get(0).getHostUUID();
task.replaceReplica(chosenReplica, newReplicaUuid);
task.setHostUUID(chosenReplica);
spawn.replicateTask(task, Collections.singletonList(newReplicaUuid));
}
} else {
// Replace the replica on the failed host with one on a new host
task.replaceReplica(failedHostUuid, newReplicaUuid);
if (!spawn.isNewTask(task)) {
spawn.replicateTask(task, Collections.singletonList(newReplicaUuid));
}
}
}
/**
* Makes the specified number of tasks for the given job ID, and assigns each task to a suitable host.
*
* @param jobID the job in question
* @param taskCount how many tasks should be created
* @param hosts a current set of hosts
* @return a list of tasks with HostID set.
*/
public List<JobTask> generateAssignedTasksForNewJob(String jobID, int taskCount, Collection<HostState> hosts)
throws Exception {
List<JobTask> tasks = generateUnassignedTaskList(jobID, taskCount);
Map<JobTask, String> hostAssignments =
assignTasksFromSingleJobToHosts(tasks, generateHostStateScoreMap(hosts, null));
List<JobTask> rv = new ArrayList<>(tasks.size());
for (Map.Entry<JobTask, String> entry : hostAssignments.entrySet()) {
JobTask task = entry.getKey();
String hostID = entry.getValue();
if (hostID == null) {
throw new RuntimeException("Unable to allocate job tasks because no suitable host was found");
}
task.setHostUUID(hostID);
rv.add(task);
}
return rv;
}
/**
* Makes a list of empty tasks which will be assigned to hosts later.
*/
private static List<JobTask> generateUnassignedTaskList(String jobID, int taskCount) {
List<JobTask> rv = new ArrayList<>(Math.max(0, taskCount));
for (int i = 0; i < taskCount; i++) {
JobTask task = new JobTask();
task.setJobUUID(jobID);
task.setTaskID(i);
rv.add(task);
}
return rv;
}
/**
* Advises Spawn how to reallocate a job, sending some tasks to different hosts
*
* @param job The job being reallocated
* @param tasksToMove The number of tasks to move. If <= 0, use the default.
* @param hosts The available target hosts
     * @return a list of move assignments sending tasks to good hosts
*/
public List<JobTaskMoveAssignment> getAssignmentsForJobReallocation(Job job,
int tasksToMove,
List<HostState> hosts) {
int maxTasksToMove = (tasksToMove > 0) ? tasksToMove : config.getTasksMovedFullRebalance();
List<JobTaskMoveAssignment> candidateAssignments = new ArrayList<>();
// Count the number of tasks per host
JobTaskItemByHostMap tasksByHost = generateTaskCountByHost(hosts, job.getCopyOfTasks());
// Find the max number of tasks each host should have
int maxPerHost = maxTasksPerHost(job, tasksByHost.size());
if (log.isDebugEnabled()) {
log.debug("Rebalancing job: {} maxTasksToMove={} maxPerHost={}", job.getId(), maxTasksToMove, maxPerHost);
}
while (candidateAssignments.size() < maxTasksToMove) {
MoveAssignmentList moves = null;
List<String> hostsSorted = tasksByHost.generateHostsSorted();
String hostWithMost = hostsSorted.get(hostsSorted.size() - 1);
String hostWithLeast = hostsSorted.get(0);
int mostTasksOnHost = tasksByHost.findMostTasksOnHost();
int leastTasksOnHost = tasksByHost.findLeastTasksOnHost();
boolean isExtremeHost = isExtremeHost(hostWithMost, true, true);
if (log.isDebugEnabled()) {
log.debug(
"hostsSorted.size={} hostWithMost:{} hostWithLeast:{} mostTasksOnHost: {} leastTasksOnHost: {}",
hostsSorted.size(), hostWithMost, hostWithLeast, mostTasksOnHost, leastTasksOnHost);
}
// If any host has more than the max number, move some tasks off that host
if (mostTasksOnHost > maxPerHost) {
moves = moveTasksOffHost(tasksByHost, maxPerHost, 1, -1, hostWithMost);
            } else if (leastTasksOnHost < (maxPerHost - 1)) {
                // If a host has significantly fewer than the max number, move some tasks onto that host
                moves = moveTasksOntoHost(tasksByHost, maxPerHost, 1, -1, hostWithLeast);
} else if (isExtremeHost) { // If a host with many tasks is heavily weighted, move a task off that host
moves = moveTasksOffHost(tasksByHost, maxPerHost, 1, -1, hostWithMost);
}
if ((moves == null) || moves.isEmpty()) {
break;
} else {
candidateAssignments.addAll(moves);
}
}
if (candidateAssignments.size() > maxTasksToMove) {
candidateAssignments = candidateAssignments.subList(0, maxTasksToMove);
}
candidateAssignments = removeDuplicateAssignments(candidateAssignments);
return pruneTaskReassignments(candidateAssignments);
}
/**
* Advises Spawn how to reallocate a host, pushing or pulling jobs to balance the number of tasks run by each
* machine.
*
* @param host The particular host to consider
* @param hosts All available hosts; should include host
     * @return a (possibly empty) list specifying some tasks and advised destinations for those tasks
*/
public List<JobTaskMoveAssignment> getAssignmentsToBalanceHost(HostState host, List<HostState> hosts) {
String hostID = host.getHostUuid();
List<JobTaskMoveAssignment> rv = new ArrayList<>();
if ((hosts == null) || hosts.isEmpty()) {
log.warn("[spawn.balancer] {} reallocation failed: host list empty", hostID);
return rv;
}
List<HostState> sortedHosts = sortHostsByDiskSpace(hosts);
HostFailWorker.FailState failState = spawn.getHostFailWorker().getFailureState(hostID);
int numAlleviateHosts = (int) Math.ceil(sortedHosts.size() * config.getAlleviateHostPercentage());
if ((failState == HostFailWorker.FailState.FAILING_FS_OKAY) || isExtremeHost(hostID, true, true) ||
(host.getAvailDiskBytes() < config.getMinFreeDiskSpaceToRecieveNewTasks())) {
// Host disk is overloaded
log.info("[spawn.balancer] {} categorized as overloaded host; looking for tasks to push off of it", hostID);
List<HostState> lightHosts = sortedHosts.subList(0, numAlleviateHosts);
rv.addAll(pushTasksOffHost(host, lightHosts, true, 1, config.getTasksMovedFullRebalance(), true));
} else if (isExtremeHost(hostID, true, false)) {
// Host disk is underloaded
log.info("[spawn.balancer] {} categorized as underloaded host; looking for tasks to pull onto it", hostID);
List<HostState> heavyHosts =
Lists.reverse(sortedHosts.subList(sortedHosts.size() - numAlleviateHosts, sortedHosts.size()));
            rv.addAll(pushTasksOntoDisk(host, heavyHosts));
} else if (isExtremeHost(hostID, false, true)) {
// Host is overworked
log.info("[spawn.balance] {} categorized as overworked host; looking for tasks to push off it", hostID);
rv.addAll(balanceActiveJobsOnHost(host, hosts));
}
if (rv.isEmpty()) {
rv.addAll(balanceActiveJobsOnHost(host, hosts));
}
return pruneTaskReassignments(rv);
}
/**
     * Sorts the hosts based on how many tasks from active jobs each is running, lightest to heaviest.
*
* @param hosts - the hosts to sort
* @return the sorted list of hosts, light to heavy.
*/
public List<HostState> sortHostsByActiveTasks(Collection<HostState> hosts) {
List<HostState> hostList = new ArrayList<>(hosts);
removeDownHosts(hostList);
Collections.sort(hostList, Comparator.comparingDouble(this::countTotalActiveTasksOnHost));
return hostList;
}
private int countTotalActiveTasksOnHost(HostState host) {
int count = 0;
if (host != null) {
host.generateJobTaskCountMap();
Set<String> jobIds = getActiveJobIds();
for (String jobId : jobIds) {
count += host.getTaskCount(jobId);
}
}
return count;
}
private Set<String> getActiveJobIds() {
return activeJobIds;
}
/** Updates activeJobIds atomically */
@VisibleForTesting
void updateActiveJobIDs() {
Collection<Job> jobs = spawn.listJobsConcurrentImmutable();
if ((jobs != null) && !jobs.isEmpty()) {
Set<String> jobIds = new HashSet<>(getActiveJobIds().size());
for (Job job : jobs) {
if (isWellFormedAndActiveJob(job)) {
jobIds.add(job.getId());
}
}
this.activeJobIds = jobIds;
}
}
/**
* Is this job non-null and has it run recently?
*/
private boolean isWellFormedAndActiveJob(IJob job) {
long earliestActiveTime = JitterClock.globalTime() - config.getActiveTaskMilliCutoff();
return (job != null) && (job.getStartTime() != null) && (job.getStartTime() > earliestActiveTime);
}
/**
     * Given a job, for each task, remove duplicate replicas and any replica pointing at the live host
*
* @param job The job to modify
*/
public void removeInvalidReplicas(IJob job) {
if ((job != null) && (job.getCopyOfTasks() != null)) {
List<JobTask> tasks = job.getCopyOfTasks();
for (JobTask task : tasks) {
List<JobTaskReplica> newReplicas = new ArrayList<>(job.getReplicas());
if (task.getReplicas() != null) {
                    Iterable<JobTaskReplica> oldReplicas = new ArrayList<>(task.getReplicas());
                    // Declared outside the loop so duplicate replicas are actually detected
                    Collection<String> replicasSeen = new ArrayList<>();
                    for (JobTaskReplica replica : oldReplicas) {
String replicaHostID = replica.getHostUUID();
if (hostManager.getHostState(replicaHostID) == null) {
log.warn("[spawn.balancer] removing replica for missing host {}", replicaHostID);
} else if (replicaHostID.equals(task.getHostUUID()) || replicasSeen.contains(replicaHostID)) {
log.warn("[spawn.balancer] removing erroneous replica for {} on {}",
task.getJobKey(), replicaHostID);
} else if (!config.allowSameHostReplica() && onSameHost(replicaHostID, task.getHostUUID())) {
log.warn("[spawn.balancer] removing replica on same host for {}live={} replica={}",
task.getJobKey(), task.getHostUUID(), replicaHostID);
} else {
replicasSeen.add(replicaHostID);
newReplicas.add(replica);
}
}
}
task.setReplicas(newReplicas);
}
}
}
private boolean onSameHost(String hostID1, String hostID2) {
HostState host1 = hostManager.getHostState(hostID1);
HostState host2 = hostManager.getHostState(hostID2);
if ((host1 == null) || (host2 == null)) {
return false;
} else {
return host1.getHost().equals(host2.getHost());
}
}
/**
* Check the live and replica hosts for a given task to see whether any of these hosts has a nearly-full disk
*
* @param task The task to check
* @return True if at least one host is near full
*/
protected boolean hasFullDiskHost(JobTask task) {
Collection<HostState> hostsToCheck = new ArrayList<>();
hostsToCheck.add(hostManager.getHostState(task.getHostUUID()));
        if (task.getReplicas() != null) {
            for (JobTaskReplica replica : task.getReplicas()) {
                if (replica != null) {
                    hostsToCheck.add(hostManager.getHostState(replica.getHostUUID()));
                }
            }
        }
        // Check the live host's disk even when the task has no replicas
        for (HostState host : hostsToCheck) {
            if ((host != null) && isDiskFull(host)) {
                return true;
            }
        }
return false;
}
/**
* Does this host have a nearly full disk?
*
* @param host The host to check
* @return True if the disk is nearly full
*/
public boolean isDiskFull(@Nullable HostState host) {
if (host == null) {
return false;
}
long freeSpace = host.getAvailDiskBytes();
boolean full = freeSpace <= config.getMinFreeDiskSpaceToRunJobs();
if (full) {
log.warn("[spawn.balancer] Host {} with uuid {} is nearly full, with {} GB free disk space", host.getHost(),
host.getHostUuid(), freeSpace / 1_000_000_000);
}
return full;
}
/**
* Update SpawnBalancer's cluster-wide metrics, including host scores and active jobs.
*
* @param hosts A list of HostStates
*/
protected void updateAggregateStatistics(List<HostState> hosts) {
aggregateStatisticsLock.lock();
try {
updateActiveJobIDs();
double maxMeanActive = -1;
double maxDiskPercentUsed = -1;
            double minDiskPercentUsed = 1; // percents are in [0,1]; start at the top so Math.min finds the true minimum
double sumDiskPercentUsed = 0;
double maxTaskPercent = 0;
double minTaskPercent = 1;
double sumTaskPercent = 0;
for (HostState host : hosts) {
maxMeanActive = Math.max(maxMeanActive, host.getMeanActiveTasks());
double diskPercentUsed = host.getDiskUsedPercent();
sumDiskPercentUsed += diskPercentUsed;
maxDiskPercentUsed = Math.max(diskPercentUsed, maxDiskPercentUsed);
minDiskPercentUsed = Math.min(diskPercentUsed, minDiskPercentUsed);
double taskPercent = host.getMeanActiveTasks();
sumTaskPercent += taskPercent;
maxTaskPercent = Math.max(taskPercent, maxTaskPercent);
minTaskPercent = Math.min(taskPercent, minTaskPercent);
}
int numScores = hosts.size();
double avgDiskPercentUsed = sumDiskPercentUsed / (double) numScores;
double sumDiskPercentUsedDiff = 0;
double avgTaskPercent = sumTaskPercent / (double) numScores;
double sumTaskPercentDiff = 0;
for (HostState host : hosts) {
HostScore score = calculateHostScore(host, maxMeanActive, maxDiskPercentUsed);
cachedHostScores.put(host.getHostUuid(), score);
// update average metrics
double diskDiff = Math.abs(avgDiskPercentUsed - host.getDiskUsedPercent());
sumDiskPercentUsedDiff += diskDiff;
                double taskPercent = score.getScoreValue(false);
                sumTaskPercentDiff += Math.abs(avgTaskPercent - taskPercent);
}
avgDiskPercentUsedDiff = sumDiskPercentUsedDiff / (double) numScores;
avgTaskPercentDiff = sumTaskPercentDiff / (double) numScores;
minDiskPercentUsedDiff = avgDiskPercentUsed - minDiskPercentUsed;
maxDiskPercentUsedDiff = maxDiskPercentUsed - avgDiskPercentUsed;
minTaskPercentDiff = avgTaskPercent - minTaskPercent;
maxTaskPercentDiff = maxTaskPercent - avgTaskPercent;
} finally {
aggregateStatisticsLock.unlock();
}
}
private HostScore calculateHostScore(HostState host, double clusterMaxMeanActive, double clusterMaxDiskUsed) {
double meanActive = host.getMeanActiveTasks();
// Get percentage of allowed disk space used (max - min free space = allowed)
double diskUsedPercentModified = host.getDiskUsedPercentModified(config.getMinFreeDiskSpaceToRunJobs());
int activeTaskWeight = config.getActiveTaskWeight();
int diskUsedWeight = config.getDiskUsedWeight();
// Assemble the score as a combination of the mean active tasks and the disk used
double exponent = 2.5;
double score = activeTaskWeight * Math.pow(meanActive, exponent);
double diskPercentPowered = Math.pow(diskUsedPercentModified, exponent);
score += diskUsedWeight * diskPercentPowered;
// If host is very full, make sure to give the host a big score
score = Math.max(score, (activeTaskWeight + diskUsedWeight) * diskPercentPowered);
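        // Worked example with illustrative weights of 30 each: meanActive = 0.5 and
        // diskUsedPercentModified = 0.9 give 30 * 0.5^2.5 + 30 * 0.9^2.5 ~= 5.3 + 23.1 = 28.4,
        // which the full-disk floor then raises to 60 * 0.9^2.5 ~= 46.1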
return new HostScore(meanActive, diskUsedPercentModified, score);
}
/**
* Is this host's load significantly different from the rest of the cluster?
*
* @param hostID The host to check
* @param diskSpace Whether to consider load based on disk space (as opposed to number of tasks)
* @param high Whether to look for heavy load as opposed to light load
* @return True if the host has the specified level of load
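     *         e.g. with extremeHostRatio = 1.5 (illustrative) and a cluster-average score of 40,
     *         a host scoring above 60 is "high" extreme and one below about 26.7 is "low" extreme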
*/
protected boolean isExtremeHost(@Nullable String hostID, boolean diskSpace, boolean high) {
aggregateStatisticsLock.lock();
try {
            if ((hostID == null) || cachedHostScores.isEmpty() || !cachedHostScores.containsKey(hostID)) {
return false;
}
double clusterAverage = 0;
for (HostScore score : cachedHostScores.values()) {
clusterAverage += score.getScoreValue(diskSpace);
}
clusterAverage /= cachedHostScores.size(); // Nonzero as we check if cachedHostScores.isEmpty first
double hostValue = cachedHostScores.get(hostID).getScoreValue(diskSpace);
return (high && (hostValue > (clusterAverage * config.getExtremeHostRatio()))) ||
(!high && (hostValue < (clusterAverage / config.getExtremeHostRatio())));
} finally {
aggregateStatisticsLock.unlock();
}
}
/* Pull tasks off the given host and move them elsewhere, obeying the maxPerHost and maxBytesToMove limits */
private MoveAssignmentList moveTasksOffHost(JobTaskItemByHostMap tasksByHost,
int maxPerHost,
int numToMove,
long maxBytesToMove,
String pushHost) {
if (log.isDebugEnabled()) {
log.debug("received move assignment maxPerHost:{} numToMove:{} pushHost:{}", maxPerHost, numToMove,
pushHost);
}
MoveAssignmentList rv = new MoveAssignmentList(spawn, taskSizer);
Collection<JobKey> alreadyMoved = new HashSet<>();
Iterator<String> otherHosts = tasksByHost.getHostIterator(true);
while (otherHosts.hasNext() && (rv.size() < numToMove)) {
String pullHost = otherHosts.next();
if (pushHost.equals(pullHost)) {
continue;
}
            HostState pullHostState = hostManager.getHostState(pullHost);
            if (pullHostState == null) {
                continue;
            }
Iterator<JobTaskItem> itemIterator = new ArrayList<>(tasksByHost.get(pushHost)).iterator();
while (itemIterator.hasNext() && (rv.size() < numToMove) &&
(tasksByHost.get(pullHost).size() < maxPerHost)) {
JobTaskItem nextTaskItem = itemIterator.next();
long trueSizeBytes = taskSizer.estimateTrueSize(nextTaskItem.getTask());
JobKey jobKey = nextTaskItem.getTask().getJobKey();
Job job = spawn.getJob(jobKey);
if ((job == null) || !pullHostState.getMinionTypes().contains(job.getMinionType())) {
continue;
}
// Reject the move if the target host is heavily loaded, already has a copy of the task, or the task
// is too large
if (isExtremeHost(pullHost, true, true) || pullHostState.hasLive(jobKey) ||
((maxBytesToMove > 0) && (trueSizeBytes > maxBytesToMove))) {
if (log.isDebugEnabled()) {
log.debug("Unable to move task to host {} fullDisk={} alreadyLive={} byteCount={}>{} {}",
pullHost, isExtremeHost(pullHost, true, true), pullHostState.hasLive(jobKey),
trueSizeBytes, maxBytesToMove, trueSizeBytes > maxBytesToMove);
}
continue;
}
if (!alreadyMoved.contains(jobKey) && (pullHost != null) &&
tasksByHost.moveTask(nextTaskItem, pushHost, pullHost)) {
rv.add(new JobTaskMoveAssignment(nextTaskItem.getTask().getJobKey(), pushHost, pullHost, false,
false));
alreadyMoved.add(nextTaskItem.getTask().getJobKey());
maxBytesToMove -= trueSizeBytes;
}
}
}
return rv;
}
/* Push tasks onto the given host, obeying the maxPerHost and maxBytesToMove limits */
private MoveAssignmentList moveTasksOntoHost(JobTaskItemByHostMap tasksByHost,
int maxPerHost,
int numToMove,
long maxBytesToMove,
String pullHost) {
MoveAssignmentList rv = new MoveAssignmentList(spawn, taskSizer);
Collection<JobKey> alreadyMoved = new HashSet<>();
Iterator<String> otherHosts = tasksByHost.getHostIterator(false);
if (isExtremeHost(pullHost, true, true)) {
return rv;
}
if (isExtremeHost(pullHost, false, true)) {
numToMove = Math.max(1, numToMove / 2); // Move fewer tasks onto a host if it's already doing a lot of work
}
HostState pullHostState = hostManager.getHostState(pullHost);
if (pullHostState == null) {
return rv;
}
while (otherHosts.hasNext() && (rv.size() < numToMove) && (tasksByHost.get(pullHost).size() < maxPerHost)) {
String pushHost = otherHosts.next();
if ((pushHost == null) || pushHost.equals(pullHost)) {
continue;
}
Collection<JobTaskItem> pushHostItems = new ArrayList<>(tasksByHost.get(pushHost));
if (pushHostItems.size() < maxPerHost) {
break;
}
for (JobTaskItem item : pushHostItems) {
JobKey jobKey = item.getTask().getJobKey();
Job job = spawn.getJob(jobKey);
if ((job == null) || !pullHostState.getMinionTypes().contains(job.getMinionType())) {
continue;
}
long trueSizeBytes = taskSizer.estimateTrueSize(item.getTask());
if (pullHostState.hasLive(item.getTask().getJobKey()) ||
((maxBytesToMove > 0) && (trueSizeBytes > maxBytesToMove))) {
continue;
}
if (!alreadyMoved.contains(jobKey) && tasksByHost.moveTask(item, pushHost, pullHost)) {
rv.add(new JobTaskMoveAssignment(item.getTask().getJobKey(), pushHost, pullHost, false, false));
alreadyMoved.add(item.getTask().getJobKey());
maxBytesToMove -= trueSizeBytes;
}
if (rv.size() >= numToMove) {
break;
}
}
}
return rv;
}
/* Count the number of tasks that live on each host */
private JobTaskItemByHostMap generateTaskCountByHost(List<HostState> hosts, Iterable<JobTask> tasks) {
JobTaskItemByHostMap rv = new JobTaskItemByHostMap(this, hosts, config.getTasksMovedPerUnspecifiedHost(),
config.getTasksMovedPerUnspecifiedHost());
for (JobTask task : tasks) {
rv.addLiveAndReplicasForTask(task);
}
return rv;
}
private List<JobTaskMoveAssignment> pushTasksOntoDisk(HostState host, Iterable<HostState> heavyHosts) {
MoveAssignmentList moveAssignments = new MoveAssignmentList(spawn, taskSizer);
for (HostState heavyHost : heavyHosts) {
double byteLimitFactor =
1 - ((double) moveAssignments.getBytesUsed() / config.getBytesMovedFullRebalance());
moveAssignments.addAll(pushTasksOffHost(heavyHost, Collections.singletonList(host), true, byteLimitFactor,
config.getTasksMovedFullRebalance(), true));
}
moveAssignments.addAll(purgeMisplacedTasks(host, 1));
return moveAssignments;
}
/* For each active job, ensure that the given host has a fair share of tasks from that job */
private Collection<JobTaskMoveAssignment> balanceActiveJobsOnHost(HostState host, List<HostState> hosts) {
int totalTasksToMove = config.getTasksMovedFullRebalance();
long totalBytesToMove = config.getBytesMovedFullRebalance();
Set<String> activeJobs = getActiveJobIds();
List<JobTaskMoveAssignment> rv = purgeMisplacedTasks(host, 1);
String hostID = host.getHostUuid();
for (String jobID : activeJobs) {
spawn.acquireJobLock();
try {
Job job = spawn.getJob(jobID);
if (job != null) {
JobTaskItemByHostMap tasksByHost =
new JobTaskItemByHostMap(this, hosts, config.getTasksMovedPerUnspecifiedHost(),
config.getTasksMovedPerUnspecifiedHost());
for (JobTask task : job.getCopyOfTasks()) {
tasksByHost.addLiveAndReplicasForTask(task);
}
int maxPerHost = maxTasksPerHost(job, hosts.size());
int numExistingTasks = tasksByHost.get(hostID).size();
if ((tasksByHost.findLeastTasksOnHost() >= (maxPerHost - 1)) ||
(tasksByHost.findMostTasksOnHost() <= maxPerHost)) {
continue;
}
boolean pushFrom = (numExistingTasks > maxPerHost) || ((numExistingTasks == maxPerHost) &&
(tasksByHost.findLeastTasksOnHost() <
(maxPerHost - 1)));
if (totalTasksToMove > 0) {
MoveAssignmentList assignments = pushFrom ?
moveTasksOffHost(tasksByHost, maxPerHost, 1, totalBytesToMove,
host.getHostUuid())
:
moveTasksOntoHost(tasksByHost, maxPerHost, 1, totalBytesToMove,
host.getHostUuid());
rv.addAll(assignments);
totalTasksToMove -= assignments.size();
totalBytesToMove -= assignments.getBytesUsed();
} else {
break;
}
}
} finally {
spawn.releaseJobLock();
}
}
return rv;
}
/**
* Prune a tentative list of task reassignments, removing illegal moves or moves to overburdened hosts
*
* @param candidateAssignments The initial list of assignments
* @return A list of assignments with illogical moves removed
*/
private List<JobTaskMoveAssignment> pruneTaskReassignments(Iterable<JobTaskMoveAssignment> candidateAssignments) {
List<JobTaskMoveAssignment> rv = new ArrayList<>();
Map<String, Boolean> snapshot = new HashMap<>(recentlyBalancedHosts.asMap());
for (JobTaskMoveAssignment assignment : candidateAssignments) {
String newHostID = assignment.getTargetUUID();
JobKey jobKey = assignment.getJobKey();
String jobID = (jobKey == null) ? null : jobKey.getJobUuid();
if (isExtremeHost(newHostID, true, true)) {
log.warn("[spawn.balancer] decided not to move task from job {} to host {} " +
"because it is already heavily loaded", jobID, newHostID);
continue;
}
HostState newHost = hostManager.getHostState(newHostID);
if ((newHost == null) || newHost.hasLive(jobKey) ||
!canReceiveNewTasks(newHost)) {
log.warn("[spawn.balancer] decided not to move task from job {} to host {} " +
"because it cannot receive the new task", jobID, newHostID);
continue;
}
if (snapshot.containsKey(newHostID)) {
log.warn("[spawn.balancer] decided not to move task from job {} to host {} " +
"because it already received a different task recently", jobID, newHostID);
continue;
}
rv.add(assignment);
recentlyBalancedHosts.put(newHostID, true);
}
return rv;
}
    private static List<JobTaskMoveAssignment> removeDuplicateAssignments(
            Iterable<JobTaskMoveAssignment> candidateAssignments) {
Collection<JobKey> movedTasks = new HashSet<>();
List<JobTaskMoveAssignment> rv = new ArrayList<>();
for (JobTaskMoveAssignment assignment : candidateAssignments) {
JobKey jobKey = assignment.getJobKey();
if (!movedTasks.contains(jobKey)) {
rv.add(assignment);
movedTasks.add(jobKey);
}
}
return rv;
}
/**
* Assuming a job has n tasks, each with R replicas, and there are m hosts available, each host should have no more
* than n*(1+R)/m, rounded up.
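     * For example, n=20 tasks with R=2 replicas each across m=7 hosts gives ceil(20 * 3 / 7) = ceil(8.57) = 9.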
*/
private static int maxTasksPerHost(Job job, int numHosts) {
if (job == null) {
return 0;
}
numHosts = Math.max(1, numHosts);
return (int) Math.ceil((double) (job.getTaskCount() * (1 + job.getReplicas())) / numHosts);
}
private static int getWeightedElementIndex(int numItems, RebalanceWeight weight) {
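        // e.g. with numItems = 5: LIGHT -> 0, MEDIUM -> 2, HEAVY -> 4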
switch (weight) {
case LIGHT:
return 0;
case MEDIUM:
return numItems / 2;
case HEAVY:
return numItems - 1;
default:
throw new IllegalArgumentException("unknown weight type " + weight);
}
}
}