/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.job;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import com.addthis.basis.util.Parameter;
import com.addthis.hydra.job.mq.HostState;
import com.addthis.hydra.job.mq.JobKey;
import com.addthis.hydra.job.spawn.HostManager;
import com.addthis.hydra.job.spawn.Spawn;
import com.addthis.maljson.JSONException;
import com.addthis.maljson.JSONObject;
import com.google.common.collect.ImmutableSet;
import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Counter;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class HostFailWorker {
private static final Logger log = LoggerFactory.getLogger(HostFailWorker.class);
private final AtomicBoolean newAdditions = new AtomicBoolean(false); // True if a host has been recently added to the queue
private final AtomicBoolean obeyTaskSlots = new AtomicBoolean(true); // Whether spawn should honor the max task slots when moving tasks to fail hosts
private final HostFailState hostFailState;
private final Spawn spawn;
private final HostManager hostManager;
// Perform host-failure related operations at a given interval
private static final long hostFailDelayMillis = Parameter.longValue("host.fail.delay", 15_000);
// Quiet period between when host is failed in UI and when Spawn begins failure-related operations
private static final long hostFailQuietPeriod = Parameter.longValue("host.fail.quiet.period", 20_000);
// Don't rebalance additional tasks if spawn is already rebalancing at least this many.
private static final int maxMovingTasks = Parameter.intValue("host.fail.maxMovingTasks", 6);
// Use a smaller max when a disk is being failed, to avoid a 'thundering herds' scenario
private static final int maxMovingTasksDiskFull = Parameter.intValue("host.fail.maxMovingTasksDiskFull", 2);
private static final Counter failHostCount = Metrics.newCounter(Spawn.class, "failHostCount");
// Various keys used to make JSON objects to send to the UI
private static final String infoHostsKey = "uuids";
private static final String infoDeadFsKey = "deadFs";
private static final String infoWarningKey = "warning";
private static final String infoPrefailCapacityKey = "prefail";
private static final String infoPostfailCapacityKey = "postfail";
private static final String infoFatalWarningKey = "fatal";
private final ScheduledExecutorService executorService;
public HostFailWorker(Spawn spawn, HostManager hostManager, ScheduledExecutorService executorService) {
this.spawn = spawn;
this.hostManager = hostManager;
this.executorService = executorService;
hostFailState = new HostFailState(spawn);
hostFailState.loadState();
}
/** Initializes scheduled execution of fail host task **/
public void initFailHostTaskSchedule() {
if (executorService != null) {
if (hostFailState.nextHostToFail() != null) {
queueFailNextHost();
}
executorService.scheduleWithFixedDelay(new FailHostTask(true), hostFailDelayMillis, hostFailDelayMillis, TimeUnit.MILLISECONDS);
}
}
/**
* Mark a series of hosts for failure
*
* @param hostIds A comma-separated list of host uuids
* @param state The state of the hosts being failed
*/
public void markHostsToFail(String hostIds, FailState state) {
if (hostIds != null) {
for (String host : hostIds.split(",")) {
hostFailState.putHost(host, state);
spawn.sendHostUpdateEvent(spawn.hostManager.getHostState(host));
}
queueFailNextHost();
}
}
/**
* @return The set of all minion ids across all failure queues.
*/
public Set<String> queuedHosts() {
return hostFailState.queuedHosts();
}
/**
* Retrieve an enum describing whether/how a host has been failed (for programmatic purposes)
*
* @param hostId A host uuid to check
* @return A FailState object describing whether the host has been failed
*/
public FailState getFailureState(String hostId) {
return hostFailState.getState(hostId);
}
public boolean shouldKickTasks(String hostId) {
FailState failState = getFailureState(hostId);
// A Failing_Fs_Okay host is nominally fine for the time being. It should be allowed to run tasks.
return failState == FailState.ALIVE || failState == FailState.FAILING_FS_OKAY;
}
/**
* Retrieve a human-readable string describing whether/how a host has been failed
*
* @param hostId A host uuid to check
* @param up Whether the host is up
* @return A String describing the host's failure state (mainly for the UI)
*/
public String getFailureStateString(String hostId, boolean up) {
FailState failState = getFailureState(hostId);
switch (failState) {
case ALIVE:
return up ? "up" : "down";
case FAILING_FS_DEAD:
return "queued to fail (fs dead)";
case FAILING_FS_OKAY:
return "queued to fail (fs okay)";
case DISK_FULL:
return "disk near full; moving tasks off";
default:
return "UNKNOWN";
}
}
/**
* Cancel the failure of one or more hosts
*
* @param hostIds A comma-separated list of host uuids
*/
public void removeHostsForFailure(String hostIds) {
if (hostIds != null) {
for (String host : hostIds.split(",")) {
hostFailState.removeHost(host);
spawn.sendHostUpdateEvent(spawn.hostManager.getHostState(host));
}
}
}
/**
* Decide whether a given host can be failed based on whether other minions in the cluster are up
*
* @param failedHostUuid The host to be failed
* @return True only if there are no down hosts that would need to be up in order to correctly fail the host
*/
protected boolean checkHostStatesForFailure(String failedHostUuid) {
Collection<HostState> hostStates = spawn.hostManager.listHostStatus(null);
for (HostState hostState : hostStates) {
if (!failedHostUuid.equals(hostState.getHostUuid()) && shouldBlockHostFailure(ImmutableSet.of(failedHostUuid), hostState)) {
log.warn("Unable to fail host: " + failedHostUuid +
" because one of the minions (" + hostState.getHostUuid() +
") on " + hostState.getHost() +
" is currently down. Retry when all minions are available");
return false;
}
}
return true;
}
/**
* Fail a host. For any tasks with replicas on that host, move these replicas elsewhere. For any tasks with live copies on the host,
* promote a replica, then make a new replica somewhere else.
*/
private void markHostDead(String failedHostUuid) {
if (failedHostUuid == null || !checkHostStatesForFailure(failedHostUuid)) {
return;
}
spawn.markHostStateDead(failedHostUuid);
hostFailState.removeHost(failedHostUuid);
failHostCount.inc();
}
/**
* Before failing host(s), check if a different host needs to be up to perform the failure operation.
*
* @param failedHostUUIDs The hosts being failed
* @param hostState The host to check
* @return True if the host is down
*/
private boolean shouldBlockHostFailure(Set<String> failedHostUUIDs, HostState hostState) {
if (hostState == null || hostState.isDead() || hostState.isUp()) {
return false;
}
for (JobKey jobKey : hostState.allJobKeys()) // never null due to implementation
{
JobTask task = spawn.getTask(jobKey);
if (task != null && (failedHostUUIDs.contains(task.getHostUUID()) || task.hasReplicaOnHosts(failedHostUUIDs))) {
// There is a task on a to-be-failed host that has a copy on a host that is down. We cannot fail for now.
return true;
}
}
return false;
}
/**
* After receiving a host failure request, queue an event to fail that host after a quiet period
*/
private void queueFailNextHost() {
if (newAdditions.compareAndSet(false, true)) {
executorService.schedule(new FailHostTask(false), hostFailQuietPeriod, TimeUnit.MILLISECONDS);
}
}
/**
* Find the next host on the fail queue, considering filesystem-dead hosts first. Perform the correct actions and remove from the queue if appropriate.
*/
private void failNextHost() {
Pair<String, FailState> hostToFail = hostFailState.nextHostToFail();
if (hostToFail != null) {
String failedHostUuid = hostToFail.getLeft();
FailState failState = hostToFail.getRight();
if (failState == FailState.FAILING_FS_DEAD) {
// File system is dead. Relocate all tasks ASAP.
markHostDead(failedHostUuid);
spawn.getSpawnBalancer().fixTasksForFailedHost(spawn.hostManager.listHostStatus(null), failedHostUuid);
} else {
HostState host = spawn.hostManager.getHostState(failedHostUuid);
if (host == null) {
// Host is gone or has no more tasks. Simply mark it as failed.
markHostDead(failedHostUuid);
return;
}
boolean diskFull = (failState == FailState.DISK_FULL);
if (!diskFull && spawn.getSystemManager().isQuiesced()) {
// If filesystem is okay, don't do any moves while spawn is quiesced.
return;
}
int taskMovingMax = diskFull ? maxMovingTasksDiskFull : maxMovingTasks;
int tasksRebalancing = countRebalancingTasks();
int tasksToMove = taskMovingMax - tasksRebalancing;
if (tasksToMove <= 0) {
// Spawn is already moving enough tasks; hold off until later
return;
}
List<JobTaskMoveAssignment> assignments = spawn.getSpawnBalancer().pushTasksOffDiskForFilesystemOkayFailure(host, tasksToMove);
// no re-assignments available for this host, move it to the end of the fs-ok queue
if(assignments.isEmpty() && tasksRebalancing == 0 && failState == FailState.FAILING_FS_OKAY) {
hostFailState.removeHost(failedHostUuid);
hostFailState.putHost(failedHostUuid, FailState.FAILING_FS_OKAY);
}
// Use available task slots to push tasks off the host in question. Not all of these assignments will necessarily be moved.
spawn.executeReallocationAssignments(assignments, !diskFull && obeyTaskSlots.get());
if (failState == FailState.FAILING_FS_OKAY && assignments.isEmpty() && host.countTotalLive() == 0) {
// Found no tasks on the failed host, so fail it for real.
markHostDead(failedHostUuid);
spawn.getSpawnBalancer().fixTasksForFailedHost(
spawn.hostManager.listHostStatus(host.getMinionTypes()), failedHostUuid);
}
}
}
}
public void setObeyTaskSlots(boolean obey) {
obeyTaskSlots.set(obey);
}
/**
* Retrieve information about the implications of failing a host, to inform/warn a user in the UI
*
* @param hostsToFail The hosts that will be failed
* @return A JSONObject with various data about the implications of the failure
*/
public JSONObject getInfoForHostFailure(String hostsToFail, boolean deadFilesystem) throws JSONException {
if (hostsToFail == null) {
return new JSONObject();
}
HashSet<String> ids = new HashSet<>(Arrays.asList(hostsToFail.split(",")));
long totalClusterAvail = 0, totalClusterUsed = 0, hostAvail = 0;
List<String> hostsDown = new ArrayList<>();
for (HostState host : spawn.hostManager.listHostStatus(null)) {
// Sum up disk availability across the entire cluster and across the specified hosts
if (host.getMax() != null && host.getUsed() != null) {
if (getFailureState(host.getHostUuid()) == FailState.ALIVE) {
totalClusterAvail += host.getMax().getDisk();
totalClusterUsed += host.getUsed().getDisk();
}
if (ids.contains(host.getHostUuid())) {
hostAvail += host.getMax().getDisk();
}
}
if (!ids.contains(host.getHostUuid()) && shouldBlockHostFailure(ids, host)) {
hostsDown.add(host.getHostUuid() + " on " + host.getHost());
}
}
// Guard against division by zero in the case of unexpected values
totalClusterAvail = Math.max(1, totalClusterAvail);
hostAvail = Math.min(totalClusterAvail - 1, hostAvail);
return constructInfoMessage(hostsToFail, deadFilesystem, (double) (totalClusterUsed) / totalClusterAvail, (double) (totalClusterUsed) / (totalClusterAvail - hostAvail), hostsDown);
}
/**
* Create the info message about host message using some raw values
*
* @param prefailCapacity The capacity the cluster had before the failure
* @param postfailCapacity The capacity the cluster would have after the failure
* @param hostsDown Any hosts that are down that might temporarily prevent failure
* @return A JSONObject encapsulating the above information.
* @throws JSONException
*/
private JSONObject constructInfoMessage(String hostsToFail, boolean deadFilesystem, double prefailCapacity,
double postfailCapacity, List<String> hostsDown) throws JSONException {
JSONObject obj = new JSONObject();
obj.put(infoHostsKey, hostsToFail);
obj.put(infoDeadFsKey, deadFilesystem);
obj.put(infoPrefailCapacityKey, prefailCapacity);
obj.put(infoPostfailCapacityKey, postfailCapacity);
if (Double.isNaN(postfailCapacity)) {
obj.put(infoFatalWarningKey, "Cannot fail all hosts from a cluster");
} else if (postfailCapacity >= 1) {
obj.put(infoFatalWarningKey, "Insufficient cluster disk capacity");
}
if (!hostsDown.isEmpty()) {
obj.put(infoWarningKey, "Some hosts are down. Host failure could be delayed until they return: " + hostsDown);
}
return obj;
}
/**
* A simple wrapper around failNextHost that is run by the failExecutor.
*/
private class FailHostTask implements Runnable {
private final boolean skipIfNewAdditions;
public FailHostTask(boolean skipIfNewAdditions) {
this.skipIfNewAdditions = skipIfNewAdditions;
}
@Override
public void run() {
updateFullMinions();
if (skipIfNewAdditions && newAdditions.get()) {
return;
}
try {
failNextHost();
} catch (Exception e) {
log.warn("Exception while failing host: {}", e.getMessage(), e);
} finally {
newAdditions.set(false);
}
}
}
public void updateFullMinions() {
for (HostState hostState : spawn.hostManager.listHostStatus(null)) {
if (hostState == null || hostState.isDead() || !hostState.isUp()) {
continue;
}
String hostId = hostState.getHostUuid();
if (spawn.getSpawnBalancer().isDiskFull(hostState)) {
markHostsToFail(hostId, HostFailWorker.FailState.DISK_FULL);
} else if (getFailureState(hostId) == HostFailWorker.FailState.DISK_FULL) {
// Host was previously full, but isn't anymore. Take it off the disk_full list.
hostFailState.removeHost(hostId);
}
}
}
private int countRebalancingTasks() {
int count = 0;
for (Job job : spawn.listJobs()) {
if (job.getState() == JobState.REBALANCE) {
for (JobTask task : job.getCopyOfTasks()) {
if (task.getState() == JobTaskState.REBALANCE) {
count++;
}
}
}
}
return count;
}
/**
* This enum tracks HostFailWorker's ideas of Host State. Options are:
* - ALIVE: host is normal
* - FAILING_FS_DEAD: User has requested that the host be failed immediately. There is a quiet period to allow the
* queue logic to exit gracefully and to allow user to cancel if there was a mistake.
* - FAILING_FS_OKAY: User has requested that the host be failed eventually, after safely migrating each task off.
* - DISK_FULL: HostFailWorker detects hosts that are nearly full on disk, and moves tasks off automatically. Once
* they return to safer levels, they will go back to ALIVE status.
*/
public enum FailState {ALIVE, FAILING_FS_DEAD, FAILING_FS_OKAY, DISK_FULL}
}