/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.job;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;

import com.addthis.basis.util.Parameter;

import com.addthis.hydra.job.mq.HostState;
import com.addthis.hydra.job.mq.JobKey;
import com.addthis.hydra.job.spawn.HostManager;
import com.addthis.hydra.job.spawn.Spawn;
import com.addthis.maljson.JSONException;
import com.addthis.maljson.JSONObject;

import com.google.common.collect.ImmutableSet;

import com.yammer.metrics.Metrics;
import com.yammer.metrics.core.Counter;

import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Coordinates the failing of minion hosts: tracks hosts queued for failure, migrates their tasks
 * to other hosts, and automatically moves tasks off hosts whose disks are nearly full.
 */
public class HostFailWorker {

    private static final Logger log = LoggerFactory.getLogger(HostFailWorker.class);

    // True if a host has been recently added to the queue
    private final AtomicBoolean newAdditions = new AtomicBoolean(false);
    // Whether spawn should honor the max task slots when moving tasks off hosts that are being failed
    private final AtomicBoolean obeyTaskSlots = new AtomicBoolean(true);
    private final HostFailState hostFailState;
    private final Spawn spawn;
    private final HostManager hostManager;

    // Perform host-failure related operations at a given interval
    private static final long hostFailDelayMillis = Parameter.longValue("host.fail.delay", 15_000);
    // Quiet period between when host is failed in UI and when Spawn begins failure-related operations
    private static final long hostFailQuietPeriod = Parameter.longValue("host.fail.quiet.period", 20_000);
    // Don't rebalance additional tasks if spawn is already rebalancing at least this many.
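    // (Each pass of failNextHost() starts at most this many moves, less the number of tasks already
    // rebalancing; see countRebalancingTasks() and failNextHost() below.)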
    private static final int maxMovingTasks = Parameter.intValue("host.fail.maxMovingTasks", 6);
    // Use a smaller max when a disk is being failed, to avoid a 'thundering herd' scenario
    private static final int maxMovingTasksDiskFull = Parameter.intValue("host.fail.maxMovingTasksDiskFull", 2);

    private static final Counter failHostCount = Metrics.newCounter(Spawn.class, "failHostCount");

    // Various keys used to make JSON objects to send to the UI
    private static final String infoHostsKey = "uuids";
    private static final String infoDeadFsKey = "deadFs";
    private static final String infoWarningKey = "warning";
    private static final String infoPrefailCapacityKey = "prefail";
    private static final String infoPostfailCapacityKey = "postfail";
    private static final String infoFatalWarningKey = "fatal";

    private final ScheduledExecutorService executorService;

    public HostFailWorker(Spawn spawn, HostManager hostManager, ScheduledExecutorService executorService) {
        this.spawn = spawn;
        this.hostManager = hostManager;
        this.executorService = executorService;
        hostFailState = new HostFailState(spawn);
        hostFailState.loadState();
    }

    /** Initializes scheduled execution of the fail-host task. */
    public void initFailHostTaskSchedule() {
        if (executorService != null) {
            if (hostFailState.nextHostToFail() != null) {
                queueFailNextHost();
            }
            executorService.scheduleWithFixedDelay(new FailHostTask(true), hostFailDelayMillis,
                                                   hostFailDelayMillis, TimeUnit.MILLISECONDS);
        }
    }

    /**
     * Mark a series of hosts for failure
     *
     * @param hostIds A comma-separated list of host uuids
     * @param state   The state of the hosts being failed
     */
    public void markHostsToFail(String hostIds, FailState state) {
        if (hostIds != null) {
            for (String host : hostIds.split(",")) {
                hostFailState.putHost(host, state);
                spawn.sendHostUpdateEvent(spawn.hostManager.getHostState(host));
            }
            queueFailNextHost();
        }
    }

    /**
     * @return The set of all minion ids across all failure queues.
     */
    public Set<String> queuedHosts() {
        return hostFailState.queuedHosts();
    }

    /**
     * Retrieve an enum describing whether/how a host has been failed (for programmatic purposes)
     *
     * @param hostId A host uuid to check
     * @return A FailState object describing whether the host has been failed
     */
    public FailState getFailureState(String hostId) {
        return hostFailState.getState(hostId);
    }

    /** Returns true if the host may continue to run tasks, i.e. it is ALIVE or FAILING_FS_OKAY. */
    public boolean shouldKickTasks(String hostId) {
        FailState failState = getFailureState(hostId);
        // A FAILING_FS_OKAY host is nominally fine for the time being. It should be allowed to run tasks.
        return failState == FailState.ALIVE || failState == FailState.FAILING_FS_OKAY;
    }

    /**
     * Retrieve a human-readable string describing whether/how a host has been failed
     *
     * @param hostId A host uuid to check
     * @param up     Whether the host is up
     * @return A String describing the host's failure state (mainly for the UI)
     */
    public String getFailureStateString(String hostId, boolean up) {
        FailState failState = getFailureState(hostId);
        switch (failState) {
            case ALIVE:
                return up ?
"up" : "down"; case FAILING_FS_DEAD: return "queued to fail (fs dead)"; case FAILING_FS_OKAY: return "queued to fail (fs okay)"; case DISK_FULL: return "disk near full; moving tasks off"; default: return "UNKNOWN"; } } /** * Cancel the failure of one or more hosts * * @param hostIds A comma-separated list of host uuids */ public void removeHostsForFailure(String hostIds) { if (hostIds != null) { for (String host : hostIds.split(",")) { hostFailState.removeHost(host); spawn.sendHostUpdateEvent(spawn.hostManager.getHostState(host)); } } } /** * Decide whether a given host can be failed based on whether other minions in the cluster are up * * @param failedHostUuid The host to be failed * @return True only if there are no down hosts that would need to be up in order to correctly fail the host */ protected boolean checkHostStatesForFailure(String failedHostUuid) { Collection<HostState> hostStates = spawn.hostManager.listHostStatus(null); for (HostState hostState : hostStates) { if (!failedHostUuid.equals(hostState.getHostUuid()) && shouldBlockHostFailure(ImmutableSet.of(failedHostUuid), hostState)) { log.warn("Unable to fail host: " + failedHostUuid + " because one of the minions (" + hostState.getHostUuid() + ") on " + hostState.getHost() + " is currently down. Retry when all minions are available"); return false; } } return true; } /** * Fail a host. For any tasks with replicas on that host, move these replicas elsewhere. For any tasks with live copies on the host, * promote a replica, then make a new replica somewhere else. */ private void markHostDead(String failedHostUuid) { if (failedHostUuid == null || !checkHostStatesForFailure(failedHostUuid)) { return; } spawn.markHostStateDead(failedHostUuid); hostFailState.removeHost(failedHostUuid); failHostCount.inc(); } /** * Before failing host(s), check if a different host needs to be up to perform the failure operation. * * @param failedHostUUIDs The hosts being failed * @param hostState The host to check * @return True if the host is down */ private boolean shouldBlockHostFailure(Set<String> failedHostUUIDs, HostState hostState) { if (hostState == null || hostState.isDead() || hostState.isUp()) { return false; } for (JobKey jobKey : hostState.allJobKeys()) // never null due to implementation { JobTask task = spawn.getTask(jobKey); if (task != null && (failedHostUUIDs.contains(task.getHostUUID()) || task.hasReplicaOnHosts(failedHostUUIDs))) { // There is a task on a to-be-failed host that has a copy on a host that is down. We cannot fail for now. return true; } } return false; } /** * After receiving a host failure request, queue an event to fail that host after a quiet period */ private void queueFailNextHost() { if (newAdditions.compareAndSet(false, true)) { executorService.schedule(new FailHostTask(false), hostFailQuietPeriod, TimeUnit.MILLISECONDS); } } /** * Find the next host on the fail queue, considering filesystem-dead hosts first. Perform the correct actions and remove from the queue if appropriate. */ private void failNextHost() { Pair<String, FailState> hostToFail = hostFailState.nextHostToFail(); if (hostToFail != null) { String failedHostUuid = hostToFail.getLeft(); FailState failState = hostToFail.getRight(); if (failState == FailState.FAILING_FS_DEAD) { // File system is dead. Relocate all tasks ASAP. 
                markHostDead(failedHostUuid);
                spawn.getSpawnBalancer().fixTasksForFailedHost(spawn.hostManager.listHostStatus(null), failedHostUuid);
            } else {
                HostState host = spawn.hostManager.getHostState(failedHostUuid);
                if (host == null) {
                    // Host is gone or has no more tasks. Simply mark it as failed.
                    markHostDead(failedHostUuid);
                    return;
                }
                boolean diskFull = (failState == FailState.DISK_FULL);
                if (!diskFull && spawn.getSystemManager().isQuiesced()) {
                    // If filesystem is okay, don't do any moves while spawn is quiesced.
                    return;
                }
                int taskMovingMax = diskFull ? maxMovingTasksDiskFull : maxMovingTasks;
                int tasksRebalancing = countRebalancingTasks();
                int tasksToMove = taskMovingMax - tasksRebalancing;
                if (tasksToMove <= 0) {
                    // Spawn is already moving enough tasks; hold off until later
                    return;
                }
                List<JobTaskMoveAssignment> assignments =
                        spawn.getSpawnBalancer().pushTasksOffDiskForFilesystemOkayFailure(host, tasksToMove);
                // No re-assignments available for this host; move it to the end of the fs-okay queue
                if (assignments.isEmpty() && tasksRebalancing == 0 && failState == FailState.FAILING_FS_OKAY) {
                    hostFailState.removeHost(failedHostUuid);
                    hostFailState.putHost(failedHostUuid, FailState.FAILING_FS_OKAY);
                }
                // Use available task slots to push tasks off the host in question.
                // Not all of these assignments will necessarily be moved.
                spawn.executeReallocationAssignments(assignments, !diskFull && obeyTaskSlots.get());
                if (failState == FailState.FAILING_FS_OKAY && assignments.isEmpty() && host.countTotalLive() == 0) {
                    // Found no tasks on the failed host, so fail it for real.
                    markHostDead(failedHostUuid);
                    spawn.getSpawnBalancer().fixTasksForFailedHost(
                            spawn.hostManager.listHostStatus(host.getMinionTypes()), failedHostUuid);
                }
            }
        }
    }

    public void setObeyTaskSlots(boolean obey) {
        obeyTaskSlots.set(obey);
    }

    /**
     * Retrieve information about the implications of failing a host, to inform/warn a user in the UI
     *
     * @param hostsToFail    The hosts that will be failed
     * @param deadFilesystem Whether the hosts are being failed with dead filesystems
     * @return A JSONObject with various data about the implications of the failure
     */
    public JSONObject getInfoForHostFailure(String hostsToFail, boolean deadFilesystem) throws JSONException {
        if (hostsToFail == null) {
            return new JSONObject();
        }
        HashSet<String> ids = new HashSet<>(Arrays.asList(hostsToFail.split(",")));
        long totalClusterAvail = 0, totalClusterUsed = 0, hostAvail = 0;
        List<String> hostsDown = new ArrayList<>();
        for (HostState host : spawn.hostManager.listHostStatus(null)) {
            // Sum up disk availability across the entire cluster and across the specified hosts
            if (host.getMax() != null && host.getUsed() != null) {
                if (getFailureState(host.getHostUuid()) == FailState.ALIVE) {
                    totalClusterAvail += host.getMax().getDisk();
                    totalClusterUsed += host.getUsed().getDisk();
                }
                if (ids.contains(host.getHostUuid())) {
                    hostAvail += host.getMax().getDisk();
                }
            }
            if (!ids.contains(host.getHostUuid()) && shouldBlockHostFailure(ids, host)) {
                hostsDown.add(host.getHostUuid() + " on " + host.getHost());
            }
        }
        // Guard against division by zero in the case of unexpected values
        totalClusterAvail = Math.max(1, totalClusterAvail);
        hostAvail = Math.min(totalClusterAvail - 1, hostAvail);
        return constructInfoMessage(hostsToFail, deadFilesystem,
                                    (double) (totalClusterUsed) / totalClusterAvail,
                                    (double) (totalClusterUsed) / (totalClusterAvail - hostAvail),
                                    hostsDown);
    }

    /**
     * Create the info message about host failure using some raw values
     *
     * @param hostsToFail      A comma-separated list of the host uuids being failed
     * @param deadFilesystem   Whether the hosts are being failed with dead filesystems
     * @param prefailCapacity  The capacity the cluster had before the failure
     * @param postfailCapacity The capacity the cluster would have after the
     *                         failure
     * @param hostsDown        Any hosts that are down that might temporarily prevent failure
     * @return A JSONObject encapsulating the above information.
     * @throws JSONException
     */
    private JSONObject constructInfoMessage(String hostsToFail, boolean deadFilesystem, double prefailCapacity,
                                            double postfailCapacity, List<String> hostsDown) throws JSONException {
        JSONObject obj = new JSONObject();
        obj.put(infoHostsKey, hostsToFail);
        obj.put(infoDeadFsKey, deadFilesystem);
        obj.put(infoPrefailCapacityKey, prefailCapacity);
        obj.put(infoPostfailCapacityKey, postfailCapacity);
        if (Double.isNaN(postfailCapacity)) {
            obj.put(infoFatalWarningKey, "Cannot fail all hosts from a cluster");
        } else if (postfailCapacity >= 1) {
            obj.put(infoFatalWarningKey, "Insufficient cluster disk capacity");
        }
        if (!hostsDown.isEmpty()) {
            obj.put(infoWarningKey, "Some hosts are down. Host failure could be delayed until they return: " + hostsDown);
        }
        return obj;
    }

    /**
     * A simple wrapper around failNextHost that is run by the failExecutor.
     */
    private class FailHostTask implements Runnable {

        private final boolean skipIfNewAdditions;

        public FailHostTask(boolean skipIfNewAdditions) {
            this.skipIfNewAdditions = skipIfNewAdditions;
        }

        @Override
        public void run() {
            updateFullMinions();
            if (skipIfNewAdditions && newAdditions.get()) {
                return;
            }
            try {
                failNextHost();
            } catch (Exception e) {
                log.warn("Exception while failing host: {}", e.getMessage(), e);
            } finally {
                newAdditions.set(false);
            }
        }
    }

    public void updateFullMinions() {
        for (HostState hostState : spawn.hostManager.listHostStatus(null)) {
            if (hostState == null || hostState.isDead() || !hostState.isUp()) {
                continue;
            }
            String hostId = hostState.getHostUuid();
            if (spawn.getSpawnBalancer().isDiskFull(hostState)) {
                markHostsToFail(hostId, HostFailWorker.FailState.DISK_FULL);
            } else if (getFailureState(hostId) == HostFailWorker.FailState.DISK_FULL) {
                // Host was previously full, but isn't anymore. Take it off the disk_full list.
                hostFailState.removeHost(hostId);
            }
        }
    }

    private int countRebalancingTasks() {
        int count = 0;
        for (Job job : spawn.listJobs()) {
            if (job.getState() == JobState.REBALANCE) {
                for (JobTask task : job.getCopyOfTasks()) {
                    if (task.getState() == JobTaskState.REBALANCE) {
                        count++;
                    }
                }
            }
        }
        return count;
    }

    /**
     * This enum tracks HostFailWorker's idea of host state. Options are:
     * - ALIVE: host is normal
     * - FAILING_FS_DEAD: User has requested that the host be failed immediately. There is a quiet period to allow the
     *   queue logic to exit gracefully and to allow the user to cancel if there was a mistake.
     * - FAILING_FS_OKAY: User has requested that the host be failed eventually, after safely migrating each task off.
     * - DISK_FULL: HostFailWorker detects hosts that are nearly full on disk, and moves tasks off automatically. Once
     *   they return to safer levels, they will go back to ALIVE status.
     */
    public enum FailState {ALIVE, FAILING_FS_DEAD, FAILING_FS_OKAY, DISK_FULL}
}
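// Usage sketch (illustrative only; the minion uuids below are hypothetical). A caller that already
// holds a Spawn, a HostManager, and a ScheduledExecutorService might drive this class like so:
//
//   HostFailWorker worker = new HostFailWorker(spawn, hostManager, executor);
//   worker.initFailHostTaskSchedule();
//   worker.markHostsToFail("minion-uuid-1,minion-uuid-2", HostFailWorker.FailState.FAILING_FS_OKAY);
//   JSONObject info = worker.getInfoForHostFailure("minion-uuid-1,minion-uuid-2", false);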