package org.ovirt.engine.core.bll.pm; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; import java.util.stream.Collectors; import javax.annotation.PostConstruct; import javax.inject.Inject; import javax.inject.Singleton; import org.ovirt.engine.core.bll.Backend; import org.ovirt.engine.core.bll.job.ExecutionHandler; import org.ovirt.engine.core.bll.pm.PowerManagementHelper.AgentsIterator; import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.BackendService; import org.ovirt.engine.core.common.action.FenceVdsActionParameters; import org.ovirt.engine.core.common.action.VdcActionType; import org.ovirt.engine.core.common.action.VdcReturnValueBase; import org.ovirt.engine.core.common.businessentities.VDS; import org.ovirt.engine.core.common.businessentities.VDSStatus; import org.ovirt.engine.core.common.businessentities.pm.FenceAgent; import org.ovirt.engine.core.common.businessentities.pm.FenceOperationResult.Status; import org.ovirt.engine.core.common.config.Config; import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.compat.Guid; import org.ovirt.engine.core.dal.dbbroker.DbFacade; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AlertDirector; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogable; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableBase; import org.ovirt.engine.core.dao.VdsDao; import org.ovirt.engine.core.di.Injector; import org.ovirt.engine.core.utils.ThreadUtils; import org.ovirt.engine.core.utils.threadpool.ThreadPoolUtil; import org.ovirt.engine.core.utils.timer.OnTimerMethodAnnotation; import org.ovirt.engine.core.utils.timer.SchedulerUtilQuartzImpl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Responsible for checking PM enabled hosts by sending a status command to each host configured PM agent cards and * raise alerts for failed operations. */ @Singleton public class PmHealthCheckManager implements BackendService { private static final Logger log = LoggerFactory.getLogger(PmHealthCheckManager.class); private Lock lock = new ReentrantLock(); @Inject private AuditLogDirector auditLogDirector; @Inject private VdsDao vdsDao; @Inject private AlertDirector alertDirector; /** * Initializes the PM Health Check Manager */ @PostConstruct private void initialize() { if(Config.<Boolean>getValue(ConfigValues.PMHealthCheckEnabled)) { log.info("Start initializing {}", getClass().getSimpleName()); Integer pmHealthCheckInterval = Config.<Integer> getValue(ConfigValues.PMHealthCheckIntervalInSec); Injector.get(SchedulerUtilQuartzImpl.class).scheduleAFixedDelayJob(this, "pmHealthCheck", new Class[] {}, new Object[] {}, pmHealthCheckInterval, pmHealthCheckInterval, TimeUnit.SECONDS); } // recover from engine failure recover(vdsDao.getAll()); log.info("Finished initializing {}", getClass().getSimpleName()); } @OnTimerMethodAnnotation("pmHealthCheck") public void pmHealthCheck() { // skip PM health check if previous operation is not completed yet if (lock.tryLock()) { try { log.info("Power Management Health Check started."); List<VDS> hosts = DbFacade.getInstance().getVdsDao().getAll(); for (VDS host : hosts) { if (host.isPmEnabled()) { pmHealthCheck(host); } } log.info("Power Management Health Check completed."); } finally { lock.unlock(); } } } /** * Check PM health of a host. Add/Remove alerts as necessary, and log the results. */ public void pmHealthCheck(VDS host) { // check health PmHealth pmHealth = checkPMHealth(host); // handle alerts - adding or canceling as necessary handleAlerts(pmHealth); log.debug(pmHealth.toString()); } /** * Check PM health of a host. Add/Remove alerts as necessary, and log the results. */ public void pmHealthCheck(Guid hostId) { VDS host = DbFacade.getInstance().getVdsDao().get(hostId); pmHealthCheck(host); } /** * Collect health-status info for all agents. */ private PmHealth checkPMHealth(VDS host) { PmHealth pmHealth = new PmHealth(host); AgentsIterator iterator = PowerManagementHelper.getAgentsIterator(host.getFenceAgents()); // In each step of the loop deal with the agents with the next 'order' (one or more). Write info into PmHealth. while (iterator.hasNext()) { collectHealthStatus(pmHealth, iterator.next()); } return pmHealth; } private void handleAlerts(PmHealth healthStatus) { VDS host = healthStatus.getHost(); // TODO: uncomment pending implementation of removing alerts by agent-id. // for (Entry<FenceAgent, Boolean> entry : healthStatus.getHealthMap().entrySet()) { // handleAgentAlerts(entry, hostId); // } handleStartAlerts(healthStatus, host); handleStopAlerts(healthStatus, host); } private void handleStartAlerts(PmHealth healthStatus, VDS host) { if (healthStatus.isStartShouldWork()) { removeAlert(host.getId(), AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_START_MIGHT_FAIL); } else { addAlert(host, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_START_MIGHT_FAIL); } } private void handleStopAlerts(PmHealth healthStatus, VDS host) { if (healthStatus.isStopShouldWork()) { removeAlert(host.getId(), AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_STOP_MIGHT_FAIL); } else { addAlert(host, AuditLogType.VDS_ALERT_PM_HEALTH_CHECK_STOP_MIGHT_FAIL); } } private void removeAlert(Guid hostId, AuditLogType auditMessage) { alertDirector.removeVdsAlert(hostId, auditMessage); } private void addAlert(VDS host, AuditLogType auditMessage) { AuditLogable alert = new AuditLogableBase(); alert.setVdsId(host.getId()); alert.setVdsName(host.getName()); auditLogDirector.log(alert, auditMessage); } /** * A step in the health-status check. Checks health of the provided agents. */ private void collectHealthStatus(PmHealth healthStatus, List<FenceAgent> agents) { boolean atLeastOneHealthy = false; // initialize to false, and if one healthy agent found, change to true. boolean allHealthy = true; // initialize to true, and if one unhealthy agent found, change to false. for (FenceAgent agent : agents) { if (isHealthy(agent, healthStatus.getHost())) { healthStatus.getHealthMap().put(agent, true); atLeastOneHealthy = true; } else { healthStatus.getHealthMap().put(agent, false); allHealthy = false; } } if (atLeastOneHealthy) { healthStatus.setStartShouldWork(true); } if (allHealthy) { healthStatus.setStopShouldWork(true); } } /** * Checks if the agent is healthy. A healthy agent is one that returns an answer when queries for status, and it * doesn't matter whether that answer is "on" or "off". */ private boolean isHealthy(FenceAgent agent, VDS host) { return new HostFenceActionExecutor(host).getFenceAgentStatus(agent).getStatus() == Status.SUCCESS; } private void waitUntilFencingAllowed() { // wait the quiet time from engine start in which we skip fencing operations ThreadUtils.sleep( TimeUnit.SECONDS.toMillis( Config.<Integer>getValue(ConfigValues.DisableFenceAtStartupInSec))); } /** * Recovers hosts with status Reboot or Kdumping from engine crash * * @param hosts * all existing hosts */ public void recover(List<VDS> hosts) { startHostsWithPMInReboot(hosts); recoverKdumpingHosts(hosts); } private void startHostsWithPMInReboot(List<VDS> hosts) { final List<VDS> hostsWithPMInReboot = hosts.stream() .filter(host -> host.isPmEnabled()) .filter(host -> host.getStatus() == VDSStatus.Reboot) .collect(Collectors.toList()); if (hostsWithPMInReboot.size() > 0) { ThreadPoolUtil.execute(() -> { waitUntilFencingAllowed(); startHosts(hostsWithPMInReboot); }); } } /** * This method starts hosts remained in off status because of the following flow * non-responding -> stop -> wait -> off -> engine restart * Such hosts will stay DOWN while its status will show Reboot * We should try to catch such hosts and attempt to restart it. */ public void startHosts(List<VDS> hostWithPMInStatusReboot) { for (VDS host : hostWithPMInStatusReboot) { RestartVdsCommand<FenceVdsActionParameters> restartVdsCommand = new RestartVdsCommand<>(new FenceVdsActionParameters(host.getId()), null); if (new HostFenceActionExecutor(host).isHostPoweredOff()) { VdcReturnValueBase retValue = Backend.getInstance().runInternalAction(VdcActionType.RestartVds, restartVdsCommand.getParameters()); if (retValue!= null && retValue.getSucceeded()) { log.info("Host '{}' was started successfully by PM Health Check Manager", host.getName()); } else { log.info("PM Health Check Manager failed to start Host '{}'", host.getName()); } } } } private void recoverKdumpingHosts(List<VDS> hosts) { final List<VDS> kdumpingHosts = hosts.stream() .filter(host -> host.getStatus() == VDSStatus.Kdumping) .collect(Collectors.toList()); if (!kdumpingHosts.isEmpty()) { ThreadPoolUtil.execute(() -> { waitUntilFencingAllowed(); executeNotRespondingTreatment(kdumpingHosts); }); } } private void executeNotRespondingTreatment(List<VDS> hosts) { for (VDS host : hosts) { ThreadPoolUtil.execute(() -> Backend.getInstance().runInternalAction( VdcActionType.VdsNotRespondingTreatment, new FenceVdsActionParameters(host.getId()), ExecutionHandler.createInternalJobContext() )); } } private static class PmHealth { public PmHealth(VDS host) { super(); this.host = host; } private Map<FenceAgent, Boolean> healthMap = new HashMap<>(); private boolean startShouldWork = false; private boolean stopShouldWork = false; private VDS host; public VDS getHost() { return host; } public Map<FenceAgent, Boolean> getHealthMap() { return healthMap; } public boolean isStartShouldWork() { return startShouldWork; } public void setStartShouldWork(boolean startShouldWork) { this.startShouldWork = startShouldWork; } public boolean isStopShouldWork() { return stopShouldWork; } public void setStopShouldWork(boolean stopShouldWork) { this.stopShouldWork = stopShouldWork; } @Override public String toString() { StringBuilder sb = new StringBuilder().append("Power-Management Health Status for host ") .append(host.getId()) .append(": "); sb.append("Using fencing to Start is "); if (startShouldWork) { sb.append("expected to work (since one or more of the agents are working properly). "); } else { sb.append("at high risk of failing (since none of the agents are working properly). "); } sb.append("Using fencing to Stop is "); if (stopShouldWork) { sb.append("expected to work (since all agents are working properly). "); } else { sb.append(" at high risk of failing (since one or more of the agents are not working properly). "); } sb.append("Agent statuses: "); for (Entry<FenceAgent, Boolean> entry : healthMap.entrySet()) { sb.append(entry.getKey().getId()) .append(": ") .append(entry.getValue() ? "Up. " : "Down. "); } return sb.toString(); } } }