package org.ovirt.engine.core.bll;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.TimeUnit;

import javax.annotation.PostConstruct;
import javax.inject.Inject;

import org.ovirt.engine.core.bll.interfaces.BackendInternal;
import org.ovirt.engine.core.bll.job.ExecutionHandler;
import org.ovirt.engine.core.common.AuditLogType;
import org.ovirt.engine.core.common.BackendService;
import org.ovirt.engine.core.common.action.RunVmParams;
import org.ovirt.engine.core.common.action.VdcActionType;
import org.ovirt.engine.core.common.businessentities.Snapshot;
import org.ovirt.engine.core.common.config.Config;
import org.ovirt.engine.core.common.config.ConfigValues;
import org.ovirt.engine.core.common.errors.EngineMessage;
import org.ovirt.engine.core.compat.DateTime;
import org.ovirt.engine.core.compat.Guid;
import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector;
import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogable;
import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableImpl;
import org.ovirt.engine.core.dao.SnapshotDao;
import org.ovirt.engine.core.dao.VmDao;
import org.ovirt.engine.core.dao.VmDynamicDao;
import org.ovirt.engine.core.utils.lock.EngineLock;
import org.ovirt.engine.core.utils.lock.LockManager;
import org.ovirt.engine.core.utils.timer.OnTimerMethodAnnotation;
import org.ovirt.engine.core.utils.timer.SchedulerUtilQuartzImpl;
import org.ovirt.engine.core.vdsbroker.ResourceManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class of a periodic job that keeps a set of VMs which need to be started and
 * tries to run each of them on every iteration until the run succeeds, the VM no
 * longer needs to be auto started, or the maximum number of tries is reached.
 * Subclasses supply the initial set of VMs, the predicate that tells whether a VM
 * still needs to be auto started, and the audit log types to report failures with.
 */
public abstract class AutoStartVmsRunner implements BackendService {

    /** How long to wait before rerun HA VM that failed to start (not because of lock acquisition) */
    private static final int RETRY_TO_RUN_AUTO_START_VM_INTERVAL =
            Config.<Integer> getValue(ConfigValues.RetryToRunAutoStartVmIntervalInSeconds);
    /** How long to wait before next check whether the NextRun configuration is applied */
    private static final int DELAY_TO_RUN_AUTO_START_VM_INTERVAL =
            Config.<Integer> getValue(ConfigValues.DelayToRunAutoStartVmIntervalInSeconds);

    protected final Logger log = LoggerFactory.getLogger(getClass());

    @Inject
    private AuditLogDirector auditLogDirector;
    @Inject
    private SchedulerUtilQuartzImpl schedulerUtil;
    @Inject
    private LockManager lockManager;
    @Inject
    private BackendInternal backend;
    @Inject
    private VmDynamicDao vmDynamicDao;
    @Inject
    private VmDao vmDao;
    @Inject
    private ResourceManager resourceManager;
    @Inject
    private SnapshotDao snapshotDao;

    /** Records of VMs that need to be started */
    private CopyOnWriteArraySet<AutoStartVmToRestart> autoStartVmsToRestart;

    /**
     * Populates the set of VMs to start from {@link #getInitialVmsToStart()} and schedules
     * {@link #startFailedAutoStartVms()} as a fixed-delay job; the initial delay and the
     * delay between iterations are both {@code AutoStartVmsRunnerIntervalInSeconds}.
     */
    @PostConstruct
    private void init() {
        autoStartVmsToRestart = new CopyOnWriteArraySet<>(getInitialVmsToStart());

        int autoStartVmsRunnerIntervalInSeconds =
                Config.<Integer> getValue(ConfigValues.AutoStartVmsRunnerIntervalInSeconds);
        schedulerUtil.scheduleAFixedDelayJob(
                this,
                "startFailedAutoStartVms",
                new Class[] {},
                new Object[] {},
                autoStartVmsRunnerIntervalInSeconds,
                autoStartVmsRunnerIntervalInSeconds,
                TimeUnit.SECONDS);
    }

    /**
     * @return the records of the VMs that the job should try to start right after initialization
     */
    protected abstract Collection<AutoStartVmToRestart> getInitialVmsToStart();

    /**
     * Add the given VM IDs to the set of VMs which will be started in the next iteration.
     * A VM whose ID is already in the set is not added again (records are equal by VM ID).
     *
     * @param vmIds
     *            List of VM IDs to start in the next iteration of the job
     */
    public void addVmsToRun(List<Guid> vmIds) {
        // build the records first so the copy-on-write set is modified by a single addAll call
        ArrayList<AutoStartVmToRestart> vmsToAdd = new ArrayList<>(vmIds.size());
        for (Guid vmId : vmIds) {
            vmsToAdd.add(new AutoStartVmToRestart(vmId));
        }
        autoStartVmsToRestart.addAll(vmsToAdd);
    }

    /**
     * One iteration of the job: goes over the recorded VMs and tries to run each VM whose
     * time to run has come. Records of VMs that were started, that do not need to be auto
     * started anymore, or that exhausted their tries are removed at the end of the iteration.
     */
    @OnTimerMethodAnnotation("startFailedAutoStartVms")
    public void startFailedAutoStartVms() {
        List<AutoStartVmToRestart> vmsToRemove = new LinkedList<>();
        final DateTime iterationStartTime = DateTime.getNow();
        final Date nextTimeOfRetryToRun = iterationStartTime.addSeconds(RETRY_TO_RUN_AUTO_START_VM_INTERVAL);
        final Date delayedTimeOfRetryToRun = iterationStartTime.addSeconds(DELAY_TO_RUN_AUTO_START_VM_INTERVAL);

        for (AutoStartVmToRestart autoStartVmToRestart : autoStartVmsToRestart) {
            // if it is not the time to try to run the VM yet, skip it for now
            // (we'll try again in the next iteration)
            if (!autoStartVmToRestart.isTimeToRun(iterationStartTime)) {
                continue;
            }

            Guid vmId = autoStartVmToRestart.getVmId();

            if (isNextRunConfiguration(vmId)) {
                // if the NextRun config exists then give the ProcessDownVmCommand time to apply it
                log.debug("NextRun config found for '{}' vm, the RunVm will be delayed", vmId);
                if (autoStartVmToRestart.delayNextTimeToRun(delayedTimeOfRetryToRun)) {
                    // Skip attempt to run the VM for now.
                    // The priority is to run the VM even if the NextRun fails to be applied
                    continue;
                }
                // Waiting for NextRun config is over, let's run the VM even with the non-applied Next-Run
                log.warn("Failed to wait for the NextRun config to be applied on vm '{}', trying to run the VM anyway",
                        vmId);
            }

            EngineLock runVmLock = createEngineLockForRunVm(vmId);

            // try to acquire the required lock for running the VM, if the lock cannot be
            // acquired, skip for now and we'll try again in the next iteration
            if (!acquireLock(runVmLock)) {
                log.debug("Could not acquire lock for auto starting VM '{}'", vmId);
                continue;
            }

            if (!isVmNeedsToBeAutoStartedOrReleaseLock(vmId, runVmLock)) {
                // if the VM doesn't need to be auto started anymore, release the lock and
                // remove the VM from the collection of VMs that should be auto started
                releaseLock(runVmLock);
                vmsToRemove.add(autoStartVmToRestart);
                continue;
            }

            // from here on the lock is handed over to the RunVm action through its job context
            if (runVm(vmId, runVmLock)) {
                // the VM reached WaitForLaunch, so from now on this job is not responsible
                // to auto start it, future failures will be detected by the monitoring
                vmsToRemove.add(autoStartVmToRestart);
            } else {
                logFailedAttemptToRestartVm(vmId);

                if (!autoStartVmToRestart.scheduleNextTimeToRun(nextTimeOfRetryToRun)) {
                    // if we could not schedule the next time to run the VM, it means
                    // that we reached the maximum number of tries so don't try anymore
                    vmsToRemove.add(autoStartVmToRestart);
                    logFailureToRestartVm(vmId);
                }
            }
        }

        autoStartVmsToRestart.removeAll(vmsToRemove);
    }

    /**
     * Evaluates {@link #isVmNeedsToBeAutoStarted(Guid)} while the run-VM lock is held.
     * If the check throws, the lock is released before the exception is rethrown, so a
     * failing check does not leave the VM locked.
     *
     * @param vmId
     *            the ID of the VM to check
     * @param runVmLock
     *            the already acquired lock; it is released by this method only when the check throws
     * @return the result of {@link #isVmNeedsToBeAutoStarted(Guid)}
     */
    private boolean isVmNeedsToBeAutoStartedOrReleaseLock(Guid vmId, EngineLock runVmLock) {
        try {
            return isVmNeedsToBeAutoStarted(vmId);
        } catch (RuntimeException e) {
            releaseLock(runVmLock);
            throw e;
        }
    }

    /**
     * @return True if the VM has a next-run configuration to be applied
     */
    private boolean isNextRunConfiguration(Guid vmId) {
        return snapshotDao.exists(vmId, Snapshot.SnapshotType.NEXT_RUN);
    }

    /**
     * @return true if the lock manager reports that the given lock was acquired
     */
    private boolean acquireLock(EngineLock lock) {
        return lockManager.acquireLock(lock).getFirst();
    }

    /** Releases the given lock through the lock manager. */
    private void releaseLock(EngineLock lock) {
        lockManager.releaseLock(lock);
    }

    /**
     * @param vmId
     *            the ID of the VM to check
     * @return true if the job should still try to run the VM, false if its record can be dropped
     */
    protected abstract boolean isVmNeedsToBeAutoStarted(Guid vmId);

    /** Audit-logs a single failed attempt to run the VM. */
    private void logFailedAttemptToRestartVm(Guid vmId) {
        logVmEvent(vmId, getRestartFailedAuditLogType());
    }

    /**
     * @return the audit log type reported for every failed attempt to run a VM
     */
    protected abstract AuditLogType getRestartFailedAuditLogType();

    /** Audit-logs that the job gave up on the VM after the maximum number of tries. */
    private void logFailureToRestartVm(Guid vmId) {
        logVmEvent(vmId, getExceededMaxNumOfRestartsAuditLogType());
    }

    /**
     * @return the audit log type reported when the maximum number of tries was reached
     */
    protected abstract AuditLogType getExceededMaxNumOfRestartsAuditLogType();

    /** Writes an audit log entry of the given type for the given VM. */
    private void logVmEvent(Guid vmId, AuditLogType restartFailedAuditLogType) {
        AuditLogable event = createVmEvent(vmId);
        auditLogDirector.log(event, restartFailedAuditLogType);
    }

    /**
     * Creates an audit log event carrying the VM ID and the VM name taken from the VM manager.
     */
    private AuditLogable createVmEvent(Guid vmId) {
        AuditLogable event = new AuditLogableImpl();
        event.setVmId(vmId);
        event.setVmName(resourceManager.getVmManager(vmId).getName());
        return event;
    }

    /**
     * Creates (without acquiring) the lock made of the exclusive and shared locks that
     * {@link RunVmCommandBase} defines for running the given VM.
     */
    private EngineLock createEngineLockForRunVm(Guid vmId) {
        return new EngineLock(
                RunVmCommandBase.getExclusiveLocksForRunVm(vmId, getLockMessage()),
                RunVmCommandBase.getSharedLocksForRunVm());
    }

    private String getLockMessage() {
        return EngineMessage.ACTION_TYPE_FAILED_OBJECT_LOCKED.name();
    }

    protected VmDynamicDao getVmDynamicDao() {
        return vmDynamicDao;
    }

    protected VmDao getVmDao() {
        return vmDao;
    }

    /**
     * Runs the VM as an internal RunVm action whose job context is created with the given lock.
     *
     * @return true if the action reported success
     */
    private boolean runVm(Guid vmId, EngineLock lock) {
        return backend.runInternalAction(
                VdcActionType.RunVm,
                new RunVmParams(vmId),
                ExecutionHandler.createInternalJobContext(lock)).getSucceeded();
    }

    /**
     * Record of a VM that should be started, together with the earliest time of the next
     * attempt and the counters of attempts and skips made so far.
     * Two records are equal when they refer to the same VM ID.
     */
    protected static class AutoStartVmToRestart {
        /** The earliest date in Java */
        private static final Date MIN_DATE = DateTime.getMinValue();

        /** How many times to try to restart highly available VM that went down */
        private static final int MAXIMUM_NUM_OF_TRIES_TO_AUTO_START_VM =
                Config.<Integer> getValue(ConfigValues.MaxNumOfTriesToRunFailedAutoStartVm);

        /** How many times the run may be skipped before the VM is run anyway */
        private static final int MAXIMUM_NUM_OF_SKIPS_BEFORE_AUTO_START_VM =
                Config.<Integer> getValue(ConfigValues.MaxNumOfSkipsBeforeAutoStartVm);

        /** The next time we should try to run the VM */
        private Date timeToRunTheVm;
        /** Number of tries that were made so far to run the VM */
        private int numOfRuns;
        /** Number of skips that were made so far before attempt to run the VM */
        private int numOfSkips;
        /** The ID of the VM */
        private final Guid vmId;

        AutoStartVmToRestart(Guid vmId) {
            this.vmId = vmId;
            // MIN_DATE marks a record that was never scheduled, i.e. it can run immediately
            timeToRunTheVm = MIN_DATE;
        }

        /**
         * Set the next time we should try to rerun the VM.
         * If we reached the maximum number of tries, the method returns false.
         */
        boolean scheduleNextTimeToRun(Date timeToRunTheVm) {
            this.timeToRunTheVm = timeToRunTheVm;
            return ++numOfRuns < MAXIMUM_NUM_OF_TRIES_TO_AUTO_START_VM;
        }

        /**
         * Skip this attempt to run the VM.
         * Return false if count of skips reached threshold.
         * Do not increase the attempt-counter 'numOfRuns'.
         */
        boolean delayNextTimeToRun(Date timeToRunTheVm) {
            this.timeToRunTheVm = timeToRunTheVm;
            if (MAXIMUM_NUM_OF_SKIPS_BEFORE_AUTO_START_VM == 0) {
                // no skips are allowed; also avoids the ArithmeticException of a modulo by zero below
                return false;
            }
            numOfSkips++;
            // wrap around so the counter starts over after the threshold was reached
            numOfSkips %= MAXIMUM_NUM_OF_SKIPS_BEFORE_AUTO_START_VM;
            return numOfSkips != 0;
        }

        /**
         * @return true if the record was never scheduled or the given time is not before the scheduled time
         */
        boolean isTimeToRun(Date time) {
            // identity comparison is intended: MIN_DATE is the sentinel instance assigned in the constructor
            return timeToRunTheVm == MIN_DATE || time.compareTo(timeToRunTheVm) >= 0;
        }

        Guid getVmId() {
            return vmId;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof AutoStartVmToRestart)) {
                return false;
            }
            AutoStartVmToRestart other = (AutoStartVmToRestart) obj;
            return Objects.equals(vmId, other.vmId);
        }

        @Override
        public int hashCode() {
            return Objects.hashCode(vmId);
        }
    }
}