package org.ovirt.engine.core.bll.pm;
import java.util.Locale;
import java.util.concurrent.TimeUnit;
import org.ovirt.engine.core.common.AuditLogType;
import org.ovirt.engine.core.common.businessentities.FencingPolicy;
import org.ovirt.engine.core.common.businessentities.VDS;
import org.ovirt.engine.core.common.businessentities.pm.FenceActionType;
import org.ovirt.engine.core.common.businessentities.pm.FenceAgent;
import org.ovirt.engine.core.common.businessentities.pm.FenceOperationResult;
import org.ovirt.engine.core.common.businessentities.pm.FenceOperationResult.Status;
import org.ovirt.engine.core.common.businessentities.pm.PowerStatus;
import org.ovirt.engine.core.common.config.Config;
import org.ovirt.engine.core.common.config.ConfigValues;
import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogDirector;
import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogable;
import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableImpl;
import org.ovirt.engine.core.di.Injector;
import org.ovirt.engine.core.utils.ThreadUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Executes a fence action on one host using one fence agent.
 *
 * <p>It manages:
 * <ul>
 * <li>Execution of "complex" fence actions (start, stop) where we wait until host reaches requested status
 * using status action</li>
 * <li>Execution retries for failed "complex" fence action (start, stop)</li>
 * <li>Execution retries for failed "wait until host status is reached" actions</li>
 * <li>Usage of {@code FenceAgentExecutor} to execute "simple" fence actions</li>
 * </ul>
 *
 * <p>The retry parameters of the current action are kept in instance fields that are overwritten by
 * {@link #setupParams(FenceActionType)} on every {@link #fence(FenceActionType)} call.
 * NOTE(review): because of that, one instance presumably must not serve concurrent {@code fence} calls -
 * confirm against callers.
 */
public class SingleAgentFenceActionExecutor implements FenceActionExecutor {
    private static final Logger log = LoggerFactory.getLogger(SingleAgentFenceActionExecutor.class);

    /**
     * Lazily created by {@link #getAuditLogDirector()}
     */
    private AuditLogDirector auditLogDirector;

    /**
     * Number of ms to wait after host was fenced to fetch host power status
     */
    private static final int SLEEP_BEFORE_FIRST_ATTEMPT = 5000;

    /**
     * Number of allowed {@code PowerStatus.UNKNOWN} status results to determine if fence host operation was
     * successful
     */
    private static final int UNKNOWN_RESULT_LIMIT = 3;

    private final VDS fencedHost;
    private final FenceAgent fenceAgent;
    private final FencingPolicy fencingPolicy;

    /**
     * Number of additional start/stop attempts allowed after the first one
     */
    private int allowedFenceActionRetries;

    /**
     * Power status the host should reach after the current start/stop action
     */
    private PowerStatus requestedPowerStatus;

    /**
     * Number of additional status checks allowed after the first one
     */
    private int allowedWaitForStatusRetries;

    /**
     * Delay between two status checks in milliseconds
     */
    private long delayBetweenRetries;

    /**
     * @param fencedHost
     *            host the fence actions are executed on
     * @param fenceAgent
     *            agent used for every fence action of this executor
     * @param fencingPolicy
     *            policy passed to each created {@link FenceAgentExecutor}
     */
    public SingleAgentFenceActionExecutor(VDS fencedHost, FenceAgent fenceAgent, FencingPolicy fencingPolicy) {
        this.fencedHost = fencedHost;
        this.fenceAgent = fenceAgent;
        this.fencingPolicy = fencingPolicy;
    }

    /**
     * Executes specified fence action: {@code STATUS} only fetches host power status, any other action is
     * executed and afterwards verified using status action
     *
     * @param fenceAction
     *            fence action to execute
     * @return result of the fence action
     */
    @Override
    public FenceOperationResult fence(FenceActionType fenceAction) {
        setupParams(fenceAction);

        if (fenceAction == FenceActionType.STATUS) {
            return getStatus();
        } else {
            return changeStatus(fenceAction);
        }
    }

    /**
     * Setup parameters for specified fence action. {@code START} allows one retry of the fence action,
     * {@code STOP} allows none; status verification retries and delay are read from configuration.
     * {@code STATUS} leaves all parameters untouched.
     */
    protected void setupParams(FenceActionType fenceAction) {
        switch (fenceAction) {
            case START:
                requestedPowerStatus = PowerStatus.ON;
                allowedFenceActionRetries = 1;
                allowedWaitForStatusRetries = Config.<Integer>getValue(ConfigValues.FenceStartStatusRetries);
                // configuration value is in seconds, the field holds milliseconds
                delayBetweenRetries = TimeUnit.SECONDS.toMillis(
                        Config.<Integer>getValue(ConfigValues.FenceStartStatusDelayBetweenRetriesInSec));
                break;

            case STOP:
                requestedPowerStatus = PowerStatus.OFF;
                allowedFenceActionRetries = 0;
                allowedWaitForStatusRetries = Config.<Integer>getValue(ConfigValues.FenceStopStatusRetries);
                // configuration value is in seconds, the field holds milliseconds
                delayBetweenRetries = TimeUnit.SECONDS.toMillis(
                        Config.<Integer>getValue(ConfigValues.FenceStopStatusDelayBetweenRetriesInSec));
                break;

            case STATUS:
                break;
        }
    }

    /**
     * Returns new instance of {@link FenceAgentExecutor}
     */
    protected FenceAgentExecutor createAgentExecutor() {
        return Injector.injectMembers(new FenceAgentExecutor(fencedHost, fencingPolicy));
    }

    /**
     * Fetches power status of the host using specified agent
     */
    protected FenceOperationResult getStatus() {
        return createAgentExecutor().fence(FenceActionType.STATUS, fenceAgent);
    }

    /**
     * Executes start or stop fence operation using specified agent and verifies that the host reached
     * requested power status. The whole sequence is repeated until it succeeds or allowed number of fence
     * action retries is exceeded.
     *
     * @return {@code SUCCESS} result when requested power status was reached or the action was skipped because
     *         the host already is in that status, unchanged result when the action was skipped due to policy,
     *         {@code ERROR} result otherwise
     */
    protected FenceOperationResult changeStatus(FenceActionType fenceAction) {
        FenceAgentExecutor agentExecutor = createAgentExecutor();
        FenceOperationResult statusResult = null;
        // start at -1 because 1st fence attempt is regular and not a retry
        int fenceRetries = -1;

        do {
            FenceOperationResult result = agentExecutor.fence(fenceAction, fenceAgent);
            if (result.getStatus() == Status.SKIPPED_ALREADY_IN_STATUS) {
                // action skipped already in status, so report it as success with correct power status
                return new FenceOperationResult(
                        Status.SUCCESS,
                        fenceAction == FenceActionType.START ? PowerStatus.ON : PowerStatus.OFF);
            } else if (result.getStatus() == Status.SKIPPED_DUE_TO_POLICY) {
                // skipped due to policy is handled in caller
                return result;
            }

            if (result.getStatus() == Status.SUCCESS) {
                // fence operation was successful, verify if host power status changed
                statusResult = waitForStatus(fenceAction);
                if (isRequestedStatusAchieved(statusResult)) {
                    // requested host power status reached, end with success
                    return statusResult;
                }
            }
            fenceRetries++;
        } while (fenceRetries < allowedFenceActionRetries);

        return new FenceOperationResult(
                Status.ERROR,
                // statusResult is null when no fence attempt succeeded, so host status was never verified
                statusResult == null
                        ? PowerStatus.UNKNOWN
                        : statusResult.getPowerStatus(),
                "Allowed retries to verify host power status exceeded");
    }

    /**
     * Executes status operation until requested host power status is reached or allowed number of retries exceeded
     * to determine if start/stop fence operation was successful
     *
     * @param fenceAction
     *            executed fence action, used only for audit logging when retry limit is exceeded
     * @return {@code SUCCESS} result with requested power status when it was reached, result of the failed
     *         status action when host status could not be fetched, {@code ERROR} result when too many
     *         {@code PowerStatus.UNKNOWN} results were received or retry limit was exceeded
     */
    protected FenceOperationResult waitForStatus(FenceActionType fenceAction) {
        FenceOperationResult statusResult = null;
        // start at -1, because the 1st iteration is regular and not a retry
        int statusRetries = -1;
        int unknownStatusReceived = 0;

        log.info(
                "Waiting for host '{}' to reach status '{}'",
                fencedHost.getHostName(),
                requestedPowerStatus);

        // Waiting before first attempt to check the host status.
        // This is done because if we will attempt to get host status immediately
        // in most cases it will not turn from on/off to off/on and we will need
        // to wait a full cycle for it.
        ThreadUtils.sleep(getSleepBeforeFirstAttempt());

        while (statusRetries < allowedWaitForStatusRetries) {
            log.info("Attempt {} to get host '{}' status", statusRetries + 2, fencedHost.getHostName());
            statusResult = getStatus();

            if (statusResult.getStatus() != Status.SUCCESS) {
                log.error("Failed to get host '{}' status.", fencedHost.getHostName());
                return statusResult;
            }

            if (statusResult.getPowerStatus() == PowerStatus.UNKNOWN) {
                if (unknownStatusReceived >= getUnknownResultLimit()) {
                    // No need to retry, agent definitions are corrupted
                    log.error(
                            "Host '{}' PM Agent definitions are corrupted, aborting fence operation.",
                            fencedHost.getHostName());
                    return new FenceOperationResult(
                            Status.ERROR,
                            PowerStatus.UNKNOWN,
                            statusResult.getMessage());
                }
                // unknown power status received, retry
                unknownStatusReceived++;
            } else if (statusResult.getPowerStatus() == requestedPowerStatus) {
                log.info("Host '{}' status is '{}'", fencedHost.getHostName(), requestedPowerStatus);
                return new FenceOperationResult(
                        Status.SUCCESS,
                        requestedPowerStatus);
            }

            // host is still not in requested power status, wait only if another attempt follows
            statusRetries++;
            if (statusRetries < allowedWaitForStatusRetries) {
                ThreadUtils.sleep(delayBetweenRetries);
            }
        }

        auditVerifyStatusRetryLimitExceeded(fenceAction);
        return new FenceOperationResult(
                Status.ERROR,
                // statusResult is null only if the loop was never entered
                statusResult == null ? PowerStatus.UNKNOWN : statusResult.getPowerStatus(),
                statusResult == null ? "" : statusResult.getMessage());
    }

    /**
     * Returns {@code true} if specified result is successful and contains requested power status
     */
    protected boolean isRequestedStatusAchieved(FenceOperationResult result) {
        return result.getStatus() == Status.SUCCESS
                && result.getPowerStatus() == requestedPowerStatus;
    }

    /**
     * Returns number of ms to wait after fence action before host power status is fetched for the 1st time
     */
    protected int getSleepBeforeFirstAttempt() {
        return SLEEP_BEFORE_FIRST_ATTEMPT;
    }

    /**
     * Returns number of {@code PowerStatus.UNKNOWN} results tolerated during status verification
     */
    protected int getUnknownResultLimit() {
        return UNKNOWN_RESULT_LIMIT;
    }

    /**
     * Writes audit log alert and error log entry saying that host power status could not be verified within
     * allowed number of retries
     */
    protected void auditVerifyStatusRetryLimitExceeded(FenceActionType fenceAction) {
        AuditLogable auditLogable = new AuditLogableImpl();
        auditLogable.addCustomValue("Host", fencedHost.getName());
        // locale independent lower-casing of the enum constant name
        auditLogable.addCustomValue("Status", fenceAction.name().toLowerCase(Locale.ROOT));
        auditLogable.setVdsId(fencedHost.getId());
        auditLogable.setVdsName(fencedHost.getName());
        getAuditLogDirector().log(auditLogable, AuditLogType.VDS_ALERT_FENCE_STATUS_VERIFICATION_FAILED);
        log.error(
                "Failed to verify host '{}' status after {} action: have retried {} times with delay of {} seconds"
                        + " between each retry.",
                fencedHost.getHostName(),
                fenceAction.name(),
                allowedWaitForStatusRetries,
                // the delay is stored in milliseconds, the message reports seconds
                TimeUnit.MILLISECONDS.toSeconds(delayBetweenRetries));
    }

    /**
     * Returns the audit log director, creating it on first use
     */
    // TODO Investigate if injection is possible
    protected AuditLogDirector getAuditLogDirector() {
        if (auditLogDirector == null) {
            auditLogDirector = new AuditLogDirector();
        }
        return auditLogDirector;
    }
}