package org.ovirt.engine.core.bll;
import static org.ovirt.engine.core.common.job.Step.MAX_PROGRESS;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import org.ovirt.engine.core.bll.storage.EntityPollingCommand;
import org.ovirt.engine.core.bll.tasks.CommandCoordinatorUtil;
import org.ovirt.engine.core.common.action.HostJobCommandParameters;
import org.ovirt.engine.core.common.action.VdcActionParametersBase;
import org.ovirt.engine.core.common.businessentities.CommandEntity;
import org.ovirt.engine.core.common.businessentities.HostJobInfo;
import org.ovirt.engine.core.common.businessentities.HostJobInfo.HostJobStatus;
import org.ovirt.engine.core.common.businessentities.HostJobInfo.HostJobType;
import org.ovirt.engine.core.common.businessentities.VDS;
import org.ovirt.engine.core.common.businessentities.VDSStatus;
import org.ovirt.engine.core.common.vdscommands.GetHostJobsVDSCommandParameters;
import org.ovirt.engine.core.common.vdscommands.VDSCommandType;
import org.ovirt.engine.core.common.vdscommands.VDSReturnValue;
import org.ovirt.engine.core.compat.CommandStatus;
import org.ovirt.engine.core.compat.Guid;
import org.ovirt.engine.core.compat.backendcompat.CommandExecutionStatus;
import org.ovirt.engine.core.dal.dbbroker.DbFacade;
import org.ovirt.engine.core.dao.StepDao;
import org.ovirt.engine.core.dao.VdsDao;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Command callback that tracks a job executed on a host on behalf of a command.
 *
 * <p>On each invocation of {@link #childCommandsExecutionEnded} the callback tries to determine the job status by
 * polling the host the command ran on. When the host can't provide the status it falls back to the command's own
 * means (entity polling, job fencing, or failing the job). Once the job is no longer alive the command status is
 * set to {@link CommandStatus#SUCCEEDED} or {@link CommandStatus#FAILED}.
 *
 * <p>Subclasses supply the type of job to poll for through {@link #getHostJobType()}.
 */
public abstract class HostJobCallback extends ChildCommandsCallbackBase {

    // Instance (not static) logger created from getClass(), so log lines are attributed to the concrete subclass.
    protected final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * Polls the host job of the given command and, if the job has ended, sets the command's final status.
     *
     * <p>The method returns without touching the command status when the job status can't be determined yet or
     * when the job is still alive; in those cases the command stays in its current state.
     * NOTE(review): presumably the callback infrastructure invokes this method again on the next polling cycle -
     * confirm against {@code ChildCommandsCallbackBase}.
     *
     * @param command the command whose host job is tracked
     * @param anyFailed not used by this implementation
     * @param childCmdIds not used by this implementation
     * @param status the execution status of the command; the command can succeed only if it is
     *               {@link CommandExecutionStatus#EXECUTED}
     * @param completedChildren not used by this implementation
     */
    @Override
    protected void childCommandsExecutionEnded(CommandBase<?> command,
            boolean anyFailed,
            List<Guid> childCmdIds,
            CommandExecutionStatus status,
            int completedChildren) {
        Guid cmdId = command.getCommandId();
        CommandEntity commandEntity = CommandCoordinatorUtil.getCommandEntity(cmdId);
        VdcActionParametersBase cmdParams = commandEntity.getCommandParameters();
        Guid job = ((HostJobCommandParameters) cmdParams).getHostJobId();
        HostJobStatus jobStatus = null;
        Guid vdsId = cmdParams.getVdsRunningOn();
        VDS vds = getVdsDao().get(vdsId);
        if (vds != null) {
            // True only if the host was successfully queried for its jobs (regardless of whether our job was
            // included in the answer).
            boolean jobsReportedByHost = false;
            if (vds.getStatus() == VDSStatus.Up) {
                HostJobInfo jobInfo;
                try {
                    jobInfo = pollStorageJob(job, vdsId);
                } catch (Exception e) {
                    // We shouldn't get an error when polling the host job (as it accesses the local storage only).
                    // If we got an error, it will usually be a network error - so the host will either move
                    // to Non Responsive or the polling will succeed on the next attempt.
                    log.warn("Command {} id: '{}': Failed to poll the job '{}' on host '{}' (id: '{}'), will retry soon",
                            commandEntity.getCommandType(), cmdId, job, vds.getName(), vdsId);
                    // Keep the cause available for troubleshooting without flooding the log on every retry.
                    log.debug("Exception", e);
                    return;
                }
                if (jobInfo != null) {
                    // May rewrite the status/progress of a failed job according to the reported error.
                    handlePolledJobInfo(getCommand(cmdId), jobInfo);
                    jobStatus = jobInfo.getStatus();
                    updateStepProgress(commandEntity.getCommandContext().getStepId(), jobInfo.getProgress());
                }
                jobsReportedByHost = true;
            } else {
                log.warn("Command {} id: '{}': can't poll the job '{}' as host '{}' (id: '{}') isn't in status UP",
                        commandEntity.getCommandType(), cmdId, job, vds.getName(), vdsId);
            }

            // If we couldn't determine the job status by polling the host, we can try to determine it using
            // different methods.
            if (jobStatus == null) {
                jobStatus = handleUndeterminedJobStatus(getCommand(cmdId), jobsReportedByHost);
            }

            if (jobStatus == null) {
                log.info("Command {} id: '{}': couldn't get the status of job '{}' on host '{}' (id: '{}'), assuming it's " +
                                "still running",
                        commandEntity.getCommandType(), cmdId, job, vds.getName(), vdsId);
                return;
            }

            if (jobStatus.isAlive()) {
                log.info("Command {} id: '{}': waiting for job '{}' on host '{}' (id: '{}') to complete",
                        commandEntity.getCommandType(), cmdId, job, vds.getName(), vdsId);
                return;
            }

            log.info("Command {} id: '{}': job '{}' execution was completed with VDSM job status '{}'",
                    commandEntity.getCommandType(), cmdId, job, jobStatus);

            // A job that ended successfully is reported as fully progressed, if the command asks for it.
            if (command.shouldUpdateStepProgress() && jobStatus == HostJobStatus.done) {
                updateStepProgress(commandEntity.getCommandContext().getStepId(), MAX_PROGRESS);
            }
        } else {
            // No host could be loaded for the id stored in the command parameters.
            jobStatus = HostJobStatus.failed;
            log.info("Command {} id: '{}': job '{}' wasn't executed on any host, considering the job status as failed",
                    commandEntity.getCommandType(), cmdId, job);
        }

        // The command succeeds only if it was executed and the job itself is done.
        command.getParameters().setTaskGroupSuccess(status == CommandExecutionStatus.EXECUTED
                && jobStatus == HostJobStatus.done);
        command.setCommandStatus(command.getParameters().getTaskGroupSuccess() ? CommandStatus.SUCCEEDED
                : CommandStatus.FAILED);
        log.info("Command {} id: '{}': execution was completed, the command status is '{}'",
                command.getActionType(), command.getCommandId(), command.getCommandStatus());
    }

    /**
     * Retrieves the command with the given id through {@link CommandCoordinatorUtil}.
     *
     * @param cmdId the id of the command to retrieve
     * @return the retrieved command
     */
    protected CommandBase<? extends HostJobCommandParameters> getCommand(Guid cmdId) {
        return CommandCoordinatorUtil.retrieveCommand(cmdId);
    }

    /**
     * @return {@code true} if the given command implements {@link EntityPollingCommand}
     */
    private boolean isEntityPollingSupported(CommandBase<?> cmd) {
        return cmd instanceof EntityPollingCommand;
    }

    /**
     * Queries the given host for the job with the given id.
     *
     * @param jobId the id of the job to look for; may be {@code null}
     * @param vdsId the id of the host to query
     * @return the job info reported by the host, or {@code null} if {@code jobId} is {@code null} or the host's
     *         answer doesn't contain the job
     */
    private HostJobInfo pollStorageJob(Guid jobId, Guid vdsId) {
        if (jobId == null) {
            return null;
        }

        GetHostJobsVDSCommandParameters p = new GetHostJobsVDSCommandParameters(vdsId,
                Collections.singletonList(jobId), getHostJobType());
        VDSReturnValue returnValue = Backend.getInstance().getResourceManager()
                .runVdsCommand(VDSCommandType.GetHostJobs, p);

        // The return value is exposed as Object, so the cast can't be checked by the compiler.
        // NOTE(review): assumes GetHostJobs returns a map keyed by job id - confirm against the VDS command.
        @SuppressWarnings("unchecked")
        Map<Guid, HostJobInfo> reportedJobs = (Map<Guid, HostJobInfo>) returnValue.getReturnValue();
        return reportedJobs.get(jobId);
    }

    /**
     * @return the type of host job this callback polls for
     */
    protected abstract HostJobType getHostJobType();

    /**
     * Polls the entity of the given command, which must implement {@link EntityPollingCommand}.
     *
     * @param cmd the command to poll
     * @return the status returned by the command's {@code poll()}, or {@code null} if polling threw an exception
     */
    private HostJobStatus pollEntity(CommandBase<?> cmd) {
        try {
            return ((EntityPollingCommand) cmd).poll();
        } catch (Exception e) {
            // The exception is passed as the trailing argument so that the cause is logged with the error.
            log.error("Command {} id: '{}': failed to poll the command entity",
                    cmd.getActionType(),
                    cmd.getCommandId(),
                    e);
        }

        return null;
    }

    /**
     * Gives the command a chance to reinterpret a job that the host reported as failed.
     *
     * <p>Jobs in any other status, and failed jobs without error details, are left untouched.
     *
     * @param cmd the command the job belongs to; cast to {@code HostJobCommand} when an error is handled
     * @param jobInfo the job info reported by the host; its status and progress may be modified
     */
    private void handlePolledJobInfo(CommandBase<? extends HostJobCommandParameters> cmd, HostJobInfo jobInfo) {
        if (jobInfo.getStatus() != HostJobStatus.failed) {
            return;
        }

        // If a job failed on the VDSM side, we may want to perform operations according to the job error or to
        // consider the job as successful in some cases.
        // Each command can override a method that handles the error and may return a different job status if
        // needed.
        if (jobInfo.getError() != null) {
            jobInfo.setStatus(((HostJobCommand) cmd).handleJobError(jobInfo.getError().getCode()));
            if (jobInfo.getStatus() == HostJobStatus.done) {
                // If the error inspection led us to decide that the actual job status is done, we set the
                // progress to null so that it will be considered as 100 for the command step (if present). That's
                // better than setting it to 100, as we don't know if progress is actually reported for the
                // operation.
                jobInfo.setProgress(null);
            }
        }
    }

    /**
     * Attempts to determine the job status when polling the host didn't provide it.
     *
     * @param cmd the command the job belongs to
     * @param jobsReportedByHost whether the host was successfully queried for its jobs
     * @return the determined job status, or {@code null} if it is still unknown and polling should be retried
     */
    private HostJobStatus handleUndeterminedJobStatus(CommandBase<? extends HostJobCommandParameters> cmd,
            boolean jobsReportedByHost) {
        // If the command supports entity polling, we can use it in order to determine the status.
        if (isEntityPollingSupported(cmd)) {
            log.info("Command {} id: '{}': attempting to determine the job status by polling the entity.",
                    cmd.getActionType(),
                    cmd.getCommandId());
            HostJobStatus jobStatus = pollEntity(cmd);
            if (jobStatus != null) {
                return jobStatus;
            }

            // If the job status couldn't be detected using entity polling and the command supports job fencing, we
            // can attempt to fence the job - which means that the host will fail to execute it if it attempts to.
            // Note that we may attempt to perform the fencing even if the job failed in case we couldn't determine
            // the job status, that'll confirm the job failure.
            //
            // Fencing the operation will usually be performed by executing an asynchronous fencing command on the
            // entity the job is supposed to be performed on.
            // If a fencing command was executed, the callback will wait for it to end and then will try to poll the
            // entity again (it'll be detected as a running child command). On synchronous fencing/no fencing we
            // will attempt to poll the entity again.
            ((EntityPollingCommand) cmd).attemptToFenceJob();
            return null;
        }

        if (((HostJobCommand) cmd).failJobWithUndeterminedStatus()) {
            log.error("Command {} id: '{}': failed to determine the actual job status, considering as failed as per" +
                            " the command implementation",
                    cmd.getActionType(),
                    cmd.getCommandId());
            return HostJobStatus.failed;
        }

        // If the job was cleared from the host job report we fail the operation so the command will end
        // (as the command doesn't support entity polling - so we don't have any way to poll it).
        if (jobsReportedByHost) {
            log.error("Command {} id: '{}': entity polling isn't supported and the job isn't reported by the host," +
                            " assuming it failed so that the command execution will end.",
                    cmd.getActionType(),
                    cmd.getCommandId());
            return HostJobStatus.failed;
        }

        // If we couldn't determine the job status, we'll retry to poll it.
        log.error("Command {} id: '{}': failed to determine the actual job status, will retry to poll the job soon",
                cmd.getActionType(),
                cmd.getCommandId());
        return null;
    }

    /**
     * Stores the given progress for the given step; does nothing when there is no step.
     *
     * @param stepId the id of the step to update; may be {@code null}
     * @param progress the progress to store; may be {@code null}
     */
    private void updateStepProgress(Guid stepId, Integer progress) {
        if (stepId != null) {
            getStepDao().updateStepProgress(stepId, progress);
        }
    }

    /**
     * @return always {@code true}
     */
    @Override
    public boolean pollOnExecutionFailed() {
        return true;
    }

    /**
     * Ends the action of the command with the given id.
     */
    @Override
    public void onSucceeded(Guid cmdId, List<Guid> childCmdIds) {
        endAction(getCommand(cmdId));
    }

    /**
     * Marks the task group of the command with the given id as failed and ends its action.
     */
    @Override
    public void onFailed(Guid cmdId, List<Guid> childCmdIds) {
        CommandBase<?> commandBase = getCommand(cmdId);
        // This should be removed as soon as infra bug will be fixed and failed execution will reach endWithFailure
        commandBase.getParameters().setTaskGroupSuccess(false);
        endAction(commandBase);
    }

    /**
     * Resets the command's "succeeded" flag and ends its action.
     */
    private void endAction(CommandBase<?> commandBase) {
        // NOTE(review): presumably endAction() sets the flag again according to the actual outcome - confirm in
        // CommandBase.
        commandBase.getReturnValue().setSucceeded(false);
        commandBase.endAction();
    }

    private VdsDao getVdsDao() {
        return DbFacade.getInstance().getVdsDao();
    }

    private StepDao getStepDao() {
        return DbFacade.getInstance().getStepDao();
    }
}