package org.ovirt.engine.core.bll.pm; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import javax.inject.Inject; import org.ovirt.engine.core.bll.LockMessagesMatchUtil; import org.ovirt.engine.core.bll.NonTransactiveCommandAttribute; import org.ovirt.engine.core.bll.VdsCommand; import org.ovirt.engine.core.bll.context.CommandContext; import org.ovirt.engine.core.bll.hostedengine.PreviousHostedEngineHost; import org.ovirt.engine.core.bll.job.ExecutionContext; import org.ovirt.engine.core.bll.validator.FenceValidator; import org.ovirt.engine.core.bll.validator.HostValidator; import org.ovirt.engine.core.common.AuditLogType; import org.ovirt.engine.core.common.VdcObjectType; import org.ovirt.engine.core.common.action.FenceVdsActionParameters; import org.ovirt.engine.core.common.action.LockProperties; import org.ovirt.engine.core.common.action.LockProperties.Scope; import org.ovirt.engine.core.common.action.SetStoragePoolStatusParameters; import org.ovirt.engine.core.common.action.VdcActionType; import org.ovirt.engine.core.common.action.VdcReturnValueBase; import org.ovirt.engine.core.common.businessentities.Cluster; import org.ovirt.engine.core.common.businessentities.FencingPolicy; import org.ovirt.engine.core.common.businessentities.StoragePoolStatus; import org.ovirt.engine.core.common.businessentities.VDS; import org.ovirt.engine.core.common.businessentities.VDSStatus; import org.ovirt.engine.core.common.businessentities.VdsSpmStatus; import org.ovirt.engine.core.common.config.Config; import org.ovirt.engine.core.common.config.ConfigValues; import org.ovirt.engine.core.common.errors.EngineMessage; import org.ovirt.engine.core.common.locks.LockingGroup; import org.ovirt.engine.core.common.utils.Pair; import org.ovirt.engine.core.compat.Guid; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogable; import org.ovirt.engine.core.dal.dbbroker.auditloghandling.AuditLogableImpl; import org.ovirt.engine.core.dao.ClusterDao; import org.ovirt.engine.core.dao.VdsDao; import org.ovirt.engine.core.utils.ThreadUtils; import org.ovirt.engine.core.vdsbroker.ResourceManager; import org.ovirt.engine.core.vdsbroker.monitoring.MonitoringStrategyFactory; /** * @see RestartVdsCommand on why this command is requiring a lock */ @NonTransactiveCommandAttribute public class VdsNotRespondingTreatmentCommand<T extends FenceVdsActionParameters> extends VdsCommand<T> { /** * use this member to determine if fence failed but vms moved to unknown mode (for the audit log type) */ private static final String RESTART = "Restart"; @Inject private ResourceManager resourceManager; @Inject private PreviousHostedEngineHost previousHostedEngineHost; @Inject private MonitoringStrategyFactory monitoringStrategyFactory; @Inject private ClusterDao clusterDao; @Inject private VdsDao vdsDao; public VdsNotRespondingTreatmentCommand(T parameters, CommandContext commandContext) { super(parameters, commandContext); } @Override protected LockProperties applyLockProperties(LockProperties lockProperties) { return lockProperties.withScope(Scope.Execution); } private boolean shouldFencingBeSkipped(VDS vds) { // check if fencing in cluster is enabled Cluster cluster = clusterDao.get(vds.getClusterId()); if (cluster != null && !cluster.getFencingPolicy().isFencingEnabled()) { AuditLogable alb = createAuditLogableForHost(vds); auditLogDirector.log(alb, AuditLogType.VDS_ALERT_FENCE_DISABLED_BY_CLUSTER_POLICY); return true; } // check if connectivity is not broken if (isConnectivityBrokenThresholdReached(getVds())) { return true; } // fencing will be executed return false; } private AuditLogable createAuditLogableForHost(VDS vds) { AuditLogable logable = new AuditLogableImpl(); logable.setVdsId(vds.getId()); logable.setVdsName(vds.getName()); logable.setClusterId(vds.getClusterId()); logable.setClusterName(vds.getClusterName()); logable.setRepeatable(true); return logable; } @Override protected boolean validate() { HostValidator validator = HostValidator.createInstance(getVds()); return validate(validator.hostExists()); } /** * Only fence the host if the VDS is down, otherwise it might have gone back up until this command was executed. If * the VDS is not fenced then don't send an audit log event. */ @Override protected void executeCommand() { if (!previousHostedEngineHost.isPreviousHostId(getVds().getId()) && !new FenceValidator().isStartupTimeoutPassed()) { log.error("Failed to run Fence script on vds '{}'.", getVdsName()); alertIfPowerManagementOperationSkipped(); // If fencing can't be done and the host is the SPM, set storage-pool to non-operational if (getVds().getSpmStatus() != VdsSpmStatus.None) { setStoragePoolNonOperational(); } return; } setVds(null); if (getVds() == null) { setCommandShouldBeLogged(false); log.info("Host '{}' ({}) not fenced since it doesn't exist anymore.", getVdsName(), getVdsId()); getReturnValue().setSucceeded(false); return; } if (shouldFencingBeSkipped(getVds())) { setSucceeded(false); setCommandShouldBeLogged(false); return; } boolean shouldBeFenced = getVds().shouldVdsBeFenced(); VdcReturnValueBase restartVdsResult = null; if (shouldBeFenced) { getParameters().setParentCommand(VdcActionType.VdsNotRespondingTreatment); VdcReturnValueBase retVal; retVal = runInternalAction(VdcActionType.SshSoftFencing, getParameters(), cloneContext().withoutExecutionContext()); if (retVal.getSucceeded()) { // SSH Soft Fencing was successful and host is Up, stop non responding treatment getReturnValue().setSucceeded(true); return; } // proceed with non responding treatment only if PM action are allowed and PM enabled for host if (!monitoringStrategyFactory.getMonitoringStrategyForVds(getVds()).isPowerManagementSupported() || !getVds().isPmEnabled()) { setSucceeded(false); setCommandShouldBeLogged(false); return; } retVal = runInternalAction(VdcActionType.VdsKdumpDetection, getParameters(), cloneContext().withoutExecutionContext()); if (retVal.getSucceeded()) { // kdump on host detected and finished successfully, stop hard fencing execution getReturnValue().setSucceeded(true); return; } // load cluster fencing policy FencingPolicy fencingPolicy = clusterDao.get(getVds().getClusterId()).getFencingPolicy(); getParameters().setFencingPolicy(fencingPolicy); waitUntilSkipFencingIfSDActiveAllowed(fencingPolicy.isSkipFencingIfSDActive()); restartVdsResult = runInternalAction(VdcActionType.RestartVds, getParameters(), cloneContext().withoutExecutionContext()); } else { setCommandShouldBeLogged(false); log.info("Host '{}' ({}) not fenced since it's status is ok, or it doesn't exist anymore.", getVdsName(), getVdsId()); } if (restartVdsResult != null && restartVdsResult.<RestartVdsResult>getActionReturnValue().isSkippedDueToFencingPolicy()) { // fencing was skipped, fire an alert and suppress standard command logging AuditLogable alb = createAuditLogableForHost(getVds()); auditLogDirector.log(alb, AuditLogType.VDS_ALERT_NOT_RESTARTED_DUE_TO_POLICY); setSucceeded(false); setCommandShouldBeLogged(false); } else { getReturnValue().setSucceeded(shouldBeFenced); } } private void setStoragePoolNonOperational() { log.info("Fence failed on vds '{}' which is spm of pool '{}' - moving pool to non operational", getVds().getName(), getVds().getStoragePoolId()); CommandContext commandContext = getContext().clone(); // CommandContext clone is 'shallow' and does not clone the internal ExecutionContext. // So ExecutionContext is cloned here manually to prevent a bug (BZ1145099). commandContext.withExecutionContext(new ExecutionContext(commandContext.getExecutionContext())); runInternalAction( VdcActionType.SetStoragePoolStatus, new SetStoragePoolStatusParameters(getVds().getStoragePoolId(), StoragePoolStatus.NotOperational, AuditLogType.SYSTEM_CHANGE_STORAGE_POOL_STATUS_NO_HOST_FOR_SPM), commandContext); } @Override public AuditLogType getAuditLogTypeValue() { return getSucceeded() ? AuditLogType.VDS_RECOVER : AuditLogType.VDS_RECOVER_FAILED; } @Override public Map<String, String> getJobMessageProperties() { if (jobProperties == null) { jobProperties = new HashMap<>(); jobProperties.put(VdcObjectType.VDS.name().toLowerCase(), (getVdsName() == null) ? "" : getVdsName()); } return jobProperties; } private boolean isConnectivityBrokenThresholdReached(VDS vds) { Cluster cluster = clusterDao.get(vds.getClusterId()); int percents = 0; boolean result = false; if (cluster.getFencingPolicy().isSkipFencingIfConnectivityBroken()) { List<VDS> hosts = vdsDao.getAllForCluster(cluster.getId()); double hostsNumber = hosts.size(); double hostsWithBrokenConnectivityNumber = hosts.stream().filter(h -> h.getStatus() == VDSStatus.Connecting || h.getStatus() == VDSStatus.NonResponsive).count(); percents = (int) ((hostsWithBrokenConnectivityNumber/hostsNumber)*100); result = percents >= cluster.getFencingPolicy().getHostsWithBrokenConnectivityThreshold(); } if (result) { logAlert(vds, percents); } return result; } private void logAlert(VDS host, int percents) { AuditLogable auditLogable = createAuditLogableForHost(host); auditLogable.addCustomValue("Percents", String.valueOf(percents)); auditLogDirector.log(auditLogable, AuditLogType.VDS_ALERT_FENCE_OPERATION_SKIPPED_BROKEN_CONNECTIVITY); } public ResourceManager getResourceManager() { return resourceManager; } private void waitUntilSkipFencingIfSDActiveAllowed(boolean skipFencingIfSDActive) { if (skipFencingIfSDActive) { // host storage lease should be renewed each ConfigValues.HostStorageLeaseAliveInterval // so we need to be sure not to execute fencing before host is non responsive for longer time long interval = TimeUnit.SECONDS.toMillis( Config.<Integer>getValue(ConfigValues.HostStorageLeaseAliveCheckingInterval)); long lastUpdate = getResourceManager().getVdsManager(getVdsId()).getLastUpdate(); long difference = System.currentTimeMillis() - lastUpdate; if (difference < interval) { long sleepMs = interval - difference; log.info("Sleeping {} ms before proceeding with fence execution", sleepMs); ThreadUtils.sleep(sleepMs); } } } @Override protected Map<String, Pair<String, String>> getExclusiveLocks() { return createFenceExclusiveLocksMap(getVdsId()); } public static Map<String, Pair<String, String>> createFenceExclusiveLocksMap(Guid vdsId) { return Collections.singletonMap(vdsId.toString(), LockMessagesMatchUtil.makeLockingPair( LockingGroup.VDS_FENCE, EngineMessage.POWER_MANAGEMENT_ACTION_ON_ENTITY_ALREADY_IN_PROGRESS)); } }