package org.rhq.enterprise.server.storage; import static java.util.Arrays.asList; import java.util.ArrayList; import java.util.List; import javax.ejb.Asynchronous; import javax.ejb.EJB; import javax.ejb.Stateless; import javax.ejb.TransactionAttribute; import javax.ejb.TransactionAttributeType; import javax.persistence.EntityManager; import javax.persistence.PersistenceContext; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.joda.time.Days; import org.rhq.cassandra.schema.Table; import org.rhq.core.domain.auth.Subject; import org.rhq.core.domain.authz.Permission; import org.rhq.core.domain.cloud.StorageClusterSettings; import org.rhq.core.domain.cloud.StorageNode; import org.rhq.core.domain.common.JobTrigger; import org.rhq.core.domain.configuration.Configuration; import org.rhq.core.domain.configuration.Property; import org.rhq.core.domain.configuration.PropertyList; import org.rhq.core.domain.configuration.PropertySimple; import org.rhq.core.domain.operation.OperationDefinition; import org.rhq.core.domain.operation.OperationHistory; import org.rhq.core.domain.operation.ResourceOperationHistory; import org.rhq.core.domain.operation.bean.ResourceOperationSchedule; import org.rhq.core.domain.resource.Resource; import org.rhq.core.domain.resource.ResourceType; import org.rhq.core.util.exception.ThrowableUtil; import org.rhq.enterprise.server.RHQConstants; import org.rhq.enterprise.server.auth.SubjectException; import org.rhq.enterprise.server.auth.SubjectManagerLocal; import org.rhq.enterprise.server.authz.RequiredPermission; import org.rhq.enterprise.server.cloud.StorageNodeManagerLocal; import org.rhq.enterprise.server.operation.OperationManagerLocal; import org.rhq.enterprise.server.resource.ResourceManagerLocal; import org.rhq.enterprise.server.scheduler.jobs.ReplicationFactorCheckJob; import org.rhq.server.metrics.StorageSession; /** * @author John Sanda */ @Stateless public class StorageNodeOperationsHandlerBean implements StorageNodeOperationsHandlerLocal { private final Log log = LogFactory.getLog(StorageNodeOperationsHandlerBean.class); private static final String STORAGE_NODE_TYPE_NAME = "RHQ Storage Node"; private static final String STORAGE_NODE_PLUGIN_NAME = "RHQStorage"; private final static String RUN_REPAIR_PROPERTY = "runRepair"; private final static String UPDATE_SEEDS_LIST = "updateSeedsList"; private final static String SEEDS_LIST = "seedsList"; private static final int LONG_RUNNING_OPERATION_TIMEOUT = Days.SEVEN.toStandardSeconds().getSeconds(); @PersistenceContext(unitName = RHQConstants.PERSISTENCE_UNIT_NAME) private EntityManager entityManager; @EJB private SubjectManagerLocal subjectManager; @EJB private StorageNodeManagerLocal storageNodeManager; @EJB private OperationManagerLocal operationManager; @EJB private StorageClusterSettingsManagerLocal storageClusterSettingsManager; @EJB private StorageClientManager storageClientManager; @EJB private StorageNodeOperationsHandlerLocal storageNodeOperationsHandler; @EJB private ResourceManagerLocal resourceManager; @Override @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void announceStorageNode(Subject subject, StorageNode storageNode) { if (log.isInfoEnabled()) { log.info("Announcing " + storageNode + " to storage node cluster."); } try { storageNodeOperationsHandler.setMode(storageNode, StorageNode.OperationMode.ANNOUNCE); storageNodeOperationsHandler.setMaintenancePending(); List<StorageNode> clusterNodes = storageNodeOperationsHandler.setMaintenancePending(); announceStorageNode(subject, storageNode, clusterNodes.get(0), createPropertyListOfAddresses("addresses", asList(storageNode))); } catch (IndexOutOfBoundsException e) { String msg = "Aborting storage node deployment due to unexpected error while announcing storage node at " + storageNode.getAddress(); log.error(msg, e); log.error("If this error occurred with a storage node that was deployed prior to installing the server, " + "then this may indicate that the rhq.storage.nodes property in rhq-server.properties was not set " + "correctly. All nodes deployed prior to server installation should be listed in the " + "rhq.storage.nodes property. Please review the deployment documentation for additional details."); storageNodeOperationsHandler.logError(storageNode.getAddress(), msg, e); } catch (Exception e) { String msg = "Aborting storage node deployment due to unexpected error while announcing storage node at " + storageNode.getAddress(); log.error(msg, e); storageNodeOperationsHandler.logError(storageNode.getAddress(), msg, e); } } private void announceStorageNode(Subject subject, StorageNode newStorageNode, StorageNode clusterNode, PropertyList addresses) { if (log.isInfoEnabled()) { log.info("Announcing " + newStorageNode + " to cluster node " + clusterNode); } Configuration parameters = new Configuration(); parameters.put(addresses); scheduleOperation(subject, clusterNode, parameters, "announce"); } @Override @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void unannounceStorageNode(Subject subject, StorageNode storageNode) { log.info("Unannouncing " + storageNode); List<StorageNode> clusterNodes = storageNodeOperationsHandler.setMaintenancePending(); unannounceStorageNode(subject, clusterNodes.get(0), createPropertyListOfAddresses("addresses", asList(storageNode))); } @Override public List<StorageNode> setMaintenancePending() { List<StorageNode> clusterNodes = getStorageNodesByMode(StorageNode.OperationMode.NORMAL); for (StorageNode clusterNode : clusterNodes) { clusterNode.setMaintenancePending(true); } return clusterNodes; } @Override public List<StorageNode> getStorageNodesByMode(StorageNode.OperationMode mode) { List<StorageNode> clusterNodes = entityManager .createNamedQuery(StorageNode.QUERY_FIND_ALL_BY_MODE, StorageNode.class) .setParameter("operationMode", mode).getResultList(); return clusterNodes; } private void unannounceStorageNode(Subject subject, StorageNode clusterNode, PropertyList addresses) { Configuration parameters = new Configuration(); parameters.put(addresses); scheduleOperation(subject, clusterNode, parameters, "unannounce"); } @Override @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void uninstall(Subject subject, StorageNode storageNode) { log.info("Uninstalling " + storageNode); if (storageNode.getResource() == null) { storageNodeOperationsHandler.finishUninstall(subject, storageNode); } else { scheduleOperation(subject, storageNode, new Configuration(), "uninstall"); } } @Override public void finishUninstall(Subject subject, StorageNode storageNode) { storageNode = entityManager.find(StorageNode.class, storageNode.getId()); if (storageNode.getResource() != null) { log.info("Removing storage node resource " + storageNode.getResource() + " from inventory"); Resource resource = storageNode.getResource(); storageNodeOperationsHandler.detachFromResource(storageNode); resourceManager.uninventoryResource(subject, resource.getId()); } log.info("Removing storage node entity " + storageNode + " from database"); entityManager.remove(storageNode); log.info(storageNode + " has been undeployed"); } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void detachFromResource(StorageNode storageNode) { storageNode.setResource(null); storageNode.setFailedOperation(null); } @Override @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void decommissionStorageNode(Subject subject, StorageNode storageNode) { log.info("Preparing to decommission " + storageNode); storageNode = storageNodeOperationsHandler.setMode(storageNode, StorageNode.OperationMode.DECOMMISSION); storageNode = storageNodeOperationsHandler.setMaintenancePendingDecommissionStorageNode(storageNode); scheduleOperation(subject, storageNode, new Configuration(), "decommission"); } @Override public StorageNode setMaintenancePendingDecommissionStorageNode(StorageNode storageNode) { List<StorageNode> storageNodes = getStorageNodesByMode(StorageNode.OperationMode.NORMAL); boolean runRepair = updateSchemaIfNecessary(storageNodes.size() + 1, storageNodes.size()); // This is a bit of a hack since the maintenancePending flag is really intended to // queue up storage nodes during cluster maintenance operations. storageNode = entityManager.find(StorageNode.class, storageNode.getId()); storageNode.setMaintenancePending(runRepair); storageNode = entityManager.merge(storageNode); return storageNode; } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void logError(String address, String error, Exception e) { try { StorageNode newStorageNode = findStorageNodeByAddress(address); newStorageNode.setErrorMessage(error + " Check the server log for details. Root cause: " + ThrowableUtil.getRootCause(e).getMessage()); } catch (Exception e1) { log.error("Failed to log error against storage node", e); } } @Override @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void performAddNodeMaintenance(Subject subject, StorageNode storageNode) { try { storageNodeOperationsHandler.performAddMaintenance(subject, storageNode); } catch (Exception e) { String msg = "Aborting storage node deployment due to unexpected error while performing add node " + "maintenance."; log.error(msg, e); storageNodeOperationsHandler.logError(storageNode.getAddress(), msg, e); } } @Override public void performAddMaintenance(Subject subject, StorageNode storageNode) { List<StorageNode> clusterNodes = setMaintenancePendingPerformAddMaintenance(storageNode); boolean runRepair = updateSchemaIfNecessary(clusterNodes.size() - 1, clusterNodes.size()); performAddNodeMaintenance(subject, storageNode, runRepair, createPropertyListOfAddresses(SEEDS_LIST, clusterNodes), storageNode.getAddress()); } @Override public List<StorageNode> setMaintenancePendingPerformAddMaintenance(StorageNode storageNode) { List<StorageNode> storageNodes = getStorageNodesByMode(StorageNode.OperationMode.NORMAL); for (StorageNode node : storageNodes) { node.setMaintenancePending(true); } storageNode.setMaintenancePending(true); storageNode = entityManager.merge(storageNode); storageNodes.add(storageNode); return storageNodes; } private void performAddNodeMaintenance(Subject subject, StorageNode storageNode, boolean runRepair, PropertyList seedsList, String newNodeAddress) { if (log.isInfoEnabled()) { log.info("Running addNodeMaintenance for storage node " + storageNode); } Configuration params = new Configuration(); params.put(seedsList); params.put(new PropertySimple(RUN_REPAIR_PROPERTY, runRepair)); params.put(new PropertySimple(UPDATE_SEEDS_LIST, Boolean.TRUE)); params.put(new PropertySimple("newNodeAddress", newNodeAddress)); scheduleOperation(subject, storageNode, params, "addNodeMaintenance", LONG_RUNNING_OPERATION_TIMEOUT); StorageClusterSettings settings = storageClusterSettingsManager.getClusterSettings(subject); storageNodeManager.scheduleSnapshotManagementOperationsForStorageNode(subject, storageNode, settings); } @Override @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void performRemoveNodeMaintenance(Subject subject, StorageNode storageNode) { try { storageNodeOperationsHandler.performRemoveMaintenance(subject, storageNode); } catch (Exception e) { String msg = "Aborting undeployment due to unexpected error while performing remove node maintenance."; log.error(msg, e); storageNodeOperationsHandler.logError(storageNode.getAddress(), msg, e); } } @Override public void performRemoveMaintenance(Subject subject, StorageNode storageNode) { List<StorageNode> clusterNodes = setMaintenancePending(); boolean runRepair = storageNode.isMaintenancePending(); performRemoveNodeMaintenance(subject, clusterNodes.get(0), runRepair, createPropertyListOfAddresses(SEEDS_LIST, clusterNodes), storageNode.getAddress()); } private void performRemoveNodeMaintenance(Subject subject, StorageNode storageNode, boolean runRepair, PropertyList seedsList, String removedNodeAddress) { if (log.isInfoEnabled()) { log.info("Running remove node maintenance for storage node " + storageNode); } Configuration params = new Configuration(); params.put(seedsList); params.put(new PropertySimple(RUN_REPAIR_PROPERTY, runRepair)); params.put(new PropertySimple(UPDATE_SEEDS_LIST, true)); params.put(new PropertySimple("removedNodeAddress", removedNodeAddress)); scheduleOperation(subject, storageNode, params, "removeNodeMaintenance", LONG_RUNNING_OPERATION_TIMEOUT); } @Override @Asynchronous public void handleOperationUpdateIfNecessary(OperationHistory operationHistory) { if (!(operationHistory instanceof ResourceOperationHistory)) { return; } ResourceOperationHistory resourceOperationHistory = (ResourceOperationHistory) operationHistory; if (!isStorageNodeOperation(resourceOperationHistory)) { return; } if (resourceOperationHistory.getOperationDefinition().getName().equals("announce")) { try { storageNodeOperationsHandler.handleAnnounce(resourceOperationHistory); } catch (Exception e) { String msg = "Aborting storage node deployment due to unexpected error while announcing cluster nodes."; logError(resourceOperationHistory, msg, e); } } else if (operationHistory.getOperationDefinition().getName().equals("prepareForBootstrap")) { try { storageNodeOperationsHandler.handlePrepareForBootstrap(resourceOperationHistory); } catch (Exception e) { String msg = "Aborting storage node deployment due to unexpected error while bootstrapping new node."; logError(resourceOperationHistory, msg, e); } } else if (operationHistory.getOperationDefinition().getName().equals("addNodeMaintenance")) { try { storageNodeOperationsHandler.handleAddNodeMaintenance(resourceOperationHistory); } catch (Exception e) { String msg = "Aborting storage node deployment due to unexpected error while performing add node " + "maintenance."; logError(resourceOperationHistory, msg, e); } } else if (operationHistory.getOperationDefinition().getName().equals("decommission")) { try { storageNodeOperationsHandler.handleDecommission(resourceOperationHistory); } catch (Exception e) { String msg = "Aborting undeployment due to unexpected error while decommissioning storage node."; logError(resourceOperationHistory, msg, e); } } else if (operationHistory.getOperationDefinition().getName().equals("removeNodeMaintenance")) { try { storageNodeOperationsHandler.handleRemoveNodeMaintenance(resourceOperationHistory); } catch (Exception e) { String msg = "Aborting undeployment due to unexpected error while performing remove node maintenance."; logError(resourceOperationHistory, msg, e); } } else if (operationHistory.getOperationDefinition().getName().equals("unannounce")) { try { storageNodeOperationsHandler.handleUnannounce(resourceOperationHistory); } catch (Exception e) { String msg = "Aborting undeployment due to unexpected error while performing unannouncement."; logError(resourceOperationHistory, msg, e); } } else if (operationHistory.getOperationDefinition().getName().equals("uninstall")) { try { storageNodeOperationsHandler.handleUninstall(resourceOperationHistory); } catch (Exception e) { String msg = "Aborting undeployment due to unexpected error while uninstalling."; logError(resourceOperationHistory, msg, e); } } else if (operationHistory.getOperationDefinition().getName().equals("repair")) { try { storageNodeOperationsHandler.handleRepair(resourceOperationHistory); } catch (Exception e) { String msg = "Abort scheduled repair maintenance due to unexpected error."; log.error(msg, e); } } } private void logError(ResourceOperationHistory operationHistory, String msg, Exception e) { log.error(msg, e); StorageNode storageNode = findStorageNode(operationHistory.getResource()); storageNodeOperationsHandler.logError(storageNode.getAddress(), msg, e); } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handleAnnounce(ResourceOperationHistory resourceOperationHistory) { StorageNode storageNode = findStorageNode(resourceOperationHistory.getResource()); Configuration parameters = resourceOperationHistory.getParameters(); PropertyList addresses = parameters.getList("addresses"); StorageNode newStorageNode; switch (resourceOperationHistory.getStatus()) { case INPROGRESS: // nothing to do here return; case CANCELED: newStorageNode = findStorageNodeByAddress(getAddress(addresses)); deploymentOperationCanceled(storageNode, resourceOperationHistory, newStorageNode); case FAILURE: newStorageNode = findStorageNodeByAddress(getAddress(addresses)); deploymentOperationFailed(storageNode, resourceOperationHistory, newStorageNode); return; default: // SUCCESS storageNode.setMaintenancePending(false); StorageNode nextNode = takeFromMaintenanceQueue(); Subject subject = getSubject(resourceOperationHistory); newStorageNode = findStorageNodeByAddress(getAddress(addresses)); if (nextNode == null) { log.info("Successfully announced new storage node to storage cluster"); newStorageNode = storageNodeOperationsHandler.setMode(newStorageNode, StorageNode.OperationMode.BOOTSTRAP); storageNodeOperationsHandler.bootstrapStorageNode(subject, newStorageNode); } else { announceStorageNode(subject, newStorageNode, nextNode, addresses.deepCopy(false)); } } } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handleUnannounce(ResourceOperationHistory operationHistory) { StorageNode storageNode = findStorageNode(operationHistory.getResource()); Configuration params = operationHistory.getParameters(); PropertyList addresses = params.getList("addresses"); StorageNode removedStorageNode; switch (operationHistory.getStatus()) { case INPROGRESS: // nothing to do here break; case CANCELED: removedStorageNode = findStorageNodeByAddress(getAddress(addresses)); undeploymentOperationCanceled(storageNode, operationHistory, removedStorageNode); break; case FAILURE: removedStorageNode = findStorageNodeByAddress(getAddress(addresses)); deploymentOperationFailed(storageNode, operationHistory, removedStorageNode); break; default: // SUCCESS storageNode.setMaintenancePending(false); StorageNode nextNode = takeFromMaintenanceQueue(); Subject subject = getSubject(operationHistory); removedStorageNode = findStorageNodeByAddress(getAddress(addresses)); if (nextNode == null) { log.info("Successfully unannounced " + removedStorageNode + " to storage cluster"); removedStorageNode = storageNodeOperationsHandler.setMode(removedStorageNode, StorageNode.OperationMode.UNINSTALL); uninstall(getSubject(operationHistory), removedStorageNode); } else { unannounceStorageNode(subject, nextNode, addresses.deepCopy(false)); } } } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handlePrepareForBootstrap(ResourceOperationHistory operationHistory) { StorageNode newStorageNode = findStorageNode(operationHistory.getResource()); switch (operationHistory.getStatus()) { case INPROGRESS: // nothing to do here return; case CANCELED: deploymentOperationCanceled(newStorageNode, operationHistory); return; case FAILURE: deploymentOperationFailed(newStorageNode, operationHistory); return; default: // SUCCESS log.info("The prepare for bootstrap operation completed successfully for " + newStorageNode); newStorageNode = storageNodeOperationsHandler.setMode(newStorageNode, StorageNode.OperationMode.ADD_MAINTENANCE); Subject subject = getSubject(operationHistory); performAddMaintenance(subject, newStorageNode); } } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handleAddNodeMaintenance(ResourceOperationHistory resourceOperationHistory) { StorageNode storageNode = findStorageNode(resourceOperationHistory.getResource()); Configuration parameters = resourceOperationHistory.getParameters(); String newNodeAddress = parameters.getSimpleValue("newNodeAddress"); StorageNode newStorageNode; switch (resourceOperationHistory.getStatus()) { case INPROGRESS: // nothing to do here return; case CANCELED: newStorageNode = findStorageNodeByAddress(newNodeAddress); deploymentOperationCanceled(storageNode, resourceOperationHistory, newStorageNode); return; case FAILURE: newStorageNode = findStorageNodeByAddress(newNodeAddress); deploymentOperationFailed(storageNode, resourceOperationHistory, newStorageNode); return; default: // SUCCESS log.info("Finished running add node maintenance for " + storageNode); storageNode.setMaintenancePending(false); StorageNode nextNode = takeFromMaintenanceQueue(); newStorageNode = findStorageNodeByAddress(newNodeAddress); if (nextNode == null) { log.info("Finished running add node maintenance on all cluster nodes"); storageNodeOperationsHandler.setMode(newStorageNode, StorageNode.OperationMode.NORMAL); } else { boolean runRepair = parameters.getSimple(RUN_REPAIR_PROPERTY).getBooleanValue(); PropertyList seedsList = parameters.getList(SEEDS_LIST).deepCopy(false); Subject subject = getSubject(resourceOperationHistory); performAddNodeMaintenance(subject, nextNode, runRepair, seedsList, newNodeAddress); } } } @Override @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void bootstrapStorageNode(Subject subject, StorageNode storageNode) { List<StorageNode> clusterNodes = storageNodeOperationsHandler .getStorageNodesByMode(StorageNode.OperationMode.NORMAL); clusterNodes.add(storageNode); prepareNodeForBootstrap(subject, storageNode, createPropertyListOfAddresses("addresses", clusterNodes)); } private void prepareNodeForBootstrap(Subject subject, StorageNode storageNode, PropertyList addresses) { if (log.isInfoEnabled()) { log.info("Preparing to bootstrap " + storageNode + " into cluster..."); } StorageClusterSettings clusterSettings = storageClusterSettingsManager.getClusterSettings(subject); Configuration parameters = new Configuration(); parameters.put(new PropertySimple("cqlPort", clusterSettings.getCqlPort())); parameters.put(new PropertySimple("gossipPort", clusterSettings.getGossipPort())); parameters.put(addresses); scheduleOperation(subject, storageNode, parameters, "prepareForBootstrap"); } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handleRemoveNodeMaintenance(ResourceOperationHistory operationHistory) { StorageNode storageNode = findStorageNode(operationHistory.getResource()); Configuration parameters = operationHistory.getParameters(); String removedNodeAddress = parameters.getSimpleValue("removedNodeAddress"); StorageNode removedStorageNode; switch (operationHistory.getStatus()) { case INPROGRESS: // nothing to do here break; case CANCELED: removedStorageNode = findStorageNodeByAddress(removedNodeAddress); undeploymentOperationCanceled(storageNode, operationHistory, removedStorageNode); break; case FAILURE: removedStorageNode = findStorageNodeByAddress(removedNodeAddress); undeploymentOperationFailed(storageNode, operationHistory, removedStorageNode); break; default: // SUCCESS log.info("Finished remove node maintenance for " + storageNode); storageNode.setMaintenancePending(false); StorageNode nextNode = takeFromMaintenanceQueue(); removedStorageNode = findStorageNodeByAddress(removedNodeAddress); if (nextNode == null) { log.info("Finished running remove node maintenance on all cluster nodes"); removedStorageNode = storageNodeOperationsHandler.setMode(removedStorageNode, StorageNode.OperationMode.UNANNOUNCE); unannounceStorageNode(getSubject(operationHistory), removedStorageNode); } else { boolean runRepair = parameters.getSimple(RUN_REPAIR_PROPERTY).getBooleanValue(); PropertyList seedsList = parameters.getList(SEEDS_LIST).deepCopy(false); Subject subject = getSubject(operationHistory); performRemoveNodeMaintenance(subject, nextNode, runRepair, seedsList, removedNodeAddress); } } } @Override public void runRepair(Subject subject) { List<StorageNode> clusterNodes = storageNodeManager.getClusterNodes(); if (clusterNodes.size() == 1) { log.info("Skipping scheduled repair since this is a single-node cluster"); return; } log.info("Starting anti-entropy repair on storage cluster: " + clusterNodes); for (StorageNode node : clusterNodes) { node.setErrorMessage(null); node.setFailedOperation(null); node.setMaintenancePending(true); } StorageNode storageNode = storageNodeOperationsHandler.setMode(clusterNodes.get(0), StorageNode.OperationMode.MAINTENANCE); scheduleOperation(subject, storageNode, new Configuration(), "repair", LONG_RUNNING_OPERATION_TIMEOUT); } private void runRepair(Subject subject, StorageNode storageNode) { scheduleOperation(subject, storageNode, new Configuration(), "repair", LONG_RUNNING_OPERATION_TIMEOUT); } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handleRepair(ResourceOperationHistory operationHistory) { StorageNode storageNode = findStorageNode(operationHistory.getResource()); switch (operationHistory.getStatus()) { case INPROGRESS: // nothing to do here return; case CANCELED: repairCanceled(storageNode, operationHistory); break; case FAILURE: repairFailed(storageNode, operationHistory); break; default: // SUCCESS log.info("Finished running repair on " + storageNode); break; } storageNode.setMaintenancePending(false); storageNodeOperationsHandler.setMode(storageNode, StorageNode.OperationMode.NORMAL); StorageNode nextNode = takeFromMaintenanceQueue(); if (nextNode == null) { log.info("Finished running repair on storage cluster"); } else { nextNode = storageNodeOperationsHandler.setMode(nextNode, StorageNode.OperationMode.MAINTENANCE); runRepair(getSubject(operationHistory), nextNode); } } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handleDecommission(ResourceOperationHistory operationHistory) { StorageNode storageNode = findStorageNode(operationHistory.getResource()); switch (operationHistory.getStatus()) { case INPROGRESS: // nothing do to here break; case CANCELED: undeploymentOperationCanceled(storageNode, operationHistory); break; case FAILURE: undeploymentOperationFailed(storageNode, operationHistory); break; default: // SUCCESS log.info("Successfully decommissioned " + storageNode); storageNode = storageNodeOperationsHandler.setMode(storageNode, StorageNode.OperationMode.REMOVE_MAINTENANCE); Subject subject = getSubject(operationHistory); performRemoveMaintenance(subject, storageNode); } } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public void handleUninstall(ResourceOperationHistory operationHistory) { StorageNode storageNode = findStorageNode(operationHistory.getResource()); switch (operationHistory.getStatus()) { case INPROGRESS: // nothing to do here break; case CANCELED: undeploymentOperationCanceled(storageNode, operationHistory); break; case FAILURE: undeploymentOperationFailed(storageNode, operationHistory); break; default: // SUCCESS log.info("Successfully uninstalled " + storageNode + " from disk"); finishUninstall(getSubject(operationHistory), storageNode); } } @Override @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) public StorageNode setMode(StorageNode storageNode, StorageNode.OperationMode newMode) { storageNode.setOperationMode(newMode); return entityManager.merge(storageNode); } private Subject getSubject(ResourceOperationHistory resourceOperationHistory) { try { return subjectManager.loginUnauthenticated(resourceOperationHistory.getSubjectName()); } catch (Exception e) { throw new SubjectException("Not able to authenticate subject " + resourceOperationHistory.getSubjectName(), e); } } private void deploymentOperationCanceled(StorageNode storageNode, ResourceOperationHistory operationHistory, StorageNode newStorageNode) { operationCanceled(storageNode, operationHistory, newStorageNode, "Deployment"); } private void undeploymentOperationCanceled(StorageNode storageNode, ResourceOperationHistory operationHistory, StorageNode removedStorageNode) { operationCanceled(storageNode, operationHistory, removedStorageNode, "Undeployment"); } private void operationCanceled(StorageNode storageNode, ResourceOperationHistory operationHistory, StorageNode movingNode, String opType) { log.error(opType + " has been aborted due to canceled operation [" + operationHistory.getOperationDefinition().getDisplayName() + " on " + storageNode.getResource() + ": " + operationHistory.getErrorMessage()); movingNode.setErrorMessage(opType + " has been aborted due to canceled resource operation on " + storageNode.getAddress()); storageNode.setErrorMessage(opType + " of " + movingNode.getAddress() + " has been aborted due " + "to cancellation of resource operation [" + operationHistory.getOperationDefinition().getDisplayName() + "]."); storageNode.setFailedOperation(operationHistory); } private void deploymentOperationCanceled(StorageNode newStorageNode, ResourceOperationHistory operationHistory) { operationCanceled(newStorageNode, operationHistory, "Deployment"); } private void undeploymentOperationCanceled(StorageNode storageNode, ResourceOperationHistory operationHistory) { operationCanceled(storageNode, operationHistory, "Undeployment"); } private void repairCanceled(StorageNode storageNode, ResourceOperationHistory operationHistory) { operationCanceled(storageNode, operationHistory, "Scheduled repair"); } private void operationCanceled(StorageNode storageNode, ResourceOperationHistory operationHistory, String opType) { log.error(opType + " has been aborted due to canceled operation [" + operationHistory.getOperationDefinition().getDisplayName() + " on " + storageNode.getResource() + ": " + operationHistory.getErrorMessage()); storageNode.setErrorMessage(opType + " has been aborted due to canceled resource operation [" + operationHistory.getOperationDefinition().getDisplayName() + "]."); storageNode.setFailedOperation(operationHistory); } private void deploymentOperationFailed(StorageNode storageNode, ResourceOperationHistory operationHistory, StorageNode newStorageNode) { operationFailed(storageNode, operationHistory, newStorageNode, "Deployment"); } private void undeploymentOperationFailed(StorageNode storageNode, ResourceOperationHistory operationHistory, StorageNode removedNode) { operationFailed(storageNode, operationHistory, removedNode, "Undeployment"); } private void deploymentOperationFailed(StorageNode storageNode, ResourceOperationHistory operationHistory) { operationFailed(storageNode, operationHistory, "Deployment"); } private void undeploymentOperationFailed(StorageNode storageNode, ResourceOperationHistory operationHistory) { operationFailed(storageNode, operationHistory, "Undeployment"); } private void repairFailed(StorageNode storageNode, ResourceOperationHistory operationHistory) { operationFailed(storageNode, operationHistory, "Scheduled repair"); } private void operationFailed(StorageNode storageNode, ResourceOperationHistory operationHistory, String opType) { log.error(opType + " has been aborted due to failed operation [" + operationHistory.getOperationDefinition().getDisplayName() + "] on " + storageNode.getResource() + ": " + operationHistory.getErrorMessage()); storageNode.setErrorMessage(opType + " has been aborted due to failed resource operation [" + operationHistory.getOperationDefinition().getDisplayName() + "]."); storageNode.setFailedOperation(operationHistory); } private void operationFailed(StorageNode storageNode, ResourceOperationHistory operationHistory, StorageNode movingNode, String opType) { log.error(opType + " has been aborted due to failed operation [" + operationHistory.getOperationDefinition().getDisplayName() + "] on " + storageNode.getResource() + ": " + operationHistory.getErrorMessage()); movingNode.setErrorMessage(opType + " has been aborted due to failed resource operation on " + storageNode.getAddress()); storageNode.setErrorMessage(opType + " of " + movingNode.getAddress() + " has been aborted due " + "to failed resource operation [" + operationHistory.getOperationDefinition().getDisplayName() + "]."); storageNode.setFailedOperation(operationHistory); } private StorageNode findStorageNode(Resource resource) { for (StorageNode storageNode : storageNodeManager.getStorageNodes()) { if (storageNode.getResource().getId() == resource.getId()) { return storageNode; } } return null; } private StorageNode takeFromMaintenanceQueue() { List<StorageNode> storageNodes = entityManager .createQuery( "SELECT s FROM StorageNode s WHERE " + "s.operationMode = :operationMode AND s.maintenancePending = :maintenancePending", StorageNode.class).setParameter("operationMode", StorageNode.OperationMode.NORMAL) .setParameter("maintenancePending", true).getResultList(); if (storageNodes.isEmpty()) { return null; } return storageNodes.get(0); } private StorageNode findStorageNodeByAddress(String address) { return entityManager.createNamedQuery(StorageNode.QUERY_FIND_BY_ADDRESS, StorageNode.class) .setParameter("address", address).getSingleResult(); } private boolean isStorageNodeOperation(ResourceOperationHistory operationHistory) { if (operationHistory == null) { return false; } ResourceType resourceType = operationHistory.getOperationDefinition().getResourceType(); return resourceType.getName().equals(STORAGE_NODE_TYPE_NAME) && resourceType.getPlugin().equals(STORAGE_NODE_PLUGIN_NAME); } private boolean updateSchemaIfNecessary(int previousClusterSize, int newClusterSize) { boolean isRepairNeeded; int replicationFactor = 1; if (previousClusterSize == 0) { throw new IllegalStateException("previousClusterSize cannot be 0"); } if (newClusterSize == 0) { throw new IllegalStateException("newClusterSize cannot be 0"); } if (Math.abs(newClusterSize - previousClusterSize) != 1) { throw new IllegalStateException("The absolute difference between previousClusterSize[" + previousClusterSize + "] and newClusterSize[" + newClusterSize + "] must be 1"); } if (newClusterSize == 1) { isRepairNeeded = false; replicationFactor = 1; } else if (newClusterSize >= 5) { isRepairNeeded = false; } else if (previousClusterSize > 4) { isRepairNeeded = false; } else if (previousClusterSize == 4 && newClusterSize == 3) { isRepairNeeded = true; replicationFactor = 2; } else if (previousClusterSize == 3 && newClusterSize == 2) { isRepairNeeded = false; } else if (previousClusterSize == 1 && newClusterSize == 2) { isRepairNeeded = true; replicationFactor = 2; } else if (previousClusterSize == 2 && newClusterSize == 3) { isRepairNeeded = false; } else if (previousClusterSize == 3 && newClusterSize == 4) { isRepairNeeded = true; replicationFactor = 3; } else { throw new IllegalStateException("previousClusterSize[" + previousClusterSize + "] and newClusterSize[" + newClusterSize + "] is not supported"); } if (isRepairNeeded) { updateReplicationFactor(replicationFactor, newClusterSize); if (previousClusterSize == 1) { updateGCGraceSeconds(691200); // 8 days } } else if (newClusterSize == 1) { updateReplicationFactor(1, 1); updateGCGraceSeconds(0); } return isRepairNeeded; } private void updateReplicationFactor(int replicationFactor, int clusterSize) { StorageSession session = storageClientManager.getSession(); ReplicationFactorCheckJob.updateReplicationFactor(session, "rhq", replicationFactor); ReplicationFactorCheckJob.updateReplicationFactor(session, "system_auth", clusterSize); } private void updateGCGraceSeconds(int seconds) { StorageSession session = storageClientManager.getSession(); for (Table table : Table.values()) { session.execute("ALTER TABLE " + table.getTableName() + " WITH gc_grace_seconds = " + seconds); } session.execute("ALTER TABLE rhq.schema_version WITH gc_grace_seconds = " + seconds); } /** * Optimally this should be called outside of a transaction, because when scheduling an operation we * currently call back into {@link StorageNodeOperationsHandlerBean#handleOperationUpdateIfNecessary(OperationHistory)}. * This runs the risk of locking if called inside an existing transaction. */ private void scheduleOperation(Subject subject, StorageNode storageNode, Configuration parameters, String operation) { scheduleOperation(subject, storageNode, parameters, operation, 300); } /** * Optimally this should be called outside of a transaction, because when scheduling an operation we * currently call back into {@link StorageNodeOperationsHandlerBean#handleOperationUpdateIfNecessary(OperationHistory)}. * This runs the risk of locking if called inside an existing transaction. */ private void scheduleOperation(Subject subject, StorageNode storageNode, Configuration parameters, String operation, int timeout) { ResourceOperationSchedule schedule = new ResourceOperationSchedule(); schedule.setResource(storageNode.getResource()); schedule.setJobTrigger(JobTrigger.createNowTrigger()); schedule.setSubject(subject); schedule.setOperationName(operation); parameters.setSimpleValue(OperationDefinition.TIMEOUT_PARAM_NAME, Integer.toString(timeout)); schedule.setParameters(parameters); operationManager.scheduleResourceOperation(subject, schedule); } private String getAddress(PropertyList addressList) { List<String> list = new ArrayList<String>(addressList.getList().size()); for (Property property : addressList.getList()) { PropertySimple simple = (PropertySimple) property; list.add(simple.getStringValue()); } return list.get(0); } private PropertyList createPropertyListOfAddresses(String propertyName, List<StorageNode> nodes) { PropertyList list = new PropertyList(propertyName); for (StorageNode storageNode : nodes) { list.add(new PropertySimple("address", storageNode.getAddress())); } return list; } @Override @RequiredPermission(Permission.MANAGE_SETTINGS) public StorageNode removeMaintenanceMode(Subject subject, StorageNode storageNode) { storageNode.setMaintenancePending(false); return setMode(storageNode, StorageNode.OperationMode.NORMAL); } }