/* * Copyright (c) 2015 EMC Corporation * All Rights Reserved */ package com.emc.sa.engine; import java.net.URI; import java.util.Date; import java.util.List; import java.util.UUID; import javax.annotation.PostConstruct; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import com.emc.storageos.db.client.model.uimodels.ExecutionLog.LogLevel; import com.emc.storageos.db.client.model.uimodels.ExecutionPhase; import com.emc.storageos.db.client.model.uimodels.ExecutionState; import com.emc.storageos.db.client.model.uimodels.ExecutionStatus; import com.emc.storageos.db.client.model.uimodels.ExecutionTaskLog; import com.emc.storageos.db.client.model.uimodels.Order; import com.emc.storageos.db.client.model.uimodels.OrderStatus; import com.emc.sa.model.dao.ModelClient; import com.emc.sa.zookeeper.SingletonService; import com.emc.storageos.coordinator.client.service.DistributedDataManager; /** * This class handles both sending the heartbeat for the current engine instance, and monitoring for engine failures. * Only a single node will ever be operating as the monitor for engine failures. * * Data is stored as /config/sa/engine/{ENGINE_ID}/heartbeat * /config/sa/engine/{ENGINE_ID}/orders/{ORDER_ID}.... (replicated for each order) * * @author jonnymiller */ @Component public class ExecutionEngineMonitor extends SingletonService { private static final long HEART_BEAT = 60000; private static final long MAX_AGE = 5 * HEART_BEAT; public static final String BASE_PATH = "/config/sa/engine"; @Autowired private ModelClient modelClient; private String uniqueId = UUID.randomUUID().toString(); private DistributedDataManager dataManager; @Autowired private OrderCleanupHandler drOrderCleanupHandler; private volatile Thread keepAliveThread; @PostConstruct public void init() { dataManager = getCoordinatorClient().getWorkflowDataManager(); drOrderCleanupHandler.run(); // Start a keep-alive thread keepAliveThread = new Thread(new Runnable() { public void run() { keepAlive(); } }, "engine-monitor"); keepAliveThread.setDaemon(true); keepAliveThread.start(); log.info("Created SA Engine Monitor with ID " + uniqueId); } /** * Adds the order to the engine state. * * @param order * the order. */ public void addOrder(Order order) { try { dataManager.putData(getOrderPath(uniqueId, order), order.getId()); if (log.isDebugEnabled()) { log.debug("Tracking order: " + order.getId()); } } catch (Exception e) { log.error("Error adding order " + order.getId() + " to EngineState", e); } } /** * Removes the order from the engine state. * * @param order * the order. */ public void removeOrder(Order order) { try { dataManager.removeNode(getOrderPath(uniqueId, order)); } catch (Exception e) { log.error("Error removing order " + order.getId() + " from EngineState", e); } } /** * Performs monitoring of all engines. Only a single node will ever be performing this function at a time. */ @Override protected void runService() { try { while (true) { Thread.sleep(HEART_BEAT); checkEngineStates(); } } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } /** * Checks the state of all registered engines, killing any which are no longer alive. */ private void checkEngineStates() { try { for (String engine : dataManager.getChildren(BASE_PATH)) { Long lastHeartBeat = getHeartBeat(engine); if (lastHeartBeat != null) { long age = System.currentTimeMillis() - lastHeartBeat; if (age >= MAX_AGE) { removeEngineState(engine); } } } } catch (Exception e) { log.error("Error checking Engine States", e); } } /** * Stops the engine monitor. */ @Override protected void stopService() { Thread t = keepAliveThread; keepAliveThread = null; if (t != null) { t.interrupt(); } } /** * Kills any orders associated with the engine and removes the engine from zookeeper. */ private void removeEngineState(String engineId) { if (log.isInfoEnabled()) { log.info("Removing engine: " + engineId); } try { if (dataManager.checkExists(getOrdersPath(engineId)) != null) { for (String orderId : dataManager.getChildren(getOrdersPath(engineId))) { String message = "Order processing terminated during execution, order was not completed. " + "Check with your administrator. Reboot may have occurred."; killOrder(URI.create(orderId), message); dataManager.removeNode(getOrderPath(engineId, orderId)); } dataManager.removeNode(getEnginePath(engineId)); } } catch (Exception e) { log.error("Error whilst removing " + engineId + " engine state", e); } } /** * Kills an order that was running within a dead engine. * * @param orderId * the order ID. * @param detailedMessage * message to be added to order log */ public void killOrder(URI orderId, String detailedMessage) { try { Order order = modelClient.orders().findById(orderId); if (order != null) { if (log.isInfoEnabled()) { log.info("Killing order: " + orderId); } // Mark the order as failed order.setOrderStatus(OrderStatus.ERROR.name()); modelClient.save(order); if (order.getExecutionStateId() != null) { ExecutionState execState = modelClient.executionStates().findById(order.getExecutionStateId()); // Mark the execution state as failed execState.setExecutionStatus(ExecutionStatus.FAILED.name()); modelClient.save(execState); // Find any task logs that are 'in progress' (no elapsed time) and set the elapsed List<ExecutionTaskLog> logs = modelClient.executionTaskLogs().findByIds(execState.getTaskLogIds()); for (ExecutionTaskLog log : logs) { if (log.getElapsed() == null) { // Mark any that were in progress as warnings log.setLevel(LogLevel.WARN.name()); modelClient.save(log); } } // Add a new log message indicating it failed due to engine termination addTerminationTaskLog(execState, detailedMessage); } } } catch (RuntimeException e) { log.error("Failed to terminate order: " + orderId, e); } } /** * Adds an execution task log indicating that the engine terminated during execution. * * @param state the execution state. */ private void addTerminationTaskLog(ExecutionState state, String detailedMessage) { ExecutionTaskLog log = new ExecutionTaskLog(); log.setDate(new Date()); log.setLevel(LogLevel.ERROR.toString()); log.setMessage("Order Terminated"); log.setDetail(detailedMessage); log.setPhase(ExecutionPhase.EXECUTE.name()); modelClient.save(log); state.addExecutionTaskLog(log); modelClient.save(state); } /** * Sends a heartbeat every minute, keeping the engine alive. */ private void keepAlive() { Thread current = Thread.currentThread(); try { heartBeat(); while (current == keepAliveThread) { Thread.sleep(HEART_BEAT); heartBeat(); } } catch (InterruptedException e) { log.warn("Heartbeat interrupted", e); } } /** * Sends a heartbeat for this engine. */ private void heartBeat() { try { Long newHeartBeat = System.currentTimeMillis(); dataManager.putData(getHeartBeatPath(uniqueId), newHeartBeat); } catch (Exception e) { log.error("Error updating Engine " + uniqueId + " HeartBeat", e); } } private Long getHeartBeat(String engineId) { try { return (Long) dataManager.getData(getHeartBeatPath(engineId), false); } catch (Exception e) { log.error("Error getting Engine " + engineId + " Heartbeat", e); return null; } } private String getEnginePath(String engineId) { return String.format("%s/%s", BASE_PATH, engineId); } private String getHeartBeatPath(String engineId) { return String.format("%s/%s", getEnginePath(engineId), "heartbeat"); } private String getOrdersPath(String engineId) { return String.format("%s/%s", getEnginePath(engineId), "orders"); } private String getOrderPath(String engineId, Order order) { return getOrderPath(engineId, order.getId().toString()); } private String getOrderPath(String engineId, String orderId) { return String.format("%s/%s", getOrdersPath(engineId), orderId); } }