package cz.cuni.mff.d3s.been.manager;

import static cz.cuni.mff.d3s.been.core.task.TaskState.*;
import static cz.cuni.mff.d3s.been.manager.TaskManagerConfiguration.*;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hazelcast.core.IMap;

import cz.cuni.mff.d3s.been.cluster.ServiceException;
import cz.cuni.mff.d3s.been.cluster.context.ClusterContext;
import cz.cuni.mff.d3s.been.core.ri.RuntimeInfo;
import cz.cuni.mff.d3s.been.core.task.TaskEntry;
import cz.cuni.mff.d3s.been.core.task.TaskState;
import cz.cuni.mff.d3s.been.manager.msg.Messages;
import cz.cuni.mff.d3s.been.manager.msg.TaskMessage;
import cz.cuni.mff.d3s.been.mq.IMessageSender;
import cz.cuni.mff.d3s.been.util.PropertyReader;

/**
 * Scans local keys of the tasks map and looks for irregularities.
 *
 * Hazelcast (as of version 2.5) is not capable of informing the client
 * application when a key migrates. So the idea is to periodically scan local
 * keys and see if a key needs our attention. This covers leaving (or crashing)
 * as well as joining of data nodes.
 *
 * @author Martin Sixta
 */
final class LocalKeyScanner extends TaskManagerService {

    /** logging */
    private static final Logger log = LoggerFactory.getLogger(LocalKeyScanner.class);

    /** Connection to the cluster */
    private final ClusterContext clusterCtx;

    /** Task Action Queue; created in {@link #start()}, {@code null} before that */
    private IMessageSender<TaskMessage> sender;

    /** The scanner runnable */
    private final LocalKeyScannerRunnable runnable;

    /** This node's ID (UUID of the local cluster member); not read within this class */
    private final String nodeId;

    /** Reader of the cluster properties, used to look up the scan delay and period */
    private final PropertyReader propertyReader;

    /**
     * Creates the LocalKeyScanner {@link TaskManagerService}.
     *
     * @param clusterCtx
     *          connection to the cluster
     */
    public LocalKeyScanner(ClusterContext clusterCtx) {
        this.clusterCtx = clusterCtx;
        this.nodeId = clusterCtx.getCluster().getLocalMember().getUuid();
        this.runnable = new LocalKeyScannerRunnable();
        this.propertyReader = PropertyReader.on(clusterCtx.getProperties());
    }

    /** Runnable to schedule with the executor */
    private class LocalKeyScannerRunnable implements Runnable {

        @Override
        public void run() {
            // pokemon block, aka. catch-them-all (Executors tend to silently ignore Exceptions)
            try {
                doRun();
            } catch (Exception e) {
                log.error("Unknown error in TaskManager (LocalKeyScanner)", e);
            }
        }
    }

    /**
     * The actual "run()" method: collects the IDs of all currently known
     * runtimes and checks every task entry whose key is local to this node.
     *
     * A failure while checking one entry is logged and does not prevent the
     * remaining entries from being checked.
     *
     * @throws Exception
     *           when the tasks map or the runtimes cannot be obtained
     */
    private void doRun() throws Exception {
        IMap<String, TaskEntry> map = clusterCtx.getTasks().getTasksMap();

        Set<String> runtimeIds = new HashSet<>();
        for (RuntimeInfo info : clusterCtx.getRuntimes().getRuntimes()) {
            runtimeIds.add(info.getId());
        }

        for (String taskId : map.localKeySet()) {
            TaskEntry entry = map.get(taskId);

            // no value behind the key at the time of the lookup, nothing to check
            if (entry == null) {
                continue;
            }

            try {
                checkEntry(runtimeIds, entry);
            } catch (Exception e) {
                log.error("Error when checking TaskEntry " + taskId, e);
            }
        }
    }

    /**
     * Checks one {@link TaskEntry} for irregularities and sends the matching
     * {@link TaskMessage}. At most one message is sent per call; the first
     * matching rule wins:
     * <ol>
     * <li>unfinished task loaded from persistence whose runtime is not known -
     * abort (cluster restart)</li>
     * <li>SCHEDULED or ACCEPTED task whose runtime is not known - reschedule</li>
     * <li>RUNNING task whose runtime is not known - abort</li>
     * <li>WAITING task - check schedulability</li>
     * </ol>
     *
     * @param runtimesIds
     *          IDs of the runtimes currently known to the cluster
     * @param entry
     *          The entry to check
     * @throws Exception
     *           when the message cannot be created or sent
     */
    private void checkEntry(final Set<String> runtimesIds, final TaskEntry entry) throws Exception {
        log.debug("TaskEntry ID: {}, status: {}", entry.getId(), entry.getState().toString());

        final TaskState state = entry.getState();

        boolean isWaiting = (state == WAITING);
        boolean isAccepted = (state == ACCEPTED);
        boolean isDone = (state == ABORTED || state == FINISHED);
        boolean isScheduled = (state == SCHEDULED);
        boolean isRunning = (state == RUNNING);
        boolean isRuntimeOffline = !runtimesIds.contains(entry.getRuntimeId());
        boolean isFromPersistence = entry.isLoadedFromPersistence();

        // Cluster restart
        if (!isDone && isRuntimeOffline && isFromPersistence) {
            String logMsg = String.format("Will abort task '%s' because of cluster restart", entry.getId());
            log.debug(logMsg);
            sender.send(Messages.createAbortTaskMessage(entry, logMsg));
            return;
        }

        // Failed Host Runtime of a scheduled task
        if ((isScheduled || isAccepted) && isRuntimeOffline) {
            String logMsg = String.format("Will reschedule '%s' because of Host Runtime failure", entry.getId());
            log.debug(logMsg);
            sender.send(Messages.createRescheduleTaskMessage(entry));
            return;
        }

        // Failed Host Runtime of a running task
        if (isRunning && isRuntimeOffline) {
            String logMsg = String.format("Will abort '%s' because of Host Runtime failure", entry.getId());
            log.debug(logMsg);
            sender.send(Messages.createAbortTaskMessage(entry, logMsg));
            return;
        }

        if (isWaiting) {
            // log the task ID; the state is known to be WAITING here and carries no information
            log.debug("Will try to schedule WAITING task {}", entry.getId());
            TaskMessage msg = Messages.createCheckSchedulabilityMessage(entry);
            sender.send(msg);
            return;
        }
    }

    /**
     * Creates the message sender and schedules the periodic scan. The initial
     * delay and the period are read from the cluster properties (in seconds),
     * falling back to the defaults from {@link TaskManagerConfiguration}.
     *
     * @throws ServiceException
     *           when the service cannot be started
     */
    @Override
    public void start() throws ServiceException {
        // the sender must exist before the first scan can run
        sender = createSender();

        int delay = propertyReader.getInteger(SCANNER_INITIAL_DELAY, DEFAULT_SCANNER_INITIAL_DELAY);
        int period = propertyReader.getInteger(SCANNER_PERIOD, DEFAULT_SCANNER_PERIOD);

        clusterCtx.schedule(runnable, delay, period, TimeUnit.SECONDS);
    }

    /**
     * Closes the message sender. Safe to call when {@link #start()} was never
     * called or did not get as far as creating the sender.
     */
    @Override
    public void stop() {
        // NOTE(review): the periodic scan scheduled in start() is not cancelled here -
        // confirm that the cluster context stops it on shutdown.
        if (sender != null) {
            sender.close();
        }
    }
}