/** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.airavata.gfac.monitor.cloud; import org.apache.airavata.cloud.aurora.client.AuroraThriftClient; import org.apache.airavata.cloud.aurora.client.bean.JobDetailsResponseBean; import org.apache.airavata.cloud.aurora.client.bean.JobKeyBean; import org.apache.airavata.cloud.aurora.client.sdk.ScheduledTask; import org.apache.airavata.common.utils.AiravataUtils; import org.apache.airavata.gfac.core.GFacException; import org.apache.airavata.gfac.core.GFacThreadPoolExecutor; import org.apache.airavata.gfac.core.GFacUtils; import org.apache.airavata.gfac.core.context.ProcessContext; import org.apache.airavata.gfac.core.context.TaskContext; import org.apache.airavata.gfac.core.monitor.JobMonitor; import org.apache.airavata.gfac.impl.AuroraUtils; import org.apache.airavata.gfac.impl.GFacWorker; import org.apache.airavata.model.job.JobModel; import org.apache.airavata.model.status.JobState; import org.apache.airavata.model.status.JobStatus; import org.apache.airavata.model.status.ProcessState; import org.apache.airavata.model.status.ProcessStatus; import org.apache.airavata.model.status.TaskState; import org.apache.airavata.model.status.TaskStatus; import org.apache.airavata.registry.cpi.CompositeIdentifier; import org.apache.airavata.registry.cpi.ExperimentCatalog; import org.apache.airavata.registry.cpi.ExperimentCatalogModelType; import org.apache.airavata.registry.cpi.RegistryException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.ConcurrentHashMap; import java.util.stream.Collectors; import java.util.stream.Stream; public class AuroraJobMonitor implements JobMonitor, Runnable { private static final Logger log = LoggerFactory.getLogger(AuroraJobMonitor.class); private static AuroraJobMonitor auroraJobMonitor; private Timer timer; private Map<String,TaskContext> jobMonitoringMap; private AuroraJobMonitor(){ jobMonitoringMap = new ConcurrentHashMap<>(); timer = new Timer("Aurora status poll timer", true); } public static AuroraJobMonitor getInstance(){ if (auroraJobMonitor == null) { synchronized (AuroraJobMonitor.class){ if (auroraJobMonitor == null) { auroraJobMonitor = new AuroraJobMonitor(); } } } return auroraJobMonitor; } @Override public void run() { AuroraTimer task = null; try { task = new AuroraTimer(); timer.schedule(task, 5000, 5000); } catch (Exception e) { log.error("Error couldn't run Aurora status poll timer task"); } } @Override public void monitor(String jobId, TaskContext taskContext) { jobMonitoringMap.put(jobId, taskContext); log.info("Added JobId : {} to Aurora Job Monitoring map", jobId); taskContext.getParentProcessContext().setPauseTaskExecution(true); } @Override public void stopMonitor(String jobId, boolean runOutFlow) { jobMonitoringMap.remove(jobId); } @Override public boolean isMonitoring(String jobId) { return jobMonitoringMap.get(jobId) != null; } @Override public void canceledJob(String jobId) { throw new IllegalStateException("Method not yet implemented"); } class AuroraTimer extends TimerTask { AuroraThriftClient client; public AuroraTimer() throws Exception { client = AuroraThriftClient.getAuroraThriftClient(); } @Override public void run() { while(true){ JobKeyBean jobKeyBean = new JobKeyBean(AuroraUtils.ENVIRONMENT, AuroraUtils.ROLE, "dummy"); Iterator<Map.Entry<String, TaskContext>> iterator = jobMonitoringMap.entrySet().iterator(); while (iterator.hasNext()) { Map.Entry<String, TaskContext> currentEntry = iterator.next(); try { jobKeyBean.setName(currentEntry.getKey()); JobDetailsResponseBean jobDetailsResponseBean = client.getJobDetails(jobKeyBean); List<ScheduledTask> tasks = jobDetailsResponseBean.getTasks(); switch (tasks.get(0).getStatus()) { case FINISHED: iterator.remove(); processJob(currentEntry.getKey(), currentEntry.getValue(), JobState.COMPLETE); break; case FAILED: iterator.remove(); processJob(currentEntry.getKey(), currentEntry.getValue(), JobState.FAILED); break; case RUNNING: updateStatus(currentEntry.getKey(), currentEntry.getValue(), JobState.ACTIVE); break; default: log.info("Job {} is in {} state", currentEntry.getKey(), tasks.get(0).getStatus().name()); break; } } catch (Exception e) { log.error("Error while getting response for job : {}", currentEntry.getKey()); } } try { Thread.sleep(5000); } catch (InterruptedException e) { log.warn("Aurora Monitoring task interrupted"); } } } private void updateStatus(String jobKey, TaskContext taskContext, JobState jobState) { ProcessContext pc = taskContext.getParentProcessContext(); JobModel jobModel = pc.getJobModel(); if (jobModel.getJobStatuses().get(0).getJobState() != jobState) { JobStatus jobStatus = new JobStatus(jobState); jobStatus.setReason("Aurora return " + jobState.name()); jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime()); jobModel.setJobStatuses(Arrays.asList(jobStatus)); try { GFacUtils.saveJobStatus(pc, jobModel); } catch (GFacException e) { log.error("Error while saving job status {}, job : {}, task :{}, process:{} exp:{}", jobState.name(), jobKey, taskContext.getTaskId(), pc.getProcessId(), pc.getExperimentId()); } } } private void processJob(String jobKey, TaskContext taskContext, JobState jobState) { JobStatus jobStatus = new JobStatus(); jobStatus.setJobState(jobState); if (jobState == JobState.COMPLETE) { jobStatus.setReason("Aurora Job completed"); } else if (jobState == JobState.FAILED) { jobStatus.setReason("Aurora Job Failed"); } ProcessContext pc = taskContext.getParentProcessContext(); JobModel jobModel = pc.getJobModel(); jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime()); jobModel.setJobStatuses(Arrays.asList(jobStatus)); try { GFacUtils.saveJobStatus(pc, jobModel); } catch (GFacException e) { log.error("Error while saving job status for job : {} ", jobKey); } TaskStatus taskStatus = new TaskStatus(TaskState.COMPLETED); taskStatus.setReason("Job monitoring completed with final state: " + TaskState.COMPLETED.name()); taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime()); taskContext.setTaskStatus(taskStatus); try { GFacUtils.saveAndPublishTaskStatus(taskContext); } catch (GFacException e) { log.error("Error while saving task status for exp : {} , process : {} , task : {} , job : {}", taskContext.getExperimentId(), taskContext.getProcessId(), taskContext.getTaskId(), jobKey); } if (pc.isCancel()) { ProcessStatus processStatus = new ProcessStatus(ProcessState.CANCELLING); processStatus.setReason("Process has been cancelled"); pc.setProcessStatus(processStatus); try { GFacUtils.saveAndPublishProcessStatus(pc); } catch (GFacException e) { log.error("Error while cancelling process, exp : {}, process : {}", pc.getExperimentId(), pc.getProcessId()); } } try { GFacThreadPoolExecutor.getCachedThreadPool().execute(new GFacWorker(pc)); } catch (GFacException e) { log.error("Error while running output tasks for exp : {} , process : {}", taskContext.getExperimentId(), pc.getProcessId()); ProcessStatus processStatus = new ProcessStatus(ProcessState.FAILED); processStatus.setReason("Failed to run output tasks"); processStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime()); pc.setProcessStatus(processStatus); try { GFacUtils.saveAndPublishProcessStatus(pc); } catch (GFacException ex) { log.error("Error while updating process status to FAILED, exp : {}, process : {}", pc.getExperimentId(), pc.getProcessId()); } } } } }