/** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.airavata.gfac.impl.task; import org.apache.airavata.common.exception.ApplicationSettingsException; import org.apache.airavata.common.utils.AiravataUtils; import org.apache.airavata.gfac.core.*; import org.apache.airavata.gfac.core.cluster.JobSubmissionOutput; import org.apache.airavata.gfac.core.cluster.RawCommandInfo; import org.apache.airavata.gfac.core.cluster.RemoteCluster; import org.apache.airavata.gfac.core.context.ProcessContext; import org.apache.airavata.gfac.core.context.TaskContext; import org.apache.airavata.gfac.core.task.JobSubmissionTask; import org.apache.airavata.gfac.core.task.TaskException; import org.apache.airavata.gfac.impl.Factory; import org.apache.airavata.model.appcatalog.computeresource.ComputeResourceDescription; import org.apache.airavata.model.appcatalog.computeresource.ResourceJobManager; import org.apache.airavata.model.commons.ErrorModel; import org.apache.airavata.model.experiment.ExperimentModel; import org.apache.airavata.model.job.JobModel; import org.apache.airavata.model.status.*; import org.apache.airavata.model.task.TaskTypes; import org.apache.airavata.registry.cpi.AppCatalogException; import 
org.apache.airavata.registry.cpi.ExperimentCatalogModelType;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * {@link JobSubmissionTask} that writes a job file for the process, submits it as a batch job
 * through the process' {@link RemoteCluster}, and records the resulting job model, job status and
 * task status through {@link GFacUtils}.
 *
 * <p>When the submit command does not return a job id, the task tries to look the job up by its
 * job name a limited number of times before marking the task as failed.
 */
public class DefaultJobSubmissionTask implements JobSubmissionTask {
    private static final Logger log = LoggerFactory.getLogger(DefaultJobSubmissionTask.class);

    /** Placeholder job id stored on the job model when no real job id could be obtained. */
    public static final String DEFAULT_JOB_ID = "DEFAULT_JOB_ID";

    /** Number of job-name based lookups attempted when the submit command returned no job id. */
    private static final int MAX_VERIFICATION_ATTEMPTS = 3;

    /** Pause between checks while {@link #cancel(TaskContext)} waits for a job id to appear. */
    private static final int WAIT_FOR_JOB_ID_MILLIS = 5000;
    private static final int PAUSE_TIME_IN_SEC = WAIT_FOR_JOB_ID_MILLIS / 1000;

    /**
     * No initialisation is required for this task; the property map is ignored.
     *
     * @param propertyMap task properties (unused)
     */
    @Override
    public void init(Map<String, String> propertyMap) throws TaskException {
    }

    /**
     * Creates the job file, submits it to the remote cluster and persists the outcome.
     *
     * <p>Every failure is converted into a {@link TaskState#FAILED} status; no exception escapes
     * this method. If the thread is interrupted while waiting between verification attempts, the
     * interrupt flag is restored just before returning.
     *
     * @param taskContext context of the task being executed
     * @return the final task status, which is also set on {@code taskContext} and published
     */
    @Override
    public TaskStatus execute(TaskContext taskContext) {
        TaskStatus taskStatus = new TaskStatus(TaskState.COMPLETED); // set to completed.
        boolean interrupted = false;
        try {
            ProcessContext processContext = taskContext.getParentProcessContext();
            JobModel jobModel = processContext.getJobModel();
            jobModel.setTaskId(taskContext.getTaskId());
            RemoteCluster remoteCluster = processContext.getJobSubmissionRemoteCluster();
            GroovyMap groovyMap = GFacUtils.createGroovyMap(processContext, taskContext);
            groovyMap.getStringValue(Script.JOB_NAME).ifPresent(jobName -> jobModel.setJobName(jobName));
            ResourceJobManager resourceJobManager = GFacUtils.getResourceJobManager(processContext);
            JobManagerConfiguration jConfig = null;
            if (resourceJobManager != null) {
                jConfig = Factory.getJobManagerConfiguration(resourceJobManager);
            }
            JobStatus jobStatus = new JobStatus();
            File jobFile = GFacUtils.createJobFile(groovyMap, taskContext, jConfig);
            if (jobFile != null && jobFile.exists()) {
                jobModel.setJobDescription(FileUtils.readFileToString(jobFile));
                JobSubmissionOutput jobSubmissionOutput =
                        remoteCluster.submitBatchJob(jobFile.getPath(), processContext.getWorkingDir());
                int exitCode = jobSubmissionOutput.getExitCode();
                jobModel.setExitCode(exitCode);
                jobModel.setStdErr(jobSubmissionOutput.getStdErr());
                jobModel.setStdOut(jobSubmissionOutput.getStdOut());
                String jobId = jobSubmissionOutput.getJobId();
                String experimentId = taskContext.getExperimentId();

                if (exitCode != 0 || jobSubmissionOutput.isJobSubmissionFailed()) {
                    // Submission itself failed: persist the error and return early.
                    jobModel.setJobId(DEFAULT_JOB_ID);
                    if (jobSubmissionOutput.isJobSubmissionFailed()) {
                        List<JobStatus> statusList = new ArrayList<>();
                        statusList.add(new JobStatus(JobState.FAILED));
                        statusList.get(0).setReason(jobSubmissionOutput.getFailureReason());
                        jobModel.setJobStatuses(statusList);
                        GFacUtils.saveJobModel(processContext, jobModel);
                        log.error("expId: {}, processid: {}, taskId: {} :- Job submission failed for job name {}",
                                experimentId, taskContext.getProcessId(), taskContext.getTaskId(),
                                jobModel.getJobName());
                        ErrorModel errorModel = new ErrorModel();
                        errorModel.setUserFriendlyMessage(jobSubmissionOutput.getFailureReason());
                        errorModel.setActualErrorMessage(jobSubmissionOutput.getFailureReason());
                        GFacUtils.saveExperimentError(processContext, errorModel);
                        GFacUtils.saveProcessError(processContext, errorModel);
                        GFacUtils.saveTaskError(taskContext, errorModel);
                        taskStatus.setState(TaskState.FAILED);
                        taskStatus.setReason("Job submission command didn't return a jobId");
                        taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                        taskContext.setTaskStatus(taskStatus);
                    } else {
                        String msg;
                        GFacUtils.saveJobModel(processContext, jobModel);
                        ErrorModel errorModel = new ErrorModel();
                        if (exitCode != Integer.MIN_VALUE) {
                            msg = "expId:" + processContext.getProcessModel().getExperimentId()
                                    + ", processId:" + processContext.getProcessId()
                                    + ", taskId: " + taskContext.getTaskId()
                                    + " return non zero exit code:" + exitCode
                                    + " for JobName:" + jobModel.getJobName()
                                    + ", with failure reason : " + jobSubmissionOutput.getFailureReason()
                                    + " Hence changing job state to Failed.";
                            errorModel.setActualErrorMessage(jobSubmissionOutput.getFailureReason());
                        } else {
                            msg = "expId:" + processContext.getProcessModel().getExperimentId()
                                    + ", processId:" + processContext.getProcessId()
                                    + ", taskId: " + taskContext.getTaskId()
                                    + " doesn't return valid job submission exit code for JobName:"
                                    + jobModel.getJobName()
                                    + ", with failure reason : stdout ->" + jobSubmissionOutput.getStdOut()
                                    + " stderr -> " + jobSubmissionOutput.getStdErr()
                                    + " Hence changing job state to Failed.";
                            errorModel.setActualErrorMessage(msg);
                        }
                        log.error(msg);
                        errorModel.setUserFriendlyMessage(msg);
                        GFacUtils.saveExperimentError(processContext, errorModel);
                        GFacUtils.saveProcessError(processContext, errorModel);
                        GFacUtils.saveTaskError(taskContext, errorModel);
                        taskStatus.setState(TaskState.FAILED);
                        taskStatus.setReason(msg);
                        taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                        taskContext.setTaskStatus(taskStatus);
                    }
                    publishTaskStatus(taskContext);
                    return taskStatus;
                } else if (jobId != null && !jobId.isEmpty()) {
                    // Submit command returned a job id: record SUBMITTED, then try to confirm it.
                    jobModel.setJobId(jobId);
                    GFacUtils.saveJobModel(processContext, jobModel);
                    jobStatus.setJobState(JobState.SUBMITTED);
                    ComputeResourceDescription computeResourceDescription =
                            taskContext.getParentProcessContext().getComputeResourceDescription();
                    jobStatus.setReason("Successfully Submitted to " + computeResourceDescription.getHostName());
                    jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                    jobModel.setJobStatuses(Arrays.asList(jobStatus));
                    GFacUtils.saveJobStatus(taskContext.getParentProcessContext(), jobModel);
                    if (verifyJobSubmissionByJobId(remoteCluster, jobId)) {
                        jobStatus.setJobState(JobState.QUEUED);
                        jobStatus.setReason("Verification step succeeded");
                        jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                        jobModel.setJobStatuses(Arrays.asList(jobStatus));
                        GFacUtils.saveJobStatus(taskContext.getParentProcessContext(), jobModel);
                    }
                    // doing gateway reporting
                    if (computeResourceDescription.isGatewayUsageReporting()) {
                        String loadCommand = computeResourceDescription.getGatewayUsageModuleLoadCommand();
                        String usageExecutable = computeResourceDescription.getGatewayUsageExecutable();
                        ExperimentModel experiment = (ExperimentModel) taskContext.getParentProcessContext()
                                .getExperimentCatalog().get(ExperimentCatalogModelType.EXPERIMENT, experimentId);
                        String username = experiment.getUserName() + "@"
                                + taskContext.getParentProcessContext().getUsageReportingGatewayId();
                        // NOTE(review): this command line is built by plain string concatenation;
                        // if user names or gateway ids can contain shell metacharacters they should
                        // be escaped before being passed to the remote shell - confirm with callers.
                        RawCommandInfo rawCommandInfo = new RawCommandInfo(loadCommand + " && " + usageExecutable
                                + " -gateway_user " + username
                                + " -submit_time \"`date '+%F %T %:z'`\" -jobid " + jobId);
                        remoteCluster.execute(rawCommandInfo);
                    }
                    taskStatus = new TaskStatus(TaskState.COMPLETED);
                    taskStatus.setReason("Submitted job to compute resource");
                    taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                } else {
                    // No job id from the submit command: look the job up by name, with back-off.
                    for (int attempt = 1; attempt <= MAX_VERIFICATION_ATTEMPTS; attempt++) {
                        String verifyJobId = verifyJobSubmission(remoteCluster, jobModel);
                        if (verifyJobId != null && !verifyJobId.isEmpty()) {
                            // JobStatus either changed from SUBMITTED to QUEUED or directly to QUEUED
                            jobId = verifyJobId;
                            jobModel.setJobId(jobId);
                            GFacUtils.saveJobModel(processContext, jobModel);
                            jobStatus.setJobState(JobState.QUEUED);
                            jobStatus.setReason("Verification step succeeded");
                            jobStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                            jobModel.setJobStatuses(Arrays.asList(jobStatus));
                            GFacUtils.saveJobStatus(taskContext.getParentProcessContext(), jobModel);
                            taskStatus.setState(TaskState.COMPLETED);
                            taskStatus.setReason("Submitted job to compute resource");
                            taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                            break;
                        }
                        // Only wait when another attempt will actually follow.
                        if (attempt < MAX_VERIFICATION_ATTEMPTS) {
                            log.info("Verify step return invalid jobId, retry verification step in {} secs",
                                    attempt * 10);
                            Thread.sleep(attempt * 10000L);
                        }
                    }
                }

                if (jobId == null || jobId.isEmpty()) {
                    jobModel.setJobId(DEFAULT_JOB_ID);
                    GFacUtils.saveJobModel(processContext, jobModel);
                    String msg = "expId:" + processContext.getProcessModel().getExperimentId() + " Couldn't find "
                            + "remote jobId for JobName:" + jobModel.getJobName() + ", both submit and verify steps "
                            + "doesn't return a valid JobId. " + "Hence changing experiment state to Failed";
                    log.error(msg);
                    ErrorModel errorModel = new ErrorModel();
                    errorModel.setUserFriendlyMessage(msg);
                    errorModel.setActualErrorMessage(msg);
                    GFacUtils.saveExperimentError(processContext, errorModel);
                    GFacUtils.saveProcessError(processContext, errorModel);
                    GFacUtils.saveTaskError(taskContext, errorModel);
                    taskStatus.setState(TaskState.FAILED);
                    taskStatus.setReason("Couldn't find job id in both submitted and verified steps");
                    taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
                } else {
                    GFacUtils.saveJobModel(processContext, jobModel);
                }
            } else {
                taskStatus.setState(TaskState.FAILED);
                if (jobFile == null) {
                    taskStatus.setReason("JobFile is null");
                } else {
                    taskStatus.setReason("Job file doesn't exist");
                }
                // Keep this failure path consistent with the others, which all stamp the change time.
                taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
            }
        } catch (AppCatalogException e) {
            recordFailure(taskContext, taskStatus, "Error while instantiating app catalog", e);
        } catch (ApplicationSettingsException e) {
            recordFailure(taskContext, taskStatus, "Error occurred while creating job descriptor", e);
        } catch (GFacException e) {
            recordFailure(taskContext, taskStatus, "Error occurred while submitting the job", e);
        } catch (IOException e) {
            recordFailure(taskContext, taskStatus, "Error while reading the content of the job file", e);
        } catch (InterruptedException e) {
            // Remember the interrupt; the flag is restored after the status has been published so
            // that the publish call below is not itself affected by a pending interrupt.
            interrupted = true;
            recordFailure(taskContext, taskStatus, "Error occurred while verifying the job submission", e);
        } catch (Throwable e) {
            recordFailure(taskContext, taskStatus, "JobSubmission failed", e);
        }

        taskContext.setTaskStatus(taskStatus);
        publishTaskStatus(taskContext);
        if (interrupted) {
            Thread.currentThread().interrupt();
        }
        return taskStatus;
    }

    /** Marks {@code taskStatus} as failed with {@code msg} and attaches an error model built from {@code cause}. */
    private static void recordFailure(TaskContext taskContext, TaskStatus taskStatus, String msg, Throwable cause) {
        log.error(msg, cause);
        taskStatus.setState(TaskState.FAILED);
        taskStatus.setReason(msg);
        taskStatus.setTimeOfStateChange(AiravataUtils.getCurrentTimestamp().getTime());
        ErrorModel errorModel = new ErrorModel();
        errorModel.setActualErrorMessage(cause.getMessage());
        errorModel.setUserFriendlyMessage(msg);
        taskContext.getTaskModel().setTaskErrors(Arrays.asList(errorModel));
    }

    /** Saves and publishes the task status held by {@code taskContext}; a failure to do so is only logged. */
    private static void publishTaskStatus(TaskContext taskContext) {
        try {
            GFacUtils.saveAndPublishTaskStatus(taskContext);
        } catch (GFacException e) {
            log.error("Error while saving task status", e);
        }
    }

    /**
     * Checks whether the remote cluster reports a known state for the given job id.
     *
     * @return {@code true} when a status is returned and its state is not {@link JobState#UNKNOWN}
     */
    private boolean verifyJobSubmissionByJobId(RemoteCluster remoteCluster, String jobID) throws GFacException {
        JobStatus status = remoteCluster.getJobStatus(jobID);
        return status != null && status.getJobState() != JobState.UNKNOWN;
    }

    /**
     * Looks up the job id on the remote cluster using the job name and the cluster login user name.
     *
     * @return the job id reported by the cluster, or {@code null} when the lookup throws
     */
    private String verifyJobSubmission(RemoteCluster remoteCluster, JobModel jobDetails) {
        String jobName = jobDetails.getJobName();
        String jobId = null;
        try {
            jobId = remoteCluster.getJobIdByJobName(jobName, remoteCluster.getServerInfo().getUserName());
        } catch (GFacException e) {
            // Keep the cause in the log; the caller treats a null id as "not found" and may retry.
            log.error("Error while verifying JobId from JobName {}", jobName, e);
        }
        return jobId;
    }

    /**
     * Re-runs {@link #execute(TaskContext)} when no job id was recorded for the process; otherwise
     * reports the task as completed without resubmitting.
     */
    @Override
    public TaskStatus recover(TaskContext taskContext) {
        ProcessContext processContext = taskContext.getParentProcessContext();
        JobModel jobModel = processContext.getJobModel();
        // original job failed before submitting
        if (jobModel == null || jobModel.getJobId() == null) {
            return execute(taskContext);
        } else {
            // job is already submitted and monitor should handle the recovery
            return new TaskStatus(TaskState.COMPLETED);
        }
    }

    @Override
    public TaskTypes getType() {
        return TaskTypes.JOB_SUBMISSION;
    }

    /**
     * Cancels the job recorded on the process' job model.
     *
     * <p>If the process is still {@link ProcessState#EXECUTING} and no job id is available yet,
     * this method polls until one appears. The job status is then fetched (retrying a few times
     * with increasing pauses) and, when found, the job is cancelled on the remote cluster.
     *
     * @param taskcontext context of the task whose job should be cancelled
     * @return the job status returned by the remote cluster's cancel call
     * @throws TaskException if there is no job model, the job status cannot be found, the remote
     *         call fails, or the calling thread is interrupted while waiting
     */
    @Override
    public JobStatus cancel(TaskContext taskcontext) throws TaskException {
        ProcessContext processContext = taskcontext.getParentProcessContext();
        RemoteCluster remoteCluster = processContext.getJobSubmissionRemoteCluster();
        JobModel jobModel = processContext.getJobModel();
        if (jobModel == null) {
            throw new TaskException("Couldn't complete cancel operation, JobModel is null in ProcessContext.");
        }

        if (processContext.getProcessState() == ProcessState.EXECUTING) {
            while (jobModel.getJobId() == null) {
                log.info("Cancellation pause {} secs until process get jobId", PAUSE_TIME_IN_SEC);
                try {
                    Thread.sleep(WAIT_FOR_JOB_ID_MILLIS);
                } catch (InterruptedException e) {
                    // Swallowing the interrupt here would make this wait loop impossible to stop.
                    Thread.currentThread().interrupt();
                    throw new TaskException("Interrupted while waiting for a jobId to cancel", e);
                }
            }
        }

        int retryCount = 0;
        try {
            JobStatus oldJobStatus = remoteCluster.getJobStatus(jobModel.getJobId());
            while (oldJobStatus == null && retryCount <= 5) {
                retryCount++;
                Thread.sleep(retryCount * 1000);
                oldJobStatus = remoteCluster.getJobStatus(jobModel.getJobId());
            }
            if (oldJobStatus != null) {
                oldJobStatus = remoteCluster.cancelJob(jobModel.getJobId());
                return oldJobStatus;
            } else {
                throw new TaskException("Cancel operation failed, Job status couldn't find in resource, JobId "
                        + jobModel.getJobId());
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new TaskException("Error while cancelling job " + jobModel.getJobId(), e);
        } catch (GFacException e) {
            throw new TaskException("Error while cancelling job " + jobModel.getJobId(), e);
        }
    }
}