/**
 * Copyright 2007-2008 University Of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package edu.isi.pegasus.planner.code.gridstart;

import edu.isi.pegasus.common.logging.LogManager;
import edu.isi.pegasus.planner.catalog.site.classes.SiteCatalogEntry;
import edu.isi.pegasus.planner.catalog.site.classes.SiteStore;
import edu.isi.pegasus.planner.classes.ADag;
import edu.isi.pegasus.planner.classes.AggregatedJob;
import edu.isi.pegasus.planner.classes.Job;
import edu.isi.pegasus.planner.classes.PegasusBag;
import edu.isi.pegasus.planner.classes.PegasusFile;
import edu.isi.pegasus.planner.classes.PlannerOptions;
import edu.isi.pegasus.planner.classes.TransferJob;
import edu.isi.pegasus.planner.code.GridStart;
import edu.isi.pegasus.planner.code.generator.condor.CondorQuoteParser;
import edu.isi.pegasus.planner.code.generator.condor.CondorQuoteParserException;
import edu.isi.pegasus.planner.common.PegasusConfiguration;
import edu.isi.pegasus.planner.common.PegasusProperties;
import edu.isi.pegasus.planner.namespace.Condor;
import edu.isi.pegasus.planner.namespace.Globus;
import edu.isi.pegasus.planner.namespace.Pegasus;
import edu.isi.pegasus.planner.transfer.SLS;
import edu.isi.pegasus.common.util.Separator;
import edu.isi.pegasus.planner.catalog.transformation.classes.TCType;
import edu.isi.pegasus.planner.catalog.TransformationCatalog;
import edu.isi.pegasus.planner.catalog.transformation.TransformationCatalogEntry;
import edu.isi.pegasus.planner.cluster.JobAggregator;
import edu.isi.pegasus.planner.namespace.ENV;
import edu.isi.pegasus.planner.partitioner.graph.GraphNode;
import edu.isi.pegasus.common.util.Boolean;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * This enables a constituentJob to be run on the grid, by launching it through kickstart.
 * The kickstart executable is a light-weight program which connects the
 * stdin, stdout and stderr filehandles for Pegasus jobs on the remote
 * site.
 * <p>
 * Sitting in between the remote scheduler and the executable, it is
 * possible for kickstart to gather additional information about the
 * executable run-time behavior, including the exit status of jobs.
 * <p>
 * Kickstart is an executable distributed with Pegasus that can generally be found
 * at $PEGASUS_HOME/bin/kickstart
 *
 * @author Karan Vahi vahi@isi.edu
 * @version $Revision$
 */
public class Kickstart implements GridStart {

    /**
     * The transformation namespace for the kickstart
     */
    public static final String TRANSFORMATION_NAMESPACE = "pegasus";

    /**
     * The logical name of kickstart
     */
    public static final String TRANSFORMATION_NAME = "kickstart";

    /**
     * The version number for kickstart.
     */
    public static final String TRANSFORMATION_VERSION = null;

    /**
     * The basename of the kickstart executable.
     */
    public static final String EXECUTABLE_BASENAME = "pegasus-kickstart";

    /**
     * The complete TC name for kickstart.
     */
    public static final String COMPLETE_TRANSFORMATION_NAME = Separator.combine(
                                                                 TRANSFORMATION_NAMESPACE,
                                                                 TRANSFORMATION_NAME,
                                                                 TRANSFORMATION_VERSION  );

    /**
     * The suffix for the kickstart input file, that is generated to use
     * invoke at the remote end.
     */
    public static final String KICKSTART_INPUT_SUFFIX = "arg";

    /**
     * The basename of the class that is implementing this. Could have
     * been determined by reflection.
     */
    public static final String CLASSNAME = "Kickstart";

    /**
     * The SHORTNAME for this implementation.
     */
    public static final String SHORT_NAME = "kickstart";

    /**
     * The environment variable used to the set Kickstart SETUP JOB.
     */
    public static final String KICKSTART_SETUP = "GRIDSTART_SETUP";

    /**
     * The environment variable used to the set Kickstart PREJOB.
     */
    public static final String KICKSTART_PREJOB = "GRIDSTART_PREJOB";

    /**
     * The environment variable used to the set Kickstart POSTJOB.
     */
    public static final String KICKSTART_POSTJOB = "GRIDSTART_POSTJOB";

    /**
     * The environment variable used to the set Kickstart CLEANUP JOB.
     */
    public static final String KICKSTART_CLEANUP = "GRIDSTART_CLEANUP";

    /**
     * The LogManager object which is used to log all the messages.
     */
    private LogManager mLogger;

    /**
     * The object holding all the properties pertaining to Pegasus.
     */
    private PegasusProperties mProps;

    /**
     * The options passed to the planner.
     */
    private PlannerOptions mPOptions;

    /**
     * The handle to the workflow that is being enabled.
     */
    private ADag mConcDAG;

    /**
     * Handle to the site catalog store.
     */
    private SiteStore mSiteStore;
    //private PoolInfoProvider mSiteHandle;

    /**
     * Handle to Transformation Catalog.
     */
    private TransformationCatalog mTCHandle;

    /**
     * The submit directory where the submit files are being generated for
     * the workflow.
     */
    private String mSubmitDir;

    /**
     * A boolean indicating whether to use invoke always or not.
     */
    private boolean mInvokeAlways;

    /**
     * A boolean indicating whether to stat files or not.
     */
    private boolean mDoStat;

    /**
     * A boolean indicating whether to generate lof files or not.
     */
    private boolean mGenerateLOF;

    /**
     * The invoke limit trigger.
*/
    private long mInvokeLength;

    /**
     * A flag to indicate if outputs are being registered or not
     */
    private boolean mRegisterOutputs;

    /**
     * handle to PegasusConfiguration
     */
    private PegasusConfiguration mPegasusConfiguration;

    /**
     * The handle to the SLS implementor.
     * NOTE(review): never assigned in initialize() in this chunk of the file —
     * confirm where mSLS is set before getWorkerNodeDirectory() dereferences it.
     */
    private SLS mSLS;

    /**
     * An instance variable to track if enabling is happening as part of a clustered constituentJob.
     * See Bug 21 comments on Pegasus Bugzilla
     */
    private boolean mEnablingPartOfAggregatedJob;

    /**
     * A boolean indicating whether kickstart is deployed dynamically or not.
     */
    private boolean mDynamicDeployment;

    /**
     * The label that is passed to kickstart.
     */
    private String mKickstartLabel;

    /**
     * Whether kickstart should set the X Bit on the staged executables.
     */
    private boolean mSetXBit;

    /**
     * Handle to NoGridStart implementation.
     */
    private GridStart mNoGridStartImpl;

    /**
     * Boolean indicating whether to use full path or not
     */
    private boolean mUseFullPathToGridStart;

    /**
     * Boolean indicating whether to disable invoke functionality.
     */
    private boolean mDisableInvokeFunctionality;

    // true when the user explicitly turned stat off (see initialize, PM-1060)
    private boolean mDisableKickstartStatCompletely;

    /**
     * whether integrity checking is turned on or not
     */
    private boolean mIntegrityCheckingOn ;

    /**
     * Initializes the GridStart implementation.
     *
     * @param bag the bag of objects that is used for initialization.
     * @param dag the concrete dag so far.
     */
    public void initialize( PegasusBag bag, ADag dag ){
        mProps        = bag.getPegasusProperties();
        mPOptions     = bag.getPlannerOptions();
        mLogger       = bag.getLogger();
        mSubmitDir    = mPOptions.getSubmitDirectory();
        // kickstart label: basename prefix if given, else the dag label (null when no dag)
        mKickstartLabel = ( dag == null ) ? null :
                          ( mPOptions.getBasenamePrefix() == null )? dag.getLabel(): mPOptions.getBasenamePrefix() ;
        mInvokeAlways = mProps.useInvokeInGridStart();
        mInvokeLength = mProps.getGridStartInvokeLength();
        mGenerateLOF  = mProps.generateLOFFiles();
        mIntegrityCheckingOn = mProps.doIntegrityChecking();
        mConcDAG      = dag;
        mSiteStore    = bag.getHandleToSiteStore();
        mTCHandle     = bag.getHandleToTransformationCatalog();

        mDynamicDeployment = mProps.transferWorkerPackage();
        //mWorkerNodeExecution = mProps.executeOnWorkerNode();
        mPegasusConfiguration = new PegasusConfiguration( bag.getLogger() );

        mEnablingPartOfAggregatedJob = false;
        mSetXBit = mProps.setXBitWithKickstart();

        // the aggregated (clustered) job itself is enabled via NoGridStart
        mNoGridStartImpl = new NoGridStart();
        mNoGridStartImpl.initialize( bag, dag );
        mUseFullPathToGridStart = true;
        mDisableInvokeFunctionality = mProps.disableInvokeInGridStart();

        //PM-1060 we set stat based on whether a user
        //has specified a value or not
        String value = mProps.doStatWithKickstart();
        mRegisterOutputs = mProps.createRegistrationJobs();
        mDisableKickstartStatCompletely = false;
        if( value == null ){
            //stat is disabled unless there is registration job
            mDoStat = false;
        }
        else{
            //user specified a stat value .
            //note: edu.isi.pegasus.common.util.Boolean, not java.lang.Boolean
            mDoStat = Boolean.parse( value, false );
            mDisableKickstartStatCompletely = !mDoStat;
        }
        mLogger.log( "Kickstart Stating Disabled Completely - " + mDisableKickstartStatCompletely,
                     LogManager.CONFIG_MESSAGE_LEVEL );
    }

    /**
     * Setter method to control whether a full path to Gridstart should be
     * returned while wrapping a job or not.
     *
     * @param fullPath if set to true, indicates that full path would be used.
     */
    public void useFullPathToGridStarts( boolean fullPath ){
        this.mUseFullPathToGridStart = fullPath;
    }

    /**
     * Enables a constituentJob to run on the grid. This also determines how the
     * stdin,stderr and stdout of the constituentJob are to be propagated.
     * To grid enable a constituentJob, the constituentJob may need to be wrapped into another
     * constituentJob, that actually launches the constituentJob.
* It usually results in the constituentJob
     * description passed being modified.
     *
     * @param constituentJob the <code>Job</code> object containing the constituentJob description
     *                       of the constituentJob that has to be enabled on the grid.
     * @param isGlobusJob is <code>true</code>, if the constituentJob generated a
     *        line <code>universe = globus</code>, and thus runs remotely.
     *        Set to <code>false</code>, if the constituentJob runs on the submit
     *        host in any way.
     *
     * @return boolean true if enabling was successful,else false.
     */
    public boolean enable( AggregatedJob job,boolean isGlobusJob){
        //PM-817 when the recursion first starts parameter first is true
        return this.enable(job, isGlobusJob, true);
    }

    /**
     * Enables a constituentJob to run on the grid. This also determines how the
     * stdin,stderr and stdout of the constituentJob are to be propagated.
     * To grid enable a constituentJob, the constituentJob may need to be wrapped into another
     * constituentJob, that actually launches the constituentJob. It usually results in the
     * constituentJob description passed being modified.
     *
     * @param job the clustered job whose constituents are to be enabled
     * @param isGlobusJob is <code>true</code>, if the constituentJob generated a
     *        line <code>universe = globus</code>, and thus runs remotely.
     *        Set to <code>false</code>, if the constituentJob runs on the submit
     *        host in any way.
     * @param first true only for the very first constituent in the recursion;
     *        all later constituents get -H to suppress the kickstart header
     *
     * @return boolean true if enabling was successful,else false.
     */
    public boolean enable( AggregatedJob job, boolean isGlobusJob, boolean first){
        //boolean first = true;

        //get hold of the JobAggregator determined for this clustered job
        //during clustering
        JobAggregator aggregator = job.getJobAggregator();
        if( aggregator == null ){
            throw new RuntimeException( "Clustered job not associated with a job aggregator " + job.getID() );
        }

        boolean partOfClusteredJob = true;

        //we want to evaluate the directory only once
        //for the clustered job
        for (Iterator it = aggregator.topologicalOrderingRequired() ?
                              job.topologicalSortIterator()://PM-817 we care about order, else -H option maynot be omitted always for first job
                              job.nodeIterator();
             it.hasNext(); ) {
            //PM-817Job constituentJob = (Job)it.next();
            GraphNode node = ( GraphNode )it.next();
            Job constituentJob = (Job) node.getContent();

            if( constituentJob instanceof AggregatedJob ){
                //PM-817 we need to make sure that the constituent
                //clustered job also gets enabled correctly
                AggregatedJob constituentClusteredJob = (AggregatedJob)constituentJob;

                //sanity check first to ensure the aggregators are not mixed
                if( !aggregator.getClass().equals( constituentClusteredJob.getJobAggregator().getClass() ) ){
                    StringBuilder error = new StringBuilder();
                    error.append( "Recursive Clustering does not support different job aggregators. Job Aggregator for clustered job " ).
                          append( job.getID() ).append( " ").append( aggregator.getClass() ).
                          append( " does not match with constitutent job " ).append( constituentJob.getID() ).
                          append( " " ).append( constituentClusteredJob.getJobAggregator().getClass() );
                    throw new RuntimeException( error.toString() );
                }
                //recurse into the nested clustered job
                this.enable( constituentClusteredJob, isGlobusJob, first );
            }

            //earlier was set in SeqExec JobAggregator in the enable function
            constituentJob.vdsNS.construct( Pegasus.GRIDSTART_KEY, this.getVDSKeyValue() );

            if (first) {
                first = false;
            }
            else {
                //we need to pass -H to kickstart
                //to suppress the header creation
                // PM-823 Add -H to pegasus.gridstart.arguments if it is already set
                Pegasus jobconf = constituentJob.vdsNS;
                if (jobconf.containsKey(Pegasus.GRIDSTART_ARGUMENTS_KEY)) {
                    String args = jobconf.getStringValue(Pegasus.GRIDSTART_ARGUMENTS_KEY);
                    jobconf.construct(Pegasus.GRIDSTART_ARGUMENTS_KEY, args + " -H");
                } else {
                    jobconf.construct(Pegasus.GRIDSTART_ARGUMENTS_KEY, "-H");
                }
            }

            //no worker node case
            //always pass isGlobus true as always
            //interested only in executable strargs
            //due to the fact that seqexec does not allow for setting environment
            //per constituent constituentJob, we cannot set the postscript removal option
            this.enable( constituentJob, isGlobusJob, mDoStat, false, partOfClusteredJob );

            //PM-1021 add any of the lof files (and any other) that maybe transferred via condor io
            job.condorVariables.addIPFileForTransfer( constituentJob.condorVariables.getIPFilesForTransfer() );
            //System.out.println( constituentJob.condorVariables );
        }

        //all the constituent jobs are enabled.
        //get the job aggregator to render the job
        //to it's executable form
        aggregator.makeAbstractAggregatedJobConcrete( job );

        //the aggregated job itself needs to be enabled via NoGridStart
        mNoGridStartImpl.enable( (Job)job, isGlobusJob);

        return true;
    }

    /**
     * Enables a constituentJob to run on the grid by launching it through kickstart.
     * Does the stdio, and stderr handling of the constituentJob to be run on the grid.
     * It modifies the constituentJob description, and also constructs all the valid
     * option to be passed to kickstart for launching the executable.
     *
     * @param constituentJob the <code>Job</code> object containing the constituentJob description
     *                       of the constituentJob that has to be enabled on the grid.
     * @param isGlobusJob is <code>true</code>, if the constituentJob generated a
     *        line <code>universe = globus</code>, and thus runs remotely.
     *        Set to <code>false</code>, if the constituentJob runs on the submit
     *        host in any way.
     *
     * @return boolean true if enabling was successful,else false in case when
     *         the path to kickstart could not be determined on the site where
     *         the constituentJob is scheduled.
     */
    public boolean enable( Job job, boolean isGlobusJob ){
        return this.enable( job, isGlobusJob, mDoStat , true, false );
    }

    /**
     * Enables a constituentJob to run on the grid by launching it through kickstart.
     * Does the stdio, and stderr handling of the constituentJob to be run on the grid.
     * It modifies the constituentJob description, and also constructs all the valid
     * option to be passed to kickstart for launching the executable.
* @param constituentJob the <code>Job</code> object containing the constituentJob description
     *                       of the constituentJob that has to be enabled on the grid.
     * @param isGlobusJob is <code>true</code>, if the constituentJob generated a
     *        line <code>universe = globus</code>, and thus runs remotely.
     *        Set to <code>false</code>, if the constituentJob runs on the submit
     *        host in any way.
     * @param stat boolean indicating whether to generate the lof files
     *             for kickstart stat option or not.
     * @param addPostScript boolean indicating whether to add a postscript or not.
     * @param partOfClusteredJob boolean indicating whether the job being enabled
     *                           is part of a clustered job or not.
     *
     * @return boolean true if enabling was successful,else false in case when
     *         the path to kickstart could not be determined on the site where
     *         the constituentJob is scheduled.
     */
    protected boolean enable( Job job, boolean isGlobusJob, boolean stat, boolean addPostScript , boolean partOfClusteredJob) {

        //take care of relative submit directory if specified.
        //NOTE(review): mSeparator is not declared in this chunk of the file —
        //confirm it is defined elsewhere in the class.
        String submitDir = mSubmitDir + mSeparator;
//        String submitDir = getSubmitDirectory( mSubmitDir , constituentJob) + mSeparator;

        //To get the gridstart/kickstart path on the remote
        //pool, querying with entry for vanilla universe.
        //In the new format the gridstart is associated with the
        //pool not pool, condor universe
        SiteCatalogEntry site = mSiteStore.lookup( job.getSiteHandle() );

        //the executable path and arguments are put
        //in the Condor namespace and not printed to the
        //file so that they can be overriden if desired
        //later through profiles and key transfer_executable
        String gridStartPath = handleTransferOfExecutable( job, getKickstartPath( site ) );

        //sanity check
        if (gridStartPath == null){
            return false;
        }

        StringBuffer gridStartArgs = new StringBuffer();
        // the executable is gridstart, the application becomes its argument
        gridStartArgs.append(' ');
        gridStartArgs.append("-n ");
        gridStartArgs.append(job.getCompleteTCName());
        gridStartArgs.append(' ');

        //for derivation we now pass the logical id in the DAX
        //for the job JIRA PM-329
        gridStartArgs.append("-N ").append( job.getDAXID() ).append( " " );

        // handle stdin
        if (job.stdIn.length() > 0) {
            //for using the transfer script and other vds executables the
            //input file is transferred from the submit host by Condor to
            //stdin. We fool the kickstart to pick up the input file from
            //standard stdin by giving the input file name as -
            if (job.logicalName.equals( edu.isi.pegasus.planner.transfer.implementation.Transfer.TRANSFORMATION_NAME)
                || job.logicalName.equals(edu.isi.pegasus.planner.refiner.cleanup.Cleanup.TRANSFORMATION_NAME )
                || job.logicalName.equals( edu.isi.pegasus.planner.refiner.createdir.DefaultImplementation.TRANSFORMATION_NAME )
                || job.logicalName.equals(edu.isi.pegasus.planner.cluster.aggregator.SeqExec.COLLAPSE_LOGICAL_NAME)
                || job.logicalName.equals(edu.isi.pegasus.planner.cluster.aggregator.MPIExec.COLLAPSE_LOGICAL_NAME) ) {

                //condor needs to pick up the constituentJob stdin and
                //transfer it to the remote end
                construct( job, "input" , job.getFileFullPath( submitDir, ".in") );
                gridStartArgs.append("-i ").append("-").append(' ');

            } else {
                //kickstart provides the app's *tracked* stdin
                gridStartArgs.append("-i ").append(job.stdIn).append(' ');
            }
        }

        // handle stdout
        if (job.stdOut.length() > 0) {
            // gridstart saves the app's *tracked* stdout
            gridStartArgs.append("-o ").append(job.stdOut).append(' ');
        }

        // the Condor output variable and kickstart -o option
        // must not point to the same file for any local constituentJob.
        if (job.stdOut.equals(job.jobName + ".out") && !isGlobusJob) {
            mLogger.log("Detected WAW conflict for stdout",LogManager.WARNING_MESSAGE_LEVEL);
        }
        // the output of gridstart is propagated back to the submit host
        construct(job,"output", job.getFileFullPath( submitDir, ".out") );

        if (isGlobusJob) {
            construct(job,"transfer_output","true");
        }

        // handle stderr
        if (job.stdErr.length() > 0) {
            // gridstart saves the app's *tracked* stderr
            gridStartArgs.append("-e ").append(job.stdErr).append(' ');
        }

        // the Condor error variable and kickstart -e option
        // must not point to the same file for any local constituentJob.
        if (job.stdErr.equals(job.jobName + ".err") && !isGlobusJob) {
            mLogger.log("Detected WAW conflict for stderr",LogManager.WARNING_MESSAGE_LEVEL);
        }
        // the error from gridstart is propagated back to the submit host
        construct(job,"error",job.getFileFullPath( submitDir, ".err"));

        if (isGlobusJob) {
            construct(job,"transfer_error","true");
        }

        //we need to pass the resource handle
        //to kickstart as argument
        gridStartArgs.append("-R ").append(job.executionPool).append(' ');

        //Added for JIRA PM-543
        String directory = this.getDirectory( job );
        boolean setScratchEnvVariable = false;

        //handle the -W option that asks kickstart to create and change
        //directory before launching an executable.
        if(job.vdsNS.getBooleanValue(Pegasus.CREATE_AND_CHANGE_DIR_KEY ) ){
            //pass the directory as an argument to kickstart
            gridStartArgs.append(" -W ").append(directory).append(' ');
            setScratchEnvVariable = true;
        }
        //handle the -w option that asks kickstart to change
        //directory before launching an executable.
        else if(job.vdsNS.getBooleanValue(Pegasus.CHANGE_DIR_KEY) ){
            gridStartArgs.append(" -w ").append( directory ).append(' ');
            setScratchEnvVariable = true;
        }
        else{
            //set the directory key with the job
            //for kickstart -w and -W it is not set
            if( requiresToSetDirectory( job ) ){
                job.setDirectory( directory );
            }
        }

        //PM-961 set the Pegasus scratch dir only for -w and -W cases
        //for rest we associate them in the styles
        if( setScratchEnvVariable ){
            job.envVariables.construct( ENV.PEGASUS_SCRATCH_DIR_KEY, directory );
        }

        if( job.vdsNS.getBooleanValue(Pegasus.TRANSFER_PROXY_KEY) ){
            job.setDirectory( null );
        }

        //check if the constituentJob type indicates staging of executable
        //The -X functionality is handled by the setup jobs that
        //are added as childern to the stage in jobs, unless they are
        //disabled and users set a property to set the xbit
        //Karan November 22, 2005
        if( mSetXBit && job.userExecutablesStagedForJob()  ){
            //add the -X flag to denote turning on
            gridStartArgs.append( " -X " );
        }

        String statArgs = generateStatArgumentOptions( job, stat, mRegisterOutputs, addPostScript, mIntegrityCheckingOn );
        if( !statArgs.isEmpty() ){
            gridStartArgs.append( statArgs );
        }
        else if( mGenerateLOF ){
            //dostat is false. so no generation of stat option
            //but generate lof files nevertheless

            //inefficient check here again. just a prototype
            //we need to generate -S option only for non transfer jobs
            //generate the list of filenames file for the input and output files.
            if (! (job instanceof TransferJob)) {
                generateListofFilenamesFile( job.getInputFiles(), job, ".in.lof");
            }

            //for cleanup jobs no generation of stats for output files
            if (job.getJobType() != Job.CLEANUP_JOB) {
                generateListofFilenamesFile(job.getOutputFiles(), job, ".out.lof");
            }
        }///end of mGenerateLOF

        //append any arguments that need to be passed
        //kickstart directly, set elsewhere
        if(job.vdsNS.containsKey(Pegasus.GRIDSTART_ARGUMENTS_KEY)){
            gridStartArgs.append(job.vdsNS.get(Pegasus.GRIDSTART_ARGUMENTS_KEY))
                         .append(' ');
        }

        if(mProps.generateKickstartExtraOptions() && mConcDAG != null){
            gridStartArgs.append("-L ").append( mKickstartLabel ).append(" ");
            gridStartArgs.append("-T ").append(mConcDAG.getMTime()).append(" ");
        }

        gridStartArgs.append( getKickstartTimeoutOptions( job ) );

        /*
        mLogger.log( "User executables staged for job " + job.getID() + " " + job.userExecutablesStagedForJob() ,
                     LogManager.DEBUG_MESSAGE_LEVEL );
        */

        //figure out job executable
        String jobExecutable = ( !this.mUseFullPathToGridStart && job.userExecutablesStagedForJob() )?
                                //the basename of the executable used for pegasus lite
                                //and staging of executables
                                "." + File.separator + job.getStagedExecutableBaseName( ):
                                //use whatever is set in the executable field
                                job.executable;

        long argumentLength = gridStartArgs.length() +
                              jobExecutable.length() +
                              1 +
                              job.strargs.length();

        //invoke is disabled if part of clustered job or because of a global disable
        //JIRA PM-526
        boolean disableInvoke = mDisableInvokeFunctionality ||
                                partOfClusteredJob ||
                                job.getJobType() != Job.COMPUTE_JOB; //PM-851

        if( !disableInvoke && (mInvokeAlways || argumentLength > mInvokeLength) ){
            if(!useInvoke(job, jobExecutable, gridStartArgs)){
                mLogger.log("Unable to use invoke for job ",
                            LogManager.ERROR_MESSAGE_LEVEL);
                return false;
            }
        }
        else{
            gridStartArgs.append( jobExecutable );
            gridStartArgs.append(' ').append(job.strargs);
        }

        //the executable path and arguments are put
        //in the Condor namespace and not printed to the
        //file so that they can be overriden if desired
        //later through profiles and key transfer_executable

        // the arguments are no longer set as condor profiles
        // they are now set to the corresponding profiles in
        // the Condor Code Generator only.
        /*
        construct(job, "executable", gridStartPath );
        construct(job, "arguments", gridStartArgs.toString());
        */
        job.setArguments( gridStartArgs.toString() );
        job.setRemoteExecutable( gridStartPath );

        //all finished successfully
        return true;
    }

    /**
     * It changes the paths to the executable depending on whether we want to
     * transfer the executable or not.
     *
     * If the transfer_executable is set to true, then the executable needs to be
     * shipped from the submit host meaning the local pool. This function changes
     * the path of the executable to the one on the local pool, so that it can
     * be shipped.
     *
     * If the worker package is being deployed dynamically, then the path is set
     * to the directory where the worker package is deployed.
     *
     * Else, we pick up the path from the site catalog that is passed as input
     *
     * @param constituentJob the <code>Job</code> containing the constituentJob description.
* @param path the path to kickstart on the remote compute site, as determined * from the site catalog. * * @return the path that needs to be set as the executable */ protected String handleTransferOfExecutable( Job job, String path ) { Condor cvar = job.condorVariables; if ( cvar.getBooleanValue("transfer_executable")) { SiteCatalogEntry site = mSiteStore.lookup( "local" ); TransformationCatalogEntry entry = this.getTransformationCatalogEntry( site.getSiteHandle() ); String gridStartPath = ( entry == null )? //rely on the path determined from sc getKickstartPath( site ): //the tc entry has highest priority entry.getPhysicalTransformation(); if (gridStartPath == null) { mLogger.log( "Gridstart needs to be shipped from the submit host to pool" + job.getSiteHandle() + ".\nNo entry for it in pool local", LogManager.ERROR_MESSAGE_LEVEL); throw new RuntimeException( "GridStart needs to be shipped from submit host to site " + job.getSiteHandle() + " for job " + job.getName()); } return gridStartPath; } else if( mDynamicDeployment && job.runInWorkDirectory() && ! mPegasusConfiguration.jobSetupForWorkerNodeExecution(job ) ){ //worker package deployment for sharedfs //pick up the path from the transformation catalog of //dynamic deployment //in case of pegasus lite mode, we dont look up here. TransformationCatalogEntry entry = this.getTransformationCatalogEntry( job.getSiteHandle() ); if( entry == null ){ //NOW THROWN AN EXCEPTION //should throw a TC specific exception StringBuffer error = new StringBuffer(); error.append("Could not find entry in tc for lfn "). append( COMPLETE_TRANSFORMATION_NAME ). 
append(" at site ").append( job.getSiteHandle() ); if ( job.getSiteHandle().equalsIgnoreCase( "local" ) ){ //for local site in case of worker package staging also //we can pick up the path on submit host, if not staged //PM-497 SiteCatalogEntry site = mSiteStore.lookup( "local" ); String p = this.getKickstartPath( site ); if( p != null ){ return p; } } mLogger.log( error.toString(), LogManager.ERROR_MESSAGE_LEVEL); throw new RuntimeException( error.toString() ); } return entry.getPhysicalTransformation(); } else{ //the vanilla case where kickstart is pre installed. TransformationCatalogEntry entry = this.getTransformationCatalogEntry( job.getSiteHandle() ); String ksProfilePath = (String)job.vdsNS.get( Pegasus.GRIDSTART_PATH_KEY ); String ksPath = ( entry == null )? //rely on the path determined from profiles ksProfilePath: //the tc entry has highest priority entry.getPhysicalTransformation(); //we use full paths for pegasus auxillary jobs //even when pegasus lite is used i.e mUseFullPathToGridStart is set to true boolean useFullPath = mUseFullPathToGridStart || job.getJobType() != Job.COMPUTE_JOB ; if( useFullPath ){ ksPath = ( ksPath == null )? //rely on the path from the site catalog path: ksPath; } else{ //pegasus lite case. we dont want to rely on site catalog //constructed path /* commented out for PM-1097 ksPath = ( ksPath == null )? this.EXECUTABLE_BASENAME ://use the basename ksPath; */ if ( ksPath == null ){ ksPath = this.EXECUTABLE_BASENAME ;//use the basename } else{ //PM-1097 check again to see if user had different gs profile set if ( ksProfilePath != null ){ //we prefer the kickstart path as determined from profile //only for PegasusLite case. ksPath = ksProfilePath; } } } //sanity check if( ksPath == null ){ StringBuilder error = new StringBuilder(); error.append( "Unable to determine path to kickstart for site " ).append( job.getSiteHandle()). 
append( " for job " ).append( job.getID() ); if( path == null ){ //we know there was no path determined from site catalog. error.append( " . " ).append( "Make sure PEGASUS_HOME is set as an env profile in the site catalog for site " ). append( job.getSiteHandle() ); } throw new RuntimeException( error.toString() ); } return ksPath; } } /** * Returns the transformation catalog entry for kickstart on a site * * @param site the site on which the entry is required * * @return the entry if found else null */ public TransformationCatalogEntry getTransformationCatalogEntry( String site ){ List entries = null; try { entries = mTCHandle.lookup( Kickstart.TRANSFORMATION_NAMESPACE, Kickstart.TRANSFORMATION_NAME, Kickstart.TRANSFORMATION_VERSION, site, TCType.INSTALLED ); } catch (Exception e) { //non sensical catching mLogger.log("Unable to retrieve entries from TC " + e.getMessage(), LogManager.DEBUG_MESSAGE_LEVEL); } return ( entries == null ) ? null : (TransformationCatalogEntry) entries.get(0); } /** * Returns the default path to kickstart as constructed from the * environment variable associated with a site in the site catalog * * @param site the SiteCatalogEntry object for the site. * * @return value if set else null */ public String getKickstartPath( SiteCatalogEntry site ) { //try to construct the default path on basis of //PEGASUS_HOME environment variable. String home = site.getPegasusHome(); if( home == null ){ return null; } StringBuffer ks = new StringBuffer(); ks.append( home ).append( File.separator ). append( "bin").append( File.separator ). append( Kickstart.EXECUTABLE_BASENAME ); return ks.toString(); } /** * Returns the exectionSiteDirectory in which the constituentJob executes on the worker node. 
* * * @param constituentJob * * @return the full path to the exectionSiteDirectory where the constituentJob executes */ public String getWorkerNodeDirectory( Job job ){ //check for Pegasus Profile if( job.vdsNS.containsKey( Pegasus.WORKER_NODE_DIRECTORY_KEY ) ){ return job.vdsNS.getStringValue( Pegasus.WORKER_NODE_DIRECTORY_KEY ); } if( mSLS.doesCondorModifications() ){ //indicates the worker node exectionSiteDirectory is the exectionSiteDirectory //in which condor launches the job // JIRA PM-380 return "."; } StringBuffer workerNodeDir = new StringBuffer(); String destDir = mSiteStore.getEnvironmentVariable( job.getSiteHandle() , "wntmp" ); destDir = ( destDir == null ) ? "/tmp" : destDir; String relativeDir = mPOptions.getRelativeDirectory(); workerNodeDir.append( destDir ).append( File.separator ). append( relativeDir.replaceAll( "/" , "-" ) ). //append( File.separator ).append( constituentJob.getCompleteTCName().replaceAll( ":[:]*", "-") ); append( "-" ).append( job.getID() ); return workerNodeDir.toString(); } /** * Indicates whether the enabling mechanism can set the X bit * on the executable on the remote grid site, in addition to launching * it on the remote grid site. * * @return true indicating Kickstart can set the X bit or not. */ public boolean canSetXBit(){ return true; } /** * Returns the value of the vds profile with key as Pegasus.GRIDSTART_KEY, * that would result in the loading of this particular implementation. * It is usually the name of the implementing class without the * package name. * * @return the value of the profile key. * @see org.griphyn.cPlanner.namespace.Pegasus#GRIDSTART_KEY */ public String getVDSKeyValue(){ return Kickstart.CLASSNAME; } /** * Returns a short textual description in the form of the name of the class. * * @return short textual description. 
*/ public String shortDescribe(){ return Kickstart.SHORT_NAME; } /** * Returns the SHORT_NAME for the POSTScript implementation that is used * to be as default with this GridStart implementation. * * @return the identifier for the PegasusExitCode POSTScript implementation. * * */ public String defaultPOSTScript(){ return PegasusExitCode.SHORT_NAME; } /** * Returns a boolean indicating whether we need to set the directory for * the job or not. * * @param job the job for which to set directory. * * @return */ protected boolean requiresToSetDirectory( Job job ) { //the cleanup jobs should never have directory set as full path //is specified return ( job.getJobType() != Job.CLEANUP_JOB && job.getJobType() != Job.REPLICA_REG_JOB ); } /** * Returns the directory in which the job should run. * * @param job the job in which the directory has to run. * * @return */ protected String getDirectory( Job job ){ String execSiteWorkDir = mSiteStore.getInternalWorkDirectory(job); String workdir = (String) job.globusRSL.removeKey("directory"); // returns old value workdir = (workdir == null)?execSiteWorkDir:workdir; return workdir; } /** * Triggers the creation of the kickstart input file, that contains the * the remote executable and the arguments with which it has to be invoked. * The kickstart input file is created in the submit directory. * * @param constituentJob the <code>Job</code> object containing the constituentJob description. * @param executable the path to the executable used. * @param args the arguments buffer for gridstart invocation so far. * * @return boolean indicating whether kickstart input file was generated or not. * false in case of any error. */ private boolean useInvoke(Job job, String executable, StringBuffer args){ boolean result = true; String inputBaseName = job.jobName + "." 
+ Kickstart.KICKSTART_INPUT_SUFFIX; //writing the stdin file File argFile = new File(mSubmitDir, inputBaseName); try { FileWriter input; input = new FileWriter( argFile ); //the first thing that goes in is the executable name input.write( executable ); input.write("\n"); //write out all the arguments //one on each line StringTokenizer st = new StringTokenizer(job.strargs); while(st.hasMoreTokens()){ input.write(st.nextToken()); input.write("\n"); } //close the stream input.close(); } catch (Exception e) { mLogger.log("Unable to write the kickstart input file for job " + job.getCompleteTCName() + " " + e.getMessage(), LogManager.ERROR_MESSAGE_LEVEL); return false; } //check if a directory is associated with the job String directory = job.getDirectory(); if( directory != null ){ //for PM-526 //we want to trigger the -w option if a directory is associated with //the jobs args.append( " -w " ).append( directory ); job.setDirectory( null ); } job.condorVariables.addIPFileForTransfer( argFile.getAbsolutePath() ); //add the -I argument to kickstart args.append(" -I ").append(inputBaseName).append(" "); return result; } /** * Constructs a kickstart setup constituentJob * * @param constituentJob the constituentJob to be run. * @param workerNodeTmp the worker node tmp to run the constituentJob in. * * @return String */ protected String constructSetupJob( Job job, String workerNodeTmp ){ StringBuffer setup = new StringBuffer(); setup.append( "/bin/mkdir -p " ).append( workerNodeTmp ); return setup.toString(); } /** * Constructs a kickstart setup constituentJob * * @param constituentJob the constituentJob to be run. * @param workerNodeTmp the worker node tmp to run the constituentJob in. 
* * @return String */ protected String constructCleanupJob( Job job, String workerNodeTmp ){ StringBuffer setup = new StringBuffer(); setup.append( "/bin/rm -rf " ).append( workerNodeTmp ); return setup.toString(); } /** * Constructs the prejob that fetches sls file, and then invokes transfer * again. * * @param constituentJob the constituentJob for which the prejob is being created * @param headNodeURLPrefix String * @param headNodeDirectory String * @param workerNodeDirectory String * @param slsFile String * * @return String containing the prescript invocation */ protected String constructPREJob( Job job, String headNodeURLPrefix, String headNodeDirectory, String workerNodeDirectory, String slsFile ){ File headNodeSLS = new File( headNodeDirectory, slsFile ); return mSLS.invocationString( job, headNodeSLS ); //first we need to get the sls file to worker node /* preJob.append( "/bin/echo -e \" " ). append( headNodeURLPrefix ).append( File.separator ). append( headNodeDirectory ).append( File.separator ). append( slsFile ).append( " \\n " ). append( "file://" ).append( workerNodeDirectory ).append( File.separator ). append( slsFile ).append( "\"" ). append( " | " ).append( transfer ).append( " base mnt " ); preJob.append( " && " ); //now we need to get transfer to execute this sls file preJob.append( transfer ).append( " base mnt < " ).append( slsFile ); */ } /** * Returns the stat argument options to be appended for kick * * @param job * @param stat * @param registerOutputs * @param addPostScript * @param integrityChecksOn * @return */ protected String generateStatArgumentOptions(Job job, boolean stat, boolean registerOutputs, boolean addPostScript, boolean integrityChecksOn ) { //sanity check if ( !( stat || registerOutputs || integrityChecksOn) || this.mDisableKickstartStatCompletely){ return ""; } StringBuilder args = new StringBuilder(); //PM-992 we stat outputs either if stat property is set //or registration is enabled. 
inputs are only stated if //stat property is turned on. if ( stat || registerOutputs || integrityChecksOn){ //add the stat options to kickstart only for certain jobs for time being //and if the input variable is true if (job.getJobType() == Job.COMPUTE_JOB || // job.getJobType() == Job.STAGED_COMPUTE_JOB || job.getJobType() == Job.CLEANUP_JOB || job.getJobType() == Job.STAGE_IN_JOB || job.getJobType() == Job.INTER_POOL_JOB) { String lof; List files = new ArrayList(2); //inefficient check here again. just a prototype //we need to generate -S option only for non transfer jobs //generate the list of filenames file for the input and output files. if (! (job instanceof TransferJob) && stat ) { lof = generateListofFilenamesFile(job.getInputFiles(), job, ".in.lof"); if (lof != null) { File file = new File(lof); job.condorVariables.addIPFileForTransfer(lof); //arguments just need basename . no path component args.append(" -S @").append(file.getName()). append(" "); files.add(file.getName()); } } if( stat ) { //for cleanup jobs no generation of stats for output files if (job.getJobType() != Job.CLEANUP_JOB) { lof = generateListofFilenamesFile( job.getOutputFiles(), job, ".out.lof"); if (lof != null) { File file = new File(lof); job.condorVariables.addIPFileForTransfer(lof); //arguments just need basename . no path component args.append(" -s @").append(file.getName()).append(" "); files.add(file.getName()); } } } else if( registerOutputs || integrityChecksOn ){ //PM-992 we generate lfn=pfn -s options on command line //for files that need to be registered if( job.getJobType() == Job.COMPUTE_JOB ){ for( PegasusFile file : job.getOutputFiles() ){ //going forward we should remove this check completely. if( file.getRegisterFlag() || integrityChecksOn){ args.append( " -s " ).append( file.getLFN() ). 
append( "=" ).append( file.getLFN() ); } } } } //add kickstart postscript that removes these files if( addPostScript ) { addCleanupPostScript(job, files); } } } args.append( " " ); return args.toString(); } /** * Writes out the list of filenames file for the job. * * @param files the list of <code>PegasusFile</code> objects contains the files * whose stat information is required. * @param job the job * @param suffix the suffix to be applied to files * * @return the full path to lof file created, else null if no file is written out. */ protected String generateListofFilenamesFile( Set files, Job job, String suffix ){ //sanity check if ( files == null || files.isEmpty() ){ return null; } String result = null; //writing the stdin file try { File f = new File( job.getFileFullPath(mSubmitDir, suffix) ); FileWriter input; input = new FileWriter( f ); PegasusFile pf; for( Iterator it = files.iterator(); it.hasNext(); ){ pf = ( PegasusFile ) it.next(); String lfn= pf.getLFN(); StringBuilder sb = new StringBuilder(); //to make sure that kickstart generates lfn attribute in statcall //element sb.append( lfn ).append( "=" ). append( lfn ).append( "\n" ); input.write( sb.toString() ); } //close the stream input.close(); result = f.getAbsolutePath(); } catch ( IOException e) { mLogger.log("Unable to write the lof file for job " + job.getID() + " with suffix " + suffix , e , LogManager.ERROR_MESSAGE_LEVEL); } return result; } /** * Constructs a condor variable in the condor profile namespace * associated with the constituentJob. Overrides any preexisting key values. * * @param constituentJob contains the constituentJob description. * @param key the key of the profile. * @param value the associated value. */ private void construct(Job job, String key, String value){ job.condorVariables.construct(key,value); } /** * Condor Quotes a string * * @param string the string to be quoted. * * @return quoted string. 
     */
    private String quote( String string ){
        String result;
        try{
            mLogger.log("Unquoted Prejob is " + string, LogManager.DEBUG_MESSAGE_LEVEL);
            result = CondorQuoteParser.quote( string, false );
            mLogger.log("Quoted Prejob is " + result, LogManager.DEBUG_MESSAGE_LEVEL );
        }
        catch (CondorQuoteParserException e) {
            //NOTE(review): the cause exception is dropped here — only its
            //message survives. Consider chaining the cause when this code is
            //next touched.
            throw new RuntimeException("CondorQuoting Problem " + e.getMessage());
        }
        return result;
    }

    /**
     * Adds a /bin/rm post job to kickstart that removes the files passed.
     * The post job is added as an environment variable.
     *
     * @param job   the job in which the post job needs to be added.
     * @param files the files to be deleted.
     */
    private void addCleanupPostScript( Job job, List files ){
        //sanity check : only applicable when stat'ing is enabled and there
        //is something to remove
        if ( files == null || !mDoStat || files.isEmpty() ) {
            return;
        }

        //do not add if job already has a postscript specified
        if( job.envVariables.containsKey( this.KICKSTART_CLEANUP ) ){
            mLogger.log( "Not adding lof cleanup as another kickstart cleanup already exists",
                         LogManager.DEBUG_MESSAGE_LEVEL );
            return;
        }

        StringBuffer ps = new StringBuffer();
        //maybe later we might want to pick it up from the TC
        ps.append( "/bin/rm -rf").append( " " );
        for( Iterator it = files.iterator(); it.hasNext(); ){
            ps.append( it.next() ).append( " " );
        }

        job.envVariables.construct( this.KICKSTART_CLEANUP, ps.toString() );

        return;
    }

    /**
     * Generates the argument fragment related to kickstart -k and -K options.
     *
     * @param job the job.
     *
     * @return the -k/-K argument fragment, empty when no checkpoint time is
     *         specified for the job
     */
    private String getKickstartTimeoutOptions(Job job) {
        StringBuilder sb = new StringBuilder();

        //get the checkpoint time in seconds
        long checkpointTime = this.getJobCheckpointTimeInSeconds(job);

        if( checkpointTime == Long.MAX_VALUE ){
            //no value specified
            return sb.toString();
        }

        //expected time is the time after which kickstart sends
        //the TERM signal to job
        sb.append( "-k " ).append( checkpointTime ).append( " " );

        long max = Long.MAX_VALUE;
        long multiplier = 60;
        if( job.vdsNS.containsKey( Pegasus.MAX_WALLTIME) ){
            max = job.vdsNS.getIntValue( Pegasus.MAX_WALLTIME, Integer.MAX_VALUE );
        }
        else if ( job.globusRSL.containsKey(Globus.MAX_WALLTIME_KEY) ){
            max = job.globusRSL.getIntValue(Globus.MAX_WALLTIME_KEY, Integer.MAX_VALUE );
        }
        else if ( job.vdsNS.containsKey( Pegasus.RUNTIME_KEY) ){
            //PM-962 last fallback to pegasus profile runtime which is in seconds
            max = job.vdsNS.getIntValue( Pegasus.RUNTIME_KEY, Integer.MAX_VALUE );
            multiplier = 1;
        }

        if( max == Long.MAX_VALUE ){
            //means user never specified a maxwalltime
            //or a malformed value
            //we don't determine the -K parameter
            //NOTE(review): a malformed value actually makes getIntValue return
            //Integer.MAX_VALUE, not Long.MAX_VALUE, so malformed values are
            //NOT caught by this guard — confirm whether that is intended.
            return sb.toString();
        }

        //maxwalltime is specified in minutes, while pegasus runtime
        //is in seconds. convert to seconds for kickstart
        max = max * multiplier;

        //we set the -K parameter to half the difference between
        //maxwalltime - checkpointTime
        long diff = max - checkpointTime;
        long minDiff = 10;
        if( diff < minDiff ){
            //throw error
            throw new RuntimeException( "Insufficient difference between maxwalltime " + max +
                                        " and checkpoint time " + checkpointTime +
                                        " Should be at least " + minDiff + " seconds ");
        }

        //we divide the difference equally.
        //give equal time to generate the checkpoint file and
        //the time to transfer the file
        //kill time is the time after which kickstart sends
        //the KILL signal to job
        sb.append( "-K " ).append( diff/2 ).append( " " );

        return sb.toString();
    }

    /**
     * Returns the job's checkpoint time in seconds.
     *
     * @param job the job
     *
     * @return job checkpoint time in seconds, else Long.MAX_VALUE if not
     *         specified
     * @throws RuntimeException for malformed values
     */
    private long getJobCheckpointTimeInSeconds( Job job ){
        long time = Long.MAX_VALUE;

        //check for checkpoint.time that is specified in minutes.
        String key = Pegasus.CHECKPOINT_TIME_KEY;
        if( job.vdsNS.containsKey( key ) ){
            //means there is expectation of timeout functionality
            time = job.vdsNS.getLongValue( key, Long.MAX_VALUE );

            if( time == Long.MAX_VALUE ){
                //malformed value
                throw new RuntimeException( "Malformed Pegasus Profile " + key +
                                            " value " + job.vdsNS.getStringValue( key ) +
                                            " for job " + job.getID());
            }
            //pegasus checkpoint.time key is in minutes. convert to seconds.
            time = time * 60;
            return time;
        }

        //check for deprecated value
        key = Pegasus.DEPRECATED_CHECKPOINT_TIME_KEY;
        if( job.vdsNS.containsKey( key ) ){
            //means there is expectation of timeout functionality
            time = job.vdsNS.getLongValue( key, Long.MAX_VALUE );

            if( time == Long.MAX_VALUE ){
                //malformed value
                throw new RuntimeException( "Malformed Pegasus Profile " + key +
                                            " value " + job.vdsNS.getStringValue( key ) +
                                            " for job " + job.getID());
            }
            //log deprecated value
            //NOTE(review): unlike the current key handled above, the deprecated
            //key is NOT converted from minutes to seconds here — confirm
            //whether the deprecated profile was specified in seconds.
            mLogger.log( "Deprecated Pegasus profile key " + key + " found for job " + job.getID(),
                         LogManager.DEBUG_MESSAGE_LEVEL );
        }

        return time;
    }

}