/**
 * Copyright 2007-2008 University Of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.isi.pegasus.planner.code.gridstart;

import edu.isi.pegasus.common.credential.CredentialHandler;
import edu.isi.pegasus.common.credential.CredentialHandlerFactory;
import edu.isi.pegasus.common.logging.LogManager;
import edu.isi.pegasus.common.util.Escape;
import edu.isi.pegasus.common.util.Version;
import edu.isi.pegasus.planner.catalog.TransformationCatalog;
import edu.isi.pegasus.planner.catalog.classes.Profiles;
import edu.isi.pegasus.planner.catalog.site.classes.SiteCatalogEntry;
import edu.isi.pegasus.planner.catalog.site.classes.SiteStore;
import edu.isi.pegasus.planner.catalog.transformation.TransformationCatalogEntry;
import edu.isi.pegasus.planner.catalog.transformation.classes.TCType;
import edu.isi.pegasus.planner.classes.ADag;
import edu.isi.pegasus.planner.classes.AggregatedJob;
import edu.isi.pegasus.planner.classes.Job;
import edu.isi.pegasus.planner.classes.PegasusBag;
import edu.isi.pegasus.planner.classes.PlannerOptions;
import edu.isi.pegasus.planner.code.GridStart;
import edu.isi.pegasus.planner.code.generator.condor.ClassADSGenerator;
import edu.isi.pegasus.planner.common.PegasusConfiguration;
import edu.isi.pegasus.planner.common.PegasusProperties;
import edu.isi.pegasus.planner.namespace.ENV;
import edu.isi.pegasus.planner.namespace.Pegasus;

import java.io.File;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * This class wraps all the jobs, both remote and local, to be executed
 * using the Distribute wrapper for HubZero.
 *
 * This wrapper only works for the shared filesystem approach, and only jobs
 * scheduled for non local sites are wrapped by distribute. To use it,
 * users need to catalog the executable hubzero::distribute in their
 * transformation catalog.
 *
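 * For illustration, an entry in the text format transformation catalog
 * would look roughly like the following (the installation path below is
 * hypothetical):
 *
 * <pre>
 * tr hubzero::distribute {
 *     site local {
 *         pfn "/usr/local/bin/distribute"
 *         type "INSTALLED"
 *     }
 * }
 * </pre>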
 *
 * @author Karan Vahi
 * @version $Revision$
 */
public class Distribute implements GridStart {

    private PegasusBag mBag;

    private ADag mDAG;

    /**
     * The basename of the class that is implementing this. Could have
     * been determined by reflection.
     */
    public static final String CLASSNAME = "Distribute";

    /**
     * The SHORTNAME for this implementation.
     */
    public static final String SHORT_NAME = "distribute";

    /**
     * The transformation namespace for distribute.
     */
    public static final String TRANSFORMATION_NAMESPACE = "hubzero";

    /**
     * The logical name of distribute.
     */
    public static final String TRANSFORMATION_NAME = "distribute";

    /**
     * The version number for distribute.
     */
    public static final String TRANSFORMATION_VERSION = null;

    /**
     * The basename of the distribute executable.
     */
    public static final String EXECUTABLE_BASENAME = "distribute";

    /**
     * Stores the major version of the planner.
     */
    private String mMajorVersionLevel;

    /**
     * Stores the minor version of the planner.
     */
    private String mMinorVersionLevel;

    /**
     * Stores the patch version of the planner.
     */
    private String mPatchVersionLevel;

    /**
     * The LogManager object which is used to log all the messages.
     */
    protected LogManager mLogger;

    /**
     * The object holding all the properties pertaining to Pegasus.
     */
    protected PegasusProperties mProps;

    /**
     * The submit directory where the submit files are being generated for
     * the workflow.
     */
    protected String mSubmitDir;

    /**
     * A boolean indicating whether to generate LOF files or not.
     */
    protected boolean mGenerateLOF;

    /**
     * A boolean indicating whether to have worker node execution or not.
     */
    protected boolean mWorkerNodeExecution;

    /**
     * The options passed to the planner.
     */
    protected PlannerOptions mPOptions;

    /**
     * Handle to the site catalog store.
     */
    protected SiteStore mSiteStore;

    /**
     * An instance variable to track if enabling is happening as part of
     * a clustered job.
     */
    protected boolean mEnablingPartOfAggregatedJob;

    /**
     * Handle to the Kickstart GridStart implementation.
     */
    private Kickstart mKickstartGridStartImpl;

    /**
     * Handle to the Transformation Catalog.
     */
    private TransformationCatalog mTCHandle;

    /**
     * Boolean indicating whether worker package transfer is enabled or not.
     */
    protected boolean mTransferWorkerPackage;

    /**
     * A map indexed by execution site and the corresponding worker package
     * location in the submit directory.
     */
    Map<String, String> mWorkerPackageMap;

    /**
     * The credential handler for SSH transfers.
     */
    private CredentialHandler mSSHCredHandler;

    /**
     * The environment profiles associated with the local site in the
     * site catalog.
     */
    private ENV mLocalENV;

    /**
     * Handle to the Pegasus configuration.
     */
    private PegasusConfiguration mPegasusConfiguration;

    /**
     * Initializes the GridStart implementation.
     *
     * @param bag the bag of objects that is used for initialization.
     * @param dag the concrete dag so far.
     */
    public void initialize( PegasusBag bag, ADag dag ){
        mBag       = bag;
        mDAG       = dag;
        mLogger    = bag.getLogger();
        mSiteStore = bag.getHandleToSiteStore();
        mPOptions  = bag.getPlannerOptions();
        mSubmitDir = mPOptions.getSubmitDirectory();
        mProps     = bag.getPegasusProperties();
        mGenerateLOF = mProps.generateLOFFiles();
        mTCHandle  = bag.getHandleToTransformationCatalog();

        mTransferWorkerPackage = mProps.transferWorkerPackage();
        if( mTransferWorkerPackage ){
            mWorkerPackageMap = bag.getWorkerPackageMap();
            if( mWorkerPackageMap == null ){
                mWorkerPackageMap = new HashMap<String, String>();
            }
        }
        else{
            mWorkerPackageMap = new HashMap<String, String>();
        }

        Version version = Version.instance();
        mMajorVersionLevel = version.getMajor();
        mMinorVersionLevel = version.getMinor();
        mPatchVersionLevel = version.getPatch();

        mPegasusConfiguration = new PegasusConfiguration( bag.getLogger() );

        mEnablingPartOfAggregatedJob = false;
        mKickstartGridStartImpl = new Kickstart();
        mKickstartGridStartImpl.initialize( bag, dag );

        //initialize the SSH credential handler
        CredentialHandlerFactory factory = new CredentialHandlerFactory();
        factory.initialize( mBag );
        mSSHCredHandler = factory.loadInstance( CredentialHandler.TYPE.ssh );

        //get the environment profiles for the local site
        mSiteStore = mBag.getHandleToSiteStore();
        SiteCatalogEntry localSite = mSiteStore.lookup( "local" );
        if( localSite == null ){
            throw new RuntimeException( "Unable to locate site catalog entry for site local" );
        }
        else {
            mLocalENV = (ENV) localSite.getProfiles().get( Profiles.NAMESPACES.env );
        }
    }
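    /*
     * A minimal usage sketch, assuming a populated PegasusBag, the concrete
     * ADag, and a job already scheduled to a remote site (bag, dag and job
     * are all hypothetical here): the code generator initializes the
     * GridStart implementation once, and then enables each job in turn.
     *
     *     GridStart gridStart = new Distribute();
     *     gridStart.initialize( bag, dag );
     *     gridStart.enable( job, true );
     */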
    /**
     * Enables a job to run on the grid. This also determines how the
     * stdin, stderr and stdout of the job are to be propagated.
     * To grid enable a job, the job may need to be wrapped into another
     * job that actually launches the job. It usually results in the job
     * description passed being modified.
     *
     * @param job         the <code>Job</code> object containing the job
     *                    description of the job that has to be enabled on
     *                    the grid.
     * @param isGlobusJob is <code>true</code>, if the job generated a
     *                    line <code>universe = globus</code>, and thus runs
     *                    remotely. Set to <code>false</code>, if the job
     *                    runs on the submit host in any way.
     *
     * @return boolean true if enabling was successful, else false.
     */
    public boolean enable( AggregatedJob job, boolean isGlobusJob ){
        //we do not want kickstart to create worker node directories
        //for the constituent jobs; it should just change into the
        //existing work directory
        for( Iterator it = job.constituentJobsIterator(); it.hasNext(); ){
            Job j = (Job) it.next();
            j.vdsNS.construct( Pegasus.CHANGE_DIR_KEY, "true" );
            j.vdsNS.construct( Pegasus.CREATE_AND_CHANGE_DIR_KEY, "false" );
        }

        //for the time being we treat clustered jobs the same as normal jobs
        //return this.enable( (Job)job, isGlobusJob );

        //consider the case for non worker node execution first
        if( !mWorkerNodeExecution ){
            //shared filesystem case.
            if( job.getSiteHandle().equals( "local" ) ){
                //all jobs scheduled to the local site just get
                //vanilla treatment from the kickstart enabling.
                return mKickstartGridStartImpl.enable( job, isGlobusJob );
            }
            else{
                //the clustered job is never launched via kickstart
                //itself, as its constituents are enabled instead
                mKickstartGridStartImpl.enable( job, isGlobusJob );

                //now we enable the job with the distribute wrapper
                wrapJobWithDistribute( job, isGlobusJob );
            }
        }
        else{
            throw new RuntimeException( "Distribute Job Wrapper only works for sharedfs deployments" );
        }
        return true;
    }
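    /*
     * The CHANGE_DIR_KEY / CREATE_AND_CHANGE_DIR_KEY profiles set above are
     * picked up by the Kickstart module, so that the generated invocation
     * changes into an existing work directory (the -w option referred to
     * below) instead of creating one. A wrapped invocation would look
     * roughly like the following (all paths hypothetical):
     *
     *     /path/to/pegasus-kickstart -w /shared/scratch/work/dir user-exe user-args
     */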
    /**
     * Enables a job to run on the grid by launching it directly. It ends
     * up running the executable directly without going through any
     * intermediate launcher executable. It connects the stdio and stderr
     * to the underlying condor mechanisms so that they are transported
     * back to the submit host.
     *
     * @param job         the <code>Job</code> object containing the job
     *                    description of the job that has to be enabled on
     *                    the grid.
     * @param isGlobusJob is <code>true</code>, if the job generated a
     *                    line <code>universe = globus</code>, and thus runs
     *                    remotely. Set to <code>false</code>, if the job
     *                    runs on the submit host in any way.
     *
     * @return boolean true if enabling was successful, else false in case
     *         the path to kickstart could not be determined on the site
     *         where the job is scheduled.
     */
    public boolean enable( Job job, boolean isGlobusJob ) {
        //take care of relative submit directory if specified
        String submitDir = mSubmitDir + File.separator;

        if( mPegasusConfiguration.jobSetupForWorkerNodeExecution( job ) ){
            //distribute only works in the shared filesystem case
            StringBuilder error = new StringBuilder();
            error.append( "Job " ).append( job.getID() ).
                  append( " cannot be wrapped with Distribute. It works only in the sharedfs case. Invalid data.configuration associated " ).
                  append( job.vdsNS.get( Pegasus.DATA_CONFIGURATION_KEY ) );
            throw new RuntimeException( error.toString() );
        }

        //shared filesystem case.
        if( job.getSiteHandle().equals( "local" ) ){
            //all jobs scheduled to the local site just get
            //vanilla treatment from the kickstart enabling.
            return mKickstartGridStartImpl.enable( job, isGlobusJob );
        }
        else{
            //jobs scheduled to a non local site are wrapped
            //with distribute after wrapping them with kickstart

            //we always want the kickstart -w option
            job.vdsNS.construct( Pegasus.CHANGE_DIR_KEY, "true" );
            job.vdsNS.construct( Pegasus.CREATE_AND_CHANGE_DIR_KEY, "false" );

            mKickstartGridStartImpl.enable( job, isGlobusJob );

            //now we enable the job with the distribute wrapper
            wrapJobWithDistribute( job, isGlobusJob );
        }
        return true;
    }

    /**
     * Wraps a job with the distribute wrapper.
     * The job's existing executable and arguments are retrieved to construct
     * an invocation string that is passed as an argument to the distribute
     * job launcher. Also, the job is modified to run on the local site.
     *
     * @param job       the job to be wrapped with distribute
     * @param globusJob boolean indicating whether the job is a globus job
     */
    protected void wrapJobWithDistribute( Job job, boolean globusJob ) {
        StringBuilder arguments = new StringBuilder();

        //construct the path to the distribute executable
        //on the local site.
        TransformationCatalogEntry entry = this.getTransformationCatalogEntry( "local" );

        String distributePath = ( entry == null )?
                        //rely on the path determined from profiles
                        (String) job.vdsNS.get( Pegasus.GRIDSTART_PATH_KEY ):
                        //else the tc entry has highest priority
                        entry.getPhysicalTransformation();

        if( distributePath == null ){
            throw new RuntimeException( "Unable to determine path to the distribute wrapper on local site" );
        }

        //job arguments are a combination of arguments to distribute
        //and the kickstart invocation
        arguments.append( getDistributeArguments( job ) );
        arguments.append( job.getRemoteExecutable() ).append( " " ).
                  append( job.getArguments() );

        job.setArguments( arguments.toString() );
        job.setRemoteExecutable( distributePath );

        //a lot of distribute arguments are picked up via the environment
        ENV distributeENV = this.getEnvironmentForDistribute( job );

        //we want the generated classad to still point to the remote site
        job.condorVariables.construct( ClassADSGenerator.PLUS_RESOURCE_AD_KEY, job.getSiteHandle() );

        //update the job to run on the local site
        //and the style to condor
        job.setSiteHandle( "local" );
        job.vdsNS.construct( Pegasus.STYLE_KEY, Pegasus.CONDOR_STYLE );

        //since the job is running locally, its environment
        //has to be from the local entry in the site catalog
        ENV env = new ENV();
        env.merge( mLocalENV );
        env.merge( distributeENV );
        job.envVariables = env;

        return;
    }
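    /*
     * To illustrate the rewrite performed by wrapJobWithDistribute (all
     * paths and the site handle below are hypothetical), a job that was
     *
     *     executable: /path/to/pegasus-kickstart
     *     arguments : -w /work/dir user-exe user-args
     *     site      : cluster
     *
     * ends up as
     *
     *     executable: /usr/local/bin/distribute
     *     arguments : /path/to/pegasus-kickstart -w /work/dir user-exe user-args
     *     site      : local    (with condor style)
     *
     * while the PBS related parameters travel in the job's environment.
     */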
    /**
     * Constructs the argument string for the distribute job.
     *
     * @param job the job being wrapped
     *
     * @return the argument string
     */
    protected String getDistributeArguments( Job job ) {
        StringBuilder args = new StringBuilder();
        //no extra arguments are constructed at present; the parameters
        //for distribute are passed via the environment instead
        return args.toString();
    }

    /**
     * Returns the environment variables that are required for distribute
     * to generate the appropriate PBS submit file for the job.
     *
     * @param job the job being wrapped
     *
     * @return the job environment variables
     */
    protected ENV getEnvironmentForDistribute( Job job ){
        ENV env = new ENV();

        //the job's environment variables are passed as a comma separated
        //list under the key DISTRIBUTE_REMOTE_ENVIRONMENT
        String key = "DISTRIBUTE_REMOTE_ENVIRONMENT";
        StringBuilder remoteEnv = new StringBuilder();
        Escape es = new Escape();
        for( Iterator it = job.envVariables.getProfileKeyIterator(); it.hasNext(); ){
            String envVariable = (String) it.next();
            String value = (String) job.envVariables.get( envVariable );
            remoteEnv.append( envVariable ).append( "=" ).
                      append( es.escape( value ) );
            if( it.hasNext() ){
                remoteEnv.append( "," );
            }
        }
        env.construct( key, remoteEnv.toString() );

        //DISTRIBUTE_SITE_DESIGNATOR is the site handle
        env.construct( "DISTRIBUTE_SITE_DESIGNATOR", job.getSiteHandle() );

        //the path to the SSH private key
        String sshKeyPath = mSSHCredHandler.getPath( job.getSiteHandle() );
        if( sshKeyPath == null ){
            throw new RuntimeException( "Distribute Wrapper needs path to the private SSH Key. Please set the pegasus profile " +
                                        mSSHCredHandler.getProfileKey() + " for site " + job.getSiteHandle() );
        }
        env.construct( "DISTRIBUTE_SSH_IDENTITY_PATH", sshKeyPath );

        //construct a name for distribute to tell PBS where to place the
        //kickstart stdout and stderr. some cheating here:
        //job.getDirectory() returns null since we have -w enabled for kickstart
        String directory = mSiteStore.getInternalWorkDirectory( job );
        env.construct( "DISTRIBUTE_JOB_PATH", directory );
        env.construct( "DISTRIBUTE_JOB_STDOUT", job.getID() + ".stdout" );
        env.construct( "DISTRIBUTE_JOB_STDERR", job.getID() + ".stderr" );

        /* the globus key hostcount is NODES */
        if( job.globusRSL.containsKey( "hostcount" ) ){
            env.construct( "DISTRIBUTE_NODES", (String) job.globusRSL.get( "hostcount" ) );
        }

        /* the globus key xcount is PROCS or PPN */
        if( job.globusRSL.containsKey( "xcount" ) ){
            env.construct( "DISTRIBUTE_PPN", (String) job.globusRSL.get( "xcount" ) );
        }

        /* the globus key maxwalltime is WALLTIME */
        if( job.globusRSL.containsKey( "maxwalltime" ) ){
            env.construct( "DISTRIBUTE_WALLTIME", (String) job.globusRSL.get( "maxwalltime" ) );
        }

        /* the globus key maxmemory is PER_PROCESS_MEMORY */
        if( job.globusRSL.containsKey( "maxmemory" ) ){
            env.construct( "DISTRIBUTE_PER_PROCESS_MEMORY", (String) job.globusRSL.get( "maxmemory" ) );
        }

        return env;
    }
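    /*
     * For a job scheduled on a hypothetical site "cluster" with the globus
     * profiles hostcount=2, xcount=8 and maxwalltime=60, the environment
     * constructed above would contain entries along these lines (the key
     * path, directories and job id are illustrative):
     *
     *     DISTRIBUTE_SITE_DESIGNATOR=cluster
     *     DISTRIBUTE_SSH_IDENTITY_PATH=/home/user/.ssh/workflow_key
     *     DISTRIBUTE_JOB_PATH=/shared/scratch/work/dir
     *     DISTRIBUTE_JOB_STDOUT=ID0000001.stdout
     *     DISTRIBUTE_JOB_STDERR=ID0000001.stderr
     *     DISTRIBUTE_NODES=2
     *     DISTRIBUTE_PPN=8
     *     DISTRIBUTE_WALLTIME=60
     */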
    /**
     * Returns the transformation catalog entry for distribute on a site.
     *
     * @param site the site on which the entry is required
     *
     * @return the entry if found, else null
     */
    protected TransformationCatalogEntry getTransformationCatalogEntry( String site ){
        List entries = null;
        try {
            entries = mTCHandle.lookup( Distribute.TRANSFORMATION_NAMESPACE,
                                        Distribute.TRANSFORMATION_NAME,
                                        Distribute.TRANSFORMATION_VERSION,
                                        site,
                                        TCType.INSTALLED );
        } catch (Exception e) {
            //broad catch, as a lookup failure is not fatal here
            mLogger.log( "Unable to retrieve entries from TC " + e.getMessage(),
                         LogManager.DEBUG_MESSAGE_LEVEL );
        }
        //guard against both a null result and an empty list
        return ( entries == null || entries.isEmpty() ) ?
                 null :
                 (TransformationCatalogEntry) entries.get( 0 );
    }

    /**
     * Indicates whether the enabling mechanism can set the X bit
     * on the executable on the remote grid site, in addition to launching
     * it on the remote grid site.
     *
     * @return false, as no wrapper executable is being used.
     */
    public boolean canSetXBit(){
        return false;
    }

    /**
     * Returns the value of the vds profile with key as Pegasus.GRIDSTART_KEY,
     * that would result in the loading of this particular implementation.
     * It is usually the name of the implementing class without the
     * package name.
     *
     * @return the value of the profile key.
     * @see edu.isi.pegasus.planner.namespace.Pegasus#GRIDSTART_KEY
     */
    public String getVDSKeyValue(){
        return Distribute.CLASSNAME;
    }

    /**
     * Returns a short textual description in the form of the name of the
     * class.
     *
     * @return short textual description.
     */
    public String shortDescribe(){
        return Distribute.SHORT_NAME;
    }

    /**
     * Returns the SHORT_NAME for the POSTScript implementation that is to
     * be used as the default with this GridStart implementation.
     *
     * @return the identifier for the default POSTScript implementation for
     *         the kickstart gridstart module.
     *
     * @see Kickstart#defaultPOSTScript()
     */
    public String defaultPOSTScript(){
        return this.mKickstartGridStartImpl.defaultPOSTScript();
    }

    public void useFullPathToGridStarts( boolean fullPath ) {
        throw new UnsupportedOperationException( "Not supported yet." );
    }

    public String getWorkerNodeDirectory( Job job ) {
        throw new UnsupportedOperationException( "Not supported yet." );
    }
}