/** * Copyright 2007-2008 University Of Southern California * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.isi.pegasus.planner.cluster.aggregator; import edu.isi.pegasus.planner.code.GridStartFactory; import edu.isi.pegasus.planner.code.gridstart.PegasusExitCode; import edu.isi.pegasus.planner.classes.ADag; import edu.isi.pegasus.planner.classes.AggregatedJob; import edu.isi.pegasus.planner.classes.Job; import edu.isi.pegasus.common.logging.LogManager; import edu.isi.pegasus.planner.namespace.Pegasus; import edu.isi.pegasus.planner.namespace.Dagman; import edu.isi.pegasus.planner.classes.PegasusBag; /** * This class aggregates the smaller jobs in a manner such that * they are launched at remote end, sequentially on a single node using * seqexec. The executable seqexec is a Pegasus tool distributed in the Pegasus worker * package, and can be usually found at $PEGASUS_HOME/bin/seqexec. * * @author Karan Vahi vahi@isi.edu * @version $Revision$ */ public class SeqExec extends Abstract { /** * The logical name of the transformation that is able to run multiple * jobs sequentially. */ public static final String COLLAPSE_LOGICAL_NAME = "seqexec"; /** * The basename of the pegasus cluster executable. */ public static final String EXECUTABLE_BASENAME = "pegasus-cluster"; /** * The suffix to be applied to seqexec progress report file. */ public static final String SEQEXEC_PROGRESS_REPORT_SUFFIX = ".prg"; /** * Flag indicating whether a global log file or per job file. */ private boolean mGlobalLog; /** * Flag indicating whether to fail on first hard error or not. */ private boolean mFailOnFirstError; /** * Flag to indicate whether to log progress or not. */ private boolean mLogProgress; /** * The default constructor. */ public SeqExec(){ super(); } /** *Initializes the JobAggregator impelementation * * @param dag the workflow that is being clustered. * @param bag the bag of objects that is useful for initialization. * */ public void initialize( ADag dag , PegasusBag bag ){ super.initialize( dag, bag ); mGlobalLog = mProps.logJobAggregatorProgressToGlobal(); mLogProgress = mProps.logJobAggregatorProgress(); //set abort of first job failure this.setAbortOnFirstJobFailure( mProps.abortOnFirstJobFailure() ); } /** * Enables the abstract clustered job for execution and converts it to it's * executable form. Also associates the post script that should be invoked * for the AggregatedJob * * @param job the abstract clustered job */ public void makeAbstractAggregatedJobConcrete( AggregatedJob job ){ super.makeAbstractAggregatedJobConcrete(job); Job firstJob = (Job)job.getConstituentJob( 0 ); StringBuffer message = new StringBuffer(); message.append( " POSTScript for merged job " ). append( job.getName() ).append( " " ); //should we tinker with the postscript for this job if( job.dagmanVariables.containsKey( Dagman.POST_SCRIPT_KEY ) ){ //no merged job has been set to have a specific post script //no tinkering } else{ //we need to tinker //gridstart is always populated String gridstart = (String) firstJob.vdsNS.get(Pegasus.GRIDSTART_KEY); if (gridstart.equalsIgnoreCase( GridStartFactory. GRIDSTART_SHORT_NAMES[ GridStartFactory.KICKSTART_INDEX]) ) { //ensure $PEGASUS_HOME/bin/exitpost is invoked //as the baby jobs are being invoked by kickstart job.dagmanVariables.construct( Dagman.POST_SCRIPT_KEY, PegasusExitCode.SHORT_NAME ); } } message.append( job.dagmanVariables.get( Dagman.POST_SCRIPT_KEY ) ); mLogger.log( message.toString(), LogManager.DEBUG_MESSAGE_LEVEL ); return ; } /** * Returns the logical name of the transformation that is used to * collapse the jobs. * * @return the the logical name of the collapser executable. * @see #COLLAPSE_LOGICAL_NAME */ public String getClusterExecutableLFN(){ return COLLAPSE_LOGICAL_NAME; } /** * Returns the executable basename of the clustering executable used. * * @return the executable basename. * @see #EXECUTABLE_BASENAME */ public String getClusterExecutableBasename(){ return SeqExec.EXECUTABLE_BASENAME; } /** * Determines whether there is NOT an entry in the transformation catalog * for the job aggregator executable on a particular site. * * @param site the site at which existence check is required. * * @return boolean true if an entry does not exists, false otherwise. */ public boolean entryNotInTC(String site) { return this.entryNotInTC( SeqExec.TRANSFORMATION_NAMESPACE, SeqExec.COLLAPSE_LOGICAL_NAME, SeqExec.TRANSFORMATION_VERSION, this.getClusterExecutableBasename(), site); } /** * Returns the arguments with which the <code>AggregatedJob</code> * needs to be invoked with. * * @param job the <code>AggregatedJob</code> for which the arguments have * to be constructed. * * @return argument string */ public String aggregatedJobArguments( AggregatedJob job ){ StringBuffer arguments = new StringBuffer(); //do we need to fail hard on first error if( this.abortOnFristJobFailure()){ arguments.append( " -f " ); } //track the progress of the seqexec job //if specified in properties if( mLogProgress ){ arguments.append( " -R ").append( logFile(job) ); } return arguments.toString(); } /** * Setter method to indicate , failure on first consitutent job should * result in the abort of the whole aggregated job. Ignores any value * passed, as MPIExec does not handle it for time being. * * @param fail indicates whether to abort or not . */ public void setAbortOnFirstJobFailure( boolean fail){ mFailOnFirstError = fail; } /** * Returns a boolean indicating whether to fail the aggregated job on * detecting the first failure during execution of constituent jobs. * * @return boolean indicating whether to fail or not. */ public boolean abortOnFristJobFailure(){ return mFailOnFirstError; } /** * A boolean indicating whether ordering is important while traversing * through the aggregated job. * * @return true */ public boolean topologicalOrderingRequired(){ //ordering is important, as jobs need to be written out in //topological order for label based clustering return true; } /** * Returns the name of the log file to used on the remote site, for the * seqexec job. Depending upon the property settings, either assigns a * common * * * @param job the <code>AggregatedJob</code> * * @return the path to the log file. */ protected String logFile( AggregatedJob job ){ StringBuffer sb = new StringBuffer( 32 ); if ( mGlobalLog ){ //the basename of the log file is derived from the dag name sb.append( this.mClusteredADag.getLabel() ); } else{ //per seqexec job name sb.append( job.getName() ); } sb.append( this.SEQEXEC_PROGRESS_REPORT_SUFFIX); return sb.toString(); } }