/**
* Copyright 2007-2008 University Of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.isi.pegasus.planner.cluster.aggregator;
import edu.isi.pegasus.common.logging.LogManager;
import edu.isi.pegasus.common.util.Separator;
import edu.isi.pegasus.planner.catalog.TransformationCatalog;
import edu.isi.pegasus.planner.catalog.site.classes.SiteStore;
import edu.isi.pegasus.planner.catalog.transformation.TransformationCatalogEntry;
import edu.isi.pegasus.planner.catalog.transformation.classes.TCType;
import edu.isi.pegasus.planner.classes.ADag;
import edu.isi.pegasus.planner.classes.AggregatedJob;
import edu.isi.pegasus.planner.classes.Job;
import edu.isi.pegasus.planner.classes.PegasusBag;
import edu.isi.pegasus.planner.cluster.JobAggregator;
import edu.isi.pegasus.planner.code.GridStartFactory;
import edu.isi.pegasus.planner.common.PegasusProperties;
import edu.isi.pegasus.planner.partitioner.graph.GraphNode;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
* An abstract implementation of the JobAggregator interface, which the other
* implementations can choose to extend.
*
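* A concrete implementation typically supplies the logical name of the
* clustering executable and the arguments it is to be invoked with. A
* minimal sketch (the methods shown are inferred from the calls this class
* makes, and are illustrative rather than the full JobAggregator contract):
*
* <pre>
* public class MyAggregator extends Abstract {
*     public String getClusterExecutableLFN() { return "myexec"; }
*     public String getClusterExecutableBasename() { return "myexec"; }
*     public boolean topologicalOrderingRequired() { return false; }
*     public String aggregatedJobArguments( AggregatedJob job ) { return ""; }
*     //... remaining JobAggregator methods
* }
* </pre>
*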
* @author Karan Vahi vahi@isi.edu
* @version $Revision$
*
*/
public abstract class Abstract implements JobAggregator {
/**
* The prefix that is prepended to construct the job name of the clustered
* (fat) job that replaces the collapsed jobs.
*/
public static final String CLUSTERED_JOB_PREFIX = "merge_";
/**
* The transformation namespace for the cluster jobs.
*/
public static final String TRANSFORMATION_NAMESPACE = "pegasus";
/**
* The transformation version for the cluster jobs.
*/
public static final String TRANSFORMATION_VERSION = null;
/**
* The derivation namespace for the cluster jobs.
*/
public static final String DERIVATION_NAMESPACE = "pegasus";
/**
* The version number for the derivations for cluster jobs.
*/
public static final String DERIVATION_VERSION = "1.0";
/**
* The marker to designate a line in the input file reserved for
* monitord purposes.
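*
* A reserved line, as generated by getCommentString(), has the form
* (values illustrative):
* <pre>
* #@ 1 pegasus::preprocess:1.0 ID0000001
* </pre>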
*/
public static final String MONITORD_COMMENT_MARKER = "#@";
/**
* The directory where the stdin files of the fat jobs are created.
* It should be the submit file directory that the user specifies at
* runtime.
*/
protected String mDirectory;
/**
* The object holding all the properties pertaining to Pegasus.
*/
protected PegasusProperties mProps;
/**
* The handle to the LogManager that logs all the messages.
*/
protected LogManager mLogger;
/**
* The handle to the transformation catalog.
*/
protected TransformationCatalog mTCHandle;
/**
* Handle to the site catalog store
*/
protected SiteStore mSiteStore;
//protected PoolInfoProvider mSiteHandle;
/**
* The handle to the ADag object that contains the workflow being
* clustered.
*/
protected ADag mClusteredADag;
/**
* The handle to the GridStart Factory.
*/
protected GridStartFactory mGridStartFactory;
/**
* Bag of initialization objects.
*/
protected PegasusBag mBag;
/**
* A convenience method to return the complete transformation name being
* used to construct jobs in this class.
*
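* For example, assuming <code>Separator.combine</code> follows the usual
* Pegasus <code>namespace::name:version</code> convention, a null
* version yields:
* <pre>
* getCompleteTranformationName( "seqexec" ); // "pegasus::seqexec"
* </pre>
*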
* @param name the name of the transformation
*
* @return the complete transformation name
*/
public static String getCompleteTranformationName( String name ){
return Separator.combine( TRANSFORMATION_NAMESPACE,
name,
TRANSFORMATION_VERSION );
}
/**
* The default constructor.
*/
public Abstract(){
}
/**
* Initializes the JobAggregator implementation.
*
* @param dag the workflow that is being clustered.
* @param bag the bag of objects that is useful for initialization.
*
*/
public void initialize( ADag dag , PegasusBag bag ){
mBag = bag;
mClusteredADag = dag;
mLogger = bag.getLogger();
mProps = bag.getPegasusProperties();
mTCHandle = bag.getHandleToTransformationCatalog();
mSiteStore = bag.getHandleToSiteStore();
setDirectory( bag.getPlannerOptions().getSubmitDirectory() );
mGridStartFactory = new GridStartFactory();
mGridStartFactory.initialize( mBag, dag, null );
}
/**
* Returns the arguments with which the <code>AggregatedJob</code>
* needs to be invoked.
*
* @param job the <code>AggregatedJob</code> for which the arguments have
* to be constructed.
*
* @return argument string
*/
public abstract String aggregatedJobArguments( AggregatedJob job );
/**
* Constructs a new aggregated job that contains all the jobs passed to it.
* The new aggregated job appears as a single job in the workflow and
* replaces the jobs it contains in the workflow.
*
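* A typical invocation from a clustering module might look like this
* (names and values are illustrative):
*
* <pre>
* List jobs = ...; // Job objects scheduled on the same site
* AggregatedJob merged =
*     aggregator.constructAbstractAggregatedJob( jobs, "preprocess", "ID1" );
* </pre>
*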
* @param jobs the list of <code>Job</code> objects that need to be
* collapsed. All the jobs being collapsed should be scheduled
* at the same pool, to maintain correct semantics.
* @param name the logical name of the jobs in the list passed to this
* function.
* @param id the id that is given to the new job.
*
* @return the <code>AggregatedJob</code> object corresponding to the aggregated
* job containing the jobs passed as List in the input,
* null if the list of jobs is empty
*/
public AggregatedJob constructAbstractAggregatedJob(List jobs,String name,String id){
return constructAbstractAggregatedJob(jobs,name,id,getClusterExecutableLFN());
}
/**
* Constructs an abstract aggregated job that has a handle to the appropriate
* JobAggregator that will be used to aggregate the jobs.
*
* @param jobs the list of <code>Job</code> objects that need to be
* collapsed. All the jobs being collapsed should be scheduled
* at the same pool, to maintain correct semantics.
* @param name the logical name of the jobs in the list passed to this
* function.
* @param id the id that is given to the new job.
* @param mergeLFN the logical name for the aggregated job that has to be
* constructed.
*
* @return the <code>AggregatedJob</code> object corresponding to the aggregated
* job containing the jobs passed as List in the input,
* null if the list of jobs is empty
*/
public AggregatedJob constructAbstractAggregatedJob( List jobs,
String name,
String id,
String mergeLFN ){
//sanity check
if(jobs == null || jobs.isEmpty()){
mLogger.log("List of jobs for clustering is empty",
LogManager.ERROR_MESSAGE_LEVEL);
return null;
}
//sanity check missing to ensure jobs are of the same type.
//Right now done in NodeCollapser. But we do not need this for
//Vertical Clumping. Karan July 28, 2005
//To get the gridstart/kickstart path on the remote
//pool, we query with the entry for the vanilla universe.
//In the new format the gridstart is associated with the
//pool itself, not the (pool, condor universe) pair.
Job firstJob = (Job)jobs.get(0);
AggregatedJob mergedJob = new AggregatedJob( /*(Job)jobs.get(0),*/
jobs.size() );
mergedJob.setJobAggregator( this );
mergedJob.setJobType( Job.COMPUTE_JOB );
Job job = null;
StringBuffer sb = new StringBuffer();
sb.append( Abstract.CLUSTERED_JOB_PREFIX );
if( name != null && name.length() > 0 ){
sb.append( name ).append( "_" );
}
sb.append( id );
String mergedJobName = sb.toString();
mLogger.log("Constructing Abstract clustered job " + mergedJobName,
LogManager.DEBUG_MESSAGE_LEVEL);
//enable the jobs that need to be merged
//before writing out the stdin file
// String gridStartPath = site.getKickstartPath();
// GridStart gridStart = mGridStartFactory.loadGridStart( firstJob, gridStartPath );
// mergedJob = gridStart.enable( mergedJob, jobs );
//inconsistency between job name and logical name for now
mergedJob.setName( mergedJobName );
//fix for JIRA bug 83
//the site handle needs to be set for the aggregated job
//before it is enabled.
mergedJob.setSiteHandle( firstJob.getSiteHandle() );
mergedJob.setStagingSiteHandle( firstJob.getStagingSiteHandle() );
Set ipFiles = new HashSet();
Set opFiles = new HashSet();
boolean userExecutablesStaged = false;
for( Iterator it = jobs.iterator(); it.hasNext(); ) {
job = (Job) it.next();
ipFiles.addAll( job.getInputFiles() );
opFiles.addAll( job.getOutputFiles() );
mergedJob.add(job);
//update user executable staging.
userExecutablesStaged = userExecutablesStaged || job.userExecutablesStagedForJob();
//we need to merge the profiles from the constituent
//jobs now, rather than in makeAbstractAggregatedJobConcrete
//JIRA PM-368
//merge profiles for all jobs
mergedJob.mergeProfiles( job );
}
mergedJob.setExecutableStagingForJob(userExecutablesStaged);
//overriding the input files, output files, id
mergedJob.setInputFiles( ipFiles );
mergedJob.setOutputFiles( opFiles );
mergedJob.setTransformation( Abstract.TRANSFORMATION_NAMESPACE,
mergeLFN,
Abstract.TRANSFORMATION_VERSION );
mergedJob.setDerivation( Abstract.DERIVATION_NAMESPACE,
mergeLFN,
Abstract.DERIVATION_VERSION);
mergedJob.setLogicalID( id );
//the compute job of the VDS supernode is this job itself
mergedJob.setVDSSuperNode( mergedJobName );
//explicitly set stdout to null overriding any stdout
//that might have been inherited in the clone operation.
//FIX for bug 142 http://bugzilla.globus.org/vds/show_bug.cgi?id=142
mergedJob.setStdOut( "" );
mergedJob.setStdErr( "" );
return mergedJob;
}
/**
* Enables the abstract clustered job for execution and converts it to its
* executable form.
*
* @param job the abstract clustered job
*/
public void makeAbstractAggregatedJobConcrete( AggregatedJob job ){
//containers for the input and output
//files of the fat job. Using a Set ensures no duplication.
//The multiple transfer mode already ensures no duplicate
//transfer of input files, so the set is somewhat redundant.
// mergedJob = enable( mergedJob, jobs );
File stdIn = writeOutInputFileForJobAggregator( job );
/* JIRA PM-277
job.setUniverse( firstJob.getUniverse() );
job.setJobManager( firstJob.getJobManager() );
*/
//the executable that the fat job refers to is the collapser
TransformationCatalogEntry entry = this.getTCEntry( job );
job.setRemoteExecutable( entry.getPhysicalTransformation() );
//stdin file is the file containing the arguments
//for the jobs being collapsed
job.setStdIn( stdIn.getName() );
//explicitly set stdout to null overriding any stdout
//that might have been inherited in the clone operation.
//FIX for bug 142 http://bugzilla.globus.org/vds/show_bug.cgi?id=142
job.setStdOut( "" );
job.setStdErr( "" );
//get hold of one of the jobs and suck in its globus namespace
//info into the map.
/* Not needed, as the clone method would have taken care of it.
Karan Sept 09, 2004
entry = getTCEntry(job);
mergedJob.globusRSL.checkKeyInNS(entry.getProfiles(Profile.GLOBUS));
*/
//also put in jobType as mpi
//mergedJob.globusRSL.checkKeyinNS("jobtype","mpi");
//the profile information from the pool catalog does not need to be
//assimilated into the job, as the collapsed job runs on the
//same pool as its constituent jobs
// mergedJob.updateProfiles(mPoolHandle.getPoolProfile(mergedJob.executionPool));
//add any notifications specified in the transformation
//catalog for the job. JIRA PM-391
job.addNotifications( entry );
//the profile information from the transformation
//catalog needs to be assimilated into the job
//overriding the one from pool catalog.
job.updateProfiles( entry );
//the profile information from the properties file
//is assimilated overriding the one from the transformation
//catalog.
job.updateProfiles( mProps );
//set the arguments for the clustered job
//they are set at the end to ensure that profiles can
//be used to specify the arguments
job.setArguments( this.aggregatedJobArguments( job ) );
}
/**
* Generates the comment string for the job. It generates a comment of the
* format #@ task_id transformation derivation.
*
* @param job the job for which the comment string has to be generated.
* @param taskid the task id to put in.
*
* @return the comment invocation
*/
protected String getCommentString( Job job, int taskid ){
return this.getCommentString(taskid, job.getCompleteTCName(), job.getDAXID());
}
/**
* Generates the comment string for the job. It generates a comment of the
* format #@ task_id transformation derivation.
*
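* For example, for the third task with a complete transformation name of
* <code>pegasus::findrange:1.0</code> and DAX id <code>ID0000002</code>
* (illustrative values), the generated line is:
* <pre>
* #@ 3 pegasus::findrange:1.0 ID0000002
* </pre>
*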
* @param taskid the task id to put in.
* @param transformationName the complete name of the transformation.
* @param daxID the id of the job from the DAX
*
* @return the comment invocation
*/
protected String getCommentString( int taskid, String transformationName, String daxID ){
StringBuffer sb = new StringBuffer();
sb.append( MONITORD_COMMENT_MARKER ).append( " " ).
append( taskid ).append( " " ).
append( transformationName ).append( " " ).
append( daxID ).append( " " );
return sb.toString();
}
/**
* Helper method to get an entry from the transformation catalog for an
* installed executable. It traverses the list of matching entries
* to return a single TransformationCatalogEntry object, and throws a
* RuntimeException with an appropriate error message if none is found.
* The pool and the name are retrieved from the job object.
*
* @param job the job whose corresponding TransformationCatalogEntry you want.
*
* @return the TransformationCatalogEntry corresponding to the entry in the
* TC.
*/
protected TransformationCatalogEntry getTCEntry(Job job){
List tcentries = null;
TransformationCatalogEntry entry = null;
try {
tcentries = mTCHandle.lookup(job.namespace,
job.logicalName,
job.version,
job.executionPool,
TCType.INSTALLED);
} catch (Exception e) {
mLogger.log(
"Unable to retrieve entry from TC for transformation " +
job.getCompleteTCName() + " " +
e.getMessage(), LogManager.DEBUG_MESSAGE_LEVEL );
}
entry = ( tcentries == null ) ?
this.defaultTCEntry( this.getClusterExecutableLFN(),
this.getClusterExecutableBasename(),
job.getSiteHandle() ): //try using a default one
(TransformationCatalogEntry) tcentries.get(0);
if( entry == null ){
//we now throw an exception instead of returning null
//should throw a TC specific exception
StringBuffer error = new StringBuffer();
error.append("Could not find entry in tc for lfn ").
append( job.getCompleteTCName() ).
append(" at site ").append( job.getSiteHandle() );
mLogger.log( error.toString(), LogManager.ERROR_MESSAGE_LEVEL);
throw new RuntimeException( error.toString() );
}
return entry;
}
/**
* Returns a default TC entry to be used in case entry is not found in the
* transformation catalog.
*
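* For example, if PEGASUS_HOME at the site resolves to
* <code>/opt/pegasus</code> (an illustrative path) and the executable
* basename is <code>seqexec</code>, the entry's physical path is:
* <pre>
* /opt/pegasus/bin/seqexec
* </pre>
*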
* @param name the logical name for the clustering transformation.
* @param executableBasename the basename for the executable in the bin directory
* of a Pegasus installation
* @param site the site for which the default entry is required.
*
* @return the default entry.
*/
private TransformationCatalogEntry defaultTCEntry( String name,
String executableBasename,
String site ){
TransformationCatalogEntry defaultTCEntry = null;
//check if PEGASUS_HOME is set
String home = mSiteStore.getPegasusHome( site );
//if PEGASUS_HOME is not set, use VDS_HOME
home = ( home == null )? mSiteStore.getVDSHome( site ): home;
mLogger.log( "Creating a default TC entry for " +
Abstract.getCompleteTranformationName( name ) +
" at site " + site,
LogManager.DEBUG_MESSAGE_LEVEL );
//if home is still null
if ( home == null ){
//cannot create default TC
mLogger.log( "Unable to create a default entry for " +
Abstract.getCompleteTranformationName( name ),
LogManager.DEBUG_MESSAGE_LEVEL );
//return null, as no default entry could be created
return defaultTCEntry;
}
//remove trailing / if specified
home = ( home.charAt( home.length() - 1 ) == File.separatorChar )?
home.substring( 0, home.length() - 1 ):
home;
//construct the path to it
StringBuffer path = new StringBuffer();
path.append( home ).append( File.separator ).
append( "bin" ).append( File.separator ).
append( executableBasename );
defaultTCEntry = new TransformationCatalogEntry( Abstract.TRANSFORMATION_NAMESPACE,
name,
Abstract.TRANSFORMATION_VERSION );
defaultTCEntry.setPhysicalTransformation( path.toString() );
defaultTCEntry.setResourceId( site );
defaultTCEntry.setType( TCType.INSTALLED );
defaultTCEntry.setSysInfo( this.mSiteStore.getSysInfo( site ) );
//register back into the transformation catalog
//so that we do not need to worry about creating it again
try{
mTCHandle.insert( defaultTCEntry , false );
}
catch( Exception e ){
//just log as debug, as this is more of a performance improvement
//than anything else
mLogger.log( "Unable to register in the TC the default entry " +
defaultTCEntry.getLogicalTransformation() +
" for site " + site, e,
LogManager.DEBUG_MESSAGE_LEVEL );
}
return defaultTCEntry;
}
/**
* Determines whether there is NOT an entry in the transformation catalog
* for a particular transformation on a particular site. If no entry is
* found, an attempt is first made to construct and register a default entry.
*
* @param namespace the logical namespace of the transformation.
* @param name the logical name of the transformation.
* @param version the version of the transformation.
* @param executableBasename basename of the executable that does the clustering.
* @param site the site at which existence check is required.
*
* @return boolean true if an entry does not exist, false otherwise.
*/
protected boolean entryNotInTC(String namespace,
String name,
String version,
String executableBasename,
String site){
//check only for pfn existence. gmehta says fewer queries
//underneath
List l = null;
try{
l = mTCHandle.lookupNoProfiles(namespace, name, version, site,
TCType.INSTALLED);
}
catch (Exception e) {
mLogger.log(
"Unable to retrieve entry from TC " + e.getMessage(),
LogManager.ERROR_MESSAGE_LEVEL);
}
//a double negative
return ( l == null || l.isEmpty() ) ?
(( this.defaultTCEntry( name, executableBasename, site ) ) == null ) ://construct a default tc entry
false ;
}
/**
* Sets the directory where the stdin files are to be generated.
*
* @param directory the path to the directory to which it needs to be set.
*/
protected void setDirectory(String directory){
mDirectory = (directory == null)?
//user did not specify a submit file dir
//use the default i.e current directory
".":
//user specified directory picked up
directory;
}
/**
* Writes out the input file for the aggregated job
*
* @param job the aggregated job
*
* @return path to the input file
*/
protected File writeOutInputFileForJobAggregator(AggregatedJob job ) {
return this.writeOutInputFileForJobAggregator(job, 1);
}
/**
* Writes out the input file for the aggregated job
*
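* For a cluster of two jobs, the generated file contains a monitord
* comment line followed by the invocation line for each constituent job,
* roughly (paths and arguments illustrative):
*
* <pre>
* #@ 1 pegasus::preprocess:1.0 ID0000001
* /usr/bin/preprocess -a top -i f.a -o f.b
* #@ 2 pegasus::findrange:1.0 ID0000002
* /usr/bin/findrange -a range -i f.b -o f.c
* </pre>
*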
* @param job the aggregated job
* @param taskid the task id to assign to the first constituent job.
*
* @return path to the input file
*/
protected File writeOutInputFileForJobAggregator(AggregatedJob job, Integer taskid) {
File stdin = null;
try {
BufferedWriter writer;
String name = job.getID() + ".in";
//PM-833 the .in file should be in the same directory where all job submit files go
File directory = new File( this.mDirectory, job.getRelativeSubmitDirectory() );
stdin = new File( directory, name );
writer = new BufferedWriter(new FileWriter( stdin ) );
//traverse through the jobs to determine input/output files
//and merge the profiles for the jobs
//int taskid = 1;
for( Iterator it = this.topologicalOrderingRequired() ?
job.topologicalSortIterator()://we care about order
job.nodeIterator();//don't care about order
it.hasNext(); ) {
GraphNode node = ( GraphNode )it.next();
Job constituentJob = (Job) node.getContent();
//handle stdin
if( constituentJob instanceof AggregatedJob ){
//PM-817 recursive clustering case, we need to
//write out the merge_XXXX.in file for a constituent job
//that is a clustered job itself
File file = this.writeOutInputFileForJobAggregator( (AggregatedJob)constituentJob, taskid );
//slurp in the contents of its stdin,
//taking care of the taskid increments across recursion
BufferedReader reader = new BufferedReader(
new FileReader( file )
);
String line;
while( (line = reader.readLine()) != null ){
//rewrite monitord comment lines with the current taskid
if( line.startsWith( MONITORD_COMMENT_MARKER) ){
String[] split = line.split( "\\s+" );
//System.out.println(Arrays.toString(split));
//taskid = Integer.parseInt( split[1] );
writer.write( getCommentString( taskid, split[2], split[3] ) + "\n" );
continue;
}
writer.write( line );
writer.write( "\n" );
taskid++;
}
reader.close();
//delete the previous stdin file
file.delete();
}
else{
//write out the argument string to the
//stdin file for the fat job
//generate the comment string that has the
//taskid transformation derivation
writer.write( getCommentString( constituentJob, taskid ) + "\n" );
// the arguments are no longer set as condor profiles
// they are now set to the corresponding profiles in
// the Condor Code Generator only.
writer.write( constituentJob.getRemoteExecutable() + " " +
constituentJob.getArguments() + "\n");
taskid++;
}
}
//closing the handle to the writer
writer.close();
}
catch(IOException e){
mLogger.log("While writing the stdIn file " + e.getMessage(),
LogManager.ERROR_MESSAGE_LEVEL);
throw new RuntimeException( "While writing the stdIn file " + stdin, e );
}
return stdin;
}
}