/**
* Copyright 2007-2008 University Of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.isi.pegasus.planner.selector.site.heft;
import edu.isi.pegasus.planner.classes.ADag;
import edu.isi.pegasus.planner.classes.Job;
import edu.isi.pegasus.planner.classes.PegasusBag;
import edu.isi.pegasus.planner.common.PegasusProperties;
import edu.isi.pegasus.common.logging.LogManager;
import edu.isi.pegasus.planner.partitioner.graph.Graph;
import edu.isi.pegasus.planner.partitioner.graph.GraphNode;
import edu.isi.pegasus.planner.partitioner.graph.Bag;
import edu.isi.pegasus.planner.catalog.TransformationCatalog;
import edu.isi.pegasus.planner.catalog.transformation.TransformationCatalogEntry;
import edu.isi.pegasus.planner.catalog.transformation.Mapper;
import edu.isi.pegasus.planner.namespace.Pegasus;
import edu.isi.pegasus.planner.catalog.site.classes.SiteCatalogEntry;
import edu.isi.pegasus.planner.catalog.site.classes.SiteStore;
import edu.isi.pegasus.planner.catalog.site.classes.GridGateway;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Iterator;
import java.util.Comparator;
import java.util.Collections;
import edu.isi.pegasus.planner.classes.Profile;
/**
* The HEFT based site selector. The runtime for the job in seconds is picked
* from the pegasus profile key runtime in the transformation catalog for a
* transformation.
*
* The data communication costs between jobs if scheduled on different sites
* is assumed to be fixed. Later on if required, the ability to specify this
* value will be exposed via properties.
*
* The number of processors in a site is picked by the attribute idle-nodes
* associated with the vanilla jobmanager for a site in the site catalog.
*
* There are two important differences with the algorithm cited in the
* HEFT paper.
* <pre>
* - Our implementation uses downward ranks instead of the upward ranks as
* mentioned in the paper. The formulas have been updated accordingly.
*
* - During the processor selection phase, we do the simple selection and
* not follow the insertion based approach.
* </pre>
*
* @author Karan Vahi
* @version $Revision$
*
* @see #AVERAGE_BANDWIDTH
* @see #RUNTIME_PROFILE_KEY
* @see #DEFAULT_NUMBER_OF_FREE_NODES
* @see #AVERAGE_DATA_SIZE_BETWEEN_JOBS
* @see edu.isi.pegasus.planner.catalog.site.classes.GridGateway#getIdleNodes()
*
*/
public class Algorithm {

    /**
     * The pegasus profile key that gives us the expected runtime.
     */
    public static final String RUNTIME_PROFILE_KEY = Pegasus.RUNTIME_KEY;

    /**
     * The property that designates which Process catalog impl to pick up.
     */
    public static final String PROCESS_CATALOG_IMPL_PROPERTY = "pegasus.catalog.transformation.windward";

    /**
     * The average bandwidth between the sites. In mega bytes/per second.
     */
    public static final float AVERAGE_BANDWIDTH = 5;

    /**
     * The average data that is transferred in between 2 jobs in the workflow.
     * In megabytes.
     */
    public static final float AVERAGE_DATA_SIZE_BETWEEN_JOBS = 2;

    /**
     * The default number of nodes that are associated with a site if not found
     * in the site catalog.
     */
    public static final int DEFAULT_NUMBER_OF_FREE_NODES = 10;

    /**
     * The maximum finish time possible for a job.
     */
    public static final long MAXIMUM_FINISH_TIME = Long.MAX_VALUE;

    /**
     * The average communication cost between nodes, in seconds. Charged whenever
     * a parent and a child job are scheduled on different sites.
     */
    private float mAverageCommunicationCost;

    /**
     * The workflow in the graph format, that needs to be scheduled.
     */
    private Graph mWorkflow;

    /**
     * Handle to the site catalog.
     */
    private SiteStore mSiteStore;

    /**
     * The list of sites where the workflow can run.
     */
    private List mSites;

    /**
     * Map containing the number of free nodes for each site. The key is the site
     * name, and value is a <code>Site</code> object.
     */
    private Map mSiteMap;

    /**
     * Handle to the TCMapper.
     */
    protected Mapper mTCMapper;

    /**
     * The handle to the LogManager.
     */
    private LogManager mLogger;

    /**
     * The handle to the properties.
     */
    private PegasusProperties mProps;

    //TANGRAM related variables

    /**
     * The request id associated with the DAX.
     */
    private String mRequestID;

    /**
     * The label of the workflow.
     */
    private String mLabel;

    /**
     * The handle to the transformation catalog.
     */
    private TransformationCatalog mTCHandle;

    /**
     * The default constructor.
     *
     * @param bag  the bag of Pegasus related objects.
     */
    public Algorithm( PegasusBag bag ) {
        mProps     = ( PegasusProperties ) bag.get( PegasusBag.PEGASUS_PROPERTIES );
        mRequestID = mProps.getWingsRequestID();
        mTCHandle  = ( TransformationCatalog ) bag.get( PegasusBag.TRANSFORMATION_CATALOG );
        mTCMapper  = ( Mapper ) bag.get( PegasusBag.TRANSFORMATION_MAPPER );
        mLogger    = ( LogManager ) bag.get( PegasusBag.PEGASUS_LOGMANAGER );
        mSiteStore = bag.getHandleToSiteStore();

        //transfer time in seconds = data size (MB) / bandwidth (MB/s).
        //the earlier formulation divided bandwidth by data size, which
        //inverted the communication cost (2.5s instead of 0.4s).
        mAverageCommunicationCost = AVERAGE_DATA_SIZE_BETWEEN_JOBS / AVERAGE_BANDWIDTH;
    }

    /**
     * Schedules the workflow using the heft.
     *
     * @param dag   the <code>ADag</code> object containing the abstract workflow
     *              that needs to be mapped.
     * @param sites the list of candidate sites where the workflow can potentially
     *              execute.
     */
    public void schedule( ADag dag , List sites ){
        //PM-747 no need for conversion as ADag now implements Graph interface
        schedule( dag , sites, dag.getLabel() );
    }

    /**
     * Schedules the workflow according to the HEFT algorithm.
     *
     * @param workflow the workflow that has to be scheduled.
     * @param sites    the list of candidate sites where the workflow can
     *                 potentially execute.
     * @param label    the label of the workflow
     */
    public void schedule( ADag workflow , List sites, String label ){
        mLabel    = label;
        mWorkflow = workflow;
        populateSiteMap( sites );

        //compute weighted execution times for each job
        for( Iterator it = workflow.nodeIterator(); it.hasNext(); ){
            GraphNode node = ( GraphNode ) it.next();
            Job job = ( Job ) node.getContent();

            //add the heft bag to a node
            Float averageComputeTime = Float.valueOf( calculateAverageComputeTime( job ) );
            HeftBag b = new HeftBag();
            b.add( HeftBag.AVG_COMPUTE_TIME, averageComputeTime );
            node.setBag( b );

            mLogger.log( "Average Compute Time " + node.getID() + " is " + averageComputeTime,
                         LogManager.DEBUG_MESSAGE_LEVEL );
        }

        //add a dummy root so that the graph has a single entry node
        Bag bag;
        GraphNode dummyRoot = new GraphNode( "dummy", "dummy" );
        workflow.addRoot( dummyRoot );
        bag = new HeftBag();
        //downward rank for the root is set to 0
        bag.add( HeftBag.DOWNWARD_RANK, Float.valueOf( 0 ) );
        dummyRoot.setBag( bag );

        //do a breadth first traversal and compute the downward ranks
        Iterator it = workflow.iterator();
        dummyRoot = ( GraphNode ) it.next(); //we have the dummy root
        Float drank;
        //the dummy root has a downward rank of 0
        dummyRoot.getBag().add( HeftBag.DOWNWARD_RANK, Float.valueOf( 0 ) );

        //stores the nodes in the order returned by the breadth first
        //traversal; sorted on downward rank afterwards
        List sortedNodes = new LinkedList();
        while ( it.hasNext() ){
            GraphNode node = ( GraphNode ) it.next();
            drank = Float.valueOf( computeDownwardRank( node ) );
            bag = node.getBag();
            bag.add( HeftBag.DOWNWARD_RANK , drank );
            sortedNodes.add( node );
            mLogger.log( "Downward rank for node " + node.getID() + " is " + drank,
                         LogManager.DEBUG_MESSAGE_LEVEL );
        }

        //sort the nodes on ascending downward rank
        Collections.sort( sortedNodes, new HeftGraphNodeComparator() );

        //the start time and end time for the dummy root is 0
        dummyRoot.getBag().add( HeftBag.ACTUAL_START_TIME, Long.valueOf( 0 ) );
        dummyRoot.getBag().add( HeftBag.ACTUAL_FINISH_TIME, Long.valueOf( 0 ) );

        //schedule out the sorted order of the nodes
        for( it = sortedNodes.iterator(); it.hasNext(); ){
            GraphNode current = ( GraphNode ) it.next();
            bag = current.getBag();
            mLogger.log( "Scheduling node " + current.getID(),
                         LogManager.DEBUG_MESSAGE_LEVEL );

            //figure out the sites where a job can run
            Job job = ( Job ) current.getContent();
            List runnableSites = mTCMapper.getSiteList( job.getTXNamespace(),
                                                        job.getTXName(),
                                                        job.getTXVersion(),
                                                        mSites );

            //for each runnable site get the estimated finish time
            //and schedule job on site that minimizes the finish time
            String site;
            long est_result[];
            long result[] = new long[ 2 ];
            result[ 1 ] = MAXIMUM_FINISH_TIME;
            for( Iterator rit = runnableSites.iterator(); rit.hasNext(); ){
                site = ( String ) rit.next();
                est_result = calculateEstimatedStartAndFinishTime( current, site );

                //if existing EFT is greater than the returned EFT
                //set existing EFT to the returned EFT
                if( result[ 1 ] > est_result[ 1 ] ){
                    result[ 0 ] = est_result[ 0 ];
                    result[ 1 ] = est_result[ 1 ];
                    //tentatively schedule the job for that site
                    bag.add( HeftBag.SCHEDULED_SITE , site );
                }
            }

            //update the site selected with the job
            bag.add( HeftBag.ACTUAL_START_TIME, Long.valueOf( result[ 0 ] ) );
            bag.add( HeftBag.ACTUAL_FINISH_TIME, Long.valueOf( result[ 1 ] ) );
            site = ( String ) bag.get( HeftBag.SCHEDULED_SITE );
            scheduleJob( site,
                         result[ 0 ],
                         result[ 1 ] );

            //log the information
            StringBuffer sb = new StringBuffer();
            sb.append( "Scheduled job " ).append( current.getID() ).
               append( " to site " ).append( site ).
               append( " with from ").append( result[ 0 ] ).
               append( " till " ).append( result[ 1 ] );
            mLogger.log( sb.toString(), LogManager.DEBUG_MESSAGE_LEVEL );
        }//end of going through all the sorted nodes

        //remove the dummy root
        mWorkflow.remove( dummyRoot.getID() );
    }

    /**
     * Returns the makespan of the scheduled workflow. It is maximum of the
     * actual finish times for the leaves of the scheduled workflow.
     *
     * @return long the makespan of the workflow.
     */
    public long getMakespan( ){
        long result = -1;

        //compute the maximum of the actual end times of leaves
        for( Iterator it = mWorkflow.getLeaves().iterator(); it.hasNext() ; ){
            GraphNode node = ( GraphNode ) it.next();
            Long endTime = ( Long ) node.getBag().get( HeftBag.ACTUAL_FINISH_TIME );
            //sanity check
            if( endTime == null ){
                throw new RuntimeException( "Looks like the leave node is unscheduled " + node.getID() );
            }
            if( endTime > result ){
                result = endTime;
            }
        }
        return result;
    }

    /**
     * Estimates the start and finish time of a job on a site.
     *
     * @param node the node that is being scheduled
     * @param site the site for which the finish time is reqd.
     *
     * @return long[0] the estimated start time.
     *         long[1] the estimated finish time.
     */
    protected long[] calculateEstimatedStartAndFinishTime( GraphNode node, String site ){
        Job job = ( Job ) node.getContent();
        long[] result = new long[ 2 ];

        //calculate the ready time for the job
        //that is time by which all the data needed
        //by the job has reached the site.
        long readyTime = 0;
        for( Iterator it = node.getParents().iterator(); it.hasNext(); ){
            GraphNode parent = ( GraphNode ) it.next();
            long current = 0;
            //add the parent finish time to current.
            //NOTE(review): assumes every parent is already scheduled
            //( topological ordering guarantees this for the HEFT traversal )
            current += ( Long ) parent.getBag().get( HeftBag.ACTUAL_FINISH_TIME );

            //if the parent was scheduled on another site
            //add the average data transfer time.
            if( !parent.getBag().get( HeftBag.SCHEDULED_SITE ).equals( site ) ){
                current += this.mAverageCommunicationCost;
            }

            //ready time is maximum of all currents
            if ( current > readyTime ){
                readyTime = current;
            }
        }

        //the estimated start time is the maximum
        //of the ready time and available time of the site
        //using non insertion based policy for time being
        result[ 0 ] = getAvailableTime( site , readyTime );

        //the estimated finish time is est + compute time on site
        List entries = mTCMapper.getTCList( job.getTXNamespace(),
                                            job.getTXName(),
                                            job.getTXVersion(),
                                            site );
        //pick the first one for time being
        TransformationCatalogEntry entry = ( TransformationCatalogEntry ) entries.get( 0 );
        result[ 1 ] = result[ 0 ] + getExpectedRuntime( job, entry );

        //result now stores the estimated start and finish times
        return result;
    }

    /**
     * Computes the downward rank of a node.
     *
     * The downward rank of node i is
     *                                   _    ___
     *           max {       rank( n ) + w  + c    }
     *         j E pred( i )     d  j     j    ji
     *
     * @param node the <code>GraphNode</code> whose rank needs to be computed.
     *
     * @return computed rank.
     */
    protected float computeDownwardRank( GraphNode node ){
        float result = 0;

        //value needs to be computed for each parent separately
        for( Iterator it = node.getParents().iterator(); it.hasNext(); ){
            GraphNode p = ( GraphNode ) it.next();
            Bag pbag = p.getBag();

            float value = getFloatValue( pbag.get( HeftBag.DOWNWARD_RANK ) ) +
                          getFloatValue( pbag.get( HeftBag.AVG_COMPUTE_TIME ) ) +
                          mAverageCommunicationCost;

            if( value > result ){
                result = value;
            }
        }

        return result;
    }

    /**
     * Returns the average compute time in seconds for a job. It is the
     * weighted average of the expected runtimes on each runnable site, with
     * the weight being the number of free nodes at that site.
     *
     * @param job the job whose average compute time is to be computed.
     *
     * @return the weighted compute time in seconds.
     */
    protected float calculateAverageComputeTime( Job job ){
        //get all the TC entries for the sites where a job can run
        List runnableSites = mTCMapper.getSiteList( job.getTXNamespace(),
                                                    job.getTXName(),
                                                    job.getTXVersion(),
                                                    mSites );

        //sanity check
        if( runnableSites == null || runnableSites.isEmpty() ){
            throw new RuntimeException( "No runnable site for job " + job.getName() );
        }
        mLogger.log( "Runnables sites for job " + job.getName() + " " + runnableSites , LogManager.DEBUG_MESSAGE_LEVEL );

        //for each runnable site get the expected runtime
        String site;
        int total_nodes = 0;
        int total = 0;
        for( Iterator it = runnableSites.iterator(); it.hasNext(); ){
            site = ( String ) it.next();
            int nodes = getFreeNodesForSite( site );
            List entries = mTCMapper.getTCList( job.getTXNamespace(),
                                                job.getTXName(),
                                                job.getTXVersion(),
                                                site );

            //pick the first one for time being
            TransformationCatalogEntry entry = ( TransformationCatalogEntry ) entries.get( 0 );
            int jobRuntime = getExpectedRuntime( job, entry );
            total_nodes += nodes;
            total += jobRuntime * nodes;
        }

        //use float division. integer division truncated the weighted
        //average ( e.g. 5/2 -> 2 instead of 2.5 ) despite the float return
        return ( float ) total / total_nodes;
    }

    /**
     * Return expected runtime.
     *
     * The value is looked up in the following order, first match wins
     * <pre>
     *   - the Windward AC ( always -1 since Pegasus 3.0 )
     *   - the pegasus profile key runtime in the transformation catalog entry
     *   - the pegasus profile key runtime in the DAX for the job
     * </pre>
     *
     * @param job   the job in the workflow.
     * @param entry the <code>TransformationCatalogEntry</code> object.
     *
     * @return the runtime in seconds.
     *
     * @throws RuntimeException if no valid runtime ( &gt;= 1 second ) could be
     *         determined.
     */
    protected int getExpectedRuntime( Job job, TransformationCatalogEntry entry ){
        int result = -1;

        //try and fetch the expected runtime from the Windward AC
        double pcresult = getExpectedRuntimeFromAC( job , entry );
        if( pcresult == 0.0 ){
            mLogger.log( "PC returned a value of 0 for job" + job.getID(),
                         LogManager.WARNING_MESSAGE_LEVEL );
            //clamp to the minimum schedulable runtime of one second
            result = 1;
        }
        else if( pcresult > 0.0 && pcresult < 1.0 ){
            mLogger.log( "PC returned a value between 0 and 1" + pcresult + " for job " + job.getID(),
                         LogManager.WARNING_MESSAGE_LEVEL );
            result = 1;
        }
        else{
            result = ( int ) pcresult;
        }
        if( result >= 1 ){
            return result;
        }

        //else try and get the runtime from the profiles
        List profiles = entry.getProfiles( Profile.VDS );
        mLogger.log( "Fetching runtime information from profiles for job " + job.getName(),
                     LogManager.DEBUG_MESSAGE_LEVEL );
        mLogger.log( "Profiles are " + profiles, LogManager.DEBUG_MESSAGE_LEVEL );
        if( profiles != null ){
            for( Iterator it = profiles.iterator(); it.hasNext(); ){
                Profile p = ( Profile ) it.next();
                if( p.getProfileKey().equals( RUNTIME_PROFILE_KEY ) ){
                    result = Integer.parseInt( p.getProfileValue() );
                    break;
                }
            }
        }

        //if no information, try from profiles in dax
        if( result < 1 ){
            mLogger.log( "Fetching runtime information from profiles for job " + job.getName(),
                         LogManager.DEBUG_MESSAGE_LEVEL );
            for( Iterator it = job.vdsNS.getProfileKeyIterator(); it.hasNext(); ){
                String key = ( String ) it.next();
                if( key.equals( RUNTIME_PROFILE_KEY ) ){
                    result = Integer.parseInt( job.vdsNS.getStringValue( key ) );
                    break;
                }
            }
        }

        //sanity check for time being
        if( result < 1 ){
            throw new RuntimeException( "Invalid or no runtime specified for job " + job.getID() );
        }

        return result;
    }

    /**
     * Return expected runtime from the AC only if the process catalog is
     * initialized. Since Pegasus 3.0 release it always returns -1.
     *
     * @param job   the job in the workflow.
     * @param entry the TC entry
     *
     * @return the runtime in seconds.
     */
    protected double getExpectedRuntimeFromAC( Job job, TransformationCatalogEntry entry ){
        return -1;
    }

    /**
     * Populates the number of free nodes for each site, by querying the
     * Site Catalog. Falls back to the total nodes of the grid gateway, and
     * finally to {@link #DEFAULT_NUMBER_OF_FREE_NODES}.
     *
     * @param sites list of sites.
     *
     * @throws RuntimeException if a site is not found in the site store.
     */
    @SuppressWarnings( "unchecked" )
    protected void populateSiteMap( List sites ){
        mSiteMap = new HashMap();

        //for testing purposes
        mSites = sites;

        int nodes = 0;
        for( Iterator it = mSites.iterator(); it.hasNext(); ){
            String site = ( String ) it.next();
            SiteCatalogEntry eSite = mSiteStore.lookup( site );
            if( eSite == null ){
                throw new RuntimeException( "Unable to find site in site store entry for site " + site );
            }
            GridGateway jobManager = eSite.selectGridGateway( GridGateway.JOB_TYPE.compute );
            if( jobManager == null ){
                mLogger.log( "Site not associated with a gridgateway. Using default number of freenodes " + site,
                             LogManager.DEBUG_MESSAGE_LEVEL );
                nodes = Algorithm.DEFAULT_NUMBER_OF_FREE_NODES;
            }
            else{
                try {
                    nodes = jobManager.getIdleNodes();
                    if( nodes == -1 ){
                        mLogger.log( "Picking up total nodes for site " + site,
                                     LogManager.DEBUG_MESSAGE_LEVEL );
                        nodes = jobManager.getTotalNodes();
                        if( nodes == -1 ){
                            mLogger.log( "Picking up default free nodes for site " + site,
                                         LogManager.DEBUG_MESSAGE_LEVEL );
                            nodes = Algorithm.DEFAULT_NUMBER_OF_FREE_NODES;
                        }
                    }
                }
                catch( Exception e ){
                    //gateway attributes unparseable; fall back to the default
                    nodes = Algorithm.DEFAULT_NUMBER_OF_FREE_NODES;
                }
            }

            mLogger.log( "Available nodes set for site " + site + " " + nodes,
                         LogManager.DEBUG_MESSAGE_LEVEL );
            mSiteMap.put( site, new Site( site, nodes ) );
        }
    }

    /**
     * Returns the freenodes for a site.
     *
     * @param site the site identifier.
     *
     * @return number of nodes
     *
     * @throws RuntimeException if the site was not populated via
     *         {@link #populateSiteMap(List)}.
     */
    protected int getFreeNodesForSite( String site ){
        if( mSiteMap.containsKey( site ) ){
            return ( ( Site ) mSiteMap.get( site ) ).getAvailableProcessors();
        }
        else{
            throw new RuntimeException( "The number of free nodes not available for site " + site );
        }
    }

    /**
     * Schedules a job to a site.
     *
     * @param site  the site at which to schedule
     * @param start the start time for job
     * @param end   the end time of job
     */
    protected void scheduleJob( String site, long start , long end ){
        Site s = ( Site ) mSiteMap.get( site );
        s.scheduleJob( start, end );
    }

    /**
     * Returns the available time for a site.
     *
     * @param site      the site at which you want to schedule the job.
     * @param readyTime the time at which all the data reqd by the job will
     *                  arrive at site.
     *
     * @return the available time of the site.
     *
     * @throws RuntimeException if the site was not populated via
     *         {@link #populateSiteMap(List)}.
     */
    protected long getAvailableTime( String site , long readyTime ){
        if( mSiteMap.containsKey( site ) ){
            return ( ( Site ) mSiteMap.get( site ) ).getAvailableTime( readyTime );
        }
        else{
            throw new RuntimeException( "Site information unavailable for site " + site );
        }
    }

    /**
     * This method returns a String describing the site selection technique
     * that is being implemented by the implementing class.
     *
     * @return String
     */
    public String description() {
        return "Heft based Site Selector";
    }

    /**
     * The call out to the site selector to determine on what pool the job
     * should be scheduled.
     *
     * @param job   Job the <code>Job</code> object corresponding to
     *              the job whose execution pool we want to determine.
     * @param pools the list of <code>String</code> objects representing the
     *              execution pools that can be used.
     *
     * @return if the pool is found to which the job can be mapped, a string
     *         of the form <code>executionpool:jobmanager</code> where the
     *         jobmanager can be null. If the pool is not found, then set
     *         poolhandle to NONE. null - if some error occured .
     */
    public String mapJob2ExecPool( Job job, List pools ) {
        return "";
    }

    /**
     * A convenience method to get the float value for the object passed.
     *
     * @param key the object to be converted
     *
     * @return the float value if object is a <code>Float</code>, else -1
     */
    private float getFloatValue( Object key ){
        //earlier code cast unconditionally and would NPE on a null bag value;
        //return the documented -1 sentinel instead
        return ( key instanceof Float ) ? ( ( Float ) key ).floatValue() : -1;
    }
}
/**
 * Comparator for GraphNode objects that allows us to sort on the basis of
 * the downward rank computed. Sorts in ascending order of downward rank.
 */
class HeftGraphNodeComparator implements Comparator{

    /**
     * Implementation of the {@link java.util.Comparator} interface.
     * Compares the two <code>GraphNode</code> objects by the downward rank
     * stored in their HEFT bags.
     *
     * @param o1 is the object to be compared
     * @param o2 is the object to be compared with o1.
     *
     * @return a negative number, zero, or a positive number, if o1's
     *         downward rank is less than, equal to or greater than o2's.
     *
     * @exception ClassCastException if either object is not a GraphNode.
     */
    public int compare( Object o1, Object o2 )
    {
        if ( o1 instanceof GraphNode && o2 instanceof GraphNode ) {
            GraphNode g1 = ( GraphNode ) o1;
            GraphNode g2 = ( GraphNode ) o2;

            float drank1 = ( ( Float ) g1.getBag().get( HeftBag.DOWNWARD_RANK ) ).floatValue();
            float drank2 = ( ( Float ) g2.getBag().get( HeftBag.DOWNWARD_RANK ) ).floatValue();

            //Float.compare gives a correct total order. the earlier
            //(int)(drank1 - drank2) truncated fractional differences to 0
            //( e.g. ranks 1.2 and 1.9 compared equal ) and could overflow
            //for very large rank differences.
            return Float.compare( drank1, drank2 );
        } else {
            throw new ClassCastException( "object is not a GraphNode" );
        }
    }
}