/** * Copyright 2007-2008 University Of Southern California * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.isi.pegasus.planner.selector.site; import edu.isi.pegasus.common.logging.LogManagerFactory; import edu.isi.pegasus.planner.catalog.site.classes.FileServer; import edu.isi.pegasus.planner.catalog.site.classes.SiteCatalogEntry; import edu.isi.pegasus.planner.classes.PegasusFile; import edu.isi.pegasus.planner.classes.Job; import edu.isi.pegasus.common.logging.LogManager; import edu.isi.pegasus.planner.catalog.site.classes.Directory; import java.io.BufferedReader; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.PrintWriter; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.StringTokenizer; import edu.isi.pegasus.planner.classes.ADag; import edu.isi.pegasus.planner.classes.PegasusBag; /** * This is the class that implements a call-out to a site selector which * is an application or executable script. In order to use the site * selector implemented by this class, the property * <code>pegasus.selector.site</code> must be set to value * <code>NonJavaCallout</code>.<p> * * This site selector implements a <code>popen()</code> like call to an * external application that conforms to the API laid out here. The name * of the application to run is specified by the property * <code>pegasus.selector.site.path</code>. Its value points to a locally * available application.<p> * * If the external executable requires certain environment variables to * be set for execution, these can be specified in the property files, * using the prefix <code>pegasus.selector.site.env</code>. The name of the * environment variable is obtained by stripping the prefix. For * example, to set the variable PATH to a certain value, use the * following entry in your user property file:<p> * * <pre> * pegasus.selector.site.env.PATH = /usr/bin:/bin:/usr/X11R6/bin * </pre> * * The site selector populates the environment of the external * application with the following default properties, which can * be overwritten by user-specified properties:<p> * * <table border="1"> * <tr align="left"><th>key</th><th>value</th></tr> * <tr align="left"><th>PEGASUS_HOME</th> * <td>As set by the system</td></tr> * <tr align="left"><th>CLASSPATH</th> * <td>From <tt>java.class.path</tt></td></tr> * <tr align="left"><th>JAVA_HOME</th> * <td>From <tt>java.home</tt></td></tr> * <tr align="left"><th>USER</th> * <td>From <tt>user.name</tt>, if present</td></tr> * <tr align="left"><th>LOGNAME</th> * <td>From <tt>user.name</tt>, if present</td></tr> * <tr align="left"><th>HOME</th> * <td>From <tt>user.home</tt>, if present</td></tr> * <tr align="left"><th>TMP</th> * <td>From <tt>java.io.tmpdir</tt>, if present</td></tr> * <tr align="left"><th>TZ</th> * <td>From <tt>user.timezone</tt>, if present</td></tr> * </table><p> * * The user can specify the environment variables, by specifying the * properties with the prefix pegasus.selector.site.env. prefix. for e.g user * can override the default user.name property by setting the property * pegasus.selector.site.env.user.home .<p> * * The external application is invoked with one commandline argument. * This argument is the name of a temporary file. The temporary file is * created for each invocation anew by the site selecting caller. Being * temporary, the file is deleted after the site selector returns with * success. The deletion of the file is governed by the property * pegasus.selector.site.keep.tmp. It can have a tristate value with the valid * values being * <pre> * ALWAYS * NEVER * ONERROR * </pre> * <p> * * The external application is expected to write one line to stdout. * The line starts with the string <code>SOLUTION:</code>, followed * by the chosen site handle. Optionally, separated by a colon, the * name of a jobmanager for the site can be provided by the site * selector. Two examples for successful site selections are:<p> * * <pre> * SOLUTION:mysite:my.job.mgr/jobmanager-batch * SOLUTION:siteY * </pre> * * Note, these are two examples. The site selector only returns one line * with the appropriate solution. If no site is found to be eligble, the * poolhandle should be set to NONE by the site selector. <p> * * The temporary file is the corner stone of the communication between * the site selecting caller and the external site selector. It is a * collection of key-value pairs. Each pair is separated by an equals * (=) sign, and stands on a line of its own. There are no multi-line * values permitted.<p> * * The following pairs are generated for the siteselector temporary file:<p> * * <table border="1"> * <tr align="left"><th>#</th><th>key</th><th>value</th></tr> * <tr align="left"><th>1</th><th>version</th> * <td>The version of the site selector API, currently 2.0</td></tr> * <tr align="left"><th>1</th><th>transformation</th> * <td>The fully-qualified definition identifier for the TR, ns::id:vs.</td></tr> * <tr align="left"><th>1</th><th>derivation</th> * <td>The fully-qualified definition identifier for the DV, ns::id:vs.</td></tr> * <tr align="left"><th>1</th><th>job.level</th> * <td>The job's depth in the DFS tree of the workflow DAG</td></tr> * <tr align="left"><th>1</th><th>job.id</th> * <td>The job's ID, as used in the DAX file.</td></tr> * <tr align="left"><th>N</th><th>resource.id</th> * <td>A pool handle, followed by a whitespace, followed by a gridftp server. * Typically, each gridftp server is enumerated once, so you may have multiple * occurances of the same site.</td></tr> * <tr align="left"><th>M</th><th>input.lfn</th> * <td>An input LFN, optionally followed by a whitespace and filesize.</td></tr> * <tr align="left"><th>1</th><th>wf.name</th> * <td>The label of the DAX, as found in the DAX's root element.</td></tr> * <tr align="left"><th>1</th><th>wf.index</th> * <td>The DAX index, which is incremented for each partition.</td></tr> * <tr align="left"><th>1</th><th>wf.time</th> * <td>The <i>mtime</i> of the workflow.</td></tr> * <tr align="left"><th>1</th><th>wf.manager</th> * <td>The name of the workflow manager to be used, e.g. <tt>dagman</tt>.</td></tr> * <tr align="left"><th>1</th><th>vo.name</th> * <td>unused at present, name of the virtual organization who runs this WF.</td></tr> * <tr align="left"><th>1</th><th>vo.group</th> * <td>unused at present, usage not clear .</td></tr> * </table><p> * * In order to detect malfunctioning site selectors, a timeout is * attached with each site selector, see property * <code>pegasus.selector.site.timeout</code>. By default, a site selector * is given up upon after 60 s.<p> * * @author Karan Vahi * @author Jens Vöckler * * @version $Revision$ * * @see java.lang.Runtime * @see java.lang.Process */ public class NonJavaCallout extends AbstractPerJob { /** * The prefix to be used while creating a temporary file to pass to * the external siteselector. */ public static final String PREFIX_TEMPORARY_FILE = "pegasus"; /** * The suffix to be used while creating a temporary file to pass to * the external siteselector. */ public static final String SUFFIX_TEMPORARY_FILE = null; /** * The prefix of the property names that specify the environment * variables that need to be set before calling out to the site * selector. */ public static final String PREFIX_PROPERTIES = "pegasus.selector.site.env."; /** * The prefix that the site selector writes out on its stdout to * designate that it is sending a solution. */ public static final String SOLUTION_PREFIX = "SOLUTION:"; /** * The version number associated with this API of non java callout * site selection. */ public static final String VERSION = "2.0"; //tristate variables for keeping the temporary files generated /** * The state denoting never to keep the temporary files. */ public static final int KEEP_NEVER = 0; /** * The state denoting to keep the temporary files only in case of error. */ public static final int KEEP_ONERROR = 1; /** * The state denoting always to keep the temporary files. */ public static final int KEEP_ALWAYS = 2; /** * The description of the site selector. */ private static final String mDescription = "External call-out to a site-selector application"; /** * The map that contains the environment variables including the * default ones that are set while calling out to the site selector * unless they are overridden by the values set in the properties * file. */ private Map mEnvVar; /** * The timeout value in seconds after which to timeout, in the case * where the external site selector does nothing (nothing on stdout * nor stderr). */ private int mTimeout; /** * The tristate value for whether keeping the temporary files generated or * not. */ private int mKeepTMP; /** * The path to the site selector. */ private String mSiteSelectorPath; /** * The abstract DAG. */ private ADag mAbstractDag; /** * The default constructor. */ public NonJavaCallout(){ super(); // set the default timeout to 60 seconds mTimeout = 60; //default would be onerror mKeepTMP = KEEP_ONERROR; } /** * Initializes the site selector. * * @param bag the bag of objects that is useful for initialization. * */ public void initialize( PegasusBag bag ){ super.initialize( bag ); mTimeout = mProps.getSiteSelectorTimeout(); mSiteSelectorPath = mProps.getSiteSelectorPath(); // load the environment variables from the properties file // and the default values. this.loadEnvironmentVariables(); //get the value from the properties file. mKeepTMP = getKeepTMPValue(mProps.getSiteSelectorKeep()); } /** * Maps the jobs in the workflow to the various grid sites. * The jobs are mapped by setting the site handle for the jobs. * * @param workflow the workflow. * * @param sites the list of <code>String</code> objects representing the * execution sites that can be used. */ public void mapWorkflow( ADag workflow, List sites ){ mAbstractDag = workflow; //PM-747 no need for conversion as ADag now implements Graph interface super.mapWorkflow( workflow , sites ); } /** * Returns a brief description of the site selection technique * implemented by this class. * * @return a self-description of this site selector. */ public String description(){ return mDescription; } /** * Calls out to the external site selector. The method converts a * <code>Job</code> object into an API-compliant temporary file. * The file's name is provided as single commandline argument to the * site selector executable when it is invoked. The executable, * representing the external site selector, provides its answer * on <i>stdout</i>. The answer is captures, and returned. * * @param job is a representation of the DAX compute job whose site of * execution need to be determined. * * @param sites the list of <code>String</code> objects representing the * execution sites that can be used. * * * * FIXME: Some site selector return an empty string on failures. Also: * NONE could be a valid site name. * * @see org.griphyn.cPlanner.classes.Job */ public void mapJob( Job job, List sites ){ Runtime rt = Runtime.getRuntime(); // prepare the temporary file that needs to be sent to the // Site Selector via command line. File ipFile = prepareInputFile( job, sites ); // sanity check if(ipFile == null){ job.setSiteHandle( null ); return; } // prepare the environment to call out the site selector String command = this.mSiteSelectorPath; if ( command == null ) { // delete the temporary file generated ipFile.delete(); throw new RuntimeException( "Site Selector: Please set the path to the external site " + "selector in the properties! " ); } try { command += " " + ipFile.getAbsolutePath(); // get hold of all the environment variables that are to be set String[] envArr = this.getEnvArrFromMap(); mLogger.log( "Calling out to site selector " + command, LogManager.DEBUG_MESSAGE_LEVEL); Process p = rt.exec( command , envArr ); // set up to read subprogram output InputStream is = p.getInputStream(); InputStreamReader isr = new InputStreamReader(is); BufferedReader br = new BufferedReader(isr); // set up to read subprogram error InputStream er = p.getErrorStream(); InputStreamReader err = new InputStreamReader(er); BufferedReader ebr = new BufferedReader(err); // pipe the process stdout and stderr to standard stdout/stderr // FIXME: Really? I thought we want to capture stdout? String s = null; String se = null; // set the variable to check if the timeout needs to be set or not boolean notTimeout = ( mTimeout <= 0 ); boolean stdout = false; boolean stderr = false; int time = 0; while( ( (stdout =br.ready()) || (stderr = ebr.ready()) ) || notTimeout || time < mTimeout){ if ( ! ( stdout || stderr ) ) { // nothing on either streams // sleep for some time try { time +=5; mLogger.log("main thread going to sleep " + time, LogManager.DEBUG_MESSAGE_LEVEL); Thread.sleep(5000); mLogger.log("main thread woken up", LogManager.DEBUG_MESSAGE_LEVEL); } catch ( InterruptedException e ) { // do nothing // we potentially loose time here. } } else { // we hearing something from selector // reset the time counter time = 0; if ( stdout ) { s = br.readLine(); mLogger.log("[Site Selector stdout] " + s, LogManager.DEBUG_MESSAGE_LEVEL); // parse the string to get the output if ( parseStdOut( job, s ) ){ break; } } if ( stderr ) { se = ebr.readLine(); mLogger.log("[Site Selector stderr] " + se, LogManager.ERROR_MESSAGE_LEVEL); } } } // while // close the streams br.close(); ebr.close(); if ( time >= mTimeout ) { mLogger.log( "External Site Selector timeout after " + mTimeout + " seconds", LogManager.ERROR_MESSAGE_LEVEL); p.destroy(); // no use closing the streams as it would be probably hung job.setSiteHandle( null ); return; } // the site selector seems to have worked without any errors // delete the temporary file that was generated only if the // process exited with a status of 0 // FIXME: Who is going to clean up after us? int status = p.waitFor(); if ( status != 0){ // let the user know site selector exited with non zero mLogger.log("Site Selector exited with non zero exit " + "status " + status, LogManager.DEBUG_MESSAGE_LEVEL); } //delete the temporary file on basis of keep value if((status == 0 && mKeepTMP < KEEP_ALWAYS) || (status != 0 && mKeepTMP == KEEP_NEVER )){ //deleting the file if ( ! ipFile.delete() ) mLogger.log("Unable to delete temporary file " + ipFile.getAbsolutePath(),LogManager.WARNING_MESSAGE_LEVEL); } } catch ( IOException e ) { mLogger.log("[Site selector] " + e.getMessage(), LogManager.ERROR_MESSAGE_LEVEL); } catch ( InterruptedException e ) { mLogger.log("Waiting for site selector to exit: " + e.getMessage(), LogManager.ERROR_MESSAGE_LEVEL); } return; } /** * Writes job knowledge into the temporary file passed to the external * site selector. The job knowledge derives from the contents of the * DAX job's <code>Job</code> record, and the a list of site * candidates. The format of the file is laid out in the class's * introductory documentation. * * @param job is a representation of the DAX compute job whose site of * execution need to be determined. * * @param pools is a list of site candidates. The items of the list are * <code>String</code> objects. * * @return the temporary input file was successfully prepared. A value * of <code>null</code> implies that an error occured while writing * the file. * * @see #getTempFilename() */ private File prepareInputFile( Job job, List pools ) { File f = new File( this.getTempFilename() ); PrintWriter pw; try { pw = new PrintWriter(new FileWriter(f)); // write out the version of the api pw.println("version=" + this.VERSION); // fw.write("\nvds_job_name=" + job.jobName); pw.println("transformation=" + job.getCompleteTCName()); pw.println("derivation=" + job.getCompleteDVName()); // write out the job id and level as gotten from dax pw.println("job.level=" + job.level); pw.println("job.id=" + job.logicalId); //at present Pegasus always asks to schedule compute jobs //User should be able to specify through vdl or the pool config file. //Karan Feb 10 3:00 PM PDT //pw.println("vds_scheduler_preference=regular"); // write down the list of exec Pools and their corresponding grid // ftp servers if ( pools.isEmpty() ) { // just write out saying illustrating no exec pool or grid ftp // server passed to site selector. Upto the selector to do what // it wants. // FIXME: We need to define this part of the interface. If there // are not site candidates, should it ever reach this part of // the code? If now, insert assertion and abort here. If yes, we // need to define this case! But just silently write the below // will not site will with our set of site selectors. pw.println("resource.id=NONE NONE"); } else { String st, pool; for ( Iterator i = pools.iterator(); i.hasNext(); ) { pool = (String) i.next(); st = "resource.id=" + pool + " "; SiteCatalogEntry site = mSiteStore.lookup( pool ); /* for( Iterator it = site.getHeadNodeFS().getScratch().getSharedDirectory().getFileServersIterator(); it.hasNext();){ pw.println(st + ( (FileServer) it.next()).getURLPrefix() ); }*/ Directory d = site.getDirectory( Directory.TYPE.shared_scratch ); if( d != null ){ for( FileServer.OPERATION op : FileServer.OPERATION.values() ){ for( Iterator it = d.getFileServersIterator(op); it.hasNext();){ pw.println(st + ( (FileServer) it.next()).getURLPrefix() ); } } } } // for } // write the input files for ( Iterator i=job.inputFiles.iterator(); i.hasNext(); ) pw.println("input.lfn=" + ((PegasusFile)i.next()).getLFN()); // write workflow related metadata if ( this.mAbstractDag != null ) { pw.println("wf.name=" + mAbstractDag.getLabel() ); pw.println("wf.index=" + mAbstractDag.getIndex() ); // pw.println("workflow.time=" + mAbstractDag.dagInfo.time??); // FIXME: Try File.lastModified() on the DAX file // should actually be picked up from the properties file pw.println("wf.manager=" + "dagman"); } // uninitialized values pw.println("vo.name=" + "NONE"); pw.println("vo.group=" + "NONE"); // done pw.flush(); pw.close(); } catch ( IOException e ) { mLogger.log("While writing to the temporary file :" + e.getMessage(), LogManager.ERROR_MESSAGE_LEVEL); return null; } catch ( Exception ex ) { //an unknown exception mLogger.log("Unknown error while writing to the temp file :" + ex.getMessage(), LogManager.ERROR_MESSAGE_LEVEL); return null; } return f; } /** * Extracts the chosen site from the site selector's answer. Parses * the <i>stdout</i> sent by the selector, to see, if the execution * pool and the jobmanager were sent or not. * * @param job the job that has to be mapped. * @param s is the stdout received from the site selector. * * @return boolean indicating if the stdout was succesfully parsed and * job populated. * * */ private boolean parseStdOut( Job job, String s ){ String val = null; s = s.trim(); boolean result = false; if(s.startsWith(SOLUTION_PREFIX)){ s = s.substring(SOLUTION_PREFIX.length()); StringTokenizer st = new StringTokenizer(s,":"); while(st.hasMoreTokens()){ result = true; job.setSiteHandle( (String)st.nextToken() ); job.setJobManager( st.hasMoreTokens() ? st.nextToken(): null ); } } // HMMM: String.indexOf() functions can be used in Jens HO. return result; } /** * Creates a temporary file and obtains its name. This method returns * the absolute path to a temporary file in the system's TEMP * directory. The file is guarenteed to be unique for the current * invocation of the virtual machine. * * FIXME: However, since we return a filename and not an opened file, race * conditions are still possible. * * @return the absolute path of a newly created temporary file. */ private String getTempFilename(){ File f = null; try { f = File.createTempFile(PREFIX_TEMPORARY_FILE,SUFFIX_TEMPORARY_FILE); return f.getAbsolutePath(); } catch ( IOException e ) { throw new RuntimeException( "Unable to get handle to a temporary file :" + e.getMessage()); } } /** * Initializes the internal hash that collects environment variables. * These variables are set up to run the external helper application. * Environment variables come from two source. * * <ol> * <li>Default environment variables, fixed, hard-coded. * <li>User environment variables, from properties. * </ol> */ private void loadEnvironmentVariables(){ // load the default environment variables String value = null; mEnvVar = new HashMap(); mEnvVar.put("CLASSPATH",mProps.getProperty("java.class.path")); mEnvVar.put("JAVA_HOME",mProps.getProperty("java.home")); // set $LOGNAME and $USER if corresponding property set in JVM if ( (value = mProps.getProperty("user.name")) != null ) { mEnvVar.put("USER",value); mEnvVar.put("LOGNAME",value); } // set the $HOME if user.home is set if ( (value = mProps.getProperty("user.home")) != null ) mEnvVar.put("HOME",value); // set the $TMP if java.io.tmpdir is set if ( (value = mProps.getProperty("java.io.tmpdir")) != null ) mEnvVar.put("TMP",value); // set $TZ if user.timezone is set if ( (value = mProps.getProperty("user.timezone")) != null ) mEnvVar.put("TZ",value); // get hold of the environment variables that user might have set // and put them in the map overriding the variables already set. mEnvVar.putAll( mProps.matchingSubset(PREFIX_PROPERTIES,false) ); } /** * Generates an array of environment variables. The variables are kept * in an internal map. Converts the environment variables in the map * to the array format. * * @return array of enviroment variables set, or <code>null</code> if * the map is empty. * @see #loadEnvironmentVariables() */ private String[] getEnvArrFromMap(){ String result[] = null; // short-cut if ( mEnvVar == null || mEnvVar.isEmpty() ) return result; else result = new String[mEnvVar.size()]; Iterator it = mEnvVar.entrySet().iterator(); int i = 0; while(it.hasNext()){ Map.Entry entry = (Map.Entry)it.next(); result[i] = entry.getKey() + "=" + entry.getValue(); i++; } return result; } /** * Returns the int value corresponding to the string value passed. * * @param value the string value for keeping the temporary files. * * @return the corresponding int value. * @see #KEEP_ALWAYS * @see #KEEP_NEVER * @see #KEEP_ONERROR */ private int getKeepTMPValue(String value){ //default value is keep on error int val = KEEP_ONERROR; //sanity check of the string value if(value == null || value.length() == 0){ //return the default value return val; } value = value.trim(); if(value.equalsIgnoreCase("always")) val = KEEP_ALWAYS; if(value.equalsIgnoreCase("never")) val = KEEP_NEVER; return val; } /** * The main program that allows you to test. * FIXME: Test programs should have prefix Test.....java * * @param args the arguments * */ public static void main( String[] args ){ LogManagerFactory.loadSingletonInstance().setLevel(LogManager.DEBUG_MESSAGE_LEVEL); NonJavaCallout nj = new NonJavaCallout( ); Job s = new Job(); s.logicalName = "test"; s.namespace = "pegasus"; s.version = "1.01"; s.jobName = "test_ID00001"; List pools = new java.util.ArrayList(); pools.add("isi-condor");pools.add("isi-lsf"); nj.mapJob( s,pools ); System.out.println("Exec Pool return by site selector is " + s.getSiteHandle() ); } }