/* Copyright 2014 MITRE Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.mitre.provenance.workflowengine; import java.util.ArrayList; import java.util.Date; import java.util.Enumeration; import java.util.Hashtable; import java.util.Vector; import java.util.logging.Logger; import org.mitre.provenance.plusobject.PLUSObject; import org.mitre.provenance.tools.PLUSUtils; import org.mitre.provenance.workflowengine.activity.Activity; /** * This object encapsulates a runnable workflow. If you're trying to load some workflow from the * database that has already been executed, this isn't what you want. Check PLUSWorkflow instead. * <p>Workflows contain a bunch of activities, each of which has inputs and outputs. * <p>This is <b>not</b> a fully-featured workflow engine. There are numerous limitations to the * proper functioning of the engine. As of 3/2008, the PLUS MSR plans to later add hooks into * more fully-featured workflow engines to make up for the shortcomings and limitations of this code. * @author moxious */ public class Workflow { protected static Logger log = Logger.getLogger(Workflow.class.getName()); /** * Controls whether or not the workflow engine will change confidence values of the data. * The workflow engine will *always* change the confidence level of data items whose confidence is not * valid (i.e. less than 0 or more than 1). This variable here though controls what the workflow engine * will do when the confidence is already valid. If this is set to true, the workflow engine will always * change the uncertainty scores according to the specified policy. If it is set to false, the engine * will only change the uncertainty scores if they are invalid. */ public static final boolean ALWAYS_COMPUTE_CONFIDENCE = false; /** * Uncertainty policy: when uncertainty isn't present for an output, assume 1.0 * @see Workflow#validateUncertaintyMeasures(Hashtable, Hashtable, int) */ public static final int UNCERTAINTY_POLICY_ALWAYS_1 = 1; /** * Uncertainty policy: when uncertainty isn't present for an output, make the uncertainty * equal to the product of the inputs. * @see Workflow#validateUncertaintyMeasures(Hashtable, Hashtable, int) */ public static final int UNCERTAINTY_POLICY_MULTIPLY = 2; /** * Uncertainty policy: when uncertainty isn't present for an output, make the uncertainty * equal to the minimum input uncertainty. * @see Workflow#validateUncertaintyMeasures(Hashtable, Hashtable, int) */ public static final int UNCERTAINTY_POLICY_MIN_INPUTS = 3; /** * Uncertainty policy: when uncertainty isn't present for an output, make the uncertainty * equal to the maximum input uncertainty. * @see Workflow#validateUncertaintyMeasures(Hashtable, Hashtable, int) */ public static final int UNCERTAINTY_POLICY_MAX_INPUTS = 4; /** * Uncertainty policy: when uncertainty isn't present for an output, make the uncertainty * equal to the maximum input uncertainty * the degrade factor. * @see Workflow#validateUncertaintyMeasures(Hashtable, Hashtable, int) */ public static final int UNCERTAINTY_POLICY_GRADUAL_DEGRADE = 5; /** * When using UNCERTAINTY_POLICY_DEGRADE, this controls how fast uncertainty degrades. * @see Workflow#validateUncertaintyMeasures(Hashtable, Hashtable, int) */ public static final double DEGRADE_FACTOR = 0.9; /** The name of the workflow as created. */ String workflowName; /** The starting point provided by the user */ Activity startingPoint; /** The tracer object that will be used to log the lineage of the workflow. */ LineageTracer tracer; /** * Used only as an internal exception. * Nothing to see here, move along. */ public class BackTrackingException extends Exception { static final long serialVersionUID = 123124123; public BackTrackingException() { super(); } public BackTrackingException(String msg) { super(msg); } } // End BackTrackingException /** * This is the ugly part. We use this to store references to intermediate data products in the workflow. * If we were more sophisticated, this would be stored in a relational database, but we aren't so it isn't. * This stores two types of things; * <ul><li>Strings mapped to PLUSObjects representing particular data values that have * already been computed by activities in the workflow, * <li>and Activity objects mapped to their output sets to determine which activities we've already executed. * </ul> * Not currently used much, but necessary for backtracking and other functionality that will probably be required. */ Hashtable <Object, Object> cache; /** * Default constructor, creates an empty workflow called "Unnamed Workflow" */ public Workflow() { this(new LineageTracer(), "Unnamed Workflow"); } // End default constructor /** * Create a new workflow with a particular lineage tracer * @param tracer the object to use to trace the lineage execution of the workflow. */ public Workflow(LineageTracer tracer) { this(tracer, "Unnamed Workflow"); } // End Workflow public Workflow(LineageTracer tracer, String name) { this.workflowName = name; cache = new Hashtable <Object,Object>(); this.tracer = tracer; } // End Workflow /** * Sometimes workflows contain data objects that were not the output of any process. The workflow begins execution * with these data objects "precomputed". In order to use one of those, call this function. If you don't add such * a precomputed object to the workflow, then the workflow engine will fail to use it as an input to some other * activity and your workflow won't run. * @param name the name of the precomputed input. This should match the input name of whatever activity is expecting * to consume the object. * @param object the object itself. */ public void addPrecomputedObject(String name, PLUSObject object) { cache.put(name, object); } /** * Get the name of the workflow. * @return a string representing the workflow's name. */ public String getName() { return workflowName; } /** * Set the workflow's name * @param name the new name to use. */ public void setName(String name) { workflowName = name; } /** * Have we already determined what this variable's value is? * @param varName the name of the variable * @return true if we already have a cached value, false otherwise. */ public boolean alreadyComputed(String varName) { return cache.contains(varName); } // End alreadyComputed /** * Execute the workflow. Assume the use of an "Always 1" uncertainty policy, meaning that any * data item that does not have a valid uncertainty associated with it will be marked with 1.0. * This is equivalent to calling execute(start, UNCERTAINTY_POLICY_GRADUAL_DEGRADE); * @param start the start node to use. * @return the Hashtable containing the outputs of the last activity executed. * @throws Exception */ public Hashtable <String,PLUSObject> execute(Activity start) throws Exception { return execute(start, UNCERTAINTY_POLICY_GRADUAL_DEGRADE); } // End execute /** * Get the minimum uncertainty from a series of data items. * @param varSet a hashtable containing string variable names mapped to PLUSObjects * @return the minimum uncertainty from all data items in the list. If the actual min is invalid, * this will return 0. */ public static float minUncertainty(Hashtable <String,PLUSObject> varSet) { Enumeration<String> e = varSet.keys(); float min = (float)1.0; while(e.hasMoreElements()) { String key = (String)e.nextElement(); PLUSObject val = varSet.get(key); float u = val.getUncertainty(); if(u < min) min = u; } // End while if(min < 0) { log.warning("WARNING: Variable set has minimum uncertainty < 0!"); min = 0; } // End if return min; } // End minUncertainty /** * Get the maximum uncertainty from a series of data items. * @param varSet a hashtable containing string variable names mapped to PLUSObjects * @return the maximum uncertainty from all data items in the list. If the actual max is invalid, * this will return 1. */ public static float maxUncertainty(Hashtable <String,PLUSObject> varSet) { Enumeration<String>e = varSet.keys(); float max = (float)-1; while(e.hasMoreElements()) { String key = (String)e.nextElement(); PLUSObject val = varSet.get(key); float u = val.getUncertainty(); // log.warning("maxUncertainty: evaluating " + u); if(u > max) max = u; } // End while if(max > 1) { // log.warning("WARNING: Variable set has maximum uncertainty > 0!"); max = 1; } else if(max < 0) { // log.warning("WARNING: No valid maximum uncertainty found."); max = 1; } return max; } // End maxUncertainty /** * Get the product of uncertainty values from a series of data items. * @param varSet a hashtable containing string variable names mapped to PLUSObjects * @return the product of all uncertainty from all data items in the list. If the actual product * is invalid, this will return 0. */ public static float productUncertainty(Hashtable <String,PLUSObject> varSet) { Enumeration<String>e = varSet.keys(); float prod = (float)1; while(e.hasMoreElements()) { String key = (String)e.nextElement(); PLUSObject val = varSet.get(key); float u = val.getUncertainty(); prod = prod * u; } // End while if(prod > 1 || prod < 0) { log.warning("WARNING: Variable set has product uncertainty " + prod + " out of range!"); prod = 0; } // End if return prod; } // End maxUncertainty /** * Convenience method used to check if a particular queue contains a particular activity. * This just does a stupid linear scan which can very likely be improved upon. * @param array the queue * @param ac the activity * @return true if array contains ac, false otherwise. */ private boolean queueContains(ArrayList<Object[]>array, Activity ac) { for(int x=0; x<array.size(); x++) { Object [] el = (Object [])array.get(x); if(el[0] == ac) return true; } // End for return false; } // End queueContains /** * Validate that the uncertainy measures for a particular set of variables is accurate, according to the * chosen policy of the execution engine. If the uncertainty measures are not correct, the uncertainty * will be modified according to the specified policy. * @param inputs the input variables to a particular activity. * @param output the output variables to a particular activity * @return modified hashtable corresponding to the outputs. */ public Hashtable <String,PLUSObject> validateUncertaintyMeasures( Hashtable <String,PLUSObject> inputs, Hashtable <String,PLUSObject> output, int POLICY) { // Ensure that all variables are tagged with uncertainty, to account for uncertainty-naive // processes. Enumeration<String>okz = output.keys(); float min = Workflow.minUncertainty(inputs); float max = Workflow.maxUncertainty(inputs); float prod = Workflow.productUncertainty(inputs); while(okz.hasMoreElements()) { String kz = (String)okz.nextElement(); PLUSObject v = output.get(kz); float u = v.getUncertainty(); if(ALWAYS_COMPUTE_CONFIDENCE || u < 0 || u > 1) { log.fine("Validate uncertainty: input " + u + " for " + v); if(POLICY == UNCERTAINTY_POLICY_ALWAYS_1) { v.getMetadata().put("computedUncertainty", "implus:POLICY_ALWAYS_1"); v.setUncertainty((float)1.0); log.fine("New uncertainty: 1.0"); } else if(POLICY == UNCERTAINTY_POLICY_MIN_INPUTS) { v.getMetadata().put("computedUncertainty", "implus:POLICY_MIN_INPUTS"); v.setUncertainty(min); log.fine("New uncertainty: (minimum) " + min); } else if(POLICY == UNCERTAINTY_POLICY_MAX_INPUTS) { v.getMetadata().put("computedUncertainty", "implus:POLICY_MAX_INPUTS"); v.setUncertainty(max); log.fine("New uncertainty: (maximum) " + max); } else if(POLICY == UNCERTAINTY_POLICY_MULTIPLY) { v.getMetadata().put("computedUncertainty", "implus:POLICY_MULTIPLY"); v.setUncertainty(prod); log.fine("New uncertainty: (product) " + prod); } else if(POLICY == UNCERTAINTY_POLICY_GRADUAL_DEGRADE) { v.getMetadata().put("computedUncertainty", "implus:POLICY_GRADUAL_DEGRADE"); v.setUncertainty((float)(max * (float)DEGRADE_FACTOR)); log.fine("Max uncertainty: " + max); log.fine("New uncertainty: (degraded) " + (max * (float)DEGRADE_FACTOR)); } else { v.getMetadata().put("computedUncertainty", "implus:NONEXISTANT_POLICY_DEFAULT_1"); log.severe("ILLEGAL POLICY: " + POLICY + " Assuming always 1."); v.setUncertainty((float)1.0); } // End else } // End if } // End while return output; } // End validateUncertaintyMeasures /** * Execute the workflow. * @param start the start node to use. * @param POLICY the uncertainty policy to use. * @return the Hashtable containing the outputs of the last activity executed. * @throws Exception */ protected Hashtable <String,PLUSObject> execute(Activity start, int POLICY) throws Exception { Hashtable <String,PLUSObject> output = new Hashtable <String,PLUSObject> () ; ArrayList <Object []> open = new ArrayList <Object []>(); ArrayList <Activity> closed = new ArrayList <Activity>(); Hashtable <Transition, Boolean> seenTransitions = new Hashtable <Transition, Boolean> (); int bt = 0; // Fire an event that says we're starting execution. tracer.startWorkflow(this, start); open.add(new Object [] {start, null} ); Activity current = null; // Iterate through the list of open activities we still need to execute. while(open.size() != 0) { // Each item in the "open" list is an array: [some activity, some transition] Object [] stuff = open.remove(0); current = (Activity)stuff[0]; Transition t = (Transition)stuff[1]; // The open list contains the next activity to execute, and the particular transition that it was // associated with. Before executing the activity, notify the listener that we are transitioning. if(t != null) { Activity to = t.getTo(); if(to.getMetadata().get("invokeid") == null) to.getMetadata().put("invokeid", PLUSUtils.generateID()); seenTransitions.put(t, new Boolean(true)); tracer.transition(t); } else if(current != Activity.START_STATE){ log.warning("WARNING: Transition to activity " + current + " was null."); log.warning("Not firing transition event to listener."); } // End else // As a special case, if the current activity is the end state, // then we're done...notify the listener. if(current == Activity.END_STATE) { tracer.endWorkflow(this, current); return output; } // End if // Declare the structure that will hold inputs to the activity. Hashtable <String,PLUSObject> inputs = null; try { inputs = computeNeededInputs(current); } catch(BackTrackingException e) { // Sometimes, all necessary inputs have not yet been computed because of branching // in the workflow. In this situation, we just need to add this back to the "open" list, // and hope that one of the intervening activities in the list will end up calculating // the necessary input. That way, when the activity comes around the next time, its inputs // will be there. log.fine("Cannot execute '" + current.getName() + "': " + e); open.add(stuff); // Add it back to the open set. bt++; // In certain bad conditions, (such as loops or reflexive transitions) we can // get stuck backtracking on a node, only to find that it's the only node left in // our list to process. When that happens, we gotta just quit. This is pretty much // always caused by a malformed workflow, or an activity that requires a nonexistant // input. if(bt > 10) throw new Exception("STOP! I'm stuck in a loop!"); // Go to the next item in the list. Nothing more to do here if we're backtracking. continue; } // End catch // Special bizarre case: say an activity has 5 inputs, and has been waiting for execution. // At this point in the code, its golden moment in the sun has come. Except so far we've only // followed one transition to the node, when it has 5 introductory transitions. Now we have // to go back and fire "transition" events for all of the others that blocked until this point // because the activity's preconditions hadn't been met at the time. Vector <Transition> introductions = current.getIntroductions(); if(introductions.size() > 1) { for(int x=0; x<introductions.size(); x++) { Transition catchUp = introductions.elementAt(x); if(seenTransitions.containsKey(catchUp)) continue; else { log.fine("Catching up on transitions..."); tracer.transition(catchUp); seenTransitions.put(catchUp, new Boolean(true)); } // End else } // End for } // End if // Register the activity with the PLUS system. // Each activity has a generic "type signature" stating which inputs and outputs it produces/needs. // When an activity is specifically executed, then it becomes an "invocation". //String id = plus.registerActivity(current); String id = PLUSUtils.generateID(); current.getMetadata().put("id", id); log.fine("WORKFLOW: Starting to execute " + current); // At the time an activity is actually executed, it gets a distinct invocation ID to // differentiate it from any other instance of that activity. // Doing this allows the lineage loggers also to track activities. if(current.getMetadata().get("invokeid") == null) { String invokeID = PLUSUtils.generateID(); current.getMetadata().put("invokeid", invokeID); current.getMetadata().put("startExecution", (new Date()).toString()); } // End if current.getMetadata().put("startExecution", (new Date()).toString()); tracer.startActivity(current, inputs); // Fire the start event to the listener output = current.execute(inputs); // Actually execute the activity, capture outputs /* Do cleanup and log uncertainty values/ending time to the object **before** firing the * event indicating that the activity is finished. This way, the logger gets the benefit * of this extra work. */ // Ensure that all variables are tagged with uncertainty, to account for uncertainty-naive // processes. So if some process was too naive to associate proper uncertainty, this will do // it for the process. output = validateUncertaintyMeasures(inputs, output, POLICY); // Log pointers back to the original data in the item, so that // we can fetch it and reconstitute it later. // logPLUSObjectPointersAndMetadata(inputs); // logPLUSObjectPointersAndMetadata(output); /****** At this point, all outputs are guaranteed to have valid uncertainty. ***/ // Tag the activity with an ending date/time. This allows calculations on duration of exec. current.getMetadata().put("endExecution", (new Date()).toString()); // Log the metadata object for the current invocation. // plus.logMetadata((String)current.getMetadata().get("invokeid"), current.getMetadata()); // Fire the finish event to the listener. tracer.finishActivity(current, output); // Do some caching. We hold on to our output products so we can look them up later to see // if they are required inputs for something else. cacheActivity(current, output); cacheVariables(output); // Now all that's left is to look at the list of transitions that go out of this activity, // and add all of the ones to the "open" queue that aren't already there. Vector<Transition>transitions = current.getTransitions(); for(int x=0; x<transitions.size(); x++) { Transition trans = (Transition)transitions.get(x); Activity to = trans.getTo(); if(!queueContains(open, to)) { log.fine("WORKFLOW: Adding " + to + " to worklist."); open.add(new Object [] { to, trans} ); } // End if } // End for // Add the current item to the closed list. closed.add(current); } // End while // Fire event saying that we're done with the workflow. // For most workflows that have an "end" event, this will never be executed, because // the end state is caught above. tracer.endWorkflow(this, current); return output; } // End execute /** * Calculate which inputs should be used for this particular process. * If an input isn't available, we have to defer execution until its precondition is met. * Inputs generally come from two different places; either the last value passed as part of * the transition, or something from the cache, which was computed earlier. * @param a the Activity you are about to execute. * @return a Hashtable containing variable names mapped onto PLUSObject's that are usable as inputs * to this particular activity. * @throws BackTrackingException if some needed input is not available or has not yet been created. * If this happens, then Activity a <b>should not be executed</b>. */ protected Hashtable <String,PLUSObject> computeNeededInputs(Activity a) throws BackTrackingException { Enumeration<String>neededInputs = a.getInputs().keys(); Vector <Transition> intros = a.getIntroductions(); Hashtable <String,PLUSObject> inputs = new Hashtable <String,PLUSObject> (); while(neededInputs.hasMoreElements()) { String neededVar = (String)neededInputs.nextElement(); boolean found = false; for(int x=0; x<intros.size(); x++) { Transition t = intros.elementAt(x); // Check and see if there is an introduction whose input variable is what we need. if(t != null && neededVar.equals(t.getInputVariableName())) { // The variable that's needed was the one just passed from the previous output. log.fine("Using " + t.getFrom() + ":" + t.getOutputVariableName() + " as input for " + t.getTo() + ":" + t.getInputVariableName()); if(cache.containsKey(t.getOutputVariableName())) { inputs.put(neededVar, (PLUSObject)cache.get(t.getOutputVariableName())); found = true; } } else { if(cache.containsKey(neededVar)) { log.fine("Using cached data item " + neededVar + " as input to " + a); found = true; inputs.put(neededVar, (PLUSObject)cache.get(neededVar)); } // End else } // End else } // End for if(!found) throw new BackTrackingException("Cannot find precomputed variable " + neededVar + " for " + a); } // End while return inputs; } // End computeNeededInputs /** * Store a set of particular variables in the cache for later use. * @param vars a hash mapping variable names onto PLUSObjects that are their values. */ public void cacheVariables(Hashtable <String,PLUSObject> vars) { Enumeration<String>e = vars.keys(); while(e.hasMoreElements()) { String key = (String)e.nextElement(); log.fine("Caching output variable " + key + "/" + vars.get(key)); cache.put(key, (PLUSObject)vars.get(key)); } // End while } // End cacheVariables /** * Keep track of an activity that has already been seen or used. * @param a * @param vars */ public void cacheActivity(Activity a, Hashtable <String,PLUSObject> vars) { cache.put(a, vars); } // End cacheActivity } // End Workflow