package water; import jsr166y.CountedCompleter; import java.util.Arrays; import water.H2O.H2OCountedCompleter; import water.api.schemas3.KeyV3; import water.util.ArrayUtils; import water.util.Log; /** Jobs are used to do minimal tracking of long-lifetime user actions, * including progress-bar updates and the ability to review in progress or * completed Jobs, and cancel currently running Jobs. * <p> * Jobs are {@link Keyed}, because they need to Key to control e.g. atomic updates. * <p> * Jobs are generic on Keyed, because their primary result is a Keyed result - * which is Not a Job. Obvious examples are Frames (from running Parse or * CreateFrame jobs), or Models (from running ModelBuilder jobs). * <p> * Long running tasks will has-a Job, not is-a Job. */ public final class Job<T extends Keyed> extends Keyed<Job> { /** Result Key */ public final Key<T> _result; public final int _typeid; /** User description */ public final String _description; // whether the _result key is ready for view private boolean _ready_for_view = true; private String [] _warns; public void warn(String warn) { Log.warn(warn); setWarnings(ArrayUtils.append(warns(),warn)); } public void setWarnings(final String [] warns){ new JAtomic() { @Override boolean abort(Job job) { return job._stop_requested; } @Override void update(Job job) { job._warns = warns; } }.apply(this); } /** Create a Job * @param key Key of the final result * @param clz_of_T String class of the Keyed result * @param desc String description */ public Job(Key<T> key, String clz_of_T, String desc) { super(defaultJobKey()); // Passing in a brand new Job key assert key==null || clz_of_T!=null; _result = key; // Result (destination?) key _typeid = clz_of_T==null ? 0 : TypeMap.onIce(clz_of_T); _description = desc; } // Job Keys are pinned to this node (i.e., the node that invoked the // computation), because it should be almost always updated locally private static Key<Job> defaultJobKey() { return Key.make((byte) 0, Key.JOB, false, H2O.SELF); } /** Job start_time and end_time using Sys.CTM */ private long _start_time; // Job started, or 0 if not running private long _end_time; // Job end time, or 0 if not ended // Simple internal state accessors private boolean created() { return _start_time == 0; } private boolean running() { return _start_time != 0 && _end_time == 0; } private boolean stopped() { return _end_time != 0; } // Simple state accessors; public ones do a DKV update check public long start_time() { update_from_remote(); assert !created(); return _start_time; } public long end_time() { update_from_remote(); assert stopped(); return _end_time; } public boolean isRunning() { update_from_remote(); return running(); } public boolean isStopped() { update_from_remote(); return stopped(); } // Slightly more involved state accessors public boolean isStopping(){ return isRunning() && _stop_requested; } public boolean isDone() { return isStopped() && _ex == null; } public boolean isCrashing(){ return isRunning() && _ex != null; } public boolean isCrashed (){ return isStopped() && _ex != null; } /** Current runtime; zero if not started. */ public long msec() { update_from_remote(); if( created() ) return 0; // Created, not running if( running() ) return System.currentTimeMillis() - _start_time; return _end_time - _start_time; // Stopped } public boolean readyForView() { return _ready_for_view; } public void setReadyForView(boolean ready) { _ready_for_view = ready; } /** Jobs may be requested to Stop. Each individual job will respond to this * on a best-effort basis, and make some time to stop. Stop really means * "the Job stops", but is not an indication of any kind of error or fail. * Perhaps the user simply got bored. Because it takes time to stop, a Job * may be both in state isRunning and stop_requested, and may later switch * to isStopped and stop_requested. Also, an exception may be posted. */ private volatile boolean _stop_requested; // monotonic change from false to true public boolean stop_requested() { update_from_remote(); return _stop_requested; } public void stop() { if( !_stop_requested ) // fast path cutout new JAtomic() { @Override boolean abort(Job job) { return job._stop_requested; } @Override void update(Job job) { job._stop_requested = true; } }.apply(this); } /** Any exception thrown by this Job, or null if none. Note that while * setting an exception generally triggers stopping a Job, stopping * takes time, so the Job might still be running with an exception * posted. */ private byte [] _ex; public Throwable ex() { if(_ex == null) return null; return (Throwable)AutoBuffer.javaSerializeReadPojo(_ex); } /** Total expected work. */ public long _work; // Total work to-do public long _max_runtime_msecs; private long _worked; // Work accomplished; between 0 and _work private String _msg; // Progress string /** Returns a float from 0 to 1 representing progress. Polled periodically. * Can default to returning e.g. 0 always. */ public float progress() { update_from_remote(); float regularProgress = _work==0 ? 0f : Math.min(1,(float)_worked/_work); if (_max_runtime_msecs>0) return Math.min(1,Math.max(regularProgress, (float)msec()/_max_runtime_msecs)); return regularProgress; } /** Returns last progress message. */ public String progress_msg() { update_from_remote(); return _msg; } /** Report new work done for this job */ public final void update( final long newworked, final String msg) { if( newworked > 0 || (msg != null && !msg.equals(_msg)) ) { new JAtomic() { @Override boolean abort(Job job) { return newworked==0 && ((msg==null && _msg==null) || (msg != null && msg.equals(job._msg))); } @Override void update(Job old) { old._worked += newworked; old._msg = msg; } }.apply(this); } } public final void update(final long newworked) { update(newworked,(String)null); } public static void update(final long newworked, Key<Job> jobkey) { update(newworked, null, jobkey); } public static void update(final long newworked, String msg, Key<Job> jobkey) { jobkey.get().update(newworked, msg); } // -------------- /** A system key for global list of Job keys. */ public static final Key<Job> LIST = Key.make(" JobList", (byte) 0, Key.BUILT_IN_KEY, false); public String[] warns() { update_from_remote(); return _warns; } private static class JobList extends Keyed { Key<Job>[] _jobs; JobList() { super(LIST); _jobs = new Key[0]; } private JobList(Key<Job>[]jobs) { super(LIST); _jobs = jobs; } } /** The list of all Jobs, past and present. * @return The list of all Jobs, past and present */ public static Job[] jobs() { final Value val = DKV.get(LIST); if( val==null ) return new Job[0]; JobList jl = val.get(); Job[] jobs = new Job[jl._jobs.length]; int j=0; for( int i=0; i<jl._jobs.length; i++ ) { final Value job = DKV.get(jl._jobs[i]); if( job != null ) jobs[j++] = job.get(); } if( j==jobs.length ) return jobs; // All jobs still exist jobs = Arrays.copyOf(jobs,j); // Shrink out removed Key keys[] = new Key[j]; for( int i=0; i<j; i++ ) keys[i] = jobs[i]._key; // One-shot throw-away attempt at remove dead jobs from the jobs list DKV.DputIfMatch(LIST,new Value(LIST,new JobList(keys)),val,new Futures()); return jobs; } public Job<T> start(final H2OCountedCompleter fjtask, long work, double max_runtime_secs) { _max_runtime_msecs = (long)(max_runtime_secs*1e3); return start(fjtask, work); } /** Start this task based on given top-level fork-join task representing job computation. * @param fjtask top-level job computation task. * @param work Amount of work to-do, for updating progress bar * @return this job in {@code isRunning()} state * * @see H2OCountedCompleter */ public Job<T> start(final H2OCountedCompleter fjtask, long work) { // Job does not exist in any DKV, and so does not have any global // visibility (yet). assert !new AssertNoKey(_key).doAllNodes()._found; assert created() && !running() && !stopped(); assert fjtask != null : "Starting a job with null working task is not permitted!"; assert fjtask.getCompleter() == null : "Cannot have a completer; this must be a top-level task"; // F/J rules: upon receiving an exception (the task's compute/compute2 // throws an exception caugt by F/J), the task is marked as "completing // exceptionally" - it is marked "completed" before the onExComplete logic // runs. It is then notified, and wait'ers wake up - before the // onExComplete runs; onExComplete runs on in another thread, so wait'ers // are racing with the onExComplete. // We want wait'ers to *wait* until the task's onExComplete runs, AND Job's // onExComplete runs (marking the Job as stopped, with an error). So we // add a few wrappers: // Make a wrapper class that only *starts* when the task completes - // especially it only starts even when task completes exceptionally... thus // the task onExceptionalCompletion code runs completely before Barrer1 // starts - providing a simple barrier. The Barrier1 onExComplete runs in // parallel with wait'ers on Barrier1. When Barrier1 onExComplete itself // completes, Barrier2 is notified. // Barrier2 is an empty class, and vacuously runs in parallel with wait'ers // of Barrier2 - all callers of Job.get(). _barrier = new Barrier2(); fjtask.setCompleter(new Barrier1(_barrier)); // These next steps must happen in-order: // 4 - cannot submitTask without being on job-list, lest all cores get // slammed but no user-visible record of why, so 4 after 3 // 3 - cannot be on job-list without job in DKV, lest user (briefly) see it // on list but cannot click the link & find job, so 3 after 2 // 2 - cannot be findable in DKV without job also being in running state // lest the finder be confused about the job state, so 2 after 1 // 1 - set state to running // 1 - Change state from created to running _start_time = System.currentTimeMillis(); assert !created() && running() && !stopped(); _work = work; // 2 - Save the full state of the job, first time ever making it public DKV.put(this); // Announce in DKV // 3 - Update job list final Key jobkey = _key; new TAtomic<JobList>() { @Override public JobList atomic(JobList old) { if( old == null ) old = new JobList(); Key[] jobs = old._jobs; old._jobs = Arrays.copyOf(jobs, jobs.length + 1); old._jobs[jobs.length] = jobkey; return old; } }.invoke(LIST); // 4 - Fire off the FJTASK H2O.submitTask(fjtask); return this; } transient private Barrier2 _barrier; // Top-level task to block on // Handy for assertion private static class AssertNoKey extends MRTask<AssertNoKey> { private final Key<Job> _key; boolean _found; AssertNoKey( Key<Job> key ) { _key = key; } @Override public void setupLocal() { _found = H2O.containsKey(_key); } @Override public void reduce( AssertNoKey ank ) { _found |= ank._found; } } public static class JobCancelledException extends RuntimeException {} // A simple barrier. Threads blocking on the job will block on this // "barrier" task, which will block until the fjtask runs the onCompletion or // onExceptionCompletion code. private class Barrier1 extends CountedCompleter { Barrier1(CountedCompleter cc) { super(cc,0); } @Override public void compute() { } @Override public void onCompletion(CountedCompleter caller) { new Barrier1OnCom().apply(Job.this); _barrier = null; // Free for GC } @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) { if(Job.isCancelledException(ex)) { new Barrier1OnCom().apply(Job.this); _barrier = null; } else { try { Log.err(ex); } catch (Throwable t) {/* do nothing */} new Barrier1OnExCom(ex).apply(Job.this); } _barrier = null; // Free for GC return true; } } static public boolean isCancelledException(Throwable ex) { return ex instanceof JobCancelledException || ex.getCause() != null && ex.getCause() instanceof JobCancelledException; } private static class Barrier1OnCom extends JAtomic { @Override boolean abort(Job job) { return false; } @Override public void update(Job old) { assert old._end_time==0 : "onComp should be called once at most, and never if onExComp is called"; old._end_time = System.currentTimeMillis(); if( old._worked < old._work ) old._worked = old._work; old._msg = old._stop_requested ? "Cancelled." : "Done."; } } private static class Barrier1OnExCom extends JAtomic { final byte[] _dex; Barrier1OnExCom(Throwable ex) { _dex = AutoBuffer.javaSerializeWritePojo(ex); } @Override boolean abort(Job job) { return job._ex != null && job._end_time!=0; } // Already stopped & exception'd @Override void update(Job job) { if( job._ex == null ) job._ex = _dex; // Keep first exception ever job._stop_requested = true; // Since exception set, also set stop if( job._end_time == 0 ) // Keep first end-time job._end_time = System.currentTimeMillis(); job._msg = "Failed."; } } private class Barrier2 extends CountedCompleter { @Override public void compute() { } } /** Blocks until the Job completes */ public T get() { Barrier2 bar = _barrier; if( bar != null ) // Barrier may be null if task already completed bar.join(); // Block on the *barrier* task, which blocks until the fjtask on*Completion code runs completely assert isStopped(); if (_ex!=null) throw new RuntimeException((Throwable)AutoBuffer.javaSerializeReadPojo(_ex)); // Maybe null return, if the started fjtask does not actually produce a result at this Key return _result==null ? null : _result.get(); } // -------------- // Atomic State Updaters. Atomically change state on the home node. They // also update the *this* object from the freshest remote state, meaning the // *this* object changes after these calls. // NO OTHER CHANGES HAPPEN TO JOB FIELDS. private abstract static class JAtomic extends TAtomic<Job> { void apply(Job job) { invoke(job._key); job.update_from_remote(); } abstract boolean abort(Job job); abstract void update(Job job); @Override public Job atomic(Job job) { assert job != null : "Race on creation"; if( abort(job) ) return null; update(job); return job; } } // Update the *this* object from a remote object. private void update_from_remote( ) { Job remote = DKV.getGet(_key); // Watch for changes in the DKV if( this==remote ) return; // Trivial! if( null==remote ) return; // Stay with local version boolean differ = false; if( _stop_requested != remote._stop_requested ) differ = true; if(_start_time!= remote._start_time) differ = true; if(_end_time != remote._end_time ) differ = true; if(_ex != remote._ex ) differ = true; if(_work != remote._work ) differ = true; if(_worked != remote._worked ) differ = true; if(_msg != remote._msg ) differ = true; if(_max_runtime_msecs != remote._max_runtime_msecs) differ = true; if( differ ) synchronized(this) { _stop_requested = remote._stop_requested; _start_time= remote._start_time; _end_time = remote._end_time ; _ex = remote._ex ; _work = remote._work ; _worked = remote._worked ; _msg = remote._msg ; _max_runtime_msecs = remote._max_runtime_msecs; } } @Override public Class<KeyV3.JobKeyV3> makeSchema() { return KeyV3.JobKeyV3.class; } }