package water;

import jsr166y.CountedCompleter;
import jsr166y.ForkJoinPool;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;
import water.fvec.Vec.VectorGroup;

/**
 * Map/Reduce style distributed computation.
 * <br>
 * MRTask2 provides several <code>map</code> and <code>reduce</code> methods that can be
 * overridden to specify a computation. Several instances of this class will be
 * created to distribute the computation over F/J threads and machines. Non-transient
 * fields are copied and serialized to instances created for map invocations. Reduce
 * methods can store their results in fields. Results are serialized and reduced all the
 * way back to the invoking node. When the last reduce method has been called, the fields
 * of the initial MRTask2 instance contain the computation results.
 * <br>
 * Apart from the small reduced POJO returned to the calling node, MRTask2 can
 * produce output vector(s) as a result. These will have chunks co-located
 * with the input dataset; however, their number of lines will generally
 * differ (so they won't be strictly compatible with the original). To produce
 * output vectors, call the doAll/dfork variant taking the required number of outputs
 * and override the appropriate <code>map</code> call taking the required number of
 * NewChunks. MRTask2 will automatically close the new Appendable vecs and
 * produce an output frame with the newly created Vecs.
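 * <br>
 * A minimal sketch of a typical subclass (a hypothetical example for illustration,
 * not part of this API): sum a single numeric column by overriding the one-Chunk
 * map and a reduce.
 * <pre>{@code
 * class Sum extends MRTask2<Sum> {
 *   double _sum;                        // Non-transient: reduced back to the caller
 *   @Override public void map( Chunk c ) {
 *     for( int i = 0; i < c._len; i++ ) // Sum all rows of this local chunk
 *       _sum += c.at0(i);
 *   }
 *   @Override public void reduce( Sum other ) { _sum += other._sum; }
 * }
 * double total = new Sum().doAll(vec)._sum; // Blocking call over an existing Vec 'vec'
 * }</pre>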
 */
public abstract class MRTask2<T extends MRTask2<T>> extends DTask implements Cloneable, ForkJoinPool.ManagedBlocker {
  public MRTask2() { }
  public MRTask2(H2OCountedCompleter completer){ super(completer); }

  /** The Vectors to work on. */
  public Frame _fr;
  // Appendables are treated separately (roll-ups computed in map/reduce style,
  // cannot be passed via the K/V store).
  protected AppendableVec [] _appendables;
  private int _vid;
  private int _noutputs;
  // If TRUE, run entirely local - which will pull all the data locally.
  private boolean _run_local;

  private byte _priority = H2O.MIN_PRIORITY;
  @Override public byte priority() { return _priority; }
  private void raisePriority() {
    // Always 1 higher priority than calling thread... because the caller will
    // block & burn a thread waiting for this MRTask2 to complete.
    Thread cThr = Thread.currentThread();
    _priority = (byte)((cThr instanceof H2O.FJWThr) ? ((H2O.FJWThr)cThr)._priority+1 : super.priority());
  }

  public Frame outputFrame(String [] names, String [][] domains){ return outputFrame(null,names,domains); }
  public Frame outputFrame(Key key, String [] names, String [][] domains){
    Futures fs = new Futures();
    Frame res = outputFrame(key, names, domains, fs);
    fs.blockForPending();
    return res;
  }
  public Frame outputFrame(Key key, String [] names, String [][] domains, Futures fs){
    if( _noutputs == 0 ) return null;
    Vec [] vecs = new Vec[_noutputs];
    for( int i = 0; i < _noutputs; ++i ) {
      if( _appendables==null )  // Zero rows?
        vecs[i] = _fr.anyVec().makeZero();
      else {
        _appendables[i]._domain = domains==null ? null : domains[i];
        vecs[i] = _appendables[i].close(fs);
      }
    }
    return new Frame(key,names,vecs);
  }

  /** Override with your map implementation. This overload is given a single
   *  <strong>local</strong> input Chunk. It is meant for map/reduce jobs that use a
   *  single column in an input Frame. All map variants are called, but only one is
   *  expected to be overridden. */
  public void map( Chunk c ) { }
  public void map( Chunk c, NewChunk nc ) { }

  /** Override with your map implementation. This overload is given two
   *  <strong>local</strong> Chunks. All map variants are called, but only one
   *  is expected to be overridden. */
  public void map( Chunk c0, Chunk c1 ) { }
  public void map( Chunk c0, Chunk c1, NewChunk nc ) { }
  public void map( Chunk c0, Chunk c1, NewChunk nc1, NewChunk nc2 ) { }

  /** Override with your map implementation. This overload is given three
   *  <strong>local</strong> input Chunks. All map variants are called, but only one
   *  is expected to be overridden. */
  public void map( Chunk c0, Chunk c1, Chunk c2 ) { }
  public void map( Chunk c0, Chunk c1, Chunk c2, NewChunk nc ) { }
  public void map( Chunk c0, Chunk c1, Chunk c2, NewChunk nc1, NewChunk nc2 ) { }

  /** Override with your map implementation. This overload is given an array
   *  of <strong>local</strong> input Chunks, for Frames with arbitrary column
   *  numbers. All map variants are called, but only one is expected to be
   *  overridden. */
  public void map( Chunk cs[] ) { }
  public void map( Chunk cs[], NewChunk nc ) { }
  public void map( Chunk cs[], NewChunk nc1, NewChunk nc2 ) { }
  public void map( Chunk cs[], NewChunk [] ncs ) { }

  /** Override to combine results from 'mrt' into 'this' MRTask2. Both 'this'
   *  and 'mrt' are guaranteed to either have map() run on them, or be the
   *  results of a prior reduce(). Reduce is optional if, e.g., the result is
   *  some output vector. */
  public void reduce( T mrt ) { }
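  // A minimal sketch of the output-vector variant (a hypothetical example, not
  // part of this API): request one output via doAll(1,...) and override the map
  // overload taking a NewChunk. Input and output row counts may differ.
  //
  //   class KeepPositive extends MRTask2<KeepPositive> {
  //     @Override public void map( Chunk c, NewChunk nc ) {
  //       for( int i = 0; i < c._len; i++ )
  //         if( c.at0(i) > 0 ) nc.addNum(c.at0(i)); // Emit only positive rows
  //     }
  //   }
  //   Frame out = new KeepPositive().doAll(1,vec)
  //     .outputFrame(new String[]{"positives"}, null);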
  /** Override to do any remote initialization on the 1st remote instance of
   *  this object, for initializing node-local shared data structures. */
  protected void setupLocal() {} // Load the vecs in a non-racy way (we will definitely need them, and in case we don't have a cached version there would be an unnecessary racy update from multiple maps at the same time)!
  /** Override to do any remote cleaning on the last remote instance of
   *  this object, for disposing of node-local shared data structures. */
  protected void closeLocal() { }

  /** Internal field to track a range of remote nodes/JVMs to work on */
  protected short _nxx, _nhi;         // Range of Nodes to work on - remotely
  private int addShift( int x ) { x += _nxx; int sz = H2O.CLOUD.size(); return x < sz ? x : x-sz; }
  private int subShift( int x ) { x -= _nxx; int sz = H2O.CLOUD.size(); return x < 0 ? x+sz : x; }
  /** Internal field to track the left and right remote nodes/JVMs to work on */
  transient protected RPC<T> _nleft, _nrite;
  /** Internal field to track if this is a top-level local call */
  transient protected boolean _topLocal; // Top-level local call, returning results over the wire
  /** Internal field to track a range of local Chunks to work on */
  transient protected int _lo, _hi;   // Range of Chunks to work on - locally
  /** Internal field to track the left and right sub-range of chunks to work on */
  transient protected T _left, _rite; // In-progress execution tree
  transient private T _res;           // Result

  /** We can add more things to block on - in case we want a bunch of lazy
   *  tasks produced by children to all end before this top-level task ends.
   *  Semantically, these will all complete before we return from the top-level
   *  task. Pragmatically, we block on a finer grained basis. */
  transient protected Futures _fs;    // More things to block on

  // Profiling support. Time for each subpart of a single M/R task, plus any
  // nested MRTasks. All numbers are CTM stamps or millisecond times.
  private static class MRProfile extends Iced {
    String _clz;
    public MRProfile(MRTask2 mrt) {
      _clz = mrt.getClass().toString();
      _localdone = System.currentTimeMillis();
    }
    // See where these are set to understand their meaning. If we split the
    // job, then _lstart & _rstart are the start of left & right jobs. If we
    // do NOT split, then _rstart is 0 and _lstart is for the user map job(s).
    long _localstart, _rpcLstart, _rpcRstart, _rpcRdone, _localdone; // Local setup, RPC network i/o times
    long _mapstart, _userstart, _closestart, _mapdone;               // MAP phase
    long _onCstart, _reducedone, _remoteBlkDone, _localBlkDone, _onCdone; // REDUCE phase
    // If we split the job left/right, then we get a total recording of the
    // last job, and the exec time & completion time of 1st job done.
    long _time1st, _done1st;
    int _size_rez0, _size_rez1;       // i/o size in bytes during reduce
    MRProfile _last;
    long sumTime() { return _onCdone - (_localstart==0 ? _mapstart : _localstart); }
    void gather( MRProfile p, int size_rez ) {
      p._clz=null;
      if( _last == null ) _last=p;
      else {
        MRProfile first = _last._onCdone <= p._onCdone ? _last : p;
        MRProfile last  = _last._onCdone >  p._onCdone ? _last : p;
        _last = last;
        if( first._onCdone > _done1st ) { _time1st = first.sumTime(); _done1st = first._onCdone; }
      }
      if( size_rez !=0 )              // Record i/o result size
        if( _size_rez0 == 0 ) { _size_rez0=size_rez; }
        else { /*assert _size_rez1==0;*/ _size_rez1=size_rez; }
      assert _last._onCdone >= _done1st;
    }
    @Override public String toString() { return toString(new StringBuilder(),0).toString(); }
    private StringBuilder toString(StringBuilder sb, int d) {
      if( d==0 ) sb.append(_clz).append("\n");
      for( int i=0; i<d; i++ ) sb.append(" ");
      if( _localstart != 0 ) sb.append("Node local ").append(_localdone - _localstart).append("ms, ");
      if( _userstart == 0 ) {         // Forked job?
        sb.append("Slow wait ").append(_mapstart-_localdone).append("ms + work ").append(_last.sumTime()).append("ms, ");
        sb.append("Fast work ").append(_time1st).append("ms + wait ").append(_onCstart-_done1st).append("ms\n");
        _last.toString(sb,d+1);       // Nested slow-path print
        for( int i=0; i<d; i++ ) sb.append(" ");
        sb.append("join-i/o ").append(_onCstart-_last._onCdone).append("ms, ");
      } else {                        // Leaf map call?
        sb.append("Map ").append(_mapdone - _mapstart).append("ms (prep ").append(_userstart - _mapstart);
        sb.append("ms, user ").append(_closestart-_userstart);
        sb.append("ms, closeChk ").append(_mapdone-_closestart).append("ms), ");
      }
      sb.append("Red ").append(_onCdone - _onCstart).append("ms (locRed ");
      sb.append(_reducedone-_onCstart).append("ms");
      if( _remoteBlkDone!=0 ) {
        sb.append(", remBlk ").append(_remoteBlkDone-_reducedone).append("ms, locBlk ");
        sb.append(_localBlkDone-_remoteBlkDone).append("ms, close ");
        sb.append(_onCdone-_localBlkDone).append("ms, size ");
        sb.append(PrettyPrint.bytes(_size_rez0)).append("+").append(PrettyPrint.bytes(_size_rez1));
      }
      sb.append(")\n");
      return sb;
    }
  }
  MRProfile _profile;
  public String profString() { return _profile.toString(); }
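  // Example (hypothetical): after a blocking run, the gathered profile can be
  // printed to see per-phase timings (local setup, RPC i/o, map, reduce), one
  // indented line per level of the split tree:
  //
  //   Sum s = new Sum().doAll(vec);       // 'Sum' as sketched in the class Javadoc
  //   System.out.println(s.profString());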
  // Support for fluid-programming with strong types
  private final T self() { return (T)this; }

  /** Returns a Vec from the Frame. */
  public final Vec vecs(int i) { return _fr.vecs()[i]; }

  /** Invokes the map/reduce computation over the given Vecs. This call is
   *  blocking. */
  public final T doAll( Vec... vecs ) { return doAll(0,vecs); }
  public final T doAll( int outputs, Vec... vecs ) { return doAll(outputs,new Frame(null,vecs), false); }

  /** Invokes the map/reduce computation over the given Frame. This call is
   *  blocking. */
  public final T doAll( Frame fr, boolean run_local ) { return doAll(0,fr, run_local); }
  public final T doAll( Frame fr ) { return doAll(0,fr, false); }
  public final T doAll( int outputs, Frame fr ) { return doAll(outputs,fr,false); }
  public final T doAll( int outputs, Frame fr, boolean run_local ) {
    dfork(outputs,fr, run_local);
    return getResult();
  }

  public final T asyncExec(Vec... vecs){ asyncExec(0,new Frame(vecs),false); return self(); }
  public final void exec(Vec... vecs){ exec(0, new Frame(vecs), false); }
  public final void asyncExec(Frame fr){ asyncExec(0,fr,false); }
  public final void exec(Frame fr){ exec(0, fr, false); }
  public final void exec( int outputs, Frame fr, boolean run_local ){
    // Use first readable vector to gate home/not-home
    fr.checkCompatible();       // Check for compatible vectors
    if( (_noutputs = outputs) > 0 ) _vid = fr.anyVec().group().reserveKeys(outputs);
    _fr = fr;                   // Record vectors to work on
    _nxx = (short)H2O.SELF.index(); _nhi = (short)H2O.CLOUD.size(); // Do Whole Cloud
    _run_local = run_local;     // Run locally by copying data, or run globally?
    setupLocal0();              // Local setup
    compute2();
  }

  /** Fork the task in strictly non-blocking fashion.
   *
   *  Same functionality as dfork, but does not raise priority, so the user
   *  should *never* block on it. */
  public final void asyncExec( int outputs, Frame fr, boolean run_local ){
    // Use first readable vector to gate home/not-home
    fr.checkCompatible();       // Check for compatible vectors
    if( (_noutputs = outputs) > 0 ) _vid = fr.anyVec().group().reserveKeys(outputs);
    _fr = fr;                   // Record vectors to work on
    _nxx = (short)H2O.SELF.index(); _nhi = (short)H2O.CLOUD.size(); // Do Whole Cloud
    _run_local = run_local;     // Run locally by copying data, or run globally?
    setupLocal0();              // Local setup
    H2O.submitTask(this);       // Begin normal execution on a FJ thread
  }

  /** Invokes the map/reduce computation over the given Frame. This call is
   *  asynchronous. It returns 'this', on which getResult() can be invoked
   *  later to wait on the computation. */
  public final T dfork( Vec...vecs ) { return dfork(0,vecs); }
  public T dfork( Frame fr ) { return dfork(0,fr,false); }
  public final T dfork( int outputs, Vec... vecs ) { return dfork(outputs,new Frame(vecs),false); }
  public final T dfork( int outputs, Frame fr, boolean run_local ) {
    raisePriority();
    asyncExec(outputs,fr,run_local);
    return self();
  }

  /** Block for and get any final results from a dfork'd MRTask2.
   *  Note: the desired name 'get' is final in ForkJoinTask. */
  public final T getResult() {
    try {
      try { ForkJoinPool.managedBlock(this); } catch( InterruptedException e ) { }
      return self();
    } catch( Throwable t ) {
      throw new RuntimeException(t);
    }
  }

  // Return true if blocking is unnecessary, which is true if the Task isDone.
  public boolean isReleasable() { return isDone(); }
  // Possibly blocks the current thread. Returns true if isReleasable would
  // return true. Used by the FJ Pool management to spawn threads to prevent
  // deadlock if otherwise all threads would block on waits.
  public boolean block() {
    while( !isDone() ) join();
    return true;
  }

  /** Called once on remote at top level, probably with a subset of the cloud.
   *  Called internally by D/F/J. Not expected to be user-called. */
  @Override public final void dinvoke(H2ONode sender) {
    setupLocal0();              // Local setup
    compute2();                 // Do The Main Work
    // nothing here... must do any post-work-cleanup in onCompletion
  }
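  // Sketch of the asynchronous pattern (using the hypothetical 'Sum' subclass
  // from the class Javadoc): dfork starts the computation and returns
  // immediately; getResult() blocks the caller (via ManagedBlocker) until the
  // distributed reduce completes.
  //
  //   Sum s = new Sum().dfork(fr);        // Fork; keep working on this thread
  //   /* ... do other work ... */
  //   double total = s.getResult()._sum;  // Block for the reduced result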
  // Setup for local work: fire off any global work to cloud neighbors; do all
  // chunks; call user's init.
  private final void setupLocal0() {
    assert _profile==null;
    _fs = new Futures();
    _profile = new MRProfile(this);
    _profile._localstart = System.currentTimeMillis();
    _topLocal = true;
    // Check for global vs local work
    int selfidx = H2O.SELF.index();
    int nlo = subShift(selfidx);
    assert nlo < _nhi;
    final int nmid = (nlo+_nhi)>>>1;    // Mid-point
    if( !_run_local && nlo+1 < _nhi ) { // Have global work?
      _profile._rpcLstart = System.currentTimeMillis();
      _nleft = remote_compute(nlo+1,nmid);
      _profile._rpcRstart = System.currentTimeMillis();
      _nrite = remote_compute( nmid,_nhi);
      _profile._rpcRdone  = System.currentTimeMillis();
    }
    _lo = 0;  _hi = _fr.anyVec().nChunks(); // Do All Chunks
    // If we have any output vectors, make a blockable Futures for them to
    // block on.
    // Get the Vecs from the K/V store, to avoid racing fetches from the map calls
    _fr.vecs();
    setupLocal();               // Setup any user's shared local structures
    _profile._localdone = System.currentTimeMillis();
  }

  // Make an RPC call to some node in the middle of the given range. Add a
  // pending completion to self, so that we complete when the RPC completes.
  private final RPC<T> remote_compute( int nlo, int nhi ) {
    // No remote work?
    if( !(nlo < nhi) ) return null;
    int node = addShift(nlo);
    assert node != H2O.SELF.index();
    T rpc = clone();
    rpc.setCompleter(null);
    rpc._nhi = (short)nhi;
    addToPendingCount(1);       // Not complete until the RPC returns
    // Set self up as needing completion by this RPC: when the ACK comes back
    // we'll get a wakeup.
    return new RPC(H2O.CLOUD._memary[node], rpc).addCompleter(this).call();
  }
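  // Illustration (assuming an 8-node cloud, self at index 0): setupLocal0
  // computes nmid = (0+8)>>>1 = 4, so the left RPC covers nodes [1,4) and the
  // right RPC covers [4,8). Each remote node repeats the split on its own
  // sub-range, giving a binary tree of RPCs that is O(log n) deep rather than
  // a linear fan-out from the calling node.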
  protected long _t0;
  /** Called from FJ threads to do local work. The first called Task (which is
   *  also the last one to Complete) also reduces any global work. Called
   *  internally by F/J. Not expected to be user-called. */
  @Override public final void compute2() {
    _t0 = System.nanoTime();
    assert _left == null && _rite == null && _res == null;
    _profile._mapstart = System.currentTimeMillis();
    if( _hi-_lo >= 2 ) {             // Multi-chunk case: just divide-and-conquer to 1 chunk
      final int mid = (_lo+_hi)>>>1; // Mid-point
      _left = clone();
      _rite = clone();
      _left._profile = new MRProfile(this);
      _rite._profile = new MRProfile(this);
      _left._hi = mid;          // Reset mid-point
      _rite._lo = mid;          // Also set self mid-point
      addToPendingCount(1);     // One fork awaiting completion
      _left.fork();             // Runs in another thread/FJ instance
      _rite.compute2();         // Runs in THIS F/J thread
      _profile._mapdone = System.currentTimeMillis();
      return;                   // Not complete until the fork completes
    }
    // Zero or 1 chunks, and further chunk might not be homed here
    if( _hi > _lo ) {           // Single chunk?
      Vec v0 = _fr.anyVec();
      if( _run_local || v0.chunkKey(_lo).home() ) { // And chunk is homed here?
        // Make decompression chunk headers for these chunks
        Vec vecs[] = _fr.vecs();
        Chunk bvs[] = new Chunk[vecs.length];
        NewChunk [] appendableChunks = null;
        for( int i=0; i<vecs.length; i++ )
          if( vecs[i] != null ) {
            assert _run_local || vecs[i].chunkKey(_lo).home()
              : "Chunk="+_lo+" v0="+v0+", k="+v0.chunkKey(_lo)+" v["+i+"]="+vecs[i]+", k="+vecs[i].chunkKey(_lo);
            try {
              bvs[i] = vecs[i].chunkForChunkIdx(_lo);
            } catch( Throwable t ) {
              System.err.println("missing chunk in MRTask " + getClass().getName());
              t.printStackTrace();
              throw new RuntimeException(t);
            }
          }
        if( _noutputs > 0 ) {
          final VectorGroup vg = vecs[0].group();
          _appendables = new AppendableVec[_noutputs];
          appendableChunks = new NewChunk[_noutputs];
          for( int i = 0; i < _appendables.length; ++i ) {
            _appendables[i] = new AppendableVec(vg.vecKey(_vid+i),new long[4],0);
            appendableChunks[i] = (NewChunk)_appendables[i].chunkForChunkIdx(_lo);
          }
        }
        // Call all the various map() calls that apply
        _profile._userstart = System.currentTimeMillis();
        if( _fr.vecs().length == 1 ) map(bvs[0]);
        if( _fr.vecs().length == 2 ) map(bvs[0], bvs[1]);
        if( _fr.vecs().length == 3 ) map(bvs[0], bvs[1], bvs[2]);
        if( true                   ) map(bvs);
        if( _noutputs == 1 ) { // Convenience versions for cases with a single output.
          if( _fr.vecs().length == 1 ) map(bvs[0], appendableChunks[0]);
          if( _fr.vecs().length == 2 ) map(bvs[0], bvs[1], appendableChunks[0]);
          if( _fr.vecs().length == 3 ) map(bvs[0], bvs[1], bvs[2], appendableChunks[0]);
          if( true                   ) map(bvs, appendableChunks[0]);
        }
        if( _noutputs == 2 ) { // Convenience versions for cases with 2 outputs (e.g. split).
          if( _fr.vecs().length == 1 ) map(bvs[0], appendableChunks[0], appendableChunks[1]);
          if( _fr.vecs().length == 2 ) map(bvs[0], bvs[1], appendableChunks[0], appendableChunks[1]);
          if( _fr.vecs().length == 3 ) map(bvs[0], bvs[1], bvs[2], appendableChunks[0], appendableChunks[1]);
          if( true                   ) map(bvs, appendableChunks[0], appendableChunks[1]);
        }
        map(bvs,appendableChunks);
        _res = self();          // Save results since called map() at least once!
        // Further D/K/V put any new vec results.
        _profile._closestart = System.currentTimeMillis();
        for( Chunk bv : bvs ) bv.close(_lo,_fs);
        if( _noutputs > 0 ) for( NewChunk nch : appendableChunks ) nch.close(_lo,_fs);
      }
    }
    _profile._mapdone = System.currentTimeMillis();
    tryComplete();              // And this task is complete
  }

  /** OnCompletion - reduce the left and right into self. Called internally by
   *  F/J. Not expected to be user-called. */
  @Override public final void onCompletion( CountedCompleter caller ) {
    _profile._onCstart = System.currentTimeMillis();
    // Reduce results into 'this' so they collapse going up the execution tree.
    // NULL out child-references so we don't accidentally keep large subtrees
    // alive since each one may be holding large partial results.
    reduce2(_left); _left = null;
    reduce2(_rite); _rite = null;
    // Only on the top local call, have more completion work
    _profile._reducedone = System.currentTimeMillis();
    if( _topLocal ) postLocal();
    _profile._onCdone = System.currentTimeMillis();
  }

  // Call 'reduce' on pairs of mapped MRTask2's.
  // Collect all pending Futures from both parties as well.
  private void reduce2( MRTask2<T> mrt ) {
    if( mrt == null ) return;
    _profile.gather(mrt._profile,0);
    if( _res == null ) _res = mrt._res;
    else if( mrt._res != null ) _res.reduce4(mrt._res);
    // Futures are shared on local node and transient (so no remote updates)
    assert _fs == mrt._fs;
  }

  protected void postGlobal(){}
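  // Override hook sketch (a hypothetical example): postGlobal runs exactly once,
  // on the calling node, after all local and remote reduces have completed - a
  // natural place to finalize a reduction, e.g. turning a sum into a mean:
  //
  //   class Mean extends MRTask2<Mean> {
  //     double _sum; long _rows; double _mean;
  //     @Override public void map( Chunk c ) {
  //       for( int i = 0; i < c._len; i++ ) { _sum += c.at0(i); _rows++; }
  //     }
  //     @Override public void reduce( Mean o ) { _sum += o._sum; _rows += o._rows; }
  //     @Override protected void postGlobal() { _mean = _sum / _rows; }
  //   }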
  // Work done after all the main local work is done.
  // Gather/reduce remote work.
  // Block for other queued pending tasks.
  // Copy any final results into 'this', such that a return of 'this' has the results.
  private final void postLocal() {
    reduce3(_nleft);            // Reduce global results from neighbors.
    reduce3(_nrite);
    _profile._remoteBlkDone = System.currentTimeMillis();
    _fs.blockForPending();
    _profile._localBlkDone = System.currentTimeMillis();
    // Finally, must return all results in 'this' because that is the API -
    // what the user expects
    int nlo = subShift(H2O.SELF.index());
    int nhi = _nhi;             // Save before copyOver crushes them
    if( _res == null ) _nhi=-1; // Flag for no local results *at all*
    else if( _res != this ) {   // There is a local result, and it's not self
      _res._profile = _profile; // Use my profile (not the child's)
      copyOver(_res);           // So copy into self
    }
    closeLocal();
    if( nlo==0 && nhi == H2O.CLOUD.size() ) {
      // Do any post-writing work (zap rollup fields, etc)
      _fr.reloadVecs();
      for( int i=0; i<_fr.numCols(); i++ )
        _fr.vecs()[i].postWrite();
      postGlobal();
    }
  }

  // Block for RPCs to complete, then reduce global results into self results
  private void reduce3( RPC<T> rpc ) {
    if( rpc == null ) return;
    T mrt = rpc.get();          // This is a blocking remote call
    // Note: because _fs is transient it is not set or cleared by the RPC.
    // Because the MRT object is a clone of 'self' it's likely to contain a ptr
    // to the self _fs which will be not-null and still have local pending
    // blocks. Not much can be asserted there.
    _profile.gather(mrt._profile, rpc.size_rez());
    // Unlike reduce2, results are in mrt directly, not mrt._res.
    if( mrt._nhi != -1L ) {     // Any results at all?
      if( _res == null ) _res = mrt;
      else _res.reduce4(mrt);
    }
  }

  /** Call user's reduction. Also reduce any new AppendableVecs. Called
   *  internally by F/J. Not expected to be user-called. */
  protected void reduce4( T mrt ) {
    // Reduce any AppendableVecs
    if( _noutputs > 0 )
      for( int i=0; i<_appendables.length; i++ )
        _appendables[i].reduce(mrt._appendables[i]);
    // User's reduction
    reduce(mrt);
  }

  /** Cancel/kill all work as we can, then rethrow... do not invisibly swallow
   *  exceptions (which is the F/J default). Called internally by F/J. Not
   *  expected to be user-called. */
  @Override public final boolean onExceptionalCompletion( Throwable ex, CountedCompleter caller ) {
    //if( _nleft != null ) _nleft.cancel(true); _nleft = null;
    //if( _nrite != null ) _nrite.cancel(true); _nrite = null;
    //if( _left  != null ) _left .cancel(true); _left  = null;
    //if( _rite  != null ) _rite .cancel(true); _rite  = null;
    _nleft = _nrite = null;
    _left = _rite = null;
    return super.onExceptionalCompletion(ex, caller);
  }

  /** Local Clone - setting final-field completer */
  @Override public T clone() {
    T x = (T)super.clone();
    x.setCompleter(this);       // Set completer, what used to be a final field
    x._topLocal = false;        // Not a top job
    x._nleft = x._nrite = null;
    x._left  = x._rite  = null;
    x._fs = _fs;
    x._profile = null;          // Clone needs its own profile
    x.setPendingCount(0);       // Volatile write for completer field; reset pending count also
    return x;
  }
}