package hex.tree;

import hex.genmodel.utils.DistributionFamily;
import jsr166y.CountedCompleter;
import water.*;
import water.fvec.*;
import water.util.ArrayUtils;
import water.util.IcedBitSet;
import water.util.VecUtils;

import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Created by tomas on 10/28/16.
 *
 * Score and Build Histogram.
 *
 * This is an updated version ditching histogram sharing (still optional) to improve performance
 * on multi-cpu systems (witnessed speedups of up to 4x).
 *
 * NOTE: unlike a standard MRTask, launch via dfork2 instead of doAll/dfork. Uses a custom
 * 2-phase local map-reduce task.
 *
 * <p>Fuses 2 conceptual passes into one (MRTask):
 *
 * <dl>
 *
 * <dt>Pass 1:</dt><dd>Score a prior partially-built tree model, and make new Node assignments to
 * every row. This involves pulling out the currently assigned DecidedNode,
 * "scoring" the row against that Node's decision criteria, and assigning the
 * row to a new child UndecidedNode (and giving it an improved prediction).</dd>
 *
 * <dt>Pass 2:</dt><dd>Build new summary DHistograms on the new child UndecidedNodes
 * every row got assigned into. Collect counts, mean, variance, min,
 * max per bin, per column.</dd>
 * </dl>
 *
 * The 2 passes are executed (locally) in sequence.
 *
 * <p>The result is a set of DHistogram arrays; one DHistogram array for each
 * unique 'leaf' in the tree being histogrammed in parallel. These have node
 * IDs (nids) from 'leaf' to 'tree._len'. Each DHistogram array is for all
 * the columns in that 'leaf'.
 *
 * <p>The other result is a prediction "score" for the whole dataset, based on
 * the previous passes' DHistograms.
 *
 * No CAS update:
 *
 * Sharing the histograms proved to be a performance problem on larger multi-cpu machines with
 * many running threads; CAS was the bottleneck.
 *
 * To remove the CAS while minimizing the memory overhead (private copies of histograms), phase 2
 * is parallelized both over columns (primary) and rows (secondary). Parallelization over
 * different columns precedes parallelization within each column, to reduce the number of extra
 * histogram copies made.
 *
 * The expected number of per-column tasks running in parallel (and hence histogram copies) is
 * given by
 *
 *   exp(nthreads-per-column) = max(1, H2O.NUMCPUS - num_cols)
 */
public class ScoreBuildHistogram2 extends ScoreBuildHistogram {
  transient int[] _cids;
  transient Chunk[][] _chks;
  transient double[][] _ys;
  transient double[][] _ws;
  transient int[][] _nhs;
  transient int[][] _rss;

  Frame _fr2;

  final int _numLeafs;
  final IcedBitSet _activeCols;

  public ScoreBuildHistogram2(H2O.H2OCountedCompleter cc, int k, int ncols, int nbins, int nbins_cats,
                              DTree tree, int leaf, DHistogram[][] hcs, DistributionFamily family,
                              int weightIdx, int workIdx, int nidIdx) {
    super(cc, k, ncols, nbins, nbins_cats, tree, leaf, hcs, family, weightIdx, workIdx, nidIdx);
    _numLeafs = _hcs.length;
    int hcslen = _hcs.length;
    IcedBitSet activeCols = new IcedBitSet(ncols);
    for (int n = 0; n < hcslen; n++) {
      int[] acs = _tree.undecided(n + _leaf)._scoreCols;
      if (acs != null) {
        for (int c : acs) // Columns to score (null, or a list of selected cols)
          activeCols.set(c);
      } else {
        activeCols = null;
        break;
      }
    }
    _activeCols = activeCols;
    _hcs = ArrayUtils.transpose(_hcs);
  }

  @Override
  public ScoreBuildHistogram dfork2(byte[] types, Frame fr, boolean run_local) {
    _fr2 = fr;
    dfork((Key[]) null);
    return this;
  }
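  // A minimal launch sketch (hypothetical caller code, not part of the original class):
  // unlike a plain MRTask, the task is started via dfork2 and joined via getResult(), e.g.
  //
  //   ScoreBuildHistogram2 sbh = new ScoreBuildHistogram2(cc, k, ncols, nbins, nbins_cats,
  //       tree, leaf, hcs, family, weightIdx, workIdx, nidIdx);
  //   sbh.dfork2(null, fr, true).getResult();  // 'types' and 'run_local' are unused here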
  @Override
  public void map(Chunk[] chks) {
    // Even though this is an MRTask over a Frame, map(Chunk[] chks) should not be called for
    // this task. Instead, we do a custom 2-stage local pass (launched from setupLocal) using
    // LocalMR.
    //
    // There are 2 reasons for that:
    //   a) We have 2 local passes. The 1st pass scores the trees and sorts rows; the 2nd pass
    //      starts after the 1st pass is done and computes the histograms. Conceptually these
    //      are two tasks, but since we do not need a global result we want to do the two passes
    //      inside of 1 task - no need to insert extra communication overhead here.
    //   b) To reduce the memory overhead in pass 2 (in case we're making private DHistogram
    //      copies). There is a private copy made for each task, and MRTask forks one task per
    //      line of chunks - we do not want to make too many copies. By reusing the same
    //      DHistogram for multiple chunks we save memory and calls to reduce.
    throw H2O.unimpl();
  }

  // Pass 1: Score a prior partially-built tree model, and make new Node
  // assignments to every row. This involves pulling out the currently
  // assigned DecidedNode, "scoring" the row against that Node's decision
  // criteria, and assigning the row to a new child UndecidedNode (and
  // giving it an improved prediction).
  protected int[] score_decide(Chunk[] chks, int[] nnids) {
    int[] res = nnids.clone();
    for (int row = 0; row < nnids.length; row++) { // Over all rows
      int nid = nnids[row];                        // Get Node to decide from
      if (isDecidedRow(nid)) {                     // already done
        res[row] -= _leaf;
        continue;
      }
      // Score row against current decisions & assign new split
      boolean oob = isOOBRow(nid);
      if (oob) nid = oob2Nid(nid); // sampled away - we track the position in the tree
      DTree.DecidedNode dn = _tree.decided(nid);
      if (dn._split == null) { // Might have a leftover non-split
        if (DTree.isRootNode(dn)) {
          res[row] = nid - _leaf;
          continue;
        }
        nid = dn._pid; // Use the parent split decision then
        int xnid = oob ? nid2Oob(nid) : nid;
        nnids[row] = xnid;
        res[row] = xnid - _leaf;
        dn = _tree.decided(nid); // Parent steers us
      }
      assert !isDecidedRow(nid);
      nid = dn.getChildNodeID(chks, row); // Move down the tree 1 level
      if (!isDecidedRow(nid)) {
        if (oob) nid = nid2Oob(nid); // Re-apply OOB encoding
        nnids[row] = nid;
      }
      res[row] = nid - _leaf;
    }
    return res;
  }
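  // Worked example of the encoding above (illustrative numbers): with _leaf == 3, a row that
  // lands in tree node nid == 5 gets res[row] == 5 - 3 == 2, the index of the corresponding
  // new UndecidedNode / DHistogram array. OOB rows are re-encoded via nid2Oob, keeping their
  // res[] entry negative, so the pass-2 bucketing in setupLocal only bins rows with
  // res[row] >= 0.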
  @Override
  public void setupLocal() {
    addToPendingCount(1);
    // Init all the internal tree fields after shipping over the wire
    _tree.init_tree();
    Vec v = _fr2.anyVec();
    assert v != null;
    _cids = VecUtils.getLocalChunkIds(v);
    _chks = new Chunk[_cids.length][_fr2.numCols()];
    _ys = new double[_cids.length][];
    _ws = new double[_cids.length][];
    _nhs = new int[_cids.length][];
    _rss = new int[_cids.length][];
    long[] espc = v.espc();
    int largestChunkSz = 0;
    for (int i = 1; i < espc.length; ++i) {
      int sz = (int) (espc[i] - espc[i - 1]);
      if (sz > largestChunkSz) largestChunkSz = sz;
    }
    final int fLargestChunkSz = largestChunkSz;
    if (_weightIdx == -1) { // no weight column => share one all-ones array
      double[] ws = new double[largestChunkSz];
      Arrays.fill(ws, 1);
      Arrays.fill(_ws, ws);
    }
    final AtomicInteger cidx = new AtomicInteger(0);
    // First do the phase 1 on all local data
    new LocalMR(new MrFun() {
      // more or less copied from ScoreBuildHistogram
      private void map(int id, Chunk[] chks) {
        final C4VolatileChunk nids = (C4VolatileChunk) chks[_nidIdx];
        // Pass 1: Score a prior partially-built tree model, and make new Node
        // assignments to every row. This involves pulling out the currently
        // assigned DecidedNode, "scoring" the row against that Node's decision
        // criteria, and assigning the row to a new child UndecidedNode (and
        // giving it an improved prediction).
        int[] nnids;
        if (_leaf > 0) // Prior pass exists?
          nnids = score_decide(chks, nids.getValues());
        else { // Just flag all the NA rows
          nnids = new int[nids._len];
          int[] is = nids.getValues();
          for (int row = 0; row < nids._len; row++) {
            if (isDecidedRow(is[row])) nnids[row] = DECIDED_ROW;
          }
        }
        // Pass 2 prep: accumulate all rows, cols into histograms.
        // Sort the rows by NID, so we visit all the same NIDs in a row.
        // Find the count of unique NIDs in this chunk
        int[] nh = (_nhs[id] = new int[_numLeafs + 1]);
        for (int i : nnids)
          if (i >= 0) nh[i + 1]++;
        // Rollup the histogram of rows-per-NID in this chunk
        for (int i = 0; i < _numLeafs; i++) nh[i + 1] += nh[i];
        // Splat the rows into NID-groups
        int[] rows = (_rss[id] = new int[nnids.length]);
        for (int row = 0; row < nnids.length; row++)
          if (nnids[row] >= 0)
            rows[nh[nnids[row]]++] = row;
      }

      @Override
      protected void map(int id) {
        Vec[] vecs = _fr2.vecs();
        for (id = cidx.getAndIncrement(); id < _cids.length; id = cidx.getAndIncrement()) {
          int cidx = _cids[id];
          Chunk[] chks = _chks[id];
          for (int i = 0; i < chks.length; ++i)
            chks[i] = vecs[i].chunkForChunkIdx(cidx);
          map(id, chks);
          chks[_nidIdx].close(cidx, _fs);
          Chunk resChk = chks[_workIdx];
          int len = resChk.len();
          if (resChk instanceof C8DVolatileChunk) {
            _ys[id] = ((C8DVolatileChunk) resChk).getValues();
          } else
            _ys[id] = resChk.getDoubles(MemoryManager.malloc8d(len), 0, len);
          if (_weightIdx != -1) {
            _ws[id] = chks[_weightIdx].getDoubles(MemoryManager.malloc8d(len), 0, len);
          }
        }
      }
    }, new H2O.H2OCountedCompleter(this) {
      @Override
      public void onCompletion(CountedCompleter cc) {
        final int ncols = _ncols;
        final int[] active_cols = _activeCols == null ? null : new int[Math.max(1, _activeCols.cardinality())];
        int nactive_cols = active_cols == null ? ncols : active_cols.length;
        final int numWrks = _hcs.length * nactive_cols < 16 * 1024
            ? H2O.NUMCPUS
            : Math.min(H2O.NUMCPUS, Math.max(4 * H2O.NUMCPUS / nactive_cols, 1));
        final int rem = H2O.NUMCPUS - numWrks * ncols;
        ScoreBuildHistogram2.this.addToPendingCount(1 + nactive_cols);
        if (active_cols != null) {
          int j = 0;
          for (int i = 0; i < ncols; ++i)
            if (_activeCols.contains(i)) active_cols[j++] = i;
        }
        // MRTask (over columns) launching MrTasks (over the number of workers) for each column.
        // We want FJ to start processing all the columns before parallelizing within each
        // column, to reduce memory overhead (running a single column in n threads means
        // n copies of the histogram).
        //
        // This is how it works:
        //   1) The outer MRTask walks down its tree, forking tasks with an exponentially
        //      decreasing number of columns until reaching its leftmost leaf for column 0.
        //      At this point, the local FJ queue for this thread has a task for processing
        //      half of the columns at the bottom, followed by a task for 1/4 of the columns,
        //      and so on. Other threads start stealing work from the bottom.
        //   2) It forks the leaf task and (because it is polling from the top) executes the
        //      LocalMR for column 0.
        // This way the columns should be as equally distributed as possible without resorting
        // to a shared priority queue.
        new LocalMR(new MrFun() {
          @Override
          protected void map(int c) {
            c = active_cols == null ? c : active_cols[c];
            new LocalMR(new ComputeHistoThread(_hcs.length == 0 ? new DHistogram[0] : _hcs[c], c, fLargestChunkSz, new AtomicInteger()),
                numWrks + (c < rem ? 1 : 0), ScoreBuildHistogram2.this).fork();
          }
        }, nactive_cols, ScoreBuildHistogram2.this).fork();
      }
    }).fork();
  }
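  // Worked example of the pass-2 bucketing done in setupLocal above (illustrative numbers):
  // for a chunk with nnids = [1, 0, 1, 2, 0] and _numLeafs == 3,
  //   counts : nh   = [0, 2, 2, 1]     (nh[i+1] counts the rows in leaf i)
  //   rollup : nh   = [0, 2, 4, 5]     (exclusive prefix sums)
  //   splat  : rows = [1, 4, 0, 2, 3]  (row indices grouped by leaf)
  // after which nh[n] is the exclusive end offset of leaf n's rows, matching the
  // hi = nh[n], lo = (n == 0 ? 0 : nh[n-1]) ranges used in computeChunk below.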
  private static void mergeHistos(DHistogram[] hcs, DHistogram[] hcs2) {
    // Distributed histograms need a little work
    for (int i = 0; i < hcs.length; i++) {
      DHistogram hs1 = hcs[i], hs2 = hcs2[i];
      if (hs1 == null) hcs[i] = hs2;
      else if (hs2 != null) hs1.add(hs2);
    }
  }

  private class ComputeHistoThread extends MrFun<ComputeHistoThread> {
    final int _maxChunkSz;
    final int _col;
    final DHistogram[] _lh;
    AtomicInteger _cidx;
    private boolean _done;

    public boolean isDone() { return _done || (_done = _cidx.get() >= _cids.length); }

    ComputeHistoThread(DHistogram[] hcs, int col, int maxChunkSz, AtomicInteger cidx) {
      _lh = hcs;
      _col = col;
      _maxChunkSz = maxChunkSz;
      _cidx = cidx;
    }

    @Override
    public ComputeHistoThread makeCopy() {
      return new ComputeHistoThread(ArrayUtils.deepClone(_lh), _col, _maxChunkSz, _cidx);
    }

    @Override
    protected void map(int id) {
      double[] cs = null;
      for (int i = _cidx.getAndIncrement(); i < _cids.length; i = _cidx.getAndIncrement()) {
        if (cs == null) cs = MemoryManager.malloc8d(_maxChunkSz);
        computeChunk(i, cs, _ws[i]);
      }
    }

    private void computeChunk(int id, double[] cs, double[] ws) {
      int[] nh = _nhs[id];
      int[] rs = _rss[id];
      Chunk resChk = _chks[id][_workIdx];
      int len = resChk._len;
      double[] ys = ScoreBuildHistogram2.this._ys[id];
      if (_weightIdx != -1)
        _chks[id][_weightIdx].getDoubles(ws, 0, len);
      final int hcslen = _lh.length;
      boolean extracted = false;
      for (int n = 0; n < hcslen; n++) {
        int[] sCols = _tree.undecided(n + _leaf)._scoreCols; // Columns to score (null, or a list of selected cols)
        if (sCols == null || ArrayUtils.find(sCols, _col) >= 0) {
          DHistogram h = _lh[n];
          int hi = nh[n];
          int lo = (n == 0 ? 0 : nh[n - 1]);
          if (hi == lo || h == null) continue; // Ignore untracked columns in this split
          if (h._vals == null) h.init();
          if (!extracted) {
            _chks[id][_col].getDoubles(cs, 0, len);
            extracted = true;
          }
          h.updateHisto(ws, cs, ys, rs, hi, lo);
        }
      }
    }

    @Override
    protected void reduce(ComputeHistoThread cc) {
      assert _lh != cc._lh;
      mergeHistos(_lh, cc._lh);
    }
  }

  @Override
  public void postGlobal() {
    _hcs = ArrayUtils.transpose(_hcs);
    for (DHistogram[] ary : _hcs)
      for (DHistogram dh : ary) {
        if (dh == null) continue;
        dh.reducePrecision();
      }
  }
}
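// Worked example of the phase-2 worker sizing in setupLocal (illustrative numbers): with
// H2O.NUMCPUS == 32 and nactive_cols == 8, a small problem (_hcs.length * nactive_cols
// < 16*1024) runs H2O.NUMCPUS == 32 workers per column, while a large one runs
// numWrks == min(32, max(4*32/8, 1)) == 16, trading per-column parallelism for fewer
// private DHistogram copies (each worker holds its own copy until reduce/mergeHistos).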