package water.rapids;

// Since we have a single key field in H2O (unlike data.table), bmerge() becomes a lot simpler (no
// need for recursion through join columns), with the downside of transfer cost should we not need all of the key.
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import static water.rapids.SingleThreadRadixOrder.getSortedOXHeaderKey;
import water.util.ArrayUtils;

import java.util.Arrays;

class BinaryMerge extends DTask<BinaryMerge> {
  long _numRowsInResult = 0;  // returned to caller, so not transient
  int _chunkSizes[];          // TODO: only _chunkSizes.length is needed by caller, so return that length only
  double _timings[];

  private transient long _ret1st[/*n2GB*/][];  // the row number of the first right-table index key that matches
  private transient long _retLen[/*n2GB*/][];  // how many rows it matches to

  final FFSB _leftSB, _riteSB;
  private transient KeyOrder _leftKO, _riteKO;

  private final int _numJoinCols;
  private transient long _leftFrom;
  private transient int _retBatchSize;
  private final boolean _allLeft, _allRight;

  // Does any left row match to more than 1 right row?  If not, we can allocate
  // and loop more efficiently, and mark the resulting keyed frame with a
  // 'unique' index.  TODO: implement
  private transient boolean _oneToManyMatch = false;

  // Data which is duplicated left and rite, but only one copy is needed
  // per-map.  This data is made in the constructor and shallow-copy shared
  // around the cluster.
  static class FFSB extends Iced<FFSB> {
    private final Frame _frame;
    private final Vec _vec;
    // Fast lookups to save repeated calls to node.index(), which does a
    // binary search internally.
    private final int _chunkNode[];   // Chunk home-node index
    final int _msb;
    private final int _shift;
    private final long _base[];       // the col.min() of each column in the key
    private final int _fieldSizes[];  // the widths of each column in the key
    private final int _keySize;       // the total width in bytes of the key; sum of field sizes

    FFSB( Frame frame, int msb, int shift, int fieldSizes[], long base[] ) {
      assert -1 <= msb && msb <= 255;  // left ranges from 0 to 255, right from -1 to 255
      _frame = frame;
      _msb = msb;
      _shift = shift;
      _fieldSizes = fieldSizes;
      _keySize = ArrayUtils.sum(fieldSizes);
      _base = base;
      // Create fast lookups to go from chunk index to the node index of that chunk
      Vec vec = _vec = frame.anyVec();
      _chunkNode = vec==null ? null : new int[vec.nChunks()];
      if( vec == null ) return;  // zero-columns for Sort
      for( int i=0; i<_chunkNode.length; i++ )
        _chunkNode[i] = vec.chunkKey(i).home_node().index();
    }
    long min() { return (((long)_msb  ) << _shift) + _base[0]-1; } // the first key possible in this bucket
    long max() { return (((long)_msb+1) << _shift) + _base[0]-2; } // the last key possible in this bucket
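    // Worked example of the bucket range math above (illustrative values only):
    // keys are stored biased as (value - base + 1), with 0 reserved for NA, so
    // with _msb=3, _shift=8 and _base[0]=1, min() = (3L<<8)+1-1 = 768 and
    // max() = (4L<<8)+1-2 = 1023; this bucket covers raw key values 768..1023.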
  }

  // In X[Y], 'left'=i and 'right'=x
  BinaryMerge(FFSB leftSB, FFSB riteSB, boolean allLeft) {
    assert riteSB._msb != -1 || allLeft;
    _leftSB = leftSB;
    _riteSB = riteSB;
    // The number of join columns, i.e. the shorter of _leftSB._fieldSizes.length
    // and _riteSB._fieldSizes.length (the right key may be empty for Sort)
    _numJoinCols = Math.min(_leftSB._fieldSizes.length, _riteSB._fieldSizes.length);
    _allLeft = allLeft;
    _allRight = false;  // TODO: pass through
  }

  @Override
  public void compute2() {
    _timings = new double[20];
    long t0 = System.nanoTime();

    SingleThreadRadixOrder.OXHeader leftSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/true, _leftSB._msb));
    if (leftSortedOXHeader == null) {
      if( !_allRight ) { tryComplete(); return; }
      throw H2O.unimpl();  // TODO: pass through _allRight and implement
    }
    _leftKO = new KeyOrder(leftSortedOXHeader);

    SingleThreadRadixOrder.OXHeader rightSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/false, _riteSB._msb));
    //if (_riteSB._msb==-1) assert _allLeft && rightSortedOXHeader == null;  // i.e. it's known nothing on right can join
    if (rightSortedOXHeader == null) {
      if( !_allLeft ) { tryComplete(); return; }
      // Enables the general-case code below to run without needing new special-case code
      rightSortedOXHeader = new SingleThreadRadixOrder.OXHeader(0, 0, 0);
    }
    _riteKO = new KeyOrder(rightSortedOXHeader);

    // get left batches
    _leftKO.initKeyOrder(_leftSB._msb, /*left=*/true);
    final long leftN = leftSortedOXHeader._numRows;
    assert leftN >= 1;

    // get right batches
    _riteKO.initKeyOrder(_riteSB._msb, /*left=*/false);
    final long rightN = rightSortedOXHeader._numRows;
    _timings[0] += (System.nanoTime() - t0) / 1e9;

    // Now calculate which subset of leftMSB and which subset of rightMSB we're
    // joining here by going into the detail of the key values present, rather
    // than the extents of the range (the extents themselves may not be
    // present).
    // We see where the right extents occur in the left keys present; and if
    // there is an overlap, we find the full extent of the overlap on the left
    // side (nothing less).
    // We only _need_ do this for left outer join, otherwise we'd end up with
    // too many no-match left rows.
    // We'd waste the allocation of the retFirst and retLen vectors, though, if
    // only a small overlap is needed, so for that reason it's useful to restrict
    // the size of retFirst and retLen even for inner join too.

    // Find left and right MSB extents in terms of the key boundaries they represent.
    // _riteSB._msb==-1 indicates that no right MSB should be looked at.
    final long leftMin = _leftSB.min();  // the first key possible in this bucket
    final long leftMax = _leftSB.max();  // the last key possible in this bucket
    // If _riteSB._msb==-1 then the values in riteMin and riteMax here are redundant and not used
    final long riteMin = _riteSB._msb==-1 ? -1 : _riteSB.min();  // the first key possible in this bucket
    final long riteMax = _riteSB._msb==-1 ? -1 : _riteSB.max();  // the last key possible in this bucket

    _leftFrom   = (_riteSB._msb==-1 || leftMin>=riteMin || (_allLeft && _riteSB._msb==0  )) ? -1    : bsearchLeft(riteMin, /*retLow*/true , leftN);
    long leftTo = (_riteSB._msb==-1 || leftMax<=riteMax || (_allLeft && _riteSB._msb==255)) ? leftN : bsearchLeft(riteMax, /*retLow*/false, leftN);
    // The (_allLeft && rightMSB==0) part is to include those keys in that
    // leftMSB just below the right base.  They won't be caught by rightMSBs to
    // the left because there are no more rightMSBs below 0.  Only when
    // _allLeft do we need to create an NA match for them.  They must be created
    // in the same MSB/MSB pair along with the keys that may match the very
    // lowest right keys, because stitching assumes unique MSB/MSB pairs.
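    // Illustrative example of the extent logic above (hypothetical values):
    // with _shift=8 and _base[0]=1, leftMSB=3 covers keys 768..1023.  If the
    // right bucket covers 900..1200, then leftMin(768) < riteMin(900), so
    // _leftFrom is found by binary search (the last left row strictly below
    // key 900), while leftTo stays at leftN because leftMax(1023) <=
    // riteMax(1200): only left rows above _leftFrom can match this right bucket.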
    long retSize = leftTo - _leftFrom - 1;  // since leftTo and _leftFrom are each 1 outside the extremes
    assert retSize >= 0;
    if (retSize==0) { tryComplete(); return; }  // nothing can match, even when allLeft

    _retBatchSize = 268435456;  // 2^31/8: 2^28 longs per batch, i.e. 2GB per long[], keeping each array well within Java's length limit
    int retNBatch = (int)((retSize - 1) / _retBatchSize + 1);
    int retLastSize = (int)(retSize - (retNBatch - 1) * _retBatchSize);
    _ret1st = new long[retNBatch][];
    _retLen = new long[retNBatch][];
    for( int b=0; b<retNBatch; b++) {
      _ret1st[b] = MemoryManager.malloc8(b==retNBatch-1 ? retLastSize : _retBatchSize);
      _retLen[b] = MemoryManager.malloc8(b==retNBatch-1 ? retLastSize : _retBatchSize);
    }

    // Always look at the whole right bucket.  Even though in types -1 and 1
    // we know the range is outside, so nothing should match.  If types -1 and 1
    // do occur, they only happen for leftMSB 0 and 255, and will quickly
    // resolve to no match in the right bucket via bmerge.
    t0 = System.nanoTime();
    bmerge_r(_leftFrom, leftTo, -1, rightN);
    _timings[1] += (System.nanoTime() - t0) / 1e9;

    if (_allLeft) {
      assert _leftKO.numRowsToFetch() == retSize;
    } else {
      long tt = 0;
      for( long[] retFirstx : _ret1st )    // i.e. sum(_ret1st>0) in R
        for( long rF : retFirstx )
          tt += (rF > 0) ? 1 : 0;
      // TODO: change to tt.privateAssertMethod() containing the loop above to
      //       avoid that loop when asserts are off, or accumulate the tt
      //       inside the merge_r, somehow
      assert tt <= retSize;
      assert _leftKO.numRowsToFetch() == tt;
    }
    if (_numRowsInResult > 0) createChunksInDKV();
    // TODO: set the 2 Frame and 2 int[] fields to null at the end of compute2 to
    //       save some traffic back, but it should be small and insignificant
    // TODO: recheck transients, or null them out here before returning
    tryComplete();
  }

  // Holder for Key & Order info
  private static class KeyOrder {
    private final transient long _batchSize;
    private final transient byte _key  [/*n2GB*/][/*i mod 2GB * _keySize*/];
    private final transient long _order[/*n2GB*/][/*i mod 2GB*/];
    private final transient long _perNodeNumRowsToFetch[];

    KeyOrder( SingleThreadRadixOrder.OXHeader sortedOXHeader ) {
      _batchSize = sortedOXHeader._batchSize;
      final int nBatch = sortedOXHeader._nBatch;
      _key   = new byte[nBatch][];
      _order = new long[nBatch][];
      _perNodeNumRowsToFetch = new long[H2O.CLOUD.size()];
    }

    void initKeyOrder( int msb, boolean isLeft ) {
      for( int b=0; b<_key.length; b++ ) {
        Value v = DKV.get(SplitByMSBLocal.getSortedOXbatchKey(isLeft, msb, b));
        SplitByMSBLocal.OXbatch ox = v.get();  // mem version (obtained from remote) of the Value gets turned into the POJO version
        v.freeMem();  // only keep the POJO version of the Value
        _key  [b] = ox._x;
        _order[b] = ox._o;
      }
    }
    long numRowsToFetch() { return ArrayUtils.sum(_perNodeNumRowsToFetch); }
    // Do a mod/div batched lookup into the long _order array
    long at8order( long idx ) { return _order[(int)(idx / _batchSize)][(int)(idx % _batchSize)]; }
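    // Illustrative addressing example (hypothetical numbers): with
    // _batchSize = 16,777,216, global index 20,000,000 resolves to batch
    // 20000000/16777216 = 1 at offset 20000000%16777216 = 3,222,784, i.e.
    // _order[1][3222784].  All of the big arrays in this class are batched
    // this way.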
    long[][] fillPerNodeRows( int i ) {
      final int batchSizeLong = 256*1024*1024 / 16;  // 256MB DKV limit / sizeof(UUID)
      if( _perNodeNumRowsToFetch[i] <= 0 ) return null;
      int nbatch = (int) ((_perNodeNumRowsToFetch[i] - 1) / batchSizeLong + 1);  // TODO: wrap in a class to avoid this boilerplate
      assert nbatch >= 1;
      int lastSize = (int) (_perNodeNumRowsToFetch[i] - (nbatch - 1) * batchSizeLong);
      assert lastSize > 0;
      long[][] res = new long[nbatch][];
      for( int b = 0; b < nbatch; b++ )
        res[b] = MemoryManager.malloc8(b==nbatch-1 ? lastSize : batchSizeLong);
      return res;
    }
  }

  // TODO: specialize keycmp for the cases when no join column contains NA (very,
  // very often) and make this totally branch free; i.e. without the two `==0 ? :`
  private int keycmp(byte xss[][], long xi, byte yss[][], long yi) {
    // Must be passed a left key and a right key, to avoid the call overhead of
    // extra arguments.  We only need left-to-left for equality, and that's
    // optimized in leftKeyEqual below.
    byte xbatch[] = xss[(int)(xi / _leftKO._batchSize)];
    byte ybatch[] = yss[(int)(yi / _riteKO._batchSize)];
    int xoff = (int)(xi % _leftKO._batchSize) * _leftSB._keySize;
    int yoff = (int)(yi % _riteKO._batchSize) * _riteSB._keySize;
    long xval=0, yval=0;
    // We avoid the NewChunk compression because we want finer-grained
    // compression than the 1, 2, 4 or 8 byte types.  In particular, a range
    // just greater than 4bn can use 5 bytes rather than 8 bytes; a 38% RAM
    // saving over the wire in that possibly common case.  Note this is tight
    // and almost branch free.
    int i=0;
    while( i<_numJoinCols && xval==yval ) {  // TODO: pass i in to start at a later key column, when known
      int xlen = _leftSB._fieldSizes[i];
      int ylen = _riteSB._fieldSizes[i];
      xval = xbatch[xoff] & 0xFFL;  while (xlen>1) { xval <<= 8; xval |= xbatch[++xoff] & 0xFFL; xlen--; }  xoff++;
      yval = ybatch[yoff] & 0xFFL;  while (ylen>1) { yval <<= 8; yval |= ybatch[++yoff] & 0xFFL; ylen--; }  yoff++;
      xval = xval==0 ? Long.MIN_VALUE : xval-1+_leftSB._base[i];
      yval = yval==0 ? Long.MIN_VALUE : yval-1+_riteSB._base[i];
      i++;
    }
    // The magnitude of the difference is used for limiting staleness in a
    // rolling join, capped at Integer.MAX|(MIN+1).  Roll's type is chosen to
    // be int so staleness can't be requested over int's limit.
    // Same return value as strcmp in C:  <0 => xi<yi.
    long diff = xval-yval;  // could overflow even in long; e.g. joining to a prevailing NA, or very large gaps O(2^62)
    if (xval>yval) {  // careful: not diff>0 here, due to overflow
      return( (diff<0 | diff>Integer.MAX_VALUE  ) ? Integer.MAX_VALUE   : (int)diff);
    } else {
      return( (diff>0 | diff<Integer.MIN_VALUE+1) ? Integer.MIN_VALUE+1 : (int)diff);
    }
  }
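  // Worked decode example for keycmp above (illustrative values): a 2-byte
  // field with base=100 stores raw value 105 as the biased value 105-100+1=6,
  // i.e. bytes {0x00,0x06}; the loop assembles val=6 and maps it back via
  // 6-1+100=105.  A stored 0 means NA and decodes to Long.MIN_VALUE, so NAs
  // sort first and NA==NA compares equal.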
  // Binary search to the left MSB, in the 1st column only
  private long bsearchLeft(long x, boolean returnLow, long upp) {
    long low = -1;
    while (low < upp - 1) {
      long mid = low + (upp - low) / 2;
      byte keyBatch[] = _leftKO._key[(int)(mid / _leftKO._batchSize)];
      int off = (int)(mid % _leftKO._batchSize) * _leftSB._keySize;
      int len = _leftSB._fieldSizes[0];
      long val = keyBatch[off] & 0xFFL;
      while( len>1 ) { val <<= 8; val |= keyBatch[++off] & 0xFFL; len--; }
      val = val==0 ? Long.MIN_VALUE : val-1+_leftSB._base[0];
      if (x<val || (x==val && returnLow)) upp = mid;
      else                                low = mid;
    }
    return returnLow ? low : upp;
  }

  // Must be passed two leftKeys only.
  // Optimized special case for the two calling points; see usages in bmerge_r below.
  private boolean leftKeyEqual(byte x[][], long xi, long yi) {
    byte xbatch[] = x[(int)(xi / _leftKO._batchSize)];
    byte ybatch[] = x[(int)(yi / _leftKO._batchSize)];
    int xoff = (int)(xi % _leftKO._batchSize) * _leftSB._keySize;
    int yoff = (int)(yi % _leftKO._batchSize) * _leftSB._keySize;
    int i=0;
    while (i<_leftSB._keySize && xbatch[xoff++] == ybatch[yoff++]) i++;
    return(i==_leftSB._keySize);
  }

  private void bmerge_r(long lLowIn, long lUppIn, long rLowIn, long rUppIn) {
    // TODO: parallel each of the 256 bins
    long lLow = lLowIn, lUpp = lUppIn, rLow = rLowIn, rUpp = rUppIn;
    long mid, tmpLow, tmpUpp;
    // i.e. (lLow+lUpp)/2, but robust to someone somewhere one day overflowing
    // long; e.g. 32 exabytes of 1-column ints
    long lr = lLow + (lUpp - lLow) / 2;
    while (rLow < rUpp - 1) {
      mid = rLow + (rUpp - rLow) / 2;
      int cmp = keycmp(_leftKO._key, lr, _riteKO._key, mid);  // -1, 0 or 1, like strcmp
      if (cmp < 0) {
        rUpp = mid;
      } else if (cmp > 0) {
        rLow = mid;
      } else { // rKey == lKey, including NA == NA
        // Branch mid to find the start and end of this group in this column.
        // TODO?: not if mult=first|last and col<ncol-1
        tmpLow = mid;
        tmpUpp = mid;
        while (tmpLow < rUpp - 1) {
          mid = tmpLow + (rUpp - tmpLow) / 2;
          if (keycmp(_leftKO._key, lr, _riteKO._key, mid) == 0) tmpLow = mid;
          else rUpp = mid;
        }
        while (rLow < tmpUpp - 1) {
          mid = rLow + (tmpUpp - rLow) / 2;
          if (keycmp(_leftKO._key, lr, _riteKO._key, mid) == 0) tmpUpp = mid;
          else rLow = mid;
        }
        break;
      }
    }
    // rLow and rUpp now surround the group in the right table.

    // The left table key may (unusually, and not recommended, but sometimes
    // needed) be duplicated.  Linear search outwards from the left row.  Most
    // commonly, the first test shows this left key is unique.  This saves
    // (i) re-finding the matching rows in the right for all the dup'd left, and
    // (ii) the recursive bounds logic getting awkward if other left rows can
    // find the same right rows.
    // Related to 'allow.cartesian' in data.table.
    // TODO: if the index stores an attribute that it is unique, then we don't
    //       need this step.  However, each of these while()s would run at most
    //       once in that case, which may not be worth optimizing.
    tmpLow = lr + 1;
    // TODO: these while's could be rolled up inside leftKeyEqual, saving call overhead
    while (tmpLow<lUpp && leftKeyEqual(_leftKO._key, tmpLow, lr)) tmpLow++;
    lUpp = tmpLow;
    tmpUpp = lr - 1;
    while (tmpUpp>lLow && leftKeyEqual(_leftKO._key, tmpUpp, lr)) tmpUpp--;
    lLow = tmpUpp;
    // lLow and lUpp now surround the group in the left table.  If the left key is unique then lLow==lr-1 and lUpp==lr+1.
    assert lUpp - lLow >= 2;

    // If a value was found, rLow and rUpp surround it, unlike standard binary search where rLow falls on it
    long len = rUpp - rLow - 1;
    // TODO: we don't need a loop here :)  Why does perNodeNumRightRowsToFetch increase so much?
    if (len > 0 || _allLeft) {
      long t0 = System.nanoTime();
      if (len > 1) _oneToManyMatch = true;
      _numRowsInResult += Math.max(1,len) * (lUpp-lLow-1);  // 1 for the NA row when _allLeft
      for (long j = lLow + 1; j < lUpp; j++) {
        // Usually iterates once only, for j=lr, but more than once if there are
        // dup keys in the left table.  There may be a range of left dup'd
        // join-col values, but we need to fetch each one since the left
        // non-join columns are likely not dup'd and may be the reason for the
        // cartesian join.
        long t00 = System.nanoTime();
        // TODO: could loop through batches rather than / and % wastefully
        long globalRowNumber = _leftKO.at8order(j);
        _timings[17] += (System.nanoTime() - t00)/1e9;
        t00 = System.nanoTime();
        int chkIdx = _leftSB._vec.elem2ChunkIdx(globalRowNumber);  // binary search in espc
        _timings[15] += (System.nanoTime() - t00)/1e9;
        // The key is the same within this left dup range, but we still need to fetch the left non-join columns
        _leftKO._perNodeNumRowsToFetch[_leftSB._chunkNode[chkIdx]]++;
        if (len==0) continue;  // _allLeft must be true if len==0
        // TODO: initial MSB splits should split down to a small enough chunk
        //       size - but would that require more passes, and if so, how long?
        //       Code simplification benefits would be welcome!
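        // Illustrative example of the result encoding written below
        // (hypothetical values): if left row j matches right rows 5..7
        // (0-based), the search above leaves rLow=4 and rUpp=8, so len=3; we
        // store _ret1st = rLow+2 = 6 (the 1-based first matching right row)
        // and _retLen = 3.  Unmatched left rows leave _ret1st at its default 0.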
        long outLoc = j - (_leftFrom + 1);  // the out offset is 0 here in the standard scaling-up high-cardinality test
        // outBatchSize can be different, and larger, since it is known to be 8
        // bytes per item, for both retFirst and retLen.  (Allowing 8 bytes here
        // seems wasteful, actually.)
        final int jb2 = (int)(outLoc/_retBatchSize);
        final int jo2 = (int)(outLoc%_retBatchSize);
        // TODO: take outside the loop.  However when we go deep-msb, this'll go away.

        // rLow surrounds the row, so +1.  Then another +1 for the 1-based row
        // number.  0 (the default) means no-match and saves an extra set to -1
        // for no match.  Could be significant in large edge cases by not
        // needing to write to _ret1st at all if it has no matches.
        _ret1st[jb2][jo2] = rLow + 2;
        _retLen[jb2][jo2] = len;
      }

      // If we have a dup'd left row, we only need to fetch the right rows once,
      // for the first dup.  Those are then recycled locally later.
      for (long i=0; i<len; i++) {
        long loc = rLow+1+i;
        long t00 = System.nanoTime();
        // TODO: could loop through batches rather than / and % wastefully
        long globalRowNumber = _riteKO.at8order(loc);
        _timings[18] += (System.nanoTime() - t00)/1e9;
        t00 = System.nanoTime();
        int chkIdx = _riteSB._vec.elem2ChunkIdx(globalRowNumber);  // binary search in espc
        _timings[16] += (System.nanoTime() - t00)/1e9;
        // Just count the number per node, so we can allocate arrays precisely
        // up front, and also to return early in case of memory errors or other
        // distribution problems
        _riteKO._perNodeNumRowsToFetch[_riteSB._chunkNode[chkIdx]]++;
      }
      _timings[14] += (System.nanoTime() - t0)/1e9;
    }
    // TODO: check the assumption that retFirst and retLength are initialized to 0, for the case of no match

    // Now branch (and TODO: in parallel) to merge below and merge above.
    // '|| _allLeft' is needed here in H2O (but not in data.table) for
    // _leftKO._perNodeNumRowsToFetch above to populate and to pass the assert
    // near the end of compute2() above.
    if (lLow > lLowIn && (rLow > rLowIn || _allLeft))
      bmerge_r(lLowIn, lLow+1, rLowIn, rLow+1);
    if (lUpp < lUppIn && (rUpp < rUppIn || _allLeft))
      bmerge_r(lUpp-1, lUppIn, rUpp-1, rUppIn);

    // We don't feel tempted to reduce the global _ansN here and make a global
    // frame, since we want to process each MSB l/r combo individually without
    // allocating them all.  Since this is recursive, no more code should be
    // here (it would run too much).
  }
  private void createChunksInDKV() {
    // Collect all matches and create the final frame (part) for this MSB
    // combination.  Cannot use a List<Long>, as that's restricted to 2Bn items
    // and also isn't an Iced datatype.
    long t0 = System.nanoTime(), t1;
    final int cloudSize = H2O.CLOUD.size();
    final long perNodeRightRows[][][] = new long[cloudSize][][];
    final long perNodeLeftRows [][][] = new long[cloudSize][][];

    // Allocate memory to split this MSB combination's left and right matching
    // rows into contiguous batches sent to the nodes they reside on
    for( int i = 0; i < cloudSize; i++ ) {
      perNodeRightRows[i] = _riteKO.fillPerNodeRows(i);
      perNodeLeftRows [i] = _leftKO.fillPerNodeRows(i);
    }
    _timings[2] += ((t1=System.nanoTime()) - t0) / 1e9;  t0=t1;

    // Loop over _ret1st and _retLen and populate the batched requests for
    // each node helper.  _ret1st and _retLen are the same shape.
    final long perNodeRightLoc[] = new long[cloudSize];
    final long perNodeLeftLoc [] = new long[cloudSize];
    chunksPopulatePerNode(perNodeLeftLoc, perNodeLeftRows, perNodeRightLoc, perNodeRightRows);
    _timings[3] += ((t1=System.nanoTime()) - t0) / 1e9;  t0=t1;

    // Create the chunks for the final frame from this MSB pair.
    // 16 bytes for each UUID (the biggest type).  Enum will be long (8).
    // TODO: how is a non-Enum 'string' handled by H2O?
    final int batchSizeUUID = 256*1024*1024 / 16;  // number of rows per chunk to fit within the 256MB DKV limit
    final int nbatch = (int) ((_numRowsInResult-1)/batchSizeUUID +1);  // TODO: wrap in a class to avoid this boilerplate
    assert nbatch >= 1;
    final int lastSize = (int) (_numRowsInResult - (nbatch-1)*batchSizeUUID);
    assert lastSize > 0;
    final int numLeftCols = _leftSB._frame.numCols();
    final int numColsInResult = _leftSB._frame.numCols() + _riteSB._frame.numCols() - _numJoinCols;
    final double[][][] frameLikeChunks = new double[numColsInResult][nbatch][];  // TODO: compression via int types
    _chunkSizes = new int[nbatch];
    for( int col=0; col<numColsInResult; col++ )
      for( int b = 0; b < nbatch; b++ ) {
        frameLikeChunks[col][b] = MemoryManager.malloc8d(_chunkSizes[b] = (b==nbatch-1 ? lastSize : batchSizeUUID));
        // NA by default, to save filling with NA for no-matches when allLeft
        Arrays.fill(frameLikeChunks[col][b], Double.NaN);
      }
    _timings[4] += ((t1=System.nanoTime()) - t0) / 1e9;  t0=t1;

    // Get Raw Remote Rows
    final GetRawRemoteRows grrrsLeft[][] = new GetRawRemoteRows[cloudSize][];
    final GetRawRemoteRows grrrsRite[][] = new GetRawRemoteRows[cloudSize][];
    chunksGetRawRemoteRows(perNodeLeftRows, perNodeRightRows, grrrsLeft, grrrsRite);
    _timings[6] += ((t1=System.nanoTime()) - t0) / 1e9;  t0=t1;  // all this time is expected to be in [5]

    // Now loop through _ret1st and _retLen and populate
    chunksPopulateRetFirst(numColsInResult, numLeftCols, perNodeLeftLoc, grrrsLeft, perNodeRightLoc, grrrsRite, frameLikeChunks);
    _timings[10] += ((t1=System.nanoTime()) - t0) / 1e9;  t0=t1;

    // Compress all chunks and store them
    chunksCompressAndStore(nbatch, numColsInResult, frameLikeChunks);
    _timings[11] += (System.nanoTime() - t0) / 1e9;
  }
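  // Illustrative sizing example for createChunksInDKV above (hypothetical
  // numbers): joining a 5-column left frame to a 3-column right frame on 2 key
  // columns gives numColsInResult = 5+3-2 = 6; with _numRowsInResult =
  // 20,000,000 and batchSizeUUID = 16,777,216 we get nbatch = 2, with
  // lastSize = 3,222,784 rows in the final batch.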
  // Loop over _ret1st and _retLen and populate the batched requests for
  // each node helper.  _ret1st and _retLen are the same shape.
  private void chunksPopulatePerNode( final long perNodeLeftLoc[], final long perNodeLeftRows[][][],
                                      final long perNodeRightLoc[], final long perNodeRightRows[][][] ) {
    final int batchSizeLong = 256*1024*1024 / 16;  // 256MB DKV limit / sizeof(UUID)
    long prevf = -1, prevl = -1;
    // TODO: hop back to original order here for [] syntax.
    long leftLoc = _leftFrom;  // sweep through the left table along the sorted row locations
    for (int jb=0; jb<_ret1st.length; ++jb) {        // jb = j batch
      for (int jo=0; jo<_ret1st[jb].length; ++jo) {  // jo = j offset
        leftLoc++;  // to save jb*_ret1st[0].length + jo
        long f = _ret1st[jb][jo];  // TODO: take _ret1st[jb] outside the inner loop
        long l = _retLen[jb][jo];
        if (f==0) {  // left row matches no right row
          assert l == 0;  // doesn't have to be 0 (could be 1 already if allLeft==true), but currently it should be, so check it
          if (!_allLeft) continue;
          // Now insert the left row once, and NA for the right columns, i.e. left outer join
        }
        { // New scope so 'row' can be declared in the for() loop below and registerized (otherwise 'already defined in this scope' in that scope)
          // Fetch the left rows and mark the contiguous from-ranges each left row should be recycled over
          // TODO: not needed when single node
          // TODO: could loop through batches rather than / and % wastefully
          long row = _leftKO.at8order(leftLoc);
          int chkIdx = _leftSB._vec.elem2ChunkIdx(row);  // binary search in espc
          int ni = _leftSB._chunkNode[chkIdx];
          long pnl = perNodeLeftLoc[ni]++;  // pnl = per node location
          perNodeLeftRows[ni][(int)(pnl/batchSizeLong)][(int)(pnl%batchSizeLong)] = row;  // ask that node for global row number row
        }
        if (f==0) continue;
        assert l > 0;
        if (prevf == f && prevl == l) continue;  // don't re-fetch the same matching rows (cartesian); we'll repeat them locally later
        prevf = f;
        prevl = l;
        for (int r=0; r<l; r++) {
          long loc = f+r-1;  // -1 because these are 0-based, where 0 means no-match and 1 refers to the first row
          // TODO: could take / and % outside the loop in cases where it doesn't span a batch boundary
          long row = _riteKO.at8order(loc);
          // Find the owning node for the row, using local operations here
          int chkIdx = _riteSB._vec.elem2ChunkIdx(row);  // binary search in espc
          int ni = _riteSB._chunkNode[chkIdx];
          // TODO: split into an if(), and batch and offset separately
          long pnl = perNodeRightLoc[ni]++;  // pnl = per node location
          perNodeRightRows[ni][(int)(pnl/batchSizeLong)][(int)(pnl%batchSizeLong)] = row;  // ask that node for global row number row
        }
      }
    }
    // TODO: assert that perNodeRite and Left are exactly equal to the number
    //       expected and allocated.
    Arrays.fill(perNodeLeftLoc , 0);  // clear for reuse below
    Arrays.fill(perNodeRightLoc, 0);
  }
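  // Illustrative example of the per-node layout above (hypothetical numbers):
  // on a 4-node cloud, if 40,000,000 of the matching right rows live on node 2,
  // they are recorded in perNodeRightRows[2][b][o] across
  // ceil(40000000/16777216) = 3 batches, each batch array staying under the
  // size limit used throughout this class.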
  // Get Raw Remote Rows
  private void chunksGetRawRemoteRows(final long perNodeLeftRows[][][], final long perNodeRightRows[][][],
                                      GetRawRemoteRows grrrsLeft[][], GetRawRemoteRows grrrsRite[][]) {
    RPC<GetRawRemoteRows> grrrsRiteRPC[][] = new RPC[H2O.CLOUD.size()][];
    RPC<GetRawRemoteRows> grrrsLeftRPC[][] = new RPC[H2O.CLOUD.size()][];

    // Launch remote tasks, left and right
    for( H2ONode node : H2O.CLOUD._memary ) {
      final int ni = node.index();
      final int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
      final int bUppLeft = perNodeLeftRows [ni] == null ? 0 : perNodeLeftRows [ni].length;
      grrrsRiteRPC[ni] = new RPC[bUppRite];
      grrrsLeftRPC[ni] = new RPC[bUppLeft];
      grrrsRite[ni] = new GetRawRemoteRows[bUppRite];
      grrrsLeft[ni] = new GetRawRemoteRows[bUppLeft];
      for (int b = 0; b < bUppRite; b++) {
        // TODO: try again now with the better surrounding method
        // Arrays.sort(perNodeRightRows[ni][b]);  Simple quick test of fetching in monotonic order.  Doesn't seem to help so far.
        grrrsRiteRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_riteSB._frame, perNodeRightRows[ni][b])).call();
      }
      for (int b = 0; b < bUppLeft; b++) {
        // Arrays.sort(perNodeLeftRows[ni][b]);
        grrrsLeftRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_leftSB._frame, perNodeLeftRows[ni][b])).call();
      }
    }
    for( H2ONode node : H2O.CLOUD._memary ) {
      // TODO: just send and wait for the first batch on each node, and then .get() the next batch as needed.
      int ni = node.index();
      final int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
      for (int b = 0; b < bUppRite; b++)
        _timings[5] += (grrrsRite[ni][b] = grrrsRiteRPC[ni][b].get()).timeTaken;
      final int bUppLeft = perNodeLeftRows[ni] == null ? 0 : perNodeLeftRows[ni].length;
      for (int b = 0; b < bUppLeft; b++)
        _timings[5] += (grrrsLeft[ni][b] = grrrsLeftRPC[ni][b].get()).timeTaken;
    }
  }

  // Now loop through _ret1st and _retLen and populate
  private void chunksPopulateRetFirst(final int numColsInResult, final int numLeftCols,
                                      final long perNodeLeftLoc[], final GetRawRemoteRows grrrsLeft[][],
                                      final long perNodeRightLoc[], final GetRawRemoteRows grrrsRite[][],
                                      final double[][][] frameLikeChunks) {
    // 16 bytes for each UUID (the biggest type).  Enum will be long (8).
    // TODO: how is a non-Enum 'string' handled by H2O?
    final int batchSizeUUID = 256*1024*1024 / 16;  // number of rows per chunk to fit within the 256MB DKV limit
    long resultLoc = 0;  // sweep upwards through the final result, filling it in
    // TODO: hop back to original order here for [] syntax.
    long leftLoc = _leftFrom;  // sweep through the left table along the sorted row locations
    long prevf = -1, prevl = -1;
    for (int jb=0; jb<_ret1st.length; ++jb) {        // jb = j batch
      for (int jo=0; jo<_ret1st[jb].length; ++jo) {  // jo = j offset
        leftLoc++;  // to save jb*_ret1st[0].length + jo
        long f = _ret1st[jb][jo];  // TODO: take _ret1st[jb] outside the inner loop
        long l = _retLen[jb][jo];
        if (f==0 && !_allLeft) continue;  // f==0 => left row matches no right row
        // else insert the left row once, and NA for the right columns, i.e. left outer join

        // Fetch the left row and recycle it if more than 1 row in the right table is matched to.
        // TODO: could loop through batches rather than / and % wastefully
        long row = _leftKO.at8order(leftLoc);
        // TODO: should leftOrder and retFirst/retLen have the same batch size, to make this easier?
        // TODO: can we not just loop through _leftKO._order only?  Why jb and jo too?
        int chkIdx = _leftSB._vec.elem2ChunkIdx(row);  // binary search in espc
        int ni = _leftSB._chunkNode[chkIdx];
        long pnl = perNodeLeftLoc[ni]++;  // pnl = per node location.  TODO: batch increment this rather than
        int b = (int)(pnl / batchSizeUUID);
        int o = (int)(pnl % batchSizeUUID);
        double[][] chks = grrrsLeft[ni][b]._chk;
        final int l1 = Math.max((int)l, 1);
        for (int rep = 0; rep < l1; rep++) {
          long a = resultLoc + rep;
          // TODO: loop in batches to save / and % for each repeat, and still
          //       cater for crossing multiple batch boundaries
          int whichChunk = (int) (a / batchSizeUUID);
          int offset     = (int) (a % batchSizeUUID);
          for (int col=0; col<chks.length; col++) {
            // TODO: this only works for numeric columns (not for UUID, strings, etc.)
            frameLikeChunks[col][whichChunk][offset] = chks[col][o];  // colForBatch.atd(row);
          }
        }
        if (f==0) { resultLoc++; continue; }  // no match, so just one row (NA for the right table) to advance over
        assert l > 0;
        if (prevf == f && prevl == l) {
          // Just copy from the previous batch in the result (populated by the
          // for() below).  Contiguous, easy, in-cache copy (other than batches).
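          // Illustrative example of the recycling here (hypothetical): if two
          // consecutive left rows both matched right rows f..f+l-1, the right
          // columns for the second left row are copied from the l result rows
          // just written, i.e. from positions resultLoc-l .. resultLoc-1,
          // instead of being fetched remotely a second time.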
          for (int r=0; r<l; r++) {
            // TODO: loop in batches to save / and % for each repeat, and still
            //       cater for crossing multiple batch boundaries
            int toChunk    = (int) ( resultLoc      / batchSizeUUID);
            int toOffset   = (int) ( resultLoc      % batchSizeUUID);
            int fromChunk  = (int) ((resultLoc - l) / batchSizeUUID);
            int fromOffset = (int) ((resultLoc - l) % batchSizeUUID);
            for (int col=0; col<numColsInResult-numLeftCols; col++) {
              frameLikeChunks[numLeftCols + col][toChunk][toOffset] = frameLikeChunks[numLeftCols + col][fromChunk][fromOffset];
            }
            resultLoc++;
          }
          continue;
        }
        prevf = f;
        prevl = l;
        for (int r=0; r<l; r++) {
          // TODO: loop in batches to save / and % for each repeat, and still
          //       cater for crossing multiple batch boundaries
          int whichChunk = (int) (resultLoc / batchSizeUUID);
          int offset     = (int) (resultLoc % batchSizeUUID);
          long loc = f+r-1;  // -1 because these are 0-based, where 0 means no-match and 1 refers to the first row
          // TODO: could take / and % outside the loop in cases where it doesn't span a batch boundary
          row = _riteKO.at8order(loc);
          // Find the owning node for the row, using local operations here
          chkIdx = _riteSB._vec.elem2ChunkIdx(row);  // binary search in espc
          ni = _riteSB._chunkNode[chkIdx];
          pnl = perNodeRightLoc[ni]++;  // pnl = per node location
          // TODO: split into an if(), and batch and offset separately
          chks = grrrsRite[ni][(int)(pnl / batchSizeUUID)]._chk;
          o = (int)(pnl % batchSizeUUID);
          for (int col=0; col<numColsInResult-numLeftCols; col++) {
            // TODO: this only works for numeric columns (not for UUID, strings, etc.)
            frameLikeChunks[numLeftCols + col][whichChunk][offset] = chks[_numJoinCols + col][o];  // colForBatch.atd(row);
          }
          resultLoc++;
        }
      }
    }
  }

  // Compress all chunks and store them
  private void chunksCompressAndStore(final int nbatch, final int numColsInResult, final double[][][] frameLikeChunks) {
    Futures fs = new Futures();
    for (int col=0; col<numColsInResult; col++) {
      for (int b = 0; b < nbatch; b++) {
        Chunk ck = new NewChunk(frameLikeChunks[col][b]).compress();
        DKV.put(getKeyForMSBComboPerCol(_leftSB._msb, _riteSB._msb, col, b), ck, fs, true);
        frameLikeChunks[col][b] = null;  // free the memory as early as possible (it's now in the store)
      }
    }
    fs.blockForPending();
  }

  static Key getKeyForMSBComboPerCol(/*Frame leftFrame, Frame rightFrame,*/ int leftMSB, int rightMSB,
                                     int col /*final table*/, int batch) {
    return Key.make("__binary_merge__Chunk_for_col" + col + "_batch" + batch
            // + rightFrame._key.toString() + "_joined_with" + leftFrame._key.toString()
            + "_leftSB._msb" + leftMSB + "_riteSB._msb" + rightMSB,
            (byte) 1, Key.HIDDEN_USER_KEY, false,
            SplitByMSBLocal.ownerOfMSB(rightMSB==-1 ? leftMSB : rightMSB)
    );  // TODO: home locally
  }

  static class GetRawRemoteRows extends DTask<GetRawRemoteRows> {
    Frame _fr;
    long[/*rows*/] _rows;   // which rows to fetch from the remote node; non-null on the way to remote, null on the way back
    double[/*col*/][] _chk; // null on the way to the remote node, non-null on the way back
    double timeTaken;

    GetRawRemoteRows(Frame fr, long[] rows) {
      _rows = rows;
      _fr = fr;
    }

    @Override
    public void compute2() {
      assert(_rows!=null);
      assert(_chk ==null);
      long t0 = System.nanoTime();
      _chk = MemoryManager.malloc8d(_fr.numCols(), _rows.length);  // TODO: should this be transposed in memory?
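      // Illustrative example of the row decomposition below (hypothetical
      // espc): with chunk boundaries espc = {0, 1000, 2000, ...}, global row
      // 1500 resolves to cidx=1 and offset=1500-1000=500, so its value is read
      // locally as a chunk lookup with no further network hop.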
      int cidx  [] = MemoryManager.malloc4(_rows.length);
      int offset[] = MemoryManager.malloc4(_rows.length);
      Vec anyVec = _fr.anyVec();
      assert anyVec != null;
      for (int row=0; row<_rows.length; row++) {
        cidx  [row] = anyVec.elem2ChunkIdx(_rows[row]);  // binary search of the espc array.  TODO: sort input row numbers to avoid
        offset[row] = (int)(_rows[row] - anyVec.espc()[cidx[row]]);
      }
      Chunk c[] = new Chunk[anyVec.nChunks()];
      for (int col=0; col<_fr.numCols(); col++) {
        Vec v = _fr.vec(col);
        for (int i=0; i<c.length; i++)
          c[i] = v.chunkKey(i).home() ? v.chunkForChunkIdx(i) : null;
        for (int row=0; row<_rows.length; row++) {
          _chk[col][row] = c[cidx[row]].atd(offset[row]);
        }
      }
      // The remote node has now filled up _chk[/*col*/][/*rows*/];
      // perNodeRows[node] has perNodeRows[node].length batches of row numbers to fetch
      _rows = null;
      _fr = null;
      assert(_chk != null);
      timeTaken = (System.nanoTime() - t0) / 1e9;
      tryComplete();
    }
  }
}