RadixOrder.java example

Explorer
h2o-3-master
package water.rapids;

import water.*;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.Log;


// counted completer so that left and right index can run at the same time
class RadixOrder extends H2O.H2OCountedCompleter<RadixOrder> {
  private final Frame _DF;
  private final boolean _isLeft;
  private final int _whichCols[], _id_maps[][];
  final int _shift[];
  final int _bytesUsed[];
  final long _base[];

  RadixOrder(Frame DF, boolean isLeft, int whichCols[], int id_maps[][]) {
    _DF = DF;
    _isLeft = isLeft;
    _whichCols = whichCols;
    _id_maps = id_maps;
    _shift = new int[_whichCols.length];   // currently only _shift[0] is used
    _bytesUsed = new int[_whichCols.length];
    _base = new long[_whichCols.length];
  }

  @Override
  public void compute2() {
    long t0 = System.nanoTime(), t1;
    initBaseShift();

    // The MSB is stored (seemingly wastefully on first glance) because we need
    // it when aligning two keys in Merge()
    int keySize = ArrayUtils.sum(_bytesUsed);
    // 256MB is the DKV limit.  / 2 because we fit o and x together in one OXBatch.
    int batchSize = 256*1024*1024 / Math.max(keySize, 8) / 2 ;
    // The Math.max ensures that batches of o and x are aligned, even for wide
    // keys.  To save % and / in deep iteration; e.g. in insert().
    System.out.println("Time to use rollup stats to determine biggestBit: " + ((t1=System.nanoTime()) - t0) / 1e9); t0=t1;

    if( _whichCols.length > 0 )
      new RadixCount(_isLeft, _base[0], _shift[0], _whichCols[0], _isLeft ? _id_maps : null ).doAll(_DF.vec(_whichCols[0]));
    System.out.println("Time of MSB count MRTask left local on each node (no reduce): " + ((t1=System.nanoTime()) - t0) / 1e9); t0=t1;

    // NOT TO DO:  we do need the full allocation of x[] and o[].  We need o[] anyway.  x[] will be compressed and dense.
    // o is the full ordering vector of the right size
    // x is the byte key aligned with o
    // o AND x are what bmerge() needs. Pushing x to each node as well as o avoids inter-node comms.

    // Workaround for incorrectly blocking closeLocal() in MRTask is to do a
    // double MRTask and pass a key between them to pass output from first on
    // that node to second on that node.
    // TODO: fix closeLocal() blocking issue and revert to simpler usage of closeLocal()
    Key linkTwoMRTask = Key.make();
    if( _whichCols.length > 0 )
      new SplitByMSBLocal(_isLeft, _base, _shift[0], keySize, batchSize, _bytesUsed, _whichCols, linkTwoMRTask, _id_maps).doAll(_DF.vecs(_whichCols)); // postLocal needs DKV.put()
    System.out.println("SplitByMSBLocal MRTask (all local per node, no network) took : " + ((t1=System.nanoTime()) - t0) / 1e9); t0=t1;

    if( _whichCols.length > 0 )
      new SendSplitMSB(linkTwoMRTask).doAllNodes();
    System.out.println("SendSplitMSB across all nodes took : " + ((t1=System.nanoTime()) - t0) / 1e9); t0=t1;

    // dispatch in parallel
    RPC[] radixOrders = new RPC[256];
    System.out.print("Sending SingleThreadRadixOrder async RPC calls ... ");
    for (int i = 0; i < 256; i++)
      radixOrders[i] = new RPC<>(SplitByMSBLocal.ownerOfMSB(i), new SingleThreadRadixOrder(_DF, _isLeft, batchSize, keySize, /*nGroup,*/ i)).call();
    System.out.println("took : " + ((t1=System.nanoTime()) - t0) / 1e9); t0=t1;

    System.out.print("Waiting for RPC SingleThreadRadixOrder to finish ... ");
    for( RPC rpc : radixOrders )
      rpc.get();
    System.out.println("took " + (System.nanoTime() - t0) / 1e9);

    tryComplete();

    // serial, do one at a time
//    for (int i = 0; i < 256; i++) {
//      H2ONode node = MoveByFirstByte.ownerOfMSB(i);
//      SingleThreadRadixOrder radixOrder = new RPC<>(node, new SingleThreadRadixOrder(DF, batchSize, keySize, nGroup, i)).call().get();
//      _o[i] = radixOrder._o;
//      _x[i] = radixOrder._x;
//    }

    // If sum(nGroup) == nrow then the index is unique.
    // 1) useful to know if an index is unique or not (when joining to it we
    //    know multiples can't be returned so can allocate more efficiently)
    // 2) If all groups are size 1 there's no need to actually allocate an
    //    all-1 group size vector (perhaps user was checking for uniqueness by
    //    counting group sizes)
    // 3) some nodes may have unique input and others may contain dups; e.g.,
    //    in the case of looking for rare dups.  So only a few threads may have
    //    found dups.
    // 4) can sweep again in parallel and cache-efficient finding the groups,
    //    and allocate known size up front to hold the group sizes.
    // 5) can return to Flow early with the group count. User may now realise
    //    they selected wrong columns and cancel early.
  }

  private void initBaseShift() {
    for (int i=0; i<_whichCols.length; i++) {
      Vec col = _DF.vec(_whichCols[i]);
      // TODO: strings that aren't already categoricals and fixed precision double.
      long max;
      if (col.isCategorical()) {
        // simpler and more robust for now for all categorical bases to be 0,
        // even though some subsets may be far above 0; i.e. forgo uncommon
        // efficiency savings for now
        _base[i] = 0;  
        if (_isLeft) {
          // the left's levels have been matched to the right's levels and we
          // store the mapped values so it's that mapped range we need here (or
          // the col.max() of the corresponding right table would be fine too,
          // but mapped range might be less so use that for possible
          // efficiency)
          assert _id_maps[i] != null;

          // TODO: what is in _id_maps for no matches (-1?) and exclude those
          // i.e. find the minimum >=0. Then treat -1 in _id_map as an NA when
          // writing key
          //_colMin[i] = ArrayUtils.minValue(_id_maps[i]);  
          // if we join to a small subset of levels starting at 0, we'll
          // benefit from the smaller range here, though
          max = ArrayUtils.maxValue(_id_maps[i]); 
        } else {
          max = (long)col.max();
        }
      } else {
        _base[i] = (long)col.min();
        max = (long)col.max();
      }

      // Compute the span or range between min and max.  Compute a
      // shift amount to bring the high order bits of the range down
      // low for radix sorting.  Lower the lower-bound to be an even
      // power of the shift.
      long chk = computeShift(max, i);
      // On rare occasions, lowering the lower-bound also increases
      // the span or range until another bit is needed in the sort.
      // In this case, we need to re-compute the shift amount and
      // perhaps use an even lower lower-bound.
      if( chk == 256 ) chk = computeShift(max, i);
      assert chk <= 255;
      assert chk >= 0;

      _bytesUsed[i] = (_shift[i]+15) / 8;
      //assert (biggestBit-1)/8 + 1 == _bytesUsed[i];
    }
  }

  // Compute the span or range between min and max.  Compute a
  // shift amount to bring the high order bits of the range down
  // low for radix sorting.  Lower the lower-bound to be an even
  // power of the shift.
  private long computeShift( final long max, final int i )  {
    long range = max - _base[i] + 2; // +1 for when min==max to include the bound, +1 for the leading NA spot
    // number of bits starting from 1 easier to think about (for me)
    int biggestBit = 1 + (int) Math.floor(Math.log(range) / Math.log(2));  
    // TODO: feed back to R warnings()
    if (biggestBit < 8) Log.warn("biggest bit should be >= 8 otherwise need to dip into next column (TODO)");  
    assert biggestBit >= 1;
    _shift[i] = Math.max(8, biggestBit)-8;
    long MSBwidth = 1L<<_shift[i];
    if (_base[i] % MSBwidth != 0) {
      // choose base lower than minimum so as to align boundaries (unless
      // minimum already on a boundary by chance)
      _base[i] = MSBwidth * (_base[i]/MSBwidth + (_base[i]<0 ? -1 : 0));
      assert _base[i] % MSBwidth == 0;
    }
    return (max - _base[i] + 1L) >> _shift[i];  // relied on in RadixCount.map
  }

  private static class SendSplitMSB extends MRTask<SendSplitMSB> {
    final Key _linkTwoMRTask;
    SendSplitMSB(Key linkTwoMRTask) { _linkTwoMRTask = linkTwoMRTask; }
    @Override public void setupLocal() {
      SplitByMSBLocal.MOVESHASH.get(_linkTwoMRTask).sendSplitMSB();
      SplitByMSBLocal.MOVESHASH.remove(_linkTwoMRTask);
    }
  }
}