SplitByMSBLocal.java example

Explorer
h2o-3-master
package water.rapids;

import water.*;
import water.fvec.Chunk;
import water.util.ArrayUtils;
import water.util.Log;
import water.util.PrettyPrint;

import java.util.Arrays;
import java.util.Hashtable;

class SplitByMSBLocal extends MRTask<SplitByMSBLocal> {
  private final boolean _isLeft;
  private final int _shift, _batchSize, _bytesUsed[], _keySize;
  private final long _base[];
  private final int  _col[];
  private final Key _linkTwoMRTask;
  private final int _id_maps[][];

  private transient long _counts[][];
  private transient long _o[][][];  // transient ok because there is no reduce here between nodes, and important to save shipping back to caller.
  private transient byte _x[][][];
  private long _numRowsOnThisNode;

  static Hashtable<Key,SplitByMSBLocal> MOVESHASH = new Hashtable<>();
  SplitByMSBLocal(boolean isLeft, long base[], int shift, int keySize, int batchSize, int bytesUsed[], int[] col, Key linkTwoMRTask, int[][] id_maps) {
    _isLeft = isLeft;
    // we only currently use the shift (in bits) for the first column for the
    // MSB (which we don't know from bytesUsed[0]). Otherwise we use the
    // bytesUsed to write the key's bytes.
    _shift = shift;
    _batchSize=batchSize; _bytesUsed=bytesUsed; _col=col; _base=base;
    _keySize = keySize;
    _linkTwoMRTask = linkTwoMRTask;
    _id_maps = id_maps;
  }

  @Override protected void setupLocal() {

    Key k = RadixCount.getKey(_isLeft, _col[0], H2O.SELF);
    _counts = ((RadixCount.Long2DArray) DKV.getGet(k))._val;   // get the sparse spine for this node, created and DKV-put above
    DKV.remove(k);
    // First cumulate MSB count histograms across the chunks in this node
    long MSBhist[] = new long[256];
    int nc = _fr.anyVec().nChunks();
    assert nc == _counts.length;
    for (int c = 0; c < nc; c++) {
      if (_counts[c]!=null) {
        for (int h = 0; h < 256; h++) {
          MSBhist[h] += _counts[c][h];
        }
      }
    }
    _numRowsOnThisNode = ArrayUtils.sum(MSBhist);   // we just use this count for the DKV data transfer rate message
    if (ArrayUtils.maxValue(MSBhist) > Math.max(1000, _fr.numRows() / 20 / H2O.CLOUD.size())) {  // TO DO: better test of a good even split
      Log.warn("RadixOrder(): load balancing on this node not optimal (max value should be <= "
              + (Math.max(1000, _fr.numRows() / 20 / H2O.CLOUD.size()))
              + " " + Arrays.toString(MSBhist) + ")");
    }
    // shared between threads on the same node, all mappers write into distinct
    // locations (no conflicts, no need to atomic updates, etc.)
    System.out.print("Allocating _o and _x buckets on this node with known size up front ... ");
    long t0 = System.nanoTime();
    _o = new long[256][][];
    _x = new byte[256][][];  // for each bucket, there might be > 2^31 bytes, so an extra dimension for that
    for (int msb = 0; msb < 256; msb++) {
      if (MSBhist[msb] == 0) continue;
      int nbatch = (int) ((MSBhist[msb]-1)/_batchSize +1);  // at least one batch
      int lastSize = (int) (MSBhist[msb] - (nbatch-1) * _batchSize);   // the size of the last batch (could be batchSize)
      assert nbatch > 0;
      assert lastSize > 0;
      _o[msb] = new long[nbatch][];
      _x[msb] = new byte[nbatch][];
      int b;
      for (b = 0; b < nbatch-1; b++) {
        _o[msb][b] = new long[_batchSize];          // TO DO?: use MemoryManager.malloc8()
        _x[msb][b] = new byte[_batchSize * _keySize];
      }
      _o[msb][b] = new long[lastSize];
      _x[msb][b] = new byte[lastSize * _keySize];
    }
    System.out.println("done in " + (System.nanoTime() - t0) / 1e9);

    // TO DO: otherwise, expand width. Once too wide (and interestingly large
    // width may not be a problem since small buckets won't impact cache),
    // start rolling up bins (maybe into pairs or even quads)
    for (int msb = 0; msb < 256; msb++) {
      // each of the 256 columns starts at 0 for the 0th chunk. This 0 offsets
      // into x[MSBvalue][batch div][mod] and o[MSBvalue][batch div][mod]
      long rollSum = 0;
      for (int c = 0; c < nc; c++) {
        if (_counts[c] == null) continue;
        long tmp = _counts[c][msb];
        // Warning: modify the POJO DKV cache, but that's fine since this node
        // won't ask for the original DKV.get() version again
        _counts[c][msb] = rollSum;
        rollSum += tmp;
      }
    }

    MOVESHASH.put(_linkTwoMRTask, this);

    // NB: no radix skipping in this version (unlike data.table we'll use
    // biggestBit and assume further bits are used).
  }


  @Override public void map(Chunk chk[]) {
    long myCounts[] = _counts[chk[0].cidx()]; //cumulative offsets into o and x
    if (myCounts == null) {
      System.out.println("myCounts empty for chunk " + chk[0].cidx());
      return;
    }

    // Loop through this chunk and write the byte key and the source row number
    // into the local MSB buckets
    // TODO: make this branch free and write the already-compressed _mem
    // directly.  Just need to normalize compression across all chunks.  This
    // has to loop through rows because we need the MSBValue from the first
    // column to use on the others, by row.  Nothing to do cache efficiency,
    // although, it will be most cache efficient (holding one page of each
    // column's _mem, plus a page of this_x, all contiguous.  At the cost of
    // more instructions.
    for (int r=0; r<chk[0]._len; r++) {    // tight, branch free and cache efficient (surprisingly)
      int MSBvalue = 0;  // default for NA
      long thisx = 0;
      if (!chk[0].isNA(r)) {
        thisx = chk[0].at8(r);
        // TODO: restore branch-free again, go by column and retain original
        // compression with no .at8()
        if (_isLeft && _id_maps[0]!=null) thisx = _id_maps[0][(int)thisx] + 1;
        // may not be worth that as has to be global minimum so will rarely be
        // able to use as raw, but when we can maybe can do in bulk
        else thisx = thisx - _base[0] + 1;    // +1 leaving 0'th offset from base to mean NA
        MSBvalue = (int)(thisx >> _shift);   // NA are counted in the first bin
      }
      long target = myCounts[MSBvalue]++;
      int batch = (int) (target / _batchSize);
      int offset = (int) (target % _batchSize);
      assert _o[MSBvalue] != null;
      _o[MSBvalue][batch][offset] = (long) r + chk[0].start();    // move i and the index.

      byte this_x[] = _x[MSBvalue][batch];
      offset *= _keySize; // can't overflow because batchsize was chosen above to be maxByteSize/max(keysize,8)
      for (int i = _bytesUsed[0] - 1; i >= 0; i--) {   // a loop because I don't believe System.arraycopy() can copy parts of (byte[])long to byte[]
        this_x[offset + i] = (byte) (thisx & 0xFFL);
        thisx >>= 8;
      }
      for (int c=1; c<chk.length; c++) {  // TO DO: left align subsequent
        offset += _bytesUsed[c-1];     // advance offset by the previous field width
        if (chk[c].isNA(r)) continue;  // NA is a zero field so skip over as java initializes memory to 0 for us always
        thisx = chk[c].at8(r);         // TODO : compress with a scale factor such as dates stored as ms since epoch / 3600000L
        if (_isLeft && _id_maps[c] != null) thisx = _id_maps[c][(int)thisx] + 1;
        else thisx = thisx - _base[c] + 1;
        for (int i = _bytesUsed[c] - 1; i >= 0; i--) {
          this_x[offset + i] = (byte) (thisx & 0xFFL);
          thisx >>= 8;
        }
      }
    }
  }

  static H2ONode ownerOfMSB(int MSBvalue) {
    // TO DO: this isn't properly working for efficiency. This value should pick the value of where it is, somehow.
    //        Why not getSortedOXHeader(MSBvalue).home_node() ?
    //int blocksize = (int) Math.ceil(256. / H2O.CLOUD.size());
    //H2ONode node = H2O.CLOUD._memary[MSBvalue / blocksize];
    return H2O.CLOUD._memary[MSBvalue % H2O.CLOUD.size()];   // spread it around more.
  }

  static Key getNodeOXbatchKey(boolean isLeft, int MSBvalue, int node, int batch) {
    return Key.make("__radix_order__NodeOXbatch_MSB" + MSBvalue + "_node" + node + "_batch" + batch + (isLeft ? "_LEFT" : "_RIGHT"),
            (byte) 1, Key.HIDDEN_USER_KEY, false, SplitByMSBLocal.ownerOfMSB(MSBvalue));
  }

  static Key getSortedOXbatchKey(boolean isLeft, int MSBvalue, int batch) {
    return Key.make("__radix_order__SortedOXbatch_MSB" + MSBvalue + "_batch" + batch + (isLeft ? "_LEFT" : "_RIGHT"),
            (byte) 1, Key.HIDDEN_USER_KEY, false, SplitByMSBLocal.ownerOfMSB(MSBvalue));
  }


  static class OXbatch extends Iced {
    OXbatch(long[] o, byte[] x) { _o = o; _x = x; }
    final long[/*batchSize or lastSize*/] _o;
    final byte[/*batchSize or lastSize*/] _x;
  }

  static Key getMSBNodeHeaderKey(boolean isLeft, int MSBvalue, int node) {
    return Key.make("__radix_order__OXNodeHeader_MSB" + MSBvalue + "_node" + node + (isLeft ? "_LEFT" : "_RIGHT"),
            (byte) 1, Key.HIDDEN_USER_KEY, false, SplitByMSBLocal.ownerOfMSB(MSBvalue));
  }

  static class MSBNodeHeader extends Iced {
    MSBNodeHeader(int MSBnodeChunkCounts[/*chunks*/]) { _MSBnodeChunkCounts = MSBnodeChunkCounts;}
    int _MSBnodeChunkCounts[];   // a vector of the number of contributions from each chunk.  Since each chunk is length int, this must less than that, so int
  }

  // Push o/x in chunks to owning nodes
  void sendSplitMSB() {
    // The map() above ran above for each chunk on this node.  Although this
    // data was written to _o and _x in the order of chunk number (because we
    // calculated those offsets in order in the prior step), the chunk numbers
    // will likely have gaps because chunks are distributed across nodes not
    // using a modulo approach but something like chunk1 on node1, chunk2 on
    // node2, etc then modulo after that.  Also, as tables undergo changes as a
    // result of user action, their distribution of chunks to nodes could
    // change or be changed (e.g. 'Tomas' rebalance()') for various reasons.
    // When the helper node (i.e the node doing all the A's) gets the A's from
    // this node, it must stack all this nodes' A's with the A's from the other
    // nodes in chunk order in order to maintain the original order of the A's
    // within the global table.  To to do that, this node must tell the helper
    // node where the boundaries are in _o and _x.  That's what the first for
    // loop below does.  The helper node doesn't need to be sent the
    // corresponding chunk numbers. He already knows them from the Vec header
    // which he already has locally.

    // TODO: perhaps write to o_ and x_ in batches in the first place, and just
    // send more and smaller objects via the DKV.  This makes the stitching
    // much easier on the helper node too, as it doesn't need to worry about
    // batch boundaries in the source data.  Then it might be easier to
    // parallelize that helper part.  The thinking was that if each chunk
    // generated 256 objects, that would flood the DKV with keys?

    // TODO: send nChunks * 256.  Currently we do nNodes * 256.  Or avoid DKV
    // altogether if possible.

    System.out.print("Starting SendSplitMSB on this node (keySize is " + _keySize + " as [");
    for( int bs : _bytesUsed ) System.out.print(" "+bs);
    System.out.println(" ]) ...");

    long t0 = System.nanoTime();
    Futures myfs = new Futures(); // Private Futures instead of _fs, so can block early and get timing results
    for (int msb =0; msb <_o.length /*256*/; ++msb) {   // TODO this can be done in parallel, surely
      // "I found my A's (msb=0) and now I'll send them to the node doing all the A's"
      // "I'll send you a long vector of _o and _x (batched if very long) along with where the boundaries are."
      // "You don't need to know the chunk numbers of these boundaries, because you know the node of each chunk from your local Vec header"
      if(_o[msb] == null) continue;
      myfs.add(H2O.submitTask(new SendOne(msb,myfs)));
    }
    myfs.blockForPending();
    double timeTaken = (System.nanoTime() - t0) / 1e9;
    long bytes = _numRowsOnThisNode*( 8/*_o*/ + _keySize) + 64;
    System.out.println("took : " + timeTaken);
    System.out.println("  DKV.put " + PrettyPrint.bytes(bytes) + " @ " +
                       String.format("%.3f", bytes / timeTaken / (1024*1024*1024)) + " GByte/sec  [10Gbit = 1.25GByte/sec]");
  }

  class SendOne extends H2O.H2OCountedCompleter<SendOne> {
    // Nothing on remote node here, just a local parallel loop
    private final int _msb;
    private final Futures _myfs;
    SendOne(int msb, Futures myfs) { _msb = msb; _myfs = myfs; }

    @Override public void compute2() {
      int numChunks = 0;  // how many of the chunks are on this node
      for( long[] cnts : _counts )
        if (cnts != null)  // the map() allocated the 256 vector in the spine slots for this node's chunks
          // even if cnts[_msb]==0 (no _msb for this chunk) we'll store
          // that because needed by line marked LINE_ANCHOR_1 below.
          numChunks++;
      // make dense.  And by construction (i.e. cumulative counts) these chunks
      // contributed in order
      int msbNodeChunkCounts[] = new int[numChunks];
      int j=0;
      long lastCount = 0; // _counts are cumulative at this stage so need to diff
      for( long[] cnts : _counts ) {
        if (cnts != null) {
          if (cnts[_msb] == 0) {  // robust in case we skipped zeros when accumulating
            msbNodeChunkCounts[j] = 0;
          } else {
            // _counts is long so it can be accumulated in-place iirc.
            // TODO: check
            msbNodeChunkCounts[j] = (int)(cnts[_msb] - lastCount);
            lastCount = cnts[_msb];
          }
          j++;
        }
      }
      MSBNodeHeader msbh = new MSBNodeHeader(msbNodeChunkCounts);
      // Need dontCache==true, so data does not remain both locally and on remote.
      // Use private Futures so can block independent of MRTask Futures.
      DKV.put(getMSBNodeHeaderKey(_isLeft, _msb, H2O.SELF.index()), msbh, _myfs, true);
      for (int b=0;b<_o[_msb].length; b++) {
        OXbatch ox = new OXbatch(_o[_msb][b], _x[_msb][b]);   // this does not copy in Java, just references
        DKV.put(getNodeOXbatchKey(_isLeft, _msb, H2O.SELF.index(), b), ox, _myfs, true);
      }
      tryComplete();
    }
  }
}