package water.rapids;

import water.*;
import water.fvec.Chunk;

// First pass of the radix order/merge for one column: per chunk, count how many rows
// fall into each of the 256 MSB buckets, then publish the per-node counts to the DKV.
class RadixCount extends MRTask<RadixCount> {
  static class Long2DArray extends Iced {
    Long2DArray(int len) { _val = new long[len][]; }
    long _val[][];
  }
  private Long2DArray _counts;
  private final int _shift;
  private final int _col;
  private final long _base;
  // used to determine the unique DKV names since DF._key is null now and
  // before only an RTMP name anyway
  private final boolean _isLeft;
  private final int _id_maps[][];

  RadixCount(boolean isLeft, long base, int shift, int col, int id_maps[][]) {
    _isLeft = isLeft;
    _base = base;
    _col = col;
    _shift = shift;
    _id_maps = id_maps;
  }

  // Make a unique, deterministic key as a function of frame, column and node,
  // homed to the owning node.
  static Key getKey(boolean isLeft, int col, H2ONode node) {
    return Key.make("__radix_order__MSBNodeCounts_col" + col + "_node" + node.index() + (isLeft ? "_LEFT" : "_RIGHT"));
    // Each node's contents are different, so the node number needs to be in the key.
    // TODO: need the biggestBit in here too, that the MSB is offset from
  }

  @Override protected void setupLocal() {
    _counts = new Long2DArray(_fr.anyVec().nChunks());
  }

  @Override public void map(Chunk chk) {
    long tmp[] = _counts._val[chk.cidx()] = new long[256];
    // TODO: assert chk is an integer or enum chunk -- but how, since there are many
    // integer chunk types (C1, C2, etc.)? Alternatively: chk.getClass().equals(C8Chunk.class)
    if (!(_isLeft && chk.vec().isCategorical())) {
      if (chk.vec().naCnt() == 0) {
        // There are no NAs in this join column, hence a branch-free loop. This is the
        // most common case, since join columns should rarely contain NAs.
        for (int r = 0; r < chk._len; r++) {
          tmp[(int) ((chk.at8(r) - _base + 1) >> _shift)]++;
          // TODO: use _mem directly. Histogram the compressed bytes and then shift
          // the histogram afterwards when reducing.
        }
      } else {
        // There are some NAs in the column, so we have to branch.
        // TODO: warn the user that NAs are present in the join column.
        for (int r = 0; r < chk._len; r++) {
          if (chk.isNA(r)) tmp[0]++;
          else tmp[(int) ((chk.at8(r) - _base + 1) >> _shift)]++;
          // Done - we will join NA to NA as data.table does.
          // TODO: allow NA-to-NA join to be turned off. Do that in bmerge as a simple low-cost switch.
          // Note that NA and the minimum may well both land in MSB 0, but most of
          // the time we will not have NAs in join columns.
        }
      }
    } else {
      // The first column (used for the MSB split) is an enum:
      // map left categorical levels to right levels using _id_maps.
      assert _id_maps[0].length > 0;
      assert _base == 0;
      if (chk.vec().naCnt() == 0) {
        for (int r = 0; r < chk._len; r++) {
          tmp[(_id_maps[0][(int) chk.at8(r)] + 1) >> _shift]++;
        }
      } else {
        for (int r = 0; r < chk._len; r++) {
          if (chk.isNA(r)) tmp[0]++;
          else tmp[(_id_maps[0][(int) chk.at8(r)] + 1) >> _shift]++;
        }
      }
    }
  }

  @Override protected void closeLocal() {
    DKV.put(getKey(_isLeft, _col, H2O.SELF), _counts, _fs, true);
    // Just the MSB counts per chunk on this node. Most of this spine will be empty here.
    // TODO: could condense to just the chunks on this node, but for now leave it sparse.
    // We'll use this sparse spine right now on this node, and the reduce happens on _o and _x later.
  }
}
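
// ---------------------------------------------------------------------------
// Illustrative sketch only, not referenced by the join code above: a tiny
// standalone example of the bucket arithmetic used in map(). The expression
// (value - _base + 1) >> _shift places each non-NA row into one of the 256 MSB
// buckets, while NA rows are counted in bucket 0. The class name and the
// concrete base/shift/value numbers below are made up for illustration; in the
// real code, base and shift are derived from the column's range elsewhere in
// the radix-order pass.
class RadixCountBucketSketch {
  public static void main(String[] args) {
    long base = 100;    // assumed column minimum for this example
    int shift = 4;      // assumed shift so that the bucket index fits in 0..255
    long value = 1250;  // an example (non-NA) join-column value

    int bucket = (int) ((value - base + 1) >> shift);  // (1250 - 100 + 1) >> 4 == 71
    System.out.println("value " + value + " lands in MSB bucket " + bucket);

    // NA rows skip this arithmetic entirely and are tallied in bucket 0,
    // mirroring the isNA(r) branch in RadixCount.map().
  }
}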