package water.rapids;
import water.*;
import water.fvec.Chunk;
// Per-node, per-chunk histogram of the MSB (most significant byte, after
// shifting by _shift) of one join column. map() fills a 256-entry count array
// for each chunk; the sparse spine of per-chunk arrays is published to the
// DKV in closeLocal() under a deterministic, node-homed key.
class RadixCount extends MRTask<RadixCount> {
  // Minimal Iced wrapper so a jagged long[][] can be stored in the DKV.
  static class Long2DArray extends Iced {
    Long2DArray(int len) { _val = new long[len][]; }
    long _val[][];
  }
  private Long2DArray _counts;
  private final int _shift;
  private final int _col;
  private final long _base;
  // used to determine the unique DKV names since DF._key is null now and
  // before only an RTMP name anyway
  private final boolean _isLeft;
  private final int _id_maps[][];

  RadixCount(boolean isLeft, long base, int shift, int col, int id_maps[][]) {
    _isLeft = isLeft;
    _base = base;
    _col = col;
    _shift = shift;
    _id_maps = id_maps;
  }

  // Deterministic key, unique per (side, column, node), homed to the owning
  // node. Each node's counts differ, hence the node index in the name.
  // TODO: need the biggestBit in here too, that the MSB is offset from
  static Key getKey(boolean isLeft, int col, H2ONode node) {
    return Key.make("__radix_order__MSBNodeCounts_col" + col + "_node" + node.index() + (isLeft ? "_LEFT" : "_RIGHT"));
  }

  @Override protected void setupLocal() {
    // One slot per chunk of the frame; only this node's chunks get filled.
    _counts = new Long2DArray(_fr.anyVec().nChunks());
  }

  @Override public void map( Chunk chk ) {
    long[] counts = new long[256];
    _counts._val[chk.cidx()] = counts;
    // TODO: assert chk instanceof integer or enum; -- but how since many
    // integers (C1,C2 etc)? Alternatively: chk.getClass().equals(C8Chunk.class)
    if (_isLeft && chk.vec().isCategorical()) {
      // Leading (MSB-split) column is an enum on the left side: translate
      // left levels to right levels through _id_maps before bucketing.
      assert _id_maps[0].length > 0;
      assert _base == 0;
      if (chk.vec().naCnt() != 0) {
        for (int row = 0; row < chk._len; ++row) {
          if (chk.isNA(row)) counts[0]++;
          else counts[(_id_maps[0][(int) chk.at8(row)] + 1) >> _shift]++;
        }
      } else {
        for (int row = 0; row < chk._len; ++row)
          counts[(_id_maps[0][(int) chk.at8(row)] + 1) >> _shift]++;
      }
    } else {
      if (chk.vec().naCnt() != 0) {
        // NAs present, so each row must branch. NA lands in bucket 0; we will
        // join NA to NA as data.table does.
        // TODO: warn user NA are present in join column.
        // TODO: allow NA-to-NA join to be turned off. Do that in bmerge as a
        // simple low-cost switch.
        // Note that NA and the minimum may well both be in MSB 0 but most of
        // the time we will not have NA in join columns.
        for (int row = 0; row < chk._len; ++row) {
          if (chk.isNA(row)) counts[0]++;
          else counts[(int) ((chk.at8(row) - _base + 1) >> _shift)]++;
        }
      } else {
        // No NA in this join column, hence a branch-free loop. This is the
        // most common case: join columns should rarely contain NA.
        for (int row = 0; row < chk._len; ++row)
          counts[(int) ((chk.at8(row) - _base + 1) >> _shift)]++;
        // TODO - use _mem directly. Hist the compressed bytes and then shift
        // the histogram afterwards when reducing.
      }
    }
  }

  @Override protected void closeLocal() {
    // Publish just the MSB counts per chunk on this node. Most of this spine
    // will be empty here.
    // TODO: could condense to just the chunks on this node but for now, leave
    // sparse. We'll use this sparse spine right now on this node and the
    // reduce happens on _o and _x later.
    DKV.put(getKey(_isLeft, _col, H2O.SELF), _counts, _fs, true);
  }
}