package water.rapids;
// Since we have a single key field in H2O (unlike data.table), bmerge() becomes a lot simpler (no
// need for recursion through join columns), with the downside of transfer cost when not all of the key is needed.
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import static water.rapids.SingleThreadRadixOrder.getSortedOXHeaderKey;
import water.util.ArrayUtils;
import java.util.Arrays;
class BinaryMerge extends DTask<BinaryMerge> {
long _numRowsInResult=0; // returned to caller, so not transient
int _chunkSizes[]; // TODO: only _chunkSizes.length is needed by caller, so return that length only
double _timings[];
private transient long _ret1st[/*n2GB*/][]; // for each left row, the row number of the first right-table index key that matches
private transient long _retLen[/*n2GB*/][]; // ...and how many right rows it matches to
final FFSB _leftSB, _riteSB;
private transient KeyOrder _leftKO, _riteKO;
private final int _numJoinCols;
private transient long _leftFrom;
private transient int _retBatchSize;
private final boolean _allLeft, _allRight;
// Does any left row match more than 1 right row? If not, we can allocate
// and loop more efficiently, and mark the resulting keyed frame with a
// 'unique' index. // TODO: implement
private transient boolean _oneToManyMatch = false;
// Data which is duplicated left and rite, but only one copy is needed
// per-map. This data is made in the constructor and shallow-copy shared
// around the cluster.
static class FFSB extends Iced<FFSB> {
private final Frame _frame;
private final Vec _vec;
// fast lookup to save repeated calls to node.index(), which does a
// binary search internally
private final int _chunkNode[]; // home-node index of each Chunk
final int _msb;
private final int _shift;
private final long _base[]; // the col.min() of each column in the key
private final int _fieldSizes[]; // the widths of each column in the key
private final int _keySize; // the total width in bytes of the key, sum of field sizes
FFSB( Frame frame, int msb, int shift, int fieldSizes[], long base[] ) {
assert -1<=msb && msb<=255; // left ranges from 0 to 255, right from -1 to 255
_frame = frame;
_msb = msb;
_shift = shift;
_fieldSizes = fieldSizes;
_keySize = ArrayUtils.sum(fieldSizes);
_base = base;
// Create fast lookups to go from chunk index to node index of that chunk
Vec vec = _vec = frame.anyVec();
_chunkNode = vec==null ? null : new int[vec.nChunks()];
if( vec == null ) return; // Zero-columns for Sort
for( int i=0; i<_chunkNode.length; i++ )
_chunkNode[i] = vec.chunkKey(i).home_node().index();
}
long min() { return (((long)_msb ) << _shift) + _base[0]-1; } // the first key possible in this bucket
long max() { return (((long)_msb+1) << _shift) + _base[0]-2; } // the last key possible in this bucket
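// Worked example (illustrative, assumed values): keys are stored biased as
// (value - base + 1) so that 0 is reserved for NA, and this bucket holds the
// stored keys whose MSB is _msb, i.e. [_msb<<_shift, ((_msb+1)<<_shift)-1].
// With _msb=3, _shift=8 and _base[0]=1000: min() = (3<<8)+1000-1 = 1767 and
// max() = (4<<8)+1000-2 = 2022; exactly 1<<_shift == 256 possible keys.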
}
// In data.table's X[Y], the left table is i (i.e. Y) and the right table is x (i.e. X)
BinaryMerge(FFSB leftSB, FFSB riteSB, boolean allLeft) {
assert riteSB._msb!=-1 || allLeft;
_leftSB = leftSB;
_riteSB = riteSB;
// the number of columns in the key, i.e. the length of _leftSB._fieldSizes and _riteSB._fieldSizes
_numJoinCols = Math.min(_leftSB._fieldSizes.length, _riteSB._fieldSizes.length);
_allLeft = allLeft;
_allRight = false; // TODO: pass through
}
@Override
public void compute2() {
_timings = new double[20];
long t0 = System.nanoTime();
SingleThreadRadixOrder.OXHeader leftSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/true, _leftSB._msb));
if (leftSortedOXHeader == null) {
if( !_allRight ) { tryComplete(); return; }
throw H2O.unimpl(); // TODO pass through _allRight and implement
}
_leftKO = new KeyOrder(leftSortedOXHeader);
SingleThreadRadixOrder.OXHeader rightSortedOXHeader = DKV.getGet(getSortedOXHeaderKey(/*left=*/false, _riteSB._msb));
//if (_riteSB._msb==-1) assert _allLeft && rightSortedOXHeader == null; // i.e. it's known nothing on right can join
if (rightSortedOXHeader == null) {
if( !_allLeft ) { tryComplete(); return; }
// enables general case code to run below without needing new special case code
rightSortedOXHeader = new SingleThreadRadixOrder.OXHeader(0, 0, 0);
}
_riteKO = new KeyOrder(rightSortedOXHeader);
// get left batches
_leftKO.initKeyOrder(_leftSB._msb,/*left=*/true);
final long leftN = leftSortedOXHeader._numRows;
assert leftN >= 1;
// get right batches
_riteKO.initKeyOrder(_riteSB._msb, /*left=*/false);
final long rightN = rightSortedOXHeader._numRows;
_timings[0] += (System.nanoTime() - t0) / 1e9;
// Now calculate which subset of leftMSB and which subset of rightMSB we're
// joining here by going into the detail of the key values present rather
// than the extents of the range (the extents themselves may not be
// present).
// We see where the right extents occur in the left keys present; and if
// there is an overlap we find the full extent of the overlap on the left
// side (nothing less).
// We only _need_ to do this for a left outer join, otherwise we'd end up
// with too many no-match left rows.
// However, we'd waste the allocation of the retFirst and retLen vectors if
// only a small overlap is needed, so it's useful to restrict their size
// even for an inner join too.
// Find left and right MSB extents in terms of the key boundaries they represent
// _riteSB._msb==-1 indicates that no right MSB should be looked at
final long leftMin = _leftSB.min(); // the first key possible in this bucket
final long leftMax = _leftSB.max(); // the last key possible in this bucket
// if _riteSB._msb==-1 then the values in riteMin and riteMax here are redundant and not used
final long riteMin = _riteSB._msb==-1 ? -1 : _riteSB.min(); // the first key possible in this bucket
final long riteMax = _riteSB._msb==-1 ? -1 : _riteSB.max(); // the last key possible in this bucket
_leftFrom = (_riteSB._msb==-1 || leftMin>=riteMin || (_allLeft && _riteSB._msb==0 )) ? -1 : bsearchLeft(riteMin, /*retLow*/true , leftN);
long leftTo = (_riteSB._msb==-1 || leftMax<=riteMax || (_allLeft && _riteSB._msb==255)) ? leftN : bsearchLeft(riteMax, /*retLow*/false, leftN);
// The (_allLeft && rightMSB==0) part is to include those keys in that
// leftMSB just below the right base. They won't be caught by rightMSBs to
// the left because there are no more rightMSBs below 0. Only when
// _allLeft do we need to create NA match for them. They must be created
// in the same MSB/MSB pair along with the keys that may match the very
// lowest right keys, because stitching assumes unique MSB/MSB pairs.
long retSize = leftTo - _leftFrom - 1; // since leftTo and leftFrom are 1 outside the extremes
assert retSize >= 0;
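// Worked example (illustrative, assumed values): left keys in this bucket
// {1500, 1800, 2100} and a right bucket spanning [riteMin=1767, riteMax=2022]:
// bsearchLeft(1767, /*retLow=*/true, 3) == 0 (last key below riteMin) and
// bsearchLeft(2022, /*retLow=*/false, 3) == 2 (first key above riteMax), so
// retSize = 2 - 0 - 1 = 1: only the left key 1800 can possibly match.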
if (retSize==0) { tryComplete(); return; } // nothing can match, even when allLeft
_retBatchSize = 268435456; // 2^28 longs per batch, keeping each long[] within 2^31 bytes (2GB)
int retNBatch = (int)((retSize - 1) / _retBatchSize + 1);
int retLastSize = (int)(retSize - (retNBatch - 1) * _retBatchSize);
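// e.g. (illustrative) retSize = 2^29+5 with _retBatchSize = 2^28:
// retNBatch = (2^29+4)/2^28 + 1 = 3; the first two batches hold 2^28 rows
// each and the remaining retLastSize = 2^29+5 - 2*2^28 = 5 rows land in the last.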
_ret1st = new long[retNBatch][];
_retLen = new long[retNBatch][];
for( int b=0; b<retNBatch; b++) {
_ret1st[b] = MemoryManager.malloc8(b==retNBatch-1 ? retLastSize : _retBatchSize);
_retLen[b] = MemoryManager.malloc8(b==retNBatch-1 ? retLastSize : _retBatchSize);
}
// Always look at the whole right bucket. Even though for types -1 and 1 we
// know the range is outside and nothing should match; if types -1 and 1 do
// occur, they only happen for leftMSB 0 and 255, and will quickly resolve
// to no-match in the right bucket via bmerge.
t0 = System.nanoTime();
bmerge_r(_leftFrom, leftTo, -1, rightN);
_timings[1] += (System.nanoTime() - t0) / 1e9;
if (_allLeft) {
assert _leftKO.numRowsToFetch() == retSize;
} else {
long tt = 0;
for( long[] retFirstx : _ret1st ) // i.e. sum(_ret1st>0) in R
for( long rF : retFirstx )
tt += (rF > 0) ? 1 : 0;
// TODO: move the loop above into a private assert method so it's skipped
// when asserts are off, or accumulate tt inside bmerge_r somehow
assert tt <= retSize;
assert _leftKO.numRowsToFetch() == tt;
}
if (_numRowsInResult > 0) createChunksInDKV();
// TODO: set 2 Frame and 2 int[] to NULL at the end of compute2 to save
// some traffic back, but should be small and insignificant
// TODO: recheck transients or null out here before returning
tryComplete();
}
// Holder for Key & Order info
private static class KeyOrder {
private final transient long _batchSize;
private final transient byte _key [/*n2GB*/][/*i mod 2GB * _keySize*/];
private final transient long _order[/*n2GB*/][/*i mod 2GB*/];
private final transient long _perNodeNumRowsToFetch[];
KeyOrder( SingleThreadRadixOrder.OXHeader sortedOXHeader ) {
_batchSize = sortedOXHeader._batchSize;
final int nBatch = sortedOXHeader._nBatch;
_key = new byte[nBatch][];
_order = new long[nBatch][];
_perNodeNumRowsToFetch = new long[H2O.CLOUD.size()];
}
void initKeyOrder( int msb, boolean isLeft ) {
for( int b=0; b<_key.length; b++ ) {
Value v = DKV.get(SplitByMSBLocal.getSortedOXbatchKey(isLeft, msb, b));
SplitByMSBLocal.OXbatch ox = v.get(); //mem version (obtained from remote) of the Values gets turned into POJO version
v.freeMem(); //only keep the POJO version of the Value
_key [b] = ox._x;
_order[b] = ox._o;
}
}
long numRowsToFetch() { return ArrayUtils.sum(_perNodeNumRowsToFetch); }
// Do a mod/div long _order array lookup
long at8order( long idx ) { return _order[(int)(idx / _batchSize)][(int)(idx % _batchSize)]; }
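// e.g. with _batchSize=4, at8order(10) reads _order[2][2]: batch 10/4 == 2, offset 10%4 == 2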
long[][] fillPerNodeRows( int i ) {
final int batchSizeLong = 256*1024*1024 / 16; // 256MB DKV value limit / sizeof(UUID) == 16m rows per batch
if( _perNodeNumRowsToFetch[i] <= 0 ) return null;
int nbatch = (int) ((_perNodeNumRowsToFetch[i] - 1) / batchSizeLong + 1); // TODO: wrap in class to avoid this boiler plate
assert nbatch >= 1;
int lastSize = (int) (_perNodeNumRowsToFetch[i] - (nbatch - 1) * batchSizeLong);
assert lastSize > 0;
long[][] res = new long[nbatch][];
for( int b = 0; b < nbatch; b++ )
res[b] = MemoryManager.malloc8(b==nbatch-1 ? lastSize : batchSizeLong);
return res;
}
}
// TODO: specialize keycmp for the case when no join column contains an NA
// (very common) and make it totally branch-free, i.e. without the two `==0 ? :` below
private int keycmp(byte xss[][], long xi, byte yss[][], long yi) {
// Must be passed a left key and a right key to avoid call overhead of
// extra arguments. Only need left to left for equality only and that's
// optimized in leftKeyEqual below.
byte xbatch[] = xss[(int)(xi / _leftKO._batchSize)];
byte ybatch[] = yss[(int)(yi / _riteKO._batchSize)];
int xoff = (int)(xi % _leftKO._batchSize) * _leftSB._keySize;
int yoff = (int)(yi % _riteKO._batchSize) * _riteSB._keySize;
long xval=0, yval=0;
// We avoid NewChunk compression because we want finer-grained compression
// than the 1, 2, 4 or 8 byte types. In particular, a range just greater
// than 4 billion can use 5 bytes rather than 8; a 38% RAM saving over the
// wire in that possibly-common case. Note this loop is tight and almost
// branch-free.
int i=0;
while( i<_numJoinCols && xval==yval ) { // TODO: pass i in to start at a later key column, when known
int xlen = _leftSB._fieldSizes[i];
int ylen = _riteSB._fieldSizes[i];
xval = xbatch[xoff] & 0xFFL; while (xlen>1) { xval <<= 8; xval |= xbatch[++xoff] & 0xFFL; xlen--; } xoff++;
yval = ybatch[yoff] & 0xFFL; while (ylen>1) { yval <<= 8; yval |= ybatch[++yoff] & 0xFFL; ylen--; } yoff++;
xval = xval==0 ? Long.MIN_VALUE : xval-1+_leftSB._base[i];
yval = yval==0 ? Long.MIN_VALUE : yval-1+_riteSB._base[i];
i++;
}
// The magnitude of the difference is used for limiting staleness in a
// rolling join, capped at Integer.MAX_VALUE (or Integer.MIN_VALUE+1). Roll's
// type is chosen to be int so staleness can't be requested beyond int's limit.
// Same return-value convention as strcmp in C: <0 => xi<yi.
long diff = xval-yval; // could overflow even in long; e.g. joining to a prevailing NA, or very large gaps O(2^62)
if (xval>yval) { // careful not diff>0 here due to overflow
return( (diff<0 | diff>Integer.MAX_VALUE ) ? Integer.MAX_VALUE : (int)diff);
} else {
return( (diff>0 | diff<Integer.MIN_VALUE+1) ? Integer.MIN_VALUE+1 : (int)diff);
}
}
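// Worked example of the field decode above (illustrative): a key column
// stored in 2 bytes as {0x01, 0x2C} yields raw 0x012C == 300; raw 0 is
// reserved for NA (mapped to Long.MIN_VALUE), so the decoded value is
// 300 - 1 + _base[i], e.g. 1299 when _base[i] == 1000.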
// binary search within this left MSB's keys, on the 1st key column only
private long bsearchLeft(long x, boolean returnLow, long upp) {
long low = -1;
while (low < upp - 1) {
long mid = low + (upp - low) / 2;
byte keyBatch[] = _leftKO._key[(int)(mid / _leftKO._batchSize)];
int off = (int)(mid % _leftKO._batchSize) * _leftSB._keySize;
int len = _leftSB._fieldSizes[0];
long val = keyBatch[off] & 0xFFL;
while( len>1 ) {
val <<= 8; val |= keyBatch[++off] & 0xFFL; len--;
}
val = val==0 ? Long.MIN_VALUE : val-1+_leftSB._base[0];
if (x<val || (x==val && returnLow)) {
upp = mid;
} else {
low = mid;
}
}
return returnLow ? low : upp;
}
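// e.g. (illustrative) left 1st-column values {3,5,5,9} at indices 0..3:
// bsearchLeft(5, true, 4) == 0 (last index strictly below 5) and
// bsearchLeft(5, false, 4) == 3 (first index strictly above 5), so the two
// results exclusively bracket the run of 5s; the same 1-outside-the-extremes
// convention that _leftFrom and leftTo follow in compute2() above.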
// Must be passed two leftKeys only.
// Optimized special case for the two calling points; see usages in bmerge_r below.
private boolean leftKeyEqual(byte x[][], long xi, long yi) {
byte xbatch[] = x[(int)(xi / _leftKO._batchSize)];
byte ybatch[] = x[(int)(yi / _leftKO._batchSize)];
int xoff = (int)(xi % _leftKO._batchSize) * _leftSB._keySize;
int yoff = (int)(yi % _leftKO._batchSize) * _leftSB._keySize;
int i=0;
while (i<_leftSB._keySize && xbatch[xoff++] == ybatch[yoff++]) i++;
return(i==_leftSB._keySize);
}
private void bmerge_r(long lLowIn, long lUppIn, long rLowIn, long rUppIn) {
// TODO: parallel each of the 256 bins
long lLow = lLowIn, lUpp = lUppIn, rLow = rLowIn, rUpp = rUppIn;
long mid, tmpLow, tmpUpp;
// i.e. (lLow+lUpp)/2, but written to be robust to (lLow+lUpp) one day
// overflowing long; e.g. 32 exabytes of 1-column ints
long lr = lLow + (lUpp - lLow) / 2;
while (rLow < rUpp - 1) {
mid = rLow + (rUpp - rLow) / 2;
int cmp = keycmp(_leftKO._key, lr, _riteKO._key, mid); // <0, 0 or >0, like strcmp
if (cmp < 0) {
rUpp = mid;
} else if (cmp > 0) {
rLow = mid;
} else { // rKey == lKey including NA == NA
// branch mid to find start and end of this group in this column
// TODO?: not if mult=first|last and col<ncol-1
tmpLow = mid;
tmpUpp = mid;
while (tmpLow < rUpp - 1) {
mid = tmpLow + (rUpp - tmpLow) / 2;
if (keycmp(_leftKO._key, lr, _riteKO._key, mid) == 0) tmpLow = mid;
else rUpp = mid;
}
while (rLow < tmpUpp - 1) {
mid = rLow + (tmpUpp - rLow) / 2;
if (keycmp(_leftKO._key, lr, _riteKO._key, mid) == 0) tmpUpp = mid;
else rLow = mid;
}
break;
}
}
// rLow and rUpp now surround the group in the right table.
// The left table key may (unusually, and not recommended, but sometimes needed) be duplicated.
// Linear-search outwards from the left row; most commonly, the first test
// shows this left key is unique. This saves (i) re-finding the matching
// right rows for every dup'd left row and (ii) the awkward recursive bounds
// logic when other left rows can reach the same right rows.
// Related to 'allow.cartesian' in data.table.
// TODO: if the index stores an attribute that it is unique then we don't
// need this step. However, each of these while()s would run at most once in
// that case, which may not be worth optimizing.
tmpLow = lr + 1;
// TODO: these while's could be rolled up inside leftKeyEqual saving call overhead
while (tmpLow<lUpp && leftKeyEqual(_leftKO._key, tmpLow, lr)) tmpLow++;
lUpp = tmpLow;
tmpUpp = lr - 1;
while (tmpUpp>lLow && leftKeyEqual(_leftKO._key, tmpUpp, lr)) tmpUpp--;
lLow = tmpUpp;
// lLow and lUpp now surround the group in the left table. If left key is unique then lLow==lr-1 and lUpp==lr+1.
assert lUpp - lLow >= 2;
// if value found, rLow and rUpp surround it, unlike standard binary search where rLow falls on it
long len = rUpp - rLow - 1;
// TODO - we don't need a loop here :) Why does _perNodeNumRowsToFetch increase so much?
if (len > 0 || _allLeft) {
long t0 = System.nanoTime();
if (len > 1) _oneToManyMatch = true;
_numRowsInResult += Math.max(1,len) * (lUpp-lLow-1); // 1 for NA row when _allLeft
for (long j = lLow + 1; j < lUpp; j++) { // usually iterates once only for j=lr, but more than once if there are dup keys in left table
// may be a range of left dup'd join-col values, but we need to fetch
// each one since the left non-join columns are likely not dup'd and
// may be the reason for the cartesian join
long t00 = System.nanoTime();
// TODO could loop through batches rather than / and % wastefully
long globalRowNumber = _leftKO.at8order(j);
_timings[17] += (System.nanoTime() - t00)/1e9;
t00 = System.nanoTime();
int chkIdx = _leftSB._vec.elem2ChunkIdx(globalRowNumber); //binary search in espc
_timings[15] += (System.nanoTime() - t00)/1e9;
// the key is the same within this left dup range, but still need to fetch left non-join columns
_leftKO._perNodeNumRowsToFetch[_leftSB._chunkNode[chkIdx]]++;
if (len==0) continue; // _allLeft must be true if len==0
// TODO: initial MSB splits should split down to small enough chunk
// size - but would that require more passes and if so, how long? Code
// simplification benefits would be welcome!
long outLoc = j - (_leftFrom + 1); // outOffset is 0 here in the standard scaling up high cardinality test
// outBatchSize can be different, and larger since known to be 8 bytes
// per item, both retFirst and retLen. (Allowing 8 byte here seems
// wasteful, actually.)
final int jb2 = (int)(outLoc/_retBatchSize);
final int jo2 = (int)(outLoc%_retBatchSize); // TODO - take outside the loop. However when we go deep-msb, this'll go away.
// rLow surrounds row, so +1. Then another +1 for 1-based
// row-number. 0 (default) means nomatch and saves extra set to -1 for
// no match. Could be significant in large edge cases by not needing
// to write at all to _ret1st if it has no matches.
_ret1st[jb2][jo2] = rLow + 2;
_retLen[jb2][jo2] = len;
}
// if we have dup'd left row, we only need to fetch the right rows once
// for the first dup. Those should then be recycled locally later.
for (long i=0; i<len; i++) {
long loc = rLow+1+i;
long t00 = System.nanoTime();
// TODO could loop through batches rather than / and % wastefully
long globalRowNumber = _riteKO.at8order(loc);
_timings[18] += (System.nanoTime() - t00)/1e9;
t00 = System.nanoTime();
int chkIdx = _riteSB._vec.elem2ChunkIdx(globalRowNumber); //binary search in espc
_timings[16] += (System.nanoTime() - t00)/1e9;
// Just count the number per node, so we can allocate arrays precisely up
// front, and also return early in case of memory errors or other
// distribution problems
_riteKO._perNodeNumRowsToFetch[_riteSB._chunkNode[chkIdx]]++;
}
_timings[14] += (System.nanoTime() - t0)/1e9;
}
// TODO: check assumption that retFirst and retLength are initialized to 0, for case of no match
// Now branch (and TODO in parallel) to merge below and merge above
// '|| _allLeft' is needed here in H2O (but not data.table) for the
// _leftKO._perNodeNumRowsToFetch above to populate and pass the assert near
// the end of the compute2() above.
if (lLow > lLowIn && (rLow > rLowIn || _allLeft))
bmerge_r(lLowIn, lLow+1, rLowIn, rLow+1);
if (lUpp < lUppIn && (rUpp < rUppIn || _allLeft))
bmerge_r(lUpp-1, lUppIn, rUpp-1, rUppIn);
// We don't reduce the global _ansN here and make a global frame, since we
// want to process each MSB l/r combo individually without allocating them
// all at once. Since this method is recursive, no more code should follow
// here (it would run far too many times).
}
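// A minimal standalone sketch of the recursive pattern above (a hypothetical
// helper, not called anywhere; it works on plain sorted long[] keys, ignores
// NAs and _allLeft, and writes into flat int[] outputs instead of the batched
// arrays). The shape mirrors bmerge_r: take the middle left row, binary
// search the right side for its key, widen both sides to the full matching
// groups, record the match with the same "+2, 0 means no-match" encoding,
// then recurse below and above. Initial call would be
// bmergeSketch(lk, rk, -1, lk.length, -1, rk.length, ret1st, retLen).
private static void bmergeSketch(long[] lk, long[] rk,
                                 int lLowIn, int lUppIn, int rLowIn, int rUppIn,
                                 int[] ret1st, int[] retLen) {
  int lLow = lLowIn, lUpp = lUppIn, rLow = rLowIn, rUpp = rUppIn;
  if (lUpp - lLow < 2) return;             // no left rows strictly inside (lLow, lUpp)
  final int lr = lLow + (lUpp - lLow) / 2; // middle left row, as 'lr' above
  final long key = lk[lr];
  while (rLow < rUpp - 1) {                // binary search the right side for 'key'
    int mid = rLow + (rUpp - rLow) / 2;
    if      (key < rk[mid]) rUpp = mid;
    else if (key > rk[mid]) rLow = mid;
    else {                                 // hit: branch out to the whole right group
      int tmpLow = mid, tmpUpp = mid;
      while (tmpLow < rUpp - 1) { int m = tmpLow + (rUpp - tmpLow)/2; if (rk[m]==key) tmpLow = m; else rUpp = m; }
      while (rLow < tmpUpp - 1) { int m = rLow + (tmpUpp - rLow)/2; if (rk[m]==key) tmpUpp = m; else rLow = m; }
      break;
    }
  }
  int tmp = lr + 1;                        // widen to the whole (possibly dup'd) left group
  while (tmp < lUpp && lk[tmp] == key) tmp++;
  lUpp = tmp;
  tmp = lr - 1;
  while (tmp > lLow && lk[tmp] == key) tmp--;
  lLow = tmp;
  final int len = rUpp - rLow - 1;         // rLow and rUpp now surround the matching right rows
  if (len > 0)
    for (int j = lLow + 1; j < lUpp; j++) {
      ret1st[j] = rLow + 2;                // first match, 1-based; 0 (default) means no match
      retLen[j] = len;
    }
  if (lLow > lLowIn && rLow > rLowIn) bmergeSketch(lk, rk, lLowIn, lLow+1, rLowIn, rLow+1, ret1st, retLen);
  if (lUpp < lUppIn && rUpp < rUppIn) bmergeSketch(lk, rk, lUpp-1, lUppIn, rUpp-1, rUppIn, ret1st, retLen);
}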
private void createChunksInDKV() {
// Collect all matches
// Create the final frame (part) for this MSB combination
// Cannot use a List<Long> as that's restricted to 2Bn items and also isn't an Iced datatype
long t0 = System.nanoTime(), t1;
final int cloudSize = H2O.CLOUD.size();
final long perNodeRightRows[][][] = new long[cloudSize][][];
final long perNodeLeftRows [][][] = new long[cloudSize][][];
// Allocate memory to split this MSB combn's left and right matching rows
// into contiguous batches sent to the nodes they reside on
for( int i = 0; i < cloudSize; i++ ) {
perNodeRightRows[i] = _riteKO.fillPerNodeRows(i);
perNodeLeftRows [i] = _leftKO.fillPerNodeRows(i);
}
_timings[2] += ((t1=System.nanoTime()) - t0) / 1e9; t0=t1;
// Loop over _ret1st and _retLen and populate the batched requests for
// each node helper. _ret1st and _retLen are the same shape
final long perNodeRightLoc[] = new long[cloudSize];
final long perNodeLeftLoc [] = new long[cloudSize];
chunksPopulatePerNode(perNodeLeftLoc,perNodeLeftRows,perNodeRightLoc,perNodeRightRows);
_timings[3] += ((t1=System.nanoTime()) - t0) / 1e9; t0=t1;
// Create the chunks for the final frame from this MSB pair.
// 16 bytes for each UUID (the biggest type). Enum will be long (8). TODO: how are non-Enum strings handled by H2O?
final int batchSizeUUID = 256*1024*1024 / 16; // number of rows per chunk to fit within the 256MB DKV value limit
final int nbatch = (int) ((_numRowsInResult-1)/batchSizeUUID +1); // TODO: wrap in class to avoid this boiler plate
assert nbatch >= 1;
final int lastSize = (int) (_numRowsInResult - (nbatch-1)*batchSizeUUID);
assert lastSize > 0;
final int numLeftCols = _leftSB._frame.numCols();
final int numColsInResult = _leftSB._frame.numCols() + _riteSB._frame.numCols() - _numJoinCols;
final double[][][] frameLikeChunks = new double[numColsInResult][nbatch][]; //TODO: compression via int types
_chunkSizes = new int[nbatch];
for( int col=0; col<numColsInResult; col++ )
for( int b = 0; b < nbatch; b++ ) {
frameLikeChunks[col][b] = MemoryManager.malloc8d(_chunkSizes[b] = (b==nbatch-1 ? lastSize : batchSizeUUID));
// NA by default to save filling with NA for nomatches when allLeft
Arrays.fill(frameLikeChunks[col][b], Double.NaN);
}
_timings[4] += ((t1=System.nanoTime()) - t0) / 1e9; t0=t1;
// Get Raw Remote Rows
final GetRawRemoteRows grrrsLeft[][] = new GetRawRemoteRows[cloudSize][];
final GetRawRemoteRows grrrsRite[][] = new GetRawRemoteRows[cloudSize][];
chunksGetRawRemoteRows(perNodeLeftRows,perNodeRightRows,grrrsLeft,grrrsRite);
_timings[6] += ((t1=System.nanoTime()) - t0) / 1e9; t0=t1; // all this time is expected to be in [5]
// Now loop through _ret1st and _retLen and populate
chunksPopulateRetFirst(numColsInResult, numLeftCols, perNodeLeftLoc, grrrsLeft, perNodeRightLoc, grrrsRite, frameLikeChunks);
_timings[10] += ((t1=System.nanoTime()) - t0) / 1e9; t0=t1;
// compress all chunks and store them
chunksCompressAndStore(nbatch, numColsInResult, frameLikeChunks);
_timings[11] += (System.nanoTime() - t0) / 1e9;
}
// Loop over _ret1st and _retLen and populate the batched requests for
// each node helper. _ret1st and _retLen are the same shape
private void chunksPopulatePerNode( final long perNodeLeftLoc[], final long perNodeLeftRows[][][], final long perNodeRightLoc[], final long perNodeRightRows[][][] ) {
final int batchSizeLong = 256*1024*1024 / 16; // 256MB DKV value limit / sizeof(UUID)
long prevf = -1, prevl = -1;
// TODO: hop back to original order here for [] syntax.
long leftLoc=_leftFrom; // sweep through left table along the sorted row locations.
for (int jb=0; jb<_ret1st.length; ++jb) { // jb = j batch
for (int jo=0; jo<_ret1st[jb].length; ++jo) { // jo = j offset
leftLoc++; // saves computing jb*_ret1st[0].length + jo
long f = _ret1st[jb][jo]; // TODO: take _ret1st[jb] outside inner loop
long l = _retLen[jb][jo];
if (f==0) {
// left row matches to no right row
assert l == 0; // doesn't have to be 0 (could be 1 already if allLeft==true) but currently it should be, so check it
if (!_allLeft) continue;
// now insert the left row once and NA for the right columns i.e. left outer join
}
{ // new scope so 'row' can be declared in the for() loop below and registerized (otherwise a compile error: 'already defined in this scope')
// Fetch the left rows and mark the contiguous from-ranges each left row should be recycled over
// TODO: when single node, not needed
// TODO could loop through batches rather than / and % wastefully
long row = _leftKO.at8order(leftLoc);
int chkIdx = _leftSB._vec.elem2ChunkIdx(row); //binary search in espc
int ni = _leftSB._chunkNode[chkIdx];
long pnl = perNodeLeftLoc[ni]++; // pnl = per node location
perNodeLeftRows[ni][(int)(pnl/batchSizeLong)][(int)(pnl%batchSizeLong)] = row; // ask that node for global row number row
}
if (f==0) continue;
assert l > 0;
if (prevf == f && prevl == l)
continue; // don't re-fetch the same matching rows (cartesian). We'll repeat them locally later.
prevf = f; prevl = l;
for (int r=0; r<l; r++) {
long loc = f+r-1; // -1 because these are 0-based where 0 means no-match and 1 refers to the first row
// TODO: could take / and % outside loop in cases where it doesn't span a batch boundary
long row = _riteKO.at8order(loc);
// find the owning node for the row, using local operations here
int chkIdx = _riteSB._vec.elem2ChunkIdx(row); //binary search in espc
int ni = _riteSB._chunkNode[chkIdx];
// TODO Split to an if() and batch and offset separately
long pnl = perNodeRightLoc[ni]++; // pnl = per node location.
perNodeRightRows[ni][(int)(pnl/batchSizeLong)][(int)(pnl%batchSizeLong)] = row; // ask that node for global row number row
}
}
}
// TODO assert that perNodeRite and Left are exactly equal to the number
// expected and allocated.
Arrays.fill(perNodeLeftLoc ,0); // clear for reuse below
Arrays.fill(perNodeRightLoc,0);
}
// Get Raw Remote Rows
private void chunksGetRawRemoteRows(final long perNodeLeftRows[][][], final long perNodeRightRows[][][], GetRawRemoteRows grrrsLeft[][], GetRawRemoteRows grrrsRite[][]) {
RPC<GetRawRemoteRows> grrrsRiteRPC[][] = new RPC[H2O.CLOUD.size()][];
RPC<GetRawRemoteRows> grrrsLeftRPC[][] = new RPC[H2O.CLOUD.size()][];
// Launch remote tasks left and right
for( H2ONode node : H2O.CLOUD._memary ) {
final int ni = node.index();
final int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
final int bUppLeft = perNodeLeftRows[ni] == null ? 0 : perNodeLeftRows[ni].length;
grrrsRiteRPC[ni] = new RPC[bUppRite];
grrrsLeftRPC[ni] = new RPC[bUppLeft];
grrrsRite[ni] = new GetRawRemoteRows[bUppRite];
grrrsLeft[ni] = new GetRawRemoteRows[bUppLeft];
for (int b = 0; b < bUppRite; b++) {
// TODO try again now with better surrounding method
// Arrays.sort(perNodeRightRows[ni][b]); Simple quick test of fetching in monotonic order. Doesn't seem to help so far.
grrrsRiteRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_riteSB._frame, perNodeRightRows[ni][b])).call();
}
for (int b = 0; b < bUppLeft; b++) {
// Arrays.sort(perNodeLeftRows[ni][b]);
grrrsLeftRPC[ni][b] = new RPC<>(node, new GetRawRemoteRows(_leftSB._frame, perNodeLeftRows[ni][b])).call();
}
}
for( H2ONode node : H2O.CLOUD._memary ) {
// TODO: just send and wait for first batch on each node and then .get() next batch as needed.
int ni = node.index();
final int bUppRite = perNodeRightRows[ni] == null ? 0 : perNodeRightRows[ni].length;
for (int b = 0; b < bUppRite; b++)
_timings[5] += (grrrsRite[ni][b] = grrrsRiteRPC[ni][b].get()).timeTaken;
final int bUppLeft = perNodeLeftRows[ni] == null ? 0 : perNodeLeftRows[ni].length;
for (int b = 0; b < bUppLeft; b++)
_timings[5] += (grrrsLeft[ni][b] = grrrsLeftRPC[ni][b].get()).timeTaken;
}
}
// Now loop through _ret1st and _retLen and populate
private void chunksPopulateRetFirst(final int numColsInResult, final int numLeftCols, final long perNodeLeftLoc[], final GetRawRemoteRows grrrsLeft[][], final long perNodeRightLoc[], final GetRawRemoteRows grrrsRite[][], final double[][][] frameLikeChunks) {
// 16 bytes for each UUID (biggest type). Enum will be long (8).
// TODO: How is non-Enum 'string' handled by H2O?
final int batchSizeUUID = 256*1024*1024 / 16; // number of rows per chunk to fit within the 256MB DKV value limit
long resultLoc=0; // sweep upwards through the final result, filling it in
// TODO: hop back to original order here for [] syntax.
long leftLoc=_leftFrom; // sweep through left table along the sorted row locations.
long prevf = -1, prevl = -1;
for (int jb=0; jb<_ret1st.length; ++jb) { // jb = j batch
for (int jo=0; jo<_ret1st[jb].length; ++jo) { // jo = j offset
leftLoc++; // saves computing jb*_ret1st[0].length + jo
long f = _ret1st[jb][jo]; // TODO: take _ret1st[jb] outside inner loop
long l = _retLen[jb][jo];
if (f==0 && !_allLeft) continue; // f==0 => left row matches to no right row
// else insert the left row once and NA for the right columns i.e. left outer join
// Fetch the left rows and recycle it if more than 1 row in the right table is matched to.
// TODO could loop through batches rather than / and % wastefully
long row = _leftKO.at8order(leftLoc);
// TODO should leftOrder and retFirst/retLen have the same batch size to make this easier?
// TODO can we not just loop through _leftKO._order only? Why loop through jb and jo too?
int chkIdx = _leftSB._vec.elem2ChunkIdx(row); //binary search in espc
int ni = _leftSB._chunkNode[chkIdx];
long pnl = perNodeLeftLoc[ni]++; // pnl = per node location. TODO: batch increment this rather than incrementing per row
int b = (int)(pnl / batchSizeUUID);
int o = (int)(pnl % batchSizeUUID);
double[][] chks = grrrsLeft[ni][b]._chk;
final int l1 = Math.max((int)l,1);
for (int rep = 0; rep < l1; rep++) {
long a = resultLoc + rep;
// TODO: loop into batches to save / and % for each repeat and still
// cater for crossing multiple batch boundaries
int whichChunk = (int) (a / batchSizeUUID);
int offset = (int) (a % batchSizeUUID);
for (int col=0; col<chks.length; col++) {
// TODO: this only works for numeric columns (not for UUID, strings, etc.)
frameLikeChunks[col][whichChunk][offset] = chks[col][o]; // colForBatch.atd(row);
}
}
if (f==0) { resultLoc++; continue; } // no match so just one row (NA for right table) to advance over
assert l > 0;
if (prevf == f && prevl == l) {
// just copy from the rows populated earlier in the result (by the for()
// below on a previous iteration). Contiguous, easy, in-cache copy (aside
// from batch boundaries).
for (int r=0; r<l; r++) {
// TODO: loop into batches to save / and % for each repeat and
// still cater for crossing multiple batch boundaries
int toChunk = (int) (resultLoc / batchSizeUUID);
int toOffset = (int) (resultLoc % batchSizeUUID);
int fromChunk = (int) ((resultLoc - l) / batchSizeUUID);
int fromOffset = (int) ((resultLoc - l) % batchSizeUUID);
for (int col=0; col<numColsInResult-numLeftCols; col++) {
frameLikeChunks[numLeftCols + col][toChunk][toOffset] = frameLikeChunks[numLeftCols + col][fromChunk][fromOffset];
}
resultLoc++;
}
continue;
}
prevf = f;
prevl = l;
for (int r=0; r<l; r++) {
// TODO: loop into batches to save / and % for each repeat and still
// cater for crossing multiple batch boundaries
int whichChunk = (int) (resultLoc / batchSizeUUID);
int offset = (int) (resultLoc % batchSizeUUID);
long loc = f+r-1; // -1 because these are 0-based where 0 means no-match and 1 refers to the first row
// TODO: could take / and % outside loop in cases where it doesn't span a batch boundary
row = _riteKO.at8order(loc);
// find the owning node for the row, using local operations here
chkIdx = _riteSB._vec.elem2ChunkIdx(row); //binary search in espc
ni = _riteSB._chunkNode[chkIdx];
pnl = perNodeRightLoc[ni]++; // pnl = per node location. // TODO Split to an if() and batch and offset separately
chks = grrrsRite[ni][(int)(pnl / batchSizeUUID)]._chk;
o = (int)(pnl % batchSizeUUID);
for (int col=0; col<numColsInResult-numLeftCols; col++) {
// TODO: this only works for numeric columns (not for UUID, strings, etc.)
frameLikeChunks[numLeftCols + col][whichChunk][offset] = chks[_numJoinCols + col][o]; // colForBatch.atd(row);
}
resultLoc++;
}
}
}
}
// compress all chunks and store them
private void chunksCompressAndStore(final int nbatch, final int numColsInResult, final double[][][] frameLikeChunks) {
// compress all chunks and store them
Futures fs = new Futures();
for (int col=0; col<numColsInResult; col++) {
for (int b = 0; b < nbatch; b++) {
Chunk ck = new NewChunk(frameLikeChunks[col][b]).compress();
DKV.put(getKeyForMSBComboPerCol(_leftSB._msb, _riteSB._msb, col, b), ck, fs, true);
frameLikeChunks[col][b]=null; //free mem as early as possible (it's now in the store)
}
}
fs.blockForPending();
}
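// Illustrative round-trip of the compression above (a sketch using only the
// NewChunk(double[]) constructor and Chunk.atd() already used in this file):
//   double[] vals = {1.0, 2.5, Double.NaN};
//   Chunk ck = new NewChunk(vals).compress(); // picks a compact representation
//   assert ck.atd(0) == 1.0 && Double.isNaN(ck.atd(2)); // values and NAs survive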
static Key getKeyForMSBComboPerCol(/*Frame leftFrame, Frame rightFrame,*/ int leftMSB, int rightMSB, int col /*final table*/, int batch) {
return Key.make("__binary_merge__Chunk_for_col" + col + "_batch" + batch
// + rightFrame._key.toString() + "_joined_with" + leftFrame._key.toString()
+ "_leftSB._msb" + leftMSB + "_riteSB._msb" + rightMSB,
(byte) 1, Key.HIDDEN_USER_KEY, false, SplitByMSBLocal.ownerOfMSB(rightMSB==-1 ? leftMSB : rightMSB)
); //TODO home locally
}
static class GetRawRemoteRows extends DTask<GetRawRemoteRows> {
Frame _fr;
long[/*rows*/] _rows; //which rows to fetch from remote node, non-null on the way to remote, null on the way back
double[/*col*/][] _chk; //null on the way to remote node, non-null on the way back
double timeTaken;
GetRawRemoteRows(Frame fr, long[] rows) { _rows = rows; _fr = fr; }
@Override
public void compute2() {
assert(_rows!=null);
assert(_chk ==null);
long t0 = System.nanoTime();
// System.out.print("Allocating _chk with " + _fr.numCols() +" by " + _rows.length + "...");
_chk = MemoryManager.malloc8d(_fr.numCols(),_rows.length); // TODO: should this be transposed in memory?
// System.out.println("done");
int cidx[] = MemoryManager.malloc4(_rows.length);
int offset[] = MemoryManager.malloc4(_rows.length);
Vec anyVec = _fr.anyVec(); assert anyVec != null;
for (int row=0; row<_rows.length; row++) {
cidx[row] = anyVec.elem2ChunkIdx(_rows[row]); // binary search of espc array. TODO: sort input row numbers to avoid
offset[row] = (int)(_rows[row] - anyVec.espc()[cidx[row]]);
}
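// e.g. (illustrative) with espc = {0, 4, 8}, global row 6 lives in chunk
// cidx == 1 at local offset 6 - espc[1] == 2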
Chunk c[] = new Chunk[anyVec.nChunks()];
for (int col=0; col<_fr.numCols(); col++) {
Vec v = _fr.vec(col);
for (int i=0; i<c.length; i++) c[i] = v.chunkKey(i).home() ? v.chunkForChunkIdx(i) : null;
for (int row=0; row<_rows.length; row++) {
_chk[col][row] = c[cidx[row]].atd(offset[row]);
}
}
// Null out the request fields so only _chk (the fetched data) travels back;
// the caller sent one such batch per entry of perNodeRows[node].
_rows=null;
_fr=null;
assert(_chk !=null);
timeTaken = (System.nanoTime() - t0) / 1e9;
tryComplete();
}
}
}