/*
package water.rapids.ast.prims.mungers;
import water.fvec.*;
import water.*;
import water.rapids.RadixCount;
import water.rapids.assignG;
import water.util.ArrayUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
// NOTE(review): this entire file is wrapped in one block comment (dead code kept
// for reference); the notes below use // so they stay valid if it is revived.
//
// Prototype of a radix-sort-based group-by over the key columns of a Frame.
// Intended result: a group-index vector over all rows, batched into long[][]
// because a single Java array is capped at 2^31-1 elements.
public class AstGroupSorted {
// 2^31 bytes > java max (2^31-1), so 2^30 / 8 bytes per long. TO DO - how to make global?
//private static final int MAXVECLONG = 134217728;
//private static final int MAXVECBYTE = 1073741824;
// Sorts/groups the rows of groupCols and returns the per-row group index in
// MAXVECLONG-sized batches. Currently stubbed out: returns a fixed {1,2,3}.
long[][] sort(Frame groupCols) {
//return (new RadixOrder(groupCols, ArrayUtils.seq(0,groupCols.numCols()-1))._groupIndex); // TO DO: won't work yet as needs 2nd group step
return (new long[][] {{1,2,3}});
// NOTE(review): everything below this return is unreachable — if this file were
// ever uncommented, Java would reject it with an "unreachable statement"
// compile error. It also references MAXVECLONG/MAXVECBYTE (commented out at the
// top of the class) and helper types (MoveByFirstByte, dradix) that are not
// imported here, so reviving it needs more than removing the block comment.
// a vector
System.out.println("Calling RadixCount ...");
long t0 = System.nanoTime();
long t00 = t0;
int nChunks = groupCols.anyVec().nChunks();
if( groupCols.numCols() != 1 ) throw H2O.unimpl(); // Only looking at column 0 for now
// counts[byte][chunk][bucket]: per-chunk 256-bucket histograms for each of the
// 8 bytes of the key column — presumably filled by RadixCount; TODO confirm layout.
long counts[][][] = new RadixCount(nChunks).doAll(groupCols.vec(0))._counts;
System.out.println("Time of RadixCount: " + (System.nanoTime() - t0) / 1e9); t0 = System.nanoTime();
// for (int c=0; c<5; c++) { System.out.print("First 10 for chunk "+c+" byte 0: "); for (int i=0; i<10; i++) System.out.print(counts[0][c][i] + " "); System.out.print("\n"); }
// Total histogram of the most-significant radix byte, summed across chunks.
long totalHist[] = new long[256];
for (int c=0; c<nChunks; c++) {
for (int h=0; h<256; h++) {
totalHist[h] += counts[5][c][h]; // TO DO: hard coded 5 here
// NOTE(review): byte index 5 is hard-coded here (and again in the
// MoveByFirstByte call below), but the cumulate loop next iterates b=0..7 —
// confirm which byte is really the MSD split before reviving this code.
}
}
// Convert each per-chunk, per-bucket count into an exclusive prefix sum across
// chunks: after this loop, counts[b][c][h] is the write offset where chunk c
// starts placing rows that fall in bucket h of byte b.
for (int b=0; b<8; b++) {
for (int h=0; h<256; h++) {
long rollSum = 0;
for (int c = 0; c < nChunks; c++) {
long tmp = counts[b][c][h];
counts[b][c][h] = rollSum;
rollSum += tmp;
}
}
}
// Any radix skipping needs to be detected with a loop over node results to ensure no use of those bits on any node.
System.out.println("Time to cumulate counts: " + (System.nanoTime() - t0) / 1e9); t0 = System.nanoTime();
// TO DO: by this stage we know now the width of byte field we need. So allocate it tight up to MAXVEC
// TO DO: reduce to 5 if we're only passed the first column
int keySize = 7;
// Per MSD-bucket scratch: o = row-order vectors, x = packed key bytes aligned
// with o. Both are batched in a 2nd dimension because one bucket may exceed
// the 2^31-1 single-array limit.
long o[][][] = new long[256][][];
byte x[][][] = new byte[256][][]; // for each bucket, there might be > 2^31 bytes, so an extra dimension for that
for (int c=0; c<256; c++) {
if (totalHist[c] == 0) continue;
int d;
// NOTE(review): MAXVECBYTE/MAXVECLONG used below are commented out at the top
// of the class — another latent compile error if this code is uncommented.
int nbatch = (int)(totalHist[c] * Math.max(keySize,8) / MAXVECBYTE); // TO DO. can't be 2^31 because 2^31-1 was limit. If we use 2^30, instead of /, can we do >> for speed?
int rem = (int)(totalHist[c] * Math.max(keySize,8) % MAXVECBYTE);
assert nbatch==0; // in the case of 20m rows, we should always be well within a batch size
// The Math.max ensures that batches are aligned, even for wide keys. For efficiency inside insert() above so it doesn't have to cross boundaries.
o[c] = new long[nbatch + (rem>0?1:0)][];
x[c] = new byte[nbatch + (rem>0?1:0)][];
assert nbatch==0;
// Dead loop while the asserts above hold (nbatch==0): full-size batches.
for (d=0; d<nbatch; d++) {
o[c][d] = new long[MAXVECLONG];
// TO DO?: use MemoryManager.malloc8()
x[c][d] = new byte[MAXVECBYTE];
}
// Final partial batch; note o gets `rem` longs but x gets rem*keySize bytes —
// asymmetric because rem was computed in bytes; TODO confirm the intended units.
if (rem>0) {
o[c][d] = new long[rem];
x[c][d] = new byte[rem * keySize];
}
}
System.out.println("Time to allocate o[][] and x[][]: " + (System.nanoTime() - t0) / 1e9); t0 = System.nanoTime();
// NOT TO DO: we do need the full allocation of x[] and o[]. We need o[]
// anyway. x[] will be as dense as possible.
// o is the full ordering vector of the right size
// x is the byte key aligned with o
// o AND x are what bmerge() needs. Pushing x to each node as well as o avoids inter-node comms.
// feasibly, that we could move by byte 5 and then skip the next byte. Too
// complex case though and rare so simplify
// Scatter every row into its MSD bucket (byte 5), filling o (row numbers) and
// x (remaining key bytes) using the cumulated counts as write offsets.
new MoveByFirstByte(5, o, x, counts, keySize).doAll(groupCols);
System.out.println("Time to MoveByFirstByte: " + (System.nanoTime() - t0) / 1e9); t0 = System.nanoTime();
// Add check that this first split is reasonable. e.g. if it were just 2,
// it definitely would not be enough. 90 is enough though. Need to fill
// L2 with pages.
// for counted completer 0:255
// One recursive radix task per non-empty MSD bucket; each task records its
// group boundaries in groups[i] and its group count in nGroup[i].
long groups[][] = new long[256][]; // at most MAXVEC groups per radix, currently
long nGroup[] = new long[257]; // one extra to make undo of cumulate easier
Futures fs = new Futures();
for (int i=0; i<256; i++) {
if (totalHist[i] > 0)
fs.add(H2O.submitTask(new dradix(groups, nGroup, i, x[i], o[i], totalHist[i], keySize)));
}
fs.blockForPending();
// Exclusive prefix sum over nGroup: nGroup[i] becomes the global group-id
// offset for bucket i, and nGroups the total number of groups found.
long nGroups = 0;
for (int i = 0; i < 257; i++) {
long tmp = nGroup[i];
nGroup[i] = nGroups;
nGroups += tmp;
}
System.out.println("Time to recursive radix: " + (System.nanoTime() - t0) / 1e9 ); t0 = System.nanoTime();
System.out.println("Total groups found: " + nGroups);
// We now have o and x that bmerge() needs
// Allocate the final per-row group-index vector g, batched at MAXVECLONG.
long nrow = groupCols.numRows();
long g[][] = new long[(int)(1 + nrow / MAXVECLONG)][];
int c;
for (c=0; c<nrow/MAXVECLONG; c++) {
g[c] = new long[MAXVECLONG];
}
g[c] = new long[(int)(nrow % MAXVECLONG)];
// One assignG task per non-empty bucket writes that bucket's global group ids
// into g at the row positions recorded in o[i].
fs = new Futures();
for (int i=0; i<256; i++) {
if (totalHist[i] > 0)
fs.add(H2O.submitTask(new assignG(g, groups[i], nGroup[i+1]-nGroup[i], nGroup[i], o[i])));
// reuse the x vector we allocated before to store the group numbers. i.e. a perfect and ordered hash, stored alongside table
}
fs.blockForPending();
System.out.println("Time to assign group index (length nrows): " + (System.nanoTime() - t0) / 1e9 ); t0 = System.nanoTime();
return g;
}
}
*/