package water.rapids; import java.util.Arrays; import java.util.Random; import hex.CreateFrame; import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; import water.*; import water.fvec.*; import water.util.ArrayUtils; import water.util.RandomUtils; class MySample extends MRTask<MySample> { private final int K; MySample(int K) { this.K = K; } @Override public void map( Chunk chk ) { Random randomGenerator = water.util.RandomUtils.getRNG(chk.cidx()); for (int i=0; i<chk.len(); i++) { chk.set(i, randomGenerator.nextInt(K)); } } } class MyCountRange extends MRTask<MyCountRange> { private final long _max, _min; long _counts[][]; private int _nChunks; // int ans; MyCountRange(long max, long min, int nChunks) { System.out.println("Constructor for MyCountRange"); _max = max; _min = min; _counts = new long[nChunks][]; _nChunks = nChunks; } // setupLocal(); // to do. No because the memory gets copied across. Constructor doesn't need to run on other nodes. @Override public void map( Chunk chk ) { long tmp[] = _counts[chk.cidx()] = new long[(int)(_max-_min+1)]; //long tmp[] = new long[(int)(_max-_min+1)]; // does non-sharing help? If so, assign afterwards after the loop? int rows = chk._len; //double dummyCounter=1; for (int r=0; r<rows; r++) tmp[(int)(chk.at8(r)-_min)]++; // for (int w=0; w<10; w++) // for (long r=0; r<((long)rows)*25; r++) dummyCounter *= 3.14; //Math.sin(3.14); //(int)(chk.at8(r)-_min); // ans = (int)Math.round(dummyCounter); // use it to stop it being dropped by optimizer } @Override public void reduce(MyCountRange g) { if (g._counts != _counts) { // assign the counts from the other one System.out.println("This should just print once since 2 nodes"); for (int c=0; c<_nChunks; c++) { if (g._counts[c] != null) { assert _counts[c] == null; _counts[c] = g._counts[c]; //for (int i=0; i<_max-_min+1; i++) { // //} } else { assert _counts[c] != null; } } // throw H2O.unimpl(); } } } class MyCountRangeNoSpline extends MRTask<MyCountRangeNoSpline> { private final long _max, _min; private long _counts[]; MyCountRangeNoSpline(long max, long min, int nChunks) { System.out.println("Constructor for MyCountRange"); _max = max; _min = min; } @Override public void map( Chunk chk ) { _counts = new long[(int)(_max-_min+1)]; int rows = chk._len; for (int r=0; r<rows; r++) _counts[(int)(chk.at8(r)-_min)]++; } @Override public void reduce(MyCountRangeNoSpline g) { ArrayUtils.add(_counts, g._counts); } } /* * When order is one single array. But Java memalloc fail on 1e9 items. */ /* class WriteOrder extends MRTask<WriteOrder> { final long _counts[][]; final long _order[]; final long _min; final long _max; WriteOrder(long[][] counts, long[] order, long min, long max) { _counts = counts; _order = order; _min = min; _max=max; } @Override public void map( Chunk chk ) { long myCounts[] = _counts[chk._cidx]; for (int r=0; r<chk._len; r++) _order[ (int)(myCounts[(int)(chk.at8(r)-_min)]++) ] = r+chk._start; } } */ class WriteOrder extends MRTask<WriteOrder> { final long _counts[][]; final int _order[][]; final long _min; final long _max; WriteOrder(long[][] counts, int[][] order, long min, long max) { _counts = counts; _order = order; _min = min; _max = max; } @Override public void map( Chunk chk ) { long nanos[] = new long[5]; Vec vec = chk.vec(); int range = (int)(_max-_min+1); long[] espc = vec.espc(); long myCounts[] = _counts[chk.cidx()]; // Test thread local counts. Keep in cache and never push to RAM (don't need to be shared) // long myCounts[] = new long[(int)(_max-_min+1)]; // for (int i=0; i<_max-_min+1; i++) myCounts[i] = _counts[chk._cidx][i]; int myTargetChunks[] = new int[range]; for (int i=0; i<_max-_min+1; i++) myTargetChunks[i] = vec.elem2ChunkIdx(myCounts[i]); // elem2ChunkIdx is a binary search due to chunks not being equal size. Try to avoid. for (int r=0; r<chk._len; r++) { //long t0 = System.nanoTime(); int group = (int)(chk.at8(r)-_min); //nanos[0] += System.nanoTime()-t0; t0=System.nanoTime(); long target = myCounts[group]++; //nanos[1] += System.nanoTime()-t0; t0=System.nanoTime(); int targetChunk = myTargetChunks[group]; //nanos[2] += System.nanoTime()-t0; t0=System.nanoTime(); if ( target == espc[targetChunk+1] ) { myTargetChunks[group]++; targetChunk++; } // crossed chunk boundary //nanos[3] += System.nanoTime()-t0; t0=System.nanoTime(); //_order[targetChunk][(int)(target - espc[targetChunk])] = r+(int)chk._start; //nanos[4] += System.nanoTime()-t0; } //System.out.print("Chunk "+chk._cidx+": "); for (int i=0; i<5; i++) System.out.print(Math.round(nanos[i]/1e6)+" "); System.out.print("\n"); // print ms } } public class GroupingBench extends TestUtil { @BeforeClass public static void setup() { stall_till_cloudsize(1); } @Ignore @Test public void runGroupingBench() { // Simplified version of tests in runit_quantile_1_golden.R. There we test probs=seq(0,1,by=0.01) //Vec vec = Vec.makeCon(1.1, 1000000000); //Vec vec = Vec.makeRepSeq(10,10); Vec vec = Vec.makeZero((long)1e9); //System.out.println("Chunks: " + vec.nChunks()); //System.out.println("Vec length: " + vec.length()); //System.out.println("Populating vector... "); //new MySeq((int)100).doAll(vec); //new MySample((int)10).doAll(vec); new MySample((int)10).doAll(vec); vec.max(); // to cache rollups, so timing below excludes it System.out.println("\nFirst 30 of vec ..."); System.out.println("There are "+vec.nChunks()+" chunks"); for (int i=0; i<vec.nChunks(); i++) { System.out.println("Chunk"+i+"is on"+vec.chunkKey(i).home_node()); } CreateFrame cf = new CreateFrame(); cf.rows = 100; cf.cols = 10; cf.categorical_fraction = 0.1; cf.integer_fraction = 1 - cf.categorical_fraction; cf.binary_fraction = 0; cf.factors = 4; cf.response_factors = 2; cf.positive_response = false; cf.has_response = true; cf.seed = 1234; Frame frame = cf.execImpl().get(); System.out.print( frame.toString(0,14) ); for (int i=0; i<30; i++) System.out.print((int)vec.at(i) + " "); System.out.println("\n"); // Vec vec = vec(5 , 8 , 9 , 12 , 13 , 16 , 18 , 23 , 27 , 28 , 30 , 31 , 33 , 34 , 43, 45, 48, 161); // makeSeq; // Take out memory alloc before the loop to avoid GC costs, before vtune profiling // Now broken up into arrays of same shape as vec.chunks. Really cannot have one array of 1e9 items in Java. // nanos = System.nanoTime(); long heapsize=Runtime.getRuntime().totalMemory(); System.out.println("heapsize is::"+heapsize); //long o[] = new long[(int)vec.length()]; int o[][] = new int[vec.nChunks()][]; // [(int)vec.length()]; for (int c=0; c<o.length; c++) o[c] = new int[vec.chunkForChunkIdx(c)._len]; for (int timeRep=0; timeRep<3; timeRep++) { // TO DO: caliper java project // TO DO: search for utils.Timer, prettyPrint long nanos = System.nanoTime(); long ans2[][] = new MyCountRange((long) vec.max(), (long) vec.min(), vec.nChunks()).doAll(vec)._counts; long nanos1 = System.nanoTime() - nanos; System.out.println("Counts per chunk (first 5 chunks) ..."); for (int c = 0; c < 5; c++) System.out.println(Arrays.toString(ans2[c])); /* nanos = System.nanoTime(); // cumulate across chunks int nBuckets = (int)((long) vec.max() - (long) vec.min() + 1); long rollSum = 0; for (int b = 0; b < nBuckets; b++) { for (int c = 0; c < vec.nChunks(); c++) { long tmp = ans2[c][b]; ans2[c][b] = rollSum; rollSum += tmp; } } long nanos2 = System.nanoTime() - nanos; //System.out.println("\nCounts after cumulate ..."); //for (int c = 0; c < vec.nChunks(); c++) System.out.println(Arrays.toString(ans2[c])); nanos = System.nanoTime(); new WriteOrder(ans2, o, (long) vec.min(), (long) vec.max()).doAll(vec); long nanos3 = System.nanoTime() - nanos; //System.out.println("\nCounts after WriteOrder ..."); //for (int c = 0; c < vec.nChunks(); c++) System.out.println(Arrays.toString(ans2[c])); System.out.println("\nFirst 10 of order ..."); //for (int i=0; i<10; i++) System.out.print(o[i] + " "); for (int i=0; i<10; i++) System.out.print(o[0][i] + " "); System.out.println("\nLast 10 of order ..."); //for (int i=9; i>=0; i--) System.out.print(o[(int)(vec.length()-i-1)] + " "); System.out.print("\n"); int c = vec.nChunks()-1; long cstart = vec._espc[c]; for (int i=9; i>=0; i--) System.out.print(o[c][(int)(vec.length()-i-1-cstart)] + " "); System.out.print("\n"); System.out.println("\nFirst 40 of vec ..."); for (int i=0; i<40; i++) System.out.print((int)vec.at(i) + " "); System.out.println("\nLast 40 of vec ..."); for (int i=39; i>=0; i--) System.out.print((int)vec.at((int)vec.length()-i-1) + " "); System.out.print("\n"); */ System.out.println("\nInitial count: " + nanos1 / 1e9); //System.out.println("Cumulate across chunks: " + nanos2 / 1e9); //System.out.println("Write to order[]: " + nanos3 / 1e9); //System.out.println("Total time: " + (nanos1+nanos2+nanos3) / 1e9); System.out.println(""); } // Next: input int, then large groups, small groups vec.remove(); frame.delete(); } @Test public void runBench2() { Frame f1=null, f2=null, fx=null; try { // build a hi count cardinality frame final long card = (long)1e4; f1 = buildFrame(card,-1); System.out.println(f1.toString(0,100)); Vec seq = Vec.makeSeq(card,false); f2 = new Frame(seq,seq); for( int i=0; i<10; i++ ) { long t0 = System.currentTimeMillis(); fx = Merge.merge(f1, f2, new int[]{0}, new int[]{0}, false, new int[1][]); long t1 = System.currentTimeMillis(); System.out.println("MERGE Took " + (t1 - t0) + " msec for " + f1.numRows()); //System.out.println(fx.toString(0,100)); fx.delete(); } } finally { if( f1 != null ) f1.delete(); if( f2 != null ) f2.delete(); if( fx != null ) fx.delete(); } } // Build 2 column frame, with the given Chunks. Col #0 is high-count // categorical; we will make about 1/10 this many rows drawing at random from // the "card" range with replacement. Col#1 is a row number. private static Frame buildFrame( final long card, int nChunks ) { final int scale0 = 10; final long len = card/scale0; if( nChunks == -1 ) { int rowsPerChunk = 100000; nChunks = (int)((len+rowsPerChunk-1)/rowsPerChunk); } Vec.VectorGroup g = new Vec.VectorGroup(); AppendableVec col0 = new AppendableVec(g.addVec(), Vec.T_NUM); AppendableVec col1 = new AppendableVec(g.addVec(), Vec.T_NUM); NewChunk ncs0[] = new NewChunk[nChunks]; NewChunk ncs1[] = new NewChunk[nChunks]; for( int i=0; i<nChunks; i++ ) { ncs0[i] = new NewChunk(col0,i); ncs1[i] = new NewChunk(col1,i); } RandomUtils.PCGRNG R = new RandomUtils.PCGRNG(card,0); for( long i=0; i<len; i++ ) ncs0[R.nextInt(nChunks)].addNum( R.nextInt((int)card), 0 ); // Compute data layout int espc[] = new int[nChunks+1]; for( int i=0; i<nChunks; i++ ) espc[i+1] = espc[i] + ncs0[i].len(); // Compute row numbers into col 2 for( int i=0; i<nChunks; i++ ) for( int j=0; j<ncs0[i].len(); j++ ) ncs1[i].addNum(espc[i]+j,0); Futures fs = new Futures(); for( int i=0; i<nChunks; i++ ) { ncs0[i].close(i,fs); ncs1[i].close(i,fs); } Vec vec0 = col0.layout_and_close(fs); Vec vec1 = col1.layout_and_close(fs); fs.blockForPending(); Frame fr = new Frame(Key.<Frame>make("hex"), null, new Vec[]{vec0,vec1}); DKV.put(fr); return fr; } }