// GroupingBench.java example
//
// Explorer
// h2o-3-master
package water.rapids;

import java.util.Arrays;
import java.util.Random;
import hex.CreateFrame;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import water.*;
import water.fvec.*;
import water.util.ArrayUtils;
import water.util.RandomUtils;


/**
 * MRTask that overwrites every row of each chunk it visits with a
 * pseudo-random integer drawn from {@code [0, K)}.
 */
class MySample extends MRTask<MySample> {
    private final int K;

    /**
     * @param K exclusive upper bound handed to {@code Random.nextInt}
     */
    MySample(int K) {
        this.K = K;
    }

    /**
     * Writes one {@code nextInt(K)} draw into each row of the chunk. The
     * generator is requested from {@code RandomUtils.getRNG} with the chunk
     * index as its argument.
     */
    @Override
    public void map(Chunk chk) {
        final Random rng = RandomUtils.getRNG(chk.cidx());
        int row = 0;
        while (row < chk.len()) {
            chk.set(row, rng.nextInt(K));
            row++;
        }
    }
}

/**
 * MRTask that builds one histogram per chunk: after the task has run,
 * {@code _counts[c][v - _min]} is the number of rows of chunk {@code c} whose
 * value (read with {@code at8}) equals {@code v}.
 *
 * Every value must lie in {@code [_min, _max]}; anything else indexes outside
 * the per-chunk array and throws.
 */
class MyCountRange extends MRTask<MyCountRange> {
    private final long _max, _min;
    // Outer index is the chunk index; an inner array stays null until map() has seen that chunk.
    long _counts[][];
    private int _nChunks;

    /**
     * @param max     largest value expected in the data
     * @param min     smallest value expected in the data
     * @param nChunks number of chunks of the vec the task will run over
     */
    MyCountRange(long max, long min, int nChunks) {
        System.out.println("Constructor for MyCountRange");
        _max = max; _min = min; _counts = new long[nChunks][]; _nChunks = nChunks;
    }

    // Original author's note: no setupLocal() is needed because the memory gets copied across;
    // the constructor does not need to run on other nodes.

    /**
     * Counts the values of one chunk into a freshly allocated array that is
     * stored in this chunk's slot of {@code _counts}.
     */
    @Override public void map( Chunk chk ) {
        // Open question from the original author: would a non-shared local array,
        // assigned into _counts only after the loop, be faster?
        long tmp[] = _counts[chk.cidx()] = new long[(int)(_max-_min+1)];
        int rows = chk._len;
        for (int r=0; r<rows; r++) tmp[(int)(chk.at8(r)-_min)]++;
    }

    /**
     * Adopts the per-chunk arrays held by {@code g} that this task does not
     * hold yet.
     *
     * When both tasks reference the very same outer array there is nothing to
     * merge (presumably the case for tasks reduced on the same node - TODO confirm).
     * Otherwise each non-null slot of {@code g} is taken over. A slot may be
     * null on both sides: with more than two partial results, the chunk can
     * still be held by a third one that has not been reduced in yet, so that
     * situation is deliberately not asserted against.
     */
    @Override public void reduce(MyCountRange g) {
        if (g._counts == _counts) return;
        System.out.println("This should just print once since 2 nodes");
        for (int c=0; c<_nChunks; c++) {
            if (g._counts[c] == null) continue;  // g never mapped chunk c
            // Each chunk is expected to be mapped by exactly one side, never both.
            assert _counts[c] == null;
            _counts[c] = g._counts[c];
        }
    }
}


/**
 * MRTask that counts how often each value of {@code [_min, _max]} occurs,
 * using one flat array per task; {@code reduce} combines the arrays of two
 * tasks through {@code ArrayUtils.add}.
 */
class MyCountRangeNoSpline extends MRTask<MyCountRangeNoSpline> {
    private final long _max, _min;
    private long _counts[];

    /**
     * Stores the value range. {@code nChunks} is accepted but not used.
     */
    MyCountRangeNoSpline(long max, long min, int nChunks) {
        System.out.println("Constructor for MyCountRange");
        _min = min;
        _max = max;
    }

    /**
     * Allocates a fresh count array and tallies every row of the chunk into
     * the bucket {@code value - _min}.
     */
    @Override
    public void map(Chunk chk) {
        final int nBuckets = (int) (_max - _min + 1);
        _counts = new long[nBuckets];
        final int nRows = chk._len;
        int row = 0;
        while (row < nRows) {
            _counts[(int) (chk.at8(row) - _min)]++;
            row++;
        }
    }

    /** Combines the counts of {@code g} with this task's counts. */
    @Override
    public void reduce(MyCountRangeNoSpline g) {
        ArrayUtils.add(_counts, g._counts);
    }
}

/*
 * Earlier variant in which the order is written into one single array.
 * Kept for reference only: allocating a single Java array of 1e9 items failed.
 */
/*
class WriteOrder extends MRTask<WriteOrder> {
    final long _counts[][];
    final long _order[];
    final long _min;
    final long _max;
    WriteOrder(long[][] counts, long[] order, long min, long max) { _counts = counts; _order = order; _min = min; _max=max; }
    @Override public void map( Chunk chk ) {
        long myCounts[] = _counts[chk._cidx];
        for (int r=0; r<chk._len; r++) _order[ (int)(myCounts[(int)(chk.at8(r)-_min)]++) ] = r+chk._start;
    }
}
*/
/**
 * MRTask that walks each chunk and, for every row, claims the next target
 * position of that row's group from {@code _counts}, tracking which chunk of
 * the vec that target position falls into.
 *
 * NOTE: the statement that actually stores the row number into {@code _order}
 * is currently disabled (see the comment inside the loop), so the only
 * lasting effect of {@code map} is that the entries of
 * {@code _counts[chunkIndex]} are advanced by the number of rows seen per group.
 */
class WriteOrder extends MRTask<WriteOrder> {
    // Per chunk, per group: next target position. Presumably the cumulated counts
    // produced from MyCountRange - TODO confirm against the caller.
    final long _counts[][];
    // Destination for the row order, one int[] per chunk.
    final int _order[][];
    final long _min;
    final long _max;

    /**
     * @param counts per-chunk, per-group next target positions (mutated by {@code map})
     * @param order  per-chunk destination arrays for the ordering
     * @param min    smallest value in the data; group index is {@code value - min}
     * @param max    largest value in the data
     */
    WriteOrder(long[][] counts, int[][] order, long min, long max) { _counts = counts; _order = order; _min = min; _max = max; }

    @Override public void map( Chunk chk ) {
        Vec vec = chk.vec();
        int range = (int)(_max-_min+1);
        long[] espc = vec.espc();

        long myCounts[] = _counts[chk.cidx()];

        // Idea from the original author, not implemented: copy myCounts into a task-local
        // array so it can stay in cache, since it does not need to be shared.

        // Resolve the starting target chunk of every group once up front. Per the original
        // author's note, elem2ChunkIdx is a binary search because chunks are not equal size,
        // so it is kept out of the per-row loop.
        int myTargetChunks[] = new int[range];
        for (int i=0; i<range; i++) myTargetChunks[i] = vec.elem2ChunkIdx(myCounts[i]);

        for (int r=0; r<chk._len; r++) {
            int group = (int)(chk.at8(r)-_min);
            long target = myCounts[group]++;
            int targetChunk = myTargetChunks[group];
            // Crossed a chunk boundary: advance this group's target chunk by one.
            // NOTE(review): a single step presumes the next chunk is not empty - confirm.
            if ( target == espc[targetChunk+1] ) { myTargetChunks[group]++; targetChunk++; }
            // Disabled write of the ordering:
            //_order[targetChunk][(int)(target - espc[targetChunk])] = r+(int)chk._start;
        }
    }
}

public class GroupingBench extends TestUtil {
    /** Runs once before the tests of this class and calls {@code stall_till_cloudsize(1)}. */
    @BeforeClass
    public static void setup() {
        stall_till_cloudsize(1);
    }

    /**
     * Manual benchmark (ignored by default): creates a vec of 1e9 rows, fills it
     * with {@code MySample(10)} values and times {@code MyCountRange} over it
     * three times, printing the per-chunk counts of the first chunks and the
     * elapsed seconds. A small {@code CreateFrame} frame is also built and printed.
     *
     * The cumulate / {@code WriteOrder} stages are still commented out below.
     * The vec and the frame are removed in a {@code finally} block so that a
     * failure part-way through does not leave them behind.
     */
    @Ignore @Test public void runGroupingBench() {
        // NOTE(review): the original header comment here referred to runit_quantile_1_golden.R
        // (probs=seq(0,1,by=0.01)); it looks like a copy-paste leftover - confirm.
        //Vec vec = Vec.makeCon(1.1, 1000000000);
        //Vec vec = Vec.makeRepSeq(10,10);
        Vec vec = Vec.makeZero((long)1e9);
        Frame frame = null;
        try {
            //System.out.println("Chunks: " + vec.nChunks());
            //System.out.println("Vec length: " + vec.length());
            //System.out.println("Populating vector... ");

            //new MySeq((int)100).doAll(vec);
            new MySample(10).doAll(vec);
            vec.max(); // to cache rollups,  so timing below excludes it

            System.out.println("\nFirst 30 of vec ...");
            System.out.println("There are "+vec.nChunks()+" chunks");
            for (int i=0; i<vec.nChunks(); i++) {
                System.out.println("Chunk"+i+"is on"+vec.chunkKey(i).home_node());
            }

            CreateFrame cf = new CreateFrame();
            cf.rows = 100;
            cf.cols = 10;
            cf.categorical_fraction = 0.1;
            cf.integer_fraction = 1 - cf.categorical_fraction;
            cf.binary_fraction = 0;
            cf.factors = 4;
            cf.response_factors = 2;
            cf.positive_response = false;
            cf.has_response = true;
            cf.seed = 1234;
            frame = cf.execImpl().get();
            System.out.print( frame.toString(0,14) );

            for (int i=0; i<30; i++) System.out.print((int)vec.at(i) + " ");
            System.out.println("\n");
            // Vec vec = vec(5 , 8 ,  9 , 12 , 13 , 16 , 18 , 23 , 27 , 28 , 30 , 31 , 33 , 34 , 43,  45,  48, 161);
            // makeSeq;

            // Take out memory alloc before the loop to avoid GC costs, before vtune profiling
            // Now broken up into arrays of same shape as vec.chunks. Really cannot have one array of 1e9 items in Java.
            long heapsize=Runtime.getRuntime().totalMemory();
            System.out.println("heapsize is::"+heapsize);

            //long o[] = new long[(int)vec.length()];

            // Destination for the (currently disabled) WriteOrder stage: one int[] per chunk.
            int o[][] = new int[vec.nChunks()][];    // [(int)vec.length()];
            for (int c=0; c<o.length; c++)
                o[c] = new int[vec.chunkForChunkIdx(c)._len];

            for (int timeRep=0; timeRep<3; timeRep++) {   // TO DO: caliper java project

                // TO DO:  search for utils.Timer,  prettyPrint

                long nanos = System.nanoTime();
                long ans2[][] = new MyCountRange((long) vec.max(), (long) vec.min(), vec.nChunks()).doAll(vec)._counts;
                long nanos1 = System.nanoTime() - nanos;
                System.out.println("Counts per chunk (first 5 chunks) ...");
                // Bounded by the real chunk count so a vec with fewer than 5 chunks does not fail here.
                int shown = Math.min(5, ans2.length);
                for (int c = 0; c < shown; c++) System.out.println(Arrays.toString(ans2[c]));

                /*
                nanos = System.nanoTime();
                // cumulate across chunks
                int nBuckets = (int)((long) vec.max() - (long) vec.min() + 1);
                long rollSum = 0;
                for (int b = 0; b < nBuckets; b++) {
                    for (int c = 0; c < vec.nChunks(); c++) {
                        long tmp = ans2[c][b];
                        ans2[c][b] = rollSum;
                        rollSum += tmp;
                    }
                }
                long nanos2 = System.nanoTime() - nanos;
                //System.out.println("\nCounts after cumulate ...");
                //for (int c = 0; c < vec.nChunks(); c++) System.out.println(Arrays.toString(ans2[c]));

                nanos = System.nanoTime();
                new WriteOrder(ans2, o, (long) vec.min(), (long) vec.max()).doAll(vec);
                long nanos3 = System.nanoTime() - nanos;

                //System.out.println("\nCounts after WriteOrder ...");
                //for (int c = 0; c < vec.nChunks(); c++) System.out.println(Arrays.toString(ans2[c]));

                System.out.println("\nFirst 10 of order ...");
                //for (int i=0; i<10; i++) System.out.print(o[i] + " ");
                for (int i=0; i<10; i++) System.out.print(o[0][i] + " ");

                System.out.println("\nLast 10 of order ...");
                //for (int i=9; i>=0; i--) System.out.print(o[(int)(vec.length()-i-1)] + " "); System.out.print("\n");
                int c = vec.nChunks()-1;
                long cstart = vec._espc[c];
                for (int i=9; i>=0; i--) System.out.print(o[c][(int)(vec.length()-i-1-cstart)] + " "); System.out.print("\n");

                System.out.println("\nFirst 40 of vec ...");
                for (int i=0; i<40; i++) System.out.print((int)vec.at(i) + " ");
                System.out.println("\nLast 40 of vec ...");
                for (int i=39; i>=0; i--) System.out.print((int)vec.at((int)vec.length()-i-1) + " ");  System.out.print("\n");
                */
                System.out.println("\nInitial count: " + nanos1 / 1e9);
                //System.out.println("Cumulate across chunks: " + nanos2 / 1e9);
                //System.out.println("Write to order[]: " + nanos3 / 1e9);
                //System.out.println("Total time: " + (nanos1+nanos2+nanos3) / 1e9);
                System.out.println("");

            }
            // Next: input int, then large groups, small groups
        } finally {
            // Always release what was created, even when the benchmark body throws.
            vec.remove();
            if (frame != null) frame.delete();
        }
    }

  /**
   * Times ten repetitions of {@code Merge.merge} joining the frame produced by
   * {@code buildFrame(1e4, -1)} against a two-column frame built from a
   * {@code Vec.makeSeq} vec, on column 0 of both, and prints the elapsed
   * milliseconds of each repetition.
   *
   * All frames are deleted in the {@code finally} block. Each merge result is
   * deleted right after it is timed and its reference is cleared, so it is
   * never deleted a second time.
   */
  @Test public void runBench2() {
    Frame f1=null, f2=null, fx=null;
    try {
      // build a hi count cardinality frame
      final long card = (long)1e4;
      f1 = buildFrame(card,-1);
      System.out.println(f1.toString(0,100));
      Vec seq = Vec.makeSeq(card,false);
      f2 = new Frame(seq,seq);
      for( int i=0; i<10; i++ ) {
        long t0 = System.currentTimeMillis();
        fx = Merge.merge(f1, f2, new int[]{0}, new int[]{0}, false, new int[1][]);
        long t1 = System.currentTimeMillis();
        System.out.println("MERGE Took " + (t1 - t0) + " msec for " + f1.numRows());
        //System.out.println(fx.toString(0,100));
        fx.delete();
        fx = null;  // already deleted: stops the finally block (or a later failing merge) deleting it again
      }
    } finally {
      if( f1 != null ) f1.delete();
      if( f2 != null ) f2.delete();
      if( fx != null ) fx.delete();  // only non-null if deleting the result itself failed
    }
  }

  /**
   * Builds a two-column numeric frame and stores it in the DKV under the key "hex".
   *
   * Column 0 receives {@code card/10} values, each drawn with
   * {@code nextInt((int)card)} and appended to a randomly chosen chunk
   * (so values are drawn from the "card" range with replacement).
   * Column 1 holds, for every row, its global row number.
   *
   * @param card    size of the value range; one tenth of it is the row count
   * @param nChunks number of chunks to build, or -1 to derive it from a
   *                target of 100000 rows per chunk
   * @return the frame, already put into the DKV
   */
  private static Frame buildFrame( final long card, int nChunks ) {
    final int scale0 = 10;
    final long len = card/scale0;
    if( nChunks == -1 ) {
      final int rowsPerChunk = 100000;
      nChunks = (int)((len+rowsPerChunk-1)/rowsPerChunk);
    }

    // Both columns live in the same vector group.
    final Vec.VectorGroup group = new Vec.VectorGroup();
    final AppendableVec valueCol = new AppendableVec(group.addVec(), Vec.T_NUM);
    final AppendableVec rowCol = new AppendableVec(group.addVec(), Vec.T_NUM);
    final NewChunk[] valueChunks = new NewChunk[nChunks];
    final NewChunk[] rowChunks = new NewChunk[nChunks];
    for( int c=0; c<nChunks; c++ ) {
      valueChunks[c] = new NewChunk(valueCol,c);
      rowChunks[c] = new NewChunk(rowCol,c);
    }

    // Scatter the random values: first draw picks the chunk, second draw is the value.
    final RandomUtils.PCGRNG rng = new RandomUtils.PCGRNG(card,0);
    long produced = 0;
    while( produced < len ) {
      final int chunkIdx = rng.nextInt(nChunks);
      final int value = rng.nextInt((int)card);
      valueChunks[chunkIdx].addNum( value, 0 );
      produced++;
    }

    // Data layout: starts[c] is the global row number of the first row of chunk c.
    // Row numbers for chunk c are written as soon as its start is known.
    final int[] starts = new int[nChunks+1];
    for( int c=0; c<nChunks; c++ ) {
      final int rowsInChunk = valueChunks[c].len();
      starts[c+1] = starts[c] + rowsInChunk;
      for( int j=0; j<rowsInChunk; j++ )
        rowChunks[c].addNum(starts[c]+j,0);
    }

    final Futures pending = new Futures();
    for( int c=0; c<nChunks; c++ ) {
      valueChunks[c].close(c,pending);
      rowChunks[c].close(c,pending);
    }

    final Vec valueVec = valueCol.layout_and_close(pending);
    final Vec rowVec = rowCol.layout_and_close(pending);
    pending.blockForPending();

    final Frame result = new Frame(Key.<Frame>make("hex"), null, new Vec[]{valueVec,rowVec});
    DKV.put(result);
    return result;
  }
}