package water.parser;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.TestUtil;
import water.fvec.FileVec;
import water.util.Log;
import water.util.PrettyPrint;

import java.io.FileWriter;
import java.io.IOException;

public class ChunksizeTest extends TestUtil {
  @BeforeClass static public void setup() { stall_till_cloudsize(1); }

  @Test public void run() throws IOException {
    FileWriter fw = new FileWriter("/tmp/chunksize.csv");
    String header = "\t" + String.format("%10s", "cloudSize")
            + "\t" + String.format("%8s", "cores")
            + "\t" + String.format("%8s", "numCols")
            + "\t" + String.format("%8s", "numRows")
            + "\t" + String.format("%16s", "maxLineLength")
            + "\t" + String.format("%13s", "totalSize")
            + "\t" + String.format("%13s", "chunkSize")
            + "\t" + String.format("%15s", "parseChunkCount")
            + "\t" + String.format("%15s", "totalChunks") + "\n";
    fw.write(header); // write the column header so the sweep log is actually populated

    int[] toosmall = new int[2];
    int[] toolarge = new int[2];
    int[] toofew   = new int[2];
    int[] toomany  = new int[2];
    int[] counter  = new int[2];
    int[] failed   = new int[2];

    for (int oldheuristic : new int[]{0, 1}) {
      for (int cloudSize : new int[]{1,2,4,8,16,32,64,128,256,512,1024,2048,4096}) {
        for (int cores : new int[]{2,4,8,16,32,64,128}) { // per node
          for (int numCols : new int[]{1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768}) {
            for (long maxLineLength : new long[]{10,100,1000,10000,1000000}) {
              for (double totalSize : new double[]{1e4,1e5,1e6,1e7,1e8,1e9,1e10,1e11,1e12,1e13,1e14}) {
                int numRows = (int)(totalSize/maxLineLength);

                // Exclude impossible combinations
                if (maxLineLength > totalSize) continue;             // need at least 1 row
                if ((double)maxLineLength / numCols < 3) continue;   // need at least 3 bytes per column
                if ((double)maxLineLength / numCols > 100) continue; // can't have more than 100 bytes per column

                // Pretend to be in ParseSetup
                int chunkSize = FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic==1, true);

                int parseChunkCount = (int) Math.max(1, totalSize/chunkSize);
                int parseChunkCountPerNode = parseChunkCount/cloudSize;
                long totalChunks = (long)parseChunkCount*numCols;

                String log = "\t" + String.format("%10s", cloudSize)
                        + "\t" + String.format("%8s", cores)
                        + "\t" + String.format("%8s", numCols)
                        + "\t" + String.format("%8s", numRows)
                        + "\t" + String.format("%16s", maxLineLength)
                        + "\t" + String.format("%13s", totalSize)
                        + "\t" + String.format("%13s", chunkSize)
                        + "\t" + String.format("%15s", parseChunkCount)
                        + "\t" + String.format("%15s", totalChunks);
                fw.write(log + "\n"); // record every evaluated configuration in the sweep log

                boolean fail = false;
                String msg = "\n" + header + log + " <- TOO ";

                // Don't cut small data into chunks holding fewer than ~10 rows each
                if (chunkSize < 10*maxLineLength) {
                  msg += "SMALL ";
                  // repeat the call so a debugger breakpoint can step into the failing case
                  FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic==1, true);
                  toosmall[oldheuristic]++;
                  fail = true;
                }

                if (chunkSize >= (1<<28)) { // 256MB
                  msg += "LARGE ";
                  FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic==1, true);
                  toolarge[oldheuristic]++;
                  fail = true;
                }

                // Want at least one chunk per core
                if (parseChunkCountPerNode < cores && oldheuristic==0) {
                  // Only complain if there are at least 100k matrix entries per node and enough
                  // data to cut into Chunk POJOs of 1kB each -- otherwise it's small data and
                  // fast enough anyway, even with fewer chunks.
                  if (numRows * numCols > 100000 * cloudSize
                          && totalSize/cloudSize/numCols/(4*cores) > 1000) {
                    msg += "FEW ";
                    FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic==1, true);
                    toofew[oldheuristic]++;
                    fail = true;
                    Assert.assertTrue(numCols > 1e4);                    // only for very wide data
                    Assert.assertTrue(parseChunkCountPerNode > cores/2); // at least keep half the cores busy
                  }
                }

                if (parseChunkCountPerNode*numCols > (1<<24)) { // no more than 16M chunk POJOs per node
                  msg += "MANY ";
                  FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic==1, true);
                  toomany[oldheuristic]++;
                  fail = true;
                  Assert.assertTrue(totalSize/cloudSize/cores > 1e9); // only for big data, where we have more than 1GB per core
                }

                if (fail) {
                  Log.info(msg + (oldheuristic==0 ? "(New Heuristic)" : "(Old Heuristic)"));
                  failed[oldheuristic]++;
                }
                counter[oldheuristic]++;
              }
            }
          }
        }
      }
    }
    fw.close();

    for (int i : new int[]{0,1}) {
      Log.info((i==1 ? "Old" : "New") + " heuristic:");
      Log.info("Total: " + counter[i]);
      Log.info("Failure rate: " + PrettyPrint.formatPct((double) failed[i] / counter[i]));
      Log.info("Too small: " + PrettyPrint.formatPct((double) toosmall[i] / counter[i]));
      Log.info("Too large: " + PrettyPrint.formatPct((double) toolarge[i] / counter[i]));
      Log.info("Too few: " + PrettyPrint.formatPct((double) toofew[i] / counter[i]));
      Log.info("Too many: " + PrettyPrint.formatPct((double) toomany[i] / counter[i]));
      if (i==0) { // only enforce the bounds for the new heuristic
        Assert.assertTrue("Too small means that files cannot be parsed", toosmall[i] == 0);
        Assert.assertTrue("Too large means that chunks cannot fit in the DKV", toolarge[i] == 0);
        Assert.assertTrue("Too few means that cores aren't utilized", toofew[i] < 1e-3*counter[i]);   // extremely rare, only for wide data
        Assert.assertTrue("Too many means that each node has to store more than 16M chunks in its KV store", toomany[i] < 3e-2*counter[i]); // very rare to have too many chunks (huge data)
      }
    }
  }
}
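// Illustrative sketch (not part of the test): how one might call the heuristic directly for a
// single configuration, mirroring the invocation used in the sweep above. The argument values
// here are arbitrary examples chosen for illustration; the resulting chunk size depends on the
// internals of FileVec.calcOptimalChunkSize, and the final boolean flag is simply passed as
// `true`, exactly as throughout this test.
//
//   long totalSize     = (long) 1e9; // 1 GB of raw data
//   int  numCols       = 100;
//   long maxLineLength = 1000;       // bytes per row, so ~1M rows
//   int  cores         = 8;          // per node
//   int  cloudSize     = 4;          // nodes in the cloud
//   int  chunkSize = FileVec.calcOptimalChunkSize(totalSize, numCols, maxLineLength,
//           cores, cloudSize, false /* false = new heuristic, matching oldheuristic==0 */, true);
//   int  parseChunkCount = (int) Math.max(1, totalSize / chunkSize);
//
// The sweep then checks that chunkSize lands between 10*maxLineLength and 1<<28 bytes, and that
// parseChunkCount keeps every core busy without exceeding 1<<24 chunk POJOs per node.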