package water.parser;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.TestUtil;
import water.fvec.FileVec;
import water.util.Log;
import water.util.PrettyPrint;

import java.io.FileWriter;
import java.io.IOException;
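
/**
 * Exhaustive sweep over cloud sizes, core counts and data shapes that exercises
 * FileVec.calcOptimalChunkSize for both the old and the new chunk-size heuristic.
 * A configuration fails if the chosen chunk size is too small or too large, or if
 * it yields too few chunks to keep all cores busy or too many Chunk POJOs per node.
 */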
public class ChunksizeTest extends TestUtil {
  @BeforeClass
  public static void setup() {
    stall_till_cloudsize(1);
  }

  @Test
  public void run() throws IOException {
    FileWriter fw = new FileWriter("/tmp/chunksize.csv");
    String header = "\t" + String.format("%10s", "cloudSize")
        + "\t" + String.format("%8s", "cores")
        + "\t" + String.format("%8s", "numCols")
        + "\t" + String.format("%8s", "numRows")
        + "\t" + String.format("%16s", "maxLineLength")
        + "\t" + String.format("%13s", "totalSize")
        + "\t" + String.format("%13s", "chunkSize")
        + "\t" + String.format("%15s", "parseChunkCount")
        + "\t" + String.format("%15s", "totalChunks")
        + "\n";
    fw.write(header); // column header for /tmp/chunksize.csv
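    // Failure counters, indexed by heuristic: [0] = new heuristic, [1] = old heuristic.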
    int[] toosmall = new int[2];
    int[] toolarge = new int[2];
    int[] toofew = new int[2];
    int[] toomany = new int[2];
    int[] counter = new int[2];
    int[] failed = new int[2];
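    // Sweep the cross product of cluster size, cores per node, column count,
    // line length and total data size for both heuristics.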
    for (int oldheuristic : new int[]{0, 1}) {
      for (int cloudSize : new int[]{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}) {
        for (int cores : new int[]{2, 4, 8, 16, 32, 64, 128}) { // per node
          for (int numCols : new int[]{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768}) {
            for (long maxLineLength : new long[]{10, 100, 1000, 10000, 1000000}) {
              for (double totalSize : new double[]{1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14}) {
                long numRows = (long) (totalSize / maxLineLength); // long: up to 1e13 rows, too big for int
                // exclude impossible configurations
                if (maxLineLength > totalSize) continue; // need at least 1 row
                if ((double) maxLineLength / numCols < 3) continue; // need at least 3 bytes per column
                if ((double) maxLineLength / numCols > 100) continue; // can't have more than 100 bytes per column
                // Pretend to be in ParseSetup
                int chunkSize = FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic == 1, true);
                int parseChunkCount = (int) Math.max(1, totalSize / chunkSize);
                int parseChunkCountPerNode = parseChunkCount / cloudSize;
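                // Each column is parsed into its own Vec, so the cluster-wide Chunk
                // POJO count is chunks-per-column times the number of columns.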
                long totalChunks = (long) parseChunkCount * numCols;
                String log = "\t" + String.format("%10s", cloudSize)
                    + "\t" + String.format("%8s", cores)
                    + "\t" + String.format("%8s", numCols)
                    + "\t" + String.format("%8s", numRows)
                    + "\t" + String.format("%16s", maxLineLength)
                    + "\t" + String.format("%13s", totalSize)
                    + "\t" + String.format("%13s", chunkSize)
                    + "\t" + String.format("%15s", parseChunkCount)
                    + "\t" + String.format("%15s", totalChunks);
                fw.write(log + "\n"); // one CSV row per configuration
                boolean fail = false;
                String msg = "\n" + header + log + " <- TOO ";
                // don't cut small data into too many chunks (fewer than 10 rows per chunk)
                if (chunkSize < 10 * maxLineLength) {
                  msg += "SMALL ";
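                  // Result intentionally unused; re-invoking the heuristic here (and in the
                  // other failure branches below) gives a breakpoint target from which a
                  // debugger can step into calcOptimalChunkSize with the failing inputs.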
                  FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic == 1, true);
                  toosmall[oldheuristic]++;
                  fail = true;
                }
                if (chunkSize >= (1 << 28)) { // 256MB
                  msg += "LARGE ";
                  FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic == 1, true);
                  toolarge[oldheuristic]++;
                  fail = true;
                }
                // want at least one chunk per core
                if (parseChunkCountPerNode < cores && oldheuristic == 0) {
                  // only complain if there are at least 100k matrix entries per node; otherwise
                  // it's small data and fast enough anyway, even with fewer chunks
                  if (numRows * numCols > 100000L * cloudSize
                      && totalSize / cloudSize / numCols / (4 * cores) > 1000 // and only if there's enough data to cut into Chunk POJOs of at least 1kB each
                  ) {
                    msg += "FEW ";
                    FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic == 1, true);
                    toofew[oldheuristic]++;
                    fail = true;
                    Assert.assertTrue(numCols > 1e4); // only happens for very wide data
                    Assert.assertTrue(parseChunkCountPerNode > cores / 2); // at least keep half the cores busy
                  }
                }
                if ((long) parseChunkCountPerNode * numCols > (1 << 24)) { // no more than 16M Chunk POJOs per node
                  msg += "MANY ";
                  FileVec.calcOptimalChunkSize((long) totalSize, numCols, maxLineLength, cores, cloudSize, oldheuristic == 1, true);
                  toomany[oldheuristic]++;
                  fail = true;
                  Assert.assertTrue(totalSize / cloudSize / cores > 1e9); // only for big data, with more than 1GB per core
                }
                if (fail) {
                  Log.info(msg + (oldheuristic == 0 ? "(New Heuristic)" : "(Old Heuristic)"));
                  failed[oldheuristic]++;
                }
                counter[oldheuristic]++;
              }
            }
          }
        }
      }
    }
    fw.close();
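    // Summarize per-heuristic failure rates; only the new heuristic (index 0)
    // has to satisfy the hard assertions below.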
    for (int i : new int[]{0, 1}) {
      Log.info((i == 1 ? "Old" : "New") + " heuristic:");
      Log.info("Total: " + counter[i]);
      Log.info("Failure rate: " + PrettyPrint.formatPct((double) failed[i] / counter[i]));
      Log.info("Too small: " + PrettyPrint.formatPct((double) toosmall[i] / counter[i]));
      Log.info("Too large: " + PrettyPrint.formatPct((double) toolarge[i] / counter[i]));
      Log.info("Too few: " + PrettyPrint.formatPct((double) toofew[i] / counter[i]));
      Log.info("Too many: " + PrettyPrint.formatPct((double) toomany[i] / counter[i]));
      if (i == 0) {
        Assert.assertTrue("Too small means that files cannot be parsed", toosmall[i] == 0);
        Assert.assertTrue("Too large means that chunks cannot fit in the DKV", toolarge[i] == 0);
        Assert.assertTrue("Too few means that cores aren't utilized", toofew[i] < 1e-3 * counter[i]); // extremely rare, only for wide data
        Assert.assertTrue("Too many means that each node has to store more than 16M Chunk POJOs in its KV store", toomany[i] < 3e-2 * counter[i]); // very rare, only for huge data
      }
    }
  }
}