package water.util;

import water.H2O;
import water.MRTask;
import water.fvec.CategoricalWrappedVec;
import water.fvec.Chunk;
import water.fvec.Vec;

/**
 * Simple summary of how many chunks of each type are in a Frame
 */
public class ChunkSummary extends MRTask<ChunkSummary> {

  ChunkSummary() {
    // Run at elevated priority: one step above the current fork/join worker
    // when invoked from one, otherwise just below H2O's high-priority band.
    super((byte)(Thread.currentThread() instanceof H2O.FJWThr ? currThrPriority()+1 : H2O.MIN_HI_PRIORITY - 2));
  }

  // Known chunk implementations: short name (the class name minus the
  // trailing "Chunk") and a human-readable description.
  public static final String[][] chunkTypes = new String[][]{
    {"C0L",  "Constant long"},
    {"C0D",  "Constant double"},
    {"CBS",  "Binary"},
    {"CXI",  "Sparse Integers"},
    {"CXF",  "Sparse Reals"},
    {"C1",   "1-Byte Integers"},
    {"C1N",  "1-Byte Integers (w/o NAs)"},
    {"C1S",  "1-Byte Fractions"},
    {"C2",   "2-Byte Integers"},
    {"C2S",  "2-Byte Fractions"},
    {"C4",   "4-Byte Integers"},
    {"C4S",  "4-Byte Fractions"},
    {"C4F",  "4-Byte Reals"},
    {"C8",   "8-Byte Integers"},
    {"C16",  "UUIDs"},
    {"CStr", "Strings"},
    {"CUD",  "Unique Reals"},
    {"C8D",  "64-bit Reals"},
  };

  // OUTPUT
  private long[] chunk_counts;
  private long total_chunk_count;
  private long[] chunk_byte_sizes;
  private long total_chunk_byte_size;

  // Per-node totals; the mean/min/max/stddev fields are computed across
  // nodes in postGlobal().
  private long[] byte_size_per_node;
  private double byte_size_per_node_mean;
  private double byte_size_per_node_min;
  private double byte_size_per_node_max;
  private double byte_size_per_node_stddev;

  private long total_row_count;
  private long[] row_count_per_node;
  private double row_count_per_node_mean;
  private double row_count_per_node_min;
  private double row_count_per_node_max;
  private double row_count_per_node_stddev;

  private long total_chunk_count_per_col;
  private long[] chunk_count_per_col_per_node;
  private double chunk_count_per_col_per_node_mean;
  private double chunk_count_per_col_per_node_min;
  private double chunk_count_per_col_per_node_max;
  private double chunk_count_per_col_per_node_stddev;

  @Override public void map(Chunk[] cs) {
    chunk_counts = new long[chunkTypes.length];
    chunk_byte_sizes = new long[chunkTypes.length];
    byte_size_per_node = new long[H2O.CLOUD.size()];
    row_count_per_node = new long[H2O.CLOUD.size()];
    chunk_count_per_col_per_node = new long[H2O.CLOUD.size()];
    for( Chunk c : cs ) {       // Can be a big loop, for high column counts
      // Pull out the class name; trim a trailing "Chunk"
      String cname = c.getClass().getSimpleName();
      int nlen = cname.length();
      assert nlen > 5 && cname.charAt(nlen-5)=='C' && cname.charAt(nlen-1)=='k';
      String sname = cname.substring(0,nlen-5);
      if (sname.equals("CategoricalWrapped")) {
        // Classify the wrapped chunk, not the categorical decorator
        Chunk ec = ((CategoricalWrappedVec.CategoricalWrappedChunk)c)._c;
        cname = ec.getClass().getSimpleName();
        nlen = cname.length();
        assert nlen > 5 && cname.charAt(nlen-5)=='C' && cname.charAt(nlen-1)=='k';
        sname = cname.substring(0,nlen-5);
      }
      // Table lookup, roughly sorted by frequency
      int j;
      for( j = 0; j < chunkTypes.length; ++j )
        if( sname.equals(chunkTypes[j][0]) )
          break;
      if( j==chunkTypes.length )
        throw H2O.fail("Unknown Chunk Type: " + sname);
      chunk_counts[j]++;
      chunk_byte_sizes[j] += c.byteSize();
      byte_size_per_node[H2O.SELF.index()] += c.byteSize();
    }
    // One map() call sees one chunk per column, so cs[0].len() is the local
    // row count and each call contributes one chunk per column to this node
    row_count_per_node[H2O.SELF.index()] += cs[0].len();
    total_row_count += cs[0].len();
    chunk_count_per_col_per_node[H2O.SELF.index()]++;
    total_chunk_count_per_col++;
  }
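  // Worked example of the classification in map() above (class names from
  // water.fvec; illustrative, not exhaustive):
  //   C1NChunk -> simple name "C1NChunk" -> key "C1N" -> "1-Byte Integers (w/o NAs)"
  //   C0DChunk -> simple name "C0DChunk" -> key "C0D" -> "Constant double"
  // A CategoricalWrappedChunk never appears in the table itself; the chunk it
  // wraps is classified in its place.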
  @Override public void reduce(ChunkSummary mrt) {
    ArrayUtils.add(chunk_counts, mrt.chunk_counts);
    ArrayUtils.add(chunk_byte_sizes, mrt.chunk_byte_sizes);
    ArrayUtils.add(byte_size_per_node, mrt.byte_size_per_node);
    ArrayUtils.add(row_count_per_node, mrt.row_count_per_node);
    ArrayUtils.add(chunk_count_per_col_per_node, mrt.chunk_count_per_col_per_node);
    total_row_count += mrt.total_row_count;
    total_chunk_count_per_col += mrt.total_chunk_count_per_col;
  }

  @Override protected void postGlobal() {
    if (chunk_counts == null || chunk_byte_sizes == null || byte_size_per_node == null) return; // map() never ran
    assert total_row_count == _fr.numRows() : "total_row_count["+total_row_count+"] != _fr.numRows()["+_fr.numRows()+"]";

    // compute counts and sizes
    total_chunk_byte_size = 0;
    total_chunk_count = 0;
    for (int j = 0; j < chunkTypes.length; ++j) {
      total_chunk_byte_size += chunk_byte_sizes[j];
      total_chunk_count += chunk_counts[j];
    }
    long check = 0;
    for (Vec v : _fr.vecs()) check += v.nChunks();
    assert total_chunk_count == check;
    // This doesn't always hold: FileVecs report a file-based byte size, while
    // other Vecs report a Chunk-based byte size.
    // assert(total_chunk_byte_size == _fr.byteSize());

    // Per-node statistics; min_max_mean_stddev returns {min, max, mean, stddev}
    double[] res = MathUtils.min_max_mean_stddev(byte_size_per_node);
    byte_size_per_node_min    = res[0];
    byte_size_per_node_max    = res[1];
    byte_size_per_node_mean   = res[2];
    byte_size_per_node_stddev = res[3];

    res = MathUtils.min_max_mean_stddev(row_count_per_node);
    row_count_per_node_min    = res[0];
    row_count_per_node_max    = res[1];
    row_count_per_node_mean   = res[2];
    row_count_per_node_stddev = res[3];

    res = MathUtils.min_max_mean_stddev(chunk_count_per_col_per_node);
    chunk_count_per_col_per_node_min    = res[0];
    chunk_count_per_col_per_node_max    = res[1];
    chunk_count_per_col_per_node_mean   = res[2];
    chunk_count_per_col_per_node_stddev = res[3];
  }

  // Right-aligned, human-readable byte count, 10 characters wide
  String display(long val) { return String.format("%10s", val == 0 ? " 0 B" : PrettyPrint.bytes(val)); }

  public TwoDimTable toTwoDimTableChunkTypes() {
    final String tableHeader = "Chunk compression summary";
    // Only chunk types actually present get a row
    int rows = 0;
    for (int j = 0; j < chunkTypes.length; ++j)
      if (chunk_counts != null && chunk_counts[j] > 0) rows++;
    final String[] rowHeaders = new String[rows];
    final String[] colHeaders = new String[]{"Chunk Type", "Chunk Name", "Count", "Count Percentage", "Size", "Size Percentage"};
    final String[] colTypes   = new String[]{"string", "string", "int", "float", "string", "float"};
    final String[] colFormats = new String[]{"%8s", "%s", "%10d", "%10.3f %%", "%10s", "%10.3f %%"};
    final String colHeaderForRowHeaders = null;
    TwoDimTable table = new TwoDimTable(tableHeader, null, rowHeaders, colHeaders, colTypes, colFormats, colHeaderForRowHeaders);
    int row = 0;
    for (int j = 0; j < chunkTypes.length; ++j) {
      if (chunk_counts != null && chunk_counts[j] > 0) {
        table.set(row, 0, chunkTypes[j][0]);
        table.set(row, 1, chunkTypes[j][1]);
        table.set(row, 2, chunk_counts[j]);
        table.set(row, 3, (float) chunk_counts[j] / total_chunk_count * 100.f);
        table.set(row, 4, display(chunk_byte_sizes[j]));
        table.set(row, 5, (float) chunk_byte_sizes[j] / total_chunk_byte_size * 100.f);
        row++;
      }
    }
    return table;
  }
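  // Example of the percentage math above (hypothetical frame): 40 C1N chunks
  // out of a 160-chunk total render as a Count Percentage of 25.000 %; the
  // Size Percentage column is the same ratio computed over compressed bytes.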
  public TwoDimTable toTwoDimTableDistribution() {
    final String tableHeader = "Frame distribution summary";
    int rows = H2O.CLOUD.size() + 5;  // one row per node + mean/min/max/stddev/total
    final String[] rowHeaders = new String[rows];
    int row;
    for (row = 0; row < rows-5; ++row)
      rowHeaders[row] = H2O.CLOUD._memary[row].getIpPortString();
    rowHeaders[row++] = "mean";
    rowHeaders[row++] = "min";
    rowHeaders[row++] = "max";
    rowHeaders[row++] = "stddev";
    rowHeaders[row  ] = "total";
    final String[] colHeaders = new String[]{"Size", "Number of Rows", "Number of Chunks per Column", "Number of Chunks"};
    final String[] colTypes   = new String[]{"string", "float", "float", "float"};
    final String[] colFormats = new String[]{"%s", "%f", "%f", "%f"};
    final String colHeaderForRowHeaders = "";
    TwoDimTable table = new TwoDimTable(tableHeader, null, rowHeaders, colHeaders, colTypes, colFormats, colHeaderForRowHeaders);
    for (row = 0; row < rows-5; ++row) {
      if (byte_size_per_node != null) {
        table.set(row, 0, display(byte_size_per_node[row]));
        table.set(row, 1, row_count_per_node[row]);
        table.set(row, 2, chunk_count_per_col_per_node[row]);
        table.set(row, 3, _fr.numCols() * chunk_count_per_col_per_node[row]);
      }
    }
    table.set(row,   0, display((long)byte_size_per_node_mean));
    table.set(row,   1, row_count_per_node_mean);
    table.set(row,   2, chunk_count_per_col_per_node_mean);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_mean);
    table.set(row,   0, display((long)byte_size_per_node_min));
    table.set(row,   1, row_count_per_node_min);
    table.set(row,   2, chunk_count_per_col_per_node_min);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_min);
    table.set(row,   0, display((long)byte_size_per_node_max));
    table.set(row,   1, row_count_per_node_max);
    table.set(row,   2, chunk_count_per_col_per_node_max);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_max);
    table.set(row,   0, display((long)byte_size_per_node_stddev));
    table.set(row,   1, row_count_per_node_stddev);
    table.set(row,   2, chunk_count_per_col_per_node_stddev);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_stddev);
    table.set(row,   0, display(total_chunk_byte_size));
    table.set(row,   1, total_row_count);
    table.set(row,   2, total_chunk_count_per_col);
    table.set(row,   3, _fr.numCols()*total_chunk_count_per_col);
    return table;
  }

  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(toTwoDimTableChunkTypes().toString());
    sb.append(toTwoDimTableDistribution().toString());
    // Warn when per-node byte sizes are badly skewed (>20% relative stddev)
    if (H2O.CLOUD.size() > 1 && byte_size_per_node_stddev > 0.2 * byte_size_per_node_mean) {
      sb.append("** Note: Dataset is not well distributed, consider rebalancing **\n");
    }
    return sb.toString();
  }
}
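// Usage sketch (a minimal illustration, not the canonical entry point:
// construction is package-private, so this assumes code living in water.util
// and a populated water.fvec.Frame named `fr`):
//
//   ChunkSummary cs = new ChunkSummary().doAll(fr);
//   System.out.println(cs);  // chunk-type table, distribution table, and a
//                            // rebalancing note when the data is skewed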