package water.util;

import water.H2O;
import water.MRTask;
import water.fvec.CategoricalWrappedVec;
import water.fvec.Chunk;
import water.fvec.Vec;

/**
 * Simple summary of how many chunks of each type are in a Frame
 */
public class ChunkSummary extends MRTask<ChunkSummary> {

  ChunkSummary() {
    // Run at elevated priority: one step above the current fork/join worker
    // when invoked from one, otherwise just below H2O's high-priority band.
    super((byte)(Thread.currentThread() instanceof H2O.FJWThr ? currThrPriority()+1 : H2O.MIN_HI_PRIORITY - 2));
  }

  // Known chunk implementations: short name (the class name minus the
  // trailing "Chunk") and a human-readable description.
  public static final String[][] chunkTypes = new String[][]{
    {"C0L",  "Constant long"},
    {"C0D",  "Constant double"},
    {"CBS",  "Binary"},
    {"CXI",  "Sparse Integers"},
    {"CXF",  "Sparse Reals"},
    {"C1",   "1-Byte Integers"},
    {"C1N",  "1-Byte Integers (w/o NAs)"},
    {"C1S",  "1-Byte Fractions"},
    {"C2",   "2-Byte Integers"},
    {"C2S",  "2-Byte Fractions"},
    {"C4",   "4-Byte Integers"},
    {"C4S",  "4-Byte Fractions"},
    {"C4F",  "4-Byte Reals"},
    {"C8",   "8-Byte Integers"},
    {"C16",  "UUIDs"},
    {"CStr", "Strings"},
    {"CUD",  "Unique Reals"},
    {"C8D",  "64-bit Reals"},
  };

  // OUTPUT
  private long[] chunk_counts;
  private long total_chunk_count;
  private long[] chunk_byte_sizes;
  private long total_chunk_byte_size;

  // Per-node totals; the mean/min/max/stddev fields are computed across
  // nodes in postGlobal().
  private long[] byte_size_per_node;
  private double byte_size_per_node_mean;
  private double byte_size_per_node_min;
  private double byte_size_per_node_max;
  private double byte_size_per_node_stddev;

  private long total_row_count;
  private long[] row_count_per_node;
  private double row_count_per_node_mean;
  private double row_count_per_node_min;
  private double row_count_per_node_max;
  private double row_count_per_node_stddev;

  private long total_chunk_count_per_col;
  private long[] chunk_count_per_col_per_node;
  private double chunk_count_per_col_per_node_mean;
  private double chunk_count_per_col_per_node_min;
  private double chunk_count_per_col_per_node_max;
  private double chunk_count_per_col_per_node_stddev;

  @Override public void map(Chunk[] cs) {
    chunk_counts = new long[chunkTypes.length];
    chunk_byte_sizes = new long[chunkTypes.length];
    byte_size_per_node = new long[H2O.CLOUD.size()];
    row_count_per_node = new long[H2O.CLOUD.size()];
    chunk_count_per_col_per_node = new long[H2O.CLOUD.size()];
    for( Chunk c : cs ) {       // Can be a big loop, for high column counts
      // Pull out the class name; trim a trailing "Chunk"
      String cname = c.getClass().getSimpleName();
      int nlen = cname.length();
      assert nlen > 5 && cname.charAt(nlen-5)=='C' && cname.charAt(nlen-1)=='k';
      String sname = cname.substring(0,nlen-5);
      if (sname.equals("CategoricalWrapped")) {
        // Classify the wrapped chunk, not the categorical decorator
        Chunk ec = ((CategoricalWrappedVec.CategoricalWrappedChunk)c)._c;
        cname = ec.getClass().getSimpleName();
        nlen = cname.length();
        assert nlen > 5 && cname.charAt(nlen-5)=='C' && cname.charAt(nlen-1)=='k';
        sname = cname.substring(0,nlen-5);
      }
      // Table lookup, roughly sorted by frequency
      int j;
      for( j = 0; j < chunkTypes.length; ++j )
        if( sname.equals(chunkTypes[j][0]) )
          break;
      if( j==chunkTypes.length )
        throw H2O.fail("Unknown Chunk Type: " + sname);
      chunk_counts[j]++;
      chunk_byte_sizes[j] += c.byteSize();
      byte_size_per_node[H2O.SELF.index()] += c.byteSize();
    }
    // One map() call sees one chunk per column, so cs[0].len() is the local
    // row count and each call contributes one chunk per column to this node
    row_count_per_node[H2O.SELF.index()] += cs[0].len();
    total_row_count += cs[0].len();
    chunk_count_per_col_per_node[H2O.SELF.index()]++;
    total_chunk_count_per_col++;
  }
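  // Worked example of the classification in map() above (class names from
  // water.fvec; illustrative, not exhaustive):
  //   C1NChunk -> simple name "C1NChunk" -> key "C1N" -> "1-Byte Integers (w/o NAs)"
  //   C0DChunk -> simple name "C0DChunk" -> key "C0D" -> "Constant double"
  // A CategoricalWrappedChunk never appears in the table itself; the chunk it
  // wraps is classified in its place.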
  @Override public void reduce(ChunkSummary mrt) {
    ArrayUtils.add(chunk_counts, mrt.chunk_counts);
    ArrayUtils.add(chunk_byte_sizes, mrt.chunk_byte_sizes);
    ArrayUtils.add(byte_size_per_node, mrt.byte_size_per_node);
    ArrayUtils.add(row_count_per_node, mrt.row_count_per_node);
    ArrayUtils.add(chunk_count_per_col_per_node, mrt.chunk_count_per_col_per_node);
    total_row_count += mrt.total_row_count;
    total_chunk_count_per_col += mrt.total_chunk_count_per_col;
  }

  @Override protected void postGlobal() {
    if (chunk_counts == null || chunk_byte_sizes == null || byte_size_per_node == null) return; // map() never ran
    assert total_row_count == _fr.numRows() : "total_row_count["+total_row_count+"] != _fr.numRows()["+_fr.numRows()+"]";

    // compute counts and sizes
    total_chunk_byte_size = 0;
    total_chunk_count = 0;
    for (int j = 0; j < chunkTypes.length; ++j) {
      total_chunk_byte_size += chunk_byte_sizes[j];
      total_chunk_count += chunk_counts[j];
    }
    long check = 0;
    for (Vec v : _fr.vecs()) check += v.nChunks();
    assert total_chunk_count == check;
    // This doesn't always hold: FileVecs report a file-based byte size, while
    // other Vecs report a Chunk-based byte size.
    // assert(total_chunk_byte_size == _fr.byteSize());

    // Per-node statistics; min_max_mean_stddev returns {min, max, mean, stddev}
    double[] res = MathUtils.min_max_mean_stddev(byte_size_per_node);
    byte_size_per_node_min    = res[0];
    byte_size_per_node_max    = res[1];
    byte_size_per_node_mean   = res[2];
    byte_size_per_node_stddev = res[3];

    res = MathUtils.min_max_mean_stddev(row_count_per_node);
    row_count_per_node_min    = res[0];
    row_count_per_node_max    = res[1];
    row_count_per_node_mean   = res[2];
    row_count_per_node_stddev = res[3];

    res = MathUtils.min_max_mean_stddev(chunk_count_per_col_per_node);
    chunk_count_per_col_per_node_min    = res[0];
    chunk_count_per_col_per_node_max    = res[1];
    chunk_count_per_col_per_node_mean   = res[2];
    chunk_count_per_col_per_node_stddev = res[3];
  }

  // Right-aligned, human-readable byte count, 10 characters wide
  String display(long val) { return String.format("%10s", val == 0 ? " 0 B" : PrettyPrint.bytes(val)); }

  public TwoDimTable toTwoDimTableChunkTypes() {
    final String tableHeader = "Chunk compression summary";
    // Only chunk types actually present get a row
    int rows = 0;
    for (int j = 0; j < chunkTypes.length; ++j)
      if (chunk_counts != null && chunk_counts[j] > 0) rows++;
    final String[] rowHeaders = new String[rows];
    final String[] colHeaders = new String[]{"Chunk Type", "Chunk Name", "Count", "Count Percentage", "Size", "Size Percentage"};
    final String[] colTypes   = new String[]{"string", "string", "int", "float", "string", "float"};
    final String[] colFormats = new String[]{"%8s", "%s", "%10d", "%10.3f %%", "%10s", "%10.3f %%"};
    final String colHeaderForRowHeaders = null;
    TwoDimTable table = new TwoDimTable(tableHeader, null, rowHeaders, colHeaders, colTypes, colFormats, colHeaderForRowHeaders);
    int row = 0;
    for (int j = 0; j < chunkTypes.length; ++j) {
      if (chunk_counts != null && chunk_counts[j] > 0) {
        table.set(row, 0, chunkTypes[j][0]);
        table.set(row, 1, chunkTypes[j][1]);
        table.set(row, 2, chunk_counts[j]);
        table.set(row, 3, (float) chunk_counts[j] / total_chunk_count * 100.f);
        table.set(row, 4, display(chunk_byte_sizes[j]));
        table.set(row, 5, (float) chunk_byte_sizes[j] / total_chunk_byte_size * 100.f);
        row++;
      }
    }
    return table;
  }
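  // Example of the percentage math above (hypothetical frame): 40 C1N chunks
  // out of a 160-chunk total render as a Count Percentage of 25.000 %; the
  // Size Percentage column is the same ratio computed over compressed bytes.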
  public TwoDimTable toTwoDimTableDistribution() {
    final String tableHeader = "Frame distribution summary";
    int rows = H2O.CLOUD.size() + 5;  // one row per node + mean/min/max/stddev/total
    final String[] rowHeaders = new String[rows];
    int row;
    for (row = 0; row < rows-5; ++row)
      rowHeaders[row] = H2O.CLOUD._memary[row].getIpPortString();
    rowHeaders[row++] = "mean";
    rowHeaders[row++] = "min";
    rowHeaders[row++] = "max";
    rowHeaders[row++] = "stddev";
    rowHeaders[row  ] = "total";
    final String[] colHeaders = new String[]{"Size", "Number of Rows", "Number of Chunks per Column", "Number of Chunks"};
    final String[] colTypes   = new String[]{"string", "float", "float", "float"};
    final String[] colFormats = new String[]{"%s", "%f", "%f", "%f"};
    final String colHeaderForRowHeaders = "";
    TwoDimTable table = new TwoDimTable(tableHeader, null, rowHeaders, colHeaders, colTypes, colFormats, colHeaderForRowHeaders);
    for (row = 0; row < rows-5; ++row) {
      if (byte_size_per_node != null) {
        table.set(row, 0, display(byte_size_per_node[row]));
        table.set(row, 1, row_count_per_node[row]);
        table.set(row, 2, chunk_count_per_col_per_node[row]);
        table.set(row, 3, _fr.numCols() * chunk_count_per_col_per_node[row]);
      }
    }
    table.set(row,   0, display((long)byte_size_per_node_mean));
    table.set(row,   1, row_count_per_node_mean);
    table.set(row,   2, chunk_count_per_col_per_node_mean);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_mean);
    table.set(row,   0, display((long)byte_size_per_node_min));
    table.set(row,   1, row_count_per_node_min);
    table.set(row,   2, chunk_count_per_col_per_node_min);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_min);
    table.set(row,   0, display((long)byte_size_per_node_max));
    table.set(row,   1, row_count_per_node_max);
    table.set(row,   2, chunk_count_per_col_per_node_max);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_max);
    table.set(row,   0, display((long)byte_size_per_node_stddev));
    table.set(row,   1, row_count_per_node_stddev);
    table.set(row,   2, chunk_count_per_col_per_node_stddev);
    table.set(row++, 3, _fr.numCols()*chunk_count_per_col_per_node_stddev);
    table.set(row,   0, display(total_chunk_byte_size));
    table.set(row,   1, total_row_count);
    table.set(row,   2, total_chunk_count_per_col);
    table.set(row,   3, _fr.numCols()*total_chunk_count_per_col);
    return table;
  }

  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(toTwoDimTableChunkTypes().toString());
    sb.append(toTwoDimTableDistribution().toString());
    // Warn when per-node byte sizes are badly skewed (>20% relative stddev)
    if (H2O.CLOUD.size() > 1 && byte_size_per_node_stddev > 0.2 * byte_size_per_node_mean) {
      sb.append("** Note: Dataset is not well distributed, consider rebalancing **\n");
    }
    return sb.toString();
  }
}
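// Usage sketch (a minimal illustration, not the canonical entry point:
// construction is package-private, so this assumes code living in water.util
// and a populated water.fvec.Frame named `fr`):
//
//   ChunkSummary cs = new ChunkSummary().doAll(fr);
//   System.out.println(cs);  // chunk-type table, distribution table, and a
//                            // rebalancing note when the data is skewed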