package water.rapids.ast.prims.mungers;

import water.*;
import water.fvec.*;
import water.rapids.*;
import water.rapids.ast.*;
import water.rapids.ast.params.AstNumList;
import water.rapids.vals.ValFrame;
import water.util.*;

import java.util.Arrays;

/**
 * Ddply
 * Group the rows of 'data' by unique combinations of '[group-by-cols]'.
 * Apply the function 'fcn' to each group Frame; 'fcn' must accept a Frame (and
 * any "extra" arguments) and return a scalar or a 1-row Frame.
 * <p/>
 * Returns the grouping columns followed by one answer column per value
 * returned by 'fcn', with one row per unique group.
 */
public class AstDdply extends AstPrimitive {
  @Override public String[] args() { return new String[]{"ary", "groupByCols", "fun"}; }

  @Override public int nargs() { return 1 + 3; } // (ddply data [group-by-cols] fcn)

  @Override public String str() { return "ddply"; }

  @Override
  public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    int ncols = fr.numCols();

    AstNumList groupby = AstGroup.check(ncols, asts[2]);
    int[] gbCols = groupby.expand4();

    AstRoot fun = asts[3].exec(env).getFun();
    AstFunction scope = env._scope; // Current execution scope; needed to look up variables

    // Pass 1: Find all the groups (and count rows-per-group)
    IcedHashMap<AstGroup.G, String> gss = AstGroup.doGroups(fr, gbCols, AstGroup.aggNRows());
    final AstGroup.G[] grps = gss.keySet().toArray(new AstGroup.G[gss.size()]);

    // Apply an ORDER BY here: sort the groups by their group-key columns
    final int[] ordCols = new AstNumList(0, gbCols.length).expand4();
    Arrays.sort(grps, new java.util.Comparator<AstGroup.G>() {
      // Compare 2 groups: walk the group keys in _gs and stop at the first
      // position where they differ, ordering by the columns listed in ordCols.
      // NaN is treated as least.
      @Override
      public int compare(AstGroup.G g1, AstGroup.G g2) {
        for (int i : ordCols) {
          if ( Double.isNaN(g1._gs[i]) && !Double.isNaN(g2._gs[i])) return -1;
          if (!Double.isNaN(g1._gs[i]) &&  Double.isNaN(g2._gs[i])) return  1;
          if (g1._gs[i] != g2._gs[i]) return g1._gs[i] < g2._gs[i] ? -1 : 1;
        }
        return 0;
      }

      // I do not believe sort() calls equals() at this time, so no need to implement
      @Override
      public boolean equals(Object o) { throw H2O.unimpl(); }
    });

    // Uniquely number the groups
    for (int gnum = 0; gnum < grps.length; gnum++)
      grps[gnum]._dss[0][0] = gnum;

    // Pass 2: Build all the groups, building 1 Vec per-group, with exactly the
    // same Chunk layout, except each Chunk holds the filtered row numbers: a
    // list of the Chunk-relative row numbers for that group in the original
    // data Chunk. Each Vec will have a *different* number of rows.
    Vec[] vgrps = new BuildGroup(gbCols, gss).doAll(gss.size(), Vec.T_NUM, fr).close();

    // Pass 3: For each group, build a full frame for the group, run the
    // function on it and tear the frame down.
    final RemoteRapids[] remoteTasks = new RemoteRapids[gss.size()]; // Gather up the remote tasks...
    Futures fs = new Futures();
    for (int i = 0; i < remoteTasks.length; i++)
      fs.add(RPC.call(vgrps[i]._key.home_node(), remoteTasks[i] = new RemoteRapids(fr, vgrps[i]._key, fun, scope)));
    fs.blockForPending();

    // Build the output!
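    // Output layout sketch (illustrative values, not taken from this file):
    // grouping by two columns with a function returning two values per group
    // produces one output row per sorted group, e.g.
    //   groupKey0  groupKey1  ddply_C1  ddply_C2
    //        1.0       10.0    res[0]    res[1]
    //        2.0       20.0    res[0]    res[1]
    // The MRTask below fills exactly this shape: group-key columns first,
    // then one column per entry of each group's result vector.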
    final double[] res0 = remoteTasks[0]._result;
    String[] fcnames = new String[res0.length]; // Result column names: ddply_C1 ... ddply_Ck
    for (int i = 0; i < res0.length; i++)
      fcnames[i] = "ddply_C" + (i + 1);

    MRTask mrfill = new MRTask() {
      @Override
      public void map(Chunk[] c, NewChunk[] ncs) {
        int start = (int) c[0].start();
        for (int i = 0; i < c[0]._len; ++i) {
          AstGroup.G g = grps[i + start]; // One Group per row
          int j;
          for (j = 0; j < g._gs.length; j++) // The Group Key, as a row
            ncs[j].addNum(g._gs[j]);
          double[] res = remoteTasks[i + start]._result;
          for (int a = 0; a < res0.length; a++)
            ncs[j++].addNum(res[a]);
        }
      }
    };

    Frame f = AstGroup.buildOutput(gbCols, res0.length, fr, fcnames, gss.size(), mrfill);
    return new ValFrame(f);
  }

  // --------------------------------------------------------------------------
  // Build all the groups, building 1 Vec per-group, with exactly the same
  // Chunk layout, except each Chunk holds the filtered row numbers: a list of
  // the Chunk-relative row numbers for that group in the original data Chunk.
  private static class BuildGroup extends MRTask<BuildGroup> {
    final IcedHashMap<AstGroup.G, String> _gss;
    final int[] _gbCols;

    BuildGroup(int[] gbCols, IcedHashMap<AstGroup.G, String> gss) {
      _gbCols = gbCols;
      _gss = gss;
    }

    @Override
    public void map(Chunk[] cs, NewChunk[] ncs) {
      AstGroup.G gWork = new AstGroup.G(_gbCols.length, null); // Working Group
      for (int row = 0; row < cs[0]._len; row++) {
        gWork.fill(row, cs, _gbCols);                 // Fill the working Group for the hashtable lookup
        int gnum = (int) _gss.getk(gWork)._dss[0][0]; // Existing group number
        ncs[gnum].addNum(row);                        // Gather row numbers per-chunk per-group
      }
    }

    // Gather all the output Vecs. Note that each Vec has a *different* number
    // of rows, and taken together they do NOT make a valid Frame.
    Vec[] close() {
      Futures fs = new Futures();
      Vec[] vgrps = new Vec[_gss.size()];
      for (int i = 0; i < vgrps.length; i++)
        vgrps[i] = _appendables[i].close(_appendables[i].compute_rowLayout(), fs);
      fs.blockForPending();
      return vgrps;
    }
  }

  // --------------------------------------------------------------------------
  private static class RemoteRapids extends DTask<RemoteRapids> {
    private Frame _data;        // Data frame
    private Key<Vec> _vKey;     // The group to process...
    private AstRoot _fun;       // The AST to execute on the group
    private AstFunction _scope; // Execution environment
    private double[] _result;   // Result is 1 row per group!

    RemoteRapids(Frame data, Key<Vec> vKey, AstRoot fun, AstFunction scope) {
      _data = data;
      _vKey = vKey;
      _fun = fun;
      _scope = scope;
    }

    @Override
    public void compute2() {
      assert _vKey.home();
      final Vec gvec = DKV.getGet(_vKey);
      assert gvec.group().equals(_data.anyVec().group());

      // Make a group Frame, using wrapped Vecs over the original data frame
      // filtered by the Vec passed in. Run the function, getting a scalar or a
      // 1-row Frame back out. Delete the group Frame. Return the 1-row Frame
      // as a double[] of results for this group.
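      // Filtering sketch (illustrative numbers, assumed for exposition): if the
      // original data Chunk at index i holds local rows 0..4 and this group's
      // gvec Chunk at the same index holds the row numbers {1, 3, 4}, then the
      // SubsetChunk built below presents a 3-row view over that data Chunk,
      // reading only original rows 1, 3 and 4. The group Frame therefore sees
      // just this group's rows without copying any data.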
      // Make the subset Frame Vecs, no chunks yet
      Key<Vec>[] groupKeys = gvec.group().addVecs(_data.numCols());
      final Vec[] groupVecs = new Vec[_data.numCols()];
      Futures fs = new Futures();
      for (int i = 0; i < _data.numCols(); i++)
        DKV.put(groupVecs[i] = new Vec(groupKeys[i], gvec._rowLayout, gvec.domain(), gvec.get_type()), fs);
      fs.blockForPending();

      // Fill in the chunks
      new MRTask() {
        @Override
        public void setupLocal() {
          Vec[] data_vecs = _data.vecs();
          for (int i = 0; i < gvec.nChunks(); i++)
            if (data_vecs[0].chunkKey(i).home()) {
              Chunk rowchk = gvec.chunkForChunkIdx(i);
              for (int col = 0; col < data_vecs.length; col++)
                DKV.put(Vec.chunkKey(groupVecs[col]._key, i),
                        new SubsetChunk(data_vecs[col].chunkForChunkIdx(i), rowchk, groupVecs[col]), _fs);
            }
        }
      }.doAllNodes();
      Frame groupFrame = new Frame(_data._names, groupVecs);

      // Now run the function on the group frame
      Session ses = new Session();
      // Build an environment with proper lookup scope, and execute in a temp session
      Val val = ses.exec(new AstExec(new AstRoot[]{_fun, new AstFrame(groupFrame)}), _scope);
      val = ses.end(val);

      // Result into a double[]
      if (val.isFrame()) {
        Frame res = val.getFrame();
        if (res.numRows() != 1)
          throw new IllegalArgumentException("ddply must return a 1-row (many column) frame, found " + res.numRows());
        _result = new double[res.numCols()];
        for (int i = 0; i < res.numCols(); i++)
          _result[i] = res.vec(i).at(0);
        res.remove();
      } else if (val.isNum()) {
        _result = new double[]{val.getNum()};
      } else if (val.isNums()) {
        _result = val.getNums();
      } else {
        throw new IllegalArgumentException("ddply must return either a number or a frame, not a " + val);
      }

      // Cleanup
      groupFrame.delete(); // Delete the Frame holding WrappedVecs over SubsetChunks
      gvec.remove();       // Delete the group-defining Vec
      _data = null;        // Nuke to avoid returning (not for GC)
      _vKey = null;        // Nuke to avoid returning (not for GC)
      _fun = null;         // Nuke to avoid returning (not for GC)
      _scope = null;       // Nuke to avoid returning (not for GC)

      // And done!
      tryComplete();
    }
  }
}
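
// Usage sketch (illustrative only): the Rapids lambda syntax and the frame id
// "df" below are assumptions, not taken from this file; only the call shape
// (ddply data [group-by-cols] fcn) comes from the nargs() comment above. With a
// Frame installed in the DKV under the id "df", grouping by columns 0 and 1 and
// summing column 2 within each group might be evaluated from Java roughly as:
//   Val v = Rapids.exec("(ddply df [0 1] {x . (sum (cols x [2]))})");
//   Frame out = v.getFrame(); // group-key columns + ddply_C1, one row per group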