package water.exec; import water.*; import water.fvec.*; import water.util.Utils.*; public abstract class Flow extends Iced { // Flow-Coding with Frames, Filters & GroupBy // // frame.with(filter).with(groupby).with(reducer).doit(); // // Define a pipeline of operations to perform on a frame. Can use any number // of filters, and one optional groupby, and one required reducer which must // be last in the pipe. The return result is either the instance of the // reducer, or a collection of reducers (one per Group in the GroupBy). // // All elements are passed a row from the Frame in double ds[]. // // Reducer uses "mapreduce(ds)" to collect data for one row into a reducer // object, and "this.reduce(that)" to gather chunks of reduced objects. Rows // are reduced in parallel and in any order; each row exactly once. // // Filter returns boolean to ignore or reduce the row. // // GroupBy returns a long to specify which Group this row belongs too. The // row is reduced into a seperate reducer for each group. The group ids do // not have to form a dense space, any unique long value specifies a group; a // hash table is used to gather the groups. The hashtable of reduced values // is returned. // //Frame fr = parseFrame("cars.hex", "smalldata/cars.csv"); //final int cyl_idx = fr.find("cylinders"); //final int year_idx = fr.find("year"); // //SumCol sumcols = fr. // with(new SumCol(year_idx)). // doit(); //System.out.println(sumcols._sum+"/"+sumcols._n+" = "+(sumcols._sum/sumcols._n)); // //SumCol sumcols1 = fr. // with(new Filter() { boolean filter(double ds[]) { return ds[cyl_idx]!=5; } }). // with(new SumCol(year_idx)). // doit(); //System.out.println(sumcols1._sum+"/"+sumcols1._n+" = "+(sumcols1._sum/sumcols1._n)); // //IcedHashMap<IcedLong,SumCol> sumcols2 = fr. // with(new GroupBy() { long groupId(double ds[]) { return (long)ds[cyl_idx];} }). // with(new SumCol(year_idx)). // doit(); //for( IcedLong gid : sumcols2.keySet() ) { // SumCol sumcol = sumcols2.get(gid); // System.out.println("Cyl="+gid._val+", "+sumcol._sum+"/"+sumcol._n+" = "+(sumcol._sum/sumcol._n)); //} // ----------------------- // THE PUBLIC API: public abstract static class PerRow<X extends PerRow> extends Iced { abstract public void mapreduce( double ds[] ); abstract public void reduce( X that ); abstract public X make(); @Override public String toString() { return "perRow"; } } public abstract static class Filter extends Iced { abstract public boolean filter( double ds[] ); @Override public String toString() { return "filter"; } } public abstract static class GroupBy extends Iced { abstract public long groupId( double ds[]); @Override public String toString() { return "groupBy"; } } // ----------------------- abstract Frame frame(); abstract <X extends PerRow<X>> // Type parameter PerRow<X> // Return type of doit() doit // Method name ( PerRow<X> pr, double ds[], PerRow<X> pr0 ); // Arguments for doit() public static class FlowFrame extends Flow { final Frame _fr; public FlowFrame( Frame fr ) { _fr = fr; } @Override Frame frame() { return _fr; } @Override public String toString() { return _fr.toString(); } @Override <X extends PerRow<X>> PerRow<X> doit(PerRow<X> pr, double ds[], PerRow<X> pr0) { if( pr == null ) pr = pr0.make(); pr.mapreduce(ds); return pr; } } public static class FlowFilter extends Flow { final Filter _fr; final Flow _ex; public FlowFilter( Filter fr, Flow ex ) { _fr = fr; _ex = ex;} public <Y extends PerRow<Y>> FlowPerRow<Y> with( PerRow<Y> pr ) { return new FlowPerRow<Y>(pr,this); } public FlowGroupBy with( GroupBy fr ) { return new FlowGroupBy(fr,this); } public FlowFilter with ( Filter filter){ return new FlowFilter(filter, this); } @Override Frame frame() { return _ex.frame(); } @Override public String toString() { return _ex.toString()+".with("+_fr+")"; } @Override <X extends PerRow<X>> PerRow<X> doit(PerRow<X> pr, double ds[], PerRow<X> pr0) { return _fr.filter(ds) ? _ex.doit(pr,ds,pr0) : pr; } } public static class FlowGroupBy extends Flow { final GroupBy _gb; final Flow _ex; public FlowGroupBy( GroupBy gb, Flow ex ) { _gb = gb; _ex = ex;} public <Y extends PerRow<Y>> FlowGroupPerRow<Y> with( PerRow<Y> pr ) { return new FlowGroupPerRow<Y>(pr,this); } public FlowGroupBy with( Filter fr ) { return new FlowGroupBy(_gb,new FlowFilter(fr,_ex)); } @Override Frame frame() { return _ex.frame(); } @Override public String toString() { return _ex.toString()+".with("+_gb+")"; } @Override <X extends PerRow<X>> PerRow<X> doit(PerRow<X> pr, double ds[], PerRow<X> pr0) { throw H2O.fail(); } } public static class FlowGroupPerRow<X extends PerRow<X>> extends MRTask2<FlowGroupPerRow<X>> { final PerRow<X> _pr; // Canonical example, not returned IcedHashMap<IcedLong,PerRow<X>> _prs; FlowGroupBy _ex; public FlowGroupPerRow( PerRow<X> pr, FlowGroupBy ex ) { _pr = pr; _ex = ex; } public IcedHashMap<IcedLong,X> doit() { return doAll(_ex.frame()).self(); } @Override public void map( Chunk chks[] ) { _prs = new IcedHashMap<IcedLong,PerRow<X>>(); double ds[] = new double[chks.length]; for( int i=0; i<chks[0]._len; i++ ) { // Load the internal double array for( int j=0; j<chks.length; j++ ) ds[j] = chks[j].at0(i); IcedLong gid = new IcedLong(_ex._gb.groupId(ds)); PerRow<X> pr1 = _prs.get(gid); PerRow<X> pr2 = _ex._ex.doit(pr1,ds,_pr); if( pr1 == null && pr2 != null ) _prs.put(gid,pr2); } } @Override public void reduce( FlowGroupPerRow<X> that ) { for( IcedLong gid : that._prs.keySet() ) { PerRow<X> that_pr = that._prs.get(gid); PerRow<X> this_pr = this._prs.get(gid); if( that_pr != null ) { if( this_pr != null ) this_pr.reduce((X)that_pr); else this._prs.put(gid,that_pr); } } } IcedHashMap<IcedLong,X> self() { return (IcedHashMap<IcedLong,X>)_prs; } @Override public String toString() { return _ex.toString()+".with("+_pr+")"; } } public static class FlowPerRow<X extends PerRow<X>> extends MRTask2<FlowPerRow<X>> { PerRow<X> _pr; Flow _ex; public FlowPerRow( PerRow<X> pr, Flow ex ) { _pr = pr; _ex = ex;} public X doit() { return doAll(_ex.frame()).self(); } @Override public void map( Chunk chks[] ) { _pr = _pr.make(); double ds[] = new double[chks.length]; for( int i=0; i<chks[0]._len; i++ ) { // Load the internal double array for( int j=0; j<chks.length; j++ ) ds[j] = chks[j].at0(i); _ex.doit(_pr,ds,_pr); } } @Override public void reduce( FlowPerRow<X> ebpr ) { _pr.reduce(ebpr.self()); } X self() { return (X)_pr; } @Override public String toString() { return _ex.toString()+".with("+_pr+")"; } } }