package water.rapids;

import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import water.*;
import water.fvec.*;
import water.parser.ParseDataset;
import water.util.FileUtils;

import java.io.InputStream;
import java.io.OutputStream;

public class MungingTest extends TestUtil {
  @BeforeClass()
  public static void setup() { stall_till_cloudsize(2); }

  // Copy everything from is to os in buffer_size chunks; any exception is
  // rethrown unchecked so callers don't need a throws clause.
  private void copyStream(OutputStream os, InputStream is, final int buffer_size) {
    try {
      byte[] bytes = new byte[buffer_size];
      for (;;) {
        int count = is.read(bytes, 0, buffer_size);
        if (count <= 0) break;
        os.write(bytes, 0, count);
      }
    } catch (Exception ex) {
      throw new RuntimeException(ex);
    }
  }

  @Ignore @Test public void run2() throws Exception {
    System.out.println("Running run2 ...");
    NFSFileVec nfs = TestUtil.makeNfsFileVec("/home/mdowle/devtestdata/step1.csv");
    Frame frame = ParseDataset.parse(Key.make(), nfs._key);  // look into parse() to manip column types
    System.out.println("Loaded file, now calling Query ...");
    // new RadixOrder(frame, true, new int[] {0,1});
    // group by 0=id, 1=date and sum 3 == quantity
    // TO DO: change back to DoGroup(frame, new int[] {0,1}, frame.vec(3), true)
    frame.delete();
  }
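  // A minimal sketch of how copyStream() above is meant to be used: stream a
  // Frame back out as CSV through the PersistManager, mirroring the
  // toCSV(true,false) and H2O.getPM().create(path, true) calls in the
  // commented-out run1() at the bottom of this file. Both file paths are
  // illustrative placeholders.
  @Ignore @Test public void csvExportSketch() throws Exception {
    NFSFileVec nfs = TestUtil.makeNfsFileVec("/home/mdowle/devtestdata/step1.csv");  // placeholder input
    Frame frame = ParseDataset.parse(Key.make(), nfs._key);
    InputStream is = frame.toCSV(true, false);  // with header row, as in run1()
    OutputStream os = null;
    try {
      os = H2O.getPM().create("/home/mdowle/devtestdata/h2oOut.csv", true);  // placeholder output
      copyStream(os, is, 4 * 1024 * 1024);  // 4 MB copy buffer
    } finally {
      if (os != null) os.close();
    }
    frame.delete();
  }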
  @Ignore @Test public void run3() throws Exception {
    System.out.println("Running run3 ...");
    NFSFileVec nfs = TestUtil.makeNfsFileVec("/home/mdowle/devtestdata/step1_subset.csv");
    //NFSFileVec nfs = NFSFileVec.make(find_test_file("/users/arno/devtestdata/step1_subset.csv"));
    Frame leftFrame = ParseDataset.parse(Key.make(), nfs._key);

    //nfs = NFSFileVec.make(find_test_file("/home/mdowle/devtestdata/fullsize.csv"));
    nfs = NFSFileVec.make(FileUtils.locateFile("/home/mdowle/devtestdata/fullsize.csv"));
    //nfs = NFSFileVec.make(find_test_file("/users/arno/devtestdata/fullsize.csv"));
    Frame rightFrame = ParseDataset.parse(Key.make(), nfs._key);  // look into parse() to manip column types
    System.out.println("Loaded two files, now calling order ...");

    // TO DO: this would be nice to see in chunk summary ...
    // for (int i=0; i<rightFrame.anyVec().nChunks(); i++) {
    //   Log.info("Chunk " + i + " is on node " + rightFrame.anyVec().chunkKey(i).home_node().index());
    // }

    // Frame fr1 = Merge.merge(leftFrame, rightFrame, new int[] {0,1}, new int[] {0,1}, false);  // 0==id, 1==date (no dups)
    // Frame fr2 = Merge.merge(leftFrame, rightFrame, new int[] {0},   new int[] {0},   false);  // 0==id (many dups)
    //Log.info(fr1.toString(0,(int)fr1.numRows()));
    //Log.info(fr2.toString(0,(int)fr2.numRows()));

    // NFSFileVec ref1 = NFSFileVec.make(find_test_file("/users/arno/devtestdata/ref1.csv"));
    // Frame ref1Frame = ParseDataset.parse(Key.make(), ref1._key);
    // Assert.assertTrue("First Join is not correct", TestUtil.isBitIdentical(fr1, ref1Frame));
    //
    // NFSFileVec ref2 = NFSFileVec.make(find_test_file("/users/arno/devtestdata/ref2.csv"));
    // Frame ref2Frame = ParseDataset.parse(Key.make(), ref2._key);
    // Assert.assertTrue("Second Join is not correct", TestUtil.isBitIdentical(fr2, ref2Frame));
    //
    // ref1Frame.delete();
    // ref2Frame.delete();
    //fr1.delete();
    //fr2.delete();
    leftFrame.delete();
    rightFrame.delete();
    //Merge.cleanUp();
  }
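  // A sketch of the check run3() is building toward: merge on a key column and
  // compare bit-for-bit against a pre-computed reference file. It assumes the
  // five-argument Merge.merge(left, right, leftCols, riteCols, allLeft)
  // signature from the comments in run3(); all file paths are placeholders.
  @Ignore @Test public void mergeCheckSketch() throws Exception {
    Frame left   = ParseDataset.parse(Key.make(), TestUtil.makeNfsFileVec("/home/mdowle/devtestdata/step1_subset.csv")._key);
    Frame right  = ParseDataset.parse(Key.make(), TestUtil.makeNfsFileVec("/home/mdowle/devtestdata/fullsize.csv")._key);
    Frame merged = Merge.merge(left, right, new int[]{0}, new int[]{0}, false);  // join on column 0 == id
    Frame ref    = ParseDataset.parse(Key.make(), TestUtil.makeNfsFileVec("/home/mdowle/devtestdata/ref2.csv")._key);
    Assert.assertTrue("Join does not match reference", TestUtil.isBitIdentical(merged, ref));
    left.delete(); right.delete(); merged.delete(); ref.delete();
  }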
  // @Test public void run1() throws Exception {
  //   System.out.println("Running run1 ...");
  //   NFSFileVec nfs = NFSFileVec.make(find_test_file("sapplytest.csv"));
  //   Frame frame = ParseDataset.parse(Key.make(), nfs._key);  // look into parse() to manip column types
  //
  //   long t0 = System.nanoTime();
  //   System.out.println("File loaded, now grouping using ASTGroupBy ...");
  //   int _colIdx = frame.find("QTY");
  //   AGG[] agg = new AGG[]{ new AGG("sum", _colIdx, "rm", "QTY", null, null) };
  //   long _by[] = new long[]{ frame.find("ID"), frame.find("DATE") };
  //   GBTask p1 = new GBTask(_by, agg).doAll(frame);
  //
  //   // API design notes:
  //   //   do = new do;
  //   //   do.add("sum", ...);   // as string "sum" *fun(fdjfj, fdfhdh)
  //   //   do.add("mean", ...);  // one array of strings, or a set of arguments type string or integer, array of objects in mixed types
  //   // how to do (colA+colB)/2
  //   // Groovy and BeanScript
  //   // UDFs in Java
  //   // Prithvi: infix rapids as string then parse
  //   //   frame.groupBy(by =, do = );
  //   //   frame.query("sum(QTY)", by="");
  //   //   frame.query("SELECT sum(QTY), UDF(anothercol) GROUP BY ID, DATE");
  //   //   ... Java has, but same type
  //   //   DT[, .(QTY = sum(QTY)), keyby=.(ID,DATE)]
  //   //   new DT("(sum (col frame QTY))", new String[]{"ID", "DATE"}).doAll(frame);
  //
  //   int nGrps = p1._g.size();
  //   G[] tmpGrps = p1._g.keySet().toArray(new G[nGrps]);
  //   while( tmpGrps[nGrps-1]==null ) nGrps--;
  //   final G[] grps = new G[nGrps];
  //   System.arraycopy(tmpGrps, 0, grps, 0, nGrps);
  //   H2O.submitTask(new ParallelPostGlobal(grps, nGrps, new long[]{0,1})).join();
  //   Arrays.sort(grps);
  //
  //   // build the output
  //   final int nCols = _by.length + agg.length;
  //
  //   // dummy vec
  //   Vec v = Vec.makeZero(nGrps);
  //
  //   // the names of columns
  //   String[] names = new String[nCols];
  //   String[][] domains = new String[nCols][];
  //   for( int i=0; i<_by.length; ++i ) {
  //     names[i] = frame.name((int)_by[i]);
  //     domains[i] = frame.domains()[(int)_by[i]];
  //   }
  //   System.arraycopy(AGG.names(agg), 0, names, _by.length, agg.length);
  //
  //   final AGG[] _agg = agg;
  //   Frame f = new MRTask() {
  //     @Override public void map(Chunk[] c, NewChunk[] ncs) {
  //       int start = (int)c[0].start();
  //       for( int i=0; i<c[0]._len; ++i ) {
  //         G g = grps[i+start];
  //         int j=0;
  //         for( ; j<g._ds.length; ++j )
  //           ncs[j].addNum(g._ds[j]);
  //         for( int a=0; a<_agg.length; ++a ) {
  //           byte type = _agg[a]._type;
  //           switch( type ) {
  //             case AGG.T_N:   ncs[j++].addNum(g._N       ); break;
  //             case AGG.T_AVG: ncs[j++].addNum(g._avs[a]  ); break;
  //             case AGG.T_MIN: ncs[j++].addNum(g._min[a]  ); break;
  //             case AGG.T_MAX: ncs[j++].addNum(g._max[a]  ); break;
  //             case AGG.T_VAR: ncs[j++].addNum(g._vars[a] ); break;
  //             case AGG.T_SD:  ncs[j++].addNum(g._sdevs[a]); break;
  //             case AGG.T_SUM: ncs[j++].addNum(g._sum[a]  ); break;
  //             case AGG.T_SS:  ncs[j++].addNum(g._ss[a]   ); break;
  //             case AGG.T_ND:  ncs[j++].addNum(g._ND[a]   ); break;
  //             case AGG.T_F:   ncs[j++].addNum(g._f[a]    ); break;
  //             case AGG.T_L:   ncs[j++].addNum(g._l[a]    ); break;
  //             default: throw new IllegalArgumentException("Unsupported aggregation type: " + type);
  //           }
  //         }
  //       }
  //     }
  //   }.doAll(nCols, v).outputFrame(names, domains);
  //   p1._g = null;  // this frees up all mem in hash map
  //
  //   System.out.print(f.toString(0,10));
  //   System.out.println("Time of aggregation (sec): " + (System.nanoTime() - t0) / 1e9);
  //
  //   InputStream is = f.toCSV(true, false);
  //   PersistManager pm = H2O.getPM();
  //   OutputStream os = null;
  //   try {
  //     os = pm.create("/Users/arno/devtestdata/h2oOut.csv", true);
  //     copyStream(os, is, 4 * 1024 * 1024);
  //   } finally {
  //     if (os != null) {
  //       try { os.close(); }
  //       catch (Exception e) { Log.err(e); }
  //     }
  //   }
  //
  //   frame.delete();
  //   f.delete();
  //   v.remove();
  // }
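  // A small live sketch of the MRTask/NewChunk pattern the commented-out run1()
  // above depends on: fill one output column over a dummy Vec and collect it
  // with outputFrame(). It assumes the same doAll(nCols, vec) and
  // outputFrame(names, domains) overloads that run1() uses.
  @Ignore @Test public void mrTaskOutputSketch() {
    Vec v = Vec.makeZero(10);  // dummy vec: one output row per input row
    Frame f = new MRTask() {
      @Override public void map(Chunk[] c, NewChunk[] ncs) {
        long start = c[0].start();  // global row offset of this chunk
        for (int i = 0; i < c[0]._len; ++i)
          ncs[0].addNum(2 * (start + i));  // emit twice the global row index
      }
    }.doAll(1, v).outputFrame(new String[]{"twice"}, new String[1][]);  // no categorical domains
    System.out.print(f.toString(0, 10));
    f.delete();
    v.remove();
  }
}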