package water.rapids; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import water.Key; import water.TestUtil; import water.DKV; import water.fvec.Frame; import water.fvec.Vec; public class RefCntTest extends TestUtil { @BeforeClass public static void setup() { stall_till_cloudsize(1); } @Test public void testNoTest() { /*defeat junit complaining about no tests in file*/ } // Test basic Copy-On-Write optimization is working, by witnessing that the // correct (small) number of real vec copies are made, despite many virtual // copies being made. @Test public void testBasic() { Session session = new Session(); Frame crimes = parse_test_file(Key.make("chicagoCrimes10k.hex"),"smalldata/chicago/chicagoCrimes10k.csv.zip"); Vec.VectorGroup vg = crimes.anyVec().group(); // Expect to compute and update crimes.hex "Date" column in-place, but the // result is called py_1. Exactly 1 new vector is made (result of +) int key1 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertTrue(crimes.vec("Date").isTime()); Rapids.exec("(tmp= py_1 (:= chicagoCrimes10k.hex (+ (cols_py chicagoCrimes10k.hex \"Date\") 1) 2 []))",session); Assert.assertTrue(crimes.vec("Date").isTime());// User named frame is unchanged Frame py_1 = DKV.getGet(Key.make("py_1")); Assert.assertTrue(py_1.vec("Date").isNumeric()); // tmp= py_1 holds the changed column Assert.assertTrue(py_1.vec("Date").mean() > 1300000000L); // msec since epoch is generally >1.3b msec int key2 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key1+1,key2); // Exactly 1 new vector is made: as.Date // Remove original hex key - even though most columns are shared. Note // that this remove is only valid when done in the session context - // otherwise the sharing can't be tracked. Since most columns are shared, // the DKV key should be removed, but NOT most data. Rapids.exec("(rm chicagoCrimes10k.hex)",session); crimes = null; for( Vec vec : py_1.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key_tmp = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key2,key_tmp); // No New Vectors, and VecGroup never rolls backwards // Both append, and nuke a dead temp, in one expression Rapids.exec("(, (tmp= py_2 (append py_1 (day (cols_py py_1 \"Date\")) \"Day\")) (rm py_1))",session); py_1 = null; Frame py_2 = DKV.getGet(Key.make("py_2")); for( Vec vec : py_2.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key3 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key2+1,key3); // Exactly 1 new vector // Start a series of computations that append columns Rapids.exec("(tmp= py_3 (append py_2 (month (cols_py py_2 \"Date\")) \"Month\"))",session); Frame py_3 = DKV.getGet(Key.make("py_3")); for( Vec vec : py_3.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key4 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key3+1,key4); // Exactly 1 new vector // This one does 2 computations to append 1 column, also does an over-write // instead of append. Rapids.exec("(, (rm py_2) (tmp= py_4 (:= py_3 (+ (year (cols_py py_3 \"Date\")) 1900) 17 [])))",session); Frame py_4 = DKV.getGet(Key.make("py_4")); py_2 = null; for( Vec vec : py_4.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key5 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key4 + 2, key5); // Exactly 2 new vector, for two ops: "year" and "+1900". Rapids.exec("(, (rm py_3) (tmp= py_5 (append py_4 (week (cols_py py_4 \"Date\")) \"WeekNum\")))",session); Frame py_5 = DKV.getGet(Key.make("py_5")); py_3 = null; for( Vec vec : py_5.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key6 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key5 + 1, key6); // Exactly 1 new vector Rapids.exec("(, (rm py_4) (tmp= py_6 (append py_5 (dayOfWeek (cols_py py_5 \"Date\")) \"WeekDay\")))",session); Frame py_6 = DKV.getGet(Key.make("py_6")); py_4 = null; for( Vec vec : py_6.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key7 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key6 + 1, key7); // Exactly 1 new vector Rapids.exec("(, (rm py_5) (tmp= py_7 (append py_6 (hour (cols_py py_6 \"Date\")) \"HourOfDay\")))",session); Frame py_7 = DKV.getGet(Key.make("py_7")); py_5 = null; for( Vec vec : py_7.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key8 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key7 + 1, key8); // Exactly 1 new vector // A more involved expression; lots of internal temps Rapids.exec("(, (rm py_6) (tmp= py_8 (append py_7 (| (== (cols_py py_7 \"WeekDay\") \"Sun\") (== (cols_py py_7 \"WeekDay\") \"Sat\")) \"Weekend\")))",session); Frame py_8 = DKV.getGet(Key.make("py_8")); py_6 = null; for( Vec vec : py_8.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key9 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key8 + 3, key9); // Exactly 3 new vectors, one for each of {==, ==, |} // A more involved expression; lots of internal temps Rapids.exec("(, (rm py_7) (tmp= py_9 (append py_8 (cut (cols_py py_8 \"Month\") [0 2 5 7 10 12] [\"Winter\" \"Spring\" \"Summer\" \"Autumn\" \"Winter\"] FALSE TRUE 3) \"Season\")))",session); Frame py_9 = DKV.getGet(Key.make("py_9")); py_7 = null; for( Vec vec : py_9.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted int key10 = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key9 + 1, key10); // Exactly 1 new vector, despite lots of internal vecs // Drop a column Rapids.exec("(, (rm py_8) (tmp= py_10 (cols py_9 -3)))",session); Frame py_10 = DKV.getGet(Key.make("py_10")); py_8 = null; for( Vec vec : py_10.vecs() ) vec.mean(); // Verify we can compute rollups on all cols; will crash if some cols are deleted key_tmp = DKV.<Vec.VectorGroup>getGet(vg._key).len(); // Pull latest value from DKV (no caching allowed) Assert.assertEquals(key10,key_tmp); // No new vectors // End the session; freeing all resources session.end(null); // NO FINALLY FRAME DELETES HERE PLEASE... // Session ending should clean up; if it does not we need to detect the leak } }