package hex.createframe;
import hex.createframe.recipes.OriginalCreateFrameRecipe;
import org.junit.BeforeClass;
import org.junit.Test;
import water.TestUtil;
import water.api.schemas4.input.CreateFrameOriginalIV4;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;
import static org.junit.Assert.*;
/**
* Test for the {@link OriginalCreateFrameRecipe} class (and the overall {@link CreateFrameExecutor} mechanism).
*/
public class OriginalCreateFrameRecipeTest extends TestUtil {

  /** Waits until the test cloud has (at least) one node before any test runs. */
  @BeforeClass() public static void setup() {
    stall_till_cloudsize(1);
  }

  /**
   * Simple initial test: verify that the random frame can be created, that it has the correct
   * dimensions and column names (response, C1, C2, C3, ...).
   * <p>
   * The frame is expected to contain {@code s.cols + 1} columns: the response at index 0 followed
   * by {@code s.cols} data columns, all of whose names are checked.
   */
  @Test public void basicTest() {
    CreateFrameOriginalIV4 s = new CreateFrameOriginalIV4().fillFromImpl();
    s.rows = (int)(Math.random() * 200) + 50;
    s.cols = (int)(Math.random() * 10) + 5;
    s.categorical_fraction = 0.1;
    s.integer_fraction = 1 - s.categorical_fraction;
    s.binary_fraction = 0;
    s.factors = 4;
    s.response_factors = 2;
    s.positive_response = false;
    s.has_response = true;
    s.seed = 1234;

    Frame frame = null;
    try {
      OriginalCreateFrameRecipe cf = s.createAndFillImpl();
      frame = cf.exec().get();
      assertNotNull(frame);
      assertEquals(s.cols + 1, frame.numCols());
      assertEquals(s.rows, frame.numRows());
      assertEquals("response", frame.name(0));
      // Data columns occupy indices 1..s.cols inclusive (index 0 is the response),
      // so the upper bound must be inclusive to cover the last column as well.
      for (int i = 1; i <= s.cols; i++)
        assertEquals("C" + i, frame.name(i));
      Log.info(frame.toString());
    } finally {
      // Always remove the frame, even when an assertion above failed.
      if (frame != null) frame.delete();
    }
  }

  /**
   * Creates frame with binary columns, and test that the <code>binary_ones_fraction</code> setting is respected.
   * This test is non-deterministic and may fail with probability 0.001%.
   * <p>
   * The total number of 1s over all non-response columns is compared against the expectation
   * {@code N * p}, scaled by {@code sqrt(N * p * (1 - p))}; the test fails when the deviation
   * reaches 4.417 of these units.
   */
  @Test public void binaryFrameTest() {
    CreateFrameOriginalIV4 s = new CreateFrameOriginalIV4().fillFromImpl();
    s.rows = 25000;
    s.cols = 6;
    s.categorical_fraction = 0;
    s.integer_fraction = 0;
    s.binary_fraction = 1;
    s.binary_ones_fraction = 0.2;
    s.missing_fraction = 0;
    s.has_response = true;
    s.response_factors = 2;  // binomial response

    Frame frame = null;
    try {
      frame = s.createAndFillImpl().exec().get();
      assertNotNull(frame);
      assertEquals("response", frame.name(0));
      assertEquals(s.cols + 1, frame.numCols());
      assertEquals(s.rows, frame.numRows());

      long totalCount = 0;
      for (int i = 0; i < s.cols + 1; i++) {
        assertTrue(frame.vec(i).isBinary());
        if (i > 0)  // response column is skipped because its proportion of 1s is always 0.5
          totalCount += Math.round(s.rows * frame.vec(i).mean());
      }

      // Multiply in floating point so that the product cannot overflow integer arithmetic.
      double N = (double) s.rows * s.cols;
      double p = s.binary_ones_fraction;
      double ttest = Math.abs(totalCount - N * p) / Math.sqrt(N * p * (1 - p));
      assertTrue("Count of 1s is more than 4.417 sigmas away from the expected value: t = " + ttest,
          ttest < 4.417);
    } finally {
      // Always remove the frame, even when an assertion above failed.
      if (frame != null) frame.delete();
    }
  }

  /**
   * Test that the produced number of missing values is the same as requested.
   * <p>
   * NAs are counted over every column of the frame (including the response), and the total is
   * compared against {@code N * missing_fraction} using the same 4.417-sigma criterion as in
   * {@link #binaryFrameTest()}. No seed is set here, so the outcome is statistical.
   */
  @Test public void missingValuesTest() {
    CreateFrameOriginalIV4 s = new CreateFrameOriginalIV4().fillFromImpl();
    s.rows = 25000;
    s.cols = 4;
    s.categorical_fraction = 0;
    s.integer_fraction = 0;
    s.binary_fraction = 0;
    s.string_fraction = 0;
    s.time_fraction = 0;
    s.missing_fraction = 0.1;
    s.has_response = true;
    s.response_factors = 1;

    Frame frame = null;
    try {
      frame = s.createAndFillImpl().exec().get();
      assertNotNull(frame);
      assertEquals(s.cols + 1, frame.numCols());
      assertEquals(s.rows, frame.numRows());

      long missingCount = 0;
      for (int i = 0; i < s.cols + 1; i++) {
        missingCount += frame.vec(i).naCnt();
      }

      // Multiply in floating point so that the product cannot overflow integer arithmetic.
      double N = (double) s.rows * (s.cols + 1);
      double p = s.missing_fraction;
      double ttest = Math.abs(missingCount - N * p) / Math.sqrt(N * p * (1 - p));
      assertTrue("Count of NAs is more than 4.417 sigmas away from the expected value: t = " + ttest,
          ttest < 4.417);
    } finally {
      // Always remove the frame, even when an assertion above failed.
      if (frame != null) frame.delete();
    }
  }

  /**
   * Test that columns of all types can be created, and that there is the correct number of each
   * in the resulting frame.
   * <p>
   * The fractions are deliberately not "round" numbers; the expected number of columns of each
   * type is the requested fraction of {@code s.cols}, rounded to the nearest integer.
   */
  @Test public void testAllColumnTypes() {
    CreateFrameOriginalIV4 s = new CreateFrameOriginalIV4().fillFromImpl();
    s.rows = 100;
    s.cols = 100;
    s.categorical_fraction = 0.10000000000001;
    s.integer_fraction = 0.099999999999998;
    s.binary_fraction = 0.10000000000003;
    s.time_fraction = 0.1200045762024587;
    s.string_fraction = 0.16000204587202;
    s.binary_ones_fraction = 0.1;
    s.factors = 5;
    s.response_factors = 5;  // response is also categorical
    s.positive_response = false;
    s.has_response = true;
    s.seed = 1234567;

    Frame frame = null;
    try {
      frame = s.createAndFillImpl().exec().get();
      assertNotNull(frame);
      assertEquals("response", frame.name(0));
      assertEquals(s.cols + 1, frame.numCols());
      assertEquals(s.rows, frame.numRows());

      // The "+ 1" accounts for the response column, which is categorical in this setup.
      assertEquals(Math.round(s.cols * s.categorical_fraction) + 1, countVecsOfType(frame, "enum"));
      assertEquals(Math.round(s.cols * s.time_fraction), countVecsOfType(frame, "time"));
      assertEquals(Math.round(s.cols * s.string_fraction), countVecsOfType(frame, "str"));
      assertEquals(Math.round(s.cols * s.integer_fraction), countVecsOfType(frame, "int"));
      assertEquals(Math.round(s.cols * s.binary_fraction), countVecsOfType(frame, "bool"));
      Log.info(frame.toString());
    } finally {
      // Always remove the frame, even when an assertion above failed.
      if (frame != null) frame.delete();
    }
  }

  /**
   * This test attempts to create the same dataset twice starting from the same seed, and then checks that
   * the result came out exactly the same both times.
   * We also verify that the test frame has multiple chunks, since most of the breakages will happen because of
   * nondeterministic chunk execution.
   * <p>
   * The seed itself is random and is logged, so that a failing run can be reproduced.
   */
  @Test public void testReproducibility() {
    CreateFrameOriginalIV4 s = new CreateFrameOriginalIV4().fillFromImpl();
    s.rows = 5000;
    s.cols = 20;
    s.time_fraction = 0.1;
    s.categorical_fraction = 0.2;
    s.integer_fraction = 0.2;
    s.binary_fraction = 0.2;
    s.string_fraction = 0.1;
    s.missing_fraction = 0.05;
    s.has_response = false;
    s.seed = (long)(Math.random() * 100000000000L);
    Log.info("Using seed " + s.seed);

    Frame frame1 = null;
    Frame frame2 = null;
    try {
      frame1 = s.createAndFillImpl().exec().get();
      assertNotNull(frame1);
      Log.info(frame1.toString());
      assertTrue("Please adjust test parameters to have more than 1 chunk in the frame", frame1.vec(0).nChunks() > 1);

      frame2 = s.createAndFillImpl().exec().get();
      assertNotNull(frame2);
      assertTrue(isBitIdentical(frame1, frame2));
    } finally {
      // Remove whichever frames were created, even when the test failed part-way through.
      if (frame1 != null) frame1.delete();
      if (frame2 != null) frame2.delete();
    }
  }

  /**
   * Counts the columns of the given frame that belong to the requested type category.
   *
   * @param fr   frame whose vecs are inspected
   * @param type one of {@code "enum"}, {@code "time"}, {@code "str"}, {@code "int"} or {@code "bool"};
   *             {@code "int"} matches integer vecs that are neither time, categorical nor binary
   * @return number of vecs in {@code fr} matching {@code type}
   * @throws IllegalArgumentException if {@code type} is not one of the supported tags
   */
  private static int countVecsOfType(Frame fr, String type) {
    int count = 0;
    for (Vec v : fr.vecs()) {
      boolean test;
      switch (type) {
        case "enum": test = v.isCategorical(); break;
        case "time": test = v.isTime(); break;
        case "str":  test = v.isString(); break;
        case "int":  test = v.isInt() && !v.isTime() && !v.isCategorical() && !v.isBinary(); break;
        case "bool": test = v.isBinary(); break;
        // Fail loudly on a mistyped tag instead of silently reporting zero matches.
        default: throw new IllegalArgumentException("Unknown column type: " + type);
      }
      if (test) count++;
    }
    return count;
  }
}