package hex.createframe.recipes;
import hex.createframe.CreateFrameExecutor;
import hex.createframe.CreateFrameRecipe;
import hex.createframe.columns.*;
import hex.createframe.postprocess.MissingInserterCfps;
import hex.createframe.postprocess.ShuffleColumnsCfps;
/**
* Similar to {@link OriginalCreateFrameRecipe}, except that this recipe
* requires to specify the number of columns of each type explicitly (not
* as fractions). It also uses different naming scheme, so that columns of
* different types have names according to that type: integer columns are
* {@code I1, I2, ...}, binary are {@code B1, B2, ...}, and so on.
*/
public class SimpleCreateFrameRecipe extends CreateFrameRecipe<SimpleCreateFrameRecipe> {
public int nrows = 100;
public int ncols_real = 0;
public int ncols_int = 0;
public int ncols_enum = 0;
public int ncols_bool = 0;
public int ncols_str = 0;
public int ncols_time = 0;
public double real_lb = -100;
public double real_ub = 100;
public int int_lb = -100;
public int int_ub = 100;
public int enum_nlevels = 10;
public double bool_p = 0.3;
public long time_lb = 365L * 24 * 3600 * 1000 * (2000 - 1970); // ~ 2000-01-01
public long time_ub = 365L * 24 * 3600 * 1000 * (2020 - 1970); // ~ 2020-01-01
public int str_length = 8;
public double missing_fraction = 0;
public ResponseType response_type = ResponseType.NONE;
public double response_lb = 0;
public double response_ub = 10;
public double response_p = 0.6;
public int response_nlevels = 25;
public enum ResponseType {
NONE, REAL, INT, ENUM, BOOL, TIME
}
protected void checkParametersValidity() {
check(nrows > 0, "Number of rows must be greater than 0");
check(ncols_real >= 0, "Number of real columns cannot be negative");
check(ncols_int >= 0, "Number of integer columns cannot be negative");
check(ncols_bool >= 0, "Number of bool (binary) columns cannot be negative");
check(ncols_enum >= 0, "Number of enum (categorical) columns cannot be negative");
check(ncols_str >= 0, "Number of string columns cannot be negative");
check(ncols_time >= 0, "Number of time columns cannot be negative");
check(!Double.isNaN(real_lb), "Real range's lower bound cannot be NaN");
check(!Double.isNaN(real_ub), "Real range's upper bound cannot be NaN");
check(!Double.isInfinite(real_lb), "Real range's lower bound cannot be infinite");
check(!Double.isInfinite(real_ub), "Real range's upper bound cannot be infinite");
check(real_lb <= real_ub, "Invalid real range interval: lower bound exceeds the upper bound");
check(int_lb <= int_ub, "Invalid integer range interval: lower bound exceeds the upper bound");
check(!Double.isNaN(bool_p), "Boolean frequency parameter cannot be NaN");
check(bool_p >= 0 && bool_p <= 1, "Boolean frequency parameter must be in the range 0..1");
check(time_lb <= time_ub, "Invalid time range interval: lower bound exceeds the upper bound");
check(enum_nlevels > 0, "Number of levels for enum (categorical) columns must be positive");
check(str_length > 0, "Length of string values should be positive");
check(!Double.isNaN(missing_fraction), "Missing fraction cannot be NaN");
check(missing_fraction >= 0 && missing_fraction <= 1, "Missing fraction must be in the range 0..1");
check(!Double.isNaN(response_lb), "Response column's lower bound cannot be NaN");
check(!Double.isNaN(response_ub), "Response column's upper bound cannot be NaN");
check(!Double.isInfinite(response_lb), "Response column's lower bound cannot be infinite");
check(!Double.isInfinite(response_ub), "Response column's upper bound cannot be infinite");
check(response_lb <= response_ub, "Invalid interval for response column: lower bound exceeds the upper bound");
check(!Double.isNaN(response_p), "Response binary frequency parameter (response_p) cannot be NaN");
check(response_p >= 0 && response_p <= 1, "Response binary frequency (response_p) should be in the range 0..1");
check(response_nlevels >= 2, "Number of categorical levels for the response column must be 2 or more");
}
protected void buildRecipe(CreateFrameExecutor cfe) {
cfe.setSeed(seed);
cfe.setNumRows(nrows);
switch (response_type) {
case REAL:
cfe.addColumnMaker(new RealColumnCfcm("response", response_lb, response_ub));
break;
case INT:
cfe.addColumnMaker(new IntegerColumnCfcm("response", (int)response_lb, (int)response_ub));
break;
case ENUM:
cfe.addColumnMaker(new CategoricalColumnCfcm("response", response_nlevels));
break;
case BOOL:
cfe.addColumnMaker(new BinaryColumnCfcm("response", response_p));
break;
case TIME:
cfe.addColumnMaker(new TimeColumnCfcm("response", (long)response_lb, (long)response_ub));
break;
}
for (int i = 1; i <= ncols_real; i++)
cfe.addColumnMaker(new RealColumnCfcm("R" + i, real_lb, real_ub));
for (int i = 1; i <= ncols_int; i++)
cfe.addColumnMaker(new IntegerColumnCfcm("I" + i, int_lb, int_ub));
for (int i = 0; i < ncols_enum; i++)
cfe.addColumnMaker(new CategoricalColumnCfcm("E" + i, enum_nlevels));
for (int i = 1; i <= ncols_bool; i++)
cfe.addColumnMaker(new BinaryColumnCfcm("B" + i, bool_p));
for (int i = 0; i < ncols_time; i++)
cfe.addColumnMaker(new TimeColumnCfcm("T" + i, time_lb, time_ub));
for (int i = 0; i < ncols_str; i++)
cfe.addColumnMaker(new StringColumnCfcm("S" + i, str_length));
cfe.addPostprocessStep(new MissingInserterCfps(missing_fraction));
cfe.addPostprocessStep(new ShuffleColumnsCfps(true, true));
}
}