package hex.createframe.postprocess;
import hex.createframe.CreateFramePostprocessStep;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.util.RandomUtils;
import java.util.Random;
/**
 * This action randomly injects missing values into the dataframe.
 */
public class MissingInserterCfps extends CreateFramePostprocessStep {
  /** Fraction of values to be converted into NAs; stays 0 when the no-arg constructor is used. */
  private double p;

  /** Creates a step with {@code p == 0}, for which {@link #exec(Frame, Random)} does nothing. */
  public MissingInserterCfps() {}

  /**
   * @param p Fraction of values to be converted into NAs, expected to be in the range [0, 1).
   */
  public MissingInserterCfps(double p) {
    assert p >= 0 && p < 1 : "p should be in the range [0, 1), got " + p;
    this.p = p;
  }

  /**
   * Execute this post-processing step.
   *
   * @param fr  frame whose values are to be replaced with NAs
   * @param rng random generator; a single long is drawn from it to seed the task
   */
  @Override
  public void exec(Frame fr, Random rng) {
    // No need to do anything if p == 0
    if (p > 0) {
      new InsertNAs(p, rng).doAll(fr);
    }
  }

  /**
   * Task that does the actual job of injecting missing values.
   *
   * Typically the fraction p of values to be replaced with missings is fairly small, therefore it is inefficient
   * to visit each value individually and decide whether to flip it to NA based on comparing a uniform random number
   * against p. Instead we rely on the fact that the distribution of gaps between "successes" in a Bernoulli
   * experiment follows the Geometric(p) distribution. Drawing from such a distribution is also easy: if u is uniform
   * on (0,1) then <code>floor(log(u)/log(1-p))</code> is geometric with parameter p.
   */
  private static class InsertNAs extends MRTask<InsertNAs> {
    /** Base seed, drawn once from the caller's generator; combined with column index and chunk start in map(). */
    private long seed;
    /** Probability that any single value is replaced with NA. */
    private double p;

    /**
     * @param prob   probability of replacing a value with NA
     * @param random generator from which the base seed is drawn
     */
    public InsertNAs(double prob, Random random) {
      p = prob;
      seed = random.nextLong();
    }

    /**
     * Walks every chunk in {@code cs}, jumping from one NA position to the next by geometric gaps.
     *
     * @param cs chunks to modify; all are traversed using the length of {@code cs[0]}
     */
    @Override
    public void map(Chunk[] cs) {
      int numRows = cs[0]._len;
      long chunkStart = cs[0].start();
      double denom = Math.log(1 - p);
      // denom must be strictly negative for the gap formula to yield non-negative gaps. It is 0 when p is
      // smaller than double resolution (1 - p == 1.0) and NaN for p > 1; in both cases no value is selected.
      if (!(denom < 0)) {
        return;
      }
      Random rng = RandomUtils.getRNG(0);
      for (int i = 0; i < cs.length; i++) {
        // Per-column, per-chunk seed. The int product i * 35602489 wraps for i >= 61; it is intentionally
        // left as-is so that frames generated from a given seed remain unchanged.
        rng.setSeed(seed + i * 35602489 + chunkStart * 47582);
        int l = 0;
        while (true) {
          // Keep the gap as a double: it may be huge (tiny p) or +Infinity (nextDouble() == 0), and casting
          // such a value to int saturates at Integer.MAX_VALUE, which would overflow l into a negative index.
          double gap = Math.floor(Math.log(rng.nextDouble()) / denom);
          if (!(gap < numRows - l)) {
            break;
          }
          // Here 0 <= gap < numRows - l, so the cast is exact and the resulting index is within the chunk.
          l += (int) gap;
          cs[i].set(l++, Double.NaN);
        }
      }
    }
  }
}