// MissingInserterCfps.java example
//
// Explorer
// h2o-3-master
package hex.createframe.postprocess;

import hex.createframe.CreateFramePostprocessStep;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.util.RandomUtils;

import java.util.Random;


/**
 * This action randomly injects missing values into the dataframe.
 */
public class MissingInserterCfps extends CreateFramePostprocessStep {
  /** Fraction of values to be converted into NAs. */
  private double p;


  /** Creates a step with {@code p} left at 0, for which {@link #exec} does nothing. */
  public MissingInserterCfps() {}

  /**
   * @param p Fraction of values to be converted into NAs; expected to be in the range [0, 1).
   *          The range is verified with an {@code assert}, so it is only enforced when assertions are enabled.
   */
  public MissingInserterCfps(double p) {
    assert p >= 0 && p < 1 : "p should be in the range [0, 1), got " + p;
    this.p = p;
  }

  /**
   * Execute this post-processing step.
   *
   * @param fr  Frame whose values will be overwritten with NAs in place.
   * @param rng Source of randomness; a single {@code long} is drawn from it to seed the task.
   */
  @Override
  public void exec(Frame fr, Random rng) {
    // No need to do anything if p == 0
    if (p > 0)
      new InsertNAs(p, rng).doAll(fr);
  }

  /**
   * Task that does the actual job of inserting missing values.
   *
   * Typically the fraction p of values to be replaced with missings is fairly small, therefore it is inefficient
   * to visit each value individually and decide whether to flip it to NA based on comparing a uniform random number
   * against p. Instead we rely on the fact that the distribution of gaps between "successes" in a bernoulli experiment
   * follows the Geometric(p) distribution. Drawing from such distribution is also easy: if u is uniform on (0,1) then
   * <code>floor(log(u)/log(1-p))</code> is geometric with parameter p.
   */
  private static class InsertNAs extends MRTask<InsertNAs> {
    private long seed;
    private double p;

    /**
     * @param prob   Probability with which each individual value is replaced by an NA.
     * @param random Generator from which the task's base seed is drawn.
     */
    public InsertNAs(double prob, Random random) {
      p = prob;
      seed = random.nextLong();
    }

    /**
     * Walks every column chunk, jumping from one NA position to the next by geometrically distributed gaps.
     *
     * @param cs Chunks of all columns; the row count and start offset are taken from {@code cs[0]}.
     */
    @Override
    public void map(Chunk[] cs) {
      int numRows = cs[0]._len;
      long chunkStart = cs[0].start();
      double denom = Math.log(1 - p);
      Random rng = RandomUtils.getRNG(0);
      for (int i = 0; i < cs.length; i++) {
        // Re-seed from the task seed, the column index and the chunk's start offset, so that the values drawn
        // here are a function of those three quantities only. (The column term is int arithmetic and is kept
        // exactly as is, so that previously generated seeded frames remain reproducible.)
        rng.setSeed(seed + i * 35602489 + chunkStart * 47582);
        int row = 0;
        while (true) {
          // Number of values to skip before the next NA. The double-to-int cast saturates at Integer.MAX_VALUE
          // when the quotient is huge (tiny p, or nextDouble() returning exactly 0.0).
          int gap = (int) Math.floor(Math.log(rng.nextDouble()) / denom);
          // Compare against the rows that are left instead of testing "row + gap < numRows": the sum could
          // overflow to a negative index once row > 0 and gap has saturated.
          if (gap >= numRows - row)
            break;
          row += gap;
          cs[i].set(row++, Double.NaN);
        }
      }
    }
  }

}