// PCAScore.java example
//
// Explorer
// h2o-2-master
package hex.pca;

import hex.FrameTask;
import hex.FrameTask.DataInfo;
import water.Job;
import water.Job.FrameJob;
import water.Key;
import water.api.DocGen;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.RString;

/**
 * Principal Components Scoring
 * This algorithm maps a dataset into the subspace generated by the principal components.
 * If A = dataset to be scored, and B = eigenvector matrix (rows = features, cols = components),
 * then the score is simply A * B, assuming the column features match up exactly.
 * <a href = "http://en.wikipedia.org/wiki/Principal_component_analysis">PCA on Wikipedia</a>
 * @author anqi_fu
 *
 */
public class PCAScore extends FrameJob {
  static final int API_WEAVER = 1;
  static public DocGen.FieldDoc[] DOC_FIELDS;
  static final String DOC_GET = "pca_score";

  @API(help = "PCA model to use for scoring", required = true, filter = Default.class)
  PCAModel model;

  @API(help = "Number of principal components to return", filter = Default.class, lmin = 1, lmax = 5000)
  int num_pc = 1;

  /**
   * Scores {@code source} against {@code model} and stores the result under {@code destination_key}.
   *
   * Steps, in order: an empty frame is created at {@code destination_key} and locked for this job;
   * {@code source} is adapted to the model; a {@link PCAScoreTask} is run over the adapted frame
   * producing {@code num_pc} output columns; the output frame (columns named {@code PC0},
   * {@code PC1}, ...) is written to {@code destination_key} and unlocked.
   */
  @Override protected void execImpl() {
    // Note: Source data MUST contain all features (matched by name) used to build PCA model!
    // If additional columns exist in source, they are automatically ignored in scoring
    new Frame(destination_key, new String[0], new Vec[0]).delete_and_lock(self());
    // Only the first frame returned by adapt() is used here.
    // NOTE(review): any further frames returned by adapt() are ignored -- confirm nothing needs cleanup.
    Frame fr = model.adapt(source, true)[0];
    int nfeat = model._names.length;
    // The model's own normSub/normMul are passed on so scoring reuses the model's normalization values.
    DataInfo dinfo = new DataInfo(fr, 0, false, false, model.normSub, model.normMul, DataInfo.TransformType.STANDARDIZE, null, null);

    PCAScoreTask tsk = new PCAScoreTask(this, dinfo, nfeat, num_pc, model.eigVec);
    tsk.doAll(num_pc, dinfo._adaptedFrame);

    // One numeric output column per requested component; domains stay null (array default).
    String[] names = new String[num_pc];
    String[][] domains = new String[num_pc][];
    for(int i = 0; i < num_pc; i++)
      names[i] = "PC" + i;
    tsk.outputFrame(destination_key, names, domains).unlock(self());
  }

  /**
   * Validates the job arguments after the superclass initialization.
   *
   * {@code num_pc} must be at least 1 and, when a model is set, no larger than the model's
   * {@code num_pc}. The lower bound is checked here as well as by the {@code lmin} request filter,
   * because a value assigned to the field directly is otherwise unchecked and would fail later
   * when the output arrays are allocated. A null model is tolerated, as before.
   *
   * @throws IllegalArgumentException if {@code num_pc} is out of range
   */
  @Override protected void init() {
    super.init();
    if(model == null) {
      if(num_pc < 1)
        throw new IllegalArgumentException("Argument 'num_pc' must be at least 1");
      return;
    }
    if(num_pc < 1 || num_pc > model.num_pc)
      throw new IllegalArgumentException("Argument 'num_pc' must be between 1 and " + model.num_pc);
  }

  /* @Override public float progress() {
    ChunkProgress progress = UKV.get(progressKey());
    return (progress != null ? progress.progress() : 0);
  } */

  /**
   * Builds an HTML link to the PCAScore query page using {@code "model"} as the key parameter name.
   *
   * @param modelKey key placed in the query string
   * @param content  text shown inside the anchor
   * @return the anchor element as a string
   */
  public static String link(Key modelKey, String content) {
    return link("model", modelKey, content);
  }

  /**
   * Builds an HTML link to the PCAScore query page.
   *
   * @param key_param name of the query-string parameter
   * @param k         key whose {@code toString()} is substituted for the parameter value
   * @param content   text shown inside the anchor
   * @return the anchor element as a string
   */
  public static String link(String key_param, Key k, String content) {
    RString rs = new RString("<a href='/2/PCAScore.query?%key_param=%$key'>%content</a>");
    rs.replace("key_param", key_param);
    rs.replace("key", k.toString());
    rs.replace("content", content);
    return rs.toString();
  }

  // Matrix multiplication A * B, where A is a skinny matrix (# rows >> # cols) and B is a
  // small matrix that fits on a single node. For PCA scoring, the cols of A (rows of B) are
  // the features of the input dataset, while the cols of B are the principal components.
  public static class PCAScoreTask extends FrameTask<PCAScoreTask> {
    final int _nfeat;         // number of features (stored only; not read by processRow)
    final int _ncomp;         // number of principal components (<= nfeat)
    final double[][] _eigvec; // eigenvector matrix, indexed [feature row][component]

    /**
     * @param job    job whose key is handed to the FrameTask superclass
     * @param dinfo  data layout handed to the FrameTask superclass
     * @param nfeat  number of features
     * @param ncomp  number of principal components to compute per row
     * @param eigvec eigenvector matrix, indexed [feature row][component]
     */
    public PCAScoreTask(Job job, DataInfo dinfo, int nfeat, int ncomp, double[][] eigvec) {
      super(job.self(), dinfo);
      _nfeat = nfeat;
      _ncomp = ncomp;
      _eigvec = eigvec;
    }

    // Note: Rows with NAs (missing values) are automatically skipped!
    /**
     * Computes one row of the score matrix and appends one value to each of the first
     * {@code _ncomp} output chunks.
     *
     * For component {@code c} the value is the sum of {@code _eigvec[cats[d]][c]} over the
     * {@code ncats} categorical entries plus the sum of {@code nums[d] * _eigvec[numStart + d][c]}
     * over the numeric entries, where {@code numStart} is {@code _dinfo.numStart()}.
     */
    @Override protected void processRow(long gid, double[] nums, int ncats, int[] cats, double[] response, NewChunk[] outputs) {
      for(int c = 0; c < _ncomp; c++) {
        double x = 0;
        // Categorical entries contribute the eigenvector row they select with an implicit weight of 1.
        // NOTE(review): presumably cats[d] is already an index into the expanded row space -- confirm against DataInfo.
        for(int d = 0; d < ncats; d++)
          x += _eigvec[cats[d]][c];
        // Numeric entries use consecutive eigenvector rows starting at numStart().
        int k = _dinfo.numStart();
        for(int d = 0; d < nums.length; d++)
          x += nums[d]*_eigvec[k++][c];
        // The numeric rows must run exactly to the end of the eigenvector matrix.
        assert k == _eigvec.length;
        outputs[c].addNum(x);
      }
    }
  }
}