package hex.pca;
import hex.FrameTask;
import hex.FrameTask.DataInfo;
import water.Job;
import water.Job.FrameJob;
import water.Key;
import water.api.DocGen;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.RString;
/**
* Principal Components Scoring
* This algorithm maps a dataset into the subspace generated by the principal components.
* If A = dataset to be scored, and B = eigenvector matrix (rows = features, cols = components),
* then the score is simply A * B, assuming the column features match up exactly.
* <a href = "http://en.wikipedia.org/wiki/Principal_component_analysis">PCA on Wikipedia</a>
* @author anqi_fu
*
*/
public class PCAScore extends FrameJob {
  // NOTE(review): the three static members below look like hooks for the request/documentation
  // framework (presumably looked up by name) - confirm before renaming or removing any of them.
  static final int API_WEAVER = 1;
  static public DocGen.FieldDoc[] DOC_FIELDS;
  static final String DOC_GET = "pca_score";

  @API(help = "PCA model to use for scoring", required = true, filter = Default.class)
  PCAModel model;

  @API(help = "Number of principal components to return", filter = Default.class, lmin = 1, lmax = 5000)
  int num_pc = 1;

  /**
   * Projects {@code source} onto the first {@code num_pc} principal components of {@code model}
   * and stores the result under {@code destination_key}.
   *
   * Steps: create and lock an empty placeholder frame at the destination, adapt the source frame
   * to the model, run a {@link PCAScoreTask} producing {@code num_pc} output columns, then publish
   * the output frame (columns named "PC0" .. "PC(num_pc-1)") and unlock it.
   */
  @Override protected void execImpl() {
    // Note: Source data MUST contain all features (matched by name) used to build PCA model!
    // If additional columns exist in source, they are automatically ignored in scoring
    new Frame(destination_key, new String[0], new Vec[0]).delete_and_lock(self());
    Frame fr = model.adapt(source, true)[0];
    int nfeat = model._names.length;

    // Reuse the model's own normalization vectors when preparing the data.
    DataInfo dinfo = new DataInfo(fr, 0, false, false, model.normSub, model.normMul, DataInfo.TransformType.STANDARDIZE, null, null);
    PCAScoreTask tsk = new PCAScoreTask(this, dinfo, nfeat, num_pc, model.eigVec);
    tsk.doAll(num_pc, dinfo._adaptedFrame);

    // Output column names are zero-based: PC0, PC1, ...
    String[] names = new String[num_pc];
    for (int i = 0; i < num_pc; i++) {
      names[i] = "PC" + i;
    }
    // Every entry of 'domains' is left at its default null value.
    String[][] domains = new String[num_pc][];
    tsk.outputFrame(destination_key, names, domains).unlock(self());
  }

  /**
   * Validates the request arguments after the superclass has run its own checks.
   *
   * When a model is set, {@code num_pc} must lie in the range [1, model.num_pc]. Both bounds are
   * enforced here so that a non-positive value fails with a clear message instead of producing an
   * empty frame (0) or a NegativeArraySizeException later in execImpl() (negative values).
   *
   * @throws IllegalArgumentException if a model is set and {@code num_pc} is outside that range
   */
  @Override protected void init() {
    super.init();
    if (model != null && (num_pc < 1 || num_pc > model.num_pc)) {
      throw new IllegalArgumentException("Argument 'num_pc' must be between 1 and " + model.num_pc);
    }
  }

  // NOTE(review): progress reporting override is currently disabled; kept for reference.
  /* @Override public float progress() {
    ChunkProgress progress = UKV.get(progressKey());
    return (progress != null ? progress.progress() : 0);
  } */

  /**
   * Builds an HTML link to the PCAScore query page with the given key passed as the "model"
   * parameter.
   *
   * @param modelKey key placed in the query string
   * @param content  text shown inside the anchor
   * @return the anchor markup
   */
  public static String link(Key modelKey, String content) {
    return link("model", modelKey, content);
  }

  /**
   * Builds an HTML link to the PCAScore query page, passing {@code k} under the query parameter
   * named {@code key_param}.
   *
   * @param key_param name of the query parameter
   * @param k         key whose string form becomes the parameter value
   * @param content   text shown inside the anchor
   * @return the anchor markup produced by filling the template
   */
  public static String link(String key_param, Key k, String content) {
    // NOTE(review): the "%$key" placeholder form presumably requests URL-encoding in RString - confirm.
    RString rs = new RString("<a href='/2/PCAScore.query?%key_param=%$key'>%content</a>");
    rs.replace("key_param", key_param);
    rs.replace("key", k.toString());
    rs.replace("content", content);
    return rs.toString();
  }

  // Matrix multiplication A * B, where A is a skinny matrix (# rows >> # cols) and B is a
  // small matrix that fits on a single node. For PCA scoring, the cols of A (rows of B) are
  // the features of the input dataset, while the cols of B are the principal components.
  public static class PCAScoreTask extends FrameTask<PCAScoreTask> {
    final int _nfeat;         // number of features (stored, but not read by processRow)
    final int _ncomp;         // number of principal components (<= nfeat)
    final double[][] _eigvec; // eigenvector matrix, indexed as [feature row][component]

    /**
     * @param job    job whose key is handed to the FrameTask superclass
     * @param dinfo  data layout handed to the FrameTask superclass
     * @param nfeat  number of features
     * @param ncomp  number of principal components to compute per row
     * @param eigvec eigenvector matrix (rows = features, cols = components)
     */
    public PCAScoreTask(Job job, DataInfo dinfo, int nfeat, int ncomp, double[][] eigvec) {
      super(job.self(), dinfo);
      _nfeat = nfeat;
      _ncomp = ncomp;
      _eigvec = eigvec;
    }

    /**
     * Computes the scores of one row and appends score {@code c} to {@code outputs[c]}.
     *
     * For each component, the score is the sum of the eigenvector entries at the row indices in
     * {@code cats[0..ncats)} plus the dot product of {@code nums} with the eigenvector rows
     * starting at {@code _dinfo.numStart()}.
     */
    // Note: Rows with NAs (missing values) are automatically skipped!
    @Override protected void processRow(long gid, double[] nums, int ncats, int[] cats, double[] response, NewChunk[] outputs) {
      for (int c = 0; c < _ncomp; c++) {
        double x = 0;
        // Categorical part: each cats[d] indexes an eigenvector row directly, with weight 1.
        for (int d = 0; d < ncats; d++) {
          x += _eigvec[cats[d]][c];
        }
        // Numeric part: eigenvector rows for numeric columns start at numStart().
        int k = _dinfo.numStart();
        for (int d = 0; d < nums.length; d++) {
          x += nums[d] * _eigvec[k++][c];
        }
        // The numeric columns must consume exactly the remaining eigenvector rows.
        assert k == _eigvec.length;
        outputs[c].addNum(x);
      }
    }
  }
}