package hex.glrm;
import hex.*;
import hex.genmodel.algos.glrm.GlrmInitialization;
import hex.genmodel.algos.glrm.GlrmLoss;
import hex.genmodel.algos.glrm.GlrmRegularizer;
import hex.svd.SVDModel.SVDParameters;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.*;
import java.util.ArrayList;
/**
* GLRM (<a href="https://web.stanford.edu/~boyd/papers/pdf/glrm.pdf">Generalized Low Rank Model</a>).
*
* The model seeks to represent an input frame A of dimensions m x n as a product of two smaller matrices X (m x k)
* and Y (k x n), yielding a rank-k approximation of A. To this end, the model solves the generic optimization problem
* Loss(A, XY) + Rx(X) + Ry(Y) -> min_{X,Y}
* in other words, it tries to find X and Y such that XY is close to A (as measured by the loss function), while also
* taking regularization constraints on X and Y into account.
*
* Note that the input frame A may have columns of different types, while the output matrices X and Y are always
* real-valued.
*
* The loss function is assumed to be separable across the elements of the matrix:
* Loss(A, XY) = Sum[L_{ij}(A_{ij}, x_i y_j) over i=1..m, j=1..n]
* The individual loss functions may differ for each element; however, our implementation assumes that the L_{ij}'s
* are constant across rows and may only vary by column. Thus, L_{ij} == L_j.
*
* The regularizers Rx and Ry are assumed to be separable across the rows of X and the columns of Y, respectively:
* Rx(X) = Sum[rx_i(x_i) for i=1..m]
* Ry(Y) = Sum[ry_j(y_j) for j=1..n]
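* For example, quadratic loss with no regularization recovers standard PCA, while quadratic loss with
* non-negativity constraints on X and Y gives non-negative matrix factorization (see the paper for more special cases).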
*
* The output of the model consists of matrices X and Y. There are multiple interpretations of these (see section 5.4
* in Boyd's paper). In particular,
* + The rows of Y (1 x n) can be interpreted as "idealized examples" of input rows (even though rows of Y are
* always real-valued, while rows of the input data may have any type). Thus, we call them *archetypes* in the code.
* + The rows of X (1 x k) provide an embedding of each original data row into a lower-dimensional space. Thus, we
* call them "representations" of the data.
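*
* A minimal usage sketch (assumes a training Frame "train" is already in the DKV; the parameter values are
* illustrative, not recommendations):
* <pre>{@code
* GLRMModel.GLRMParameters parms = new GLRMModel.GLRMParameters();
* parms._train = train._key;                 // training frame
* parms._k = 5;                              // rank of the factorization
* parms._loss = GlrmLoss.Quadratic;          // loss for numeric columns
* parms._multi_loss = GlrmLoss.Categorical;  // loss for categorical columns
* parms._regularization_x = GlrmRegularizer.Quadratic;
* parms._regularization_y = GlrmRegularizer.Quadratic;
* parms._gamma_x = 0.1;
* parms._gamma_y = 0.1;
* GLRMModel model = new GLRM(parms).trainModel().get();
* Frame reconstruction = model.scoreReconstruction(train, Key.<Frame>make("reconstr"), false);
* }</pre>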
*/
public class GLRMModel extends Model<GLRMModel, GLRMModel.GLRMParameters, GLRMModel.GLRMOutput>
implements Model.GLRMArchetypes {
//--------------------------------------------------------------------------------------------------------------------
// Input parameters
//--------------------------------------------------------------------------------------------------------------------
public static class GLRMParameters extends Model.Parameters {
@Override public String algoName() { return "GLRM"; }
@Override public String fullName() { return "Generalized Low Rank Modeling"; }
@Override public String javaName() { return GLRMModel.class.getName(); }
@Override public long progressUnits() { return 2 + _max_iterations; }
// Data transformation (demean to compare with PCA)
public DataInfo.TransformType _transform = DataInfo.TransformType.NONE;
public int _k = 1; // Rank of resulting XY matrix
public GlrmInitialization _init = GlrmInitialization.PlusPlus; // Initialization of Y matrix
public SVDParameters.Method _svd_method = SVDParameters.Method.Power; // SVD initialization method (for _init = SVD)
public Key<Frame> _user_y; // User-specified Y matrix (for _init = User)
public Key<Frame> _user_x; // User-specified X matrix (for _init = User)
public boolean _expand_user_y = true; // Should categorical columns in _user_y be expanded via one-hot encoding? (for _init = User)
// Loss functions
public GlrmLoss _loss = GlrmLoss.Quadratic; // Default loss function for numeric cols
public GlrmLoss _multi_loss = GlrmLoss.Categorical; // Default loss function for categorical cols
public int _period = 1; // Length of the period when _loss = Periodic
public GlrmLoss[] _loss_by_col; // Override default loss function for specific columns
public int[] _loss_by_col_idx; // Indices of the training-frame columns that the entries of _loss_by_col apply to
// Regularization functions
public GlrmRegularizer _regularization_x = GlrmRegularizer.None; // Regularization function for X matrix
public GlrmRegularizer _regularization_y = GlrmRegularizer.None; // Regularization function for Y matrix
public double _gamma_x = 0; // Regularization weight on X matrix
public double _gamma_y = 0; // Regularization weight on Y matrix
// Optional parameters
public int _max_iterations = 1000; // Max iterations
public int _max_updates = 2*_max_iterations; // Max number of updates (X or Y)
public double _init_step_size = 1.0; // Initial step size (decrease until we hit min_step_size)
public double _min_step_size = 1e-4; // Min step size
public String _representation_name;
public boolean _recover_svd = false; // Recover singular values and eigenvectors of XY at the end?
public boolean _impute_original = false; // Reconstruct original training data by reversing _transform?
public boolean _verbose = true; // Log when objective increases each iteration?
}
//--------------------------------------------------------------------------------------------------------------------
// Outputs
//--------------------------------------------------------------------------------------------------------------------
public static class GLRMOutput extends Model.Output {
// Number of iterations executed
public int _iterations;
// Number of updates executed
public int _updates;
// Current value of the objective function
public double _objective;
// Current value of step_size used
public double _step_size;
// Average change in objective function this iteration
public double _avg_change_obj;
public ArrayList<Double> _history_objective = new ArrayList<>();
// Mapping from lower dimensional k-space to training features (Y)
public TwoDimTable _archetypes;
public GLRM.Archetypes _archetypes_raw; // Needed for indexing into Y for scoring
// Step size each iteration
public ArrayList<Double> _history_step_size = new ArrayList<>();
// SVD of output XY
public double[/*feature*/][/*k*/] _eigenvectors_raw;
public TwoDimTable _eigenvectors;
public double[] _singular_vals;
// Frame key of X matrix
public String _representation_name;
public Key<Frame> _representation_key;
public Key<? extends Model> _init_key;
// Number of categorical and numeric columns
public int _ncats;
public int _nnums;
// Number of good rows in training frame (not skipped)
public long _nobs;
// Categorical offset vector
public int[] _catOffsets;
// Standardization arrays for numeric data columns
public double[] _normSub; // Mean of each numeric column
public double[] _normMul; // One over standard deviation of each numeric column
// Permutation array mapping adapted to original training col indices
public int[] _permutation; // _permutation[i] = j means col i in _adaptedFrame maps to col j of _train
// Expanded column names of adapted training frame
public String[] _names_expanded;
// Loss function for every column in adapted training frame
public GlrmLoss[] _lossFunc;
// Training time
public ArrayList<Long> _training_time_ms = new ArrayList<>();
// Total column variance for expanded and transformed data
public double _total_variance;
// Standard deviation of each principal component
public double[] _std_deviation;
// Importance of principal components
// Standard deviation, proportion of variance explained, and cumulative proportion of variance explained
public TwoDimTable _importance;
public GLRMOutput(GLRM b) { super(b); }
/** Override because the base class counts ncols-1 features, treating the last column as the response variable;
* for GLRM all of the columns are features. */
@Override public int nfeatures() { return _names.length; }
@Override public ModelCategory getModelCategory() {
return ModelCategory.DimReduction;
}
}
public GLRMModel(Key<GLRMModel> selfKey, GLRMParameters parms, GLRMOutput output) {
super(selfKey, parms, output);
}
@Override protected Futures remove_impl( Futures fs ) {
if (_output._init_key != null) _output._init_key.remove(fs);
if (_output._representation_key != null) _output._representation_key.remove(fs);
return super.remove_impl(fs);
}
@Override protected AutoBuffer writeAll_impl(AutoBuffer ab) {
ab.putKey(_output._init_key);
ab.putKey(_output._representation_key);
return super.writeAll_impl(ab);
}
@Override protected Keyed readAll_impl(AutoBuffer ab, Futures fs) {
ab.getKey(_output._init_key, fs);
ab.getKey(_output._representation_key, fs);
return super.readAll_impl(ab, fs);
}
@Override public GlrmMojoWriter getMojo() {
return new GlrmMojoWriter(this);
}
//--------------------------------------------------------------------------------------------------------------------
// Scoring
//--------------------------------------------------------------------------------------------------------------------
// GLRM scoring is data imputation based on feature domains using reconstructed XY (see Udell (2015), Section 5.3)
private Frame reconstruct(Frame orig, Frame adaptedFr, Key<Frame> destination_key, boolean save_imputed, boolean reverse_transform) {
int ncols = _output._names.length;
assert ncols == adaptedFr.numCols();
String prefix = "reconstr_";
// Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
// Note: A is adapted to original training frame, P has columns shuffled so cats come before nums!
Frame fullFrm = new Frame(adaptedFr);
Frame loadingFrm = DKV.get(_output._representation_key).get();
fullFrm.add(loadingFrm);
String[][] adaptedDomme = adaptedFr.domains();
Vec anyVec = fullFrm.anyVec();
assert anyVec != null;
for (int i = 0; i < ncols; i++) {
Vec v = anyVec.makeZero();
v.setDomain(adaptedDomme[i]);
fullFrm.add(prefix + _output._names[i], v);
}
GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);
// Return the imputed training frame
int x = ncols + _parms._k, y = fullFrm.numCols(); // imputed columns P occupy the range [x, y) of fullFrm
Frame f = fullFrm.extractFrame(x, y); // pull the imputed columns P out into their own frame
f = new Frame((destination_key == null ? Key.<Frame>make() : destination_key), f.names(), f.vecs());
DKV.put(f);
gs._mb.makeModelMetrics(GLRMModel.this, orig, null, null); // save error metrics based on imputed data
return f;
}
@Override protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key, Job j, boolean computeMetrics) {
return reconstruct(orig, adaptedFr, Key.<Frame>make(destination_key), true, _parms._impute_original);
}
@Override public Frame scoreReconstruction(Frame frame, Key<Frame> destination_key, boolean reverse_transform) {
Frame adaptedFr = new Frame(frame);
adaptTestForTrain(adaptedFr, true, false);
return reconstruct(frame, adaptedFr, destination_key, true, reverse_transform);
}
/**
* Project each archetype into the original feature space.
* @param frame Original training data with m rows and n columns
* @param destination_key Frame Id for output
* @param reverse_transform Whether to reverse the data transform (e.g. de-standardization) on the projected values
* @return Frame containing k rows and n columns, where each row corresponds to an archetype
*/
@Override public Frame scoreArchetypes(Frame frame, Key<Frame> destination_key, boolean reverse_transform) {
int ncols = _output._names.length;
Frame adaptedFr = new Frame(frame);
adaptTestForTrain(adaptedFr, true, false);
assert ncols == adaptedFr.numCols();
String[][] adaptedDomme = adaptedFr.domains();
double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];
// Categorical columns
for (int d = 0; d < _output._ncats; d++) {
double[][] block = _output._archetypes_raw.getCatBlock(d);
for (int k = 0; k < _parms._k; k++)
proj[k][_output._permutation[d]] = _output._lossFunc[d].mimpute(block[k]);
}
// Numeric columns
for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
int ds = d - _output._ncats;
for (int k = 0; k < _parms._k; k++) {
double num = _output._archetypes_raw.getNum(ds, k);
proj[k][_output._permutation[d]] = _output._lossFunc[d].impute(num);
if (reverse_transform)
proj[k][_output._permutation[d]] = proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
}
}
// Convert projection of archetypes into a frame with correct domains
Frame f = ArrayUtils.frame((destination_key == null ? Key.<Frame>make() : destination_key), adaptedFr.names(), proj);
for(int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
return f;
}
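// Scoring MRTask over the combined frame [A | X | P]: the first _ncolA chunks hold the adapted input data A,
// the next _ncolX chunks hold the representation X, and, when _save_imputed is set, the trailing _ncolA chunks
// receive the imputed reconstruction P. Per-row error metrics are accumulated in _mb.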
private class GLRMScore extends MRTask<GLRMScore> {
final int _ncolA; // Number of cols in original data A
final int _ncolX; // Number of cols in X (rank k)
final boolean _save_imputed; // Save imputed data into new vecs?
final boolean _reverse_transform; // Reconstruct original training data by reversing transform?
ModelMetricsGLRM.GlrmModelMetricsBuilder _mb;
GLRMScore( int ncolA, int ncolX, boolean save_imputed ) {
this(ncolA, ncolX, save_imputed, _parms._impute_original);
}
GLRMScore( int ncolA, int ncolX, boolean save_imputed, boolean reverse_transform ) {
_ncolA = ncolA; _ncolX = ncolX;
_save_imputed = save_imputed;
_reverse_transform = reverse_transform;
}
@Override public void map(Chunk[] chks) {
float[] atmp = new float[_ncolA];
double[] xtmp = new double[_ncolX];
double[] preds = new double[_ncolA];
_mb = GLRMModel.this.makeMetricBuilder(null);
if (_save_imputed) {
for (int row = 0; row < chks[0]._len; row++) {
double[] p = impute_data(chks, row, xtmp, preds);
compute_metrics(chks, row, atmp, p);
for (int c = 0; c < preds.length; c++)
chks[_ncolA + _ncolX + c].set(row, p[c]);
}
} else {
for (int row = 0; row < chks[0]._len; row++) {
double[] p = impute_data(chks, row, xtmp, preds);
compute_metrics(chks, row, atmp, p);
}
}
}
@Override public void reduce(GLRMScore other) {
if (_mb != null) _mb.reduce(other._mb);
}
@Override protected void postGlobal() {
if (_mb != null) _mb.postGlobal();
}
private float[] compute_metrics(Chunk[] chks, int row_in_chunk, float[] tmp, double[] preds) {
for (int i = 0; i < tmp.length; i++)
tmp[i] = (float)chks[i].atd(row_in_chunk);
_mb.perRow(preds, tmp, GLRMModel.this);
return tmp;
}
private double[] impute_data(Chunk[] chks, int row_in_chunk, double[] tmp, double[] preds) {
for (int i = 0; i < tmp.length; i++ )
tmp[i] = chks[_ncolA+i].atd(row_in_chunk);
impute_data(tmp, preds);
return preds;
}
private double[] impute_data(double[] tmp, double[] preds) {
assert preds.length == _output._nnums + _output._ncats;
// Categorical columns
for (int d = 0; d < _output._ncats; d++) {
double[] xyblock = _output._archetypes_raw.lmulCatBlock(tmp,d);
preds[_output._permutation[d]] = _output._lossFunc[d].mimpute(xyblock);
}
// Numeric columns
for (int d = _output._ncats; d < preds.length; d++) {
int ds = d - _output._ncats;
double xy = _output._archetypes_raw.lmulNumCol(tmp, ds);
preds[_output._permutation[d]] = _output._lossFunc[d].impute(xy);
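// Reverse the data transform: for standardized data, _normMul[ds] holds 1/sigma and _normSub[ds] the column
// mean, so dividing by _normMul and adding _normSub restores the original scale.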
if (_reverse_transform)
preds[_output._permutation[d]] = preds[_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
}
return preds;
}
}
// GLRM has no per-row scoring: imputing a row requires its representation x_i, which lives in a separate frame.
@Override protected double[] score0(double[] data, double[] preds) {
throw H2O.unimpl();
}
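/**
* Compute GLRM error metrics on the given frame without materializing the reconstructed data.
* The frame is adapted to the training columns, the representation frame X is appended, and GLRMScore runs
* with save_imputed = false, so only the per-cell reconstruction errors are accumulated.
* @param frame Frame to score (adapted to the original training frame)
* @return GLRM model metrics computed from the imputed data
*/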
public ModelMetricsGLRM scoreMetricsOnly(Frame frame) {
if (frame == null) return null;
int ncols = _output._names.length;
// Need [A,X] where A = adapted test frame, X = loading frame
// Note: A is adapted to original training frame
Frame adaptedFr = new Frame(frame);
adaptTestForTrain(adaptedFr, true, false);
assert ncols == adaptedFr.numCols();
// Append loading frame X for calculating XY
Frame fullFrm = new Frame(adaptedFr);
Frame loadingFrm = DKV.get(_output._representation_key).get();
fullFrm.add(loadingFrm);
GLRMScore gs = new GLRMScore(ncols, _parms._k, false).doAll(fullFrm);
ModelMetrics mm = gs._mb.makeModelMetrics(GLRMModel.this, frame, null, null); // save error metrics based on imputed data
return (ModelMetricsGLRM) mm;
}
@Override public ModelMetricsGLRM.GlrmModelMetricsBuilder makeMetricBuilder(String[] domain) {
return new ModelMetricsGLRM.GlrmModelMetricsBuilder(_parms._k, _output._permutation, _parms._impute_original);
}
}