package hex.glrm;

import hex.*;
import hex.genmodel.algos.glrm.GlrmInitialization;
import hex.genmodel.algos.glrm.GlrmLoss;
import hex.genmodel.algos.glrm.GlrmRegularizer;
import hex.svd.SVDModel.SVDParameters;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.*;

import java.util.ArrayList;

/**
 * GLRM (<a href="https://web.stanford.edu/~boyd/papers/pdf/glrm.pdf">Generalized Low Rank Model</a>).
 *
 * The model seeks to represent an input frame A of dimensions m x n as a product of two smaller matrices X (m x k)
 * and Y (k x n), each of rank k. To this end, the model solves the generic optimization problem
 *     Loss(A, XY) + Rx(X) + Ry(Y)  ->  min_{X,Y}
 * In other words, it tries to find X and Y such that XY is close to A (as measured by the loss function), while also
 * taking into account the regularization constraints on X and Y.
 *
 * Note that the input frame A may have columns of different types, while the output matrices X and Y are always
 * real-valued.
 *
 * The Loss function is assumed to be separable in each element of the matrix, so that
 *     Loss(A, XY) = Sum[L_{ij}(A_{ij}, x_i y_j) over i=1..m, j=1..n]
 * The individual loss functions can be different for each element; however, in our implementation we assume that the
 * L_{ij}'s are constant over rows and may only differ by column. Thus, L_{ij} == L_j.
 *
 * The regularizers Rx and Ry are assumed to be row-separable:
 *     Rx(X) = Sum[rx_i(x_i) for i=1..m]
 *     Ry(Y) = Sum[ry_j(y_j) for j=1..n]
 *
 * The output of the model consists of the matrices X and Y. There are multiple interpretations of these (see
 * Section 5.4 of the paper linked above). In particular:
 *   + The rows of Y (1 x n) can be interpreted as "idealized examples" of input rows (even though rows of Y are
 *     always real-valued, while rows of the input data may have any types). Thus, we call them *archetypes* in
 *     the code.
 *   + The rows of X (1 x k) provide an embedding of each original data row into a lower-dimensional space. Thus,
 *     we call them "representations" of the data.
 */
public class GLRMModel extends Model<GLRMModel, GLRMModel.GLRMParameters, GLRMModel.GLRMOutput>
    implements Model.GLRMArchetypes {

  //------------------------------------------------------------------------------------------------------------------
  // Input parameters
  //------------------------------------------------------------------------------------------------------------------

  public static class GLRMParameters extends Model.Parameters {
    @Override public String algoName() { return "GLRM"; }
    @Override public String fullName() { return "Generalized Low Rank Modeling"; }
    @Override public String javaName() { return GLRMModel.class.getName(); }
    @Override public long progressUnits() { return 2 + _max_iterations; }

    // Data transformation (demean to compare with PCA)
    public DataInfo.TransformType _transform = DataInfo.TransformType.NONE;
    public int _k = 1;                                              // Rank of the resulting XY matrix
    public GlrmInitialization _init = GlrmInitialization.PlusPlus;  // Initialization of the Y matrix
    public SVDParameters.Method _svd_method = SVDParameters.Method.Power;  // SVD initialization method (for _init = SVD)
    public Key<Frame> _user_y;             // User-specified Y matrix (for _init = User)
    public Key<Frame> _user_x;             // User-specified X matrix (for _init = User)
    public boolean _expand_user_y = true;  // Should categorical columns in _user_y be expanded via one-hot encoding? (for _init = User)
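    // A hedged sketch of user-specified initialization (the Frame `initY` and the `parms` object are illustrative
    // only, not defined in this file):
    //
    //   GLRMModel.GLRMParameters parms = new GLRMModel.GLRMParameters();
    //   parms._init = GlrmInitialization.User;
    //   parms._user_y = initY._key;        // a k x n Frame holding the initial archetypes
    //   parms._expand_user_y = true;       // one-hot expand categorical columns of Y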

    // Loss functions
    public GlrmLoss _loss = GlrmLoss.Quadratic;          // Default loss function for numeric columns
    public GlrmLoss _multi_loss = GlrmLoss.Categorical;  // Default loss function for categorical columns
    public int _period = 1;                              // Length of the period when _loss = Periodic
    public GlrmLoss[] _loss_by_col;                      // Override the default loss function for specific columns
    public int[] _loss_by_col_idx;

    // Regularization functions
    public GlrmRegularizer _regularization_x = GlrmRegularizer.None;  // Regularization function for the X matrix
    public GlrmRegularizer _regularization_y = GlrmRegularizer.None;  // Regularization function for the Y matrix
    public double _gamma_x = 0;                          // Regularization weight on the X matrix
    public double _gamma_y = 0;                          // Regularization weight on the Y matrix

    // Optional parameters
    public int _max_iterations = 1000;                   // Max iterations
    public int _max_updates = 2 * _max_iterations;       // Max number of updates (X or Y)
    public double _init_step_size = 1.0;                 // Initial step size (decreased until we hit _min_step_size)
    public double _min_step_size = 1e-4;                 // Min step size
    public String _representation_name;
    public boolean _recover_svd = false;                 // Recover singular values and eigenvectors of XY at the end?
    public boolean _impute_original = false;             // Reconstruct original training data by reversing _transform?
    public boolean _verbose = true;                      // Log when the objective increases each iteration?
  }

  //------------------------------------------------------------------------------------------------------------------
  // Outputs
  //------------------------------------------------------------------------------------------------------------------

  public static class GLRMOutput extends Model.Output {
    // Number of iterations executed
    public int _iterations;
    // Number of updates executed
    public int _updates;
    // Current value of the objective function
    public double _objective;
    // Current value of the step size
    public double _step_size;
    // Average change in the objective function this iteration
    public double _avg_change_obj;
    public ArrayList<Double> _history_objective = new ArrayList<>();

    // Mapping from the lower-dimensional k-space to the training features (Y)
    public TwoDimTable _archetypes;
    public GLRM.Archetypes _archetypes_raw;  // Needed for indexing into Y for scoring

    // Step size each iteration
    public ArrayList<Double> _history_step_size = new ArrayList<>();

    // SVD of the output XY
    public double[/*feature*/][/*k*/] _eigenvectors_raw;
    public TwoDimTable _eigenvectors;
    public double[] _singular_vals;

    // Frame key of the X matrix
    public String _representation_name;
    public Key<Frame> _representation_key;
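    // Usage note: the X matrix lives in the DKV as a regular Frame, so it can be retrieved after training
    // (a sketch, assuming a trained `model`):
    //
    //   Frame x = DKV.get(model._output._representation_key).get();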
    public Key<? extends Model> _init_key;

    // Number of categorical and numeric columns
    public int _ncats;
    public int _nnums;

    // Number of good rows in the training frame (i.e. not skipped)
    public long _nobs;

    // Categorical offset vector
    public int[] _catOffsets;

    // Standardization arrays for numeric data columns
    public double[] _normSub;  // Mean of each numeric column
    public double[] _normMul;  // One over the standard deviation of each numeric column

    // Permutation array mapping adapted to original training column indices
    public int[] _permutation;  // _permutation[i] = j means col i in _adaptedFrame maps to col j of _train

    // Expanded column names of the adapted training frame
    public String[] _names_expanded;

    // Loss function for every column in the adapted training frame
    public GlrmLoss[] _lossFunc;

    // Training time
    public ArrayList<Long> _training_time_ms = new ArrayList<>();

    // Total column variance for the expanded and transformed data
    public double _total_variance;

    // Standard deviation of each principal component
    public double[] _std_deviation;

    // Importance of principal components:
    // standard deviation, proportion of variance explained, and cumulative proportion of variance explained
    public TwoDimTable _importance;

    public GLRMOutput(GLRM b) { super(b); }

    /** Override because the base class counts ncols-1 features, treating the last column as a response variable;
     *  for GLRM all the columns are features. */
    @Override public int nfeatures() { return _names.length; }

    @Override public ModelCategory getModelCategory() { return ModelCategory.DimReduction; }
  }

  public GLRMModel(Key<GLRMModel> selfKey, GLRMParameters parms, GLRMOutput output) {
    super(selfKey, parms, output);
  }

  @Override protected Futures remove_impl(Futures fs) {
    if (_output._init_key != null) _output._init_key.remove(fs);
    if (_output._representation_key != null) _output._representation_key.remove(fs);
    return super.remove_impl(fs);
  }

  @Override protected AutoBuffer writeAll_impl(AutoBuffer ab) {
    ab.putKey(_output._init_key);
    ab.putKey(_output._representation_key);
    return super.writeAll_impl(ab);
  }

  @Override protected Keyed readAll_impl(AutoBuffer ab, Futures fs) {
    ab.getKey(_output._init_key, fs);
    ab.getKey(_output._representation_key, fs);
    return super.readAll_impl(ab, fs);
  }

  @Override public GlrmMojoWriter getMojo() {
    return new GlrmMojoWriter(this);
  }

  //------------------------------------------------------------------------------------------------------------------
  // Scoring
  //------------------------------------------------------------------------------------------------------------------

  // GLRM scoring is data imputation based on feature domains using the reconstructed XY (see Udell (2015), Section 5.3)
  private Frame reconstruct(Frame orig, Frame adaptedFr, Key<Frame> destination_key,
                            boolean save_imputed, boolean reverse_transform) {
    int ncols = _output._names.length;
    assert ncols == adaptedFr.numCols();
    String prefix = "reconstr_";

    // Need [A,X,P] where A = adaptedFr, X = loading frame, P = imputed frame
    // Note: A is adapted to the original training frame, and P has its columns shuffled so cats come before nums!
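    // For example, with ncols = 3 and k = 2 the combined frame built below is laid out as
    // [a0 a1 a2 | x0 x1 | reconstr_a0 reconstr_a1 reconstr_a2], so the imputed columns occupy
    // indices ncols + k .. fullFrm.numCols()-1 (the range extracted further down).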
    Frame fullFrm = new Frame(adaptedFr);
    Frame loadingFrm = DKV.get(_output._representation_key).get();
    fullFrm.add(loadingFrm);
    String[][] adaptedDomme = adaptedFr.domains();
    Vec anyVec = fullFrm.anyVec();
    assert anyVec != null;
    for (int i = 0; i < ncols; i++) {
      Vec v = anyVec.makeZero();
      v.setDomain(adaptedDomme[i]);
      fullFrm.add(prefix + _output._names[i], v);
    }
    GLRMScore gs = new GLRMScore(ncols, _parms._k, save_imputed, reverse_transform).doAll(fullFrm);

    // Return the imputed training frame
    int x = ncols + _parms._k, y = fullFrm.numCols();
    Frame f = fullFrm.extractFrame(x, y);  // this will call vec_impl() and we cannot call the delete() below just yet

    f = new Frame((destination_key == null ? Key.<Frame>make() : destination_key), f.names(), f.vecs());
    DKV.put(f);
    gs._mb.makeModelMetrics(GLRMModel.this, orig, null, null);  // save error metrics based on imputed data
    return f;
  }

  @Override protected Frame predictScoreImpl(Frame orig, Frame adaptedFr, String destination_key, Job j,
                                             boolean computeMetrics) {
    return reconstruct(orig, adaptedFr, Key.<Frame>make(destination_key), true, _parms._impute_original);
  }

  @Override public Frame scoreReconstruction(Frame frame, Key<Frame> destination_key, boolean reverse_transform) {
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    return reconstruct(frame, adaptedFr, destination_key, true, reverse_transform);
  }

  /**
   * Project each archetype into the original feature space.
   * @param frame Original training data with m rows and n columns
   * @param destination_key Frame Id for output
   * @param reverse_transform Whether to reverse the transform applied to the training data
   * @return Frame containing k rows and n columns, where each row corresponds to an archetype
   */
  @Override public Frame scoreArchetypes(Frame frame, Key<Frame> destination_key, boolean reverse_transform) {
    int ncols = _output._names.length;
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    assert ncols == adaptedFr.numCols();
    String[][] adaptedDomme = adaptedFr.domains();
    double[][] proj = new double[_parms._k][_output._nnums + _output._ncats];

    // Categorical columns
    for (int d = 0; d < _output._ncats; d++) {
      double[][] block = _output._archetypes_raw.getCatBlock(d);
      for (int k = 0; k < _parms._k; k++)
        proj[k][_output._permutation[d]] = _output._lossFunc[d].mimpute(block[k]);
    }

    // Numeric columns
    for (int d = _output._ncats; d < (_output._ncats + _output._nnums); d++) {
      int ds = d - _output._ncats;
      for (int k = 0; k < _parms._k; k++) {
        double num = _output._archetypes_raw.getNum(ds, k);
        proj[k][_output._permutation[d]] = _output._lossFunc[d].impute(num);
        if (reverse_transform)
          proj[k][_output._permutation[d]] = proj[k][_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
      }
    }

    // Convert the projection of the archetypes into a frame with the correct domains
    Frame f = ArrayUtils.frame((destination_key == null ? Key.<Frame>make() : destination_key), adaptedFr.names(), proj);
    for (int i = 0; i < ncols; i++) f.vec(i).setDomain(adaptedDomme[i]);
    return f;
  }

  private class GLRMScore extends MRTask<GLRMScore> {
    final int _ncolA;                  // Number of cols in the original data A
    final int _ncolX;                  // Number of cols in X (rank k)
    final boolean _save_imputed;       // Save imputed data into new vecs?
    final boolean _reverse_transform;  // Reconstruct original training data by reversing the transform?
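    // Chunk layout inside map(): chks = [A (ncolA cols) | X (ncolX cols) | imputed P (ncolA cols, present only
    // when _save_imputed)] -- this is what the index arithmetic chks[_ncolA + i] and chks[_ncolA + _ncolX + c]
    // below relies on.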
    ModelMetricsGLRM.GlrmModelMetricsBuilder _mb;

    GLRMScore(int ncolA, int ncolX, boolean save_imputed) {
      this(ncolA, ncolX, save_imputed, _parms._impute_original);
    }
    GLRMScore(int ncolA, int ncolX, boolean save_imputed, boolean reverse_transform) {
      _ncolA = ncolA;
      _ncolX = ncolX;
      _save_imputed = save_imputed;
      _reverse_transform = reverse_transform;
    }

    @Override public void map(Chunk[] chks) {
      float[] atmp = new float[_ncolA];
      double[] xtmp = new double[_ncolX];
      double[] preds = new double[_ncolA];
      _mb = GLRMModel.this.makeMetricBuilder(null);

      if (_save_imputed) {
        for (int row = 0; row < chks[0]._len; row++) {
          double[] p = impute_data(chks, row, xtmp, preds);
          compute_metrics(chks, row, atmp, p);
          for (int c = 0; c < preds.length; c++)
            chks[_ncolA + _ncolX + c].set(row, p[c]);
        }
      } else {
        for (int row = 0; row < chks[0]._len; row++) {
          double[] p = impute_data(chks, row, xtmp, preds);
          compute_metrics(chks, row, atmp, p);
        }
      }
    }

    @Override public void reduce(GLRMScore other) {
      if (_mb != null) _mb.reduce(other._mb);
    }

    @Override protected void postGlobal() {
      if (_mb != null) _mb.postGlobal();
    }

    private float[] compute_metrics(Chunk[] chks, int row_in_chunk, float[] tmp, double[] preds) {
      for (int i = 0; i < tmp.length; i++) tmp[i] = (float) chks[i].atd(row_in_chunk);
      _mb.perRow(preds, tmp, GLRMModel.this);
      return tmp;
    }

    private double[] impute_data(Chunk[] chks, int row_in_chunk, double[] tmp, double[] preds) {
      for (int i = 0; i < tmp.length; i++) tmp[i] = chks[_ncolA + i].atd(row_in_chunk);
      impute_data(tmp, preds);
      return preds;
    }

    private double[] impute_data(double[] tmp, double[] preds) {
      assert preds.length == _output._nnums + _output._ncats;

      // Categorical columns
      for (int d = 0; d < _output._ncats; d++) {
        double[] xyblock = _output._archetypes_raw.lmulCatBlock(tmp, d);
        preds[_output._permutation[d]] = _output._lossFunc[d].mimpute(xyblock);
      }

      // Numeric columns
      for (int d = _output._ncats; d < preds.length; d++) {
        int ds = d - _output._ncats;
        double xy = _output._archetypes_raw.lmulNumCol(tmp, ds);
        preds[_output._permutation[d]] = _output._lossFunc[d].impute(xy);
        if (_reverse_transform)
          preds[_output._permutation[d]] = preds[_output._permutation[d]] / _output._normMul[ds] + _output._normSub[ds];
      }
      return preds;
    }
  }

  @Override protected double[] score0(double[] data, double[] preds) {
    throw H2O.unimpl();
  }

  public ModelMetricsGLRM scoreMetricsOnly(Frame frame) {
    if (frame == null) return null;
    int ncols = _output._names.length;

    // Need [A,X] where A = adapted test frame, X = loading frame
    // Note: A is adapted to the original training frame
    Frame adaptedFr = new Frame(frame);
    adaptTestForTrain(adaptedFr, true, false);
    assert ncols == adaptedFr.numCols();

    // Append the loading frame X for calculating XY
    Frame fullFrm = new Frame(adaptedFr);
    Frame loadingFrm = DKV.get(_output._representation_key).get();
    fullFrm.add(loadingFrm);

    GLRMScore gs = new GLRMScore(ncols, _parms._k, false).doAll(fullFrm);
    ModelMetrics mm = gs._mb.makeModelMetrics(GLRMModel.this, frame, null, null);  // save error metrics based on imputed data
    return (ModelMetricsGLRM) mm;
  }

  @Override public ModelMetricsGLRM.GlrmModelMetricsBuilder makeMetricBuilder(String[] domain) {
    return new ModelMetricsGLRM.GlrmModelMetricsBuilder(_parms._k, _output._permutation, _parms._impute_original);
  }
}
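// A minimal end-to-end sketch (hedged; assumes the standard H2O ModelBuilder pattern -- the training Frame `train`
// and the destination key name are illustrative, not part of this file):
//
//   GLRMModel.GLRMParameters parms = new GLRMModel.GLRMParameters();
//   parms._train = train._key;
//   parms._k = 5;                           // rank of the factorization
//   GLRMModel model = new GLRM(parms).trainModel().get();
//   Frame reconstructed = model.scoreReconstruction(train, Key.<Frame>make("reconstr"), false);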