package hex.glrm; import Jama.CholeskyDecomposition; import Jama.Matrix; import Jama.QRDecomposition; import Jama.SingularValueDecomposition; import hex.DataInfo; import hex.ModelBuilder; import hex.ModelCategory; import hex.genmodel.algos.glrm.GlrmInitialization; import hex.genmodel.algos.glrm.GlrmLoss; import hex.genmodel.algos.glrm.GlrmRegularizer; import hex.glrm.GLRMModel.GLRMParameters; import hex.gram.Gram; import hex.gram.Gram.Cholesky; import hex.gram.Gram.GramTask; import hex.kmeans.KMeans; import hex.kmeans.KMeansModel; import hex.svd.SVD; import hex.svd.SVDModel; import hex.svd.SVDModel.SVDParameters; import hex.util.LinearAlgebraUtils; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import water.*; import water.api.ModelCacheManager; import water.fvec.C0DChunk; import water.fvec.Chunk; import water.fvec.Frame; import water.fvec.Vec; import water.util.*; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Random; import static hex.util.DimensionReductionUtils.generateIPC; import static hex.genmodel.algos.glrm.GlrmLoss.Quadratic; /** * Generalized Low Rank Models * This is an algorithm for dimensionality reduction of a dataset. It is a general, parallelized * optimization algorithm that applies to a variety of loss and regularization functions. * Categorical columns are handled by expansion into 0/1 indicator columns for each level. * <a href = "http://web.stanford.edu/~boyd/papers/pdf/glrm.pdf">Generalized Low Rank Models</a> * @author anqi_fu */ public class GLRM extends ModelBuilder<GLRMModel, GLRMModel.GLRMParameters, GLRMModel.GLRMOutput> { // Convergence tolerance private static final double TOLERANCE = 1e-6; // Number of columns in the training set (n) private transient int _ncolA; // Number of columns in the resulting matrix Y, taking into account expansion of the categoricals private transient int _ncolY; // Number of columns in the resulting matrix X (k), also number of rows in Y. private transient int _ncolX; // Loss function for each column private transient GlrmLoss[] _lossFunc; private ArrayList<Integer> _binaryColumnIndices; // store binary columns using binary loss functions. @Override protected GLRMDriver trainModelImpl() { return new GLRMDriver(); } @Override public ModelCategory[] can_build() { return new ModelCategory[]{ModelCategory.Clustering}; } @Override public boolean isSupervised() { return false; } @Override public boolean havePojo() { return false; } @Override public boolean haveMojo() { return true; } @Override protected void checkMemoryFootPrint() { HeartBeat hb = H2O.SELF._heartbeat; double p = hex.util.LinearAlgebraUtils.numColsExp(_train,true); double gramSize = _train.lastVec().nChunks()==1 ? 
1 : Math.log((double) _train.lastVec().nChunks()) / Math.log(2.); // log2(nChunks); the ternary returns 1 when nChunks == 1 since log2(1) = 0 long mem_usage = (long)(hb._cpus_allowed * p*_parms._k * 8 * gramSize); // rough estimate of memory usage long max_mem = H2O.SELF._heartbeat.get_free_mem(); if (mem_usage > max_mem) { String msg = "Archetypes in matrix Y cannot fit in the driver node's memory (" + PrettyPrint.bytes(mem_usage) + " > " + PrettyPrint.bytes(max_mem) + ") - try reducing k, the number of columns and/or the number of categorical factors."; error("_train", msg); } } //-------------------------------------------------------------------------------------------------------------------- // Model initialization //-------------------------------------------------------------------------------------------------------------------- // Called from an HTTP request public GLRM(GLRMParameters parms) { super(parms); init(false); } public GLRM(GLRMParameters parms, Job<GLRMModel> job) { super(parms, job); init(false); } public GLRM(boolean startup_once) { super(new GLRMParameters(), startup_once); } /** * Validate all parameters, and prepare the model for training. */ @Override public void init(boolean expensive) { super.init(expensive); _ncolX = _parms._k; _ncolA = _train == null? -1 : _train.numCols(); _ncolY = _train == null? -1 : LinearAlgebraUtils.numColsExp(_train, true); initLoss(); if (_parms._gamma_x < 0) error("_gamma_x", "gamma_x must be a non-negative number"); if (_parms._gamma_y < 0) error("_gamma_y", "gamma_y must be a non-negative number"); if (_parms._max_iterations < 1 || _parms._max_iterations > 1e6) error("_max_iterations", "max_iterations must be between 1 and 1e6 inclusive"); if (_parms._init_step_size <= 0) error("_init_step_size", "init_step_size must be a positive number"); if (_parms._min_step_size < 0 || _parms._min_step_size > _parms._init_step_size) error("_min_step_size", "min_step_size must be between 0 and " + _parms._init_step_size); // Cannot recover SVD of original _train from XY of transformed _train if (_parms._recover_svd && (_parms._impute_original && _parms._transform != DataInfo.TransformType.NONE)) error("_recover_svd", "_recover_svd and _impute_original cannot both be true if _train is transformed"); if (_train == null) return; if (_ncolA < 2) error("_train", "_train must have more than one column"); if (_valid != null && _valid.numRows() != _train.numRows()) error("_valid", "_valid must have same number of rows as _train"); if (_ncolY > 5000) warn("_train", "_train has " + _ncolY + " columns when categoricals are expanded. Algorithm may be slow."); if (_parms._k < 1 || _parms._k > _ncolY) error("_k", "_k must be between 1 and " + _ncolY + " inclusive"); if (_parms._user_y != null) { // Check dimensions of user-specified initial Y if (_parms._init != GlrmInitialization.User) error("_init", "init must be 'User' if providing user-specified points"); Frame user_y = _parms._user_y.get(); assert user_y != null; int user_y_cols = _parms._expand_user_y ?
_ncolA : _ncolY; if (user_y.numCols() != user_y_cols) error("_user_y", "The user-specified Y must have the same number of columns (" + user_y_cols + ") " + "as the training data"); else if (user_y.numRows() != _parms._k) error("_user_y", "The user-specified Y must have k = " + _parms._k + " rows"); else { int zero_vec = 0; Vec[] centersVecs = user_y.vecs(); for (int c = 0; c < _ncolA; c++) { if (centersVecs[c].naCnt() > 0) { error("_user_y", "The user-specified Y cannot contain any missing values"); break; } else if (centersVecs[c].isConst() && centersVecs[c].max() == 0) zero_vec++; } if (zero_vec == _ncolA) error("_user_y", "The user-specified Y cannot all be zero"); } } if (_parms._user_x != null) { // Check dimensions of user-specified initial X if (_parms._init != GlrmInitialization.User) error("_init", "init must be 'User' if providing user-specified points"); Frame user_x = _parms._user_x.get(); assert user_x != null; if (user_x.numCols() != _parms._k) error("_user_x", "The user-specified X must have k = " + _parms._k + " columns"); else if (user_x.numRows() != _train.numRows()) error("_user_x", "The user-specified X must have the same number of rows " + "(" + _train.numRows() + ") as the training observations"); else { int zero_vec = 0; Vec[] centersVecs = user_x.vecs(); for (int c = 0; c < _parms._k; c++) { if (centersVecs[c].naCnt() > 0) { error("_user_x", "The user-specified X cannot contain any missing values"); break; } else if (centersVecs[c].isConst() && centersVecs[c].max() == 0) zero_vec++; } if (zero_vec == _parms._k) error("_user_x", "The user-specified X cannot all be zero"); } } for (int i = 0; i < _ncolA; i++) { if (_train.vec(i).isString() || _train.vec(i).isUUID()) throw H2O.unimpl("GLRM cannot handle String or UUID data"); } if (expensive && error_count() == 0) checkMemoryFootPrint(); // check that the model will fit in memory } /** Validate all Loss-related parameters, and fill in the `_lossFunc` array. */ private void initLoss() { int num_loss_by_cols = _parms._loss_by_col == null? 0 : _parms._loss_by_col.length; int num_loss_by_cols_idx = _parms._loss_by_col_idx == null? 0 : _parms._loss_by_col_idx.length; // First validate the parameters that do not require access to the training frame if (_parms._period <= 0) error("_period", "_period must be a positive integer"); if (!_parms._loss.isForNumeric()) error("_loss", _parms._loss + " is not a numeric loss function"); if (!_parms._multi_loss.isForCategorical()) error("_multi_loss", _parms._multi_loss + " is not a multivariate loss function"); if (num_loss_by_cols != num_loss_by_cols_idx && num_loss_by_cols_idx > 0) error("_loss_by_col", "Sizes of arrays _loss_by_col and _loss_by_col_idx must be the same"); if (_train == null) return; _binaryColumnIndices = new ArrayList<Integer>(); // Initialize the default loss functions for each column // Note: right now `.isCategorical()` returns true for binary columns. This has the undesired consequence that // such variables get a categorical loss function and are expanded into 2 columns. _lossFunc = new GlrmLoss[_ncolA]; for (int i = 0; i < _ncolA; i++) { Vec vi = _train.vec(i); _lossFunc[i] = vi.isCategorical()?
_parms._multi_loss : _parms._loss; } String[] origColumnNames = _parms.train().names(); // grab original frame column names before change ArrayList<String> newColumnNames = new ArrayList<String>(Arrays.asList(_train._names)); // If _loss_by_col is provided, then override loss functions on the specified columns if (num_loss_by_cols > 0) { if (num_loss_by_cols_idx == 0) { if (num_loss_by_cols == origColumnNames.length) assignLossByCol(num_loss_by_cols, newColumnNames, origColumnNames); else error("_loss_by_col", "Number of override loss functions should be the same as the " + "number of columns in the input frame; or otherwise an explicit _loss_by_col_idx should be " + "provided."); } if (num_loss_by_cols_idx == num_loss_by_cols) assignLossByCol(num_loss_by_cols, newColumnNames, origColumnNames); // Otherwise we have already reported an error at the start of this method } // Check that all loss functions correspond to their actual type for (int i = 0; i < _ncolA; i++) { Vec vi = _train.vec(i); GlrmLoss lossi = _lossFunc[i]; if (vi.isNumeric()) { // numeric columns if (!vi.isBinary()) { // non-binary numeric columns if (!lossi.isForNumeric()) error("_loss_by_col", "Loss function "+lossi+" cannot be applied to numeric column "+i); } else { // binary numeric columns if (!lossi.isForBinary() && !lossi.isForNumeric()) { error("_loss_by_col", "Loss function "+lossi+" cannot be applied to binary column "+i); } } } else if (vi.isCategorical()) { // categorical columns if (vi.isBinary()) { // categorical binary columns if (!lossi.isForBinary() && !lossi.isForCategorical()) error("_loss_by_col", "Loss function "+lossi+" cannot be applied to binary column "+i); else if (lossi.isForBinary()) _binaryColumnIndices.add(i); // collect column indices storing binary columns with binary loss function. } else { // categorical non-binary columns if (!lossi.isForCategorical()) error("_loss_by_col","Loss function "+lossi+" cannot be applied to categorical column "+i); } } // For "Periodic" loss function supply the period. We currently have no support for different periods for // different columns. if (lossi == GlrmLoss.Periodic) lossi.setParameters(_parms._period); } } /* Need to assign a column loss to each column. However, since constant columns are dropped, a loss function specified for a constant column no longer applies once that column is gone. Handle this case to avoid errors. */ private void assignLossByCol(int num_loss_by_cols, ArrayList<String> newColumnNames, String[] origColumnNames) { for (int i = 0; i < num_loss_by_cols; i++) { int cidx = _parms._loss_by_col_idx==null?i:_parms._loss_by_col_idx[i]; if (cidx < 0 || cidx >= origColumnNames.length) error("_loss_by_col_idx", "Column index " + cidx + " must be in [0," + origColumnNames.length + ")"); else { String colName = origColumnNames[cidx]; // look up by name: the column may have been dropped from _train if (newColumnNames.contains(colName)) _lossFunc[newColumnNames.indexOf(colName)] = _parms._loss_by_col[i]; } } } // Squared Frobenius norm of a matrix (sum of squared entries) public static double frobenius2(double[][] x) { if (x == null) return 0; double frob = 0; for (double[] xs : x) for (double j : xs) frob += j*j; return frob; } // Closed-form solution for X and Y may exist if both loss and regularizers are quadratic.
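// Background for hasClosedForm()/initialXClosedForm() below: with no missing values, every column loss
// Quadratic, and each regularizer either None or Quadratic, the X-step (for fixed Y) is an ordinary ridge
// problem,
//   minimize_X ||A - XY||_F^2 + gamma ||X||_F^2   ==>   X = AY'(YY' + gamma I)^(-1),
// and symmetrically for Y with X fixed, so a single linear solve can replace iterative descent.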
public final boolean hasClosedForm(long na_cnt) { if (na_cnt != 0) return false; for (GlrmLoss lossi : _lossFunc) if (lossi != Quadratic) return false; return (_parms._gamma_x == 0 || _parms._regularization_x == GlrmRegularizer.None || _parms._regularization_x == GlrmRegularizer.Quadratic) && (_parms._gamma_y == 0 || _parms._regularization_y == GlrmRegularizer.None || _parms._regularization_y == GlrmRegularizer.Quadratic); } // Transform each column of a 2-D array, assuming categoricals sorted before numeric cols public static double[][] transform(double[][] centers, double[] normSub, double[] normMul, int ncats, int nnums) { int K = centers.length; int N = centers[0].length; assert ncats + nnums == N; double[][] value = new double[K][N]; double[] means = normSub == null ? MemoryManager.malloc8d(nnums) : normSub; double[] mults = normMul == null ? MemoryManager.malloc8d(nnums) : normMul; if (normMul == null) Arrays.fill(mults, 1.0); for (int clu = 0; clu < K; clu++) { System.arraycopy(centers[clu], 0, value[clu], 0, ncats); for (int col = 0; col < nnums; col++) value[clu][ncats+col] = (centers[clu][ncats+col] - means[col]) * mults[col]; } return value; } // More efficient implementation assuming sdata cols aligned with adaptedFrame public static double[][] expandCats(double[][] sdata, DataInfo dinfo) { if (sdata == null || dinfo._cats == 0) return sdata; assert sdata[0].length == dinfo._adaptedFrame.numCols(); // Column count for expanded matrix int catsexp = dinfo._catOffsets[dinfo._catOffsets.length-1]; double[][] cexp = new double[sdata.length][catsexp + dinfo._nums]; for (int i = 0; i < sdata.length; i++) LinearAlgebraUtils.expandRow(sdata[i], dinfo, cexp[i], false); return cexp; } class GLRMDriver extends Driver { // Initialize Y and X matrices // tinfo = original training data A, dfrm = [A,X,W] where W is working copy of X (initialized here) private double[][] initialXY(DataInfo tinfo, Frame dfrm, GLRMModel model, long na_cnt) { double[][] centers, centers_exp = null; if (_parms._init == GlrmInitialization.User) { // Set X and Y to user-specified points if available, Gaussian matrix if not Frame userYFrame = _parms._user_y == null? 
null : _parms._user_y.get(); if (userYFrame != null) { // Set Y = user-specified initial points Vec[] yVecs = userYFrame.vecs(); if (_parms._expand_user_y) { // Categorical cols must be one-hot expanded // Get the centers and put into array centers = new double[_parms._k][_ncolA]; for (int c = 0; c < _ncolA; c++) { for (int r = 0; r < _parms._k; r++) centers[r][c] = yVecs[c].at(r); } // Permute cluster columns to align with dinfo and expand out categoricals centers = ArrayUtils.permuteCols(centers, tinfo._permutation); centers_exp = expandCats(centers, tinfo); } else { // User Y already has categoricals expanded centers_exp = new double[_parms._k][_ncolY]; for (int c = 0; c < _ncolY; c++) { for (int r = 0; r < _parms._k; r++) centers_exp[r][c] = yVecs[c].at(r); } } } else centers_exp = ArrayUtils.gaussianArray(_parms._k, _ncolY); if (_parms._user_x != null) { // Set X = user-specified initial points Frame tmp = new Frame(dfrm); tmp.add(_parms._user_x.get()); // [A,X,W,U] where U = user-specified X // Set X and W to the same values as user-specified initial X new MRTask() { @Override public void map(Chunk[] cs) { for (int row = 0; row < cs[0]._len; row++) { for (int i = _ncolA; i < _ncolA+_ncolX; i++) { double x = cs[2*_ncolX + i].atd(row); cs[i].set(row, x); cs[_ncolX + i].set(row, x); } } } }.doAll(tmp); } else { InitialXProj xtsk = new InitialXProj(_parms, _ncolA, _ncolX); xtsk.doAll(dfrm); } return centers_exp; // Don't project or change Y in any way if user-specified, just return it } else if (_parms._init == GlrmInitialization.Random) { // Generate X and Y from standard normal distribution centers_exp = ArrayUtils.gaussianArray(_parms._k, _ncolY); InitialXProj xtsk = new InitialXProj(_parms, _ncolA, _ncolX); xtsk.doAll(dfrm); } else if (_parms._init == GlrmInitialization.SVD) { // Run SVD on A'A/n (Gram) and set Y = right singular vectors SVDParameters parms = new SVDParameters(); parms._train = _parms._train; parms._ignored_columns = _parms._ignored_columns; parms._ignore_const_cols = _parms._ignore_const_cols; parms._score_each_iteration = _parms._score_each_iteration; parms._use_all_factor_levels = true; // Since GLRM requires Y matrix to have fully expanded ncols parms._nv = _parms._k; parms._transform = _parms._transform; parms._svd_method = _parms._svd_method; parms._max_iterations = parms._svd_method == SVDParameters.Method.Randomized ? 
_parms._k : _parms._max_iterations; parms._seed = _parms._seed; parms._keep_u = true; parms._impute_missing = true; parms._save_v_frame = false; SVDModel svd = ModelCacheManager.get(parms); if (svd == null) svd = new SVD(parms, _job, true, model).trainModelNested(_rebalancedTrain); model._output._init_key = svd._key; // Ensure SVD centers align with adapted training frame cols assert svd._output._permutation.length == tinfo._permutation.length; for (int i = 0; i < tinfo._permutation.length; i++) assert svd._output._permutation[i] == tinfo._permutation[i]; centers_exp = ArrayUtils.transpose(svd._output._v); // Set X and Y appropriately given SVD of A = UDV' // a) Set Y = D^(1/2)V' double[] dsqrt = new double[_parms._k]; for (int i = 0; i < _parms._k; i++) { dsqrt[i] = Math.sqrt(svd._output._d[i]); ArrayUtils.mult(centers_exp[i], dsqrt[i]); // This gives one row of D^(1/2)V' } // b) Set X = UD^(1/2) = AVD^(-1/2) Frame uFrm = DKV.get(svd._output._u_key).get(); assert uFrm.numCols() == _parms._k; assert uFrm.isCompatible(dfrm); Frame fullFrm = (new Frame(uFrm)).add(dfrm); // Jam matrices together into frame [U,A,X,W] InitialXSVD xtsk = new InitialXSVD(dsqrt, _parms._k, _ncolA, _ncolX); xtsk.doAll(fullFrm); } else if (_parms._init == GlrmInitialization.PlusPlus) { // Run k-means++ and set Y = resulting cluster centers, X = indicator matrix of assignments KMeansModel.KMeansParameters parms = new KMeansModel.KMeansParameters(); parms._train = _parms._train; parms._ignored_columns = _parms._ignored_columns; parms._ignore_const_cols = _parms._ignore_const_cols; parms._score_each_iteration = _parms._score_each_iteration; parms._init = KMeans.Initialization.PlusPlus; parms._k = _parms._k; parms._max_iterations = _parms._max_iterations; parms._standardize = true; parms._seed = _parms._seed; parms._pred_indicator = true; KMeansModel km = ModelCacheManager.get(parms); if (km == null) km = new KMeans(parms, _job).trainModelNested(_rebalancedTrain); model._output._init_key = km._key; // Score only if clusters well-defined and closed-form solution does not exist double frob = frobenius2(km._output._centers_raw); if (frob != 0 && !Double.isNaN(frob) && !hasClosedForm(na_cnt)) { // Frame pred = km.score(_parms.train()); Log.info("Initializing X to matrix of weights inversely correlated with cluster distances"); InitialXKMeans xtsk = new InitialXKMeans(_parms, km, _ncolA, _ncolX); xtsk.doAll(dfrm); } // Permute cluster columns to align with dinfo, normalize nums, and expand out cats to indicator cols centers = ArrayUtils.permuteCols(km._output._centers_raw, tinfo.mapNames(km._output._names)); centers = transform(centers, tinfo._normSub, tinfo._normMul, tinfo._cats, tinfo._nums); centers_exp = expandCats(centers, tinfo); // expand categorical columns to N binary columns } else error("_init", "Initialization method " + _parms._init + " is undefined"); // If all centers are zero or any are NaN, initialize to standard Gaussian random matrix assert centers_exp != null && centers_exp.length == _parms._k && centers_exp[0].length == _ncolY : "Y must have " + _parms._k + " rows and " + _ncolY + " columns"; double frob = frobenius2(centers_exp); if (frob == 0 || Double.isNaN(frob)) { warn("_init", "Initialization failed.
Setting initial Y to standard normal random matrix instead"); centers_exp = ArrayUtils.gaussianArray(_parms._k, _ncolY); } // Project rows of Y into appropriate subspace for regularizer Random rand = RandomUtils.getRNG(_parms._seed); for (int i = 0; i < _parms._k; i++) centers_exp[i] = _parms._regularization_y.project(centers_exp[i], rand); return centers_exp; } // In case of quadratic loss and regularization, initialize closed form X = AY'(YY' + \gamma)^(-1) private void initialXClosedForm(DataInfo dinfo, Archetypes yt_arch, double[] normSub, double[] normMul) { Log.info("Initializing X = AY'(YY' + gamma I)^(-1) where A = training data"); double[][] ygram = ArrayUtils.formGram(yt_arch._archetypes); if (_parms._gamma_y > 0) { for (int i = 0; i < ygram.length; i++) ygram[i][i] += _parms._gamma_y; } CholeskyDecomposition yychol = regularizedCholesky(ygram, 10, false); if (!yychol.isSPD()) Log.warn("Initialization failed: (YY' + gamma I) is non-SPD. Keeping the earlier random initialization of X. " + "Results may be numerically unstable"); else { CholMulTask cmtsk = new CholMulTask(yychol, yt_arch, _ncolA, _ncolX, dinfo._cats, normSub, normMul); cmtsk.doAll(dinfo._adaptedFrame); } } // Stopping criteria private boolean isDone(GLRMModel model, int steps_in_row, double step) { if (stop_requested()) return true; // Stopped/cancelled // Stopped for running out of iterations if (model._output._iterations >= _parms._max_iterations) return true; if (model._output._updates >= _parms._max_updates) return true; // Stopped for falling below minimum step size if (step <= _parms._min_step_size) return true; // Stopped when enough steps and average decrease in objective per iteration < TOLERANCE return model._output._iterations > 10 && steps_in_row > 3 && Math.abs(model._output._avg_change_obj) < TOLERANCE; } // Regularized Cholesky decomposition using H2O implementation public Cholesky regularizedCholesky(Gram gram, int max_attempts) { int attempts = 0; double addedL2 = 0; // TODO: Should I report this to the user? Cholesky chol = gram.cholesky(null); while (!chol.isSPD() && attempts < max_attempts) { if (addedL2 == 0) addedL2 = 1e-5; else addedL2 *= 10; ++attempts; gram.addDiag(addedL2); // try to add L2 penalty to make the Gram SPD Log.info("Added L2 regularization = " + addedL2 + " to diagonal of Gram matrix"); gram.cholesky(chol); } if (!chol.isSPD()) throw new Gram.NonSPDMatrixException(); return chol; } public Cholesky regularizedCholesky(Gram gram) { return regularizedCholesky(gram, 10); } // Regularized Cholesky decomposition using JAMA implementation public CholeskyDecomposition regularizedCholesky(double[][] gram, int max_attempts, boolean throw_exception) { int attempts = 0; double addedL2 = 0; Matrix gmat = new Matrix(gram); double[] diag = new double[gram.length]; for (int i = 0; i < gram.length; i++) diag[i] = gmat.get(i, i); // remember the original diagonal (JAMA's Matrix wraps the backing array) CholeskyDecomposition chol = new CholeskyDecomposition(gmat); while (!chol.isSPD() && attempts < max_attempts) { if (addedL2 == 0) addedL2 = 1e-5; else addedL2 *= 10; ++attempts; for (int i = 0; i < gram.length; i++) gmat.set(i, i, diag[i] + addedL2); // add L2 penalty to the diagonal to make the Gram SPD Log.info("Added L2 regularization = " + addedL2 + " to diagonal of Gram matrix"); chol = new CholeskyDecomposition(gmat); } if (!chol.isSPD() && throw_exception) throw new Gram.NonSPDMatrixException(); return chol; } // Recover singular values and eigenvectors of XY // However, they are only saved if the user has specified _parms._recover_svd to be true.
// Otherwise, recoverSVD only recovers enough information to compute // the variance metrics specified in PUBDEV-3501. public void recoverSVD(GLRMModel model, DataInfo xinfo, DataInfo dinfo) { // NOTE: Gram computes X'X/n where n = nrow(A) = number of rows in training set GramTask xgram = new GramTask(_job._key, xinfo).doAll(xinfo._adaptedFrame); GramTask dgram = new GramTask(_job._key, dinfo).doAll(dinfo._adaptedFrame); Cholesky xxchol = regularizedCholesky(xgram._gram); long nobs = xgram._nobs; // R from QR decomposition of X = QR is upper triangular factor of Cholesky of X'X // Gram = X'X/n = LL' -> X'X = (L*sqrt(n))(L'*sqrt(n)) Matrix x_r = new Matrix(xxchol.getL()).transpose(); x_r = x_r.times(Math.sqrt(nobs)); Matrix yt = new Matrix(model._output._archetypes_raw.getY(true)); QRDecomposition yt_qr = new QRDecomposition(yt); Matrix yt_r = yt_qr.getR(); // S from QR decomposition of Y' = ZS Matrix rrmul = x_r.times(yt_r.transpose()); SingularValueDecomposition rrsvd = new SingularValueDecomposition(rrmul); // RS' = U \Sigma V' double[] sval = rrsvd.getSingularValues(); // get singular values as double array double dfcorr = (nobs > 1)?nobs/(nobs-1.0):1.0; // degrees-of-freedom correction n/(n-1) double oneOverNobsm1 = (nobs>1)?1.0/Math.sqrt(nobs-1):1.0; model._output._std_deviation = Arrays.copyOf(sval, sval.length); ArrayUtils.mult(model._output._std_deviation, oneOverNobsm1); model._output._total_variance = dfcorr * dgram._gram.diagSum(); if (Math.abs(model._output._std_deviation[0])>0) { double catScale = Math.sqrt(model._output._total_variance/ArrayUtils.l2norm2(model._output._std_deviation)); ArrayUtils.mult(model._output._std_deviation, catScale); } double[] vars = new double[model._output._std_deviation.length]; double[] prop_var = new double[vars.length]; double[] cum_var = new double[vars.length]; generateIPC(model._output._std_deviation, model._output._total_variance, vars, prop_var, cum_var); String[] colTypes = new String[_parms._k]; String[] colFormats = new String[_parms._k]; String[] colHeaders = new String[_parms._k]; String[] pcHeaders = new String[_parms._k]; // headers for the variance metrics, named to match PCA output Arrays.fill(colTypes, "double"); Arrays.fill(colFormats, "%5f"); for (int i = 0; i < colHeaders.length; i++) { colHeaders[i] = "Vec" + String.valueOf(i + 1); pcHeaders[i] = "pc" + String.valueOf(i + 1); } model._output._importance = new TwoDimTable("Importance of components", null, new String[]{"Standard deviation", "Proportion of Variance", "Cumulative Proportion"}, pcHeaders, colTypes, colFormats, "", new String[3][], new double[][]{model._output._std_deviation, prop_var, cum_var}); if (_parms._recover_svd) { // only save the eigenvectors, singular values if _recover_svd=true // Eigenvectors are V'Z' = (ZV)' Matrix eigvec = yt_qr.getQ().times(rrsvd.getV()); // Make TwoDimTable objects for prettier output model._output._eigenvectors_raw = eigvec.getArray(); // Singular values ordered in weakly descending order by algorithm model._output._singular_vals = rrsvd.getSingularValues(); assert model._output._names_expanded.length == model._output._eigenvectors_raw.length; model._output._eigenvectors = new TwoDimTable("Eigenvectors", null, model._output._names_expanded, colHeaders, colTypes, colFormats, "", new String[model._output._eigenvectors_raw.length][], model._output._eigenvectors_raw); } } private transient Frame _rebalancedTrain; @SuppressWarnings("ConstantConditions") // Method too complex for IntelliJ @Override public void computeImpl() {
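// In the notation of the GLRM paper, the objective minimized here is
//   obj(X, Y) = sum_{(i,j) observed} L_j(x_i * y_j, A_ij) + gamma_x * sum_i r_x(x_i) + gamma_y * sum_j r_y(y_j).
// The driver proceeds as: 0) initialize X and Y, 1)-2) alternately update X and Y with proximal gradient
// steps, 3) recompute the objective and adapt the step size, 4) save X, Y, and variance metrics to the output.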
GLRMModel model = null; DataInfo dinfo = null, xinfo = null, tinfo = null, tempinfo = null; Frame fr = null; boolean overwriteX = false; try { init(true); // Initialize + Validate parameters if (error_count() > 0) throw new IllegalArgumentException("Found validation errors: " + validationErrors()); // The model to be built model = new GLRMModel(dest(), _parms, new GLRMModel.GLRMOutput(GLRM.this)); model.delete_and_lock(_job); _rebalancedTrain = new Frame(_train); // Save adapted frame info for scoring later tinfo = new DataInfo(_train, _valid, 0, true, _parms._transform, DataInfo.TransformType.NONE, false, false, false, /* weights */ false, /* offset */ false, /* fold */ false); DKV.put(tinfo._key, tinfo); tempinfo = new DataInfo(_train, null, 0, true, _parms._transform, DataInfo.TransformType.NONE, false, false, false, /* weights */ false, /* offset */ false, /* fold */ false); correctForBinaryLoss(tinfo); model._output._permutation = tinfo._permutation; model._output._nnums = tinfo._nums; model._output._ncats = tinfo._cats; model._output._catOffsets = tinfo._catOffsets; int[] numLevels = tinfo._adaptedFrame.cardinality(); // may have more numerical columns now since we may change binary columns with binary loss to numerical columns for (int colIndex = tinfo._cats; colIndex < _train.numCols(); colIndex++) { if (numLevels[colIndex] > -1) numLevels[colIndex] = -1; else break; // hit the numerical columns already; nothing more needs to be done here. } // prevents binary data columns from being expanded into two columns when the loss function is logistic if (error_count() > 0) throw new IllegalArgumentException("Found validation errors: " + validationErrors()); model._output._catOffsets = tinfo._catOffsets; model._output._names_expanded = tinfo.coefNames(); // Save training frame adaptation information for use in scoring later model._output._normSub = tinfo._normSub == null ? new double[tinfo._nums] : tinfo._normSub; if (tinfo._normMul == null) { model._output._normMul = new double[tinfo._nums]; Arrays.fill(model._output._normMul, 1.0); } else model._output._normMul = tinfo._normMul; // Save loss function for each column in adapted frame order assert _lossFunc != null && _lossFunc.length == _train.numCols(); model._output._lossFunc = new GlrmLoss[_lossFunc.length]; for (int i = 0; i < _lossFunc.length; i++) model._output._lossFunc[i] = _lossFunc[tinfo._permutation[i]]; long nobs = _train.numRows() * _train.numCols(); long na_cnt = 0; for (int i = 0; i < _train.numCols(); i++) na_cnt += _train.vec(i).naCnt(); model._output._nobs = nobs - na_cnt; // TODO: Should we count NAs? // 0) Initialize Y and X matrices // Jam A and X into a single frame for distributed computation // [A,X,W] A is read-only training data, X is matrix from prior iteration, W is working copy of X this iteration fr = new Frame(_train); Vec anyvec = fr.anyVec(); assert anyvec != null; for (int i = 0; i < _ncolX; i++) fr.add("xcol_" + i, anyvec.makeZero()); for (int i = 0; i < _ncolX; i++) fr.add("wcol_" + i, anyvec.makeZero()); dinfo = new DataInfo(/* train */ fr, /* validation */ null, /* nResponses */ 0, /* useAllFactorLevels */ true, /* pred. transform */ _parms._transform, /* resp. transform */ DataInfo.TransformType.NONE, /* skipMissing */ false, /* imputeMissing */ false, /* missingBucket */ false, /* weights */ false, /* offset */ false, /* fold */ false); DKV.put(dinfo._key, dinfo); int weightId = dinfo._weights ?
dinfo.weightChunkId() : -1; // Use closed form solution for X if quadratic loss and regularization _job.update(1, "Initializing X and Y matrices"); // One unit of work double[/*k*/][/*features*/] yinit = initialXY(tinfo, dinfo._adaptedFrame, model, na_cnt); // on normalized A // Store Y' for more efficient matrix ops (rows = features, cols = k rank) Archetypes yt = new Archetypes(ArrayUtils.transpose(yinit), true, tinfo._catOffsets, numLevels); Archetypes ytnew = yt; double yreg = _parms._regularization_y.regularize(yt._archetypes); // Set X to closed-form solution of ALS equation if possible for better accuracy if (!(_parms._init == GlrmInitialization.User && _parms._user_x != null) && hasClosedForm(na_cnt)) initialXClosedForm(dinfo, yt, model._output._normSub, model._output._normMul); // Compute initial objective function _job.update(1, "Computing initial objective function"); // One unit of work // Assume regularization on initial X is finite, else objective can be NaN if \gamma_x = 0 boolean regX = _parms._regularization_x != GlrmRegularizer.None && _parms._gamma_x != 0; ObjCalc objtsk = new ObjCalc(_parms, yt, _ncolA, _ncolX, tinfo._cats, model._output._normSub, model._output._normMul, model._output._lossFunc, weightId, regX); objtsk.doAll(dinfo._adaptedFrame); model._output._objective = objtsk._loss + _parms._gamma_x * objtsk._xold_reg + _parms._gamma_y * yreg; model._output._archetypes_raw = yt; model._output._iterations = 0; model._output._updates = 0; model._output._avg_change_obj = 2 * TOLERANCE; // Allow at least 1 iteration model._output._step_size = 0; // set to zero model.update(_job); // Update model in K/V store double step = _parms._init_step_size; // Initial step size int steps_in_row = 0; // Keep track of number of steps taken that decrease objective while (!isDone(model, steps_in_row, step)) { // One unit of work _job.update(1, "Iteration " + String.valueOf(model._output._iterations+1) + " of alternating minimization"); // TODO: Should step be divided by number of original or expanded (with 0/1 categorical) cols? 
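// Each pass below is one round of alternating proximal gradient descent. Schematically, with step size alpha:
//   X_new = prox_{alpha * gamma_x * r_x}(X - alpha * grad_X loss(X, Y))      (UpdateX)
//   Y_new = prox_{alpha * gamma_y * r_y}(Y - alpha * grad_Y loss(X_new, Y))  (UpdateY)
// A step is kept only if the objective decreased; the step size then grows by 5%, otherwise the update is
// discarded and the step shrinks, so the schedule behaves like a cheap backtracking line search.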
// 1) Update X matrix given fixed Y // find out how much time it takes to update x UpdateX xtsk = new UpdateX(_parms, yt, step/_ncolA, overwriteX, _ncolA, _ncolX, tinfo._cats, model._output._normSub, model._output._normMul, model._output._lossFunc, weightId); xtsk.doAll(dinfo._adaptedFrame); model._output._updates++; // 2) Update Y matrix given fixed X if (model._output._updates < _parms._max_updates) { // If max_updates is odd, we will terminate after the X update UpdateY ytsk = new UpdateY(_parms, yt, step/_ncolA, _ncolA, _ncolX, tinfo._cats, model._output._normSub, model._output._normMul, model._output._lossFunc, weightId); double[][] yttmp = ytsk.doAll(dinfo._adaptedFrame)._ytnew; ytnew = new Archetypes(yttmp, true, tinfo._catOffsets, numLevels); yreg = ytsk._yreg; model._output._updates++; } // 3) Compute average change in objective function objtsk = new ObjCalc(_parms, ytnew, _ncolA, _ncolX, tinfo._cats, model._output._normSub, model._output._normMul, model._output._lossFunc, weightId); objtsk.doAll(dinfo._adaptedFrame); double obj_new = objtsk._loss + _parms._gamma_x * xtsk._xreg + _parms._gamma_y * yreg; model._output._avg_change_obj = (model._output._objective - obj_new) / nobs; model._output._iterations++; // step = 1.0 / model._output._iterations; // Step size \alpha_k = 1/iters if (model._output._avg_change_obj > 0) { // Objective decreased this iteration yt = ytnew; model._output._archetypes_raw = ytnew; // Need full archetypes object for scoring model._output._objective = obj_new; step *= 1.05; steps_in_row = Math.max(1, steps_in_row+1); overwriteX = true; } else { // If objective increased, re-run with smaller step size step /= Math.max(1.5, -steps_in_row); steps_in_row = Math.min(0, steps_in_row-1); overwriteX = false; if (_parms._verbose) { Log.info("Iteration " + model._output._iterations + ": Objective increased to " + obj_new + "; reducing step size to " + step); _job.update(0,"Iteration " + model._output._iterations + ": Objective increased to " + obj_new + "; reducing step size to " + step); } } // Add to scoring history model._output._training_time_ms.add(System.currentTimeMillis()); model._output._history_step_size.add(step); model._output._history_objective.add(model._output._objective); model.update(_job); // Update model in K/V store } // 4) Save solution to model output // Save X frame for user reference later Vec[] xvecs = new Vec[_ncolX]; String[] xnames = new String[_ncolX]; if (overwriteX) { for (int i = 0; i < _ncolX; i++) { xvecs[i] = fr.vec(idx_xnew(i, _ncolA, _ncolX)); xnames[i] = "Arch" + String.valueOf(i + 1); } } else { for (int i = 0; i < _ncolX; i++) { xvecs[i] = fr.vec(idx_xold(i, _ncolA)); xnames[i] = "Arch" + String.valueOf(i + 1); } } model._output._representation_name = StringUtils.isNullOrEmpty(_parms._representation_name) ? 
"GLRMLoading_" + Key.rand() : _parms._representation_name; model._output._representation_key = Key.make(model._output._representation_name); Frame x = new Frame(model._output._representation_key, xnames, xvecs); xinfo = new DataInfo(x, null, 0, true, DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, false, false, false, /* weights */ false, /* offset */ false, /* fold */ false); DKV.put(x); DKV.put(xinfo); // add last step_size used model._output._step_size = step; // Add to scoring history model._output._history_step_size.add(step); // Transpose Y' to get original Y model._output._archetypes = yt.buildTable(model._output._names_expanded, false); recoverSVD(model, xinfo, tempinfo); // getting variance information here // Impute and compute error metrics on training/validation frame model._output._training_metrics = model.scoreMetricsOnly(_parms.train()); model._output._validation_metrics = model.scoreMetricsOnly(_parms.valid()); model._output._model_summary = createModelSummaryTable(model._output); model._output._scoring_history = createScoringHistoryTable(model._output); //no need to call this per iteration model.update(_job); } finally { List<Key<Vec>> keep = new ArrayList<>(); if (model != null) { Frame loadingFrm = DKV.getGet(model._output._representation_key); if (loadingFrm != null) for (Vec vec: loadingFrm.vecs()) keep.add(vec._key); model.unlock(_job); } if (tinfo != null) tinfo.remove(); if (dinfo != null) dinfo.remove(); if (xinfo != null) xinfo.remove(); if (tempinfo != null) tempinfo.remove(); // if (x != null && !_parms._keep_loading) x.delete(); // Clean up unused copy of X matrix if (fr != null) { if (overwriteX) { for (int i = 0; i < _ncolX; i++) fr.vec(idx_xold(i, _ncolA)).remove(); } else { for (int i = 0; i < _ncolX; i++) fr.vec(idx_xnew(i, _ncolA, _ncolX)).remove(); } } Scope.untrack(keep); } } /* This funciton will 1. change categorical binary columns using binary loss function into numerical columns. This involves performing the following variables: _nums, _cats, _catOffsets, _catMissing, _catNAFill, _permutation, _normMul, _normSub, _normMeans, _numOffsets If no logistic loss is specified, no action will be performed. 2. for numeric columns using logistic loss, it will prevent it from being transformed to say zero mean and unit variance columns. */ private void correctForBinaryLoss(DataInfo tinfo) { // set mean = 0 and mul = 1.0 for binary numeric columns using binary loss functions for (int index = 0; index < tinfo._nums; index++) { if (_lossFunc[tinfo._permutation[index+tinfo._cats]].isForBinary()) { // binary loss used on numeric columns if (tinfo._normMul != null) tinfo._normMul[index] = 1; if (tinfo._normSub != null) tinfo._normSub[index] = 0; } } // change binary categorical columns using binary loss functions to binary numerical columns if (!(_binaryColumnIndices == null) && (_binaryColumnIndices.size()>0)) { // change the number of categorical and numerical column counts. 
int binaryLossCols = _binaryColumnIndices.size(); // number of columns to change to numerics int numCatColumns = tinfo._cats; // store original categorical column number int numNumColumns = tinfo._nums; // store original numerical column number tinfo._cats -= binaryLossCols; // decrease the number of categorical columns tinfo._nums += binaryLossCols; // increase the number of numerical columns int[] catOffsetsTemp = new int[tinfo._cats+1]; // offset column indices for the 1-hot expanded values (includes enum-enum interaction) boolean[] catMissingTemp = new boolean[tinfo._cats]; // bucket for missing levels int[] catNAFillTemp = new int[tinfo._cats]; // majority class of each categorical col (or last bucket if _catMissing[i] is true) int[] permutationTemp = new int[tinfo._permutation.length]; // permutation array mapping input col indices to adaptedFrame int[] numOffsetsTemp = new int[tinfo._nums]; int[] cardinalities = _train.cardinality(); int[] currentCardinality = new int[tinfo._cats]; double[] normMulTemp = new double[tinfo._nums]; double[] normSubTemp = new double[tinfo._nums]; double[] numMeansTemp = new double[tinfo._nums]; int newColIndex = 0; for (int colIndex = 0; colIndex < numCatColumns; colIndex++) { // go through all categoricals if (!(_binaryColumnIndices.contains(tinfo._permutation[colIndex]))) { permutationTemp[newColIndex] = tinfo._permutation[colIndex]; catMissingTemp[newColIndex] = tinfo._catMissing[colIndex]; catNAFillTemp[newColIndex] = tinfo.catNAFill(colIndex); currentCardinality[newColIndex] = cardinalities[colIndex]; catOffsetsTemp[newColIndex+1] = catOffsetsTemp[newColIndex]+currentCardinality[newColIndex]; newColIndex++; } } numOffsetsTemp[0] = catOffsetsTemp[newColIndex]; for (int colIndex = 0; colIndex < binaryLossCols; colIndex++) { // set info for the new numerical binary columns permutationTemp[colIndex + newColIndex] = _binaryColumnIndices.get(colIndex); normMulTemp[colIndex] = 1.0; normSubTemp[colIndex] = 0.0; numMeansTemp[colIndex] = 0.0; if (colIndex > 0) numOffsetsTemp[colIndex] = numOffsetsTemp[colIndex-1]+1; } // copy over original numerical columns for (int colIndex = 0; colIndex < numNumColumns; colIndex++) { int newColumnIndex = colIndex + binaryLossCols; if (tinfo._normMul != null) { normMulTemp[newColumnIndex] = tinfo._normMul[colIndex]; } if (tinfo._normSub != null) { normSubTemp[newColumnIndex] = tinfo._normSub[colIndex]; } if (tinfo._numMeans != null) { numMeansTemp[newColumnIndex] = tinfo._numMeans[colIndex]; } numOffsetsTemp[newColumnIndex] = numOffsetsTemp[newColumnIndex-1]+1; int numColIndex = newColumnIndex + tinfo._cats; permutationTemp[numColIndex] = tinfo._permutation[numColIndex]; } // copy the changed arrays back to tinfo information tinfo._catOffsets = Arrays.copyOf(catOffsetsTemp, catOffsetsTemp.length); tinfo._catMissing = Arrays.copyOf(catMissingTemp, tinfo._cats); tinfo.setCatNAFill(Arrays.copyOf(catNAFillTemp, tinfo._cats)); tinfo._permutation = Arrays.copyOf(permutationTemp, tinfo._permutation.length); tinfo._numOffsets = Arrays.copyOf(numOffsetsTemp, tinfo._nums); if (tinfo._normMul != null) { tinfo._normMul = Arrays.copyOf(normMulTemp, tinfo._nums); } if (tinfo._normSub != null) { tinfo._normSub = Arrays.copyOf(normSubTemp, tinfo._nums); } if (tinfo._numMeans != null) { tinfo._numMeans = Arrays.copyOf(numMeansTemp, tinfo._nums); } _ncolY = _ncolY-binaryLossCols; // adjust for binary columns with binary loss changed to numerical columns } } private TwoDimTable createModelSummaryTable(GLRMModel.GLRMOutput output) { List<String>
colHeaders = new ArrayList<>(); List<String> colTypes = new ArrayList<>(); List<String> colFormat = new ArrayList<>(); // TODO: This causes overflow in R if too large // colHeaders.add("Number of Observed Entries"); colTypes.add("long"); colFormat.add("%d"); colHeaders.add("Number of Iterations"); colTypes.add("long"); colFormat.add("%d"); colHeaders.add("Final Step Size"); colTypes.add("double"); colFormat.add("%.5f"); colHeaders.add("Final Objective Value"); colTypes.add("double"); colFormat.add("%.5f"); TwoDimTable table = new TwoDimTable( "Model Summary", null, new String[1], colHeaders.toArray(new String[0]), colTypes.toArray(new String[0]), colFormat.toArray(new String[0]), ""); int row = 0; int col = 0; // table.set(row, col++, output._nobs); table.set(row, col++, output._iterations); table.set(row, col++, output._history_step_size.get(output._history_step_size.size() - 1)); table.set(row, col, output._objective); return table; } private TwoDimTable createScoringHistoryTable(GLRMModel.GLRMOutput output) { List<String> colHeaders = new ArrayList<>(); List<String> colTypes = new ArrayList<>(); List<String> colFormat = new ArrayList<>(); colHeaders.add("Timestamp"); colTypes.add("string"); colFormat.add("%s"); colHeaders.add("Duration"); colTypes.add("string"); colFormat.add("%s"); colHeaders.add("Iteration"); colTypes.add("long"); colFormat.add("%d"); colHeaders.add("Step Size"); colTypes.add("double"); colFormat.add("%.5f"); colHeaders.add("Objective"); colTypes.add("double"); colFormat.add("%.5f"); int rows = output._training_time_ms.size(); TwoDimTable table = new TwoDimTable( "Scoring History", null, new String[rows], colHeaders.toArray(new String[0]), colTypes.toArray(new String[0]), colFormat.toArray(new String[0]), ""); for (int row = 0; row<rows; row++) { int col = 0; assert(row < table.getRowDim()); assert(col < table.getColDim()); DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); table.set(row, col++, fmt.print(output._training_time_ms.get(row))); table.set(row, col++, PrettyPrint.msecs(output._training_time_ms.get(row) - _job.start_time(), true)); table.set(row, col++, row); table.set(row, col++, output._history_step_size.get(row)); table.set(row, col , output._history_objective.get(row)); } return table; } } @SuppressWarnings("ExternalizableWithoutPublicNoArgConstructor") protected static final class Archetypes extends Iced<Archetypes> { double[][] _archetypes; // Y has nrows = k (lower dim), ncols = n (features) boolean _transposed; // Is _archetypes = Y'? Used during model building for convenience. final int[] _catOffsets; final int[] _numLevels; // numLevels[i] = -1 if column i is not categorical Archetypes(double[][] y, boolean transposed, int[] catOffsets, int[] numLevels) { _archetypes = y; _transposed = transposed; _catOffsets = catOffsets; _numLevels = numLevels; // TODO: Check sum(cardinality[cardinality > 0]) + nnums == nfeatures() } public int rank() { return _transposed ? _archetypes[0].length : _archetypes.length; } public int nfeatures() { return _transposed ? _archetypes.length : _archetypes[0].length; } // If transpose = true, we want to return Y' public double[][] getY(boolean transpose) { return (transpose ^ _transposed) ? 
ArrayUtils.transpose(_archetypes) : _archetypes; } public TwoDimTable buildTable(String[] features, boolean transpose) { // Must pass in categorical column expanded feature names int rank = rank(); int nfeat = nfeatures(); assert features != null && features.length == nfeatures(); double[][] yraw = getY(transpose); if (transpose) { // rows = features (n), columns = archetypes (k) String[] colTypes = new String[rank]; String[] colFormats = new String[rank]; String[] colHeaders = new String[rank]; Arrays.fill(colTypes, "double"); Arrays.fill(colFormats, "%5f"); for (int i = 0; i < colHeaders.length; i++) colHeaders[i] = "Arch" + String.valueOf(i + 1); return new TwoDimTable("Archetypes", null, features, colHeaders, colTypes, colFormats, "", new String[yraw.length][], yraw); } else { // rows = archetypes (k), columns = features (n) String[] rowNames = new String[rank]; String[] colTypes = new String[nfeat]; String[] colFormats = new String[nfeat]; Arrays.fill(colTypes, "double"); Arrays.fill(colFormats, "%5f"); for (int i = 0; i < rowNames.length; i++) rowNames[i] = "Arch" + String.valueOf(i + 1); return new TwoDimTable("Archetypes", null, rowNames, features, colTypes, colFormats, "", new String[yraw.length][], yraw); } } // For j = 0 to number of numeric columns - 1 public int getNumCidx(int j) { return _catOffsets[_catOffsets.length-1]+j; } // For j = 0 to number of categorical columns - 1, and level = 0 to number of levels in categorical column - 1 public int getCatCidx(int j, int level) { int catColJLevel = _numLevels[j]; assert catColJLevel != 0 : "Number of levels in categorical column cannot be zero"; assert level >= 0 && level < catColJLevel : "Got level = " + level + " when expected integer in [0," + catColJLevel + ")"; return _catOffsets[j]+level; } protected final double getNum(int j, int k) { int cidx = getNumCidx(j); return _transposed ? _archetypes[cidx][k] : _archetypes[k][cidx]; } // Inner product x * y_j where y_j is numeric column j of Y protected final double lmulNumCol(double[] x, int j) { assert x != null && x.length == rank() : "x must be of length " + rank(); int cidx = getNumCidx(j); double prod = 0; if (_transposed) { for (int k = 0; k < rank(); k++) prod += x[k] * _archetypes[cidx][k]; } else { for (int k = 0; k < rank(); k++) prod += x[k] * _archetypes[k][cidx]; } return prod; } protected final double getCat(int j, int level, int k) { int cidx = getCatCidx(j, level); return _transposed ?
_archetypes[cidx][k] : _archetypes[k][cidx]; } // Extract Y_j the k by d_j block of Y corresponding to categorical column j // Note: d_j = number of levels in categorical column j protected final double[][] getCatBlock(int j) { int catColJLevel = _numLevels[j]; assert catColJLevel != 0 : "Number of levels in categorical column cannot be zero"; double[][] block = new double[rank()][catColJLevel]; if (_transposed) { for (int level = 0; level < catColJLevel; level++) { int cidx = getCatCidx(j,level); for (int k = 0; k < rank(); k++) block[k][level] = _archetypes[cidx][k]; } } else { for (int level = 0; level < catColJLevel; level++) { int cidx = getCatCidx(j,level); for (int k = 0; k < rank(); k++) block[k][level] = _archetypes[k][cidx]; } } return block; } // Vector-matrix product x * Y_j where Y_j is block of Y corresponding to categorical column j protected final double[] lmulCatBlock(double[] x, int j) { int catColJLevel = _numLevels[j]; assert catColJLevel != 0 : "Number of levels in categorical column cannot be zero"; assert x != null && x.length == rank() : "x must be of length " + rank(); double[] prod = new double[catColJLevel]; if (_transposed) { for (int level = 0; level < catColJLevel; level++) { int cidx = getCatCidx(j,level); for (int k = 0; k < rank(); k++) prod[level] += x[k] * _archetypes[cidx][k]; } } else { for (int level = 0; level < catColJLevel; level++) { int cidx = getCatCidx(j,level); for (int k = 0; k < rank(); k++) prod[level] += x[k] * _archetypes[k][cidx]; } } return prod; } } // In chunk, first _ncolA cols are A, next _ncolX cols are X protected static int idx_xold(int c, int ncolA) { return ncolA+c; } protected static int idx_xnew(int c, int ncolA, int ncolX) { return ncolA+ncolX+c; } // Initialize X to standard Gaussian random matrix projected into regularizer subspace private static class InitialXProj extends MRTask<InitialXProj> { GLRMParameters _parms; final int _ncolA; // Number of cols in training frame final int _ncolX; // Number of cols in X (k) InitialXProj(GLRMParameters parms, int ncolA, int ncolX) { _parms = parms; _ncolA = ncolA; _ncolX = ncolX; } @Override public void map(Chunk[] chks) { Random rand = RandomUtils.getRNG(0); for (int row = 0; row < chks[0]._len; row++) { rand.setSeed(_parms._seed + chks[0].start() + row); // global row ID determines the seed double[] xrow = ArrayUtils.gaussianVector(_ncolX, rand); xrow = _parms._regularization_x.project(xrow, rand); for (int c = 0; c < xrow.length; c++) { chks[_ncolA+c].set(row, xrow[c]); chks[_ncolA+_ncolX+c].set(row, xrow[c]); } } } } // Initialize X = UD, where U is m x k and D is a diagonal k x k matrix private static class InitialXSVD extends MRTask<InitialXSVD> { final double[] _diag; // Diagonal of D final int _ncolU; // Number of cols in U (k) final int _offX; // Column offset to X matrix final int _offW; // Column offset to W matrix InitialXSVD(double[] diag, int ncolU, int ncolA, int ncolX) { assert diag != null && diag.length == ncolU; _diag = diag; _ncolU = ncolU; _offX = ncolU + ncolA; _offW = _offX + ncolX; } @Override public void map(Chunk[] chks) { for (int row = 0; row < chks[0]._len; row++) { for (int c = 0; c < _ncolU; c++) { double ud = chks[c].atd(row) * _diag[c]; chks[_offX+c].set(row, ud); chks[_offW+c].set(row, ud); } } } } // Initialize X to matrix of indicator columns for cluster assignments, e.g. 
k = 4, cluster = 3 -> [0, 0, 1, 0] private static class InitialXKMeans extends MRTask<InitialXKMeans> { GLRMParameters _parms; KMeansModel _model; final int _ncolA; // Number of cols in training frame final int _ncolX; // Number of cols in X (k) InitialXKMeans(GLRMParameters parms, KMeansModel model, int ncolA, int ncolX) { _parms = parms; _model = model; _ncolA = ncolA; _ncolX = ncolX; } @Override public void map(Chunk[] chks) { double[] tmp = new double[_ncolA]; Random rand = RandomUtils.getRNG(0); for (int row = 0; row < chks[0]._len; row++) { // double preds[] = new double[_ncolX]; // double p[] = _model.score_indicator(chks, row, tmp, preds); double[] p = _model.score_ratio(chks, row, tmp); rand.setSeed(_parms._seed + chks[0].start() + row); //global row ID determines the seed // TODO: Should we restrict indicator cols to regularizer subspace? p = _parms._regularization_x.project(p, rand); for (int c = 0; c < p.length; c++) { chks[_ncolA+c].set(row, p[c]); chks[_ncolA+_ncolX+c].set(row, p[c]); } } } } //-------------------------------------------------------------------------------------------------------------------- // Update X step //-------------------------------------------------------------------------------------------------------------------- private static class UpdateX extends MRTask<UpdateX> { // Input GLRMParameters _parms; GlrmLoss[] _lossFunc; final double _alpha; // Step size divided by num cols in A final boolean _update; // Should we update X from working copy? final Archetypes _yt; // _yt = Y' (transpose of Y) final int _ncolA; // Number of cols in training frame final int _ncolX; // Number of cols in X (k) final int _ncats; // Number of categorical cols in training frame final double[] _normSub; // For standardizing training data final double[] _normMul; final int _weightId; // Output double _loss; // Loss evaluated on A - XY using new X (and current Y) double _xreg; // Regularization evaluated on new X UpdateX(GLRMParameters parms, Archetypes yt, double alpha, boolean update, int ncolA, int ncolX, int ncats, double[] normSub, double[] normMul, GlrmLoss[] lossFunc, int weightId) { assert yt != null && yt.rank() == ncolX; _parms = parms; _yt = yt; _lossFunc = lossFunc; _alpha = alpha; _update = update; _ncolA = ncolA; _ncolX = ncolX; // Info on A (cols 1 to ncolA of frame) assert ncats <= ncolA; _ncats = ncats; _weightId = weightId; _normSub = normSub; _normMul = normMul; } private Chunk chk_xold(Chunk[] chks, int c) { return chks[_ncolA + c]; } private Chunk chk_xnew(Chunk[] chks, int c) { return chks[_ncolA + _ncolX + c]; } @SuppressWarnings("ConstantConditions") // The method is too complex for IntelliJ @Override public void map(Chunk[] cs) { assert (_ncolA + 2*_ncolX) == cs.length; double[] a = new double[_ncolA]; double[] tgrad = new double[_ncolX]; // new gradient calculation with reduced memory allocation double[] u = new double[_ncolX]; Chunk chkweight = _weightId >= 0 ? 
cs[_weightId] : new C0DChunk(1, cs[0]._len); Random rand = RandomUtils.getRNG(0); _loss = _xreg = 0; double[] xy = null; double[] prod = null; if (_yt._numLevels[0] > 0) { xy = new double[_yt._numLevels[0]]; // the categorical column with the most levels always comes first, so this buffer fits every column prod = new double[_yt._numLevels[0]]; } for (int row = 0; row < cs[0]._len; row++) { rand.setSeed(_parms._seed + cs[0].start() + row); // global row ID determines the seed Arrays.fill(tgrad, 0.0); // reset this row's gradient accumulator // Additional user-specified weight on loss for this row double cweight = chkweight.atd(row); assert !Double.isNaN(cweight) : "User-specified weight cannot be NaN"; // Copy old working copy of X to current X if requested if (_update) { for (int k = 0; k < _ncolX; k++) chk_xold(cs, k).set(row, chk_xnew(cs, k).atd(row)); } // Compute gradient of objective at row // Categorical columns for (int j = 0; j < _ncats; j++) { a[j] = cs[j].atd(row); if (Double.isNaN(a[j])) continue; // Skip missing observations in row int catColJLevel = _yt._numLevels[j]; Arrays.fill(xy, 0, catColJLevel, 0); // reset xy before accumulating the sum // Calculate x_i * Y_j where Y_j is sub-matrix corresponding to categorical col j for (int level = 0; level < catColJLevel; level++) { for (int k = 0; k < _ncolX; k++) { xy[level] += chk_xold(cs, k).atd(row) * _yt.getCat(j, level, k); } } // Gradient wrt x_i is matrix product \grad L_{i,j}(x_i * Y_j, A_{i,j}) * Y_j' double[] weight = _lossFunc[j].mlgrad(xy, (int) a[j], prod, catColJLevel); if (_yt._transposed) { for (int c = 0; c < catColJLevel; c++) { int cidx = _yt.getCatCidx(j, c); double weights = cweight * weight[c]; double[] yArchetypes = _yt._archetypes[cidx]; for (int k = 0; k < _ncolX; k++) tgrad[k] += weights * yArchetypes[k]; } } else { for (int c = 0; c < catColJLevel; c++) { int cidx = _yt.getCatCidx(j, c); double weights = cweight * weight[c]; for (int k = 0; k < _ncolX; k++) tgrad[k] += weights * _yt._archetypes[k][cidx]; } } } // Numeric columns for (int j = _ncats; j < _ncolA; j++) { int js = j - _ncats; a[j] = cs[j].atd(row); if (Double.isNaN(a[j])) continue; // Skip missing observations in row // Inner product x_i * y_j double xy1 = 0; for (int k = 0; k < _ncolX; k++) xy1 += chk_xold(cs, k).atd(row) * _yt.getNum(js, k); // Sum over y_j weighted by gradient of loss \grad L_{i,j}(x_i * y_j, A_{i,j}) double weight = cweight * _lossFunc[j].lgrad(xy1, (a[j] - _normSub[js]) * _normMul[js]); for (int k = 0; k < _ncolX; k++) tgrad[k] += weight * _yt.getNum(js, k); } // Update row x_i of working copy with new values for (int k = 0; k < _ncolX; k++) { double xold = chk_xold(cs, k).atd(row); // Old value of x_i u[k] = xold - _alpha * tgrad[k]; } double[] xnew = _parms._regularization_x.rproxgrad(u, _alpha*_parms._gamma_x, rand); _xreg += _parms._regularization_x.regularize(xnew); for (int k = 0; k < _ncolX; k++) chk_xnew(cs, k).set(row, xnew[k]); // Compute loss function using new x_i double rowLoss = 0; // Categorical columns for (int j = 0; j < _ncats; j++) { if (Double.isNaN(a[j])) continue; // Skip missing observations in row multVecArrFast(xnew, prod, _yt, j); rowLoss += _lossFunc[j].mloss(prod, (int) a[j], _yt._numLevels[j]); } // Numeric columns for (int j = _ncats; j < _ncolA; j++) { if (Double.isNaN(a[j])) continue; // Skip missing observations in row int js = j - _ncats; double txy = _yt.lmulNumCol(xnew, js); rowLoss += _lossFunc[j].loss(txy, (a[j] - _normSub[js]) * _normMul[js]); } _loss += cweight * rowLoss; // apply the user-specified row weight to this row's loss only } } /* Same as ArrayUtils.multVecArr(), specialized here to avoid per-call allocations.
    /** Same as ArrayUtils.multVecArr(), but writes into a caller-provided buffer and reads the
     *  archetypes in place, avoiding a per-call allocation of the categorical block of Y. */
    private void multVecArrFast(double[] xnew, double[] xy, Archetypes yt, int j) {
      int catColJLevel = yt._numLevels[j];
      if (yt._transposed) {
        for (int level = 0; level < catColJLevel; level++) {
          int cidx = yt.getCatCidx(j, level);
          double[] yArchetypes = yt._archetypes[cidx];
          xy[level] = 0.0;
          for (int k = 0; k < _ncolX; k++)
            xy[level] += xnew[k] * yArchetypes[k];
        }
      } else {
        for (int level = 0; level < catColJLevel; level++) {
          int cidx = yt.getCatCidx(j, level);
          xy[level] = 0.0;
          for (int k = 0; k < _ncolX; k++)
            xy[level] += xnew[k] * yt._archetypes[k][cidx];
        }
      }
    }

    @Override public void reduce(UpdateX other) {
      _loss += other._loss;
      _xreg += other._xreg;
    }
  }

  private static class UpdateY extends MRTask<UpdateY> {
    // Input
    GLRMParameters _parms;
    GlrmLoss[] _lossFunc;
    final double _alpha;      // Step size divided by num cols in A
    final Archetypes _ytold;  // Old Y' matrix
    final int _ncolA;         // Number of cols in training frame
    final int _ncolX;         // Number of cols in X (k)
    final int _ncats;         // Number of categorical cols in training frame
    final double[] _normSub;  // For standardizing training data
    final double[] _normMul;
    final int _weightId;

    // Output
    double[][] _ytnew;        // New Y matrix
    double _yreg;             // Regularization evaluated on new Y

    UpdateY(GLRMParameters parms, Archetypes yt, double alpha, int ncolA, int ncolX, int ncats,
            double[] normSub, double[] normMul, GlrmLoss[] lossFunc, int weightId) {
      assert yt != null && yt.rank() == ncolX;
      _parms = parms;
      _lossFunc = lossFunc;
      _alpha = alpha;
      _ncolA = ncolA;
      _ncolX = ncolX;
      _ytold = yt;
      _yreg = 0;

      // Info on A (cols 1 to ncolA of frame)
      assert ncats <= ncolA;
      _ncats = ncats;
      _weightId = weightId;
      _normSub = normSub;
      _normMul = normMul;
    }

    private Chunk chk_xnew(Chunk[] chks, int c) { return chks[_ncolA + _ncolX + c]; }
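    // Unlike UpdateX, the Y update is spread across the MRTask lifecycle: map() accumulates the
    // loss gradient for each archetype row into _ytnew, reduce() sums the partial gradients
    // across nodes, and postGlobal() takes the proximal step y_j <- rproxgrad(y_j - alpha * grad_j).
    // Illustrative sketch of that last step, assuming an L1 regularizer, whose prox operator is
    // soft-thresholding:
    //
    //   double t = alpha * gamma_y;
    //   for (int k = 0; k < u.length; k++)
    //     ynew[k] = Math.signum(u[k]) * Math.max(Math.abs(u[k]) - t, 0);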
    @Override public void map(Chunk[] cs) {
      assert (_ncolA + 2 * _ncolX) == cs.length;
      _ytnew = new double[_ytold.nfeatures()][_ncolX];
      Chunk chkweight = _weightId >= 0 ? cs[_weightId] : new C0DChunk(1, cs[0]._len);

      double[] xy = null;
      double[] grad = null;
      if (_ytold._numLevels[0] > 0) {
        xy = new double[_ytold._numLevels[0]];
        grad = new double[_ytold._numLevels[0]];
      }

      // Categorical columns
      for (int j = 0; j < _ncats; j++) {
        int catColJLevel = _ytold._numLevels[j];

        // Compute gradient of objective at column
        for (int row = 0; row < cs[0]._len; row++) {
          double a = cs[j].atd(row);
          if (Double.isNaN(a)) continue;  // Skip missing observations in column
          double cweight = chkweight.atd(row);
          assert !Double.isNaN(cweight) : "User-specified weight cannot be NaN";

          // Calculate x_i * Y_j where Y_j is the sub-matrix corresponding to categorical col j
          Arrays.fill(xy, 0.0);
          for (int level = 0; level < catColJLevel; level++) {
            for (int k = 0; k < _ncolX; k++) {
              xy[level] += chk_xnew(cs, k).atd(row) * _ytold.getCat(j, level, k);
            }
          }

          // Gradient for level p is x_i weighted by \grad_p L_{i,j}(x_i * Y_j, A_{i,j})
          double[] weight = _lossFunc[j].mlgrad(xy, (int) a, grad, catColJLevel);
          for (int level = 0; level < catColJLevel; level++) {
            for (int k = 0; k < _ncolX; k++)
              _ytnew[_ytold.getCatCidx(j, level)][k] += cweight * weight[level] * chk_xnew(cs, k).atd(row);
          }
        }
      }

      // Numeric columns
      for (int j = _ncats; j < _ncolA; j++) {
        int js = j - _ncats;
        int yidx = _ytold.getNumCidx(js);

        // Compute gradient of objective at column
        for (int row = 0; row < cs[0]._len; row++) {
          double a = cs[j].atd(row);
          if (Double.isNaN(a)) continue;  // Skip missing observations in column

          // Additional user-specified weight on loss for this row
          double cweight = chkweight.atd(row);
          assert !Double.isNaN(cweight) : "User-specified weight cannot be NaN";

          // Inner product x_i * y_j
          double txy = 0;
          for (int k = 0; k < _ncolX; k++)
            txy += chk_xnew(cs, k).atd(row) * _ytold.getNum(js, k);

          // Sum over x_i weighted by gradient of loss \grad L_{i,j}(x_i * y_j, A_{i,j})
          double weight = cweight * _lossFunc[j].lgrad(txy, (a - _normSub[js]) * _normMul[js]);
          for (int k = 0; k < _ncolX; k++)
            _ytnew[yidx][k] += weight * chk_xnew(cs, k).atd(row);
        }
      }
    }

    @Override public void reduce(UpdateY other) {
      ArrayUtils.add(_ytnew, other._ytnew);
    }

    @Override protected void postGlobal() {
      assert _ytnew.length == _ytold.nfeatures() && _ytnew[0].length == _ytold.rank();
      Random rand = RandomUtils.getRNG(_parms._seed);

      // Compute new y_j values using proximal gradient
      for (int j = 0; j < _ytnew.length; j++) {
        // u must be allocated fresh for every j: rproxgrad may return its input array,
        // so a shared buffer would alias rows of _ytnew
        double[] u = new double[_ytnew[0].length];
        for (int k = 0; k < _ytnew[0].length; k++)
          u[k] = _ytold._archetypes[j][k] - _alpha * _ytnew[j][k];
        _ytnew[j] = _parms._regularization_y.rproxgrad(u, _alpha * _parms._gamma_y, rand);
        _yreg += _parms._regularization_y.regularize(_ytnew[j]);
      }
    }
  }
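  // The full objective minimized by alternating the two updates above is (using the notation of
  // the GLRM paper, with per-row weights w_i):
  //   obj = sum_{i,j} w_i * L_j(x_i * y_j, A_{i,j}) + gamma_x * sum_i r_x(x_i) + gamma_y * sum_j r_y(y_j)
  // ObjCalc below evaluates the loss term (and, optionally, the r_x term over the old X) in a
  // single pass over the [A, X, W] frame.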
  // Calculate the sum over the loss function in the optimization objective
  private static class ObjCalc extends MRTask<ObjCalc> {
    // Input
    GLRMParameters _parms;
    GlrmLoss[] _lossFunc;
    final Archetypes _yt;     // _yt = Y' (transpose of Y)
    final int _ncolA;         // Number of cols in training frame
    final int _ncolX;         // Number of cols in X (k)
    final int _ncats;         // Number of categorical cols in training frame
    final double[] _normSub;  // For standardizing training data
    final double[] _normMul;
    final int _weightId;
    final boolean _regX;      // Should I calculate regularization of (old) X matrix?

    // Output
    double _loss;             // Loss evaluated on A - XY using new X (and current Y)
    double _xold_reg;         // Regularization evaluated on old X

    ObjCalc(GLRMParameters parms, Archetypes yt, int ncolA, int ncolX, int ncats, double[] normSub,
            double[] normMul, GlrmLoss[] lossFunc, int weightId) {
      this(parms, yt, ncolA, ncolX, ncats, normSub, normMul, lossFunc, weightId, false);
    }
    ObjCalc(GLRMParameters parms, Archetypes yt, int ncolA, int ncolX, int ncats, double[] normSub,
            double[] normMul, GlrmLoss[] lossFunc, int weightId, boolean regX) {
      assert yt != null && yt.rank() == ncolX;
      assert ncats <= ncolA;
      _parms = parms;
      _yt = yt;
      _lossFunc = lossFunc;
      _ncolA = ncolA;
      _ncolX = ncolX;
      _ncats = ncats;
      _regX = regX;
      _weightId = weightId;
      _normSub = normSub;
      _normMul = normMul;
    }

    private Chunk chk_xnew(Chunk[] chks, int c) { return chks[_ncolA + _ncolX + c]; }

    @SuppressWarnings("ConstantConditions")  // The method is too complex
    @Override public void map(Chunk[] cs) {
      assert (_ncolA + 2 * _ncolX) == cs.length;
      Chunk chkweight = _weightId >= 0 ? cs[_weightId] : new C0DChunk(1, cs[0]._len);
      _loss = _xold_reg = 0;
      double[] xrow = null;
      double[] xy = null;
      if (_yt._numLevels[0] > 0)  // only allocate xy when there are categorical columns
        xy = new double[_yt._numLevels[0]];  // the column with the most categorical levels is always first
      if (_regX)  // allocate memory only if necessary
        xrow = new double[_ncolX];

      for (int row = 0; row < cs[0]._len; row++) {
        // Additional user-specified weight on loss for this row
        double cweight = chkweight.atd(row);
        assert !Double.isNaN(cweight) : "User-specified weight cannot be NaN";
        double rowLoss = 0;

        // Categorical columns
        for (int j = 0; j < _ncats; j++) {
          double a = cs[j].atd(row);
          if (Double.isNaN(a)) continue;  // Skip missing observations in row
          int catColJLevel = _yt._numLevels[j];
          Arrays.fill(xy, 0, catColJLevel, 0);  // reset before the next accumulation

          // Calculate x_i * Y_j where Y_j is the sub-matrix corresponding to categorical col j
          for (int level = 0; level < catColJLevel; level++) {
            for (int k = 0; k < _ncolX; k++) {
              xy[level] += chk_xnew(cs, k).atd(row) * _yt.getCat(j, level, k);
            }
          }
          rowLoss += _lossFunc[j].mloss(xy, (int) a, catColJLevel);
        }

        // Numeric columns
        for (int j = _ncats; j < _ncolA; j++) {
          double a = cs[j].atd(row);
          if (Double.isNaN(a)) continue;  // Skip missing observations in row

          // Inner product x_i * y_j
          double txy = 0;
          int js = j - _ncats;
          for (int k = 0; k < _ncolX; k++)
            txy += chk_xnew(cs, k).atd(row) * _yt.getNum(js, k);
          rowLoss += _lossFunc[j].loss(txy, (a - _normSub[js]) * _normMul[js]);
        }
        _loss += cweight * rowLoss;  // weight applies only to this row's loss contribution

        // Calculate regularization term for old X if requested
        if (_regX) {
          int idx = 0;
          for (int j = _ncolA; j < _ncolA + _ncolX; j++) {
            xrow[idx] = cs[j].atd(row);
            idx++;
          }
          assert idx == _ncolX;
          _xold_reg += _parms._regularization_x.regularize(xrow);
        }
      }
    }

    @Override public void reduce(ObjCalc other) {
      _loss += other._loss;
      _xold_reg += other._xold_reg;
    }
  }
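  // For quadratic loss with no X regularization, the X step has the closed form
  // X = (AY')D^(-1) with D = YY'. A hedged dense-matrix sketch with Jama (the same library used
  // below), assuming D is symmetric positive definite:
  //
  //   Matrix D = Y.times(Y.transpose());                          // k x k Gram matrix
  //   CholeskyDecomposition chol = new CholeskyDecomposition(D);
  //   Matrix X = chol.solve(Y.times(A.transpose())).transpose();  // solves D * X' = Y * A'
  //
  // CholMulTask below performs the same solve row-by-row over H2O Chunks, forming each row of
  // AY' on the fly so the dense A is never materialized.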
  // Solves XD = AY' for X, where A is m x n, Y is k x n, D is k x k, and m >> n > k
  // The resulting matrix X = (AY')D^(-1) will have dimensions m x k
  private static class CholMulTask extends MRTask<CholMulTask> {
    final Archetypes _yt;     // _yt = Y' (transpose of Y)
    final int _ncolA;         // Number of cols in training frame
    final int _ncolX;         // Number of cols in X (k)
    final int _ncats;         // Number of categorical cols in training frame
    final double[] _normSub;  // For standardizing training data
    final double[] _normMul;
    CholeskyDecomposition _chol;  // Cholesky decomposition of D = D', since we solve D'X' = DX' = AY'

    CholMulTask(CholeskyDecomposition chol, Archetypes yt, int ncolA, int ncolX, int ncats,
                double[] normSub, double[] normMul) {
      assert yt != null && yt.rank() == ncolX;
      assert ncats <= ncolA;
      _yt = yt;
      _ncolA = ncolA;
      _ncolX = ncolX;
      _ncats = ncats;
      _chol = chol;
      _normSub = normSub;
      _normMul = normMul;
    }

    // [A,X,W]: A is read-only training data, X is the left matrix in the A = XY decomposition,
    // and W is a working copy of X
    @Override public void map(Chunk[] cs) {
      assert (_ncolA + 2 * _ncolX) == cs.length;
      double[] xrow = new double[_ncolX];

      for (int row = 0; row < cs[0]._len; row++) {
        // 1) Compute a single row of AY'
        for (int k = 0; k < _ncolX; k++) {
          // Categorical columns
          double x = 0;
          for (int d = 0; d < _ncats; d++) {
            double a = cs[d].atd(row);
            if (Double.isNaN(a)) continue;
            x += _yt.getCat(d, (int) a, k);
          }

          // Numeric columns
          for (int d = _ncats; d < _ncolA; d++) {
            int ds = d - _ncats;
            double a = cs[d].atd(row);
            if (Double.isNaN(a)) continue;
            x += (a - _normSub[ds]) * _normMul[ds] * _yt.getNum(ds, k);
          }
          xrow[k] = x;
        }

        // 2) Cholesky solve for a single row of X
        // _chol.solve(xrow);
        Matrix tmp = _chol.solve(new Matrix(new double[][] {xrow}).transpose());
        xrow = tmp.getColumnPackedCopy();

        // 3) Save the row of solved values into X (and copy W = X)
        int i = 0;
        for (int d = _ncolA; d < _ncolA + _ncolX; d++) {
          cs[d].set(row, xrow[i]);
          cs[d + _ncolX].set(row, xrow[i++]);
        }
        assert i == xrow.length;
      }
    }
  }
}
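// Hedged usage sketch (mirrors the pattern used in H2O's Java tests; the frame variable and
// parameter values here are illustrative, not prescribed by this class):
//
//   GLRMParameters parms = new GLRMParameters();
//   parms._train = trainFrame._key;        // a Frame already loaded into the H2O cluster
//   parms._k = 3;                          // rank of the decomposition A ~ XY
//   parms._loss = GlrmLoss.Quadratic;
//   parms._max_iterations = 500;
//   GLRMModel model = new GLRM(parms).trainModel().get();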