package hex.util;
import Jama.EigenvalueDecomposition;
import Jama.Matrix;
import hex.DataInfo;
import hex.FrameTask;
import hex.ToEigenVec;
import hex.gram.Gram;
import water.DKV;
import water.Job;
import water.Key;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.ArrayUtils;
public class LinearAlgebraUtils {
/*
* Forward substitution: Solve Lx = b for x with L = lower triangular matrix, b = real vector
*/
public static double[] forwardSolve(double[][] L, double[] b) {
assert L != null && L.length == L[0].length && L.length == b.length;
double[] res = new double[b.length];
for(int i = 0; i < b.length; i++) {
res[i] = b[i];
for(int j = 0; j < i; j++)
res[i] -= L[i][j] * res[j];
res[i] /= L[i][i];
}
return res;
}
/*
* Impute missing values and transform numeric value x in col of dinfo._adaptedFrame
*/
private static double modifyNumeric(double x, int col, DataInfo dinfo) {
double y = (Double.isNaN(x) && dinfo._imputeMissing) ? dinfo._numMeans[col] : x; // Impute missing value with mean
if (dinfo._normSub != null && dinfo._normMul != null) // Transform x if requested
y = (y - dinfo._normSub[col]) * dinfo._normMul[col];
return y;
}
/*
* Return row with categoricals expanded in array tmp
*/
public static double[] expandRow(double[] row, DataInfo dinfo, double[] tmp, boolean modify_numeric) {
// Categorical columns
int cidx;
for(int col = 0; col < dinfo._cats; col++) {
if (Double.isNaN(row[col])) {
if (dinfo._imputeMissing)
cidx = dinfo.catNAFill()[col];
else if (!dinfo._catMissing[col])
continue; // Skip if entry missing and no NA bucket. All indicators will be zero.
else
cidx = dinfo._catOffsets[col+1]-1; // Otherwise, missing value turns into extra (last) factor
} else {
if ((dinfo._catOffsets[col + 1] - dinfo._catOffsets[col]) == 1)
cidx = dinfo.getCategoricalId(col, 0);
else
cidx = dinfo.getCategoricalId(col, (int) row[col]);
}
if (((dinfo._catOffsets[col+1]-dinfo._catOffsets[col]) == 1) && cidx >=0) // binary data here, no column expansion, copy data
tmp[cidx] = row[col];
else if(cidx >= 0) tmp[cidx] = 1;
}
// Numeric columns
int chk_cnt = dinfo._cats;
int exp_cnt = dinfo.numStart();
for(int col = 0; col < dinfo._nums; col++) {
// Only do imputation and transformation if requested
tmp[exp_cnt] = modify_numeric ? modifyNumeric(row[chk_cnt], col, dinfo) : row[chk_cnt];
exp_cnt++; chk_cnt++;
}
return tmp;
}
/**
* Computes B = XY where X is n by k and Y is k by p, saving result in new vecs
* Input: dinfo = X (large frame) with dinfo._adaptedFrame passed to doAll
* yt = Y' = transpose of Y (small matrix)
* Output: XY (large frame) is n by p
*/
public static class BMulTask extends FrameTask<BMulTask> {
final double[][] _yt; // _yt = Y' (transpose of Y)
public BMulTask(Key<Job> jobKey, DataInfo dinfo, double[][] yt) {
super(jobKey, dinfo);
_yt = yt;
}
@Override protected void processRow(long gid, DataInfo.Row row, NewChunk[] outputs) {
for(int p = 0; p < _yt.length; p++) {
double x = row.innerProduct(_yt[p]);
outputs[p].addNum(x);
}
}
}
/**
* Computes B = XY where X is n by k and Y is k by p, saving result in same frame
* Input: [X,B] (large frame) passed to doAll, where we write to B
* yt = Y' = transpose of Y (small matrix)
* ncolX = number of columns in X
*/
public static class BMulInPlaceTask extends MRTask<BMulInPlaceTask> {
final DataInfo _xinfo; // Info for frame X
final double[][] _yt; // _yt = Y' (transpose of Y)
final int _ncolX; // Number of cols in X
public BMulInPlaceTask(DataInfo xinfo, double[][] yt) {
assert yt != null && yt[0].length == numColsExp(xinfo._adaptedFrame,true);
_xinfo = xinfo;
_ncolX = xinfo._adaptedFrame.numCols();
_yt = yt;
}
public BMulInPlaceTask(DataInfo xinfo, double[][] yt, int nColsExp) {
assert yt != null && yt[0].length == nColsExp;
_xinfo = xinfo;
_ncolX = xinfo._adaptedFrame.numCols();
_yt = yt;
}
@Override public void map(Chunk[] cs) {
assert cs.length == _ncolX + _yt.length;
// Copy over only X frame chunks
Chunk[] xchk = new Chunk[_ncolX];
DataInfo.Row xrow = _xinfo.newDenseRow();
System.arraycopy(cs,0,xchk,0,_ncolX);
double sum;
for(int row = 0; row < cs[0]._len; row++) {
// Extract row of X
_xinfo.extractDenseRow(xchk, row, xrow);
if (xrow.isBad()) continue;
int bidx = _ncolX;
for (double[] ps : _yt ) {
// Inner product of X row with Y column (Y' row)
sum = xrow.innerProduct(ps);
cs[bidx].set(row, sum); // Save inner product to B
bidx++;
}
assert bidx == cs.length;
}
}
}
/**
* Computes A'Q where A is n by p and Q is n by k
* Input: [A,Q] (large frame) passed to doAll
* Output: atq = A'Q (small matrix) is \tilde{p} by k where \tilde{p} = number of cols in A with categoricals expanded
*/
public static class SMulTask extends MRTask<SMulTask> {
final DataInfo _ainfo; // Info for frame A
final int _ncolA; // Number of cols in A
final int _ncolExp; // Number of cols in A with categoricals expanded
final int _ncolQ; // Number of cols in Q
public double[][] _atq; // Output: A'Q is p_exp by k, where p_exp = number of cols in A with categoricals expanded
public SMulTask(DataInfo ainfo, int ncolQ) {
_ainfo = ainfo;
_ncolA = ainfo._adaptedFrame.numCols();
_ncolExp = numColsExp(ainfo._adaptedFrame,true);
_ncolQ = ncolQ;
}
public SMulTask(DataInfo ainfo, int ncolQ, int ncolExp) {
_ainfo = ainfo;
_ncolA = ainfo._adaptedFrame.numCols();
_ncolExp = ncolExp; // when call from GLRM or PCA
_ncolQ = ncolQ;
}
@Override public void map(Chunk cs[]) {
assert (_ncolA + _ncolQ) == cs.length;
_atq = new double[_ncolExp][_ncolQ]; // not okay to share.
for(int k = _ncolA; k < (_ncolA + _ncolQ); k++) {
// Categorical columns
int cidx;
for(int p = 0; p < _ainfo._cats; p++) {
for(int row = 0; row < cs[0]._len; row++) {
if(cs[p].isNA(row) && _ainfo._skipMissing) continue;
double q = cs[k].atd(row);
double a = cs[p].atd(row);
if (Double.isNaN(a)) {
if (_ainfo._imputeMissing)
cidx = _ainfo.catNAFill()[p];
else if (!_ainfo._catMissing[p])
continue; // Skip if entry missing and no NA bucket. All indicators will be zero.
else
cidx = _ainfo._catOffsets[p+1]-1; // Otherwise, missing value turns into extra (last) factor
} else
cidx = _ainfo.getCategoricalId(p, (int)a);
if(cidx >= 0) _atq[cidx][k-_ncolA] += q; // Ignore categorical levels outside domain
}
}
// Numeric columns
int pnum = 0;
int pexp = _ainfo.numStart();
for(int p = _ainfo._cats; p < _ncolA; p++) {
for(int row = 0; row < cs[0]._len; row++) {
if(cs[p].isNA(row) && _ainfo._skipMissing) continue;
double q = cs[k].atd(row);
double a = cs[p].atd(row);
a = modifyNumeric(a, pnum, _ainfo);
_atq[pexp][k-_ncolA] += q * a;
}
pexp++; pnum++;
}
assert pexp == _atq.length;
}
}
@Override public void reduce(SMulTask other) {
ArrayUtils.add(_atq, other._atq);
}
}
/**
* Get R = L' from Cholesky decomposition Y'Y = LL' (same as R from Y = QR)
* @param jobKey Job key for Gram calculation
* @param yinfo DataInfo for Y matrix
* @param transpose Should result be transposed to get L?
* @return L or R matrix from Cholesky of Y Gram
*/
public static double[][] computeR(Key<Job> jobKey, DataInfo yinfo, boolean transpose, double[][] xx) {
// Calculate Cholesky of Y Gram to get R' = L matrix
Gram.GramTask gtsk = new Gram.GramTask(jobKey, yinfo); // Gram is Y'Y/n where n = nrow(Y)
gtsk.doAll(yinfo._adaptedFrame);
Gram.Cholesky chol = gtsk._gram.cholesky(null); // If Y'Y = LL' Cholesky, then R = L'
double[][] L = chol.getL();
ArrayUtils.mult(L, Math.sqrt(gtsk._nobs)); // Must scale since Cholesky of Y'Y/n where nobs = nrow(Y)
return transpose ? L : ArrayUtils.transpose(L);
}
/**
* Solve for Q from Y = QR factorization and write into new frame
* @param jobKey Job key for Gram calculation
* @param yinfo DataInfo for Y matrix
* @param ywfrm Input frame [Y,W] where we write into W
* @return l2 norm of Q - W, where W is old matrix in frame, Q is computed factorization
*/
public static double computeQ(Key<Job> jobKey, DataInfo yinfo, Frame ywfrm, double[][] xx) {
double[][] cholL = computeR(jobKey, yinfo, true, xx);
ForwardSolve qrtsk = new ForwardSolve(yinfo, cholL);
qrtsk.doAll(ywfrm);
return qrtsk._sse; // \sum (Q_{i,j} - W_{i,j})^2
}
/**
* Solve for Q from Y = QR factorization and write into Y frame
* @param jobKey Job key for Gram calculation
* @param yinfo DataInfo for Y matrix
*/
public static void computeQInPlace(Key<Job> jobKey, DataInfo yinfo) {
double[][] cholL = computeR(jobKey, yinfo, true, null);
ForwardSolveInPlace qrtsk = new ForwardSolveInPlace(yinfo, cholL);
qrtsk.doAll(yinfo._adaptedFrame);
}
/**
* Given lower triangular L, solve for Q in QL' = A (LQ' = A') using forward substitution
* Dimensions: A is n by p, Q is n by p, R = L' is p by p
* Input: [A,Q] (large frame) passed to doAll, where we write to Q
*/
public static class ForwardSolve extends MRTask<ForwardSolve> {
final DataInfo _ainfo; // Info for frame A
final int _ncols; // Number of cols in A and in Q
final double[][] _L;
public double _sse; // Output: Sum-of-squared difference between old and new Q
public ForwardSolve(DataInfo ainfo, double[][] L) {
assert L != null && L.length == L[0].length && L.length == ainfo._adaptedFrame.numCols();
_ainfo = ainfo;
_ncols = ainfo._adaptedFrame.numCols();
_L = L;
_sse = 0;
}
@Override public void map(Chunk cs[]) {
assert 2 * _ncols == cs.length;
// Copy over only A frame chunks
Chunk[] achks = new Chunk[_ncols];
System.arraycopy(cs,0,achks,0,_ncols);
for(int row = 0; row < cs[0]._len; row++) {
// 1) Extract single expanded row of A
DataInfo.Row arow = _ainfo.newDenseRow();
_ainfo.extractDenseRow(achks, row, arow);
if (arow.isBad()) continue;
double[] aexp = arow.expandCats();
// 2) Solve for single row of Q using forward substitution
double[] qrow = forwardSolve(_L, aexp);
// 3) Save row of solved values into Q
int i = 0;
for(int d = _ncols; d < 2 * _ncols; d++) {
double qold = cs[d].atd(row);
double diff = qrow[i] - qold;
_sse += diff * diff; // Calculate SSE between Q_new and Q_old
cs[d].set(row, qrow[i++]);
}
assert i == qrow.length;
}
}
}
/**
* Given lower triangular L, solve for Q in QL' = A (LQ' = A') using forward substitution
* Dimensions: A is n by p, Q is n by p, R = L' is p by p
* Input: A (large frame) passed to doAll, where we overwrite each row of A with its row of Q
*/
public static class ForwardSolveInPlace extends MRTask<ForwardSolveInPlace> {
final DataInfo _ainfo; // Info for frame A
final int _ncols; // Number of cols in A
final double[][] _L;
public ForwardSolveInPlace(DataInfo ainfo, double[][] L) {
assert L != null && L.length == L[0].length && L.length == ainfo._adaptedFrame.numCols();
_ainfo = ainfo;
_ncols = ainfo._adaptedFrame.numCols();
_L = L;
}
@Override public void map(Chunk cs[]) {
assert _ncols == cs.length;
// Copy over only A frame chunks
Chunk[] achks = new Chunk[_ncols];
System.arraycopy(cs,0,achks,0,_ncols);
for(int row = 0; row < cs[0]._len; row++) {
// 1) Extract single expanded row of A
DataInfo.Row arow = _ainfo.newDenseRow();
_ainfo.extractDenseRow(achks, row, arow);
if (arow.isBad()) continue;
double[] aexp = arow.expandCats();
// 2) Solve for single row of Q using forward substitution
double[] qrow = forwardSolve(_L, aexp);
assert qrow.length == _ncols;
// 3) Overwrite row of A with row of solved values Q
for(int d = 0; d < _ncols; d++)
cs[d].set(row, qrow[d]);
}
}
}
/** Number of columns with categoricals expanded.
* @return Number of columns with categoricals expanded into indicator columns */
public static int numColsExp(Frame fr, boolean useAllFactorLevels) {
final int uAFL = useAllFactorLevels ? 0 : 1;
int cols = 0;
for( Vec vec : fr.vecs() )
cols += (vec.isCategorical() && vec.domain() != null) ? vec.domain().length - uAFL : 1;
return cols;
}
static double[] multiple(double[] diagYY /*diagonal*/, int nTot, int nVars) {
int ny = diagYY.length;
for (int i = 0; i < ny; i++) {
diagYY[i] *= nTot;
}
double[][] uu = new double[ny][ny];
for (int i = 0; i < ny; i++) {
for (int j = 0; j < ny; j++) {
double yyij = i==j ? diagYY[i] : 0;
uu[i][j] = (yyij - diagYY[i] * diagYY[j] / nTot) / (nVars * Math.sqrt(diagYY[i] * diagYY[j]));
if (Double.isNaN(uu[i][j])) {
uu[i][j] = 0;
}
}
}
EigenvalueDecomposition eigen = new EigenvalueDecomposition(new Matrix(uu));
double[] eigenvalues = eigen.getRealEigenvalues();
double[][] eigenvectors = eigen.getV().getArray();
int maxIndex = ArrayUtils.maxIndex(eigenvalues);
return eigenvectors[maxIndex];
}
static class ProjectOntoEigenVector extends MRTask<ProjectOntoEigenVector> {
ProjectOntoEigenVector(double[] yCoord) { _yCoord = yCoord; }
final double[] _yCoord; //projection
@Override public void map(Chunk[] cs, NewChunk[] nc) {
for (int i=0;i<cs[0]._len;++i) {
if (cs[0].isNA(i)) {
nc[0].addNA();
} else {
int which = (int) cs[0].at8(i);
nc[0].addNum((float)_yCoord[which]); //make it more reproducible by casting to float
}
}
}
}
public static Vec toEigen(Vec src) {
Frame train = new Frame(Key.<Frame>make(), new String[]{"enum"}, new Vec[]{src});
DataInfo dinfo = new DataInfo(train, null, 0, true /*_use_all_factor_levels*/, DataInfo.TransformType.NONE,
DataInfo.TransformType.NONE, /* skipMissing */ false, /* imputeMissing */ true,
/* missingBucket */ false, /* weights */ false, /* offset */ false, /* fold */ false, /* intercept */ false);
DKV.put(dinfo);
Gram.GramTask gtsk = new Gram.GramTask(null, dinfo).doAll(dinfo._adaptedFrame);
// round the numbers to float precision to be more reproducible
// double[] rounded = gtsk._gram._diag;
double[] rounded = new double[gtsk._gram._diag.length];
for (int i = 0; i < rounded.length; ++i)
rounded[i] = (float) gtsk._gram._diag[i];
dinfo.remove();
Vec v = new ProjectOntoEigenVector(multiple(rounded, (int) gtsk._nobs, 1)).doAll(1, (byte) 3, train).outputFrame().anyVec();
return v;
}
public static ToEigenVec toEigen = new ToEigenVec() {
@Override public Vec toEigenVec(Vec src) { return toEigen(src); }
};
}