package water.rapids.ast.prims.advmath;
import water.Key;
import water.MRTask;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.rapids.Env;
import water.rapids.Val;
import water.rapids.vals.ValFrame;
import water.rapids.vals.ValNum;
import water.rapids.ast.AstPrimitive;
import water.rapids.ast.AstRoot;
import water.util.ArrayUtils;
import java.util.Arrays;
/**
* Variance between columns of a frame
*/
public class AstVariance extends AstPrimitive {
@Override
public String[] args() {
return new String[]{"ary", "x", "y", "use", "symmetric"};
}
private enum Mode {Everything, AllObs, CompleteObs}
@Override
public int nargs() {
return 1 + 4; /* (var X Y use symmetric) */
}
@Override
public String str() {
return "var";
}
@Override
public Val apply(Env env, Env.StackHelp stk, AstRoot asts[]) {
Frame frx = stk.track(asts[1].exec(env)).getFrame();
Frame fry = stk.track(asts[2].exec(env)).getFrame();
if (frx.numRows() != fry.numRows())
throw new IllegalArgumentException("Frames must have the same number of rows, found " + frx.numRows() + " and " + fry.numRows());
String use = stk.track(asts[3].exec(env)).getStr();
boolean symmetric = asts[4].exec(env).getNum() == 1;
Mode mode;
switch (use) {
case "everything":
mode = Mode.Everything;
break;
case "all.obs":
mode = Mode.AllObs;
break;
case "complete.obs":
mode = Mode.CompleteObs;
break;
default:
throw new IllegalArgumentException("unknown use mode: " + use);
}
return fry.numRows() == 1 ? scalar(frx, fry, mode) : array(frx, fry, mode, symmetric);
}
// Scalar covariance for 1 row
private ValNum scalar(Frame frx, Frame fry, Mode mode) {
if (frx.numCols() != fry.numCols())
throw new IllegalArgumentException("Single rows must have the same number of columns, found " + frx.numCols() + " and " + fry.numCols());
Vec vecxs[] = frx.vecs();
Vec vecys[] = fry.vecs();
double xmean = 0, ymean = 0, ncols = frx.numCols(), NACount = 0, xval, yval, ss = 0;
for (int r = 0; r < ncols; r++) {
xval = vecxs[r].at(0);
yval = vecys[r].at(0);
if (Double.isNaN(xval) || Double.isNaN(yval))
NACount++;
else {
xmean += xval;
ymean += yval;
}
}
xmean /= (ncols - NACount);
ymean /= (ncols - NACount);
if (NACount != 0) {
if (mode.equals(Mode.AllObs)) throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
if (mode.equals(Mode.Everything)) return new ValNum(Double.NaN);
}
for (int r = 0; r < ncols; r++) {
xval = vecxs[r].at(0);
yval = vecys[r].at(0);
if (!(Double.isNaN(xval) || Double.isNaN(yval)))
ss += (vecxs[r].at(0) - xmean) * (vecys[r].at(0) - ymean);
}
return new ValNum(ss / (ncols - NACount - 1));
}
// Matrix covariance. Compute covariance between all columns from each Frame
// against each other. Return a matrix of covariances which is frx.numCols
// wide and fry.numCols tall.
private Val array(Frame frx, Frame fry, Mode mode, boolean symmetric) {
Vec[] vecxs = frx.vecs();
int ncolx = vecxs.length;
Vec[] vecys = fry.vecs();
int ncoly = vecys.length;
if (mode.equals(Mode.Everything) || mode.equals(Mode.AllObs)) {
if (mode.equals(Mode.AllObs)) {
for (Vec v : vecxs)
if (v.naCnt() != 0)
throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
if (!symmetric)
for (Vec v : vecys)
if (v.naCnt() != 0)
throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present");
}
CoVarTaskEverything[] cvs = new CoVarTaskEverything[ncoly];
double[] xmeans = new double[ncolx];
for (int x = 0; x < ncoly; x++)
xmeans[x] = vecxs[x].mean();
if (symmetric) {
//1-col returns scalar
if (ncoly == 1)
return new ValNum(vecys[0].naCnt() == 0 ? vecys[0].sigma() * vecys[0].sigma() : Double.NaN);
int[] idx = new int[ncoly];
for (int y = 1; y < ncoly; y++) idx[y] = y;
int[] first_index = new int[]{0};
//compute covariances between column_i and column_i+1, column_i+2, ...
Frame reduced_fr;
for (int y = 0; y < ncoly - 1; y++) {
idx = ArrayUtils.removeIds(idx, first_index);
reduced_fr = new Frame(frx.vecs(idx));
cvs[y] = new CoVarTaskEverything(vecys[y].mean(), xmeans).dfork(new Frame(vecys[y]).add(reduced_fr));
}
double[][] res_array = new double[ncoly][ncoly];
//fill in the diagonals (variances) using sigma from rollupstats
for (int y = 0; y < ncoly; y++)
res_array[y][y] = vecys[y].naCnt() == 0 ? vecys[y].sigma() * vecys[y].sigma() : Double.NaN;
//arrange the results into the bottom left of res_array. each successive cvs is 1 smaller in length
for (int y = 0; y < ncoly - 1; y++)
System.arraycopy(ArrayUtils.div(cvs[y].getResult()._covs, (fry.numRows() - 1)), 0, res_array[y], y + 1, ncoly - y - 1);
//copy over the bottom left of res_array to its top right
for (int y = 0; y < ncoly - 1; y++) {
for (int x = y + 1; x < ncoly; x++) {
res_array[x][y] = res_array[y][x];
}
}
//set Frame
Vec[] res = new Vec[ncoly];
Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
for (int y = 0; y < ncoly; y++) {
res[y] = Vec.makeVec(res_array[y], keys[y]);
}
return new ValFrame(new Frame(fry._names, res));
}
// Launch tasks; each does all Xs vs one Y
for (int y = 0; y < ncoly; y++)
cvs[y] = new CoVarTaskEverything(vecys[y].mean(), xmeans).dfork(new Frame(vecys[y]).add(frx));
// 1-col returns scalar
if (ncolx == 1 && ncoly == 1) {
return new ValNum(cvs[0].getResult()._covs[0] / (fry.numRows() - 1));
}
// Gather all the Xs-vs-Y covariance arrays; divide by rows
Vec[] res = new Vec[ncoly];
Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
for (int y = 0; y < ncoly; y++)
res[y] = Vec.makeVec(ArrayUtils.div(cvs[y].getResult()._covs, (fry.numRows() - 1)), keys[y]);
return new ValFrame(new Frame(fry._names, res));
} else { //if (mode.equals(Mode.CompleteObs)) {
//two-pass algorithm for computation of variance for numerical stability
if (symmetric) {
if (ncoly == 1)
return new ValNum(vecys[0].sigma() * vecys[0].sigma());
CoVarTaskCompleteObsMeanSym taskCompleteObsMeanSym = new CoVarTaskCompleteObsMeanSym().doAll(fry);
long NACount = taskCompleteObsMeanSym._NACount;
double[] ymeans = ArrayUtils.div(taskCompleteObsMeanSym._ysum, fry.numRows() - NACount);
// 1 task with all Ys
CoVarTaskCompleteObsSym cvs = new CoVarTaskCompleteObsSym(ymeans).doAll(new Frame(fry));
double[][] res_array = new double[ncoly][ncoly];
for (int y = 0; y < ncoly; y++) {
System.arraycopy(ArrayUtils.div(cvs._covs[y], (fry.numRows() - 1 - NACount)), y, res_array[y], y, ncoly - y);
}
//copy over the bottom left of res_array to its top right
for (int y = 0; y < ncoly - 1; y++) {
for (int x = y + 1; x < ncoly; x++) {
res_array[x][y] = res_array[y][x];
}
}
//set Frame
Vec[] res = new Vec[ncoly];
Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
for (int y = 0; y < ncoly; y++) {
res[y] = Vec.makeVec(res_array[y], keys[y]);
}
return new ValFrame(new Frame(fry._names, res));
}
CoVarTaskCompleteObsMean taskCompleteObsMean = new CoVarTaskCompleteObsMean(ncoly, ncolx).doAll(new Frame(fry).add(frx));
long NACount = taskCompleteObsMean._NACount;
double[] ymeans = ArrayUtils.div(taskCompleteObsMean._ysum, fry.numRows() - NACount);
double[] xmeans = ArrayUtils.div(taskCompleteObsMean._xsum, fry.numRows() - NACount);
// 1 task with all Xs and Ys
CoVarTaskCompleteObs cvs = new CoVarTaskCompleteObs(ymeans, xmeans).doAll(new Frame(fry).add(frx));
// 1-col returns scalar
if (ncolx == 1 && ncoly == 1) {
return new ValNum(cvs._covs[0][0] / (fry.numRows() - 1 - NACount));
}
// Gather all the Xs-vs-Y covariance arrays; divide by rows
Vec[] res = new Vec[ncoly];
Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly);
for (int y = 0; y < ncoly; y++)
res[y] = Vec.makeVec(ArrayUtils.div(cvs._covs[y], (fry.numRows() - 1 - NACount)), keys[y]);
return new ValFrame(new Frame(fry._names, res));
}
}
private static class CoVarTaskEverything extends MRTask<CoVarTaskEverything> {
double[] _covs;
final double _xmeans[], _ymean;
CoVarTaskEverything(double ymean, double[] xmeans) {
_ymean = ymean;
_xmeans = xmeans;
}
@Override
public void map(Chunk cs[]) {
final int ncolsx = cs.length - 1;
final Chunk cy = cs[0];
final int len = cy._len;
_covs = new double[ncolsx];
double sum;
for (int x = 0; x < ncolsx; x++) {
sum = 0;
final Chunk cx = cs[x + 1];
final double xmean = _xmeans[x];
for (int row = 0; row < len; row++)
sum += (cx.atd(row) - xmean) * (cy.atd(row) - _ymean);
_covs[x] = sum;
}
}
@Override
public void reduce(CoVarTaskEverything cvt) {
ArrayUtils.add(_covs, cvt._covs);
}
}
private static class CoVarTaskCompleteObsMean extends MRTask<CoVarTaskCompleteObsMean> {
double[] _xsum, _ysum;
long _NACount;
int _ncolx, _ncoly;
CoVarTaskCompleteObsMean(int ncoly, int ncolx) {
_ncolx = ncolx;
_ncoly = ncoly;
}
@Override
public void map(Chunk cs[]) {
_xsum = new double[_ncolx];
_ysum = new double[_ncoly];
double[] xvals = new double[_ncolx];
double[] yvals = new double[_ncoly];
double xval, yval;
boolean add;
int len = cs[0]._len;
for (int row = 0; row < len; row++) {
add = true;
//reset existing arrays to 0 rather than initializing new ones to save on garbage collection
Arrays.fill(xvals, 0);
Arrays.fill(yvals, 0);
for (int y = 0; y < _ncoly; y++) {
final Chunk cy = cs[y];
yval = cy.atd(row);
//if any yval along a row is NA, discard the entire row
if (Double.isNaN(yval)) {
_NACount++;
add = false;
break;
}
yvals[y] = yval;
}
if (add) {
for (int x = 0; x < _ncolx; x++) {
final Chunk cx = cs[x + _ncoly];
xval = cx.atd(row);
//if any xval along a row is NA, discard the entire row
if (Double.isNaN(xval)) {
_NACount++;
add = false;
break;
}
xvals[x] = xval;
}
}
//add is true iff row has been traversed and found no NAs among yvals and xvals
if (add) {
ArrayUtils.add(_xsum, xvals);
ArrayUtils.add(_ysum, yvals);
}
}
}
@Override
public void reduce(CoVarTaskCompleteObsMean cvt) {
ArrayUtils.add(_xsum, cvt._xsum);
ArrayUtils.add(_ysum, cvt._ysum);
_NACount += cvt._NACount;
}
}
private static class CoVarTaskCompleteObs extends MRTask<CoVarTaskCompleteObs> {
double[][] _covs;
final double _xmeans[], _ymeans[];
CoVarTaskCompleteObs(double[] ymeans, double[] xmeans) {
_ymeans = ymeans;
_xmeans = xmeans;
}
@Override
public void map(Chunk cs[]) {
int ncolx = _xmeans.length;
int ncoly = _ymeans.length;
double[] xvals = new double[ncolx];
double[] yvals = new double[ncoly];
_covs = new double[ncoly][ncolx];
double[] _covs_y;
double xval, yval, ymean;
boolean add;
int len = cs[0]._len;
for (int row = 0; row < len; row++) {
add = true;
//reset existing arrays to 0 rather than initializing new ones to save on garbage collection
Arrays.fill(xvals, 0);
Arrays.fill(yvals, 0);
for (int y = 0; y < ncoly; y++) {
final Chunk cy = cs[y];
yval = cy.atd(row);
//if any yval along a row is NA, discard the entire row
if (Double.isNaN(yval)) {
add = false;
break;
}
yvals[y] = yval;
}
if (add) {
for (int x = 0; x < ncolx; x++) {
final Chunk cx = cs[x + ncoly];
xval = cx.atd(row);
//if any xval along a row is NA, discard the entire row
if (Double.isNaN(xval)) {
add = false;
break;
}
xvals[x] = xval;
}
}
//add is true iff row has been traversed and found no NAs among yvals and xvals
if (add) {
for (int y = 0; y < ncoly; y++) {
_covs_y = _covs[y];
yval = yvals[y];
ymean = _ymeans[y];
for (int x = 0; x < ncolx; x++)
_covs_y[x] += (xvals[x] - _xmeans[x]) * (yval - ymean);
}
}
}
}
@Override
public void reduce(CoVarTaskCompleteObs cvt) {
ArrayUtils.add(_covs, cvt._covs);
}
}
private static class CoVarTaskCompleteObsMeanSym extends MRTask<CoVarTaskCompleteObsMeanSym> {
double[] _ysum;
long _NACount;
@Override
public void map(Chunk cs[]) {
int ncoly = cs.length;
_ysum = new double[ncoly];
double[] yvals = new double[ncoly];
double yval;
boolean add;
int len = cs[0]._len;
for (int row = 0; row < len; row++) {
add = true;
Arrays.fill(yvals, 0);
for (int y = 0; y < ncoly; y++) {
final Chunk cy = cs[y];
yval = cy.atd(row);
//if any yval along a row is NA, discard the entire row
if (Double.isNaN(yval)) {
_NACount++;
add = false;
break;
}
yvals[y] = yval;
}
if (add) {
ArrayUtils.add(_ysum, yvals);
}
}
}
@Override
public void reduce(CoVarTaskCompleteObsMeanSym cvt) {
ArrayUtils.add(_ysum, cvt._ysum);
_NACount += cvt._NACount;
}
}
private static class CoVarTaskCompleteObsSym extends MRTask<CoVarTaskCompleteObsSym> {
double[][] _covs;
final double _ymeans[];
CoVarTaskCompleteObsSym(double[] ymeans) {
_ymeans = ymeans;
}
@Override
public void map(Chunk cs[]) {
int ncoly = _ymeans.length;
double[] yvals = new double[ncoly];
_covs = new double[ncoly][ncoly];
double[] _covs_y;
double yval, ymean;
boolean add;
int len = cs[0]._len;
for (int row = 0; row < len; row++) {
add = true;
//reset existing arrays to 0 rather than initializing new ones to save on garbage collection
Arrays.fill(yvals, 0);
for (int y = 0; y < ncoly; y++) {
final Chunk cy = cs[y];
yval = cy.atd(row);
//if any yval along a row is NA, discard the entire row
if (Double.isNaN(yval)) {
add = false;
break;
}
yvals[y] = yval;
}
//add is true iff row has been traversed and found no NAs among yvals
if (add) {
for (int y = 0; y < ncoly; y++) {
_covs_y = _covs[y];
yval = yvals[y];
ymean = _ymeans[y];
for (int x = y; x < ncoly; x++)
_covs_y[x] += (yvals[x] - _ymeans[x]) * (yval - ymean);
}
}
}
}
@Override
public void reduce(CoVarTaskCompleteObsSym cvt) {
ArrayUtils.add(_covs, cvt._covs);
}
}
public static double getVar(Vec v) {
return v.naCnt() == 0 ? v.sigma() * v.sigma() : Double.NaN;
}
}