package water.rapids.ast.prims.advmath; import water.Key; import water.MRTask; import water.fvec.*; import water.rapids.Env; import water.rapids.Val; import water.rapids.ast.AstPrimitive; import water.rapids.ast.AstRoot; import water.rapids.vals.ValFrame; import water.rapids.vals.ValNum; import water.rapids.ast.AstFunction; import water.util.ArrayUtils; /** * Calculate Pearson's Correlation Coefficient between columns of a frame * <p/> * Formula: * Pearson's Correlation Coefficient = Cov(X,Y)/sigma(X) * sigma(Y) */ public class AstCorrelation extends AstPrimitive { @Override public String[] args() { return new String[]{"ary", "x", "y", "use"}; } private enum Mode {Everything, AllObs, CompleteObs} @Override public int nargs() { return 1 + 3; /* (cor X Y use) */ } @Override public String str() { return "cor"; } @Override public Val apply(Env env, Env.StackHelp stk, AstRoot asts[]) { Frame frx = stk.track(asts[1].exec(env)).getFrame(); Frame fry = stk.track(asts[2].exec(env)).getFrame(); if (frx.numRows() != fry.numRows()) throw new IllegalArgumentException("Frames must have the same number of rows, found " + frx.numRows() + " and " + fry.numRows()); String use = stk.track(asts[3].exec(env)).getStr(); Mode mode; switch (use) { case "everything": mode = Mode.Everything; break; case "all.obs": mode = Mode.AllObs; break; case "complete.obs": mode = Mode.CompleteObs; break; default: throw new IllegalArgumentException("unknown use mode: " + use); } return fry.numRows() == 1 ? scalar(frx, fry, mode) : array(frx, fry, mode); } // Pearson Correlation for one row, which will return a scalar value. private ValNum scalar(Frame frx, Frame fry, Mode mode) { if (frx.numCols() != fry.numCols()) throw new IllegalArgumentException("Single rows must have the same number of columns, found " + frx.numCols() + " and " + fry.numCols()); Vec vecxs[] = frx.vecs(); Vec vecys[] = fry.vecs(); double xmean = 0; double ymean = 0; double xvar = 0; double yvar = 0; double xsd; double ysd; double ncols = fry.numCols(); double NACount = 0; double xval; double yval; double ss = 0; for (int r = 0; r < ncols; r++) { xval = vecxs[r].at(0); yval = vecys[r].at(0); if (Double.isNaN(xval) || Double.isNaN(yval)) NACount++; else { xmean += xval; ymean += yval; } } xmean /= (ncols - NACount); ymean /= (ncols - NACount); for (int r = 0; r < ncols; r++) { xval = vecxs[r].at(0); yval = vecys[r].at(0); if (!(Double.isNaN(xval) || Double.isNaN(yval))) { //Compute variance of x and y vars xvar += Math.pow((vecxs[r].at(0) - xmean), 2); yvar += Math.pow((vecys[r].at(0) - ymean), 2); //Compute sum of squares of x and y ss += (vecxs[r].at(0) - xmean) * (vecys[r].at(0) - ymean); } } xsd = Math.sqrt(xvar / (ncols - 1 - NACount)); //Sample Standard Deviation ysd = Math.sqrt(yvar / (ncols - 1 - NACount)); //Sample Standard Deviation double denom = xsd * ysd; //sd(x) * sd(y) if (NACount != 0) { if (mode.equals(Mode.AllObs)) throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present"); if (mode.equals(Mode.Everything)) return new ValNum(Double.NaN); } return new ValNum((ss / (ncols - NACount - 1)) / denom); //Pearson's Correlation Coefficient } // Matrix correlation. Compute correlation between all columns from each Frame // against each other. Return a matrix of correlations which is frx.numCols // wide and fry.numCols tall. private Val array(Frame frx, Frame fry, Mode mode) { Vec[] vecxs = frx.vecs(); int ncolx = vecxs.length; Vec[] vecys = fry.vecs(); int ncoly = vecys.length; if (mode.equals(Mode.Everything) || mode.equals(Mode.AllObs)) { if (mode.equals(Mode.AllObs)) { for (Vec v : vecxs) if (v.naCnt() != 0) throw new IllegalArgumentException("Mode is 'all.obs' but NAs are present"); } //Set up CoVarTask CoVarTask[] cvs = new CoVarTask[ncoly]; //Get mean of x vecs double[] xmeans = new double[ncolx]; for (int x = 0; x < ncolx; x++) { xmeans[x] = vecxs[x].mean(); } //Set up double arrays to capture sd(x), sd(y) and sd(x) * sd(y) double[] sigmay = new double[ncoly]; double[] sigmax = new double[ncolx]; double[][] denom = new double[ncoly][ncolx]; // Launch tasks; each does all Xs vs one Y for (int y = 0; y < ncoly; y++) { //Get covariance between x and y cvs[y] = new CoVarTask(vecys[y].mean(), xmeans).dfork(new Frame(vecys[y]).add(frx)); //Get sigma of y vecs sigmay[y] = vecys[y].sigma(); } //Get sigma of x vecs for (int x = 0; x < ncolx; x++) { sigmax[x] = vecxs[x].sigma(); } //Denominator for correlation calculation is sigma_y * sigma_x (All x sigmas vs one Y) for (int y = 0; y < ncoly; y++) { for (int x = 0; x < ncolx; x++) { denom[y][x] = sigmay[y] * sigmax[x]; } } // 1-col returns scalar if (ncolx == 1 && ncoly == 1) { return new ValNum((cvs[0].getResult()._covs[0] / (fry.numRows() - 1)) / denom[0][0]); } //Gather final result, which is the correlation coefficient per column Vec[] res = new Vec[ncoly]; Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly); for (int y = 0; y < ncoly; y++) { res[y] = Vec.makeVec(ArrayUtils.div(ArrayUtils.div(cvs[y].getResult()._covs, (fry.numRows() - 1)), denom[y]), keys[y]); } return new ValFrame(new Frame(fry._names, res)); } else { //if (mode.equals(Mode.CompleteObs)) //Omit NA rows between X and Y. //This will help with cov, sigma & mean calculations later as we only want to calculate cov, sigma, & mean //for rows with no NAs Frame frxy_naomit = new MRTask() { private void copyRow(int row, Chunk[] cs, NewChunk[] ncs) { for (int i = 0; i < cs.length; ++i) { if (cs[i] instanceof CStrChunk) ncs[i].addStr(cs[i], row); else if (cs[i] instanceof C16Chunk) ncs[i].addUUID(cs[i], row); else if (cs[i].hasFloat()) ncs[i].addNum(cs[i].atd(row)); else ncs[i].addNum(cs[i].at8(row), 0); } } @Override public void map(Chunk[] cs, NewChunk[] ncs) { int col; for (int row = 0; row < cs[0]._len; ++row) { for (col = 0; col < cs.length; ++col) if (cs[col].isNA(row)) break; if (col == cs.length) copyRow(row, cs, ncs); } } }.doAll(new Frame(frx).add(fry).types(), new Frame(frx).add(fry)).outputFrame(new Frame(frx).add(fry).names(), new Frame(frx).add(fry).domains()); //Collect new vecs that do not contain NA rows Vec[] vecxs_naomit = frxy_naomit.subframe(0, ncolx).vecs(); int ncolx_naomit = vecxs_naomit.length; Vec[] vecys_naomit = frxy_naomit.subframe(ncolx, frxy_naomit.vecs().length).vecs(); int ncoly_naomit = vecys_naomit.length; //Set up CoVarTask CoVarTask[] cvs = new CoVarTask[ncoly_naomit]; //Get mean of X vecs double[] xmeans = new double[ncolx_naomit]; for (int x = 0; x < ncolx_naomit; x++) { xmeans[x] = vecxs_naomit[x].mean(); } //Set up double arrays to capture sd(x), sd(y) and sd(x) * sd(y) double[] sigmay = new double[ncoly_naomit]; double[] sigmax = new double[ncolx_naomit]; double[][] denom = new double[ncoly_naomit][ncolx_naomit]; // Launch tasks; each does all Xs vs one Y for (int y = 0; y < ncoly_naomit; y++) { //Get covariance between x and y cvs[y] = new CoVarTask(vecys_naomit[y].mean(), xmeans).dfork(new Frame(vecys_naomit[y]).add(frxy_naomit.subframe(0, ncolx))); //Get sigma of y vecs sigmay[y] = vecys_naomit[y].sigma(); } //Get sigma of x vecs for (int x = 0; x < ncolx_naomit; x++) { sigmax[x] = vecxs_naomit[x].sigma(); } //Denominator for correlation calculation is sigma_y * sigma_x (All x sigmas vs one Y) for (int y = 0; y < ncoly_naomit; y++) { for (int x = 0; x < ncolx_naomit; x++) { denom[y][x] = sigmay[y] * sigmax[x]; } } // 1-col returns scalar if (ncolx_naomit == 1 && ncoly_naomit == 1) { return new ValNum((cvs[0].getResult()._covs[0] / (frxy_naomit.numRows() - 1)) / denom[0][0]); } //Gather final result, which is the correlation coefficient per column Vec[] res = new Vec[ncoly_naomit]; Key<Vec>[] keys = Vec.VectorGroup.VG_LEN1.addVecs(ncoly_naomit); for (int y = 0; y < ncoly_naomit; y++) { res[y] = Vec.makeVec(ArrayUtils.div(ArrayUtils.div(cvs[y].getResult()._covs, (frxy_naomit.numRows() - 1)), denom[y]), keys[y]); } return new ValFrame(new Frame(frxy_naomit.subframe(ncolx, frxy_naomit.vecs().length)._names, res)); } } private static class CoVarTask extends MRTask<CoVarTask> { double[] _covs; final double _xmeans[], _ymean; CoVarTask(double ymean, double[] xmeans) { _ymean = ymean; _xmeans = xmeans; } @Override public void map(Chunk cs[]) { final int ncolsx = cs.length - 1; final Chunk cy = cs[0]; final int len = cy._len; _covs = new double[ncolsx]; double sum; for (int x = 0; x < ncolsx; x++) { sum = 0; final Chunk cx = cs[x + 1]; final double xmean = _xmeans[x]; for (int row = 0; row < len; row++) sum += (cx.atd(row) - xmean) * (cy.atd(row) - _ymean); _covs[x] = sum; } } @Override public void reduce(CoVarTask cvt) { ArrayUtils.add(_covs, cvt._covs); } } }