package hex.pca;
import hex.DataInfo;
import hex.SplitFrame;
import hex.pca.PCAModel.PCAParameters;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.DKV;
import water.Key;
import water.Scope;
import water.TestUtil;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.FrameUtils;
import java.util.concurrent.ExecutionException;
public class PCATest extends TestUtil {
public static final double TOLERANCE = 1e-6;
@BeforeClass public static void setup() { stall_till_cloudsize(1); }
@Test public void testArrests() throws InterruptedException, ExecutionException {
// Results with de-meaned training frame
double[] stddev = new double[] {83.732400, 14.212402, 6.489426, 2.482790};
double[][] eigvec = ard(ard(0.04170432, -0.04482166, 0.07989066, -0.99492173),
ard(0.99522128, -0.05876003, -0.06756974, 0.03893830),
ard(0.04633575, 0.97685748, -0.20054629, -0.05816914),
ard(0.07515550, 0.20071807, 0.97408059, 0.07232502));
// Results with standardized training frame
double[] stddev_std = new double[] {1.5748783, 0.9948694, 0.5971291, 0.4164494};
double[][] eigvec_std = ard(ard(-0.5358995, 0.4181809, -0.3412327, 0.64922780),
ard(-0.5831836, 0.1879856, -0.2681484, -0.74340748),
ard(-0.2781909, -0.8728062, -0.3780158, 0.13387773),
ard(-0.5434321, -0.1673186, 0.8177779, 0.08902432));
Frame train = null;
try {
train = parse_test_file(Key.make("arrests.hex"), "smalldata/pca_test/USArrests.csv"); // TODO: Move this outside loop
for (DataInfo.TransformType std : new DataInfo.TransformType[] {
DataInfo.TransformType.DEMEAN,
DataInfo.TransformType.STANDARDIZE }) {
PCAModel model = null;
try {
PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
parms._train = train._key;
parms._k = 4;
parms._transform = std;
parms._max_iterations = 1000;
parms._pca_method = PCAParameters.Method.Power;
model = new PCA(parms).trainModel().get();
if (std == DataInfo.TransformType.DEMEAN) {
TestUtil.checkStddev(stddev, model._output._std_deviation, TOLERANCE);
TestUtil.checkEigvec(eigvec, model._output._eigenvectors, TOLERANCE);
} else if (std == DataInfo.TransformType.STANDARDIZE) {
TestUtil.checkStddev(stddev_std, model._output._std_deviation, TOLERANCE);
TestUtil.checkEigvec(eigvec_std, model._output._eigenvectors, TOLERANCE);
}
} finally {
if( model != null ) model.delete();
}
}
} finally {
if(train != null) train.delete();
}
}
@Test public void testArrestsScoring() throws InterruptedException, ExecutionException {
// Results with original training frame
double[] stddev = new double[] {202.7230564, 27.8322637, 6.5230482, 2.5813652};
double[][] eigvec = ard(ard(-0.04239181, 0.01616262, -0.06588426, 0.99679535),
ard(-0.94395706, 0.32068580, 0.06655170, -0.04094568),
ard(-0.30842767, -0.93845891, 0.15496743, 0.01234261),
ard(-0.10963744, -0.12725666, -0.98347101, -0.06760284));
PCAModel model = null;
Frame train = null, score = null, scoreR = null;
try {
train = parse_test_file(Key.make("arrests.hex"), "smalldata/pca_test/USArrests.csv");
PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
parms._train = train._key;
parms._k = 4;
parms._transform = DataInfo.TransformType.NONE;
parms._pca_method = PCAParameters.Method.GramSVD;
model = new PCA(parms).trainModel().get();
TestUtil.checkStddev(stddev, model._output._std_deviation, 1e-5);
boolean[] flippedEig = TestUtil.checkEigvec(eigvec, model._output._eigenvectors, 1e-5);
score = model.score(train);
scoreR = parse_test_file(Key.make("scoreR.hex"), "smalldata/pca_test/USArrests_PCAscore.csv");
TestUtil.checkProjection(scoreR, score, TOLERANCE, flippedEig); // Flipped cols must match those from eigenvectors
// Build a POJO, validate same results
Assert.assertTrue(model.testJavaScoring(train,score,1e-5));
} finally {
if (train != null) train.delete();
if (score != null) score.delete();
if (scoreR != null) scoreR.delete();
if (model != null) model.delete();
}
}
@Test public void testIrisScoring() throws InterruptedException, ExecutionException {
// Results with original training frame
double[] stddev = new double[] {7.88175203, 1.56002774, 0.59189816, 0.25917329, 0.15415273, 0.09381276, 0.04768590};
double[][] eigvec = ard(ard(-0.03169051, -0.32305860, 0.185100382, -0.12336685, -0.14867156, 0.75932119, -0.496462912),
ard(-0.04289677, 0.04037565, -0.780961964, 0.19727933, 0.07251338, -0.12216945, -0.572298338),
ard(-0.05019689, 0.16836717, 0.551432201, -0.07122329, 0.08454116, -0.48327010, -0.647522462),
ard(-0.74915107, -0.26629420, -0.101102186, -0.48920057, 0.32458460, -0.09176909, 0.067412858),
ard(-0.37877011, -0.50636060, 0.142219195, 0.69081642, -0.26312992, -0.17811871, 0.041411296),
ard(-0.51177078, 0.65945159, -0.005079934, 0.04881900, -0.52128288, 0.17038367, 0.006223427),
ard(-0.16742875, 0.32166036, 0.145893901, 0.47102115, 0.72052968, 0.32523458, 0.020389463));
PCAModel model = null;
Frame train = null, score = null, scoreR = null;
try {
train = parse_test_file(Key.make("iris.hex"), "smalldata/iris/iris_wheader.csv");
PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
parms._train = train._key;
parms._k = 7;
parms._transform = DataInfo.TransformType.NONE;
parms._use_all_factor_levels = true;
parms._pca_method = PCAParameters.Method.Power;
model = new PCA(parms).trainModel().get();
TestUtil.checkStddev(stddev, model._output._std_deviation, 1e-5);
boolean[] flippedEig = TestUtil.checkEigvec(eigvec, model._output._eigenvectors, 1e-5);
score = model.score(train);
scoreR = parse_test_file(Key.make("scoreR.hex"), "smalldata/pca_test/iris_PCAscore.csv");
TestUtil.checkProjection(scoreR, score, TOLERANCE, flippedEig); // Flipped cols must match those from eigenvectors
// Build a POJO, validate same results
Assert.assertTrue(model.testJavaScoring(train,score,1e-5));
} finally {
if (train != null) train.delete();
if (score != null) score.delete();
if (scoreR != null) scoreR.delete();
if (model != null) model.delete();
}
}
@Test public void testIrisSplitScoring() throws InterruptedException, ExecutionException {
PCAModel model = null;
Frame fr = null, fr2= null;
Frame tr = null, te= null;
try {
fr = parse_test_file("smalldata/iris/iris_wheader.csv");
SplitFrame sf = new SplitFrame(fr,new double[] { 0.5, 0.5 },new Key[] { Key.make("train.hex"), Key.make("test.hex")});
// Invoke the job
sf.exec().get();
Key[] ksplits = sf._destination_frames;
tr = DKV.get(ksplits[0]).get();
te = DKV.get(ksplits[1]).get();
PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
parms._train = ksplits[0];
parms._valid = ksplits[1];
parms._k = 4;
parms._max_iterations = 1000;
parms._pca_method = PCAParameters.Method.GramSVD;
model = new PCA(parms).trainModel().get();
// Done building model; produce a score column with cluster choices
fr2 = model.score(te);
Assert.assertTrue(model.testJavaScoring(te, fr2, 1e-5));
} finally {
if( fr != null ) fr.delete();
if( fr2 != null ) fr2.delete();
if( tr != null ) tr .delete();
if( te != null ) te .delete();
if (model != null) model.delete();
}
}
@Test public void testImputeMissing() throws InterruptedException, ExecutionException {
Frame train = null;
double missing_fraction = 0.75;
long seed = 12345;
try {
train = parse_test_file(Key.make("arrests.hex"), "smalldata/pca_test/USArrests.csv");
// Add missing values to the training data
if (missing_fraction > 0) {
Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
DKV.put(frtmp._key, frtmp); // Need to put the frame (to be modified) into DKV for MissingInserter to pick up
FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
j.execImpl().get(); // MissingInserter is non-blocking, must block here explicitly
DKV.remove(frtmp._key); // Delete the frame header (not the data)
}
PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
parms._train = train._key;
parms._k = 4;
parms._transform = DataInfo.TransformType.NONE;
parms._pca_method = PCAModel.PCAParameters.Method.GramSVD;
parms._impute_missing = true; // Don't skip rows with NA entries, but impute using mean of column
parms._seed = seed;
PCAModel pca = null;
pca = new PCA(parms).trainModel().get();
if (pca != null) pca.remove();
} finally {
if (train != null) train.delete();
}
}
@Test public void testGram() {
double[][] x = ard(ard(1, 2, 3), ard(4, 5, 6));
double[][] xgram = ard(ard(17, 22, 27), ard(22, 29, 36), ard(27, 36, 45)); // X'X
double[][] xtgram = ard(ard(14, 32), ard(32, 77)); // (X')'X' = XX'
double[][] xgram_glrm = ArrayUtils.formGram(x, false);
double[][] xtgram_glrm = ArrayUtils.formGram(x, true);
Assert.assertArrayEquals(xgram, xgram_glrm);
Assert.assertArrayEquals(xtgram, xtgram_glrm);
}
/* Make sure POJO works if the model is only built from categorical variables (no numeric columns) */
@Test public void testCatOnlyPUBDEV3988() throws InterruptedException, ExecutionException {
PCAModel model = null;
Frame train = null, score = null;
try {
train = parse_test_file(Key.make("prostate_cat.hex"), "smalldata/prostate/prostate_cat.csv");
for (int i = train.numCols() - 1; i > 0; i--) {
Vec v = train.vec(i);
if (v.get_type() != Vec.T_CAT) {
train.remove(i);
Vec.remove(v._key);
}
}
DKV.put(train);
PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
parms._train = train._key;
parms._k = 2;
parms._transform = DataInfo.TransformType.STANDARDIZE;
parms._use_all_factor_levels = true;
parms._pca_method = PCAParameters.Method.GramSVD;
parms._impute_missing = false;
parms._seed = 12345;
PCA pcaParms = new PCA(parms);
model = pcaParms.trainModel().get(); // get normal data
score = model.score(train);
// Build a POJO, check results with original PCA
Assert.assertTrue(model.testJavaScoring(train,score,TOLERANCE));
} finally {
if (train != null) train.delete();
if (score != null) score.delete();
if (model != null) model.delete();
}
}
/*
Quick test to make sure changes made to PCA for rank deficient matrices do not cause leakage.
*/
@Test public void testPUBDEV3500NoLeakage() throws InterruptedException, ExecutionException {
Scope.enter();
Frame train = null;
try {
train = parse_test_file(Key.make("prostate_cat.hex"), "smalldata/prostate/prostate_cat.csv");
Scope.track(train);
PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
parms._train = train._key;
parms._k = 3;
parms._transform = DataInfo.TransformType.NONE;
parms._pca_method = PCAModel.PCAParameters.Method.Randomized;
parms._impute_missing = true; // Don't skip rows with NA entries, but impute using mean of column
parms._seed = 12345;
parms._use_all_factor_levels=true;
PCAModel pca = null;
pca = new PCA(parms).trainModel().get();
Scope.track_generic(pca);
Assert.assertTrue(pca._parms._k == pca._output._std_deviation.length);
} finally {
Scope.exit();
}
}
}