package hex.pca; import hex.DataInfo; import org.junit.BeforeClass; import org.junit.Test; import water.DKV; import water.Key; import water.Scope; import water.TestUtil; import water.fvec.Frame; import water.util.Log; import java.util.Random; import java.util.concurrent.ExecutionException; import static org.junit.Assert.assertTrue; import static water.TestUtil.parse_test_file; /** * Created by wendycwong on 2/27/17. */ public class PCAWideDataSetsTests extends TestUtil { public static final double _TOLERANCE = 1e-6; public static final String _smallDataset = "smalldata/pca_test/decathlon.csv"; public static final String _prostateDataset = "smalldata/prostate/prostate_cat.csv"; public static final DataInfo.TransformType[] _transformTypes = {DataInfo.TransformType.NONE, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.DEMEAN, DataInfo.TransformType.DESCALE}; public Random _rand = new Random(); public PCAModel _golden = null; // @BeforeClass public static void setup() { stall_till_cloudsize(1); } /* This unit test will test that pca method GramSVD works with wide datasets. It will first build a model using GramSVD under normal setting (_wideDataset is set to false). Next, it builds a GramSVD model with _wideDataSet set to true. The eigenvalues and eigenvectors from the two models are compared. Test will fail if any difference exceeds 1e-6. Six test cases are used: case 1. we test with a small dataset with all numerical data columns and make sure it works. case 2. we add NA rows to the small dataset with all numerical data columns. case 3. test with the same small dataset while preserving the categorical columns; case 4. test with the same small dataset with categorical columns and add NA rows; case 5. test with prostate dataset; case 6. test with prostate dataset with NA rows added. */ @Test public void testWideDataSetGramSVD() throws InterruptedException, ExecutionException { ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.GramSVD, _TOLERANCE, _smallDataset, false, true, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 1 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.GramSVD, _TOLERANCE, _smallDataset, true, true, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 2 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.GramSVD, _TOLERANCE, _smallDataset, false, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 3 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.GramSVD, _TOLERANCE, _smallDataset, true, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 4 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.GramSVD, _TOLERANCE, _prostateDataset, false, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 5 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.GramSVD, _TOLERANCE, _prostateDataset, true, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 6 } /* This unit test will test that pca method Power works with wide datasets. It will first build a model using GramSVD under normal setting (_wideDataset is set to false). Next, it builds a Power model with _wideDataSet set to true. The eigenvalues and eigenvectors from the two models are compared. Test will fail if any difference exceeds 1e-6. The same six test cases are used here. */ @Test public void testWideDataSetPower() throws InterruptedException, ExecutionException { ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Power, _TOLERANCE, _smallDataset, false, true, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 1 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Power, _TOLERANCE, _smallDataset, true, true, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 2 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Power, _TOLERANCE, _smallDataset, false, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 3 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Power, _TOLERANCE, _smallDataset, true, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 4 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Power, _TOLERANCE, _prostateDataset, false, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 5 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Power, _TOLERANCE, _prostateDataset, true, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 6 } /* This unit test will test that pca method Randomized works with wide datasets. It will first build a model using GramSVD under normal setting (_wideDataset is set to false). Next, it builds a Randomized model with _wideDataSet set to true. The eigenvalues and eigenvectors from the two models are compared. Test will fail if any difference exceeds 1e-6. The same six test cases are used here. */ @Test public void testWideDataSetRandomized() throws InterruptedException, ExecutionException { ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Randomized, _TOLERANCE, _smallDataset, false, true, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 1 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Randomized, _TOLERANCE, _smallDataset, true, true, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 2 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Randomized, _TOLERANCE, _smallDataset, false, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 3 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Randomized, _TOLERANCE, _smallDataset, true, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 4 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Randomized, _TOLERANCE, _prostateDataset, false, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 5 ActualPCATests.testWideDataSets(PCAModel.PCAParameters.Method.Randomized, _TOLERANCE, _prostateDataset, true, false, _transformTypes[_rand.nextInt(_transformTypes.length)]); // case 6 } } /* This class performs the actual PCA tests. */ class ActualPCATests { public static void testWideDataSets(PCAModel.PCAParameters.Method pcaMethod, double tolerance, String datafile, boolean addNAs, boolean removeColumns, DataInfo.TransformType transformType) throws InterruptedException, ExecutionException { Scope.enter(); PCAModel modelN = null; // store PCA models generated with original implementation PCAModel modelW = null; // store PCA models generated with wideDataSet set to true Frame train = null, scoreN = null, scoreW = null; try { train = parse_test_file(Key.make(datafile), datafile); Scope.track(train); if (removeColumns) { train.remove(12).remove(); // remove categorical columns train.remove(11).remove(); train.remove(10).remove(); } if (addNAs) { train.vec(0).setNA(0); // set NAs train.vec(3).setNA(10); train.vec(5).setNA(20); } DKV.put(train); PCAModel.PCAParameters parms = new PCAModel.PCAParameters(); parms._train = train._key; parms._k = 3; parms._transform = transformType; Log.info("Data transformation applied is "+parms._transform.name()); parms._use_all_factor_levels = true; parms._pca_method = PCAModel.PCAParameters.Method.GramSVD; parms._impute_missing = false; parms._seed = 12345; PCA pcaParms = new PCA(parms); modelN = pcaParms.trainModel().get(); // get normal data scoreN = modelN.score(train); Scope.track(scoreN); Scope.track_generic(modelN); parms._pca_method = pcaMethod; PCA pcaParmsW = new PCA(parms); pcaParmsW.setWideDataset(true); // force to treat dataset as wide even though it is not. modelW = pcaParmsW.trainModel().get(); scoreW = modelW.score(train); Scope.track(scoreW); Scope.track_generic(modelW); // compare eigenvectors and eigenvalues generated by original PCA and wide dataset PCA. TestUtil.checkStddev(modelW._output._std_deviation, modelN._output._std_deviation, tolerance); boolean[] flippedEig = TestUtil.checkEigvec(modelW._output._eigenvectors, modelN._output._eigenvectors, tolerance); TestUtil.checkProjection(scoreW, scoreN, tolerance, flippedEig); // Build a POJO, check results with original PCA assertTrue(modelN.testJavaScoring(train, scoreN, tolerance)); // Build a POJO, check results with wide dataset PCA assertTrue(modelW.testJavaScoring(train, scoreW, tolerance)); } finally { Scope.exit(); } } }