/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.hadoop.stochasticsvd;

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.junit.Test;

/**
 * Runs the Hadoop-based SSVD solver in local mode against generated dense
 * input whose singular values are known in advance.
 * <p>
 * The first {@code k} singular values reported by the solver are compared with
 * the values used to generate the input, allowing a relative error of
 * {@code PRECISION_PCT} percent. The Q, U and V outputs are then loaded back
 * and checked for orthonormality with tolerance {@code EPSILON}.
 */
public class LocalSSVDSolverDenseTest extends MahoutTestCase {

  /** Tolerance handed to the orthonormality assertions. */
  private static final double EPSILON = 1.0E-10d;

  /**
   * Permitted relative error on the singular values, in percent.
   * <p>
   * Original author's note: observed errors never exceeded about 3%, but the
   * test is non-deterministic, so the margin is deliberately generous to make
   * spurious failures (almost) impossible.
   */
  private static final double PRECISION_PCT = 10;

  /** Runs the solver with no power iterations ({@code q = 0}). */
  @Test
  public void testSSVDSolverDense() throws IOException {
    runSSVDSolver(0);
  }

  /** Runs the solver with a single power iteration ({@code q = 1}). */
  @Test
  public void testSSVDSolverPowerIterations1() throws IOException {
    runSSVDSolver(1);
  }

  /**
   * Runs the solver with two power iterations ({@code q = 2}).
   * <p>
   * Intentionally left without the {@code @Test} annotation: it was taken out
   * of the active test set to save time.
   */
  public void testSSVDSolverPowerIterations2() throws IOException {
    runSSVDSolver(2);
  }

  /**
   * Generates the input, runs the SSVD solver on it and verifies the output.
   *
   * @param q value passed to {@code SSVDSolver.setQ}; the test method names
   *          refer to it as the number of power iterations
   * @throws IOException if input generation, the solver run or reading the
   *           results back fails
   */
  public void runSSVDSolver(int q) throws IOException {

    // Local job tracker and local file system. To try a real cluster instead,
    // point these at it, e.g.
    //   mapred.job.tracker = localhost:11011
    //   fs.default.name    = hdfs://localhost:11010/
    Configuration hadoopConf = new Configuration();
    hadoopConf.set("mapred.job.tracker", "local");
    hadoopConf.set("fs.default.name", "file:///");

    File hadoopTmp = getTestTempDir("svdtmp");
    hadoopConf.set("hadoop.tmp.dir", hadoopTmp.getAbsolutePath());

    Path localInputPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

    // Surrogate input dimensions and the singular values it is generated from.
    // Original author's note: larger inputs only made the precision better,
    // and with an oversampling of 100 no error was observed at all.
    int n = 100;
    int m = 2000;
    Vector singularValues = new DenseVector(new double[] {
        10, 4, 1,
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
        0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1
    });

    SSVDTestsHelper.generateDenseInput(localInputPath,
                                       FileSystem.getLocal(hadoopConf),
                                       singularValues,
                                       m,
                                       n);

    // Stage the generated file where the solver will read it from.
    FileSystem fileSystem = FileSystem.get(hadoopConf);
    Path workDir = getTestTempDirPath("svd-proc");
    Path inputPath = new Path(workDir, "A/A.seq");
    fileSystem.copyFromLocalFile(localInputPath, inputPath);

    Path outputPath = new Path(workDir, "SSVD-out");

    System.out.println("Input prepared, starting solver...");

    int ablockRows = 867;
    int p = 10;
    int k = 3;
    // NOTE(review): the meaning of the trailing constructor argument (3) is
    // not visible in this file -- confirm against SSVDSolver.
    SSVDSolver solver = new SSVDSolver(hadoopConf,
                                       new Path[] { inputPath },
                                       outputPath,
                                       ablockRows,
                                       k,
                                       p,
                                       3);

    // Tiny block heights, chosen only to simulate high-load behaviour on this
    // small input; real runs need much bigger values.
    solver.setOuterBlockHeight(500);
    solver.setAbtBlockHeight(400);
    solver.setOverwrite(true);
    solver.setQ(q);
    solver.setBroadcast(false);
    solver.run();

    double[] computedValues = solver.getSingularValues();
    System.out.println("--SSVD solver singular values:");
    dumpSv(computedValues);

    // A full-rank Colt SVD of this input would take too long to run here, so
    // the first k results are checked against the singular values that were
    // used to generate the input (which are right by construction).
    for (int idx = 0; idx < k; idx++) {
      assertWithinPrecision(singularValues.getQuick(idx), computedValues[idx]);
    }

    assertOrthonormalOutput(fileSystem,
                            new Path(outputPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"),
                            hadoopConf);
    assertOrthonormalOutput(fileSystem, new Path(outputPath, "U/[^_]*"), hadoopConf);
    assertOrthonormalOutput(fileSystem, new Path(outputPath, "V/[^_]*"), hadoopConf);
  }

  /**
   * Asserts that the relative error of {@code actual} with respect to
   * {@code expected} does not exceed {@code PRECISION_PCT} percent.
   */
  private static void assertWithinPrecision(double expected, double actual) {
    double relativeError = Math.abs((expected - actual) / expected);
    assertTrue(relativeError <= PRECISION_PCT / 100);
  }

  /**
   * Loads the distributed row matrix matching {@code glob} and asserts its
   * orthonormality with tolerance {@code EPSILON}.
   */
  private static void assertOrthonormalOutput(FileSystem fileSystem,
                                              Path glob,
                                              Configuration hadoopConf) throws IOException {
    double[][] rows = SSVDSolver.loadDistributedRowMatrix(fileSystem, glob, hadoopConf);
    SSVDPrototypeTest.assertOrthonormality(new DenseMatrix(rows), false, EPSILON);
  }

  /**
   * Prints the given singular values to standard output on a single line,
   * prefixed with {@code "svs: "}.
   */
  static void dumpSv(double[] s) {
    System.out.print("svs: ");
    for (int idx = 0; idx < s.length; idx++) {
      System.out.printf("%f ", s[idx]);
    }
    System.out.println();
  }

  /** Prints the given matrix to standard output, one row per line. */
  static void dump(double[][] matrix) {
    for (int row = 0; row < matrix.length; row++) {
      double[] cells = matrix[row];
      for (int col = 0; col < cells.length; col++) {
        System.out.printf("%f ", cells[col]);
      }
      System.out.println();
    }
  }
}