/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.math.hadoop.stochasticsvd;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.*;
import org.apache.mahout.math.function.DoubleFunction;
import org.apache.mahout.math.function.Functions;
import org.apache.mahout.math.function.VectorFunction;
import org.junit.Test;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.Deque;
import java.util.Iterator;
import java.util.Random;
public class LocalSSVDPCASparseTest extends MahoutTestCase {

  /** Shared tolerance for comparing stochastic vs. exact decomposition results. */
  private static final double s_epsilon = 1.0E-10d;

  /**
   * Verifies that the implicit multithreaded product Omega'*xi produced by
   * {@link Omega#mutlithreadedTRightMultiply(Vector)} agrees with the same
   * product computed against a fully materialized Omega matrix.
   */
  @Test
  public void testOmegaTRightMultiply() {
    final Random rnd = RandomUtils.getRandom();
    final long seed = rnd.nextLong();
    final int n = 2000;
    final int kp = 100;

    final Omega omega = new Omega(seed, kp);

    // Materialize the implicitly defined n x kp Omega so the control product
    // can be computed with plain dense matrix arithmetic.
    final Matrix materializedOmega = new DenseMatrix(n, kp);
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < kp; j++) {
        materializedOmega.setQuick(i, j, omega.getQuick(i, j));
      }
    }

    Vector xi = new DenseVector(n);
    xi.assign(new DoubleFunction() {
      @Override
      public double apply(double x) {
        return rnd.nextDouble() * 100;
      }
    });

    Vector s_o = omega.mutlithreadedTRightMultiply(xi);

    Matrix xiVector = new DenseMatrix(n, 1);
    xiVector.assignColumn(0, xi);
    Vector s_o_control = materializedOmega.transpose().times(xiVector).viewColumn(0);

    // Consistency fix: use the shared s_epsilon constant instead of a
    // duplicated 1e-10 literal (same tolerance, single point of definition).
    assertEquals(0, s_o.minus(s_o_control).aggregate(Functions.PLUS, Functions.ABS), s_epsilon);

    System.out.printf("s_omega=\n%s\n", s_o);
    System.out.printf("s_omega_control=\n%s\n", s_o_control);
  }

  @Test
  public void runPCATest1() throws IOException {
    runSSVDSolver(1);
  }

  // Deliberately disabled (q=0 variant, no power iterations) to keep test
  // runtime down; re-enable by restoring the @Test annotation.
  // @Test
  public void runPCATest0() throws IOException {
    runSSVDSolver(0);
  }

  /**
   * End-to-end local-mode run of the SSVD PCA solver.
   * <p>
   * Builds a random sparse m x n input as a sequence file of named rows,
   * accumulates a pseudo column-mean vector xi, runs {@link SSVDSolver} with
   * PCA mean subtraction enabled, and then compares the stochastic singular
   * values against an exact {@link SingularValueDecomposition} of the
   * mean-centered input. Also asserts Q-block orthonormality and that vector
   * names propagate through to the U*Sigma output.
   *
   * @param q number of power iterations to request from the solver
   * @throws IOException on any filesystem or job failure
   */
  public void runSSVDSolver(int q) throws IOException {
    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    // All writers/readers opened below are registered here and closed in the
    // finally block even if the solver throws.
    Deque<Closeable> closeables = Lists.newLinkedList();
    try {
      Random rnd = RandomUtils.getRandom();

      File tmpDir = getTestTempDir("svdtmp");
      conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

      Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

      // create distributed row matrix-like struct
      SequenceFile.Writer w =
        SequenceFile.createWriter(FileSystem.getLocal(conf),
                                  conf,
                                  aLocPath,
                                  Text.class,
                                  VectorWritable.class,
                                  CompressionType.BLOCK,
                                  new DefaultCodec());
      closeables.addFirst(w);

      int n = 100;
      int m = 2000;
      double percent = 5;

      VectorWritable vw = new VectorWritable();
      Text rkey = new Text();

      // xi accumulates the row sum; divided by m below to form the pseudo mean.
      Vector xi = new DenseVector(n);

      double muAmplitude = 50.0;
      for (int i = 0; i < m; i++) {
        Vector dv = new SequentialAccessSparseVector(n);
        String rowname = "row-" + i;
        NamedVector namedRow = new NamedVector(dv, rowname);
        // ~percent% fill; random column collisions are acceptable here.
        for (int j = 0; j < n * percent / 100; j++) {
          dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
        }
        // Fix: key the row with the same "row-<i>" identifier used for the
        // NamedVector (was the inconsistent literal "row-i" + i).
        rkey.set(rowname);
        vw.set(namedRow);
        w.append(rkey, vw);
        xi.assign(dv, Functions.PLUS);
      }
      closeables.remove(w);
      Closeables.close(w, false);

      xi.assign(Functions.mult(1.0 / m));

      FileSystem fs = FileSystem.get(conf);

      Path tempDirPath = getTestTempDirPath("svd-proc");
      Path aPath = new Path(tempDirPath, "A/A.seq");
      fs.copyFromLocalFile(aLocPath, aPath);
      Path xiPath = new Path(tempDirPath, "xi/xi.seq");
      SSVDHelper.saveVector(xi, xiPath, conf);

      Path svdOutPath = new Path(tempDirPath, "SSVD-out");

      // make sure we wipe out previous test results, just a convenience
      fs.delete(svdOutPath, true);

      // Solver starts here:
      System.out.println("Input prepared, starting solver...");

      int ablockRows = 867;
      int p = 60;
      int k = 40;
      SSVDSolver ssvd =
        new SSVDSolver(conf,
                       new Path[]{aPath},
                       svdOutPath,
                       ablockRows,
                       k,
                       p,
                       3);
      ssvd.setOuterBlockHeight(500);
      ssvd.setAbtBlockHeight(251);
      ssvd.setPcaMeanPath(xiPath);

      /*
       * Removing V,U jobs from this test to reduce running time. i will keep them
       * put in the dense test though.
       *
       * For PCA test, we also want to request U*Sigma output and check it for named
       * vector propagation.
       */
      ssvd.setComputeU(false);
      ssvd.setComputeV(false);
      ssvd.setcUSigma(true);

      ssvd.setOverwrite(true);
      ssvd.setQ(q);
      ssvd.setBroadcast(true);
      ssvd.run();

      Vector stochasticSValues = ssvd.getSingularValues();

      // try to run the same thing without stochastic algo
      Matrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

      verifyInternals(svdOutPath, a, new Omega(ssvd.getOmegaSeed(), k + p), k + p, q);

      // subtract pseudo pca mean
      for (int i = 0; i < m; i++) {
        a.viewRow(i).assign(xi, Functions.MINUS);
      }

      SingularValueDecomposition svd2 =
        new SingularValueDecomposition(a);

      Vector svalues2 = new DenseVector(svd2.getSingularValues());

      System.out.println("--SSVD solver singular values:");
      LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);
      System.out.println("--SVD solver singular values:");
      LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

      // The first k+p stochastic singular values must match the exact ones
      // within the shared tolerance.
      for (int i = 0; i < k + p; i++) {
        assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
      }

      DenseMatrix mQ =
        SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/"
                                               + BtJob.OUTPUT_Q + "-*"), conf);

      SSVDCommonTest.assertOrthonormality(mQ,
                                          false,
                                          s_epsilon);

      // assert name propagation
      for (Iterator<Pair<Writable, Vector>> iter = SSVDHelper.drmIterator(fs,
                                                                          new Path(ssvd.getuSigmaPath() + "/*"),
                                                                          conf,
                                                                          closeables); iter.hasNext(); ) {
        Pair<Writable, Vector> pair = iter.next();
        Writable key = pair.getFirst();
        Vector v = pair.getSecond();

        assertTrue(v instanceof NamedVector);
        assertTrue(key instanceof Text);
      }

    } finally {
      IOUtils.close(closeables);
    }
  }

  /**
   * Recomputes the first-iteration internals of the PCA-adjusted SSVD flow in
   * memory (Y = A*Omega - 1*(Omega'*xi)', its QR, and B = Q'*A) and prints the
   * intermediate summary vectors for manual inspection. Purely diagnostic —
   * makes no assertions.
   *
   * @param tempDir solver output directory (currently unused here)
   * @param a       the dense input matrix (m x n), not yet mean-centered
   * @param omega   the solver's Omega generator (same seed as the run)
   * @param kp      k + p, the Omega column count
   * @param q       requested power iterations (unused in this verification)
   */
  private void verifyInternals(Path tempDir, Matrix a, Omega omega, int kp, int q) {
    int m = a.numRows();
    int n = a.numCols();

    // Pseudo column mean: per-column average over the m rows.
    Vector xi = a.aggregateColumns(new VectorFunction() {
      @Override
      public double apply(Vector v) {
        return v.zSum() / v.size();
      }
    });

    // materialize omega
    Matrix momega = new DenseMatrix(n, kp);
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < kp; j++) {
        momega.setQuick(i, j, omega.getQuick(i, j));
      }
    }

    Vector s_o = omega.mutlithreadedTRightMultiply(xi);

    System.out.printf("s_omega=\n%s\n", s_o);

    Matrix y = a.times(momega);

    // Y is m x kp; mean-adjust EVERY row by subtracting s_o = Omega'*xi.
    // Bug fix: the loop previously ran to n (the column count), which only
    // adjusted the first n rows when m != n.
    for (int i = 0; i < m; i++) {
      y.viewRow(i).assign(s_o, Functions.MINUS);
    }

    QRDecomposition qr = new QRDecomposition(y);
    Matrix qm = qr.getQ();

    Vector s_q = qm.aggregateColumns(new VectorFunction() {
      @Override
      public double apply(Vector v) {
        return v.zSum();
      }
    });

    System.out.printf("s_q=\n%s\n", s_q);

    Matrix b = qm.transpose().times(a);

    Vector s_b = b.times(xi);

    System.out.printf("s_b=\n%s\n", s_b);
  }
}