/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.hadoop.stochasticsvd;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Deque;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.ssvd.EigenSolverWrapper;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;

/**
 * Stochastic SVD solver (API class).
 * <P>
 *
 * Implementation details are in my working notes in MAHOUT-376
 * (https://issues.apache.org/jira/browse/MAHOUT-376).
 * <P>
 *
 * As of the time of this writing, I don't have benchmarks comparing this
 * method to others. However, the non-Hadoop differentiating characteristics of
 * this method are thought to be:
 * <LI>"faster": precision is traded off in favor of speed. There is, however,
 * a lever in the form of the oversampling parameter p. Higher values of p
 * produce better precision but trade off speed (and raise the minimum RAM
 * requirement). This also means that this method is almost guaranteed to be
 * less precise than Lanczos unless a full-rank SVD decomposition is sought.
 * <LI>"more scale": it can presumably take on larger problems than the
 * Lanczos solver (not confirmed by benchmark at this time).
 * <P>
 * <P>
 *
 * Specifically in regard to this implementation, <i>I think</i> a couple of
 * other differentiating points are:
 * <LI>there is no need to specify the input matrix height or width on the
 * command line; they are whatever the input turns out to be.
 * <LI>any Writable is supported as a DRM row key and is copied to the
 * corresponding row of the U matrix;
 * <LI>one can request U, V, U<sub>σ</sub>=U*Σ<sup>0.5</sup> or
 * V<sub>σ</sub>=V*Σ<sup>0.5</sup>, none of which requires a pass over the
 * input A; these jobs are parallel map-only jobs.
 * <P>
 * <P>
 *
 * This class is the central public API for the SSVD solver. The use pattern is
 * as follows (see the sketch after this list):
 *
 * <UL>
 * <LI>create the solver using the constructor, supplying the computation
 * parameters.
 * <LI>set optional parameters through the setter methods.
 * <LI>call {@link #run()}.
 * <LI>{@link #getUPath()} (if computed) returns the path to the directory
 * containing the m x k U matrix file(s).
 * <LI>{@link #getVPath()} (if computed) returns the path to the directory
 * containing the n x k V matrix file(s).
 * </UL>
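 * <P>
 * A minimal usage sketch follows; the paths and parameter values are
 * illustrative only, not recommendations:
 *
 * <pre>
 * {@code
 * Configuration conf = new Configuration();
 * Path[] inputs = {new Path("/path/to/drm/input")}; // DRM-formatted input
 * Path output = new Path("/path/to/ssvd/output");
 *
 * // q-blocks of 30,000 rows, rank k=100, oversampling p=15, 10 reducers
 * SSVDSolver solver = new SSVDSolver(conf, inputs, output, 30000, 100, 15, 10);
 * solver.setQ(1);            // one extra power iteration for better precision
 * solver.setOverwrite(true); // clean the output folder first if it exists
 * solver.run();
 *
 * double[] sigma = solver.getSingularValues(); // k+p values, largest first
 * String uDir = solver.getUPath(); // null if U was not computed
 * String vDir = solver.getVPath(); // null if V was not computed
 * }
 * </pre>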
 */
public class SSVDSolver {

  private double[] svalues;
  private boolean computeU = true;
  private boolean computeV = true;
  private String uPath;
  private String vPath;
  private int outerBlockHeight = 30000;
  private int abtBlockHeight = 200000;

  // configured stuff
  private final Configuration conf;
  private final Path[] inputPath;
  private final Path outputPath;
  private final int ablockRows;
  private final int k;
  private final int p;
  private int q;
  private final int reduceTasks;
  private int minSplitSize = -1;
  private boolean cUHalfSigma;
  private boolean cVHalfSigma;
  private boolean overwrite;
  private boolean broadcast = true;

  /**
   * Create a new SSVD solver. Required parameters are passed to the
   * constructor to ensure they are set. Optional parameters can be set using
   * the setters.
   * <P>
   *
   * @param conf
   *          hadoop configuration
   * @param inputPath
   *          Input path (should be compatible with DistributedRowMatrix as of
   *          the time of this writing).
   * @param outputPath
   *          Output path containing U, V and singular values vector files.
   * @param ablockRows
   *          The vertical height of a q-block (bigger values require more
   *          memory in mappers and perhaps larger {@code minSplitSize}
   *          values).
   * @param k
   *          desired rank
   * @param p
   *          SSVD oversampling parameter
   * @param reduceTasks
   *          Number of reduce tasks (where applicable)
   */
  public SSVDSolver(Configuration conf,
                    Path[] inputPath,
                    Path outputPath,
                    int ablockRows,
                    int k,
                    int p,
                    int reduceTasks) {
    this.conf = conf;
    this.inputPath = inputPath;
    this.outputPath = outputPath;
    this.ablockRows = ablockRows;
    this.k = k;
    this.p = p;
    this.reduceTasks = reduceTasks;
  }

  public void setcUHalfSigma(boolean cUHat) {
    this.cUHalfSigma = cUHat;
  }

  public void setcVHalfSigma(boolean cVHat) {
    this.cVHalfSigma = cVHat;
  }

  public int getQ() {
    return q;
  }

  /**
   * Sets q, the number of additional power iterations used to increase
   * precision (typically 0..2). Defaults to 0.
   *
   * @param q
   *          number of power iterations
   */
  public void setQ(int q) {
    this.q = q;
  }

  /**
   * Setting controlling whether to compute the U matrix of the low-rank SSVD.
   */
  public void setComputeU(boolean val) {
    computeU = val;
  }

  /**
   * Setting controlling whether to compute the V matrix of the low-rank SSVD.
   *
   * @param val
   *          true if we want to output the V matrix. Default is true.
   */
  public void setComputeV(boolean val) {
    computeV = val;
  }

  /**
   * Sometimes, if the requested A blocks become larger than a split, we may
   * need to use this to ensure that at least k+p rows of A get into a split.
   * This is a requirement necessary to obtain orthonormalized Q blocks of the
   * SSVD.
   *
   * @param size
   *          the minimum split size to use
   */
  public void setMinSplitSize(int size) {
    minSplitSize = size;
  }

  /**
   * This contains the k+p singular values resulting from the solver run.
   *
   * @return singular values (largest to smallest)
   */
  public double[] getSingularValues() {
    return svalues;
  }

  /**
   * Returns the U path (if computation was requested and successful).
   *
   * @return U output hdfs path, or null if computation was not completed for
   *         whatever reason.
   */
  public String getUPath() {
    return uPath;
  }

  /**
   * Returns the V path (if computation was requested and successful).
   *
   * @return V output hdfs path, or null if computation was not completed for
   *         whatever reason.
   */
  public String getVPath() {
    return vPath;
  }

  public boolean isOverwrite() {
    return overwrite;
  }

  /**
   * If true, the driver cleans the output folder first if it exists.
   *
   * @param overwrite
   *          whether to overwrite an existing output folder
   */
  public void setOverwrite(boolean overwrite) {
    this.overwrite = overwrite;
  }

  public int getOuterBlockHeight() {
    return outerBlockHeight;
  }

  /**
   * The height of outer blocks during the Q'A multiplication. Higher values
   * produce fewer keys for the combine and shuffle-and-sort phases, thereby
   * somewhat improving running time, but require larger blocks to be formed
   * in RAM (so setting this too high can lead to OOM).
   *
   * @param outerBlockHeight
   *          outer block height in rows
   */
  public void setOuterBlockHeight(int outerBlockHeight) {
    this.outerBlockHeight = outerBlockHeight;
  }

  public int getAbtBlockHeight() {
    return abtBlockHeight;
  }

  /**
   * The block height of Y_i during power iterations. It is probably important
   * to set this higher than the default of 200,000 for extremely sparse
   * inputs and when more RAM is available. A Y_i block in the ABt job
   * occupies approximately abtBlockHeight x (k+p) x sizeof(double) bytes
   * (as dense).
   *
   * @param abtBlockHeight
   *          Y_i block height in rows
   */
  public void setAbtBlockHeight(int abtBlockHeight) {
    this.abtBlockHeight = abtBlockHeight;
  }

  public boolean isBroadcast() {
    return broadcast;
  }

  /**
   * If this property is true, the DistributedCache mechanism is used to
   * broadcast some of the intermediate data around. This may improve
   * efficiency. Default is true.
   *
   * @param broadcast
   *          whether to use DistributedCache broadcasting
   */
  public void setBroadcast(boolean broadcast) {
    this.broadcast = broadcast;
  }

  /**
   * Run all SSVD jobs: the Q-job, the B'-job, the optional power iterations,
   * the in-core eigendecomposition of BB', and finally the map-only U and/or
   * V jobs if requested.
   *
   * @throws IOException
   *           if an I/O condition occurs.
   */
  public void run() throws IOException {

    Deque<Closeable> closeables = Lists.newLinkedList();
    try {
      Class<? extends Writable> labelType =
        sniffInputLabelType(inputPath, conf);
      FileSystem fs = FileSystem.get(conf);

      Path qPath = new Path(outputPath, "Q-job");
      Path btPath = new Path(outputPath, "Bt-job");
      Path uHatPath = new Path(outputPath, "UHat");
      Path svPath = new Path(outputPath, "Sigma");
      Path uPath = new Path(outputPath, "U");
      Path vPath = new Path(outputPath, "V");

      if (overwrite) {
        fs.delete(outputPath, true);
      }

      Random rnd = RandomUtils.getRandom();
      long seed = rnd.nextLong();

      QJob.run(conf,
               inputPath,
               qPath,
               ablockRows,
               minSplitSize,
               k,
               p,
               seed,
               reduceTasks);

      /*
       * Restrict the number of reducers to a reasonable number so we don't
       * have to run too many additions in the frontend when reconstructing
       * BBt for the last B' and BB' computations. The user may not realize
       * that and give a bit too many (I would be happy if that were ever the
       * case, though).
       */
      BtJob.run(conf,
                inputPath,
                qPath,
                btPath,
                minSplitSize,
                k,
                p,
                outerBlockHeight,
                q <= 0 ? Math.min(1000, reduceTasks) : reduceTasks,
                broadcast,
                labelType,
                q <= 0);

      // power iterations
      for (int i = 0; i < q; i++) {
        qPath = new Path(outputPath, String.format("ABt-job-%d", i + 1));
        Path btPathGlob = new Path(btPath, BtJob.OUTPUT_BT + "-*");
        ABtDenseOutJob.run(conf,
                           inputPath,
                           btPathGlob,
                           qPath,
                           ablockRows,
                           minSplitSize,
                           k,
                           p,
                           abtBlockHeight,
                           reduceTasks,
                           broadcast);

        btPath = new Path(outputPath, String.format("Bt-job-%d", i + 1));

        BtJob.run(conf,
                  inputPath,
                  qPath,
                  btPath,
                  minSplitSize,
                  k,
                  p,
                  outerBlockHeight,
                  i == q - 1 ?
                      Math.min(1000, reduceTasks) : reduceTasks,
                  broadcast,
                  labelType,
                  i == q - 1);
      }

      UpperTriangular bbt =
        loadAndSumUpperTriangularMatrices(fs,
                                          new Path(btPath, BtJob.OUTPUT_BBT
                                              + "-*"),
                                          conf);

      // convert bbt to something our eigensolver could understand
      assert bbt.columnSize() == k + p;

      double[][] bbtSquare = new double[k + p][];
      for (int i = 0; i < k + p; i++) {
        bbtSquare[i] = new double[k + p];
      }

      for (int i = 0; i < k + p; i++) {
        for (int j = i; j < k + p; j++) {
          bbtSquare[i][j] = bbtSquare[j][i] = bbt.getQuick(i, j);
        }
      }

      svalues = new double[k + p];

      EigenSolverWrapper eigenWrapper = new EigenSolverWrapper(bbtSquare);
      double[] eigenva2 = eigenWrapper.getEigenValues();
      for (int i = 0; i < k + p; i++) {
        // singular values of B are the square roots of the eigenvalues of BB'
        svalues[i] = Math.sqrt(eigenva2[i]);
      }

      // save/redistribute UHat
      double[][] uHat = eigenWrapper.getUHat();

      fs.mkdirs(uHatPath);
      SequenceFile.Writer uHatWriter =
        SequenceFile.createWriter(fs,
                                  conf,
                                  uHatPath = new Path(uHatPath, "uhat.seq"),
                                  IntWritable.class,
                                  VectorWritable.class,
                                  CompressionType.BLOCK);
      closeables.addFirst(uHatWriter);

      int m = uHat.length;
      IntWritable iw = new IntWritable();
      VectorWritable vw = new VectorWritable();
      for (int i = 0; i < m; i++) {
        vw.set(new DenseVector(uHat[i], true));
        iw.set(i);
        uHatWriter.append(iw, vw);
      }
      closeables.remove(uHatWriter);
      Closeables.closeQuietly(uHatWriter);

      SequenceFile.Writer svWriter =
        SequenceFile.createWriter(fs,
                                  conf,
                                  svPath = new Path(svPath, "svalues.seq"),
                                  IntWritable.class,
                                  VectorWritable.class,
                                  CompressionType.BLOCK);
      closeables.addFirst(svWriter);

      vw.set(new DenseVector(svalues, true));
      svWriter.append(iw, vw);

      closeables.remove(svWriter);
      Closeables.closeQuietly(svWriter);

      UJob ujob = null;
      if (computeU) {
        ujob = new UJob();
        ujob.start(conf,
                   new Path(btPath, BtJob.OUTPUT_Q + "-*"),
                   uHatPath,
                   svPath,
                   uPath,
                   k,
                   reduceTasks,
                   labelType,
                   cUHalfSigma);
        // actually this is a map-only job anyway
      }

      VJob vjob = null;
      if (computeV) {
        vjob = new VJob();
        vjob.start(conf,
                   new Path(btPath, BtJob.OUTPUT_BT + "-*"),
                   uHatPath,
                   svPath,
                   vPath,
                   k,
                   reduceTasks,
                   cVHalfSigma);
      }

      if (ujob != null) {
        ujob.waitForCompletion();
        this.uPath = uPath.toString();
      }
      if (vjob != null) {
        vjob.waitForCompletion();
        this.vPath = vPath.toString();
      }

    } catch (InterruptedException exc) {
      throw new IOException("Interrupted", exc);
    } catch (ClassNotFoundException exc) {
      throw new IOException(exc);
    } finally {
      IOUtils.close(closeables);
    }
  }
  private static Class<? extends Writable> sniffInputLabelType(Path[] inputPath,
                                                               Configuration conf)
    throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (Path p : inputPath) {
      FileStatus[] fstats = fs.globStatus(p);
      if (fstats == null || fstats.length == 0) {
        continue;
      }

      FileStatus firstSeqFile;
      if (fstats[0].isDir()) {
        firstSeqFile =
          fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
      } else {
        firstSeqFile = fstats[0];
      }

      SequenceFile.Reader r = null;
      try {
        r = new SequenceFile.Reader(fs, firstSeqFile.getPath(), conf);
        return r.getKeyClass().asSubclass(Writable.class);
      } finally {
        Closeables.closeQuietly(r);
      }
    }
    throw new IOException("Unable to open input files to determine input label type.");
  }

  private static final Pattern OUTPUT_FILE_PATTERN =
    Pattern.compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");

  static final Comparator<FileStatus> PARTITION_COMPARATOR =
    new Comparator<FileStatus>() {
      private final Matcher matcher = OUTPUT_FILE_PATTERN.matcher("");

      @Override
      public int compare(FileStatus o1, FileStatus o2) {
        matcher.reset(o1.getPath().getName());
        if (!matcher.matches()) {
          throw new IllegalArgumentException("Unexpected file name, unable to deduce partition #:"
              + o1.getPath());
        }
        int p1 = Integer.parseInt(matcher.group(3));
        matcher.reset(o2.getPath().getName());
        if (!matcher.matches()) {
          throw new IllegalArgumentException("Unexpected file name, unable to deduce partition #:"
              + o2.getPath());
        }
        int p2 = Integer.parseInt(matcher.group(3));

        return p1 - p2;
      }
    };

  /**
   * Helper capability to load distributed row matrices into a dense in-core
   * matrix (mainly to support tests).
   *
   * @param fs
   *          filesystem
   * @param glob
   *          FS glob
   * @param conf
   *          configuration
   * @return dense matrix array
   * @throws IOException
   *           when an I/O condition occurs.
   */
  public static double[][] loadDistributedRowMatrix(FileSystem fs,
                                                    Path glob,
                                                    Configuration conf)
    throws IOException {

    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
      return null;
    }

    List<double[]> denseData = Lists.newArrayList();

    /*
     * assume it is partitioned output, so we need to read it up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    for (FileStatus fstat : files) {
      for (VectorWritable value
           : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                           true,
                                                           conf)) {
        Vector v = value.get();
        int size = v.size();
        double[] row = new double[size];
        for (int i = 0; i < size; i++) {
          row[i] = v.get(i);
        }
        // ignore the row label.
        denseData.add(row);
      }
    }

    return denseData.toArray(new double[denseData.size()][]);
  }

  /**
   * Load multiple upper triangular matrices and sum them up.
   *
   * @param fs
   *          filesystem
   * @param glob
   *          FS glob
   * @param conf
   *          configuration
   * @return the sum of the upper triangular inputs.
   * @throws IOException
   *           when an I/O condition occurs.
   */
  public static UpperTriangular loadAndSumUpperTriangularMatrices(FileSystem fs,
                                                                  Path glob,
                                                                  Configuration conf)
    throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
      return null;
    }

    /*
     * assume it is partitioned output, so we need to read it up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    DenseVector result = null;
    for (FileStatus fstat : files) {
      for (VectorWritable value
           : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                           true,
                                                           conf)) {
        Vector v = value.get();
        if (result == null) {
          result = new DenseVector(v);
        } else {
          result.addAll(v);
        }
      }
    }

    if (result == null) {
      throw new IOException("Unexpected underrun in upper triangular matrix files");
    }

    return new UpperTriangular(result);
  }

  /**
   * Load only one upper triangular matrix and issue an error if more than one
   * is found.
   */
  public static UpperTriangular loadUpperTriangularMatrix(FileSystem fs,
                                                          Path glob,
                                                          Configuration conf)
    throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
      return null;
    }

    /*
     * assume it is partitioned output, so we need to read it up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    UpperTriangular result = null;
    for (FileStatus fstat : files) {
      for (VectorWritable value
           : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                           true,
                                                           conf)) {
        Vector v = value.get();
        if (result == null) {
          result = new UpperTriangular(v);
        } else {
          throw new IOException("Unexpected overrun in upper triangular matrix files");
        }
      }
    }

    if (result == null) {
      throw new IOException("Unexpected underrun in upper triangular matrix files");
    }

    return result;
  }

}