/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.math.hadoop.stochasticsvd;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Deque;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Writable;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.ssvd.EigenSolverWrapper;

import com.google.common.collect.Lists;
import com.google.common.io.Closeables;

/**
 * Stochastic SVD solver (API class).
 * <P>
 *
 * Implementation details are in my working notes in MAHOUT-376
 * (https://issues.apache.org/jira/browse/MAHOUT-376).
 * <P>
 *
 * As of the time of this writing, I don't have benchmarks comparing this
 * method to others. However, the non-Hadoop differentiating characteristics of
 * this method are thought to be:
 * <LI>"faster": precision is traded off in favor of speed. There is, however,
 * a lever in the form of the oversampling parameter p. Higher values of p
 * produce better precision but trade off speed (and raise the minimum RAM
 * requirement). This also means that this method is almost guaranteed to be
 * less precise than Lanczos unless a full-rank SVD decomposition is sought.
 * <LI>"more scale": it can presumably take on larger problems than the
 * Lanczos solver (not confirmed by benchmark at this time).
 * <P>
 * <P>
 *
 * Specifically in regard to this implementation, <i>I think</i> a couple of
 * other differentiating points are:
 * <LI>there is no need to specify the input matrix height or width on the
 * command line; they are whatever the input turns out to be.
 * <LI>any Writable is supported as a DRM row key and is copied to the
 * corresponding row of the U matrix;
 * <LI>one can request U, V, U<sub>σ</sub>=U*Σ<sup>0.5</sup> or
 * V<sub>σ</sub>=V*Σ<sup>0.5</sup>, none of which requires a pass over the
 * input A; these jobs are parallel map-only jobs.
 * <P>
 * <P>
 *
 * This class is the central public API for the SSVD solver. The use pattern is
 * as follows (see the sketch after this list):
 *
 * <UL>
 * <LI>create the solver using the constructor, supplying the computation
 * parameters.
 * <LI>set optional parameters through the setter methods.
 * <LI>call {@link #run()}.
 * <LI>{@link #getUPath()} (if computed) returns the path to the directory
 * containing the m x k U matrix file(s).
 * <LI>{@link #getVPath()} (if computed) returns the path to the directory
 * containing the n x k V matrix file(s).
 * </UL>
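 * <P>
 * A minimal usage sketch follows; the paths and parameter values are
 * illustrative only, not recommendations:
 *
 * <pre>
 * {@code
 * Configuration conf = new Configuration();
 * Path[] inputs = {new Path("/path/to/drm/input")}; // DRM-formatted input
 * Path output = new Path("/path/to/ssvd/output");
 *
 * // q-blocks of 30,000 rows, rank k=100, oversampling p=15, 10 reducers
 * SSVDSolver solver = new SSVDSolver(conf, inputs, output, 30000, 100, 15, 10);
 * solver.setQ(1);            // one extra power iteration for better precision
 * solver.setOverwrite(true); // clean the output folder first if it exists
 * solver.run();
 *
 * double[] sigma = solver.getSingularValues(); // k+p values, largest first
 * String uDir = solver.getUPath(); // null if U was not computed
 * String vDir = solver.getVPath(); // null if V was not computed
 * }
 * </pre>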
 */
public class SSVDSolver {

  private double[] svalues;
  private boolean computeU = true;
  private boolean computeV = true;
  private String uPath;
  private String vPath;
  private int outerBlockHeight = 30000;
  private int abtBlockHeight = 200000;

  // configured stuff
  private final Configuration conf;
  private final Path[] inputPath;
  private final Path outputPath;
  private final int ablockRows;
  private final int k;
  private final int p;
  private int q;
  private final int reduceTasks;
  private int minSplitSize = -1;
  private boolean cUHalfSigma;
  private boolean cVHalfSigma;
  private boolean overwrite;
  private boolean broadcast = true;

  /**
   * Create a new SSVD solver. Required parameters are passed to the
   * constructor to ensure they are set. Optional parameters can be set using
   * the setters.
   * <P>
   *
   * @param conf
   *          hadoop configuration
   * @param inputPath
   *          Input path (should be compatible with DistributedRowMatrix as of
   *          the time of this writing).
   * @param outputPath
   *          Output path containing U, V and singular values vector files.
   * @param ablockRows
   *          The vertical height of a q-block (bigger values require more
   *          memory in mappers and perhaps larger {@code minSplitSize}
   *          values).
   * @param k
   *          desired rank
   * @param p
   *          SSVD oversampling parameter
   * @param reduceTasks
   *          Number of reduce tasks (where applicable)
   */
  public SSVDSolver(Configuration conf,
                    Path[] inputPath,
                    Path outputPath,
                    int ablockRows,
                    int k,
                    int p,
                    int reduceTasks) {
    this.conf = conf;
    this.inputPath = inputPath;
    this.outputPath = outputPath;
    this.ablockRows = ablockRows;
    this.k = k;
    this.p = p;
    this.reduceTasks = reduceTasks;
  }

  public void setcUHalfSigma(boolean cUHat) {
    this.cUHalfSigma = cUHat;
  }

  public void setcVHalfSigma(boolean cVHat) {
    this.cVHalfSigma = cVHat;
  }

  public int getQ() {
    return q;
  }

  /**
   * Sets q, the number of additional power iterations used to increase
   * precision (typically 0..2). Defaults to 0.
   *
   * @param q
   *          number of power iterations
   */
  public void setQ(int q) {
    this.q = q;
  }

  /**
   * Setting controlling whether to compute the U matrix of the low-rank SSVD.
   */
  public void setComputeU(boolean val) {
    computeU = val;
  }

  /**
   * Setting controlling whether to compute the V matrix of the low-rank SSVD.
   *
   * @param val
   *          true if we want to output the V matrix. Default is true.
   */
  public void setComputeV(boolean val) {
    computeV = val;
  }

  /**
   * Sometimes, if the requested A blocks become larger than a split, we may
   * need to use this to ensure that at least k+p rows of A get into a split.
   * This is a requirement necessary to obtain orthonormalized Q blocks of the
   * SSVD.
   *
   * @param size
   *          the minimum split size to use
   */
  public void setMinSplitSize(int size) {
    minSplitSize = size;
  }

  /**
   * This contains the k+p singular values resulting from the solver run.
   *
   * @return singular values (largest to smallest)
   */
  public double[] getSingularValues() {
    return svalues;
  }

  /**
   * Returns the U path (if computation was requested and successful).
   *
   * @return U output hdfs path, or null if computation was not completed for
   *         whatever reason.
   */
  public String getUPath() {
    return uPath;
  }

  /**
   * Returns the V path (if computation was requested and successful).
   *
   * @return V output hdfs path, or null if computation was not completed for
   *         whatever reason.
   */
  public String getVPath() {
    return vPath;
  }

  public boolean isOverwrite() {
    return overwrite;
  }

  /**
   * If true, the driver cleans the output folder first if it exists.
   *
   * @param overwrite
   *          whether to overwrite an existing output folder
   */
  public void setOverwrite(boolean overwrite) {
    this.overwrite = overwrite;
  }

  public int getOuterBlockHeight() {
    return outerBlockHeight;
  }

  /**
   * The height of outer blocks during the Q'A multiplication. Higher values
   * produce fewer keys for the combine and shuffle-and-sort phases, thereby
   * somewhat improving running time, but require larger blocks to be formed
   * in RAM (so setting this too high can lead to OOM).
   *
   * @param outerBlockHeight
   *          outer block height in rows
   */
  public void setOuterBlockHeight(int outerBlockHeight) {
    this.outerBlockHeight = outerBlockHeight;
  }

  public int getAbtBlockHeight() {
    return abtBlockHeight;
  }

  /**
   * The block height of Y_i during power iterations. It is probably important
   * to set this higher than the default of 200,000 for extremely sparse
   * inputs and when more RAM is available. A Y_i block in the ABt job
   * occupies approximately abtBlockHeight x (k+p) x sizeof(double) bytes
   * (as dense).
   *
   * @param abtBlockHeight
   *          Y_i block height in rows
   */
  public void setAbtBlockHeight(int abtBlockHeight) {
    this.abtBlockHeight = abtBlockHeight;
  }

  public boolean isBroadcast() {
    return broadcast;
  }

  /**
   * If this property is true, the DistributedCache mechanism is used to
   * broadcast some of the intermediate data around. This may improve
   * efficiency. Default is true.
   *
   * @param broadcast
   *          whether to use DistributedCache broadcasting
   */
  public void setBroadcast(boolean broadcast) {
    this.broadcast = broadcast;
  }

  /**
   * Run all SSVD jobs: the Q-job, the B'-job, the optional power iterations,
   * the in-core eigendecomposition of BB', and finally the map-only U and/or
   * V jobs if requested.
   *
   * @throws IOException
   *           if an I/O condition occurs.
   */
  public void run() throws IOException {

    Deque<Closeable> closeables = Lists.newLinkedList();
    try {
      Class<? extends Writable> labelType =
        sniffInputLabelType(inputPath, conf);
      FileSystem fs = FileSystem.get(conf);

      Path qPath = new Path(outputPath, "Q-job");
      Path btPath = new Path(outputPath, "Bt-job");
      Path uHatPath = new Path(outputPath, "UHat");
      Path svPath = new Path(outputPath, "Sigma");
      Path uPath = new Path(outputPath, "U");
      Path vPath = new Path(outputPath, "V");

      if (overwrite) {
        fs.delete(outputPath, true);
      }

      Random rnd = RandomUtils.getRandom();
      long seed = rnd.nextLong();

      QJob.run(conf,
               inputPath,
               qPath,
               ablockRows,
               minSplitSize,
               k,
               p,
               seed,
               reduceTasks);

      /*
       * Restrict the number of reducers to a reasonable number so we don't
       * have to run too many additions in the frontend when reconstructing
       * BBt for the last B' and BB' computations. The user may not realize
       * that and give a bit too many (I would be happy if that were ever the
       * case, though).
       */
      BtJob.run(conf,
                inputPath,
                qPath,
                btPath,
                minSplitSize,
                k,
                p,
                outerBlockHeight,
                q <= 0 ? Math.min(1000, reduceTasks) : reduceTasks,
                broadcast,
                labelType,
                q <= 0);

      // power iterations
      for (int i = 0; i < q; i++) {
        qPath = new Path(outputPath, String.format("ABt-job-%d", i + 1));
        Path btPathGlob = new Path(btPath, BtJob.OUTPUT_BT + "-*");
        ABtDenseOutJob.run(conf,
                           inputPath,
                           btPathGlob,
                           qPath,
                           ablockRows,
                           minSplitSize,
                           k,
                           p,
                           abtBlockHeight,
                           reduceTasks,
                           broadcast);

        btPath = new Path(outputPath, String.format("Bt-job-%d", i + 1));

        BtJob.run(conf,
                  inputPath,
                  qPath,
                  btPath,
                  minSplitSize,
                  k,
                  p,
                  outerBlockHeight,
                  i == q - 1 ?
                      Math.min(1000, reduceTasks) : reduceTasks,
                  broadcast,
                  labelType,
                  i == q - 1);
      }

      UpperTriangular bbt =
        loadAndSumUpperTriangularMatrices(fs,
                                          new Path(btPath, BtJob.OUTPUT_BBT
                                              + "-*"),
                                          conf);

      // convert bbt to something our eigensolver could understand
      assert bbt.columnSize() == k + p;

      double[][] bbtSquare = new double[k + p][];
      for (int i = 0; i < k + p; i++) {
        bbtSquare[i] = new double[k + p];
      }

      for (int i = 0; i < k + p; i++) {
        for (int j = i; j < k + p; j++) {
          bbtSquare[i][j] = bbtSquare[j][i] = bbt.getQuick(i, j);
        }
      }

      svalues = new double[k + p];

      EigenSolverWrapper eigenWrapper = new EigenSolverWrapper(bbtSquare);
      double[] eigenva2 = eigenWrapper.getEigenValues();
      for (int i = 0; i < k + p; i++) {
        // singular values of B are the square roots of the eigenvalues of BB'
        svalues[i] = Math.sqrt(eigenva2[i]);
      }

      // save/redistribute UHat
      double[][] uHat = eigenWrapper.getUHat();

      fs.mkdirs(uHatPath);
      SequenceFile.Writer uHatWriter =
        SequenceFile.createWriter(fs,
                                  conf,
                                  uHatPath = new Path(uHatPath, "uhat.seq"),
                                  IntWritable.class,
                                  VectorWritable.class,
                                  CompressionType.BLOCK);
      closeables.addFirst(uHatWriter);

      int m = uHat.length;
      IntWritable iw = new IntWritable();
      VectorWritable vw = new VectorWritable();
      for (int i = 0; i < m; i++) {
        vw.set(new DenseVector(uHat[i], true));
        iw.set(i);
        uHatWriter.append(iw, vw);
      }
      closeables.remove(uHatWriter);
      Closeables.closeQuietly(uHatWriter);

      SequenceFile.Writer svWriter =
        SequenceFile.createWriter(fs,
                                  conf,
                                  svPath = new Path(svPath, "svalues.seq"),
                                  IntWritable.class,
                                  VectorWritable.class,
                                  CompressionType.BLOCK);
      closeables.addFirst(svWriter);

      vw.set(new DenseVector(svalues, true));
      svWriter.append(iw, vw);

      closeables.remove(svWriter);
      Closeables.closeQuietly(svWriter);

      UJob ujob = null;
      if (computeU) {
        ujob = new UJob();
        ujob.start(conf,
                   new Path(btPath, BtJob.OUTPUT_Q + "-*"),
                   uHatPath,
                   svPath,
                   uPath,
                   k,
                   reduceTasks,
                   labelType,
                   cUHalfSigma);
        // actually this is a map-only job anyway
      }

      VJob vjob = null;
      if (computeV) {
        vjob = new VJob();
        vjob.start(conf,
                   new Path(btPath, BtJob.OUTPUT_BT + "-*"),
                   uHatPath,
                   svPath,
                   vPath,
                   k,
                   reduceTasks,
                   cVHalfSigma);
      }

      if (ujob != null) {
        ujob.waitForCompletion();
        this.uPath = uPath.toString();
      }
      if (vjob != null) {
        vjob.waitForCompletion();
        this.vPath = vPath.toString();
      }

    } catch (InterruptedException exc) {
      throw new IOException("Interrupted", exc);
    } catch (ClassNotFoundException exc) {
      throw new IOException(exc);
    } finally {
      IOUtils.close(closeables);
    }
  }
  private static Class<? extends Writable> sniffInputLabelType(Path[] inputPath,
                                                               Configuration conf)
    throws IOException {
    FileSystem fs = FileSystem.get(conf);
    for (Path p : inputPath) {
      FileStatus[] fstats = fs.globStatus(p);
      if (fstats == null || fstats.length == 0) {
        continue;
      }

      FileStatus firstSeqFile;
      if (fstats[0].isDir()) {
        firstSeqFile =
          fs.listStatus(fstats[0].getPath(), PathFilters.logsCRCFilter())[0];
      } else {
        firstSeqFile = fstats[0];
      }

      SequenceFile.Reader r = null;
      try {
        r = new SequenceFile.Reader(fs, firstSeqFile.getPath(), conf);
        return r.getKeyClass().asSubclass(Writable.class);
      } finally {
        Closeables.closeQuietly(r);
      }
    }
    throw new IOException("Unable to open input files to determine input label type.");
  }

  private static final Pattern OUTPUT_FILE_PATTERN =
    Pattern.compile("(\\w+)-(m|r)-(\\d+)(\\.\\w+)?");

  static final Comparator<FileStatus> PARTITION_COMPARATOR =
    new Comparator<FileStatus>() {
      private final Matcher matcher = OUTPUT_FILE_PATTERN.matcher("");

      @Override
      public int compare(FileStatus o1, FileStatus o2) {
        matcher.reset(o1.getPath().getName());
        if (!matcher.matches()) {
          throw new IllegalArgumentException("Unexpected file name, unable to deduce partition #:"
              + o1.getPath());
        }
        int p1 = Integer.parseInt(matcher.group(3));
        matcher.reset(o2.getPath().getName());
        if (!matcher.matches()) {
          throw new IllegalArgumentException("Unexpected file name, unable to deduce partition #:"
              + o2.getPath());
        }
        int p2 = Integer.parseInt(matcher.group(3));

        return p1 - p2;
      }
    };

  /**
   * Helper capability to load distributed row matrices into a dense in-core
   * matrix (mainly to support tests).
   *
   * @param fs
   *          filesystem
   * @param glob
   *          FS glob
   * @param conf
   *          configuration
   * @return dense matrix array
   * @throws IOException
   *           when an I/O condition occurs.
   */
  public static double[][] loadDistributedRowMatrix(FileSystem fs,
                                                    Path glob,
                                                    Configuration conf)
    throws IOException {

    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
      return null;
    }

    List<double[]> denseData = Lists.newArrayList();

    /*
     * assume it is partitioned output, so we need to read it up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    for (FileStatus fstat : files) {
      for (VectorWritable value
           : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                           true,
                                                           conf)) {
        Vector v = value.get();
        int size = v.size();
        double[] row = new double[size];
        for (int i = 0; i < size; i++) {
          row[i] = v.get(i);
        }
        // ignore the row label.
        denseData.add(row);
      }
    }

    return denseData.toArray(new double[denseData.size()][]);
  }

  /**
   * Load multiple upper triangular matrices and sum them up.
   *
   * @param fs
   *          filesystem
   * @param glob
   *          FS glob
   * @param conf
   *          configuration
   * @return the sum of the upper triangular inputs.
   * @throws IOException
   *           when an I/O condition occurs.
   */
  public static UpperTriangular loadAndSumUpperTriangularMatrices(FileSystem fs,
                                                                  Path glob,
                                                                  Configuration conf)
    throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
      return null;
    }

    /*
     * assume it is partitioned output, so we need to read it up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    DenseVector result = null;
    for (FileStatus fstat : files) {
      for (VectorWritable value
           : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                           true,
                                                           conf)) {
        Vector v = value.get();
        if (result == null) {
          result = new DenseVector(v);
        } else {
          result.addAll(v);
        }
      }
    }

    if (result == null) {
      throw new IOException("Unexpected underrun in upper triangular matrix files");
    }

    return new UpperTriangular(result);
  }

  /**
   * Load only one upper triangular matrix and issue an error if more than one
   * is found.
   */
  public static UpperTriangular loadUpperTriangularMatrix(FileSystem fs,
                                                          Path glob,
                                                          Configuration conf)
    throws IOException {
    FileStatus[] files = fs.globStatus(glob);
    if (files == null) {
      return null;
    }

    /*
     * assume it is partitioned output, so we need to read it up in order of
     * partitions.
     */
    Arrays.sort(files, PARTITION_COMPARATOR);

    UpperTriangular result = null;
    for (FileStatus fstat : files) {
      for (VectorWritable value
           : new SequenceFileValueIterable<VectorWritable>(fstat.getPath(),
                                                           true,
                                                           conf)) {
        Vector v = value.get();
        if (result == null) {
          result = new UpperTriangular(v);
        } else {
          throw new IOException("Unexpected overrun in upper triangular matrix files");
        }
      }
    }

    if (result == null) {
      throw new IOException("Unexpected underrun in upper triangular matrix files");
    }

    return result;
  }

}