/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.cf.taste.impl.recommender.svd;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
import org.apache.mahout.cf.taste.impl.common.RunningAverage;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.model.PreferenceArray;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.RandomWrapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Minimalistic implementation of a parallel SGD factorizer, based on
 * <a href="http://www.sze.hu/~gtakacs/download/jmlr_2009.pdf">
 * "Scalable Collaborative Filtering Approaches for Large Recommender Systems"</a>
 * and
 * <a href="http://www.cs.wisc.edu/~brecht/papers/hogwildTR.pdf">
 * "Hogwild!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent"</a>
 */
public class ParallelSGDFactorizer extends AbstractFactorizer {

  private final DataModel dataModel;

  /** Parameter used to prevent overfitting. */
  private final double lambda;

  /** Number of features used to compute this factorization */
  private final int rank;

  /** Number of iterations */
  private final int numEpochs;

  private int numThreads;

  // these next two, the learning rate and the decay factor, control decayFactor^steps exponential-type annealing
  private double mu0 = 0.01;
  private double decayFactor = 1;
  // these next two control 1/steps^forgettingExponent-type annealing
  private int stepOffset = 0;
  // -1 equals even weighting of all examples, 0 means only use exponential annealing
  private double forgettingExponent = 0;

  // The following two should be inversely proportional :)
  private double biasMuRatio = 0.5;
  private double biasLambdaRatio = 0.1;

  /**
   * TODO: this is not safe as += is not atomic on many processors; it could be replaced with AtomicDoubleArray,
   * but it works just fine right now
   */
  /** user features */
  protected volatile double[][] userVectors;
  /** item features */
  protected volatile double[][] itemVectors;

  private final PreferenceShuffler shuffler;

  private int epoch = 1;

  /** place in user vector where the bias is stored */
  private static final int USER_BIAS_INDEX = 1;
  /** place in item vector where the bias is stored */
  private static final int ITEM_BIAS_INDEX = 2;
  private static final int FEATURE_OFFSET = 3;

  /** Standard deviation for random initialization of features */
  private static final double NOISE = 0.02;

  private static final Logger logger = LoggerFactory.getLogger(ParallelSGDFactorizer.class);

  protected static class PreferenceShuffler {

    private Preference[] preferences;
    private Preference[] unstagedPreferences;

    protected final RandomWrapper random = RandomUtils.getRandom();

    public PreferenceShuffler(DataModel dataModel) throws TasteException {
      cachePreferences(dataModel);
      shuffle();
      stage();
    }

    private int countPreferences(DataModel dataModel) throws TasteException {
      int numPreferences = 0;
      LongPrimitiveIterator userIDs = dataModel.getUserIDs();
      while (userIDs.hasNext()) {
        PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userIDs.nextLong());
        numPreferences += preferencesFromUser.length();
      }
      return numPreferences;
    }

    private void cachePreferences(DataModel dataModel) throws TasteException {
      int numPreferences = countPreferences(dataModel);
      preferences = new Preference[numPreferences];

      LongPrimitiveIterator userIDs = dataModel.getUserIDs();
      int index = 0;
      while (userIDs.hasNext()) {
        long userID = userIDs.nextLong();
        PreferenceArray preferencesFromUser = dataModel.getPreferencesFromUser(userID);
        for (Preference preference : preferencesFromUser) {
          preferences[index++] = preference;
        }
      }
    }

    public final void shuffle() {
      unstagedPreferences = preferences.clone();
      /* Durstenfeld shuffle */
      for (int i = unstagedPreferences.length - 1; i > 0; i--) {
        int rand = random.nextInt(i + 1);
        swapCachedPreferences(i, rand);
      }
    }

    // merging this part into shuffle() makes the compiler/optimizer do some really absurd stuff; tested on OpenJDK 7
    private void swapCachedPreferences(int x, int y) {
      Preference p = unstagedPreferences[x];
      unstagedPreferences[x] = unstagedPreferences[y];
      unstagedPreferences[y] = p;
    }

    public final void stage() {
      preferences = unstagedPreferences;
    }

    public Preference get(int i) {
      return preferences[i];
    }

    public int size() {
      return preferences.length;
    }
  }
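  /*
   * Note on the constructors below: rank is numFeatures + FEATURE_OFFSET because the first three positions of every
   * user/item vector are reserved for the global average, the user bias and the item bias (see initialize()); only
   * the remaining positions hold the randomly initialized latent features.
   */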
  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numEpochs)
    throws TasteException {
    super(dataModel);
    this.dataModel = dataModel;
    this.rank = numFeatures + FEATURE_OFFSET;
    this.lambda = lambda;
    this.numEpochs = numEpochs;

    shuffler = new PreferenceShuffler(dataModel);

    // max thread num set to n^0.25 as suggested by the Hogwild! paper
    numThreads = Math.min(Runtime.getRuntime().availableProcessors(),
                          (int) Math.pow((double) shuffler.size(), 0.25));
  }

  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
                               double mu0, double decayFactor, int stepOffset, double forgettingExponent)
    throws TasteException {
    this(dataModel, numFeatures, lambda, numIterations);

    this.mu0 = mu0;
    this.decayFactor = decayFactor;
    this.stepOffset = stepOffset;
    this.forgettingExponent = forgettingExponent;
  }

  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
                               double mu0, double decayFactor, int stepOffset, double forgettingExponent,
                               int numThreads) throws TasteException {
    this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent);

    this.numThreads = numThreads;
  }

  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
                               double mu0, double decayFactor, int stepOffset, double forgettingExponent,
                               double biasMuRatio, double biasLambdaRatio) throws TasteException {
    this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent);

    this.biasMuRatio = biasMuRatio;
    this.biasLambdaRatio = biasLambdaRatio;
  }

  public ParallelSGDFactorizer(DataModel dataModel, int numFeatures, double lambda, int numIterations,
                               double mu0, double decayFactor, int stepOffset, double forgettingExponent,
                               double biasMuRatio, double biasLambdaRatio, int numThreads) throws TasteException {
    this(dataModel, numFeatures, lambda, numIterations, mu0, decayFactor, stepOffset, forgettingExponent,
         biasMuRatio, biasLambdaRatio);

    this.numThreads = numThreads;
  }

  protected void initialize() throws TasteException {
    RandomWrapper random = RandomUtils.getRandom();
    userVectors = new double[dataModel.getNumUsers()][rank];
    itemVectors = new double[dataModel.getNumItems()][rank];

    double globalAverage = getAveragePreference();
    for (int userIndex = 0; userIndex < userVectors.length; userIndex++) {
      userVectors[userIndex][0] = globalAverage;
      userVectors[userIndex][USER_BIAS_INDEX] = 0; // will store user bias
      userVectors[userIndex][ITEM_BIAS_INDEX] = 1; // corresponding item feature contains item bias
      for (int feature = FEATURE_OFFSET; feature < rank; feature++) {
        userVectors[userIndex][feature] = random.nextGaussian() * NOISE;
      }
    }
    for (int itemIndex = 0; itemIndex < itemVectors.length; itemIndex++) {
      itemVectors[itemIndex][0] = 1; // corresponding user feature contains global average
      itemVectors[itemIndex][USER_BIAS_INDEX] = 1; // corresponding user feature contains user bias
      itemVectors[itemIndex][ITEM_BIAS_INDEX] = 0; // will store item bias
      for (int feature = FEATURE_OFFSET; feature < rank; feature++) {
        itemVectors[itemIndex][feature] = random.nextGaussian() * NOISE;
      }
    }
  }

  // TODO: needs optimization
  private double getMu(int i) {
    return mu0 * Math.pow(decayFactor, i - 1) * Math.pow(i + stepOffset, forgettingExponent);
  }
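  /*
   * Each epoch stages the most recent shuffle of the cached preferences, splits them into numThreads contiguous
   * ranges and lets the worker threads apply SGD updates to the shared user/item vectors without any locking
   * (Hogwild!-style); the shuffle for the next epoch is prepared while waiting for the workers to terminate.
   */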
  @Override
  public Factorization factorize() throws TasteException {
    initialize();

    if (logger.isInfoEnabled()) {
      logger.info("starting to compute the factorization...");
    }

    for (epoch = 1; epoch <= numEpochs; epoch++) {
      shuffler.stage();

      final double mu = getMu(epoch);
      int subSize = shuffler.size() / numThreads + 1;

      ExecutorService executor = Executors.newFixedThreadPool(numThreads);

      try {
        for (int t = 0; t < numThreads; t++) {
          final int iStart = t * subSize;
          final int iEnd = Math.min((t + 1) * subSize, shuffler.size());

          executor.execute(new Runnable() {
            @Override
            public void run() {
              for (int i = iStart; i < iEnd; i++) {
                update(shuffler.get(i), mu);
              }
            }
          });
        }
      } finally {
        executor.shutdown();
        shuffler.shuffle();

        try {
          boolean terminated = executor.awaitTermination(numEpochs * shuffler.size(), TimeUnit.MICROSECONDS);
          if (!terminated) {
            logger.error("subtasks take forever, returning anyway");
          }
        } catch (InterruptedException e) {
          throw new TasteException("waiting for termination interrupted", e);
        }
      }
    }

    return createFactorization(userVectors, itemVectors);
  }

  double getAveragePreference() throws TasteException {
    RunningAverage average = new FullRunningAverage();
    LongPrimitiveIterator it = dataModel.getUserIDs();
    while (it.hasNext()) {
      for (Preference pref : dataModel.getPreferencesFromUser(it.nextLong())) {
        average.addDatum(pref.getValue());
      }
    }
    return average.getAverage();
  }

  /**
   * TODO: this is the vanilla SGD of Takács et al. 2009. I speculate that using the scaling technique proposed in
   * "Towards Optimal One Pass Large Scale Learning with Averaged Stochastic Gradient Descent", section 5, page 6,
   * could be beneficial in terms of both speed and accuracy.
   *
   * Takács' method doesn't calculate the gradient of the regularization term correctly: that gradient has non-zero
   * elements everywhere in the matrix, while Takács' method only updates a single row/column, so if one user has a
   * lot of preferences, her vector will be affected more by regularization. Using an isolated scaling factor for
   * both user vectors and item vectors can remove this issue without inducing more update cost; it even reduces it
   * a bit by only performing one addition and one multiplication.
   *
   * BAD SIDE 1: the scaling factor decreases fast; it has to be scaled up from time to time before it drops to zero
   * or causes roundoff errors.
   * BAD SIDE 2: nobody has experimented with it before, and people generally use a very small lambda,
   * so its impact on accuracy may still be unknown.
   * BAD SIDE 3: don't know how to make it work for L1-regularization or
   * "pseudorank?" (sum of singular values)-regularization.
   */
  protected void update(Preference preference, double mu) {
    int userIndex = userIndex(preference.getUserID());
    int itemIndex = itemIndex(preference.getItemID());

    double[] userVector = userVectors[userIndex];
    double[] itemVector = itemVectors[itemIndex];

    double prediction = dot(userVector, itemVector);
    double err = preference.getValue() - prediction;

    // adjust features
    for (int k = FEATURE_OFFSET; k < rank; k++) {
      double userFeature = userVector[k];
      double itemFeature = itemVector[k];

      userVector[k] += mu * (err * itemFeature - lambda * userFeature);
      itemVector[k] += mu * (err * userFeature - lambda * itemFeature);
    }

    // adjust user and item bias
    userVector[USER_BIAS_INDEX] += biasMuRatio * mu * (err - biasLambdaRatio * lambda * userVector[USER_BIAS_INDEX]);
    itemVector[ITEM_BIAS_INDEX] += biasMuRatio * mu * (err - biasLambdaRatio * lambda * itemVector[ITEM_BIAS_INDEX]);
  }

  private double dot(double[] userVector, double[] itemVector) {
    double sum = 0;
    for (int k = 0; k < rank; k++) {
      sum += userVector[k] * itemVector[k];
    }
    return sum;
  }
}
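/*
 * A minimal usage sketch (not part of the class above): ParallelSGDFactorizer is a Factorizer, so it plugs into
 * SVDRecommender like the other factorizers in this package. The file name "ratings.csv", the user ID and the
 * hyper-parameter values below are illustrative assumptions only.
 *
 *   DataModel model = new FileDataModel(new File("ratings.csv"));           // userID,itemID,preference per line
 *   Factorizer factorizer = new ParallelSGDFactorizer(model, 10, 0.01, 20); // 10 features, lambda 0.01, 20 epochs
 *   Recommender recommender = new SVDRecommender(model, factorizer);
 *   List<RecommendedItem> topItems = recommender.recommend(42L, 5);         // top 5 items for user 42
 */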