/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.skp.experiment.cf.als.hadoop;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
import org.apache.mahout.cf.taste.impl.common.RunningAverage;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.mapreduce.MergeVectorsCombiner;
import org.apache.mahout.common.mapreduce.MergeVectorsReducer;
import org.apache.mahout.common.mapreduce.TransposeMapper;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixSlice;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.skp.experiment.math.als.hadoop.DistributedImplicitFeedbackAlternatingLeastSquaresSolver;
import com.skp.experiment.math.als.hadoop.ImplicitFeedbackAlternatingLeastSquaresSolver;
/**
* <p>MapReduce implementation of the two factorization algorithms described in:</p>
*
* <p>"Large-scale Parallel Collaborative Filtering for the Netflix Prize", available at
* http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf</p>
*
* <p>"Collaborative Filtering for Implicit Feedback Datasets", available at
* http://research.yahoo.com/pub/2433</p>
*
* <p>Command line arguments specific to this class are:</p>
*
* <ol>
* <li>--input (path): directory containing one or more text files with the dataset</li>
* <li>--output (path): path where the factorized matrices should go</li>
* <li>--lambda (double): regularization parameter to avoid overfitting</li>
* <li>--implicitFeedback (boolean): whether the data consists of implicit feedback (default: false)</li>
* <li>--alpha (double): confidence parameter, only used with implicit feedback (default: 40)</li>
* <li>--numFeatures (int): dimension of the feature space</li>
* <li>--numIterations (int): number of iterations to run</li>
* <li>--numUsers (int): number of users in the dataset</li>
* <li>--numItems (int): number of items in the dataset</li>
* <li>--blockSize (long): DFS block size used to size input splits (optional, defaults to 64MB)</li>
* </ol>
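*
* <p>A hypothetical invocation (all paths, jar name, and parameter values below are
* illustrative, not taken from a real deployment) might look like this:</p>
*
* <pre>
* hadoop jar experiment-job.jar com.skp.experiment.cf.als.hadoop.DistributedParallelALSFactorizationJob \
*   --input /data/ratings --output /data/als --tempDir /data/als-tmp \
*   --lambda 0.065 --implicitFeedback true --alpha 40 \
*   --numFeatures 20 --numIterations 10 \
*   --numUsers 1000000 --numItems 50000
* </pre>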
*/
public class DistributedParallelALSFactorizationJob extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(DistributedParallelALSFactorizationJob.class);
private static final String LZO_CODEC_CLASS = "org.apache.hadoop.io.compress.LzoCodec";
private static final int LARGE_MATRIX_MAP_TASKS_NUM = 1000000;
private static final String SMALL_MATRIX_MEMORY = "-Xmx2g";
static final String NUM_FEATURES = DistributedParallelALSFactorizationJob.class.getName() + ".numFeatures";
static final String LAMBDA = DistributedParallelALSFactorizationJob.class.getName() + ".lambda";
static final String ALPHA = DistributedParallelALSFactorizationJob.class.getName() + ".alpha";
static final String FEATURE_MATRIX = DistributedParallelALSFactorizationJob.class.getName() + ".featureMatrix";
static final String FEATURE_MATRIX_TRANSPOSE = DistributedParallelALSFactorizationJob.class.getName() + ".featureMatrixTranspose";
private int numUsers;
private int numItems;
private boolean implicitFeedback;
private int numIterations;
private int numFeatures;
private double lambda;
private double alpha;
private long dfsBlockSize;
public static void main(String[] args) throws Exception {
ToolRunner.run(new DistributedParallelALSFactorizationJob(), args);
}
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption("lambda", null, "regularization parameter", true);
addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
addOption("numFeatures", null, "dimension of the feature space", true);
addOption("numIterations", null, "number of iterations", true);
addOption("numUsers", null, "number of users", true);
addOption("numItems", null, "number of items", true);
addOption("blockSize", null, "dfs block size.", false);
//addOption("runIterations", null, "true or false for iterations", true);
Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
lambda = Double.parseDouble(parsedArgs.get("--lambda"));
alpha = Double.parseDouble(parsedArgs.get("--alpha"));
implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
numUsers = Integer.parseInt(parsedArgs.get("--numUsers"));
numItems = Integer.parseInt(parsedArgs.get("--numItems"));
dfsBlockSize = getOption("blockSize") == null ? 64 * 1024 * 1024 : Long.parseLong(getOption("blockSize"));
/*
* compute the factorization A = U M'
*
* where A (users x items) is the matrix of known ratings
* U (users x features) is the representation of users in the feature space
* M (items x features) is the representation of items in the feature space
*/
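/*
* ALS alternates between the two factors: with M fixed, each user row u_i is
* recomputed independently; with U fixed, each item row m_j is recomputed.
* For explicit feedback (the Netflix Prize paper) every row solve is a small
* regularized least-squares problem. For implicit feedback (Hu et al.) the
* ratings enter only through confidence weights c_ui = 1 + alpha * r_ui, and
*
*   u_i = (Y'Y + Y'(C_i - I)Y + lambda * I)^-1 Y' C_i p(i)
*
* where Y is the fixed item-feature matrix, C_i the diagonal confidence
* matrix of user i, and p(i) the binarized preference vector. Precomputing
* the small Gramian Y'Y once per iteration (the curM.times(curM) call below)
* keeps the per-row solve cheap.
*/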
/* create A' */
Job itemRatings = prepareJob(getInputPath(), pathToItemRatings(),
TextInputFormat.class, ItemRatingVectorsMapper.class, IntWritable.class,
VectorWritable.class, VectorSumReducer.class, IntWritable.class,
VectorWritable.class, SequenceFileOutputFormat.class);
itemRatings.setCombinerClass(VectorSumReducer.class);
itemRatings.waitForCompletion(true);
//numItems =
// (int) itemRatings.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
log.info("Number of Items\t{}", numItems);
/* create A */
Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(),
TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
VectorWritable.class);
userRatings.setCombinerClass(MergeVectorsCombiner.class);
userRatings.waitForCompletion(true);
//numUsers =
// (int) userRatings.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
log.info("Number of Users\t{}", numUsers);
/* count rated items per user */
Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnts"), SequenceFileInputFormat.class,
UserItemCntsMapper.class, IntWritable.class, IntWritable.class,
UserItemCntsReducer.class, IntWritable.class, IntWritable.class, SequenceFileOutputFormat.class);
userItemCntsJob.setJobName("user ratings count");
userItemCntsJob.setCombinerClass(UserItemCntsReducer.class);
userItemCntsJob.waitForCompletion(true);
//TODO this could be fiddled into one of the upper jobs
Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
IntWritable.class, VectorWritable.class);
averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
averageItemRatings.waitForCompletion(true);
Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());
/* create an initial M */
initializeM(averageRatings);
for (int currentIteration = 0; currentIteration < numIterations; currentIteration++) {
DistributedRowMatrix curM =
new DistributedRowMatrix(pathToM(currentIteration-1), getTempPath("Mtemp" + String.valueOf(currentIteration-1)),
numItems, numFeatures);
curM.setConf(new Configuration());
DistributedRowMatrix YtransposeY = curM.times(curM);
// broadcast M, read A row-wise, recompute U row-wise //
log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
YtransposeY.getRowPath(), numItems);
DistributedRowMatrix curU =
new DistributedRowMatrix(pathToU(currentIteration), getTempPath("Utmp" + String.valueOf(currentIteration)),
numUsers, numFeatures);
curU.setConf(new Configuration());
DistributedRowMatrix XtransposeX = curU.times(curU);
// set up index of U //
CreateMapFileFromSeq.createMapFile(pathToU(currentIteration));
// broadcast U, read A' row-wise, recompute M row-wise //
log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
runDistributedImplicitSolver(pathToItemRatings(), pathToM(currentIteration),
pathToU(currentIteration), XtransposeX.getRowPath(), numUsers);
}
return 0;
}
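/**
* Writes the initial item-feature matrix M (stored as iteration -1): the first
* feature of each item row is the item's average rating, the remaining
* numFeatures - 1 entries are drawn uniformly at random, as suggested in the
* Netflix Prize paper referenced above.
*/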
private void initializeM(Vector averageRatings) throws IOException {
Random random = RandomUtils.getRandom();
FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
SequenceFile.Writer writer = null;
//MapFile.Writer writer = null;
try {
//writer = new MapFile.Writer(getConf(), fs, pathToM(-1).toString(), IntWritable.class, VectorWritable.class);
writer = new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"), IntWritable.class,
VectorWritable.class);
Iterator<Vector.Element> averages = averageRatings.iterateNonZero();
while (averages.hasNext()) {
Vector.Element e = averages.next();
Vector row = new DenseVector(numFeatures);
row.setQuick(0, e.get());
for (int m = 1; m < numFeatures; m++) {
row.setQuick(m, random.nextDouble());
}
writer.append(new IntWritable(e.index()), new VectorWritable(row));
}
} finally {
Closeables.closeQuietly(writer);
}
}
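/**
* Runs one half-step of ALS with the fixed factor fully loaded into each map
* task: the mapper reads the feature matrix from pathToUorI and its Gramian
* from pathToTranspose, then solves every incoming ratings row independently.
* Speculative execution is disabled and JVM reuse enabled since every row is
* solved exactly once per iteration.
*/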
private void runSolver(Path ratings, Path output, Path pathToUorI, Path pathToTranspose, int rowNums)
throws ClassNotFoundException, IOException, InterruptedException {
@SuppressWarnings("rawtypes")
Class<? extends Mapper> solverMapper = null;
if (implicitFeedback) {
solverMapper = SolveImplicitFeedbackMapper.class;
} else {
solverMapper = SolveExplicitFeedbackMapper.class;
}
Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, solverMapper, IntWritable.class,
VectorWritable.class, SequenceFileOutputFormat.class);
Configuration solverConf = solverForUorI.getConfiguration();
solverConf.set(LAMBDA, String.valueOf(lambda));
solverConf.set(ALPHA, String.valueOf(alpha));
solverConf.setInt(NUM_FEATURES, numFeatures);
solverConf.set(FEATURE_MATRIX, pathToUorI.toString());
solverConf.set(FEATURE_MATRIX_TRANSPOSE, pathToTranspose.toString());
solverConf.setInt("rowNums", rowNums);
solverConf.set("mapred.child.java.opts", SMALL_MATRIX_MEMORY);
solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
solverConf.setInt("mapred.job.reuse.jvm.num.tasks", -1);
solverConf.setBoolean("mapred.compress.map.output", true);
solverConf.set("mapred.map.output.compression.codec", LZO_CODEC_CLASS);
solverForUorI.waitForCompletion(true);
}
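/**
* Same half-step as runSolver, but for a fixed factor too large for a mapper's
* heap: rows of the feature matrix are looked up on demand through a
* MapFile.Reader (see the index built by CreateMapFileFromSeq in run()).
* Split sizes are pinned to the DFS block size and the task timeout is raised,
* since each map task performs many remote row lookups.
*/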
private void runDistributedImplicitSolver(Path ratings, Path output, Path pathToUorI, Path pathToTranspose, int rowNums)
throws IOException, InterruptedException, ClassNotFoundException {
@SuppressWarnings("rawtypes")
Class<? extends Mapper> solverMapper = DistributedSolveImplicitFeedbackMapper.class;
Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, solverMapper, IntWritable.class,
VectorWritable.class, SequenceFileOutputFormat.class);
Configuration solverConf = solverForUorI.getConfiguration();
solverConf.setLong("mapred.min.split.size", dfsBlockSize);
solverConf.setLong("mapred.max.split.size", dfsBlockSize);
solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
solverConf.setInt("mapred.map.tasks", LARGE_MATRIX_MAP_TASKS_NUM);
solverConf.setLong("mapred.task.timeout", 600000 * 5);
solverConf.setInt("mapred.job.reuse.jvm.num.tasks", -1);
solverConf.set("mapred.child.java.opts", SMALL_MATRIX_MEMORY);
solverConf.set(LAMBDA, String.valueOf(lambda));
solverConf.set(ALPHA, String.valueOf(alpha));
solverConf.setInt(NUM_FEATURES, numFeatures);
solverConf.set(FEATURE_MATRIX, pathToUorI.toString());
solverConf.set(FEATURE_MATRIX_TRANSPOSE, pathToTranspose.toString());
solverConf.setInt("rowNums", rowNums);
solverConf.setBoolean("mapred.compress.map.output", true);
solverConf.set("mapred.map.output.compression.codec", LZO_CODEC_CLASS);
solverForUorI.waitForCompletion(true);
}
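/**
* Solves one user (or item) row of the implicit feedback problem with the
* fixed factor Y held entirely in memory.
*/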
public static class SolveImplicitFeedbackMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
private ImplicitFeedbackAlternatingLeastSquaresSolver solver;
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
double lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
double alpha = Double.parseDouble(ctx.getConfiguration().get(ALPHA));
int numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
Path YPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
Path YtransposeYPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX_TRANSPOSE));
OpenIntObjectHashMap<Vector> Y = ALSMatrixUtil.readMatrixByRows(YPath, ctx);
Matrix YtransposeY = ALSMatrixUtil.readDistributedRowMatrix(YtransposeYPath, numFeatures, numFeatures);
solver = new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, Y, YtransposeY);
}
@Override
protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
throws IOException, InterruptedException {
Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
Vector uiOrmj = solver.solve(ratings);
ctx.write(userOrItemID, new VectorWritable(uiOrmj));
}
}
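/**
* Implicit feedback row solver for the case where the fixed factor does not
* fit in memory: rows of Y are fetched through a MapFile index as needed and
* only the dense numFeatures x numFeatures Gramian Y'Y is materialized locally.
*/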
public static class DistributedSolveImplicitFeedbackMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
private DistributedImplicitFeedbackAlternatingLeastSquaresSolver solver;
private MapFile.Reader reader;
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
double lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
double alpha = Double.parseDouble(ctx.getConfiguration().get(ALPHA));
int numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
Path YPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
Path YtransposeYPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX_TRANSPOSE));
int rowNums = ctx.getConfiguration().getInt("rowNums", -1);
Matrix YtransposeY = fetchDistributedRowMatrix(YtransposeYPath, numFeatures, numFeatures);
FileSystem fs = FileSystem.get(ctx.getConfiguration());
reader = new MapFile.Reader(fs, YPath.toString(), ctx.getConfiguration());
solver = new DistributedImplicitFeedbackAlternatingLeastSquaresSolver(rowNums, numFeatures,
lambda, alpha, reader, YtransposeY);
}
private Matrix fetchDistributedRowMatrix(Path matrixPath, int numRows, int numCols) {
Matrix result = new DenseMatrix(numRows, numCols);
DistributedRowMatrix m =
new DistributedRowMatrix(matrixPath, new Path(matrixPath.toString() + "_tmp"), numRows, numCols);
m.setConf(new Configuration());
Iterator<MatrixSlice> rows = m.iterator();
while (rows.hasNext()) {
MatrixSlice row = rows.next();
result.assignRow(row.index(), row.vector());
}
return result;
}
@Override
protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
throws IOException, InterruptedException {
Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
Vector uiOrmj = solver.solve(ratings);
ctx.write(userOrItemID, new VectorWritable(uiOrmj));
}
@Override
protected void cleanup(Context ctx) throws IOException, InterruptedException {
// close the MapFile index opened in setup()
Closeables.closeQuietly(reader);
}
}
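/**
* Explicit feedback row solver: collects the feature vectors of everything the
* current row interacted with and delegates to Mahout's
* AlternatingLeastSquaresSolver.
*/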
public static class SolveExplicitFeedbackMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
private double lambda;
private int numFeatures;
private OpenIntObjectHashMap<Vector> UorM;
private AlternatingLeastSquaresSolver solver;
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
solver = new AlternatingLeastSquaresSolver();
Path UOrIPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
UorM = ALSMatrixUtil.readMatrixByRows(UOrIPath, ctx);
}
@Override
protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
throws IOException, InterruptedException {
Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
List<Vector> featureVectors = Lists.newArrayList();
Iterator<Vector.Element> interactions = ratings.iterateNonZero();
while (interactions.hasNext()) {
int index = interactions.next().index();
featureVectors.add(UorM.get(index));
}
Vector uiOrmj = solver.solve(featureVectors, ratings, lambda, numFeatures);
ctx.write(userOrItemID, new VectorWritable(uiOrmj));
}
}
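/**
* Emits, per item, a singleton vector holding the item's average rating; all
* outputs share key 0 so the merge reducer assembles them into a single row.
*/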
public static class AverageRatingMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
@Override
protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException {
RunningAverage avg = new FullRunningAverage();
Iterator<Vector.Element> elements = v.get().iterateNonZero();
while (elements.hasNext()) {
avg.addDatum(elements.next().get());
}
Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
vector.setQuick(r.get(), avg.getAverage());
ctx.write(new IntWritable(0), new VectorWritable(vector));
}
}
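/** Parses one userID,itemID,rating line and emits the rating as a sparse vector entry keyed by item ID. */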
public static class ItemRatingVectorsMapper extends Mapper<LongWritable,Text,IntWritable,VectorWritable> {
private final IntWritable outKey = new IntWritable();
@Override
protected void map(LongWritable offset, Text line, Context ctx) throws IOException, InterruptedException {
String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
int userID = Integer.parseInt(tokens[0]);
int itemID = Integer.parseInt(tokens[1]);
float rating = Float.parseFloat(tokens[2]);
Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
ratings.set(userID, rating);
outKey.set(itemID);
ctx.write(outKey, new VectorWritable(ratings));
}
}
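/** Emits a per-user item count, summed by UserItemCntsReducer to give the number of rated items per user. */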
public static class UserItemCntsMapper extends Mapper<IntWritable, VectorWritable, IntWritable, IntWritable> {
private final IntWritable result = new IntWritable();
@Override
protected void map(IntWritable key, VectorWritable value, Context context)
throws IOException, InterruptedException {
// emit this user's rating count rather than a constant: user rows arrive
// already merged, so a constant 1 would make every reduced count equal 1
result.set(value.get().getNumNondefaultElements());
context.write(key, result);
}
}
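/** Sums the per-user counts; also used as the combiner of the job, since integer addition is associative. */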
public static class UserItemCntsReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
private final IntWritable result = new IntWritable();
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException,
InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
result.set(sum);
context.write(key, result);
}
}
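/*
* Path conventions: intermediate factor matrices live in the temp dir as
* M-<n> and U-<n> (M--1 is the initial M written by initializeM), and the
* factors of the final iteration go to the output dir. Note that userRatings
* is kept under the output path while itemRatings is temporary.
*/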
private Path pathToM(int iteration) {
return iteration == numIterations - 1 ? getOutputPath("M") : getTempPath("M-" + iteration);
}
private Path pathToU(int iteration) {
return iteration == numIterations - 1 ? getOutputPath("U") : getTempPath("U-" + iteration);
}
private Path pathToItemRatings() {
return getTempPath("itemRatings");
}
private Path pathToUserRatings() {
return getOutputPath("userRatings");
}
}