/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.skp.experiment.cf.als.hadoop;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.InetAddress;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
import org.apache.mahout.cf.taste.impl.common.RunningAverage;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.mapreduce.MergeVectorsCombiner;
import org.apache.mahout.common.mapreduce.MergeVectorsReducer;
import org.apache.mahout.common.mapreduce.TransposeMapper;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.skp.experiment.cf.math.hadoop.MatrixDistanceSquaredJob;
import com.skp.experiment.common.HadoopClusterUtil;
import com.skp.experiment.common.Text2DistributedRowMatrixJob;
import com.skp.experiment.common.parameter.DefaultOptionCreator;
import com.skp.experiment.math.als.hadoop.ImplicitFeedbackAlternatingLeastSquaresSolver;
/**
* <p>MapReduce implementation of the two factorization algorithms described in:</p>
*
* <p>"Large-scale Parallel Collaborative Filtering for the Netflix Prize", available at
* http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf</p>
*
* <p>"Collaborative Filtering for Implicit Feedback Datasets", available at
* http://research.yahoo.com/pub/2433</p>
*
* <p>Command line arguments specific to this class are:</p>
*
* <ol>
* <li>--input (path): directory containing one or more text files with the dataset</li>
* <li>--output (path): path where output should go</li>
* <li>--lambda (double): regularization parameter to avoid overfitting</li>
* <li>--implicitFeedback (boolean): whether the data consists of implicit feedback</li>
* <li>--alpha (double): confidence parameter (only used on implicit feedback)</li>
* <li>--numFeatures (int): dimension of the feature space</li>
* <li>--numIterations (int): number of iterations</li>
* <li>--indexSizes (path): path to the file holding the matrix dimensions (number of users and items)</li>
* <li>--startIteration (int): iteration number to start from</li>
* <li>--largeUserFeatures (boolean): true if the user x feature matrix is too large for memory</li>
* <li>--rmseCurve (boolean): true to extract the RMSE curve across iterations</li>
* <li>--cleanUp (boolean): true to clean up temporary matrices</li>
* <li>--useTransform (boolean): true to apply a logarithmic transform to the rating column</li>
* <li>--rateIndex (int): 0-based index of the rate column in the input file</li>
* </ol>
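*
* <p>An illustrative invocation (the jar name, paths, and parameter values below are
* placeholders chosen for the example, not values prescribed by this job):</p>
*
* <pre>
* hadoop jar experiment-cf.jar com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob \
*   --input /data/ratings --output /data/als-output --indexSizes /data/indexSizes \
*   --lambda 0.065 --numFeatures 20 --numIterations 10 \
*   --implicitFeedback true --alpha 40
* </pre>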
*/
public class ParallelALSFactorizationJob extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(ParallelALSFactorizationJob.class);
public static final String NUM_FEATURES = ParallelALSFactorizationJob.class.getName() + ".numFeatures";
public static final String LAMBDA = ParallelALSFactorizationJob.class.getName() + ".lambda";
public static final String ALPHA = ParallelALSFactorizationJob.class.getName() + ".alpha";
public static final String FEATURE_MATRIX = ParallelALSFactorizationJob.class.getName() + ".featureMatrix";
public static final String NUM_ROWS = ParallelALSFactorizationJob.class.getName() + ".numRows";
public static final String NUM_USERS = ParallelALSFactorizationJob.class.getName() + ".numUsers";
public static final String NUM_ITEMS = ParallelALSFactorizationJob.class.getName() + ".numItems";
public static final String FEATURE_MATRIX_TRANSPOSE = ParallelALSFactorizationJob.class.getName() + ".featureMatrixTranspose";
private static final String DELIMETER = ",";
private boolean implicitFeedback;
private int numIterations;
private int numFeatures;
private double lambda;
private double alpha;
private int numTaskTrackers;
private int numUsers;
private int numItems;
private int startIteration;
private String rmsePerIteration = "";
private boolean useRMSECurve;
private boolean cleanUp;
private boolean useTransform;
private boolean largeUserFeatures;
private static long taskTimeout = 600000 * 6; // task timeout in ms (60 minutes)
private static final int multiplyMapTasks = 100000; // multiplier used when hinting mapred.map.tasks
private static int rateIndex = 2; // 0-based index of the rate column in the input file
private static final float SAFE_MARGIN = 3.5f; // safety factor for in-memory feature-matrix size estimates
private static enum COUNTER {
SETUP,
CLEANUP,
MAP
}
public static void main(String[] args) throws Exception {
ToolRunner.run(new ParallelALSFactorizationJob(), args);
}
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption("lambda", null, "regularization parameter", true);
addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
addOption("numFeatures", null, "dimension of the feature space", true);
addOption("numIterations", null, "number of iterations", true);
addOption("indexSizes", null, "index sizes Path", true);
addOption("startIteration", null, "start iteration number", String.valueOf(0));
addOption("oldM", null, "old M matrix Path.", null);
addOption("largeUserFeatures", null, "true if user x feature matrix is too large for memory", String.valueOf(true));
addOption("rmseCurve", null, "true if want to extract rmse curve", String.valueOf(true));
addOption("cleanUp", null, "true if want to clean up temporary matrix", String.valueOf(true));
addOption("useTransform", null, "true if using logarithm as transform", String.valueOf(true));
addOption("rateIndex", null, "0 based index for rate column in input file.", String.valueOf(2));
Map<String,String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
try {
/** step 0: fetch dimensions of the training-set matrix. */
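// Illustrative sketch of the expected indexSizes file (sizes are made up for the example):
// each line is "<key><DELIMETER><size>", where key "0" carries the number of users and
// key "1" the number of items, e.g.
//   0,1048576
//   1,65536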
Map<String, String> indexSizesTmp = ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSizes")), DELIMETER,
Arrays.asList(0), Arrays.asList(1));
numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
lambda = Double.parseDouble(parsedArgs.get("--lambda"));
alpha = Double.parseDouble(parsedArgs.get("--alpha"));
implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));
numUsers = Integer.parseInt(indexSizesTmp.get("0"));
numItems = Integer.parseInt(indexSizesTmp.get("1"));
numTaskTrackers = HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks;
startIteration = Integer.parseInt(parsedArgs.get("--startIteration"));
largeUserFeatures = Boolean.parseBoolean(getOption("largeUserFeatures"));
useRMSECurve = Boolean.parseBoolean(getOption("rmseCurve"));
cleanUp = Boolean.parseBoolean(getOption("cleanUp"));
useTransform = Boolean.parseBoolean(getOption("useTransform"));
rateIndex = Integer.parseInt(getOption("rateIndex"));
FileSystem fs = FileSystem.get(getConf());
if (!fs.exists(pathToTransformed())) {
if (useTransform) {
// transform price into rating
Job transformJob = prepareJob(getInputPath(), pathToTransformed(), TextInputFormat.class,
TransformColumnValueMapper.class, NullWritable.class, Text.class,
TextOutputFormat.class);
transformJob.waitForCompletion(true);
} else {
FileUtil.copy(FileSystem.get(getConf()), getInputPath(),
FileSystem.get(getConf()), pathToTransformed(), false, getConf());
}
}
/*
if (getOption("oldM") != null) {
runOnetimeSolver(pathToTransformed(), getOutputPath("U"), new Path(getOption("oldM")));
return 0;
}
*/
/*
* compute the factorization A = U M'
*
* where A (users x items) is the matrix of known ratings
* U (users x features) is the representation of users in the feature space
* M (items x features) is the representation of items in the feature space
*/
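/*
 * Illustrative sizes (numbers invented to fix intuition): with numUsers = 1,000,000,
 * numItems = 50,000 and numFeatures = 20, A is a sparse 1,000,000 x 50,000 ratings matrix,
 * while the factors U (1,000,000 x 20) and M (50,000 x 20) are comparatively small and dense.
 */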
if (startIteration == 0) {
if (!fs.exists(pathToItemRatings())) {
// create A'
Job itemRatings = prepareJob(pathToTransformed(), pathToItemRatings(),
TextInputFormat.class, ItemRatingVectorsMapper.class, IntWritable.class,
VectorWritable.class, VectorSumReducer.class, IntWritable.class,
VectorWritable.class, SequenceFileOutputFormat.class);
itemRatings.setCombinerClass(VectorSumReducer.class);
long matrixSizeExp = (long)(8L * numUsers * numFeatures * SAFE_MARGIN);
long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / (long)HadoopClusterUtil.MAP_TASKS_PER_NODE;
int numTaskPerDataNode = Math.max(1, (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / (double)matrixSizeExp));
//log.info("matrix Size: " + matrixSizeExp + ", memorhThreshold: " + memoryThreshold + ", numTaskPerDataNode: " + numTaskPerDataNode);
if (matrixSizeExp > memoryThreshold) {
//log.info("A: {}", numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
int numReducer = Math.min(numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()),
HadoopClusterUtil.getMaxMapTasks(getConf()));
//log.info("Number Of Reducer: " + numReducer);
itemRatings.setNumReduceTasks(numReducer);
}
itemRatings.waitForCompletion(true);
}
if (!fs.exists(pathToUserRatings())) {
Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(),
TransposeMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
VectorWritable.class);
userRatings.setCombinerClass(MergeVectorsCombiner.class);
userRatings.setNumReduceTasks(HadoopClusterUtil.getMaxMapTasks(getConf()));
userRatings.waitForCompletion(true);
}
if (!fs.exists(getOutputPath("userItemCnt"))) {
// count item per user
Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnt"), SequenceFileInputFormat.class,
UserItemCntsMapper.class, IntWritable.class, IntWritable.class, SequenceFileOutputFormat.class);
userItemCntsJob.setJobName("user ratings count");
userItemCntsJob.waitForCompletion(true);
}
if (!fs.exists(getTempPath("averageRatings"))) {
//TODO this could be fiddled into one of the upper jobs
Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
IntWritable.class, VectorWritable.class);
averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
averageItemRatings.waitForCompletion(true);
}
if (!fs.exists(new Path(pathToM(-1), "part-m-00000"))) {
Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());
/** create an initial M */
initializeM(averageRatings);
}
}
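/*
 * Each iteration below alternates the two least-squares steps: fix M and recompute U from the
 * user ratings, then fix U and recompute M from the item ratings. Optionally, the squared
 * distance between consecutive U and M matrices is recorded as a simple convergence signal.
 */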
for (int currentIteration = startIteration; currentIteration < numIterations; currentIteration++) {
DistributedRowMatrix curM =
new DistributedRowMatrix(pathToM(currentIteration-1),
getTempPath("Mtemp/tmp-" + String.valueOf(currentIteration-1) + "/M"),
numItems, numFeatures);
curM.setConf(getConf());
DistributedRowMatrix YtransposeY = curM.times(curM);
/** broadcast M, read A row-wise, recompute U row-wise */
log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
YtransposeY.getRowPath(), numItems, false);
DistributedRowMatrix curU =
new DistributedRowMatrix(pathToU(currentIteration),
getTempPath("Utmp/tmp-" + String.valueOf(currentIteration) + "/U"),
numUsers, numFeatures);
curU.setConf(getConf());
DistributedRowMatrix XtransposeX = curU.times(curU);
/** broadcast U, read A' row-wise, recompute M row-wise */
log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
XtransposeX.getRowPath(), numUsers, largeUserFeatures);
/** calculate rmse on each updated matrix U, M and decide to further iteration */
if (currentIteration > startIteration && useRMSECurve) {
Pair<Integer, Double> UsquaredError =
calculateMatrixDistanceSquared(pathToU(currentIteration-1), pathToU(currentIteration), currentIteration);
Pair<Integer, Double> MsquaredError =
calculateMatrixDistanceSquared(pathToM(currentIteration-1), pathToM(currentIteration), currentIteration);
String currentRMSE = currentIteration + DELIMETER + UsquaredError.getFirst() +
DELIMETER + UsquaredError.getSecond() + DELIMETER + MsquaredError.getFirst() +
DELIMETER + MsquaredError.getSecond() + DefaultOptionCreator.NEWLINE;
rmsePerIteration += currentRMSE;
log.info("iteration {}: {}", currentIteration, currentRMSE);
}
if (currentIteration >= startIteration + 2 && cleanUp) {
fs.deleteOnExit(pathToU(currentIteration - 2));
fs.deleteOnExit(pathToM(currentIteration - 2));
}
}
return 0;
} catch (Exception e) {
log.error("ALS factorization job failed", e);
return -1;
} finally {
if (useRMSECurve) {
HadoopClusterUtil.writeToHdfs(getConf(), getOutputPath("RMSE"), rmsePerIteration);
}
}
}
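/**
 * Runs MatrixDistanceSquaredJob on two factor matrices (e.g. U from consecutive iterations)
 * and returns its {@code Pair<Integer, Double>} result, which run() uses to build the
 * per-iteration RMSE curve written to the "RMSE" output.
 */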
private Pair<Integer, Double> calculateMatrixDistanceSquared(Path oldMatrix, Path newMatrix, int iteration)
throws IOException, InterruptedException, ClassNotFoundException {
FileSystem fs = FileSystem.get(getConf());
Path path = getTempPath("rmse-" + iteration);
fs.delete(path, true);
Job rmseJob = MatrixDistanceSquaredJob.createMinusJob(getConf(), oldMatrix, newMatrix, path);
rmseJob.waitForCompletion(true);
Pair<Integer, Double> result = MatrixDistanceSquaredJob.retrieveDistanceSquaredOutput(getConf(), path);
fs.delete(path, true);
return result;
}
/*
private void runOnetimeSolver(Path input, Path output, Path oldMPath) throws Exception {
ToolRunner.run(new Text2DistributedRowMatrixJob(), new String[] {
"-i", input.toString(), "-o", pathToUserRatings().toString(),
"-ri", "0", "-ci", "1", "-vi", "2"
});
Path MPath = oldMPath;
DistributedRowMatrix M =
new DistributedRowMatrix(MPath, getTempPath("Mtemp"), numItems, numFeatures);
M.setConf(new Configuration());
DistributedRowMatrix YtransposeY = M.times(M);
// recompute U for given input ratings
Job solverForU = prepareJob(pathToUserRatings(), output,
SequenceFileInputFormat.class,
ParallelALSFactorizationJob.SolveImplicitFeedbackMapper.class, IntWritable.class, VectorWritable.class,
SequenceFileOutputFormat.class);
Configuration solverConf = solverForU.getConfiguration();
solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
solverConf.set(ParallelALSFactorizationJob.LAMBDA, String.valueOf(lambda));
solverConf.set(ParallelALSFactorizationJob.ALPHA, String.valueOf(alpha));
solverConf.setInt(ParallelALSFactorizationJob.NUM_FEATURES, numFeatures);
solverConf.set(ParallelALSFactorizationJob.FEATURE_MATRIX, MPath.toString());
solverConf.set(ParallelALSFactorizationJob.FEATURE_MATRIX_TRANSPOSE, YtransposeY.getRowPath().toString());
solverConf.setInt("mapred.map.tasks", numTaskTrackers);
solverConf.setLong("mapred.min.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), pathToUserRatings()));
solverConf.setLong("mapred.max.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), pathToUserRatings()));
solverForU.waitForCompletion(true);
}
*/
private void initializeM(Vector averageRatings) throws IOException {
Random random = RandomUtils.getRandom();
FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
SequenceFile.Writer writer = null;
try {
writer = new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"), IntWritable.class,
VectorWritable.class);
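// Initialization sketch: the first feature of each item row is set to that item's average
// rating and the remaining features are filled with small random values (this mirrors the
// initialization described in the Zhou et al. paper cited in the class Javadoc).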
Iterator<Vector.Element> averages = averageRatings.iterateNonZero();
while (averages.hasNext()) {
Vector.Element e = averages.next();
Vector row = new DenseVector(numFeatures);
row.setQuick(0, e.get());
for (int m = 1; m < numFeatures; m++) {
row.setQuick(m, random.nextDouble());
}
writer.append(new IntWritable(e.index()), new VectorWritable(row));
}
} finally {
Closeables.closeQuietly(writer);
}
}
public static class TransformColumnValueMapper
extends Mapper<LongWritable, Text, NullWritable, Text> {
private static Text outValue = new Text();
private String buildOutput(String[] tokens) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < tokens.length; i++) {
if (i > 0) {
sb.append(DELIMETER);
}
sb.append(tokens[i]);
}
return sb.toString();
}
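// Illustrative example (input values invented for the sketch): for an input line "42,7,100",
// the map below rewrites the last column to log(100 + 1) + 1, about 5.615, and emits
// "42,7,5.615..." with the columns re-joined by DELIMETER.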
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
int sz = tokens.length;
tokens[sz-1] = String.valueOf(Math.log(Float.parseFloat(tokens[sz-1]) + 1.0f) + 1.0f);
outValue.set(buildOutput(tokens));
context.write(NullWritable.get(), outValue);
}
}
public static class ItemRatingVectorsMapper extends Mapper<LongWritable,Text,IntWritable,VectorWritable> {
private static IntWritable outKey = new IntWritable();
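// Sketch of the mapping (example values are made up): an input line "3,8,4.5" becomes the
// record (key = itemID 8, value = sparse vector {3: 4.5}), i.e. the rating is stored at the
// userID position; the VectorSumReducer downstream then merges these into one row per item.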
@Override
protected void map(LongWritable offset, Text line, Context ctx) throws IOException, InterruptedException {
String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
try {
int sz = tokens.length;
int userID = Integer.parseInt(tokens[0]);
int itemID = Integer.parseInt(tokens[1]);
float rating = Float.parseFloat(tokens[sz-1]);
Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
ratings.set(userID, rating);
outKey.set(itemID);
ctx.write(outKey, new VectorWritable(ratings));
} catch (NumberFormatException e) {
log.warn("Skipping line with unparseable rating: {}", line);
}
}
}
private void runSolver(Path ratings, Path output, Path pathToUorI, Path pathToTranspose, int numRows, boolean largeMatrix)
throws ClassNotFoundException, IOException, InterruptedException {
@SuppressWarnings("rawtypes")
Class<? extends Mapper> solverMapper = implicitFeedback ?
SolveImplicitFeedbackMultithreadedMapper.class : SolveExplicitFeedbackMapper.class;
Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, solverMapper, IntWritable.class,
VectorWritable.class, SequenceFileOutputFormat.class);
Configuration solverConf = solverForUorI.getConfiguration();
long matrixSizeExp = (long)(8L * numRows * numFeatures * SAFE_MARGIN);
long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / HadoopClusterUtil.MAP_TASKS_PER_NODE;
int numTaskPerDataNode = Math.max(1, (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / matrixSizeExp));
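// Rough arithmetic behind matrixSizeExp (example numbers are illustrative): 8 bytes per double
// * numRows * numFeatures * SAFE_MARGIN, so with numRows = 1,000,000 and numFeatures = 20 the
// broadcast feature matrix is estimated at about 8 * 1e6 * 20 * 3.5 = 560 MB, which is compared
// against the per-map-task memory budget below.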
if (matrixSizeExp > memoryThreshold) {
solverConf.set("mapred.child.java.opts", "-Xmx8g");
solverConf.set("mapred.map.child.java.opts", "-Xmx8g");
solverConf.setLong("dfs.block.size", HadoopClusterUtil.getMaxBlockSize(getConf(), pathToTransformed()));
solverConf.setInt("mapred.map.tasks", HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
solverConf.setLong("mapred.min.split.size", HadoopClusterUtil.getMaxBlockSize(getConf(), pathToTransformed()));
solverConf.setLong("mapred.max.split.size", HadoopClusterUtil.getMaxBlockSize(getConf(), pathToTransformed()));
solverConf.set(SolveImplicitFeedbackMultithreadedMapper.LOCK_FILE, pathToHostLocks().toString());
solverConf.setInt(SolveImplicitFeedbackMultithreadedMapper.LOCK_FILE_NUMS,
Math.min(HadoopClusterUtil.MAP_TASKS_PER_NODE, numTaskPerDataNode));
} else {
solverConf.setLong("mapred.min.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), ratings));
solverConf.setLong("mapred.max.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), ratings));
solverConf.setInt("mapred.map.tasks", HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks);
//solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
}
solverConf.setLong("mapred.task.timeout", taskTimeout);
solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
solverConf.set(SolveImplicitFeedbackMultithreadedMapper.LAMBDA, String.valueOf(lambda));
solverConf.set(SolveImplicitFeedbackMultithreadedMapper.ALPHA, String.valueOf(alpha));
solverConf.setInt(SolveImplicitFeedbackMultithreadedMapper.NUM_FEATURES, numFeatures);
solverConf.setInt(SolveImplicitFeedbackMultithreadedMapper.NUM_ROWS, numRows);
solverConf.set(SolveImplicitFeedbackMultithreadedMapper.FEATURE_MATRIX, pathToUorI.toString());
solverConf.set(SolveImplicitFeedbackMultithreadedMapper.FEATURE_MATRIX_TRANSPOSE, pathToTranspose.toString());
solverForUorI.waitForCompletion(true);
}
public static class SolveExplicitFeedbackMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
private double lambda;
private int numFeatures;
private OpenIntObjectHashMap<Vector> UorM;
private AlternatingLeastSquaresSolver solver;
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
solver = new AlternatingLeastSquaresSolver();
Path UOrIPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
//UorM = ALSMatrixUtil.readMatrixByRows(UOrIPath, ctx.getConfiguration());
UorM = ALSMatrixUtil.readMatrixByRows(UOrIPath, ctx);
Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
}
@Override
protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
throws IOException, InterruptedException {
Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
List<Vector> featureVectors = Lists.newArrayList();
Iterator<Vector.Element> interactions = ratings.iterateNonZero();
while (interactions.hasNext()) {
int index = interactions.next().index();
featureVectors.add(UorM.get(index));
}
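// featureVectors now holds the feature rows of M (or U) for every non-zero entry of this
// ratings row; the Mahout solver below returns the regularized least-squares estimate of this
// user's (or item's) feature vector (cf. the Zhou et al. paper cited in the class Javadoc).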
Vector uiOrmj = solver.solve(featureVectors, ratings, lambda, numFeatures);
ctx.write(userOrItemID, new VectorWritable(uiOrmj));
}
}
/*
public static class SolveImplicitFeedbackMapper extends MultithreadedMapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
private ImplicitFeedbackAlternatingLeastSquaresSolver solver;
private String lockPath = null;
private long sleepPeriod = 30000;
private int lockNums;
private Path currentLockPath = null;
//private static OpenIntObjectHashMap<Vector> Y;
private static Matrix Y;
private static Matrix YtransposeY;
private static Map<Integer, Vector> outputMap = Collections.synchronizedMap(new HashMap<Integer, Vector>());
private static StringBuffer sb = new StringBuffer();
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
ctx.getCounter(COUNTER.SETUP).increment(1);
Configuration conf = ctx.getConfiguration();
double lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
double alpha = Double.parseDouble(ctx.getConfiguration().get(ALPHA));
int numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
int numRows = ctx.getConfiguration().getInt(NUM_ROWS, -1);
Path YPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
Path YtransposeYPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX_TRANSPOSE));
lockPath = conf.get("lock.file");
lockNums = conf.getInt("lock.file.nums", 1);
if (lockPath != null) {
checkLock(ctx, lockNums);
}
//Y = ALSMatrixUtil.readMatrixByRows(YPath, ctx.getConfiguration());
//Y = ALSMatrixUtil.readMatrixByRows(YPath, ctx);
Y = ALSMatrixUtil.readDenseMatrixByRows(YPath, ctx, numRows, numFeatures);
YtransposeY = ALSMatrixUtil.readDistributedRowMatrix(YtransposeYPath, numFeatures, numFeatures);
solver = new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, Y, YtransposeY);
ctx.setStatus("Size: " + Y.rowSize() + "," + Y.columnSize());
Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
}
private void checkLock(Context ctx, int lockNums) throws InterruptedException, IOException {
InetAddress thisIp =InetAddress.getLocalHost();
String hostIp = thisIp.getHostAddress();
// busy wait
Configuration conf = ctx.getConfiguration();
long totalSleep = 0;
boolean haveLock = false;
FileSystem fs = FileSystem.get(conf);
while (haveLock == false) {
for (int i = 0; i < lockNums; i++) {
Path checkPath = new Path(lockPath, hostIp + "_" + i);
if (fs.exists(checkPath) == false) {
haveLock = true;
currentLockPath = checkPath;
BufferedWriter br = new BufferedWriter(
new OutputStreamWriter(fs.create(currentLockPath)));
br.write(ctx.getTaskAttemptID().toString());
break;
}
}
if (haveLock == false) {
Random random = new Random();
int diff = 1000 + random.nextInt(1000) % 1000;
totalSleep += diff + sleepPeriod;
ctx.setStatus("sleeping: " + String.valueOf(totalSleep));
Thread.sleep(sleepPeriod + diff);
}
}
}
@Override
protected void cleanup(Context context) throws IOException,
InterruptedException {
context.getCounter(COUNTER.CLEANUP).increment(1);
context.setStatus("cleanup size: " + Y.rowSize() + "," + Y.columnSize());
context.setStatus(sb.toString());
for (Entry<Integer, Vector> output : outputMap.entrySet()) {
context.write(new IntWritable(output.getKey()), new VectorWritable(output.getValue()));
log.info(output.getKey() + "\t" + output.getValue());
System.out.println(output.getKey() + "\t" + output.getValue());
//context.setStatus(output.getKey() + "\t" + output.getValue());
}
if (currentLockPath != null) {
FileSystem fs = FileSystem.get(context.getConfiguration());
fs.deleteOnExit(currentLockPath);
}
}
@Override
protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
throws IOException, InterruptedException {
ctx.getCounter(COUNTER.MAP).increment(1);
Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
Vector uiOrmj = solver.solve(ratings);
//ctx.write(userOrItemID, new VectorWritable(uiOrmj));
sb.append(userOrItemID.get() + "\t" + uiOrmj.toString() + "\t");
//outputMap.put(userOrItemID.get(), uiOrmj);
outputMap.put(userOrItemID.get(), ratings);
log.info(userOrItemID.get() + "\t" + uiOrmj.size() + "\t" + uiOrmj.toString());
System.out.println(userOrItemID.get() + "\t" + uiOrmj.toString());
}
}
*/
public static class AverageRatingMapper extends Mapper<IntWritable,VectorWritable,IntWritable,VectorWritable> {
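// Output layout sketch: each map call emits the item's average rating at position r (the item
// id from pathToItemRatings) in a sparse vector under the constant key 0; MergeVectorsReducer
// then collapses these into a single row vector of per-item averages, which run() reads back
// via ALSMatrixUtil.readFirstRow.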
@Override
protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException {
RunningAverage avg = new FullRunningAverage();
Iterator<Vector.Element> elements = v.get().iterateNonZero();
while (elements.hasNext()) {
avg.addDatum(elements.next().get());
}
Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
vector.setQuick(r.get(), avg.getAverage());
ctx.write(new IntWritable(0), new VectorWritable(vector));
}
}
public static class UserItemCntsMapper extends Mapper<IntWritable, VectorWritable, IntWritable, IntWritable> {
private static IntWritable result = new IntWritable(1);
@Override
protected void map(IntWritable key, VectorWritable value, Context context)
throws IOException, InterruptedException {
result.set(value.get().getNumNondefaultElements());
context.write(key, result);
}
}
private Path pathToM(int iteration) {
return iteration == numIterations - 1 ? getOutputPath("M") : getTempPath("M-" + iteration);
}
private Path pathToU(int iteration) {
return iteration == numIterations - 1 ? getOutputPath("U") : getTempPath("U-" + iteration);
}
private Path pathToItemRatings() {
return getTempPath("itemRatings");
}
private Path pathToUserRatings() {
return getOutputPath("userRatings");
}
private Path pathToHostLocks() {
return getTempPath("hosts");
}
private Path pathToTransformed() {
return getTempPath("transfomed");
}
}