/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.skp.experiment.cf.als.hadoop;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.InetAddress;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
import org.apache.mahout.cf.taste.impl.common.RunningAverage;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.mapreduce.MergeVectorsCombiner;
import org.apache.mahout.common.mapreduce.MergeVectorsReducer;
import org.apache.mahout.common.mapreduce.TransposeMapper;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.als.AlternatingLeastSquaresSolver;
import org.apache.mahout.math.hadoop.DistributedRowMatrix;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import com.skp.experiment.cf.math.hadoop.MatrixDistanceSquaredJob;
import com.skp.experiment.common.HadoopClusterUtil;
import com.skp.experiment.common.Text2DistributedRowMatrixJob;
import com.skp.experiment.common.parameter.DefaultOptionCreator;
import com.skp.experiment.math.als.hadoop.ImplicitFeedbackAlternatingLeastSquaresSolver;

/**
 * <p>MapReduce implementation of the two factorization algorithms described in</p>
 *
 * <p>"Large-scale Parallel Collaborative Filtering for the Netflix Prize", available at
 * http://www.hpl.hp.com/personal/Robert_Schreiber/papers/2008%20AAIM%20Netflix/netflix_aaim08(submitted).pdf</p>
 *
 * <p>"Collaborative Filtering for Implicit Feedback Datasets", available at
 * http://research.yahoo.com/pub/2433</p>
 *
 * <p>Command line arguments specific to this class are:</p>
 *
 * <ol>
 * <li>--input (path): directory containing one or more text files with the dataset</li>
 * <li>--output (path): path where output should go</li>
 * <li>--lambda (double): regularization parameter to avoid overfitting</li>
 * <li>--implicitFeedback (boolean): whether the data consists of implicit feedback</li>
 * <li>--alpha (double): confidence parameter (only used with implicit feedback)</li>
 * <li>--numFeatures (int): dimension of the feature space</li>
 * <li>--numIterations (int): number of iterations</li>
 * <li>--indexSizes (path): path to the index-sizes file holding the user and item counts</li>
 * </ol>
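 *
 * <p>The invocation below is only an illustrative sketch; the jar name, HDFS paths and
 * parameter values are hypothetical and must be adapted to the actual cluster setup:</p>
 *
 * <pre>
 * hadoop jar experiment-cf.jar com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob \
 *   --input /data/als/ratings \
 *   --output /data/als/out \
 *   --indexSizes /data/als/indexSizes \
 *   --numFeatures 20 \
 *   --numIterations 10 \
 *   --lambda 0.065 \
 *   --implicitFeedback true \
 *   --alpha 40
 * </pre>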
 */
public class ParallelALSFactorizationJob extends AbstractJob {

  private static final Logger log = LoggerFactory.getLogger(ParallelALSFactorizationJob.class);

  public static final String NUM_FEATURES = ParallelALSFactorizationJob.class.getName() + ".numFeatures";
  public static final String LAMBDA = ParallelALSFactorizationJob.class.getName() + ".lambda";
  public static final String ALPHA = ParallelALSFactorizationJob.class.getName() + ".alpha";
  public static final String FEATURE_MATRIX = ParallelALSFactorizationJob.class.getName() + ".featureMatrix";
  public static final String NUM_ROWS = ParallelALSFactorizationJob.class.getName() + ".numRows";
  public static final String NUM_USERS = ParallelALSFactorizationJob.class.getName() + ".numUsers";
  public static final String NUM_ITEMS = ParallelALSFactorizationJob.class.getName() + ".numItems";
  public static final String FEATURE_MATRIX_TRANSPOSE =
      ParallelALSFactorizationJob.class.getName() + ".featureMatrixTranspose";

  private static final String DELIMETER = ",";

  private boolean implicitFeedback;
  private int numIterations;
  private int numFeatures;
  private double lambda;
  private double alpha;
  private int numTaskTrackers;
  private int numUsers;
  private int numItems;
  private int startIteration;
  // accumulates one CSV line per iteration; initialized to the empty string so the
  // first "+=" does not prepend the literal "null"
  private String rmsePerIteration = "";
  private boolean useRMSECurve;
  private boolean cleanUp;
  private boolean useTransform;
  private boolean largeUserFeatures;

  private static long taskTimeout = 600000 * 6;
  private static final int multiplyMapTasks = 100000;
  private static int rateIndex = 2;
  private static final float SAFE_MARGIN = 3.5f;

  private static enum COUNTER {
    SETUP, CLEANUP, MAP
  }

  public static void main(String[] args) throws Exception {
    ToolRunner.run(new ParallelALSFactorizationJob(), args);
  }

  @Override
  public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("lambda", null, "regularization parameter", true);
    addOption("implicitFeedback", null, "data consists of implicit feedback?", String.valueOf(false));
    addOption("alpha", null, "confidence parameter (only used on implicit feedback)", String.valueOf(40));
    addOption("numFeatures", null, "dimension of the feature space", true);
    addOption("numIterations", null, "number of iterations", true);
    addOption("indexSizes", null, "index sizes Path", true);
    addOption("startIteration", null, "start iteration number", String.valueOf(0));
    addOption("oldM", null, "old M matrix Path.", null);
    addOption("largeUserFeatures", null, "true if user x feature matrix is too large for memory",
        String.valueOf(true));
    addOption("rmseCurve", null, "true if want to extract rmse curve", String.valueOf(true));
    addOption("cleanUp", null, "true if want to clean up temporary matrix", String.valueOf(true));
    addOption("useTransform", null, "true if using logarithm as transform", String.valueOf(true));
    addOption("rateIndex", null, "0 based index for rate column in input file.", String.valueOf(2));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
      return -1;
    }

    try {
      /* step 0: fetch dimensions of the training set matrix. */
      Map<String, String> indexSizesTmp = ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSizes")),
          DELIMETER, Arrays.asList(0), Arrays.asList(1));

      numFeatures = Integer.parseInt(parsedArgs.get("--numFeatures"));
      numIterations = Integer.parseInt(parsedArgs.get("--numIterations"));
      lambda = Double.parseDouble(parsedArgs.get("--lambda"));
      alpha = Double.parseDouble(parsedArgs.get("--alpha"));
      implicitFeedback = Boolean.parseBoolean(parsedArgs.get("--implicitFeedback"));

      numUsers = Integer.parseInt(indexSizesTmp.get("0"));
      numItems = Integer.parseInt(indexSizesTmp.get("1"));

      numTaskTrackers = HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks;
      startIteration = Integer.parseInt(parsedArgs.get("--startIteration"));
      largeUserFeatures = Boolean.parseBoolean(getOption("largeUserFeatures"));
      useRMSECurve = Boolean.parseBoolean(getOption("rmseCurve"));
      cleanUp = Boolean.parseBoolean(getOption("cleanUp"));
      useTransform = Boolean.parseBoolean(getOption("useTransform"));
      rateIndex = Integer.parseInt(getOption("rateIndex"));

      FileSystem fs = FileSystem.get(getConf());
      if (!fs.exists(pathToTransformed())) {
        if (useTransform) {
          // transform price into rating
          Job transformJob = prepareJob(getInputPath(), pathToTransformed(), TextInputFormat.class,
              TransformColumnValueMapper.class, NullWritable.class, Text.class, TextOutputFormat.class);
          transformJob.waitForCompletion(true);
        } else {
          FileUtil.copy(FileSystem.get(getConf()), getInputPath(),
              FileSystem.get(getConf()), pathToTransformed(), false, getConf());
        }
      }
      /*
      if (getOption("oldM") != null) {
        runOnetimeSolver(pathToTransformed(), getOutputPath("U"), new Path(getOption("oldM")));
        return 0;
      }
      */
      /*
       * compute the factorization A = U M'
       *
       * where A (users x items) is the matrix of known ratings
       *       U (users x features) is the representation of users in the feature space
       *       M (items x features) is the representation of items in the feature space
       */
      if (startIteration == 0) {
        if (!fs.exists(pathToItemRatings())) {
          // create A'
          Job itemRatings = prepareJob(pathToTransformed(), pathToItemRatings(), TextInputFormat.class,
              ItemRatingVectorsMapper.class, IntWritable.class, VectorWritable.class, VectorSumReducer.class,
              IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
          itemRatings.setCombinerClass(VectorSumReducer.class);

          // estimated size of the numUsers x numFeatures factor matrix
          // (8 bytes per double, padded by SAFE_MARGIN)
          long matrixSizeExp = (long) (8L * numUsers * numFeatures * SAFE_MARGIN);
          long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT
              / (long) HadoopClusterUtil.MAP_TASKS_PER_NODE;
          int numTaskPerDataNode = Math.max(1,
              (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / (double) matrixSizeExp));
          //log.info("matrix Size: " + matrixSizeExp + ", memoryThreshold: " + memoryThreshold
          //    + ", numTaskPerDataNode: " + numTaskPerDataNode);
          if (matrixSizeExp > memoryThreshold) {
            //log.info("A: {}", numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
            int numReducer = Math.min(numTaskPerDataNode * HadoopClusterUtil.getNumberOfTaskTrackers(getConf()),
                HadoopClusterUtil.getMaxMapTasks(getConf()));
            //log.info("Number Of Reducer: " + numReducer);
            itemRatings.setNumReduceTasks(numReducer);
          }

          itemRatings.waitForCompletion(true);
        }
        if (!fs.exists(pathToUserRatings())) {
          Job userRatings = prepareJob(pathToItemRatings(), pathToUserRatings(), TransposeMapper.class,
              IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
              VectorWritable.class);
          userRatings.setNumReduceTasks(HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
          userRatings.setCombinerClass(MergeVectorsCombiner.class);
          userRatings.setNumReduceTasks(HadoopClusterUtil.getMaxMapTasks(getConf()));
          userRatings.waitForCompletion(true);
        }

        if (!fs.exists(getOutputPath("userItemCnt"))) {
          // count items per user
          Job userItemCntsJob = prepareJob(pathToUserRatings(), getOutputPath("userItemCnt"),
              SequenceFileInputFormat.class, UserItemCntsMapper.class, IntWritable.class, IntWritable.class,
              SequenceFileOutputFormat.class);
          userItemCntsJob.setJobName("user ratings count");
          userItemCntsJob.waitForCompletion(true);
        }

        if (!fs.exists(getTempPath("averageRatings"))) {
          //TODO this could be fiddled into one of the upper jobs
          Job averageItemRatings = prepareJob(pathToItemRatings(), getTempPath("averageRatings"),
              AverageRatingMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
              IntWritable.class, VectorWritable.class);
          averageItemRatings.setCombinerClass(MergeVectorsCombiner.class);
          averageItemRatings.waitForCompletion(true);
        }

        if (!fs.exists(new Path(pathToM(-1), "part-m-00000"))) {
          Vector averageRatings = ALSMatrixUtil.readFirstRow(getTempPath("averageRatings"), getConf());
          /* create an initial M */
          initializeM(averageRatings);
        }
      }

      for (int currentIteration = startIteration; currentIteration < numIterations; currentIteration++) {
        DistributedRowMatrix curM = new DistributedRowMatrix(pathToM(currentIteration - 1),
            getTempPath("Mtemp/tmp-" + String.valueOf(currentIteration - 1) + "/M"), numItems, numFeatures);
        curM.setConf(getConf());
        DistributedRowMatrix YtransposeY = curM.times(curM);

        /* broadcast M, read A row-wise, recompute U row-wise */
        log.info("Recomputing U (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToUserRatings(), pathToU(currentIteration), pathToM(currentIteration - 1),
            YtransposeY.getRowPath(), numItems, false);

        DistributedRowMatrix curU = new DistributedRowMatrix(pathToU(currentIteration),
            getTempPath("Utmp/tmp-" + String.valueOf(currentIteration) + "/U"), numUsers, numFeatures);
        curU.setConf(getConf());
        DistributedRowMatrix XtransposeX = curU.times(curU);

        /* broadcast U, read A' row-wise, recompute M row-wise */
        log.info("Recomputing M (iteration {}/{})", currentIteration, numIterations);
        runSolver(pathToItemRatings(), pathToM(currentIteration), pathToU(currentIteration),
            XtransposeX.getRowPath(), numUsers, largeUserFeatures);

        /* calculate the squared distance between each updated matrix U, M and its predecessor
           and decide whether to iterate further */
        if (currentIteration > startIteration && useRMSECurve) {
          Pair<Integer, Double> UsquaredError = calculateMatrixDistanceSquared(pathToU(currentIteration - 1),
              pathToU(currentIteration), currentIteration);
          Pair<Integer, Double> MsquaredError = calculateMatrixDistanceSquared(pathToM(currentIteration - 1),
              pathToM(currentIteration), currentIteration);
          String currentRMSE = currentIteration + DELIMETER + UsquaredError.getFirst() + DELIMETER
              + UsquaredError.getSecond() + DELIMETER + MsquaredError.getFirst() + DELIMETER
              + MsquaredError.getSecond() + DefaultOptionCreator.NEWLINE;
          rmsePerIteration += currentRMSE;
          log.info("iteration {}: {}", currentIteration, currentRMSE);
        }
        if (currentIteration >= startIteration + 2 && cleanUp) {
          fs.deleteOnExit(pathToU(currentIteration - 2));
          fs.deleteOnExit(pathToM(currentIteration - 2));
        }
      }
      return 0;
    } catch (Exception e) {
      e.printStackTrace();
      return -1;
    } finally {
      if (useRMSECurve) {
        HadoopClusterUtil.writeToHdfs(getConf(), getOutputPath("RMSE"), rmsePerIteration);
      }
    }
  }

  private Pair<Integer, Double> calculateMatrixDistanceSquared(Path oldMatrix, Path newMatrix, int iteration)
      throws IOException, InterruptedException, ClassNotFoundException {
    FileSystem fs = FileSystem.get(getConf());
    Path path = getTempPath("rmse-" + iteration);
    fs.delete(path, true);

    Job rmseJob = MatrixDistanceSquaredJob.createMinusJob(getConf(), oldMatrix, newMatrix, path);
    rmseJob.waitForCompletion(true);

    Pair<Integer, Double> result = MatrixDistanceSquaredJob.retrieveDistanceSquaredOutput(getConf(), path);
    fs.delete(path, true);
    return result;
  }

  /*
  private void runOnetimeSolver(Path input, Path output, Path oldMPath) throws Exception {
    ToolRunner.run(new Text2DistributedRowMatrixJob(), new String[] {
        "-i", input.toString(), "-o", pathToUserRatings().toString(), "-ri", "0", "-ci", "1", "-vi", "2" });

    Path MPath = oldMPath;
    DistributedRowMatrix M = new DistributedRowMatrix(MPath, getTempPath("Mtemp"), numItems, numFeatures);
    M.setConf(new Configuration());
    DistributedRowMatrix YtransposeY = M.times(M);

    // recompute U for given input ratings
    Job solverForU = prepareJob(pathToUserRatings(), output, SequenceFileInputFormat.class,
        ParallelALSFactorizationJob.SolveImplicitFeedbackMapper.class, IntWritable.class, VectorWritable.class,
        SequenceFileOutputFormat.class);
    Configuration solverConf = solverForU.getConfiguration();
    solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
    solverConf.set(ParallelALSFactorizationJob.LAMBDA, String.valueOf(lambda));
    solverConf.set(ParallelALSFactorizationJob.ALPHA, String.valueOf(alpha));
    solverConf.setInt(ParallelALSFactorizationJob.NUM_FEATURES, numFeatures);
    solverConf.set(ParallelALSFactorizationJob.FEATURE_MATRIX, MPath.toString());
    solverConf.set(ParallelALSFactorizationJob.FEATURE_MATRIX_TRANSPOSE, YtransposeY.getRowPath().toString());
    solverConf.setInt("mapred.map.tasks", numTaskTrackers);
    solverConf.setLong("mapred.min.split.size",
        HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), pathToUserRatings()));
    solverConf.setLong("mapred.max.split.size",
        HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), pathToUserRatings()));
    solverForU.waitForCompletion(true);
  }
  */

  private void initializeM(Vector averageRatings) throws IOException {
    Random random = RandomUtils.getRandom();

    FileSystem fs = FileSystem.get(pathToM(-1).toUri(), getConf());
    SequenceFile.Writer writer = null;
    try {
      writer = new SequenceFile.Writer(fs, getConf(), new Path(pathToM(-1), "part-m-00000"),
          IntWritable.class, VectorWritable.class);

      Iterator<Vector.Element> averages = averageRatings.iterateNonZero();
      while (averages.hasNext()) {
        Vector.Element e = averages.next();
        // first feature holds the item's average rating, the remaining features are random noise
        Vector row = new DenseVector(numFeatures);
        row.setQuick(0, e.get());
        for (int m = 1; m < numFeatures; m++) {
          row.setQuick(m, random.nextDouble());
        }
        writer.append(new IntWritable(e.index()), new VectorWritable(row));
      }
    } finally {
      Closeables.closeQuietly(writer);
    }
  }

  public static class TransformColumnValueMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    private static Text outValue = new Text();

    private String buildOutput(String[] tokens) {
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < tokens.length; i++) {
        if (i > 0) {
          sb.append(DELIMETER);
        }
        sb.append(tokens[i]);
      }
      return sb.toString();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
      String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
      int sz = tokens.length;
      // log-transform the raw value (e.g. a price) into a rating-like score
      tokens[sz - 1] = String.valueOf(Math.log(Float.parseFloat(tokens[sz - 1]) + 1.0f) + 1.0f);
      outValue.set(buildOutput(tokens));
      context.write(NullWritable.get(), outValue);
    }
  }

  public static class ItemRatingVectorsMapper extends Mapper<LongWritable, Text, IntWritable, VectorWritable> {

    private static IntWritable outKey = new IntWritable();

    @Override
    protected void map(LongWritable offset, Text line, Context ctx) throws IOException, InterruptedException {
      String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
      try {
        int sz = tokens.length;
        int userID = Integer.parseInt(tokens[0]);
        int itemID = Integer.parseInt(tokens[1]);
        float rating = Float.parseFloat(tokens[sz - 1]);

        Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
        ratings.set(userID, rating);

        outKey.set(itemID);
        ctx.write(outKey, new VectorWritable(ratings));
      } catch (NumberFormatException e) {
        log.info(line.toString());
        return;
      }
    }
  }

  private void runSolver(Path ratings, Path output, Path pathToUorI, Path pathToTranspose,
      int numRows, boolean largeMatrix) throws ClassNotFoundException, IOException, InterruptedException {

    @SuppressWarnings("rawtypes")
    Class<? extends Mapper> solverMapper = implicitFeedback
        ? SolveImplicitFeedbackMultithreadedMapper.class
        : SolveExplicitFeedbackMapper.class;

    Job solverForUorI = prepareJob(ratings, output, SequenceFileInputFormat.class, solverMapper,
        IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    Configuration solverConf = solverForUorI.getConfiguration();

    // estimated in-memory footprint of the broadcast feature matrix
    // (8 bytes per double times rows x features, padded by SAFE_MARGIN)
    long matrixSizeExp = (long) (8L * numRows * numFeatures * SAFE_MARGIN);
    long memoryThreshold = HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / HadoopClusterUtil.MAP_TASKS_PER_NODE;
    int numTaskPerDataNode = Math.max(1, (int) (HadoopClusterUtil.PHYSICAL_MEMERY_LIMIT / matrixSizeExp));

    if (matrixSizeExp > memoryThreshold) {
      solverConf.set("mapred.child.java.opts", "-Xmx8g");
      solverConf.set("mapred.map.child.java.opts", "-Xmx8g");
      solverConf.setLong("dfs.block.size", HadoopClusterUtil.getMaxBlockSize(getConf(), pathToTransformed()));
      solverConf.setInt("mapred.map.tasks", HadoopClusterUtil.getNumberOfTaskTrackers(getConf()));
      solverConf.setLong("mapred.min.split.size", HadoopClusterUtil.getMaxBlockSize(getConf(), pathToTransformed()));
      solverConf.setLong("mapred.max.split.size", HadoopClusterUtil.getMaxBlockSize(getConf(), pathToTransformed()));
      solverConf.set(SolveImplicitFeedbackMultithreadedMapper.LOCK_FILE, pathToHostLocks().toString());
      solverConf.setInt(SolveImplicitFeedbackMultithreadedMapper.LOCK_FILE_NUMS,
          Math.min(HadoopClusterUtil.MAP_TASKS_PER_NODE, numTaskPerDataNode));
    } else {
      solverConf.setLong("mapred.min.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), ratings));
      solverConf.setLong("mapred.max.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), ratings));
      solverConf.setInt("mapred.map.tasks", HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * multiplyMapTasks);
      //solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
    }

    solverConf.setLong("mapred.task.timeout", taskTimeout);
    solverConf.setBoolean("mapred.map.tasks.speculative.execution", false);
    solverConf.set(SolveImplicitFeedbackMultithreadedMapper.LAMBDA, String.valueOf(lambda));
    solverConf.set(SolveImplicitFeedbackMultithreadedMapper.ALPHA, String.valueOf(alpha));
    solverConf.setInt(SolveImplicitFeedbackMultithreadedMapper.NUM_FEATURES, numFeatures);
    solverConf.setInt(SolveImplicitFeedbackMultithreadedMapper.NUM_ROWS, numRows);
    solverConf.set(SolveImplicitFeedbackMultithreadedMapper.FEATURE_MATRIX, pathToUorI.toString());
    solverConf.set(SolveImplicitFeedbackMultithreadedMapper.FEATURE_MATRIX_TRANSPOSE, pathToTranspose.toString());
    solverForUorI.waitForCompletion(true);
  }
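  /*
   * Informal sketch of the per-row updates that the solver mappers below are expected to apply,
   * following the two papers cited in the class javadoc. This is a reading aid only, not a
   * description of the exact code paths; see the solver classes for the actual implementations.
   *
   *   explicit feedback (ALS-WR), for user i with rated item set I_i and n_i = |I_i|:
   *     u_i = (M_{I_i} M_{I_i}^T + lambda * n_i * I)^-1  M_{I_i} A(i, I_i)^T
   *
   *   implicit feedback, with confidence c_ij = 1 + alpha * a_ij and binarized preferences p(i):
   *     u_i = (M^T C^i M + lambda * I)^-1  M^T C^i p(i)
   *
   * The item updates are symmetric, with the roles of U and M exchanged.
   */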
  public static class SolveExplicitFeedbackMapper
      extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

    private double lambda;
    private int numFeatures;
    private OpenIntObjectHashMap<Vector> UorM;
    private AlternatingLeastSquaresSolver solver;

    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
      lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
      numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
      solver = new AlternatingLeastSquaresSolver();

      Path UOrIPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
      //UorM = ALSMatrixUtil.readMatrixByRows(UOrIPath, ctx.getConfiguration());
      UorM = ALSMatrixUtil.readMatrixByRows(UOrIPath, ctx);
      Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
    }

    @Override
    protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
        throws IOException, InterruptedException {
      Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
      List<Vector> featureVectors = Lists.newArrayList();
      Iterator<Vector.Element> interactions = ratings.iterateNonZero();
      while (interactions.hasNext()) {
        int index = interactions.next().index();
        featureVectors.add(UorM.get(index));
      }
      Vector uiOrmj = solver.solve(featureVectors, ratings, lambda, numFeatures);
      ctx.write(userOrItemID, new VectorWritable(uiOrmj));
    }
  }

  /*
  public static class SolveImplicitFeedbackMapper
      extends MultithreadedMapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

    private ImplicitFeedbackAlternatingLeastSquaresSolver solver;
    private String lockPath = null;
    private long sleepPeriod = 30000;
    private int lockNums;
    private Path currentLockPath = null;
    //private static OpenIntObjectHashMap<Vector> Y;
    private static Matrix Y;
    private static Matrix YtransposeY;
    private static Map<Integer, Vector> outputMap = Collections.synchronizedMap(new HashMap<Integer, Vector>());
    private static StringBuffer sb = new StringBuffer();

    @Override
    protected void setup(Context ctx) throws IOException, InterruptedException {
      ctx.getCounter(COUNTER.SETUP).increment(1);
      Configuration conf = ctx.getConfiguration();
      double lambda = Double.parseDouble(ctx.getConfiguration().get(LAMBDA));
      double alpha = Double.parseDouble(ctx.getConfiguration().get(ALPHA));
      int numFeatures = ctx.getConfiguration().getInt(NUM_FEATURES, -1);
      int numRows = ctx.getConfiguration().getInt(NUM_ROWS, -1);
      Path YPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX));
      Path YtransposeYPath = new Path(ctx.getConfiguration().get(FEATURE_MATRIX_TRANSPOSE));

      lockPath = conf.get("lock.file");
      lockNums = conf.getInt("lock.file.nums", 1);
      if (lockPath != null) {
        checkLock(ctx, lockNums);
      }
      //Y = ALSMatrixUtil.readMatrixByRows(YPath, ctx.getConfiguration());
      //Y = ALSMatrixUtil.readMatrixByRows(YPath, ctx);
      Y = ALSMatrixUtil.readDenseMatrixByRows(YPath, ctx, numRows, numFeatures);
      YtransposeY = ALSMatrixUtil.readDistributedRowMatrix(YtransposeYPath, numFeatures, numFeatures);
      solver = new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, Y, YtransposeY);
      ctx.setStatus("Size: " + Y.rowSize() + "," + Y.columnSize());
      Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
    }
    private void checkLock(Context ctx, int lockNums) throws InterruptedException, IOException {
      InetAddress thisIp = InetAddress.getLocalHost();
      String hostIp = thisIp.getHostAddress();

      // busy wait
      Configuration conf = ctx.getConfiguration();
      long totalSleep = 0;
      boolean haveLock = false;
      FileSystem fs = FileSystem.get(conf);
      while (haveLock == false) {
        for (int i = 0; i < lockNums; i++) {
          Path checkPath = new Path(lockPath, hostIp + "_" + i);
          if (fs.exists(checkPath) == false) {
            haveLock = true;
            currentLockPath = checkPath;
            BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(currentLockPath)));
            br.write(ctx.getTaskAttemptID().toString());
            break;
          }
        }
        if (haveLock == false) {
          Random random = new Random();
          int diff = 1000 + random.nextInt(1000) % 1000;
          totalSleep += diff + sleepPeriod;
          ctx.setStatus("sleeping: " + String.valueOf(totalSleep));
          Thread.sleep(sleepPeriod + diff);
        }
      }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      context.getCounter(COUNTER.CLEANUP).increment(1);
      context.setStatus("cleanup size: " + Y.rowSize() + "," + Y.columnSize());
      context.setStatus(sb.toString());
      for (Entry<Integer, Vector> output : outputMap.entrySet()) {
        context.write(new IntWritable(output.getKey()), new VectorWritable(output.getValue()));
        log.info(output.getKey() + "\t" + output.getValue());
        System.out.println(output.getKey() + "\t" + output.getValue());
        //context.setStatus(output.getKey() + "\t" + output.getValue());
      }
      if (currentLockPath != null) {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        fs.deleteOnExit(currentLockPath);
      }
    }

    @Override
    protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
        throws IOException, InterruptedException {
      ctx.getCounter(COUNTER.MAP).increment(1);
      Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
      Vector uiOrmj = solver.solve(ratings);
      //ctx.write(userOrItemID, new VectorWritable(uiOrmj));
      sb.append(userOrItemID.get() + "\t" + uiOrmj.toString() + "\t");
      //outputMap.put(userOrItemID.get(), uiOrmj);
      outputMap.put(userOrItemID.get(), ratings);
      log.info(userOrItemID.get() + "\t" + uiOrmj.size() + "\t" + uiOrmj.toString());
      System.out.println(userOrItemID.get() + "\t" + uiOrmj.toString());
    }
  }
  */

  public static class AverageRatingMapper extends Mapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {
    @Override
    protected void map(IntWritable r, VectorWritable v, Context ctx) throws IOException, InterruptedException {
      RunningAverage avg = new FullRunningAverage();
      Iterator<Vector.Element> elements = v.get().iterateNonZero();
      while (elements.hasNext()) {
        avg.addDatum(elements.next().get());
      }
      Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
      vector.setQuick(r.get(), avg.getAverage());
      ctx.write(new IntWritable(0), new VectorWritable(vector));
    }
  }

  public static class UserItemCntsMapper extends Mapper<IntWritable, VectorWritable, IntWritable, IntWritable> {
    private static IntWritable result = new IntWritable(1);

    @Override
    protected void map(IntWritable key, VectorWritable value, Context context)
        throws IOException, InterruptedException {
      result.set(value.get().getNumNondefaultElements());
      context.write(key, result);
    }
  }

  private Path pathToM(int iteration) {
    return iteration == numIterations - 1 ? getOutputPath("M") : getTempPath("M-" + iteration);
  }

  private Path pathToU(int iteration) {
    return iteration == numIterations - 1 ? getOutputPath("U") : getTempPath("U-" + iteration);
  }
  private Path pathToItemRatings() {
    return getTempPath("itemRatings");
  }

  private Path pathToUserRatings() {
    return getOutputPath("userRatings");
  }

  private Path pathToHostLocks() {
    return getTempPath("hosts");
  }

  private Path pathToTransformed() {
    return getTempPath("transfomed");
  }
}