/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.skp.experiment.cf.als.hadoop;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.base.Preconditions;
import com.skp.experiment.common.mapreduce.MultithreadedMapMapper;
import com.skp.experiment.math.als.hadoop.ImplicitFeedbackAlternatingLeastSquaresSolver;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.InetAddress;
import java.util.Random;
/**
 * Extended implementation of the ALS-WR algorithm from Mahout.
 * Why re-implement it? For the following reasons:
 * 1) Mahout's implementation does not handle the case where the U/M matrix is too large; loading a very large matrix leads to out-of-heap-space errors.
 * 2) Mahout's implementation uses a single thread to calculate ui or mj (the feature vectors).
 *    Once the whole U/M matrix is loaded into memory, we can actually speed up the calculation of ui or mj by using multiple threads.
 * 3) Mahout's implementation spends a lot of time loading the matrix into memory; using multiple threads to load HDFS data into memory speeds up the setup process.
 */
public class SolveImplicitFeedbackMultithreadedMapper
  extends MultithreadedMapMapper<IntWritable, VectorWritable, IntWritable, VectorWritable> {

  private static final Log LOG = LogFactory.getLog(SolveImplicitFeedbackMultithreadedMapper.class);

  /** Configuration keys understood by this mapper. */
  public static final String NUM_FEATURES = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".numFeatures";
  public static final String LAMBDA = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".lambda";
  public static final String ALPHA = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".alpha";
  public static final String FEATURE_MATRIX = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".featureMatrix";
  public static final String NUM_ROWS = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".numRows";
  public static final String NUM_USERS = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".numUsers";
  public static final String NUM_ITEMS = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".numItems";
  public static final String FEATURE_MATRIX_TRANSPOSE = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".featureMatrixTranspose";
  public static final String LOCK_FILE = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".lockFile";
  public static final String LOCK_FILE_NUMS = SolveImplicitFeedbackMultithreadedMapper.class.getName() + ".lockFileNums";

  /** HDFS directory holding per-host lock files; null means locking is disabled. */
  private String lockPath = null;
  /** Base sleep (ms) between lock-acquisition attempts. */
  private long sleepPeriod = 30000;
  /** Maximum number of concurrently running map tasks allowed per datanode. */
  private int lockNums;
  /** The lock file this task acquired, or null if none; deleted in cleanup(). */
  private Path currentLockPath = null;

  // Shared across all mapper threads in this JVM; written once in setup()
  // before MultithreadedMapMapper starts its worker threads.
  private static Matrix Y;
  private static Matrix YtransposeY;
  private static ImplicitFeedbackAlternatingLeastSquaresSolver solver;

  /**
   * Parses job parameters, optionally acquires a per-datanode lock, and loads
   * the feature matrix Y and its gram matrix Y'Y into memory for all threads.
   */
  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    /** parse parameters from configuration */
    Configuration conf = ctx.getConfiguration();
    double lambda = Double.parseDouble(conf.get(LAMBDA));
    double alpha = Double.parseDouble(conf.get(ALPHA));
    int numFeatures = conf.getInt(NUM_FEATURES, -1);
    int numRows = conf.getInt(NUM_ROWS, -1);
    // Fail fast: validate before the expensive matrix loads below, not after.
    Preconditions.checkArgument(numFeatures > 0, "numFeatures was not set correctly!");
    Path YPath = new Path(conf.get(FEATURE_MATRIX));
    Path YtransposeYPath = new Path(conf.get(FEATURE_MATRIX_TRANSPOSE));
    /** set file lock if necessary */
    lockPath = conf.get(LOCK_FILE);
    lockNums = conf.getInt(LOCK_FILE_NUMS, 1);
    if (lockPath != null) {
      checkLock(ctx, lockNums);
    }
    /** load necessary matrix U/M into memory */
    Y = ALSMatrixUtil.readMatrixByRowsMultithred(YPath, conf, numRows, numFeatures);
    YtransposeY = ALSMatrixUtil.readDistributedRowMatrix(YtransposeYPath, numFeatures, numFeatures);
    /** initiate linear solver */
    solver = new ImplicitFeedbackAlternatingLeastSquaresSolver(numFeatures, lambda, alpha, Y, YtransposeY);
    LOG.info("Matrix dimension in memory " + Y.rowSize() + "," + Y.columnSize());
  }

  /**
   * Creates a lock file per datanode to prevent too many map tasks running
   * simultaneously on the same datanode. Busy-waits (with jittered sleep)
   * until one of the {@code lockNums} per-host slots is free.
   *
   * NOTE(review): exists() followed by create() is not atomic, so two tasks on
   * the same host can still race for the same slot; FileSystem.createNewFile()
   * would make acquisition atomic — confirm before changing semantics.
   */
  private void checkLock(Context ctx, int lockNums) throws InterruptedException, IOException {
    String hostIp = InetAddress.getLocalHost().getHostAddress();
    Configuration conf = ctx.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    Random random = new Random(); // created once, not per retry iteration
    long totalSleep = 0;
    boolean haveLock = false;
    while (!haveLock) {
      for (int i = 0; i < lockNums; i++) {
        Path checkPath = new Path(lockPath, hostIp + "_" + i);
        if (!fs.exists(checkPath)) {
          haveLock = true;
          currentLockPath = checkPath;
          BufferedWriter br = new BufferedWriter(
              new OutputStreamWriter(fs.create(currentLockPath)));
          try {
            br.write(ctx.getTaskAttemptID().toString());
          } finally {
            // Previously leaked: the stream was never closed, so the task
            // attempt ID might never be flushed to the lock file.
            br.close();
          }
          break;
        }
      }
      if (!haveLock) {
        // nextInt(1000) already yields 0..999; the old "% 1000" was redundant.
        int jitter = 1000 + random.nextInt(1000);
        totalSleep += jitter + sleepPeriod;
        ctx.setStatus("sleeping: " + String.valueOf(totalSleep));
        Thread.sleep(sleepPeriod + jitter);
      }
    }
  }

  /** Releases the per-host lock so that waiting tasks on this host can proceed. */
  @Override
  protected void cleanup(Context context) throws IOException,
      InterruptedException {
    if (currentLockPath != null) {
      FileSystem fs = FileSystem.get(context.getConfiguration());
      // Delete immediately: deleteOnExit() would defer release until the
      // FileSystem is closed, keeping other tasks blocked longer than needed.
      if (!fs.delete(currentLockPath, false)) {
        fs.deleteOnExit(currentLockPath); // fallback if the delete failed
      }
    }
  }

  /**
   * Solves the implicit-feedback ALS system for one user (or item) rating row
   * and emits the resulting feature vector ui (or mj) under the same key.
   */
  @Override
  protected void map(IntWritable userOrItemID, VectorWritable ratingsWritable, Context ctx)
      throws IOException, InterruptedException {
    Vector ratings = new SequentialAccessSparseVector(ratingsWritable.get());
    Vector uiOrmj = solver.solve(ratings);
    ctx.write(userOrItemID, new VectorWritable(uiOrmj));
  }
}