package com.skp.experiment.cf.als.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.common.TopK;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.cf.taste.hadoop.item.VectorAndPrefsWritable;
import org.apache.mahout.cf.taste.hadoop.item.VectorOrPrefWritable;
import org.apache.mahout.cf.taste.recommender.RecommendedItem;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.mapreduce.VectorSumReducer;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
import org.apache.mahout.math.map.OpenIntObjectHashMap;
import com.skp.experiment.cf.evaluate.hadoop.EvaluatorUtil;
import com.skp.experiment.common.HadoopClusterUtil;
import com.skp.experiment.common.Text2DistributedRowMatrixJob;
import com.skp.experiment.common.join.ImprovedRepartitionJoinAndFilterJob;
import com.skp.experiment.common.mapreduce.IdentityMapper;
import com.skp.experiment.common.mapreduce.ToVectorAndPrefReducer;
import com.skp.experiment.math.als.hadoop.ImplicitFeedbackAlternatingLeastSquaresReasonSolver;
/*
 * Produces, for each ALS recommendation, the rated item that best "explains" it.
 *
 * An item-item similarity matrix (co-occurrence via RowSimilarityJob, or pre-computed
 * association rules) is built up front and loaded by the reason mapper. The main pipeline is:
 * 1. vectorize the recommendations and convert them into VectorOrPrefWritable
 * 2. convert the training ratings into VectorOrPrefWritable
 * 3. merge 1 and 2 into VectorAndPrefsWritable, keyed by user
 * 4. join the item index with the item meta
 * 5. pick the best reason per recommendation (similarity match, category match,
 *    or a random rated item as fallback)
 */
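/*
 * Example invocation (a sketch only; the jar name and all paths/values below are illustrative,
 * not taken from the source):
 *
 *   hadoop jar experiment.jar com.skp.experiment.cf.als.hadoop.DistributedRecommenderReasonJob \
 *     --input /als/recommendations --output /als/reasons --tempDir /als/tmp \
 *     --trainings /als/ratings --itemFeatures /als/M \
 *     --indexPath /als/index --indexSize /als/indexSize \
 *     --itemMeta /meta/item_meta.csv --itemIndex /als/index/1 \
 *     --topKSimilarity 100 --similarityWeight 1 --categoryWeight 1
 *
 * Record shapes assumed by the mappers below:
 *   recommendations input : text, columns 0/1/2 = user, item, score
 *   trainings input       : text, user,item,rating (TasteHadoopUtils pref format)
 *   output                : user,recommendedItem,score,reasonItem (one line per recommendation)
 */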
public class DistributedRecommenderReasonJob extends AbstractJob {
public static final String NUM_FEATURES = DistributedRecommenderReasonJob.class.getName() + ".numFeatures";
public static final String LAMBDA = DistributedRecommenderReasonJob.class.getName() + ".lambda";
public static final String ALPHA = DistributedRecommenderReasonJob.class.getName() + ".alpha";
public static final String FEATURE_MATRIX = DistributedRecommenderReasonJob.class.getName() + ".featureMatrix";
public static final String TOP_K_SIMILARITY = DistributedRecommenderReasonJob.class.getName() + ".topKSimilarity";
public static final String ITEM_META = DistributedRecommenderReasonJob.class.getName() + ".itemMeta";
public static final String ITEM_ITEM_SIMILARITY = DistributedRecommenderReasonJob.class.getName() + ".itemItemSimilarity";
public static final String SIMILARITY_WEIGHT = DistributedRecommenderReasonJob.class.getName() + ".similarityWeight";
public static final String CATEGORY_WEIGHT = DistributedRecommenderReasonJob.class.getName() + ".categoryWeight";
public static final String DELIMETER = ",";
public static enum REASON_COUNTER {
RANDOM_REASON,
SIMILARITY_REASON,
CATEGORY_REASON
};
public static void main(String[] args) throws Exception {
ToolRunner.run(new DistributedRecommenderReasonJob(), args);
}
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption("trainings", null, "path to training set");
addOption("itemFeatures", null, "path to the item feature matrix");
addOption("indexPath", null, "path to index.");
addOption("indexSize", null, "path to the index size.");
addOption("numFeatures", null, "number of latent features.", String.valueOf(30));
addOption("topKSimilarity", "topK", "top K similarity per item", String.valueOf(100));
addOption("itemMeta", null, "path to item meta", true);
addOption("itemIndex", null, "path to item index", true);
addOption("similarityWeight", "simW", "similarity weight.", String.valueOf(1));
addOption("categoryWeight", "cateW", "weight for category match.", String.valueOf(1));
addOption("associationRule", "ar", "path to association rules", null);
Map<String, String> parsedArgs = parseArguments(args);
if (parsedArgs == null) {
return -1;
}
Path recommendationsPath = getInputPath();
Path trainingPath = new Path(getOption("trainings"));
Path itemFeatures = new Path(getOption("itemFeatures"));
Path itemSimPath = getTempPath("itemSims");
//minInputSplitSize = getConf().getLong("mapred.min.split.size", HadoopClusterUtil.DEFALUT_INPUT_SPLIT_SIZE);
//getConf().setLong("mapred.min.split.size", HadoopClusterUtil.DEFALUT_INPUT_SPLIT_SIZE);
    Map<String, String> indexSizes = ALSMatrixUtil.fetchTextFiles(new Path(getOption("indexSize")),
        DELIMETER, Arrays.asList(0), Arrays.asList(1));
if (getOption("associationRule") == null) {
      getItemItemSimilarity(trainingPath, itemSimPath, Integer.parseInt(indexSizes.get("0")),
          Integer.parseInt(getOption("topKSimilarity")));
} else {
String indexStr = getOption("indexPath") + "/1";
getAssociationRules(new Path(getOption("associationRule")), new Path(indexStr), itemSimPath);
}
/* 1. vectorize recommendations */
ToolRunner.run(new Text2DistributedRowMatrixJob(), new String[]{
"-i", recommendationsPath.toString(), "-o", getTempPath("recommendation.vector").toString(),
"-ri", "0", "-ci", "1", "-vi", "2"
});
Job recommendJob = prepareJob(getTempPath("recommendation.vector"), getTempPath("recommendation.conv"),
SequenceFileInputFormat.class, Vector2VectorOrPrefWritableMapper.class,
IntWritable.class, VectorOrPrefWritable.class,
SequenceFileOutputFormat.class);
recommendJob.waitForCompletion(true);
    // 2. convert the training ratings into VectorOrPrefWritable
Job ratingJob = prepareJob(trainingPath, getTempPath("trainings"), TextInputFormat.class,
Text2VectorOrPrefWritableMapper.class, IntWritable.class, VectorOrPrefWritable.class,
SequenceFileOutputFormat.class);
ratingJob.waitForCompletion(true);
// 3. merge recommendations and ratings into VectorAndPrefsWritable
Job mergeJob = prepareJob(getTempPath("recommendation.conv"), getTempPath("merged"),
SequenceFileInputFormat.class,
IdentityMapper.class, IntWritable.class, VectorOrPrefWritable.class,
ToVectorAndPrefReducer.class, IntWritable.class, VectorAndPrefsWritable.class,
SequenceFileOutputFormat.class);
FileInputFormat.addInputPath(mergeJob, getTempPath("trainings"));
mergeJob.waitForCompletion(true);
// 4. build item meta
Path mergedItemMeta = getTempPath("mergedItemMeta");
ToolRunner.run(new ImprovedRepartitionJoinAndFilterJob(), new String[]{
"-i", getOption("itemIndex"), "-o", mergedItemMeta.toString(),
"--srcKeyIndex", "1", "-tgt", getOption("itemMeta") + ":1:0:2:inner",
"--mapOnly", "true"
});
    // 5. reason job: in each VectorAndPrefsWritable the prefs are the user's ratings and the
    //    vector holds the recommendations; output lines are user,recommendedItem,score,reasonItem
Job reasonJob = prepareJob(getTempPath("merged"), getOutputPath(), SequenceFileInputFormat.class,
SolveImplicitFeedbackReasonSolverMapper.class, NullWritable.class, Text.class,
TextOutputFormat.class);
    reasonJob.getConfiguration().setLong("mapred.task.timeout", 600000 * 6 * 10); // 36,000,000 ms = 10 hours
reasonJob.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
reasonJob.getConfiguration().setLong("mapred.min.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), getTempPath("merged")));
reasonJob.getConfiguration().setLong("mapred.max.split.size", HadoopClusterUtil.getMinInputSplitSizeMax(getConf(), getTempPath("merged")));
reasonJob.getConfiguration().setInt("mapred.map.tasks", HadoopClusterUtil.getNumberOfTaskTrackers(getConf()) * 100000);
reasonJob.getConfiguration().set("mapred.child.java.opts", "-Xmx2g");
reasonJob.getConfiguration().set("mapred.map.child.java.opts", "-Xmx2g");
//reasonJob.getConfiguration().set("lock.file", getTempPath("hosts").toString());
reasonJob.getConfiguration().set(ITEM_ITEM_SIMILARITY, itemSimPath.toString());
reasonJob.getConfiguration().set(ITEM_META, mergedItemMeta.toString());
reasonJob.getConfiguration().setInt(TOP_K_SIMILARITY, Integer.parseInt(getOption("topKSimilarity")));
reasonJob.getConfiguration().setFloat(SIMILARITY_WEIGHT, Float.parseFloat(getOption("similarityWeight")));
reasonJob.getConfiguration().setFloat(CATEGORY_WEIGHT, Float.parseFloat(getOption("categoryWeight")));
reasonJob.waitForCompletion(true);
return 0;
}
private void getItemItemSimilarity(Path train, Path output, int numCols, int topK) throws Exception {
Job vectorizeJob = prepareJob(train, getTempPath("vectorized"), TextInputFormat.class,
ItemBooleanRatingVectorMapper.class, IntWritable.class, VectorWritable.class,
VectorSumReducer.class, IntWritable.class, VectorWritable.class,
SequenceFileOutputFormat.class);
vectorizeJob.setCombinerClass(VectorSumReducer.class);
vectorizeJob.waitForCompletion(true);
RowSimilarityJob rowSimJob = new RowSimilarityJob();
rowSimJob.setConf(getConf());
rowSimJob.getConf().setInt("io.sort.factor", 100);
rowSimJob.getConf().setLong("mapred.task.timeout", 600000 * 60);
rowSimJob.getConf().setBoolean("mapred.map.tasks.speculative.execution", false);
rowSimJob.run(new String[]{
"-i", getTempPath("vectorized").toString(), "-o", output.toString(),
"--numberOfColumns", String.valueOf(numCols),
"--similarityClassname", "SIMILARITY_COOCCURRENCE",
"--maxSimilaritiesPerRow", String.valueOf(topK),
"--excludeSelfSimilarity", "true",
"--tempDir", getTempPath("similarity").toString()
});
}
private void getAssociationRules(Path arPath, Path indexPath, Path output) throws Exception {
ToolRunner.run(new ImprovedRepartitionJoinAndFilterJob(), new String[]{
"-i", arPath.toString(), "-o", getTempPath("firstSim").toString(),
"-sidx", "0", "-tgt", indexPath.toString() + ":0:1:0:sub", "--mapOnly", "true"
});
ToolRunner.run(new ImprovedRepartitionJoinAndFilterJob(), new String[]{
"-i", getTempPath("firstSim").toString(), "-o", getTempPath("secondSim").toString(),
"-sidx", "1", "-tgt", indexPath.toString() + ":1:1:0:sub", "--mapOnly", "true"
});
ToolRunner.run(new Text2DistributedRowMatrixJob(), new String[]{
"-i", getTempPath("secondSim").toString(), "-o", output.toString(),
"-ri", "0", "-ci", "1", "-vi", "2"
});
}
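  /*
   * Maps each training line (user,item[,rating]) to a boolean preference of 1.0, keyed by item,
   * so that RowSimilarityJob can compute item-item co-occurrence over the item x user matrix.
   */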
public static class ItemBooleanRatingVectorMapper extends
Mapper<LongWritable, Text, IntWritable, VectorWritable> {
private static IntWritable rowID = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
int userID = Integer.parseInt(tokens[0]);
int itemID = Integer.parseInt(tokens[1]);
float rating = 1;
Vector ratings = new RandomAccessSparseVector(Integer.MAX_VALUE, 1);
ratings.set(userID, rating);
rowID.set(itemID);
context.write(rowID, new VectorWritable(ratings));
}
}
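  /*
   * Emits (userID -> VectorOrPrefWritable(itemID, rating)). Note that VectorOrPrefWritable's
   * "userID" slot is reused here to carry the rated item ID, which is why the reason mapper
   * reads the rated item IDs back via getUserIDs().
   */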
public static class Text2VectorOrPrefWritableMapper extends
Mapper<LongWritable, Text, IntWritable, VectorOrPrefWritable> {
private static IntWritable outKey = new IntWritable();
@Override
protected void map(LongWritable offset, Text line, Context context)
throws IOException, InterruptedException {
String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
int userID = Integer.parseInt(tokens[0]);
int itemID = Integer.parseInt(tokens[1]);
float rating = Float.parseFloat(tokens[2]);
outKey.set(userID);
context.write(outKey, new VectorOrPrefWritable((long)itemID, rating));
}
}
public static class Vector2VectorOrPrefWritableMapper extends
Mapper<IntWritable, VectorWritable, IntWritable, VectorOrPrefWritable> {
@Override
protected void map(IntWritable userID, VectorWritable value, Context context)
throws IOException, InterruptedException {
context.write(userID, new VectorOrPrefWritable(value.get()));
}
}
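  /*
   * For each user, scores every rated item against each recommended item using item-item
   * similarity (offset by similarityWeight) plus a weighted category prefix match, and emits
   * the highest-scoring rated item as the "reason"; if nothing scores above zero, a rated
   * item is picked at random.
   */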
public static class SolveImplicitFeedbackReasonSolverMapper extends
Mapper<IntWritable, VectorAndPrefsWritable, NullWritable, Text> {
private ImplicitFeedbackAlternatingLeastSquaresReasonSolver solver;
private static Text outValue = new Text();
private OpenIntObjectHashMap<TopK<RecommendedItem>> itemSims;
private Map<Integer, String> itemMetas = new HashMap<Integer, String>();
private float simWeight = 0;
private float cateWeight = 0;
private Random random;
@Override
protected void setup(Context ctx) throws IOException,
InterruptedException {
random = RandomUtils.getRandom();
int topK = ctx.getConfiguration().getInt(TOP_K_SIMILARITY, 100);
Path YPath = new Path(ctx.getConfiguration().get(ITEM_ITEM_SIMILARITY));
Path itemMetaPath = new Path(ctx.getConfiguration().get(ITEM_META));
itemSims = ALSMatrixUtil.readMatrixByRowsTopK(YPath, ctx, topK);
for (Entry<String, String> item :
ALSMatrixUtil.fetchTextFiles(ctx, itemMetaPath, DELIMETER,
Arrays.asList(0), Arrays.asList(1)).entrySet()) {
itemMetas.put(Integer.parseInt(item.getKey()), item.getValue());
}
ctx.setStatus("Item Sims: " + itemSims.size());
simWeight = ctx.getConfiguration().getFloat(SIMILARITY_WEIGHT, 2);
cateWeight = ctx.getConfiguration().getFloat(CATEGORY_WEIGHT, 1);
}
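    /*
     * Sorts map entries by value, ascending. Backed by a TreeSet whose comparator only compares
     * values, so entries with equal values collapse to a single entry.
     */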
static <K,V extends Comparable<? super V>>
SortedSet<Map.Entry<K,V>> entriesSortedByValues(Map<K,V> map) {
SortedSet<Map.Entry<K,V>> sortedEntries = new TreeSet<Map.Entry<K,V>>(
new Comparator<Map.Entry<K,V>>() {
@Override public int compare(Map.Entry<K,V> e1, Map.Entry<K,V> e2) {
return e1.getValue().compareTo(e2.getValue());
}
}
);
sortedEntries.addAll(map.entrySet());
return sortedEntries;
}
    // returns the length of the common prefix of x and y divided by x's length; despite the name
    // this is a similarity in [0, 1], where 1 means y starts with all of x
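    // e.g. getDistance("12345", "12999") == 2 / 5f == 0.4f (hypothetical category codes)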
private float getDistance(String x, String y) {
int common = 0;
for (int i = 0; i < x.length() && i < y.length(); i++) {
if (x.charAt(i) != y.charAt(i)) {
break;
}
common++;
}
return common / (float)x.length();
}
@Override
protected void map(IntWritable userIDWritable, VectorAndPrefsWritable recAndRatings,
Context context) throws IOException, InterruptedException {
List<Long> ratedItemIDs = recAndRatings.getUserIDs();
List<Float> ratedScores = recAndRatings.getValues();
Vector recommendations = recAndRatings.getVector();
Map<Integer, Boolean> ratedItemIDsMap = new HashMap<Integer, Boolean>();
for (Long id : ratedItemIDs) {
ratedItemIDsMap.put(id.intValue(), true);
}
Iterator<Vector.Element> recs = recommendations.iterateNonZero();
while (recs.hasNext()) {
Vector.Element rec = recs.next();
Map<Long, Float> candidatesMap = new HashMap<Long, Float>();
Map<Long, Float> simMap = new HashMap<Long, Float>();
List<RecommendedItem> simList = new ArrayList<RecommendedItem>();
        // pick the rated item that best explains this recommended item
        int bestItemId = -1;
        float bestItemScore = 0;
        // there may be no similarity entries for this recommended item
if (itemSims.containsKey(rec.index())) {
simList = itemSims.get(rec.index()).retrieve();
}
//build similarity map
for (RecommendedItem item : simList) {
simMap.put(item.getItemID(), item.getValue());
}
for (Long ratedItemID : ratedItemIDs) {
float curCandidateScore = 0;
if (simMap.containsKey(ratedItemID)) {
curCandidateScore += simWeight + simMap.get(ratedItemID);
}
if (itemMetas.containsKey(rec.index()) && itemMetas.containsKey(ratedItemID.intValue())) {
curCandidateScore += cateWeight * getDistance(itemMetas.get(rec.index()), itemMetas.get(ratedItemID.intValue()));
}
candidatesMap.put(ratedItemID, curCandidateScore);
}
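        // Illustrative arithmetic (hypothetical values): with simWeight=1 and cateWeight=1, a rated
        // item with co-occurrence similarity 3 and a category prefix match of 0.5 scores
        // 1 + 3 + 1 * 0.5 = 4.5; a rated item with only the category match scores 0.5.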
        for (Entry<Long, Float> candidate : candidatesMap.entrySet()) {
          if (bestItemScore < candidate.getValue()) {
            bestItemId = candidate.getKey().intValue();
            bestItemScore = candidate.getValue();
          }
        }
        // count the reason type once per recommendation
        if (bestItemId != -1) {
          if (bestItemScore >= simWeight) {
            context.getCounter(REASON_COUNTER.SIMILARITY_REASON).increment(1);
          } else {
            context.getCounter(REASON_COUNTER.CATEGORY_REASON).increment(1);
          }
        } else {
          // no similarity or category match; fall back to a random rated item
          context.getCounter(REASON_COUNTER.RANDOM_REASON).increment(1);
          bestItemId = ratedItemIDs.get(random.nextInt(ratedItemIDs.size())).intValue();
        }
outValue.set(userIDWritable.get() + DELIMETER + rec.index() + DELIMETER +
rec.get() + DELIMETER + bestItemId);
context.write(NullWritable.get(), outValue);
}
}
}
}