package com.skp.experiment.cf.evaluate.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.Pair;
import com.skp.experiment.common.HadoopClusterUtil;
/**
 * Evaluates recommendations against a validation set, using a flag on each input
 * record that marks whether the item appears in the validation set.
 */
public class EvaluatorJob extends AbstractJob {
private static final String DELIMITER = ",";
private static final String RECOMMENDATIONS_PER_USER = EvaluatorJob.class.getName() + ".recommendationsPerUser";
protected static double negativePref = -1.0;
protected Path trainingSetPath;
protected Path validationSetPath;
protected Path recommendationSetPath;
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new EvaluatorJob(), args));
}
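// Example invocation (hypothetical jar name and paths; --input/--output come from
// AbstractJob, --topK and --cleanUp are the options registered in run()):
//   hadoop jar cf-experiment.jar com.skp.experiment.cf.evaluate.hadoop.EvaluatorJob \
//     --input /path/to/recommendations --output /path/to/eval --topK 10 --cleanUp true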
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption("topK", "k", "recommendations per user.", false);
addOption("cleanUp", null, "true if only want stat, otherwise false", String.valueOf(false));
if (parseArguments(args) == null) {
return -1;
}
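// map: parse each recommendation record into (composite key, "itemID,rating,flag");
// reduce: score each user's sorted recommendation list with the configured evaluators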
Job evaluateJob = prepareJob(getInputPath(), getOutputPath(), TextInputFormat.class,
ToVectorizeMapper.class, RecommendationsKey.class, Text.class,
MergeTopKRecommendationsReducer.class, NullWritable.class, Text.class,
TextOutputFormat.class);
Configuration conf = evaluateJob.getConfiguration();
conf.setInt(RECOMMENDATIONS_PER_USER, Integer.parseInt(getOption("topK")));
evaluateJob.setPartitionerClass(RecommendationsKeyPartitioner.class);
evaluateJob.setSortComparatorClass(RecommendationsKeyComparator.class);
evaluateJob.setGroupingComparatorClass(RecommendationsKeyGroupingComparator.class);
boolean succeeded = evaluateJob.waitForCompletion(true);
if (!succeeded) {
return -1;
}
writeOutEvaluationMetrics(getOutputPath());
if (Boolean.parseBoolean(getOption("cleanUp"))) {
HadoopClusterUtil.deletePartFiles(getConf(), getOutputPath());
}
return 0;
}
/** Aggregates the per-user scores from the reducer output into a single stats line on HDFS. */
public void writeOutEvaluationMetrics(Path output) throws IOException {
Map<Integer, Double> stats =
EvaluatorUtil.getResultSumPerColumns(getConf(), output, Arrays.asList(0), true);
StringBuilder sb = new StringBuilder();
double totalCount = stats.get(EvaluatorUtil.RECORD_COUNT_SUM_INDEX);
TreeSet<Integer> keys = new TreeSet<Integer>(stats.keySet());
for (Integer statKey : keys) {
// the delimiter is skipped only for the record-count column, which is expected to sort first
if (!statKey.equals(EvaluatorUtil.RECORD_COUNT_SUM_INDEX)) {
sb.append(DELIMITER);
}
// low-numbered columns hold raw counts; the remaining columns are per-record averages
if (statKey < 3) {
sb.append(stats.get(statKey).intValue());
} else {
sb.append(stats.get(statKey) / totalCount);
}
}
HadoopClusterUtil.writeToHdfs(getConf(), getOutputPath("_stats"), sb.toString());
}
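// The stats line is a single CSV row: the record count first, then one column per
// metric index in ascending order (raw counts below index 3, per-record averages above).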
/** Emits a composite key per record so the shuffle secondary-sorts each user's items by rating (see the key-contract sketch after this class). */
private static class ToVectorizeMapper extends
Mapper<LongWritable, Text, RecommendationsKey, Text> {
private final Text outValue = new Text();
private final RecommendationsKey outKey = new RecommendationsKey();
@Override
protected void map(LongWritable offset, Text line, Context context)
throws IOException, InterruptedException {
// expected line format: userID,itemID,rating,...,itemCount,flag
String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
if (tokens.length < 5) {
return; // a complete record needs at least userID, itemID, rating, itemCount, and flag
}
try {
int sz = tokens.length;
String userID = tokens[0];
String itemID = tokens[1];
float rating = Float.parseFloat(tokens[2]);
int itemCount = Integer.parseInt(tokens[sz - 2]);
double flag = Double.parseDouble(tokens[sz - 1]);
outValue.set(itemID + DELIMITER + rating + DELIMITER + flag);
outKey.set(userID, rating, itemCount);
context.write(outKey, outValue);
} catch (NumberFormatException e) {
// skip lines with unparseable numeric fields
}
}
}
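/*
 * RecommendationsKey and its partitioner/comparators are defined elsewhere in this
 * package. A minimal sketch of the contract the secondary sort above assumes --
 * the getRating() accessor and descending-rating order are assumptions here, not
 * the actual implementation:
 *
 *   // partition and group by user only; sort by (user, rating desc) so the
 *   // reducer iterates each user's items from highest to lowest rating
 *   public int compareTo(RecommendationsKey other) {
 *     int byUser = getUserID().compareTo(other.getUserID());
 *     return byUser != 0 ? byUser : Float.compare(other.getRating(), getRating());
 *   }
 */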
private static class MergeTopKRecommendationsReducer extends
Reducer<RecommendationsKey, Text, NullWritable, Text> {
private List<Evaluator> evaluators;
private final Text outValue = new Text();
private int topK = 0;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
topK = context.getConfiguration().getInt(RECOMMENDATIONS_PER_USER, 0);
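// order matters: reduce() reads its third output column from the first evaluator's first score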
evaluators =
Arrays.asList(new PrecisionEvaluator(), new MeanAveragePrecisionEvaluator(),
new ExpectedPercentileRankEvaluator(), new RecallEvaluator());
}
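// The Evaluator implementations are defined elsewhere in this package; judging from
// the call in reduce(), they are assumed to expose roughly:
//   List<Pair<Integer, Double>> evaluate(List<Pair<String, Double>> items,
//                                        int topK, int itemCount, double negativePref);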
@Override
protected void reduce(RecommendationsKey compositeKey, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
String userID = compositeKey.getUserID();
int itemCount = compositeKey.getItemCount();
List<Pair<String, Double>> items = new ArrayList<Pair<String, Double>>();
for (Text value : values) {
// value format from the mapper: itemID,rating,flag
String[] tokens = value.toString().split(DELIMITER);
String itemID = tokens[0];
double flag = Double.parseDouble(tokens[2]);
items.add(new Pair<String, Double>(itemID, flag));
}
List<Pair<Integer, Double>> scores = new ArrayList<Pair<Integer, Double>>();
for (Evaluator evaluator : evaluators) {
scores.addAll(evaluator.evaluate(items, topK, itemCount, negativePref));
}
// output columns: userID, itemCount, the integer component of the first score, then all score values
outValue.set(userID + DELIMITER + itemCount + DELIMITER + scores.get(0).getFirst()
+ DELIMITER + buildOutput(scores));
context.write(NullWritable.get(), outValue);
}
private String buildOutput(List<Pair<Integer, Double>> scores) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < scores.size(); i++) {
if (i != 0) {
sb.append(DELIMITER);
}
sb.append(scores.get(i).getSecond());
}
return sb.toString();
}
}
}