package com.skp.experiment.cf.evaluate.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.common.AbstractJob;
import com.skp.experiment.common.HadoopClusterUtil;
import com.skp.experiment.common.OptionParseUtil;
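/**
 * MapReduce job that computes a per-key, Jaccard-style overlap score across two or more
 * delimited text datasets. Records are grouped by the configured key columns; for each key
 * that appears in every dataset, the score is the fraction of distinct value tuples that
 * occur more than once among the grouped records. Keys missing from any dataset are skipped.
 * A summary line ("recordCount,averageScore") is written to the "_stats" file under the output path.
 *
 * Illustrative invocation (the jar name and paths are placeholders; the options are the ones
 * declared in {@link #run(String[])}):
 *
 *   hadoop jar experiment-jobs.jar com.skp.experiment.cf.evaluate.hadoop.MultiDatasetJaccardJob \
 *     --input /data/setA --otherDatasets /data/setB,/data/setC \
 *     --output /out/jaccard --keyIndexs 0 --valueIndexs 1 --cleanUp true
 */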
public class MultiDatasetJaccardJob extends AbstractJob {
private static final String DELIMETER = ",";
private static final String VALUE_COLUMN_INDEX = MultiDatasetJaccardJob.class.getName() + ".valueColumnIndex";
private static final String KEY_COLUMN_INDEX = MultiDatasetJaccardJob.class.getName() + ".keyColumnIndex";
private static final String INPUT_PATHS = MultiDatasetJaccardJob.class.getName() + ".inputPaths";
private static enum COUNT {
NOT_COMMON_TO_ALL
}
public static void main(String[] args) throws Exception {
System.exit(ToolRunner.run(new MultiDatasetJaccardJob(), args));
}
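/**
 * Parses the job options, combines --input with the --otherDatasets paths, runs the
 * mapper/reducer over all of them, writes aggregate statistics to "_stats", and optionally
 * removes the per-reducer part files when --cleanUp is true.
 */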
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption("otherDatasets", "others", "other dataset paths delimetered by ,");
addOption("keyIndexs", "kidxs", "key indexs for group by seperated by ,", "0");
addOption("valueIndexs", "vidxs", "value indexs for calculating jaccard.", "1");
addOption("cleanUp", null, "true if only want _stats. otherwise false", String.valueOf(false));
Map<String, String> parsedArgs = parseArguments(args);
if (parsedArgs == null || getOption("otherDatasets") == null) {
return -1;
}
String[] inputPaths = TasteHadoopUtils.splitPrefTokens(getOption("otherDatasets"));
List<String> totalInputPaths = new ArrayList<String>();
totalInputPaths.add(getInputPath().toString());
totalInputPaths.addAll(Arrays.asList(inputPaths));
Job jaccardJob = prepareJob(getInputPath(), getOutputPath(), TextInputFormat.class,
MultiSetJaccardMapper.class, Text.class, Text.class,
MultiSetJaccardReducer.class, NullWritable.class, Text.class, TextOutputFormat.class);
jaccardJob.setJarByClass(MultiDatasetJaccardJob.class);
jaccardJob.getConfiguration().set(KEY_COLUMN_INDEX, getOption("keyIndexs"));
jaccardJob.getConfiguration().set(VALUE_COLUMN_INDEX, getOption("valueIndexs"));
jaccardJob.getConfiguration().set(INPUT_PATHS, buildInputPathString(totalInputPaths));
for (String path : inputPaths) {
FileInputFormat.addInputPath(jaccardJob, new Path(path));
}
boolean succeeded = jaccardJob.waitForCompletion(true);
if (!succeeded) {
return -1;
}
writeResultStat(getOutputPath("_stats"));
if (Boolean.parseBoolean(getOption("cleanUp")) == true) {
HadoopClusterUtil.deletePartFiles(getConf(), getOutputPath());
}
return 0;
}
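/** Joins the given paths into a single comma-delimited string for the job configuration. */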
private String buildInputPathString(List<String> paths) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < paths.size(); i++) {
if (i > 0) {
sb.append(DELIMETER);
}
sb.append(paths.get(i));
}
return sb.toString();
}
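/**
 * Aggregates the job output via EvaluatorUtil (total record count and the sum of column 1,
 * the per-key score) and writes "recordCount,averageScore" to the given path.
 */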
private void writeResultStat(Path output) throws IOException {
Map<Integer, Double> stats =
EvaluatorUtil.getResultSumPerColumns(getConf(), getOutputPath(), Arrays.asList(1), false);
long totalCount = stats.get(EvaluatorUtil.RECORD_COUNT_SUM_INDEX).longValue();
String outputString = totalCount + DELIMETER + (stats.get(1) / totalCount);
HadoopClusterUtil.writeToHdfs(getConf(), output, outputString);
}
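/**
 * Emits one record per input line, keyed by the configured key columns. The value is the
 * original line with the index of the source dataset appended, so the reducer can tell
 * which datasets each key appears in.
 */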
public static class MultiSetJaccardMapper
extends Mapper<LongWritable, Text, Text, Text>{
private int fileIndex = 0;
private List<Integer> keyColumnIndexs;
private final Text outKey = new Text();
private final Text outValue = new Text();
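/** Resolves the key column indexes and determines which configured input path this split belongs to. */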
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
keyColumnIndexs =
OptionParseUtil.decode(context.getConfiguration().get(KEY_COLUMN_INDEX), DELIMETER);
String[] inputPaths = TasteHadoopUtils.splitPrefTokens(context.getConfiguration().get(INPUT_PATHS));
FileSplit split = (FileSplit)context.getInputSplit();
Path path = split.getPath();
for (int i = 0; i < inputPaths.length; i++) {
if (path.toString().contains(inputPaths[i])) {
fileIndex = i;
break;
}
}
}
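/** Splits the line, builds the group-by key from the key columns, and appends the source file index to the value. */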
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException,
InterruptedException {
String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
outKey.set(OptionParseUtil.encode(tokens, keyColumnIndexs, DELIMETER));
outValue.set(value.toString() + DELIMETER + fileIndex);
context.write(outKey, outValue);
}
}
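/**
 * For each key, collects the distinct value tuples seen across all datasets and emits
 * "key,score", where score = (# of value tuples seen more than once) / (# of distinct value tuples).
 * Keys that do not occur in every dataset are skipped and counted under NOT_COMMON_TO_ALL.
 */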
public static class MultiSetJaccardReducer
extends Reducer<Text, Text, NullWritable, Text> {
private List<Integer> valueIndexs;
private final Text outValue = new Text();
private int totalInputPathNum = 0;
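/** Reads the value column indexes and the total number of input datasets from the configuration. */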
@Override
protected void setup(Context context) throws IOException,
InterruptedException {
valueIndexs =
OptionParseUtil.decode(context.getConfiguration().get(VALUE_COLUMN_INDEX), DELIMETER);
String[] inputPaths = TasteHadoopUtils.splitPrefTokens(context.getConfiguration().get(INPUT_PATHS));
totalInputPathNum = inputPaths.length;
}
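/** Computes the overlap score for a single key. */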
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
long commonCount = 0;
Map<String, Integer> occurrences = new HashMap<String, Integer>();
Set<Integer> fileIndexs = new HashSet<Integer>();
for (Text value : values) {
String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
String currentValue = OptionParseUtil.encode(tokens, valueIndexs, DELIMETER);
int fileIndex = Integer.parseInt(tokens[tokens.length-1]);
fileIndexs.add(fileIndex);
Integer count = occurrences.get(currentValue);
occurrences.put(currentValue, count == null ? 1 : count + 1);
}
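// Skip keys that are not present in every input dataset.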
if (fileIndexs.size() < totalInputPathNum) {
context.getCounter(MultiDatasetJaccardJob.COUNT.NOT_COMMON_TO_ALL).increment(1);
return;
}
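// Count the distinct value tuples that occur more than once among the grouped records.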
for (Entry<String, Integer> entry : occurrences.entrySet()) {
if (entry.getValue() > 1) {
commonCount++;
}
}
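// Score = shared value tuples / distinct value tuples for this key.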
double jaccard = 0.0;
if (occurrences.size() > 0) {
jaccard = commonCount / (double)occurrences.size();
}
outValue.set(key.toString() + DELIMETER + jaccard);
context.write(NullWritable.get(), outValue);
}
}
}