package com.skp.experiment.common.join;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.common.AbstractJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.skp.experiment.cf.als.hadoop.ParallelALSFactorizationJob;
import com.skp.experiment.common.OptionParseUtil;
public class ImprovedRepartitionJoinJob extends AbstractJob {
private static final Logger log = LoggerFactory.getLogger(ImprovedRepartitionJoinJob.class);
private static final String SRC_TABLE = ImprovedRepartitionJoinJob.class.getName() + ".srcTable";
private static final String TGT_TABLE = ImprovedRepartitionJoinJob.class.getName() + ".tgtTable";
private static final String SRC_KEY_INDEX = ImprovedRepartitionJoinJob.class.getName() + ".srcKeyIndex";
private static final String TGT_KEY_INDEX = ImprovedRepartitionJoinJob.class.getName() + ".tgtKeyIndex";
private static final String SRC_VALUE_INDEX = ImprovedRepartitionJoinJob.class.getName() + ".srcValueIndex";
private static final String TGT_VALUE_INDEX = ImprovedRepartitionJoinJob.class.getName() + ".tgtValueIndex";
private static final String JOIN_TYPE = ImprovedRepartitionJoinJob.class.getName() + ".joinType";
private static final String DELIMETER = ",";
private static int SRC_TABLE_SUFFIX = 1;
private static int TGT_TABLE_SUFFIX = 0;
private static String nullStr = "0";
public static void main(String[] args) throws Exception {
ToolRunner.run(new ImprovedRepartitionJoinJob(), args);
}
@Override
public int run(String[] args) throws Exception {
addInputOption();
addOutputOption();
addOption("targetTable", "tgt", "path to target table.");
addOption("srcKeyIndex", "sidx", "src key index, seperated by comma");
addOption("srcValueIndex", "svalues", "src values index");
addOption("tgtKeyIndex", "tidx", "tgt key index");
addOption("tgtValueIndex", "tvalues", "tgt values index");
addOption("delimeter", "d", "delimeter", ",");
addOption("joinType", "type", "join type{inner(default), outter}", "inner");
if (parseArguments(args) == null) {
return -1;
}
Job joinJob = prepareJob(getInputPath(), getOutputPath(), TextInputFormat.class,
ImprovedRepartitionJoinMapper.class, CompositeJoinKey.class, CompositeJoinValue.class,
ImprovedRepartitionJoinReducer.class, NullWritable.class, Text.class,
TextOutputFormat.class);
joinJob.getConfiguration().set(SRC_TABLE, getInputPath().toString());
joinJob.getConfiguration().set(SRC_KEY_INDEX, getOption("srcKeyIndex"));
joinJob.getConfiguration().set(SRC_VALUE_INDEX, getOption("srcValueIndex"));
joinJob.getConfiguration().set(TGT_KEY_INDEX, getOption("tgtKeyIndex"));
joinJob.getConfiguration().set(TGT_VALUE_INDEX, getOption("tgtValueIndex"));
joinJob.getConfiguration().set(JOIN_TYPE, getOption("joinType"));
FileInputFormat.addInputPath(joinJob, new Path(getOption("targetTable")));
joinJob.setPartitionerClass(CompositeJoinKeyPartitioner.class);
joinJob.setSortComparatorClass(CompositeJoinKeyComparator.class);
joinJob.setGroupingComparatorClass(CompositeJoinKeyGroupingComparator.class);
joinJob.waitForCompletion(true);
return 0;
}
public static class ImprovedRepartitionJoinMapper extends
Mapper<LongWritable, Text, CompositeJoinKey, CompositeJoinValue> {
private static boolean isSrcTable = false;
private static List<Integer> srcKeyIndexs;
private static List<Integer> srcValueIndexs;
private static List<Integer> tgtKeyIndexs;
private static List<Integer> tgtValueIndexs;
private static CompositeJoinKey outKey = new CompositeJoinKey();
private static CompositeJoinValue outValue = new CompositeJoinValue();
@Override
protected void setup(Context ctx) throws IOException,
InterruptedException {
String srcTableName = ctx.getConfiguration().get(SRC_TABLE);
srcKeyIndexs = OptionParseUtil.decode(ctx.getConfiguration().get(SRC_KEY_INDEX), DELIMETER);
srcValueIndexs = OptionParseUtil.decode(ctx.getConfiguration().get(SRC_VALUE_INDEX), DELIMETER);
tgtKeyIndexs = OptionParseUtil.decode(ctx.getConfiguration().get(TGT_KEY_INDEX), DELIMETER);
tgtValueIndexs = OptionParseUtil.decode(ctx.getConfiguration().get(TGT_VALUE_INDEX), DELIMETER);
FileSplit split = (FileSplit)ctx.getInputSplit();
Path path = split.getPath();
//System.out.println(path.toString() + "\t" + srcTableName);
if (path.toString().contains(srcTableName)) {
isSrcTable = true;
} else {
isSrcTable = false;
}
}
@Override
protected void map(LongWritable offset, Text line, Context ctx)
throws IOException, InterruptedException {
String[] tokens = TasteHadoopUtils.splitPrefTokens(line.toString());
List<Integer> keyIndexs = isSrcTable ? srcKeyIndexs : tgtKeyIndexs;
List<Integer> valueIndexs = isSrcTable ? srcValueIndexs : tgtValueIndexs;
int suffix = isSrcTable ? SRC_TABLE_SUFFIX : TGT_TABLE_SUFFIX;
outKey.set(OptionParseUtil.encode(tokens, keyIndexs, DELIMETER), suffix);
outValue.set(OptionParseUtil.encode(tokens, valueIndexs, DELIMETER), suffix);
if (outKey.getJoinKey() != null && outValue.getValue() != null) {
ctx.write(outKey, outValue);
} else {
log.info("MAP:\t" + outKey.getJoinKey() + "\t" + outKey.getSuffix() + "\t" + outValue.getValue());
}
//System.out.println("MAP:\t" + outKey.getJoinKey() + "," + outKey.getSuffix() + "," + outValue.getValue());
//ctx.write(outKey, outValue);
}
}
public static class ImprovedRepartitionJoinReducer extends
Reducer<CompositeJoinKey, CompositeJoinValue, NullWritable, Text> {
private static String joinType;
private static Text outValue = new Text();
@Override
protected void setup(Context ctx)
throws IOException, InterruptedException {
joinType = ctx.getConfiguration().get(JOIN_TYPE).toLowerCase();
}
@Override
protected void reduce(CompositeJoinKey key, Iterable<CompositeJoinValue> values, Context ctx)
throws IOException,
InterruptedException {
List<String> tgtTableValues = new ArrayList<String>();
for (CompositeJoinValue value : values) {
//System.out.println("REDUCE: " + key.getJoinKey() + "\t" + value.getValue() + "," + value.getSuffix());
if (value.getSuffix() == TGT_TABLE_SUFFIX) {
tgtTableValues.add(value.getValue());
} else {
int matchedCount = 0;
for (String tgtValue : tgtTableValues) {
if (joinType.equals("minus") == false) {
outValue.set(key.getJoinKey() + DELIMETER + value.getValue() + DELIMETER + tgtValue);
ctx.write(NullWritable.get(), outValue);
}
matchedCount++;
}
if (matchedCount == 0 && (joinType.equals("outter") || joinType.equals("minus"))) {
if (joinType.equals("outter")) {
outValue.set(key.getJoinKey() + DELIMETER + value.getValue() + DELIMETER + nullStr);
} else if (joinType.equals("minus")) {
outValue.set(key.getJoinKey() + DELIMETER + value.getValue());
}
ctx.write(NullWritable.get(), outValue);
}
}
}
}
}
}