package com.skp.experiment.common.join;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import com.skp.experiment.common.OptionParseUtil;
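/**
* This class offers a Repartition Join (map only) and an Improved Repartition Join (reduce side,
* using secondary sort) with inner, outer, filter (minus) and sub (gsub) join options on multiple
* target tables.
* 1) inner: the source row is emitted only if its key exists in the target table; the matching
* target value is appended: <src columns, [tgt values]>
* 2) outer: the source row is always emitted; if its key does not exist in the target table, the
* default value (--defaultValue, "null" unless overridden) is appended: <src columns, [default]>
* 3) filter: the source row is dropped if its key exists in the target table
* 4) sub: the source row is dropped if its key does not exist in the target table; otherwise the
* configured source columns are replaced by the matching target values
* ex)
* hadoop jar $jar main -i data/cf/raw_input.txt -o data/cf/merged
* --srcKeyIndex 0,1 (source table's key columns, separated by commas)
* --srcValueIndex 2 (source table's value columns, separated by commas; note that run() does not
* currently register this option, and every source column is written to the output)
* --tgtTableOptions data/cf/meta.txt:0,1:2:filter;data/cf/meta2.txt:1,0:2:filter
* (tgt table's name:tgt table's key columns:tgt table's value columns:joinType)
*
*/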
public class ImprovedRepartitionJoinAndFilterJob extends AbstractJob {
/** Configuration key under which run() stores the source (input) table path. */
public static final String SRC_TABLE = ImprovedRepartitionJoinAndFilterJob.class.getName() + ".srcTable";
/** Configuration key under which run() stores the value of the srcKeyIndex option. */
public static final String SRC_KEY_INDEX = ImprovedRepartitionJoinAndFilterJob.class.getName() + ".srcKeyIndex";
/** Configuration key for source value column indexes; in this file it appears only in commented-out code. */
public static final String SRC_VALUE_INDEX = ImprovedRepartitionJoinAndFilterJob.class.getName() + ".srcValueIndex";
/** Configuration key under which run() stores the raw tgtTableOptions option string. */
public static final String TGT_TABLE_OPTIONS = ImprovedRepartitionJoinAndFilterJob.class.getName() + ".tgtTableOptions";
/** Configuration key for the join mode; not referenced anywhere else in this file. */
public static final String REPARTITION_JOIN = ImprovedRepartitionJoinAndFilterJob.class.getName() + ".repartitionJoin";
/** Configuration key under which run() stores the value appended for unmatched outer joins. */
public static final String DEFAULT_VALUE = ImprovedRepartitionJoinAndFilterJob.class.getName() + ".defaultValue";
/** Column delimiter used by this class when splitting and re-joining row tokens. */
public static final String DELIMETER = ",";
/**
 * Command-line entry point: creates the job and hands it, together with the raw arguments,
 * to Hadoop's {@code ToolRunner}.
 *
 * @param args command-line arguments forwarded unchanged to the job
 * @throws Exception whatever the tool run throws
 */
public static void main(String[] args) throws Exception {
  ImprovedRepartitionJoinAndFilterJob job = new ImprovedRepartitionJoinAndFilterJob();
  ToolRunner.run(job, args);
}
/**
 * Parses the command line, configures either the map-only repartition join or the reduce-side
 * join with secondary sort, and runs the job to completion.
 *
 * @param args command-line arguments
 * @return 0 when the job succeeds; -1 when argument parsing fails, a required option is
 *         missing, or the job itself fails
 * @throws Exception if job preparation or execution throws
 */
@Override
public int run(String[] args) throws Exception {
  addInputOption();
  addOutputOption();
  addOption("srcKeyIndex", "sidx", "src key index, seperated by comma");
  addOption("tgtTableOptions", "tgt", "list of option for tgt table {(tablename:keyindexs:valueindexs)}");
  addOption("mapOnly", null, "true if only repartition join needs to be used.");
  addOption("defaultValue", null, "default value for outer join", "null");
  if (parseArguments(args) == null) {
    return -1;
  }

  // Both options are written into the job configuration below, which rejects null values;
  // fail early with a readable message instead.
  String srcKeyIndex = getOption("srcKeyIndex");
  String tgtTableOptionString = getOption("tgtTableOptions");
  if (srcKeyIndex == null || tgtTableOptionString == null) {
    System.err.println("Both --srcKeyIndex and --tgtTableOptions must be given.");
    return -1;
  }

  boolean repartitionOnly = "true".equals(getOption("mapOnly"));
  Job joinJob;
  if (repartitionOnly) {
    // Map-only job: the mapper loads the target tables itself in setup().
    joinJob = prepareJob(getInputPath(), getOutputPath(), TextInputFormat.class,
        RepartitionJoinMapper.class, NullWritable.class, Text.class,
        TextOutputFormat.class);
  } else {
    // Improved repartition join: secondary sort is set up so that, per key, the records of
    // the reference (target) tables reach the reducer before the source table records.
    joinJob = prepareJob(getInputPath(), getOutputPath(), TextInputFormat.class,
        ImprovedRepartitionJoinMapper.class, CompositeJoinKey.class, CompositeJoinValue.class,
        ImprovedRepartitionJoinReducer.class, NullWritable.class, Text.class,
        TextOutputFormat.class);
    joinJob.setPartitionerClass(CompositeJoinKeyPartitioner.class);
    joinJob.setSortComparatorClass(CompositeJoinKeyComparator.class);
    joinJob.setGroupingComparatorClass(CompositeJoinKeyGroupingComparator.class);
  }

  joinJob.getConfiguration().set(SRC_TABLE, getInputPath().toString());
  joinJob.getConfiguration().set(SRC_KEY_INDEX, srcKeyIndex);
  joinJob.getConfiguration().set(TGT_TABLE_OPTIONS, tgtTableOptionString);
  joinJob.getConfiguration().set(DEFAULT_VALUE, getOption("defaultValue"));

  // In reduce-side mode every target table is an additional job input.
  List<JoinOption> tgtTableOptions = JoinOptionUtils.parseOptionStrings(tgtTableOptionString);
  if (!repartitionOnly) {
    for (JoinOption option : tgtTableOptions) {
      FileInputFormat.addInputPath(joinJob, new Path(option.getTable()));
    }
  }

  // Propagate job failure to the caller instead of always reporting success.
  return joinJob.waitForCompletion(true) ? 0 : -1;
}
/**
 * Builds the list of every column position of a tokenized row.
 *
 * @param tokens the columns of one row
 * @return a new list holding 0, 1, ..., {@code tokens.length - 1} in ascending order
 */
private static List<Integer> getAllColumnIndexs(String[] tokens) {
  final int columnCount = tokens.length;
  List<Integer> indexes = new ArrayList<Integer>(columnCount);
  int column = 0;
  while (column < columnCount) {
    indexes.add(Integer.valueOf(column));
    column++;
  }
  return indexes;
}
/**
 * Concatenates the given tokens into one row string, placing {@link #DELIMETER} between
 * consecutive tokens.
 *
 * @param tokens the columns to join
 * @return the joined row; the empty string when {@code tokens} is empty
 */
private static String joinTokens(String[] tokens) {
  StringBuilder joined = new StringBuilder();
  String separator = "";
  for (String token : tokens) {
    joined.append(separator).append(token);
    // Every token after the first is preceded by the delimiter.
    separator = DELIMETER;
  }
  return joined.toString();
}
/**
 * Mapper of the reduce-side join. Each record is emitted as a composite key/value tagged with
 * the index of the table its input split was matched to: target table {@code i} gets suffix
 * {@code i}, the source table gets suffix {@code tgtOptions.size()}.
 *
 * <p>All per-task state lives in instance fields, so mapper instances that run in the same JVM
 * do not overwrite each other's table assignment.
 */
public static class ImprovedRepartitionJoinMapper extends
    Mapper<LongWritable, Text, CompositeJoinKey, CompositeJoinValue> {
  private final CompositeJoinKey outKey = new CompositeJoinKey();
  private final CompositeJoinValue outValue = new CompositeJoinValue();
  private List<Integer> srcKeyIndexs;
  /** All column indexes of the most recently seen source row width; rebuilt when the width changes. */
  private List<Integer> srcValueIndexs;
  private List<JoinOption> tgtOptions;
  /** Index of the table this split belongs to; equals {@code tgtOptions.size()} for the source table. */
  private int joinOptionIndex;

  /**
   * Reads the join configuration and decides which table the current input split belongs to.
   */
  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    srcKeyIndexs = OptionParseUtil.decode(ctx.getConfiguration().get(SRC_KEY_INDEX), JoinOption.INNER_DELIMETER);
    srcValueIndexs = null;
    tgtOptions = JoinOptionUtils.parseOptionStrings(ctx.getConfiguration().get(TGT_TABLE_OPTIONS));

    // Suffix convention: tgt table 0 = 0, ..., tgt table n-1 = n-1, src table = n.
    joinOptionIndex = tgtOptions.size();
    String splitPath = ((FileSplit) ctx.getInputSplit()).getPath().toString();
    // NOTE(review): this is a substring match, so a source path that contains a target table
    // name would be classified as that target table -- confirm table names are distinct.
    for (int i = 0; i < tgtOptions.size(); i++) {
      if (splitPath.contains(tgtOptions.get(i).getTable())) {
        joinOptionIndex = i;
        break;
      }
    }
  }

  /**
   * Extracts the join key and the value columns of one line and writes them tagged with the
   * table suffix. For the source table every column is used as the value.
   */
  @Override
  protected void map(LongWritable offset, Text line, Context ctx)
      throws IOException, InterruptedException {
    String[] tokens = JoinOptionUtils.splitPrefTokens(line.toString());
    String keyStr;
    String valueStr;
    if (joinOptionIndex == tgtOptions.size()) {
      // Source table: the value is the whole row. Rebuild the index list whenever the row
      // width differs so ragged rows are neither truncated nor indexed out of range.
      if (srcValueIndexs == null || srcValueIndexs.size() != tokens.length) {
        srcValueIndexs = getAllColumnIndexs(tokens);
      }
      keyStr = JoinOptionUtils.fetchFileds(tokens, srcKeyIndexs);
      valueStr = JoinOptionUtils.fetchFileds(tokens, srcValueIndexs);
    } else {
      // Target table: key and value columns come from its join option.
      JoinOption tgtOption = tgtOptions.get(joinOptionIndex);
      keyStr = JoinOptionUtils.fetchFileds(tokens, tgtOption.getTargetTableKeyIndexs());
      valueStr = JoinOptionUtils.fetchFileds(tokens, tgtOption.getTargetTableValueIndexs());
    }
    outKey.set(keyStr, joinOptionIndex);
    outValue.set(valueStr, joinOptionIndex);
    ctx.write(outKey, outValue);
  }
}
/**
 * Reducer of the reduce-side join. For one join key it first collects the values of every
 * target table and then, for each source record, applies the join options in order and writes
 * the resulting row.
 *
 * <p>This only works when the secondary sort delivers all target table records of a key before
 * its source table records.
 */
public static class ImprovedRepartitionJoinReducer extends
    Reducer<CompositeJoinKey, CompositeJoinValue, NullWritable, Text> {
  private final Text outValue = new Text();
  private List<JoinOption> tgtOptions;
  private String defaultValue = "";

  /** Reads the target table options and the outer-join default value from the configuration. */
  @Override
  protected void setup(Context ctx)
      throws IOException, InterruptedException {
    tgtOptions = JoinOptionUtils.parseOptionStrings(ctx.getConfiguration().get(TGT_TABLE_OPTIONS));
    defaultValue = ctx.getConfiguration().get(DEFAULT_VALUE);
  }

  /**
   * Buffers target table values by table index and joins every source record of this key
   * against the values buffered so far.
   */
  @Override
  protected void reduce(CompositeJoinKey key, Iterable<CompositeJoinValue> values, Context ctx)
      throws IOException, InterruptedException {
    Map<Integer, List<String>> tgtTableValues = new HashMap<Integer, List<String>>();
    for (int i = 0; i < tgtOptions.size(); i++) {
      tgtTableValues.put(i, new ArrayList<String>());
    }
    for (CompositeJoinValue value : values) {
      int suffix = value.getSuffix();
      if (suffix == tgtOptions.size()) {
        // Source table record: join it against what was collected.
        mergeOutput(ctx, value, tgtTableValues);
      } else {
        tgtTableValues.get(suffix).add(value.getValue());
      }
    }
  }

  /**
   * Applies every target table's join option to one source record and writes the joined row,
   * unless one of the options rejects the record.
   *
   * @param ctx            reducer context used for output
   * @param value          the source table record
   * @param tgtTableValues values collected for the current key, by target table index
   */
  private void mergeOutput(Context ctx, CompositeJoinValue value,
      Map<Integer, List<String>> tgtTableValues)
      throws IOException, InterruptedException {
    String[] valueTokens = value.getValue().split(DELIMETER);
    StringBuilder extra = new StringBuilder();
    for (int i = 0; i < tgtOptions.size(); i++) {
      JoinOption option = tgtOptions.get(i);
      List<String> matches = tgtTableValues.get(i);
      boolean matched = !matches.isEmpty();
      if ("filter".equals(option.getType())) {
        // filter: drop source rows whose key exists in this target table.
        if (matched) {
          return;
        }
      } else if ("inner".equals(option.getType())) {
        // inner: drop source rows without a match, otherwise append the first match.
        if (!matched) {
          return;
        }
        extra.append(JoinOptionUtils.DELIMETER).append(matches.get(0));
      } else if ("outer".equals(option.getType())) {
        extra.append(JoinOptionUtils.DELIMETER).append(matched ? matches.get(0) : defaultValue);
      } else if ("sub".equals(option.getType())) {
        if (!matched) {
          return;
        }
        // Substitute source columns with the columns of the first matching target value,
        // the same way RepartitionJoinMapper does. Indexing the list of matching rows by j
        // (as before) threw when fewer rows than source key indexes had matched.
        String[] replacements = matches.get(0).split(DELIMETER);
        for (int j = 0; j < option.getSourceTableKeyIndexs().size(); j++) {
          int srcIdx = option.getSourceTableKeyIndexs().get(j);
          if (j < replacements.length && srcIdx >= 0 && srcIdx < valueTokens.length) {
            valueTokens[srcIdx] = replacements[j];
          }
        }
      }
    }
    outValue.set(joinTokens(valueTokens) + extra);
    ctx.write(NullWritable.get(), outValue);
  }
}
/**
 * Mapper of the map-only join. In {@link #setup} every target table is loaded into an
 * in-memory key-to-value map; {@link #map} then joins each source row against those maps and
 * writes the resulting row.
 *
 * <p>All per-task state lives in instance fields, so mapper instances that run in the same JVM
 * do not share caches or options.
 */
public static class RepartitionJoinMapper extends
    Mapper<LongWritable, Text, NullWritable, Text> {
  private final Text outValue = new Text();
  private List<Integer> srcKeyIndexs;
  private List<JoinOption> tgtOptions;
  /** One key-to-value lookup per target table, in the order of {@code tgtOptions}. */
  private List<Map<String, String>> tgtTableCaches;
  private String defaultValue = "null";

  /** Reads the join configuration and loads every target table through JoinUtils.fetchTextFiles. */
  @Override
  protected void setup(Context ctx) throws IOException, InterruptedException {
    srcKeyIndexs = OptionParseUtil.decode(ctx.getConfiguration().get(SRC_KEY_INDEX), JoinOption.INNER_DELIMETER);
    tgtOptions = JoinOptionUtils.parseOptionStrings(ctx.getConfiguration().get(TGT_TABLE_OPTIONS));
    defaultValue = ctx.getConfiguration().get(DEFAULT_VALUE);
    tgtTableCaches = new ArrayList<Map<String, String>>();
    for (int i = 0; i < tgtOptions.size(); i++) {
      JoinOption option = tgtOptions.get(i);
      Map<String, String> cache = JoinUtils.fetchTextFiles(ctx,
          new Path(option.getTable()), DELIMETER, option.getTargetTableKeyIndexs(),
          option.getTargetTableValueIndexs());
      tgtTableCaches.add(cache);
    }
  }

  /**
   * Joins one source row against every target table in option order and writes the row with
   * the appended or substituted target values, unless one of the options rejects it.
   */
  @Override
  protected void map(LongWritable key, Text value, Context ctx)
      throws IOException, InterruptedException {
    String[] tokens = JoinOptionUtils.splitPrefTokens(value.toString());
    String srcTableKey = JoinOptionUtils.fetchFileds(tokens, srcKeyIndexs);
    StringBuilder extra = new StringBuilder();
    for (int i = 0; i < tgtOptions.size(); i++) {
      JoinOption option = tgtOptions.get(i);
      Map<String, String> cache = tgtTableCaches.get(i);
      boolean matched = cache.containsKey(srcTableKey);
      if ("filter".equals(option.getType())) {
        // filter: drop source rows whose key exists in this target table.
        if (matched) {
          return;
        }
      } else if ("inner".equals(option.getType())) {
        // inner: drop source rows without a match, otherwise append the matched value.
        if (!matched) {
          return;
        }
        extra.append(JoinOptionUtils.DELIMETER).append(cache.get(srcTableKey));
      } else if ("outer".equals(option.getType())) {
        extra.append(JoinOptionUtils.DELIMETER).append(matched ? cache.get(srcTableKey) : defaultValue);
      } else if ("sub".equals(option.getType())) {
        if (!matched) {
          return;
        }
        // Substitute source columns with the columns of the matched target value; skip
        // positions the target value does not provide instead of indexing out of range.
        String[] replacements = cache.get(srcTableKey).split(DELIMETER);
        for (int j = 0; j < option.getSourceTableKeyIndexs().size(); j++) {
          int srcIdx = option.getSourceTableKeyIndexs().get(j);
          if (j < replacements.length && srcIdx >= 0 && srcIdx < tokens.length) {
            tokens[srcIdx] = replacements[j];
          }
        }
      }
    }
    outValue.set(joinTokens(tokens) + extra);
    ctx.write(NullWritable.get(), outValue);
  }
}
}