package hip.ch6.joins.replicated.simple;
import hip.ch6.joins.User;
import hip.ch6.joins.UserLog;
import hip.util.Cli;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
public class ReplicatedJoin extends Configured implements Tool {
/**
* Main entry point for the example.
*
* @param args arguments
* @throws Exception when something goes wrong
*/
public static void main(final String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new ReplicatedJoin(), args);
System.exit(res);
}
/**
* The MapReduce driver - setup and launch the job.
*
* @param args the command-line arguments
* @return the process exit code
* @throws Exception if something goes wrong
*/
public int run(final String[] args) throws Exception {
Cli cli = Cli.builder().setArgs(args).addOptions(Options.values()).build();
int result = cli.runCmd();
if (result != 0) {
return result;
}
Path usersPath = new Path(cli.getArgValueAsString(Options.USERS));
Path userLogsPath = new Path(cli.getArgValueAsString(Options.USER_LOGS));
Path outputPath = new Path(cli.getArgValueAsString(Options.OUTPUT));
Configuration conf = super.getConf();
Job job = new Job(conf);
job.setJarByClass(ReplicatedJoin.class);
job.setMapperClass(JoinMap.class);
job.addCacheFile(usersPath.toUri());
job.getConfiguration().set(JoinMap.DISTCACHE_FILENAME_CONFIG, usersPath.getName());
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, userLogsPath);
FileOutputFormat.setOutputPath(job, outputPath);
return job.waitForCompletion(true) ? 0 : 1;
}
public static class JoinMap extends Mapper<LongWritable, Text, Text, Text> {
public static final String DISTCACHE_FILENAME_CONFIG = "replicatedjoin.distcache.filename";
private Map<String, User> users = new HashMap<String, User>();
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
URI[] files = context.getCacheFiles();
final String distributedCacheFilename = context.getConfiguration().get(DISTCACHE_FILENAME_CONFIG);
boolean found = false;
for (URI uri : files) {
System.out.println("Distcache file: " + uri);
File path = new File(uri.getPath());
if (path.getName().equals(distributedCacheFilename)) {
loadCache(path);
found = true;
break;
}
}
if (!found) {
throw new IOException("Unable to find file " + distributedCacheFilename);
}
}
private void loadCache(File file) throws IOException {
for (String line : FileUtils.readLines(file)) {
User user = User.fromString(line);
users.put(user.getName(), user);
}
}
@Override
protected void map(LongWritable offset, Text value, Context context)
throws IOException, InterruptedException {
UserLog userLog = UserLog.fromText(value);
User user = users.get(userLog.getName());
if (user != null) {
context.write(
new Text(user.toString()),
new Text(userLog.toString()));
}
}
}
public enum Options implements Cli.ArgGetter {
USERS(Cli.ArgBuilder.builder().hasArgument(true).required(true).description("User input file or directory")),
USER_LOGS(Cli.ArgBuilder.builder().hasArgument(true).required(true).description("User logs input file or directory")),
OUTPUT(Cli.ArgBuilder.builder().hasArgument(true).required(true).description("HDFS output directory"));
private final Cli.ArgInfo argInfo;
Options(final Cli.ArgBuilder builder) {
this.argInfo = builder.setArgName(name()).build();
}
@Override
public Cli.ArgInfo getArgInfo() {
return argInfo;
}
}
public enum UserOptions implements Cli.ArgGetter {
USERS(Cli.ArgBuilder.builder().hasArgument(true).required(true).description("User input file or directory")),
OUTPUT(Cli.ArgBuilder.builder().hasArgument(true).required(true).description("HDFS output directory"));
private final Cli.ArgInfo argInfo;
UserOptions(final Cli.ArgBuilder builder) {
this.argInfo = builder.setArgName(name()).build();
}
@Override
public Cli.ArgInfo getArgInfo() {
return argInfo;
}
}
}