package com.manning.hip.ch4.joins.semijoin;
import com.manning.hip.ch4.joins.replicated.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver that launches a map-only MapReduce job which uses
 * {@code GenericReplicatedJoin} as its mapper. The users data set is shipped
 * to the tasks through the {@link DistributedCache}, and the user logs are
 * supplied as the regular job input.
 *
 * <p>NOTE(review): judging by the package name this is presumably the last
 * phase of a semi-join; confirm against the other jobs in the package.
 */
public class FinalJoinJob {

  /** Usage text reported when too few command-line arguments are supplied. */
  private static final String USAGE =
      "Usage: FinalJoinJob <user-logs-path> <users-path> <output-path>";

  /**
   * Command-line entry point.
   *
   * @param args {@code args[0]} is the user logs path, {@code args[1]} the
   *             users path and {@code args[2]} the output path; any further
   *             arguments are ignored
   * @throws IllegalArgumentException if fewer than three arguments are given
   * @throws Exception if the job cannot be set up or does not succeed
   */
  public static void main(String... args) throws Exception {
    // Fail with an actionable message rather than a bare
    // ArrayIndexOutOfBoundsException when arguments are missing.
    if (args == null || args.length < 3) {
      int supplied = (args == null) ? 0 : args.length;
      throw new IllegalArgumentException(
          USAGE + " (got " + supplied + " argument(s))");
    }
    runJob(new Path(args[0]), new Path(args[1]), new Path(args[2]));
  }

  /**
   * Configures and runs the join job, blocking until it finishes.
   *
   * @param userLogsPath input path of the job
   * @param usersPath    file, or directory of {@code part*} files, that is
   *                     added to the distributed cache
   * @param outputPath   job output location; anything already present there
   *                     is deleted recursively before the job starts
   * @throws Exception if file system access fails or the job reports failure
   */
  public static void runJob(Path userLogsPath,
                            Path usersPath,
                            Path outputPath)
      throws Exception {
    Configuration conf = new Configuration();

    // The cache entries are registered on conf BEFORE the Job is built from
    // it; keep this order so the job's configuration includes them.
    addUsersToCache(usersPath, conf);

    Job job = new Job(conf);
    job.setJarByClass(FinalJoinJob.class);
    job.setMapperClass(GenericReplicatedJoin.class);

    // Zero reducers: the job is map-only.
    job.setNumReduceTasks(0);

    job.setInputFormatClass(KeyValueTextInputFormat.class);

    // Clear out any previous output at the target location.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    FileInputFormat.setInputPaths(job, userLogsPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (!job.waitForCompletion(true)) {
      throw new Exception("Job failed");
    }
  }

  /**
   * Registers the users data with the distributed cache on {@code conf}.
   * When {@code usersPath} is a directory only its direct children whose
   * names start with {@code "part"} are added; otherwise the path itself is
   * added.
   */
  private static void addUsersToCache(Path usersPath, Configuration conf)
      throws Exception {
    FileSystem fs = usersPath.getFileSystem(conf);
    FileStatus usersStatus = fs.getFileStatus(usersPath);

    if (!usersStatus.isDir()) {
      DistributedCache.addCacheFile(usersPath.toUri(), conf);
      return;
    }

    for (FileStatus child : fs.listStatus(usersPath)) {
      Path childPath = child.getPath();
      // Only "part*" entries are shipped; everything else in the directory
      // is skipped.
      if (childPath.getName().startsWith("part")) {
        DistributedCache.addCacheFile(childPath.toUri(), conf);
      }
    }
  }
}