package hip.ch6.joins.replicated.framework;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
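
/**
 * Driver for the replicated (map-side) join framework. The smaller of the
 * two datasets is pushed to every map task via the DistributedCache, which
 * lets the GenericReplicatedJoin mapper read it locally and perform the
 * join without a reduce phase.
 *
 * Expected arguments: the large dataset's input path, the small dataset's
 * path (a single file or a directory of "part" files), and the output path.
 */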
public class Main {

  public static void main(String... args) throws Exception {
    runJob(new Path(args[0]), new Path(args[1]), new Path(args[2]));
  }
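
  /**
   * Configures and launches the map-only join job. Every file reachable
   * from {@code smallFilePath} is registered with the DistributedCache so
   * that each map task has a local copy of the small dataset.
   */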
  public static void runJob(Path inputPath,
                            Path smallFilePath,
                            Path outputPath)
      throws Exception {
    Configuration conf = new Configuration();

    FileSystem fs = smallFilePath.getFileSystem(conf);
    FileStatus smallFilePathStatus = fs.getFileStatus(smallFilePath);
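
    // If the small dataset is a directory (for example the output of an
    // earlier MapReduce job), cache each of its "part" files; otherwise
    // cache the single file directly.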
    if (smallFilePathStatus.isDir()) {
      for (FileStatus f : fs.listStatus(smallFilePath)) {
        if (f.getPath().getName().startsWith("part")) {
          DistributedCache.addCacheFile(f.getPath().toUri(), conf);
        }
      }
    } else {
      DistributedCache.addCacheFile(smallFilePath.toUri(), conf);
    }
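
    // The join happens entirely in the map phase, so configure the
    // GenericReplicatedJoin mapper and disable reducers.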
    Job job = new Job(conf);

    job.setJarByClass(Main.class);
    job.setMapperClass(GenericReplicatedJoin.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setNumReduceTasks(0);
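
    // Remove any output from a previous run; otherwise the job would fail
    // because the output directory already exists.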
    outputPath.getFileSystem(conf).delete(outputPath, true);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Block until the job finishes and surface a failure to the caller
    // rather than silently ignoring the job's exit status.
    if (!job.waitForCompletion(true)) {
      throw new IllegalStateException("Replicated join job failed");
    }
  }
}