package com.manning.hip.ch4.joins.semijoin; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; public class Main { public static void main(String... args) throws Exception { runJob(new Path(args[0]), new Path(args[1]), new Path(args[2])); } public static void runJob(Path smallFilePath, Path largeFilePath, Path workPath) throws Exception { Configuration conf = new Configuration(); FileSystem fs = workPath.getFileSystem(conf); fs.delete(workPath, true); fs.mkdirs(workPath); ///////////////////////////////////////////////////// // JOB 1 - Produce unique keys from the large file ///////////////////////////////////////////////////// Path uniqueKeyOutputPath = new Path(workPath, "unique"); UniqueHashedKeyJob.runJob(largeFilePath, uniqueKeyOutputPath); ///////////////////////////////////////////////////// // JOB 2 - Use the unique keys from the large file to // retain the contents of the small file that // match ///////////////////////////////////////////////////// Path filteredSmallOutputPath = new Path(workPath, "filtered"); ReplicatedFilterJob.runJob(smallFilePath, uniqueKeyOutputPath, filteredSmallOutputPath); ///////////////////////////////////////////////////// // JOB 3 - The final join ///////////////////////////////////////////////////// Path resultOutputPath = new Path(workPath, "result"); FinalJoinJob.runJob(largeFilePath, filteredSmallOutputPath, resultOutputPath); } }