package hip.ch6.joins.semijoin;

import hip.ch6.joins.replicated.framework.GenericReplicatedJoin;
import hip.ch6.joins.replicated.framework.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Final step of the semi-join: a map-only replicated join that filters the
 * full user dataset down to the users present in the (small) unique-users
 * set, which is shipped to every mapper via the distributed cache.
 */
public class ReplicatedFilterJob extends GenericReplicatedJoin {

  public static void runJob(Configuration conf,
                            Path usersPath,
                            Path uniqueUsersPath,
                            Path outputPath)
      throws Exception {

    FileSystem fs = uniqueUsersPath.getFileSystem(conf);
    FileStatus uniqueUserStatus = fs.getFileStatus(uniqueUsersPath);

    // The unique-users input may be a directory produced by an earlier
    // MapReduce job; if so, add each "part" file to the distributed cache,
    // otherwise cache the single file directly.
    if (uniqueUserStatus.isDir()) {
      for (FileStatus f : fs.listStatus(uniqueUsersPath)) {
        if (f.getPath().getName().startsWith("part")) {
          DistributedCache.addCacheFile(f.getPath().toUri(), conf);
        }
      }
    } else {
      DistributedCache.addCacheFile(uniqueUsersPath.toUri(), conf);
    }

    Job job = new Job(conf);
    job.setJarByClass(ReplicatedFilterJob.class);

    // This class doubles as the mapper: GenericReplicatedJoin extends Mapper
    // and performs the join against the cached files.
    job.setMapperClass(ReplicatedFilterJob.class);

    // Map-only job: a replicated join needs no shuffle or reduce phase.
    job.setNumReduceTasks(0);

    job.setInputFormatClass(KeyValueTextInputFormat.class);

    // Remove any output left over from a previous run so the job can start.
    outputPath.getFileSystem(conf).delete(outputPath, true);

    FileInputFormat.setInputPaths(job, usersPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    if (!job.waitForCompletion(true)) {
      throw new Exception("Job failed");
    }
  }

  /**
   * Emit only the record from the input split: the join acts as a pure
   * filter, and the matching distributed-cache record is discarded.
   */
  @Override
  public Pair join(Pair inputSplitPair, Pair distCachePair) {
    return inputSplitPair;
  }
}
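
/*
 * A minimal driver sketch showing how runJob might be invoked. The main
 * method, argument order, and paths below are hypothetical illustrations,
 * not part of the original code:
 *
 *   public static void main(String[] args) throws Exception {
 *     Configuration conf = new Configuration();
 *     Path users = new Path(args[0]);        // full user dataset
 *     Path uniqueUsers = new Path(args[1]);  // output of the unique-users job
 *     Path output = new Path(args[2]);       // filtered result
 *     ReplicatedFilterJob.runJob(conf, users, uniqueUsers, output);
 *   }
 */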