package hip.ch7.bloom;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
 * Map-only "Bloom join": a pre-built Bloom filter is shipped to every
 * mapper via the distributed cache, and only input records whose key is
 * (probably) in the filter are emitted. False positives are possible,
 * false negatives are not — downstream consumers must tolerate the former.
 */
public class BloomJoin {

    /**
     * CLI entry point.
     *
     * @param args [0] job input path, [1] job output path,
     *             [2] path of the serialized Bloom filter file
     */
    public static void main(String... args) throws Exception {
        runJob(args[0], new Path(args[1]), new Path(args[2]));
    }

    /**
     * Configures and runs the map-only filtering job.
     *
     * @param inputPath       input directory/file of key\tvalue text records
     * @param outputPath      job output directory (deleted first if it exists)
     * @param bloomFilterPath HDFS path of the serialized Bloom filter
     * @throws Exception if job setup fails, or if the job itself fails —
     *         the original version ignored {@code waitForCompletion}'s
     *         result, so failed jobs looked successful to callers
     */
    public static void runJob(String inputPath,
                              Path outputPath,
                              Path bloomFilterPath)
            throws Exception {
        Configuration conf = new Configuration();

        // Distribute the Bloom filter file to every task's local disk.
        DistributedCache.addCacheFile(bloomFilterPath.toUri(), conf);

        Job job = new Job(conf);
        job.setJarByClass(BloomJoin.class);
        job.setMapperClass(Map.class);
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        // Map-only job: mapper output is written directly as job output.
        job.setNumReduceTasks(0);

        // Remove stale output so the job doesn't abort on an existing dir.
        outputPath.getFileSystem(conf).delete(outputPath, true);

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // FIX: propagate job failure instead of silently returning.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException(
                "Bloom join job failed; see task logs for details");
        }
    }

    /**
     * Mapper that loads the cached Bloom filter once in {@link #setup}
     * and then emits only the records whose key passes the membership test.
     */
    public static class Map extends Mapper<Text, Text, Text, Text> {
        private BloomFilter filter;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            Path[] files =
                DistributedCache.getLocalCacheFiles(context.getConfiguration());
            // FIX: fail with a clear message rather than an opaque NPE when
            // the cache file was not localized (e.g. misconfigured job).
            if (files == null || files.length == 0) {
                throw new IOException(
                    "No Bloom filter found in the distributed cache");
            }
            filter = BloomFilterDumper.fromFile(new File(files[0].toString()));
            System.out.println("Filter = " + filter);
        }

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            System.out.println("K[" + key + "]");
            // FIX: the original used key.toString().getBytes(), which encodes
            // with the platform default charset — task nodes with differing
            // defaults would hash different bytes for the same key. Text is
            // UTF-8 internally, so UTF-8 is the deterministic choice.
            // NOTE(review): the filter builder (BloomFilterDumper side) must
            // use the same encoding — confirm against its implementation.
            byte[] keyBytes = key.toString().getBytes(StandardCharsets.UTF_8);
            if (filter.membershipTest(new Key(keyBytes))) {
                context.write(key, value);
            }
        }
    }
}