package hip.ch7.bloom;

import hip.ch3.avro.AvroBytesRecord;
import hip.util.Cli;
import hip.util.CliCommonOpts;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroOutputFormat;
import org.apache.avro.mapred.AvroWrapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

import java.io.IOException;
import java.util.Iterator;

/**
 * A MapReduce job that builds a Bloom filter from a text file of
 * name/age pairs and writes the serialized filter to an Avro file.
 */
public class BloomFilterCreator extends Configured implements Tool {

  /**
   * Main entry point for the example.
   *
   * @param args arguments
   * @throws Exception when something goes wrong
   */
  public static void main(final String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new BloomFilterCreator(), args);
    System.exit(res);
  }

  /**
   * The MapReduce driver - sets up and launches the job.
   *
   * @param args the command-line arguments
   * @return the process exit code
   * @throws Exception if something goes wrong
   */
  public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args)
        .addOptions(CliCommonOpts.MrIoOpts.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
      return result;
    }

    Path inputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.INPUT));
    Path outputPath = new Path(cli.getArgValueAsString(CliCommonOpts.MrIoOpts.OUTPUT));

    Configuration conf = super.getConf();

    JobConf job = new JobConf(conf);
    job.setJarByClass(BloomFilterCreator.class);

    // The job emits Avro records containing the serialized Bloom filter.
    job.set(AvroJob.OUTPUT_SCHEMA, AvroBytesRecord.SCHEMA.toString());
    job.set(AvroJob.OUTPUT_CODEC, SnappyCodec.class.getName());

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(AvroOutputFormat.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(BloomFilter.class);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(BloomFilter.class);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    return JobClient.runJob(job).isSuccessful() ? 0 : 1;
  }
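  // A rough sizing note (not from the original listing): for a Bloom filter
  // with m bits and k hash functions holding n keys, the false-positive rate
  // is approximately (1 - e^(-kn/m))^k. With the m = 1000, k = 5 hard-coded
  // below, inserting n = 100 keys gives roughly (1 - e^(-0.5))^5, about 0.9%,
  // while n = 200 keys already gives (1 - e^(-1))^5, about 10%, so these
  // constants only suit small key sets and should be tuned for real data.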
  /**
   * The mapper builds a single Bloom filter over its input split and emits
   * it once, from close(), after all records have been seen.
   */
  public static class Map implements Mapper<Text, Text, NullWritable, BloomFilter> {
    private BloomFilter filter = new BloomFilter(1000, 5, Hash.MURMUR_HASH);
    private OutputCollector<NullWritable, BloomFilter> collector;

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void map(Text key, Text value,
                    OutputCollector<NullWritable, BloomFilter> output,
                    Reporter reporter) throws IOException {
      int age = Integer.parseInt(value.toString());
      if (age > 30) {
        filter.add(new Key(key.toString().getBytes()));
      }
      // Hold on to the collector so close() can emit the aggregated filter.
      collector = output;
    }

    @Override
    public void close() throws IOException {
      // Guard against empty splits, where map() is never called and the
      // collector is still null.
      if (collector != null) {
        collector.collect(NullWritable.get(), filter);
      }
    }
  }

  /**
   * The reducer ORs together the per-mapper Bloom filters and writes the
   * combined filter to the Avro output as a single record.
   */
  public static class Reduce
      implements Reducer<NullWritable, BloomFilter, AvroWrapper<GenericRecord>, NullWritable> {
    private BloomFilter filter = new BloomFilter(1000, 5, Hash.MURMUR_HASH);
    private OutputCollector<AvroWrapper<GenericRecord>, NullWritable> collector;

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void reduce(NullWritable key, Iterator<BloomFilter> values,
                       OutputCollector<AvroWrapper<GenericRecord>, NullWritable> output,
                       Reporter reporter) throws IOException {
      // Union the partial filters; OR-ing Bloom filters with identical size
      // and hash configuration yields the filter of the combined key set.
      while (values.hasNext()) {
        filter.or(values.next());
      }
      // Hold on to the collector so close() can emit the combined filter.
      collector = output;
    }

    @Override
    public void close() throws IOException {
      if (collector != null) {
        collector.collect(
            new AvroWrapper<GenericRecord>(AvroBytesRecord.toGenericRecord(filter)),
            NullWritable.get());
      }
    }
  }
}
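
// A minimal sketch (not part of the original listing) of how a consumer
// might read the serialized filter back out of the Avro file this job
// produces. It assumes AvroBytesRecord exposes a fromGenericRecord method
// returning the raw Writable bytes, i.e. the inverse of the toGenericRecord
// call used in the reducer above; adjust that one line if the helper's
// actual signature differs. Hypothetical usage:
//   BloomFilter bf = BloomFilterReader.readFromAvro(
//       fs.open(new Path("output/part-00000.avro")));
class BloomFilterReader {
  static BloomFilter readFromAvro(java.io.InputStream in) throws IOException {
    org.apache.avro.file.DataFileStream<GenericRecord> reader =
        new org.apache.avro.file.DataFileStream<GenericRecord>(
            in, new org.apache.avro.generic.GenericDatumReader<GenericRecord>());
    try {
      // The job writes exactly one record: the combined Bloom filter.
      GenericRecord record = reader.next();
      byte[] bytes = AvroBytesRecord.fromGenericRecord(record); // assumed helper
      // BloomFilter is a Writable, so it can rehydrate itself from the bytes.
      BloomFilter filter = new BloomFilter();
      filter.readFields(new java.io.DataInputStream(
          new java.io.ByteArrayInputStream(bytes)));
      return filter;
    } finally {
      reader.close();
    }
  }
}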