package hip.ch6.joins.bloom;

import hip.ch3.avro.AvroBytesRecord;
import hip.ch6.joins.User;
import hip.ch6.joins.replicated.simple.ReplicatedJoin;
import hip.util.Cli;
import org.apache.avro.file.DataFileConstants;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.util.bloom.BloomFilter;
import org.apache.hadoop.util.bloom.Key;
import org.apache.hadoop.util.hash.Hash;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;

/**
 * A MapReduce job that builds a Bloom filter of users in California and
 * writes the serialized filter to an Avro file. Each mapper builds a partial
 * filter over its input split; a single reducer ORs the partial filters
 * together into the final filter.
 */
public class BloomFilterCreator extends Configured implements Tool {

  /**
   * Main entry point for the example.
   *
   * @param args arguments
   * @throws Exception when something goes wrong
   */
  public static void main(final String[] args) throws Exception {
    int res = ToolRunner.run(new Configuration(), new BloomFilterCreator(), args);
    System.exit(res);
  }

  /**
   * The MapReduce driver - setup and launch the job.
   *
   * @param args the command-line arguments
   * @return the process exit code
   * @throws Exception if something goes wrong
   */
  public int run(final String[] args) throws Exception {
    Cli cli = Cli.builder().setArgs(args)
        .addOptions(ReplicatedJoin.UserOptions.values()).build();
    int result = cli.runCmd();

    if (result != 0) {
      return result;
    }

    Path usersPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.USERS));
    Path outputPath = new Path(cli.getArgValueAsString(ReplicatedJoin.UserOptions.OUTPUT));

    Configuration conf = super.getConf();

    Job job = Job.getInstance(conf);
    job.setJarByClass(BloomFilterCreator.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    AvroJob.setOutputKeySchema(job, AvroBytesRecord.SCHEMA);

    // Avro container files expect an Avro codec name (e.g. "snappy"), not a
    // Hadoop CompressionCodec class name, and the codec is only honored when
    // compressed output is enabled.
    FileOutputFormat.setCompressOutput(job, true);
    job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.SNAPPY_CODEC);

    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(BloomFilter.class);

    FileInputFormat.setInputPaths(job, usersPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // A single reducer ensures all partial filters end up in one output filter.
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;
  }
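  /*
   * Sketch added for illustration (not part of the original class): the
   * 1,000-bit, 5-hash settings hard-coded in the Map and Reduce classes below
   * are example values suitable only for small data sets. For n expected keys
   * and a target false-positive rate p, standard Bloom filter sizing is
   * m = -n * ln(p) / (ln 2)^2 bits and k = (m / n) * ln 2 hash functions.
   * The helper names below are hypothetical, chosen just for this sketch.
   */
  static int optimalBitCount(int expectedKeys, double falsePositiveRate) {
    // m = -n * ln(p) / (ln 2)^2
    return (int) Math.ceil(-expectedKeys * Math.log(falsePositiveRate)
        / (Math.log(2) * Math.log(2)));
  }

  static int optimalHashCount(int expectedKeys, int bits) {
    // k = (m / n) * ln 2, with a floor of one hash function
    return Math.max(1, (int) Math.round((double) bits / expectedKeys * Math.log(2)));
  }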
  /**
   * A mapper that builds a partial Bloom filter of Californian users over its
   * input split, emitting the filter once the whole split has been consumed.
   */
  public static class Map extends Mapper<LongWritable, Text, NullWritable, BloomFilter> {

    private BloomFilter filter = new BloomFilter(1000, 5, Hash.MURMUR_HASH);

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      User user = User.fromText(value);
      if ("CA".equals(user.getState())) {
        filter.add(new Key(user.getName().getBytes()));
      }
    }

    @Override
    protected void cleanup(Context context)
        throws IOException, InterruptedException {
      // Emit the partial filter once, after all records have been seen.
      context.write(NullWritable.get(), filter);
    }
  }

  /**
   * A reducer that ORs together the partial Bloom filters produced by the
   * mappers and writes the merged filter as a single Avro record.
   */
  public static class Reduce
      extends Reducer<NullWritable, BloomFilter, AvroKey<GenericRecord>, NullWritable> {

    private BloomFilter filter = new BloomFilter(1000, 5, Hash.MURMUR_HASH);

    @Override
    protected void reduce(NullWritable key, Iterable<BloomFilter> values, Context context)
        throws IOException, InterruptedException {
      for (BloomFilter bf : values) {
        // A Bloom filter union is a bitwise OR of the underlying bit vectors.
        filter.or(bf);
      }
    }

    @Override
    protected void cleanup(Context context)
        throws IOException, InterruptedException {
      context.write(
          new AvroKey<GenericRecord>(AvroBytesRecord.toGenericRecord(filter)),
          NullWritable.get());
    }
  }

  /**
   * Deserializes a Bloom filter from an Avro container stream.
   *
   * @param is the stream to read from; it is closed before returning
   * @return the deserialized filter
   * @throws IOException if the stream can't be read or contains no records
   */
  public static BloomFilter readFromAvro(InputStream is) throws IOException {
    DataFileStream<Object> reader =
        new DataFileStream<Object>(is, new GenericDatumReader<Object>());

    if (!reader.hasNext()) {
      throw new IOException("No records found in Avro stream");
    }

    BloomFilter filter = new BloomFilter();
    AvroBytesRecord.fromGenericRecord((GenericRecord) reader.next(), filter);

    IOUtils.closeQuietly(reader);
    IOUtils.closeQuietly(is);

    return filter;
  }

  public static BloomFilter fromFile(File f) throws IOException {
    return readFromAvro(FileUtils.openInputStream(f));
  }
}
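/**
 * A minimal usage sketch (not part of the original example): loads the filter
 * produced by the job above and tests membership, the way a map-side join
 * might in its setup() method. The file path is taken from the command line;
 * with a single reducer the output is typically a lone part file such as
 * "part-r-00000.avro" (an assumption about the output layout), and "anne" is
 * a made-up user name.
 */
class BloomFilterUsageExample {
  public static void main(String[] args) throws IOException {
    BloomFilter filter = BloomFilterCreator.fromFile(new File(args[0]));

    // Bloom filters can return false positives but never false negatives,
    // so a negative test definitively excludes the user from the CA set.
    boolean maybeCalifornian = filter.membershipTest(new Key("anne".getBytes()));
    System.out.println("possibly a CA user: " + maybeCalifornian);
  }
}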