package ch.unibe.scg.cells.benchmarks;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

import ch.unibe.scg.cells.hadoop.UnibeModule;

import com.google.common.base.Charsets;
import com.google.common.primitives.Longs;
import com.google.inject.Guice;

/** Count words, reading from HDFS, writing to HBase. */
public class HadoopBenchmark {
	static final String TEST_TABLE = HadoopBenchmark.class.getSimpleName();
	// TODO: Saner path handling.
	static final String INPUT_PATH = "hdfs://haddock.unibe.ch/tmp/books";

	/** Count words in file. */
	public static class WordMapper
			extends Mapper<ImmutableBytesWritable, ImmutableBytesWritable, Text, IntWritable> {
		@Override
		public void map(ImmutableBytesWritable key, ImmutableBytesWritable value, Context context)
				throws IOException, InterruptedException {
			String input = new String(value.get(), Charsets.ISO_8859_1);

			// Pre-aggregate counts per input record to cut down on map output.
			Map<String, Integer> counts = new HashMap<>();
			for (String word : input.split("\\s+")) {
				if (word.isEmpty()) {
					continue;
				}
				Integer old = counts.get(word);
				counts.put(word, old == null ? 1 : old + 1);
			}

			for (Entry<String, Integer> e : counts.entrySet()) {
				context.write(new Text(e.getKey()), new IntWritable(e.getValue()));
			}
		}
	}

	/** Add up counts and write one HBase row per word. */
	public static class WordReduce extends Reducer<Text, IntWritable, NullWritable, NullWritable> {
		static final byte[] FAMILY = "f".getBytes(Charsets.UTF_8);
		static final byte[] COLUMN = "c".getBytes(Charsets.UTF_8);

		private HTable htable;

		@Override
		protected void setup(Context context) throws IOException {
			Configuration config
					= Guice.createInjector(new UnibeModule()).getInstance(Configuration.class);
			htable = new HTable(config, TEST_TABLE);
		}

		@Override
		protected void cleanup(Context context) throws IOException {
			htable.close();
		}

		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			long sum = 0;
			for (IntWritable val : values) {
				sum += val.get();
			}

			// The word is the row key; its total count goes into column f:c.
			Put put = new Put(key.toString().getBytes(Charsets.UTF_8));
			put.add(FAMILY, COLUMN, Longs.toByteArray(sum));
			htable.put(put);
		}
	}

	/** Takes no arguments. */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		conf.setInt(MRJobConfig.NUM_REDUCES, 2);

		Job job = Job.getInstance(conf, "hadoop-wordcount-bench");
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		job.setMapperClass(WordMapper.class);
		job.setReducerClass(WordReduce.class);
		job.setInputFormatClass(RawFileFormat.class);
		// The reducer writes straight to HBase, so the job emits no MapReduce output.
		job.setOutputFormatClass(NullOutputFormat.class);
		FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
		job.setJarByClass(HadoopBenchmark.class);

		job.waitForCompletion(true);
	}
}
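
// Usage sketch (not part of the benchmark itself): the job assumes the input
// books already sit under hdfs://haddock.unibe.ch/tmp/books, and that the
// target table exists before the reducers start, since HTable opens but never
// creates it. A plausible way to prepare and run the job; the jar name below
// is hypothetical:
//
//   hbase shell> create 'HadoopBenchmark', 'f'
//   $ hadoop jar cells-benchmarks.jar ch.unibe.scg.cells.benchmarks.HadoopBenchmark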