/**
* Copyright 2011 Thibault Dory
* Licensed under the GPL Version 3 license
*/
package hbase_mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
/**
 * @author Thibault Dory
 *
 * This class implements the two MapReduce phases that build the reverse index:
 * phase 1 collects the IDs of the documents that contain the hardcoded keyword,
 * and phase 2 counts how many times the keyword appears in each matching document.
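 *
 * Assumed table layout (as read from the code below): the input table "myTable"
 * stores one document per row, keyed by a numeric ID, with the text in the
 * myColumnFamily:value column. Phase 1 writes the matching ID list to resultF:docsID
 * in table "result"; phase 2 writes one count column per document ID to table "result2".
 *
 * Example invocation (the jar name is hypothetical; args[0] is the JobTracker host):
 *   hadoop jar reverseIndex.jar hbase_mapreduce.MapReduceHbaseDB jobtracker.example.com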
*/
public class MapReduceHbaseDB {
    // TableMapper<KeyOut, ValueOut>: scans the document table and emits a
    // (keyword, documentID) pair for every occurrence of the keyword.
    public static class Mapper1 extends TableMapper<Text, Text> {
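
        // Illustrative example (hypothetical data): for a row keyed "42" whose
        // myColumnFamily:value column contains "... the location of the station ...",
        // this mapper emits one ("location", "42") pair per occurrence of the keyword.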

        // The keyword is hardcoded for now; I know this is bad, patches welcome :-)
        public String keyword = "location";
        private int numRecords = 0;

        @Override
        public void map(ImmutableBytesWritable row, Result values, Context context)
                throws IOException, InterruptedException {
            String id = Bytes.toString(row.get());
            try {
                // Only rows whose key is a numeric document ID are indexed.
                Integer.valueOf(id);
                Text ID = new Text(id);
                String article = Bytes.toString(values.getValue(Bytes.toBytes("myColumnFamily"), Bytes.toBytes("value")));
                if (article != null) {
                    for (String w : article.split(" ")) {
                        if (w.equalsIgnoreCase(keyword)) {
                            context.write(new Text(w), ID);
                        }
                    }
                }
            } catch (NumberFormatException e) {
                // The row key is not a numeric document ID; skip this row.
            }
            numRecords++;
            if ((numRecords % 10000) == 0) {
                context.setStatus("mapper processed " + numRecords + " records so far");
            }
        }
    }

    // TableReducer<KeyIn, ValueIn, KeyOut>: concatenates all document IDs found for
    // the keyword into one "[id1,id2,...]" string and stores it in the result table.
    public static class Reducer1 extends TableReducer<Text, Text, Text> {
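
        // Illustrative example (hypothetical data): for key "location" with values
        // {"42", "57", "42"}, this reducer writes resultF:docsID = "[42,57,42]" to
        // the row "results" of the output table "result"; duplicate IDs are kept
        // here on purpose, since phase 2 counts them per document.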

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Build "[id1,id2,...]" without a trailing separator so that phase 2
            // can split the string back into valid document IDs.
            StringBuilder index = new StringBuilder("[");
            String separator = "";
            for (Text t : values) {
                index.append(separator).append(t.toString());
                separator = ",";
            }
            index.append("]");
            // All results go to the single row "results" of the output table.
            Put put = new Put(Bytes.toBytes("results"));
            put.add(Bytes.toBytes("resultF"), Bytes.toBytes("docsID"), Bytes.toBytes(index.toString()));
            context.write(key, put);
        }
    }

    // TableMapper for the second phase: reads the ID list built by Reducer1 and
    // emits (documentID, 1) for every occurrence, so Reducer2 can sum them.
    public static class Mapper2 extends TableMapper<Text, IntWritable> {
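
        // Illustrative example (hypothetical data): reading resultF:docsID = "[42,57,42]"
        // from the row written by Reducer1, this mapper emits ("42", 1), ("57", 1), ("42", 1).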

        private int numRecords = 0;

        @Override
        public void map(ImmutableBytesWritable row, Result values, Context context)
                throws IOException, InterruptedException {
            String rawList = Bytes.toString(values.getValue(Bytes.toBytes("resultF"), Bytes.toBytes("docsID")));
            if (rawList != null && rawList.length() >= 2) {
                // Strip the surrounding '[' and ']' before splitting, otherwise the
                // first and last IDs would fail to parse as integers.
                String[] list = rawList.substring(1, rawList.length() - 1).split(",");
                for (String t : list) {
                    try {
                        Integer.valueOf(t);
                        context.write(new Text(t), new IntWritable(1));
                    } catch (NumberFormatException e) {
                        // Not a numeric document ID; skip it.
                    }
                }
            }
            numRecords++;
            if ((numRecords % 10000) == 0) {
                context.setStatus("mapper processed " + numRecords + " records so far");
            }
        }
    }

    // TableReducer<KeyIn, ValueIn, KeyOut>: sums the per-document counts and stores
    // one column per document ID in the second result table.
    public static class Reducer2 extends TableReducer<Text, IntWritable, Text> {
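
        // Illustrative example (hypothetical data): for key "42" with values {1, 1},
        // this reducer writes resultF:42 = 2 (a 4-byte int) to the row "results" of
        // table "result2", meaning the keyword appears twice in document 42.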

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable i : values) {
                sum += i.get();
            }
            // One column per document ID, all in the single row "results".
            Put put = new Put(Bytes.toBytes("results"));
            put.add(Bytes.toBytes("resultF"), Bytes.toBytes(key.toString()), Bytes.toBytes(sum));
            context.write(key, put);
        }
    }

    public static void main(String[] args) throws Exception {
        long t0 = System.nanoTime();
        try {
            // First MapReduce phase setup
            Configuration conf = HBaseConfiguration.create();
            conf.set("mapred.job.tracker", args[0] + ":8021");
            Job job = new Job(conf, "MapReducePhase1");
            job.setJarByClass(MapReduceHbaseDB.class);
            Scan scan = new Scan();
            scan.addFamily(Bytes.toBytes("myColumnFamily"));
            // Fetch rows in large batches to speed up the full-table scan.
            scan.setCaching(10000);

            // Second MapReduce phase setup; presumably it should run on the same
            // JobTracker as the first phase, so set it explicitly here too.
            Configuration conf2 = HBaseConfiguration.create();
            conf2.set("mapred.job.tracker", args[0] + ":8021");
            Job job2 = new Job(conf2, "MapReducePhase2");
            job2.setJarByClass(MapReduceHbaseDB.class);
            Scan scan2 = new Scan();
            scan2.addFamily(Bytes.toBytes("resultF"));
            scan2.setCaching(10000);

            // Execution of the first MapReduce phase
            job.setOutputFormatClass(TableOutputFormat.class);
            TableMapReduceUtil.initTableMapperJob("myTable", scan, Mapper1.class, Text.class,
                    Text.class, job);
            TableMapReduceUtil.initTableReducerJob("result", Reducer1.class, job);
            if (!job.waitForCompletion(true)) {
                System.err.println("First mapreduce phase failed, aborting");
                return;
            }
            long t1 = System.nanoTime();

            // Execution of the second MapReduce phase, reading the table written by the first
            job2.setOutputFormatClass(TableOutputFormat.class);
            TableMapReduceUtil.initTableMapperJob("result", scan2, Mapper2.class, Text.class,
                    IntWritable.class, job2);
            TableMapReduceUtil.initTableReducerJob("result2", Reducer2.class, job2);
            job2.waitForCompletion(true);
            long t2 = System.nanoTime();

            double totalTime = (t2 - t0) / 1000000000.0;
            System.out.println("Total time for the search : " + totalTime + " seconds");
            double firstPhaseTime = (t1 - t0) / 1000000000.0;
            System.out.println("Time for the first mapreduce phase : " + firstPhaseTime + " seconds");
            double secondPhaseTime = (t2 - t1) / 1000000000.0;
            System.out.println("Time for the second mapreduce phase : " + secondPhaseTime + " seconds");
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}