// MapReduceHbaseDB.java example
//
// Explorer
// Wikipedia-noSQL-Benchmark-master
// - src
/**
 * Copyright 2011 Thibault Dory
 * Licensed under the GPL Version 3 license
 */

package hbase_mapreduce;

import java.io.IOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

/**
 * This class implements the two MapReduce phases used to build the reverse index.
 *
 * @author Thibault Dory
 */

public class MapReduceHbaseDB {

	// TableMapper<KeyOut, ValueOut>
	/**
	 * First-phase mapper: for every scanned row it reads the cell
	 * {@code myColumnFamily:value}, splits it on single spaces and emits one
	 * {@code (word, rowId)} pair for each word that equals {@link #keyword}
	 * ignoring case.
	 *
	 * <p>Rows whose key is not a decimal integer, and rows without a value in that
	 * cell, are skipped. Failures while writing to the context are propagated
	 * instead of being silently discarded.
	 */
	public static class Mapper1 extends TableMapper<Text, Text> {

		private static final byte[] FAMILY = Bytes.toBytes("myColumnFamily");
		private static final byte[] QUALIFIER = Bytes.toBytes("value");

		//The value of the keyword is hardcoded for now, I know this is bad, patch welcome :-)
		public String keyword = "location";
		private int numRecords = 0;

		/**
		 * Emits {@code (word, rowId)} for each occurrence of the keyword in the row's article.
		 *
		 * @param row     key of the scanned row; only keys that parse as an integer are processed
		 * @param values  scan result holding the article text
		 * @param context MapReduce context receiving the output pairs and status updates
		 * @throws IOException if writing an output pair fails or the write is interrupted
		 */
		@Override
		public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
			String id = Bytes.toString(row.get());
			if (isInteger(id)) {
				byte[] rawArticle = values.getValue(FAMILY, QUALIFIER);
				// A row without the article cell has nothing to index.
				if (rawArticle != null) {
					emitMatches(Bytes.toString(rawArticle), new Text(id), context);
				}
			}

			numRecords++;
			if ((numRecords % 10000) == 0) {
				context.setStatus("mapper processed " + numRecords + " records so far");
			}
		}

		/** Writes one (word, articleId) pair per word of the article matching the keyword. */
		private void emitMatches(String article, Text articleId, Context context) throws IOException {
			for (String word : article.split(" ")) {
				if (!word.equalsIgnoreCase(keyword)) {
					continue;
				}
				try {
					// The word is emitted with its original casing.
					context.write(new Text(word), articleId);
				} catch (InterruptedException e) {
					// map() only declares IOException: keep the interrupt flag and wrap.
					Thread.currentThread().interrupt();
					throw new IOException(e);
				}
			}
		}

		/** Returns true when the text parses as a decimal int (null and empty do not). */
		private static boolean isInteger(String text) {
			try {
				Integer.parseInt(text);
				return true;
			} catch (NumberFormatException notANumber) {
				return false;
			}
		}
	}

	
	//TableReducer<KeyIn, ValueIn, KeyOut>
    public static class Reducer1 extends TableReducer<Text, Text, Text> {

        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
        	String index = "[";
        	for(Text t : values){
        		index += t.toString() + ",";
        	}
        	index += "]";
            Put put = new Put(Bytes.toBytes("results"));
            put.add(Bytes.toBytes("resultF"), Bytes.toBytes("docsID"), Bytes.toBytes(index));
            context.write(key, put);
        }
    }
    
    /**
     * Second-phase mapper: reads the comma-separated ID list stored in
     * {@code resultF:docsID} and emits {@code (id, 1)} for every entry that parses
     * as an integer.
     *
     * <p>One enclosing {@code '['} / {@code ']'} pair is removed before splitting, so
     * the first ID of a list such as {@code [12,45,]} is no longer discarded as the
     * unparsable token {@code "[12"}. Rows without the cell are skipped, and write
     * failures are propagated instead of being swallowed.
     */
    public static class Mapper2 extends TableMapper<Text, IntWritable> {

        private static final byte[] FAMILY = Bytes.toBytes("resultF");
        private static final byte[] QUALIFIER = Bytes.toBytes("docsID");

        private int numRecords = 0;

        /**
         * Emits {@code (id, 1)} for each integer entry of the row's ID list.
         *
         * @param row     key of the scanned row (not used)
         * @param values  scan result holding the ID list
         * @param context MapReduce context receiving the output pairs and status updates
         * @throws IOException if writing an output pair fails or the write is interrupted
         */
        @Override
        public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
            String rawList = Bytes.toString(values.getValue(FAMILY, QUALIFIER));
            if (rawList != null) {
                for (String token : stripBrackets(rawList).split(",")) {
                    // Empty or otherwise non-numeric entries are ignored.
                    if (!isInteger(token)) {
                        continue;
                    }
                    try {
                        context.write(new Text(token), new IntWritable(1));
                    } catch (InterruptedException e) {
                        // map() only declares IOException: keep the interrupt flag and wrap.
                        Thread.currentThread().interrupt();
                        throw new IOException(e);
                    }
                }
            }

            numRecords++;
            if ((numRecords % 10000) == 0) {
                context.setStatus("mapper processed " + numRecords + " records so far");
            }
        }

        /** Removes one leading '[' and one trailing ']' from the list, when present. */
        private static String stripBrackets(String list) {
            int start = list.startsWith("[") ? 1 : 0;
            int end = list.endsWith("]") ? list.length() - 1 : list.length();
            return start < end ? list.substring(start, end) : "";
        }

        /** Returns true when the text parses as a decimal int (null and empty do not). */
        private static boolean isInteger(String text) {
            try {
                Integer.parseInt(text);
                return true;
            } catch (NumberFormatException notANumber) {
                return false;
            }
        }
    }

	
	//TableReducer<KeyIn, ValueIn, KeyOut>
    public static class Reducer2 extends TableReducer<Text, IntWritable, Text> {

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
        	int sum = 0;
        	for(IntWritable i : values){
        		sum += i.get();
        	}
            Put put = new Put(Bytes.toBytes("results"));
            put.add(Bytes.toBytes("resultF"), Bytes.toBytes(key.toString()), Bytes.toBytes(sum));
            System.out.println(key);
            context.write(key, put);
        }
    }
    
    /**
     * Runs the two MapReduce phases one after the other and prints their timings.
     *
     * <p>Phase 1 scans family {@code myColumnFamily} of table {@code myTable} with
     * {@link Mapper1} / {@link Reducer1} and writes to table {@code result}.
     * Phase 2 scans family {@code resultF} of table {@code result} with
     * {@link Mapper2} / {@link Reducer2} and writes to table {@code result2}.
     * Phase 2 is only started when phase 1 succeeded; the process exits with
     * status 1 when an argument is missing or a phase fails.
     *
     * @param args {@code args[0]} is the job tracker host; port {@code 8021} is appended
     * @throws Exception if a job cannot be configured, submitted or awaited
     */
    public static void main(String[] args) throws Exception {
        long t0 = System.nanoTime();

        if (args.length < 1) {
            System.err.println("Usage: MapReduceHbaseDB <job-tracker-host>");
            System.exit(1);
        }
        String jobTracker = args[0] + ":8021";

        //First mapreduce phase setup
        HBaseConfiguration conf = new HBaseConfiguration();
        conf.set("mapred.job.tracker", jobTracker);
        Job job = new Job(conf, "MapReducePhase1");
        job.setJarByClass(MapReduceHbaseDB.class);
        Scan scan = new Scan();
        scan.addColumns("myColumnFamily");
        scan.setCaching(10000);

        //Second mapreduce phase setup
        HBaseConfiguration conf2 = new HBaseConfiguration();
        // Both phases must be submitted to the same job tracker.
        conf2.set("mapred.job.tracker", jobTracker);
        Job job2 = new Job(conf2, "MapReducePhase2");
        job2.setJarByClass(MapReduceHbaseDB.class);
        Scan scan2 = new Scan();
        scan2.addColumns("resultF");
        scan2.setCaching(10000);

        //Execution of the first mapreduce phase
        job.setOutputFormatClass(TableOutputFormat.class);
        TableMapReduceUtil.initTableMapperJob("myTable", scan, Mapper1.class, Text.class,
                Text.class, job);
        TableMapReduceUtil.initTableReducerJob("result", Reducer1.class, job);

        if (!job.waitForCompletion(true)) {
            // Phase 2 reads the output of phase 1, so running it would be meaningless.
            System.err.println("MapReducePhase1 failed, MapReducePhase2 was not started");
            System.exit(1);
        }

        long t2 = System.nanoTime();

        //Execution of the second mapreduce phase
        job2.setOutputFormatClass(TableOutputFormat.class);
        TableMapReduceUtil.initTableMapperJob("result", scan2, Mapper2.class, Text.class,
                IntWritable.class, job2);
        TableMapReduceUtil.initTableReducerJob("result2", Reducer2.class, job2);

        boolean secondPhaseSucceeded = job2.waitForCompletion(true);

        long t1 = System.nanoTime();
        double totalTime = (t1 - t0) / 1000000000.0;
        System.out.println("Total time for the search : " + totalTime + " seconds");

        // Measured from program start, so it includes the setup of both jobs.
        double firstPhaseTime = (t2 - t0) / 1000000000.0;
        System.out.println("Time for the first mapreduce phase : " + firstPhaseTime + " seconds");

        double secondPhaseTime = (t1 - t2) / 1000000000.0;
        System.out.println("Time for the second mapreduce phase : " + secondPhaseTime + " seconds");

        if (!secondPhaseSucceeded) {
            System.err.println("MapReducePhase2 failed");
            System.exit(1);
        }
    }
}