/**************************
 * Author: Bikash Agrawal
 * Email: er.bikash21@gmail.com
 * Created: 10 May 2013
 * Website: www.bikashagrawal.com.np
 *
 * Description: This class is used to read time-series data from the "tsdb" HBase table.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.StringUtils;
import org.godhuli.rhipe.RHRaw;

public class RHScanTable extends org.apache.hadoop.mapreduce.InputFormat<RHRaw, RHResult>
        implements Configurable {

    private static final Log LOG = LogFactory.getLog(RHScanTable.class);

    public static boolean ValueIsString = false;
    public static boolean SingleCFQ = false;
    public static byte[][][] CFQ;

    /** Job parameter that specifies the input table. */
    public static final String INPUT_TABLE = "rhipe.hbase.tablename";

    /** Job parameter that sets the scanner batch size; tune it for better performance. */
    public static final String batch = "rhipe.hbase.set.batch";

    private Configuration conf = null;
    private HTable table = null;
    private Scan[] scans = null;
    private TableRecordReader trr = null;

    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
     */
    @Override
    public RecordReader<RHRaw, RHResult> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        if (scans == null) {
            throw new IOException("No scans were provided.");
        }
        if (table == null) {
            throw new IOException("No table was provided.");
        }
        if (trr == null) {
            trr = new TableRecordReader();
        }
        TableSplit tSplit = (TableSplit) split;
        LOG.info("Split in Record Reader " + tSplit);
        // Restrict the first scan template to the row range covered by this split.
        Scan scan = new Scan(scans[0]);
        scan.setStartRow(tSplit.getStartRow());
        scan.setStopRow(tSplit.getEndRow());
        LOG.info("Table in Record Reader " + Bytes.toStringBinary(tSplit.getTableName()));
        trr.setScan(scan);
        trr.setHTable(table);
        trr.init();
        return trr;
    }
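    /*
     * Worked example (illustrative values only) of the range clipping performed by
     * getSplits() below: each configured Scan's row range is intersected with each
     * region's [startKey, endKey) range, where an empty key means "unbounded".
     * Assuming a region with boundaries ["b", "m") and a scan over ["a", "f"),
     * the resulting split covers ["b", "f"): the later of the two start keys and
     * the earlier of the two stop keys.
     */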
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        if (table == null) {
            throw new IOException("No table was provided.");
        }
        Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            throw new IOException("Expecting at least one region.");
        }
        Set<InputSplit> splits = new HashSet<InputSplit>();
        for (int i = 0; i < keys.getFirst().length; i++) {
            String regionLocation =
                    table.getRegionLocation(keys.getFirst()[i]).getServerAddress().getHostname();
            //LOG.info("Split length " + keys.getFirst().length);
            for (Scan s : scans) {
                byte[] startRow = s.getStartRow();
                byte[] stopRow = s.getStopRow();
                // Determine whether the given start and stop keys fall into the region.
                if ((startRow.length == 0 || keys.getSecond()[i].length == 0
                        || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
                        && (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                    byte[] splitStart = startRow.length == 0
                            || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                            ? keys.getFirst()[i] : startRow;
                    byte[] splitStop = (stopRow.length == 0
                            || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                            && keys.getSecond()[i].length > 0
                            ? keys.getSecond()[i] : stopRow;
                    InputSplit split = new TableSplit(table.getTableName(), splitStart, splitStop, regionLocation);
                    LOG.info("the current regionInfo's startKey is: " + Bytes.toStringBinary(splitStart)
                            + ", the current regionInfo's endKey is: " + Bytes.toStringBinary(splitStop)
                            + ", the current regionInfo's table is " + Bytes.toStringBinary(table.getTableName())
                            + ", the current regionInfo's regionLocation is: " + regionLocation);
                    //LOG.info("Split server =>" + " " + split);
                    splits.add(split);
                }
            }
        }
        return new ArrayList<InputSplit>(splits);
    }

    /* (non-Javadoc)
     * @see org.apache.hadoop.conf.Configurable#getConf()
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /* (non-Javadoc)
     * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        RHHBaseRecorder.ValueIsString = conf.get("rhipe_hbase_values_are_string") != null
                && conf.get("rhipe_hbase_values_are_string").equals("TRUE");
        RHHBaseRecorder.SingleCFQ = conf.get("rhipe.hbase.single.cfq") != null
                && conf.get("rhipe.hbase.single.cfq").equals("TRUE");
        String tableName = conf.get(INPUT_TABLE);
        try {
            setHTable(new HTable(HBaseConfiguration.create(conf), tableName));
        } catch (Exception e) {
            LOG.error(StringUtils.stringifyException(e));
        }
        LOG.info("Start Row Key " + Bytes.toStringBinary(
                org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.start"))));
        LOG.info("End Row Key " + Bytes.toStringBinary(
                org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.end"))));
        LOG.info("Filter " + Bytes.toStringBinary(
                org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.filter"))));
        // Expected form: "<caching>:<blockFlag>", two integers separated by ':'.
        String[] x = conf.get("rhipe.hbase.mozilla.cacheblocks").split(":");
        LOG.info("cache " + Integer.parseInt(x[0]) + " block " + Integer.parseInt(x[1]));
        Scan[] scans = Fun.generateScansTbl(Integer.parseInt(x[0]),
                Integer.parseInt(x[1]) == 1,
                Integer.parseInt(conf.get("rhipe.hbase.set.batch")));
        setScans(scans);
    }
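    /*
     * Example (sketch) of the job-side configuration that setConf() above expects.
     * The key names come from this class; the surrounding driver code, the variable
     * names and the literal values are illustrative assumptions only.
     *
     *   Configuration conf = HBaseConfiguration.create();
     *   conf.set(RHScanTable.INPUT_TABLE, "tsdb");
     *   conf.set("rhipe.hbase.set.batch", "1000");
     *   conf.set("rhipe.hbase.mozilla.cacheblocks", "500:0");   // "<caching>:<blockFlag>" as parsed above
     *   conf.set("rhipe.hbase.rowlim.start", base64Start);      // Base64-encoded start row key
     *   conf.set("rhipe.hbase.rowlim.end", base64End);          // Base64-encoded end row key
     *   conf.set("rhipe.hbase.filter", base64Filter);           // Base64-encoded filter
     *   conf.set("rhipe_hbase_values_are_string", "TRUE");
     *   Job job = new Job(conf);
     *   job.setInputFormatClass(RHScanTable.class);
     */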
    /**
     * Gets the size of the table. Not an efficient way of calculating it: the whole
     * table is scanned and the calculation is done on the client side.
     */
    public static int getSizeofFile(int caching, boolean cacheBlocks, int batch) throws IOException {
        Scan s = new Scan();
        s.setBatch(batch);
        s.setCaching(caching); // 1 is the default in Scan, which will be bad for MapReduce jobs
        s.setCacheBlocks(false); // don't set to true for MR jobs (the cacheBlocks parameter is currently unused)
        Configuration config = HBaseConfiguration.create();
        HTable tt = new HTable(config, "tsdb");
        ResultScanner ss = tt.getScanner(s);
        int col = 0;
        int size = 0;
        for (Result r : ss) {
            col = 0;
            for (KeyValue kv : r.raw()) {
                col = col + kv.getLength();
                System.out.print("\n Length keyValue " + kv.getLength() + "\n");
            }
            size = size + col / 1000;
        }
        ss.close();
        tt.close();
        LOG.info("\n Size of HBase table in kilobytes " + size);
        return size;
    }

    /**
     * Gets the column qualifiers for the whole table, associated with its row keys.
     *
     * @param table
     *          The table to get the data from.
     */
    public static Scan[] getAllColumnQualifier(HTable table) {
        ArrayList<Scan> scans = new ArrayList<Scan>();
        Scan scans2 = new Scan();
        try {
            ResultScanner ss = table.getScanner(scans2);
            for (Result r : ss) {
                for (KeyValue kv : r.raw()) {
                    scans2.addColumn(kv.getFamily(), kv.getQualifier());
                }
            }
            ss.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        scans.add(scans2);
        return scans.toArray(new Scan[scans.size()]);
    }

    /**
     * Allows subclasses to get the {@link HTable}.
     */
    protected HTable getHTable() {
        return this.table;
    }

    /**
     * Allows subclasses to set the {@link HTable}.
     *
     * @param table
     *          The table to get the data from.
     */
    protected void setHTable(HTable table) {
        this.table = table;
    }

    /**
     * @return the scans used as boundaries.
     */
    public Scan[] getScans() {
        return scans;
    }

    /**
     * @param scans The scans to use as boundaries.
     */
    public void setScans(Scan[] scans) {
        this.scans = scans;
    }
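    /*
     * Example (sketch) of calling the client-side helpers above from standalone code.
     * The variable names and argument values are illustrative assumptions, not values
     * required by this class.
     *
     *   // Approximate table size in kilobytes, scanning with caching=500 and batch=1000
     *   int kiloBytes = RHScanTable.getSizeofFile(500, false, 1000);
     *
     *   // Build a Scan that names every column family/qualifier seen in the table
     *   HTable tsdb = new HTable(HBaseConfiguration.create(), "tsdb");
     *   Scan[] columnScans = RHScanTable.getAllColumnQualifier(tsdb);
     */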
    /**
     * Iterates over the HBase table data, returning (RHRaw, RHResult) pairs.
     */
    protected class TableRecordReader extends RecordReader<RHRaw, RHResult> {

        private ResultScanner scanner = null;
        private Scan scan = null;
        private HTable htable = null;
        private byte[] lastRow = null;
        private RHRaw key = null;
        private Result _value = null;
        private RHResult value = null;
        private Result oldresult = null;

        /** Restarts the scanner from the given first row, e.g. after a scanner timeout. */
        public void restart(byte[] firstRow) throws IOException {
            Scan newScan = new Scan(scan);
            newScan.setStartRow(firstRow);
            this.scanner = this.htable.getScanner(newScan);
        }

        public void init() throws IOException {
            restart(scan.getStartRow());
        }

        public void setHTable(HTable htable) {
            this.htable = htable;
        }

        public void setScan(Scan scan) {
            this.scan = scan;
        }

        @Override
        public void close() {
            this.scanner.close();
        }

        @Override
        public RHRaw getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        @Override
        public RHResult getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public void initialize(InputSplit inputsplit, TaskAttemptContext context)
                throws IOException, InterruptedException {
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (key == null) {
                key = new RHRaw();
            }
            if (value == null) {
                value = new RHResult();
                oldresult = new Result();
            }
            try {
                oldresult = this.scanner.next();
                if (oldresult != null) {
                    value.set(oldresult);
                }
            } catch (IOException e) {
                // The scanner may have expired; restart it just after the last row handed out.
                LOG.debug("recovered from " + StringUtils.stringifyException(e));
                restart(lastRow);
                scanner.next(); // skip the presumably already mapped row
                oldresult = scanner.next();
                value.set(oldresult);
            }
            if (oldresult != null && oldresult.size() > 0) {
                byte[] _b = oldresult.getRow();
                key.set(_b);
                lastRow = _b;
                return true;
            }
            return false;
        }

        @Override
        public float getProgress() {
            // Progress would depend on the total number of tuples, which is not known here.
            return 0;
        }
    }
}
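/*
 * Example (sketch) of how the MapReduce framework consumes this InputFormat and its
 * TableRecordReader. The loop below is illustrative only; jobConf, jobContext and
 * taskContext are assumed to be supplied by the caller.
 *
 *   RHScanTable inputFormat = new RHScanTable();
 *   inputFormat.setConf(jobConf);
 *   for (InputSplit split : inputFormat.getSplits(jobContext)) {
 *       RecordReader<RHRaw, RHResult> reader = inputFormat.createRecordReader(split, taskContext);
 *       while (reader.nextKeyValue()) {
 *           RHRaw rowKey = reader.getCurrentKey();
 *           RHResult row = reader.getCurrentValue();
 *           // map(rowKey, row) ...
 *       }
 *       reader.close();
 *   }
 */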