/**
 * Copyright 2010 Mozilla Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Description: Creates a custom InputFormat with the help of a RecordReader. Details can be found
 * at the links below:
 *   https://www.inkling.com/read/hadoop-definitive-guide-tom-white-3rd/chapter-7/input-formats
 *   http://hadoopi.wordpress.com/2013/05/27/understand-recordreader-inputsplit/
 *
 * Implemented using RecordReader concepts.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.StringUtils;
import org.godhuli.rhipe.RHRaw;

public class RHHBaseRecorder extends org.apache.hadoop.mapreduce.InputFormat<RHRaw, RHResult>
    implements Configurable {

  private final static Log LOG = LogFactory.getLog(RHHBaseRecorder.class);

  public static boolean ValueIsString = false;
  public static boolean SingleCFQ = false;
  public static byte[][][] CFQ;

  /** Job parameter that specifies the input table. */
  public static final String INPUT_TABLE = "rhipe.hbase.tablename";
  /** Job parameter that specifies the row filter for the table. */
  public static final String Filter = "rhipe.hbase.filter";
  /** Job parameter for the scan batch setting. SET BATCH for better performance. */
  public static final String batch = "rhipe.hbase.set.batch";
  /** Job parameter for the HBase size calculation. */
  public static final String sizecal = "rhipe.hbase.set.size";
  /** Job parameter that specifies the columns to scan, as a comma-separated list of family[:qualifier] entries. */
  public static final String RHIPE_COLSPEC = "rhipe.hbase.colspec";
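
  /*
   * The keys above are read back in setConf() below. A minimal, hedged sketch of how a driver
   * might populate them (the table name, row keys, and column spec here are illustrative
   * assumptions, not defaults):
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   conf.set(RHHBaseRecorder.INPUT_TABLE, "tsdb");
   *   // start/stop row keys are expected Base64-encoded (setConf decodes them with decodeBase64)
   *   conf.set("rhipe.hbase.rowlim.start",
   *       org.apache.commons.codec.binary.Base64.encodeBase64String(Bytes.toBytes("row-000")));
   *   conf.set("rhipe.hbase.rowlim.end",
   *       org.apache.commons.codec.binary.Base64.encodeBase64String(Bytes.toBytes("row-999")));
   *   conf.set("rhipe.hbase.mozilla.cacheblocks", "1000:0");  // "<caching>:<cacheBlocks, 0 or 1>"
   *   conf.set(RHHBaseRecorder.batch, "100");
   *   conf.set(RHHBaseRecorder.RHIPE_COLSPEC, "t:dl,t:ul");   // comma-separated family[:qualifier]
   */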

  private Configuration conf = null;
  private HTable table = null;
  private Scan[] scans = null;
  private TableRecordReader trr = null;

  /* (non-Javadoc)
   * @see org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache.hadoop.mapreduce.InputSplit, org.apache.hadoop.mapreduce.TaskAttemptContext)
   * http://hadoop.apache.org/docs/current2/api/org/apache/hadoop/mapreduce/InputFormat.html
   *
   * RHRaw and RHResult are the immutable raw-byte key/value formats used by RHIPE. For more details, see RHIPE.
   */
  @Override
  public RecordReader<RHRaw, RHResult> createRecordReader(InputSplit split, TaskAttemptContext context)
      throws IOException, InterruptedException {
    if (scans == null) {
      throw new IOException("No scans were provided.");
    }
    if (table == null) {
      throw new IOException("No table was provided.");
    }
    if (trr == null) {
      trr = new TableRecordReader();
    }
    TableSplit tSplit = (TableSplit) split;
    LOG.info("Split in Record Reader " + tSplit);
    // Narrow the first configured scan to this split's row range.
    Scan scan = new Scan(scans[0]);
    scan.setStartRow(tSplit.getStartRow());
    scan.setStopRow(tSplit.getEndRow());
    //LOG.info("Table in Record Reader " + Bytes.toStringBinary(tSplit.getTableName()));
    trr.setScan(scan);
    trr.setHTable(table);
    trr.init();
    return trr;
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)
   * http://hadoop.apache.org/docs/current2/api/org/apache/hadoop/mapreduce/InputFormat.html
   *
   * Gets the splits between the start and end row keys, one per overlapping region.
   */
  @Override
  public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    if (table == null) {
      throw new IOException("No table was provided.");
    }
    Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
    if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
      throw new IOException("Expecting at least one region.");
    }
    Set<InputSplit> splits = new HashSet<InputSplit>();
    for (int i = 0; i < keys.getFirst().length; i++) {
      String regionLocation = table.getRegionLocation(keys.getFirst()[i]).getServerAddress().getHostname();
      //LOG.info("Split length " + keys.getFirst().length);
      for (Scan s : scans) {
        byte[] startRow = s.getStartRow();
        byte[] stopRow = s.getStopRow();
        // determine if the given start and stop keys fall into the region
        if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
             Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
            (stopRow.length == 0 || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
          byte[] splitStart = startRow.length == 0 ||
              Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
              keys.getFirst()[i] : startRow;
          byte[] splitStop = (stopRow.length == 0 ||
              Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
              keys.getSecond()[i].length > 0 ?
              keys.getSecond()[i] : stopRow;
          //LOG.info("Start Split now " + Bytes.toStringBinary(splitStart));
          //LOG.info("Start stop now " + Bytes.toStringBinary(splitStop));
          //System.out.println("\n================" + Bytes.toLong(splitStart,0));
          //System.out.println("\n================" + splitStart.length);
          //LOG.info(" Start row-***- " + Bytes.toLong(startRow,0));
          //LOG.info(" End row-***- " + Bytes.toLong(stopRow,0));
          InputSplit split = new TableSplit(table.getTableName(), splitStart, splitStop, regionLocation);
          LOG.info("the current regionInfo's startKey is: " + Bytes.toStringBinary(splitStart) +
              ", the current regionInfo's endKey is: " + Bytes.toStringBinary(splitStop) +
              ", the current regionInfo's table is: " + Bytes.toStringBinary(table.getTableName()) +
              ", the current regionInfo's regionLocation is: " + regionLocation);
          //LOG.info("Table Name " + table.getTableName());
          //LOG.info("Split server =>" + " " + split);
          splits.add(split);
        }
      }
    }
    return new ArrayList<InputSplit>(splits);
  }
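
  /*
   * Worked example of the boundary intersection in getSplits() above (keys shown as strings for
   * illustration only): with two regions ["", "m") and ["m", "") and a single scan over
   * ["c", "r"), the scan overlaps both regions, so two TableSplits are produced:
   *   region ["", "m")  ->  split ["c", "m")   (splitStart = scan start, splitStop = region end)
   *   region ["m", "")  ->  split ["m", "r")   (splitStart = region start, splitStop = scan stop)
   * Each split is tagged with the host serving its region so the map task can run data-local.
   */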

  /* (non-Javadoc)
   * @see org.apache.hadoop.conf.Configurable#getConf()
   */
  @Override
  public Configuration getConf() {
    return conf;
  }

  /* (non-Javadoc)
   * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    RHHBaseRecorder.ValueIsString = conf.get("rhipe_hbase_values_are_string") != null
        && conf.get("rhipe_hbase_values_are_string").equals("TRUE");
    RHHBaseRecorder.SingleCFQ = conf.get("rhipe.hbase.single.cfq") != null
        && conf.get("rhipe.hbase.single.cfq").equals("TRUE");
    String tableName = conf.get(INPUT_TABLE);
    try {
      setHTable(new HTable(HBaseConfiguration.create(conf), tableName));
    } catch (Exception e) {
      LOG.error(StringUtils.stringifyException(e));
    }
    Scan[] scans = null;
    if (conf.get(RHIPE_COLSPEC) != null) {
      try {
        // Parse the comma-separated family[:qualifier] column spec.
        String[] cols = conf.get(RHIPE_COLSPEC).split(",");
        ArrayList<Pair<String, String>> l = null;
        if (cols.length > 0) {
          l = new ArrayList<Pair<String, String>>(cols.length);
          for (int i = 0; i < cols.length; i++) {
            String[] x = cols[i].split(":");
            if (x.length == 1) {
              l.add(new Pair<String, String>(x[0], null));
              LOG.info("Added family: " + x[0]);
            } else {
              l.add(new Pair<String, String>(x[0], x[1]));
              LOG.info("Added " + x[0] + ":" + x[1]);
            }
          }
        }
        String[] x = conf.get("rhipe.hbase.mozilla.cacheblocks").split(":");
        scans = Fun.generateScans(conf.get("rhipe.hbase.rowlim.start"),
            conf.get("rhipe.hbase.rowlim.end"), l,
            Integer.parseInt(x[0]), Integer.parseInt(x[1]) == 1);
      } catch (Exception e) {
        LOG.error("An error occurred.", e);
      }
    } else {
      scans = new Scan[] { new Scan() };
      LOG.info("Start Row Key " + Bytes.toStringBinary(
          org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.start"))));
      LOG.info("End Row Key " + Bytes.toStringBinary(
          org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.end"))));
      //LOG.info("Filter in " + Bytes.toStringBinary(org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.filter"))));
      //LOG.info("Filter out " + conf.get("rhipe.hbase.filter"));
      String[] x = conf.get("rhipe.hbase.mozilla.cacheblocks").split(":");
      LOG.info("cache " + Integer.parseInt(x[0]) + " block " + Integer.parseInt(x[1]));
      scans = Fun.generateScansRows(conf.get("rhipe.hbase.rowlim.start"),
          conf.get("rhipe.hbase.rowlim.end"),
          Integer.parseInt(x[0]), Integer.parseInt(x[1]) == 1,
          conf.get("rhipe.hbase.filter"),
          Integer.parseInt(conf.get("rhipe.hbase.set.batch")));
      //scans = getAllColumnQualifier(table);
    }
    setScans(scans);
  }
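
  /*
   * The "rhipe.hbase.filter" value passed to Fun.generateScansRows in setConf() above is also
   * consumed by getSizeofFile() below, where it is Base64-decoded and applied as a row-key regex
   * via RowFilter + RegexStringComparator. A hedged sketch of how a client might set it (the
   * pattern itself is an illustrative assumption):
   *
   *   String pattern = "^row-00[0-9]$";
   *   conf.set(RHHBaseRecorder.Filter,
   *       org.apache.commons.codec.binary.Base64.encodeBase64String(Bytes.toBytes(pattern)));
   */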

  public static int getSizeofFile(String st, String en, int caching, boolean cacheBlocks,
      String filter, int batch) throws IOException {
    Scan s = new Scan();
    s.setCacheBlocks(false); // don't set to true for MR jobs
    LOG.info(" Calculation of File size in hbase at client side. ------ ");
    if (st != null) {
      byte[] stb1 = org.apache.commons.codec.binary.Base64.decodeBase64(st);
      //LOG.info(" Start row in ------ " + Bytes.toStringBinary(stb1));
      s.setStartRow(stb1);
    }
    if (en != null) {
      byte[] enb2 = org.apache.commons.codec.binary.Base64.decodeBase64(en);
      //LOG.info(" End row in ------ " + Bytes.toStringBinary(enb2));
      s.setStopRow(enb2);
    }
    //LOG.info(" Filter ------ " + filter);
    RowFilter rowFilterRegex = new RowFilter(CompareFilter.CompareOp.EQUAL,
        new RegexStringComparator(
            Bytes.toString(org.apache.commons.codec.binary.Base64.decodeBase64(filter))));
    s.setFilter(rowFilterRegex);
    HTable tt = new HTable(HBaseConfiguration.create(), "tsdb");
    ResultScanner ss = tt.getScanner(s);
    int col = 0;
    int size = 0;
    for (Result r : ss) {
      col = 0;
      for (KeyValue kv : r.raw()) {
        col = col + kv.getLength();
        //System.out.print("\n Length keyValue " + kv.getLength() + "\n");
      }
      size = size + col / 1000;
    }
    System.out.print("\n Size " + size + "\n");
    return size;
  }

  public static Scan[] getAllColumnQualifier(HTable table) {
    ArrayList<Scan> scans = new ArrayList<Scan>();
    Scan scans2 = new Scan();
    try {
      ResultScanner ss = table.getScanner(scans2);
      for (Result r : ss) {
        for (KeyValue kv : r.raw()) {
          //System.out.print("\n Rowkey " + org.apache.commons.codec.binary.Base64.encodeBase64String(kv.getRow()));
          //System.out.print(new String(kv.getFamily()) + ":");
          //System.out.print(org.apache.commons.codec.binary.Base64.encodeBase64String(kv.getQualifier()) + " ");
          //s.addFamily(kv.getFamily());
          scans2.addColumn(kv.getFamily(), kv.getQualifier());
        }
      }
      //return s;
    } catch (IOException e) {
      e.printStackTrace();
    }
    scans.add(scans2);
    return scans.toArray(new Scan[scans.size()]);
  }

  /**
   * Allows subclasses to get the {@link HTable} (the HBase table the data is read from).
   */
  protected HTable getHTable() {
    return this.table;
  }

  /**
   * Allows subclasses to set the {@link HTable}.
   *
   * @param table The table to get the data from; in our case it is normally tsdb.
   */
  protected void setHTable(HTable table) {
    this.table = table;
  }

  /**
   * @return the scans used as boundaries.
   */
  public Scan[] getScans() {
    return scans;
  }

  /**
   * @param scans The scans to use as boundaries.
   */
  public void setScans(Scan[] scans) {
    this.scans = scans;
  }
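
  /*
   * getSizeofFile() above can be called client side to estimate, in KB, the data between two
   * Base64-encoded row keys whose rows match a Base64-encoded row-key regex; it opens its own
   * HTable on "tsdb". A hedged usage sketch (the argument values are illustrative assumptions;
   * note the caching, cacheBlocks and batch arguments are not currently applied to the scan):
   *
   *   int kb = RHHBaseRecorder.getSizeofFile(
   *       org.apache.commons.codec.binary.Base64.encodeBase64String(Bytes.toBytes("row-000")),
   *       org.apache.commons.codec.binary.Base64.encodeBase64String(Bytes.toBytes("row-999")),
   *       1000, false,
   *       org.apache.commons.codec.binary.Base64.encodeBase64String(Bytes.toBytes("^row-.*$")),
   *       100);
   */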

  /**
   * Iterates over HBase table data, returning (RHRaw, RHResult) pairs.
   */
  protected class TableRecordReader extends RecordReader<RHRaw, RHResult> {

    private ResultScanner scanner = null;
    private Scan scan = null;
    private HTable htable = null;
    private byte[] lastRow = null;
    private RHRaw key = null;
    private RHResult value = null;
    private Result oldresult = null;

    /** Restarts the scanner from the given first row (used to recover from scanner failures). */
    public void restart(byte[] firstRow) throws IOException {
      Scan newScan = new Scan(scan);
      newScan.setStartRow(firstRow);
      this.scanner = this.htable.getScanner(newScan);
    }

    public void init() throws IOException {
      restart(scan.getStartRow());
    }

    public void setHTable(HTable htable) {
      this.htable = htable;
    }

    public void setScan(Scan scan) {
      this.scan = scan;
    }

    public void close() {
      this.scanner.close();
    }

    public RHRaw getCurrentKey() throws IOException, InterruptedException {
      return key;
    }

    public RHResult getCurrentValue() throws IOException, InterruptedException {
      return value;
    }

    public void initialize(InputSplit inputsplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    }

    public boolean nextKeyValue() throws IOException, InterruptedException {
      if (key == null) key = new RHRaw();
      if (value == null) {
        value = new RHResult();
        oldresult = new Result();
      }
      try {
        oldresult = this.scanner.next();
        if (oldresult != null) {
          value.set(oldresult);
        }
      } catch (IOException e) {
        // The scanner may have failed; restart it just after the last row already handed out.
        //LOG.debug("recovered from " + StringUtils.stringifyException(e));
        restart(lastRow);
        scanner.next(); // skip presumed already mapped row
        oldresult = scanner.next();
        if (oldresult != null) {
          value.set(oldresult);
        }
      }
      if (oldresult != null && oldresult.size() > 0) {
        byte[] _b = oldresult.getRow();
        key.set(_b);
        lastRow = _b;
        return true;
      }
      return false;
    }

    public float getProgress() {
      return 0;
    }
  }
}
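
/*
 * A minimal, hedged sketch of wiring this InputFormat into a MapReduce job (RHIPE normally drives
 * this from R, so the job name and table below are illustrative assumptions):
 *
 *   Configuration conf = HBaseConfiguration.create();
 *   conf.set(RHHBaseRecorder.INPUT_TABLE, "tsdb");
 *   // ... set the remaining rhipe.hbase.* parameters shown near the top of this file ...
 *   Job job = new Job(conf, "rhipe-hbase-scan");
 *   job.setInputFormatClass(RHHBaseRecorder.class);
 *   // the mapper then receives RHRaw keys (row keys) and RHResult values (the scanned rows)
 */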