/**************************
 * Author: Bikash Agrawal
 * Email: er.bikash21@gmail.com
 * Created: 10 May 2013
 * Website: www.bikashagrawal.com.np
 *
 * Description: This class is used to read time-series data from the tsdb table.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableSplit;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.StringUtils;
import org.godhuli.rhipe.RHRaw;
/**
 * Hadoop MapReduce {@code InputFormat} that scans an HBase table (the OpenTSDB
 * "tsdb" schema) and feeds rows to RHIPE as {@code (RHRaw, RHResult)} pairs.
 *
 * <p>An instance is configured through {@link #setConf(Configuration)}, which
 * opens the table named by {@link #INPUT_TABLE} and builds the scan list via
 * {@code Fun.generateScansTbl}. {@link #getSplits(JobContext)} then produces
 * one split per (region, scan) intersection.
 */
public class RHScanTable extends org.apache.hadoop.mapreduce.InputFormat<RHRaw, RHResult>
        implements Configurable {

    private static final Log LOG = LogFactory.getLog(RHScanTable.class);

    // NOTE(review): kept public+mutable for backward compatibility, but
    // setConf() actually writes the RHHBaseRecorder copies, not these.
    public static boolean ValueIsString = false;
    public static boolean SingleCFQ = false;
    public static byte[][][] CFQ;

    /** Job parameter that specifies the input table. */
    public static final String INPUT_TABLE = "rhipe.hbase.tablename";

    /** Job parameter holding the scan batch size (set for better performance). */
    public static final String batch = "rhipe.hbase.set.batch";

    private Configuration conf = null;
    private HTable table = null;
    private Scan[] scans = null;
    private TableRecordReader trr = null;

    /**
     * Builds the record reader for one region split. The first configured scan
     * is used as a template and narrowed to the split's row range.
     *
     * @throws IOException if no scans or no table were configured via setConf()
     */
    @Override
    public RecordReader<RHRaw, RHResult> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        if (scans == null) {
            throw new IOException("No scans were provided");
        }
        if (table == null) {
            throw new IOException("No table was provided.");
        }
        if (trr == null) {
            trr = new TableRecordReader();
        }
        TableSplit tSplit = (TableSplit) split;
        LOG.info("Split in Record Reader " + tSplit);
        // Clone the template scan, then restrict it to this split's key range.
        Scan scan = new Scan(scans[0]);
        scan.setStartRow(tSplit.getStartRow());
        scan.setStopRow(tSplit.getEndRow());
        LOG.info("Table in Record Reader " + Bytes.toStringBinary(tSplit.getTableName()));
        trr.setScan(scan);
        trr.setHTable(table);
        trr.init();
        return trr;
    }

    /**
     * Creates one split per (region, scan) overlap. Each scan whose
     * [startRow, stopRow) range intersects a region yields a split clipped to
     * the region's boundaries and located at the region's server.
     */
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        if (table == null) {
            throw new IOException("No table was provided.");
        }
        Pair<byte[][], byte[][]> keys = table.getStartEndKeys();
        if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
            throw new IOException("Expecting at least one region.");
        }
        // HashSet de-duplicates identical splits produced by overlapping scans.
        Set<InputSplit> splits = new HashSet<InputSplit>();
        for (int i = 0; i < keys.getFirst().length; i++) {
            String regionLocation =
                    table.getRegionLocation(keys.getFirst()[i]).getServerAddress().getHostname();
            for (Scan s : scans) {
                byte[] startRow = s.getStartRow();
                byte[] stopRow = s.getStopRow();
                // Determine whether the given start and stop keys fall into the
                // region; an empty byte[] means "unbounded" on that side.
                if ((startRow.length == 0 || keys.getSecond()[i].length == 0
                        || Bytes.compareTo(startRow, keys.getSecond()[i]) < 0)
                        && (stopRow.length == 0
                                || Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
                    // Clip the scan range to the region boundaries.
                    byte[] splitStart =
                            startRow.length == 0 || Bytes.compareTo(keys.getFirst()[i], startRow) >= 0
                                    ? keys.getFirst()[i] : startRow;
                    byte[] splitStop =
                            (stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0)
                                    && keys.getSecond()[i].length > 0
                                    ? keys.getSecond()[i] : stopRow;
                    InputSplit split =
                            new TableSplit(table.getTableName(), splitStart, splitStop, regionLocation);
                    LOG.info("the current regionInfo's startKey is :" + Bytes.toStringBinary(splitStart)
                            + " , the current regionInfo's endkey is : " + Bytes.toStringBinary(splitStop)
                            + " , the current regionInfo's table is " + Bytes.toStringBinary(table.getTableName())
                            + " , the current regionInfo's regionLocation is :" + regionLocation);
                    splits.add(split);
                }
            }
        }
        return new ArrayList<InputSplit>(splits);
    }

    /** @return the configuration previously set by {@link #setConf(Configuration)}. */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Configures this input format from job properties: opens the input table,
     * copies the value-string / single-CFQ flags into {@code RHHBaseRecorder},
     * and builds the scan list. Caching and the cache-blocks flag come from
     * "rhipe.hbase.mozilla.cacheblocks" (format {@code <caching>:<0|1>}), the
     * batch size from {@link #batch}.
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
        // "TRUE".equals(x) is null-safe: an unset property yields false.
        RHHBaseRecorder.ValueIsString = "TRUE".equals(conf.get("rhipe_hbase_values_are_string"));
        RHHBaseRecorder.SingleCFQ = "TRUE".equals(conf.get("rhipe.hbase.single.cfq"));
        String tableName = conf.get(INPUT_TABLE);
        try {
            setHTable(new HTable(HBaseConfiguration.create(conf), tableName));
        } catch (Exception e) {
            LOG.error(StringUtils.stringifyException(e));
        }
        // NOTE(review): these log lines assume the rowlim/filter properties are
        // set (as the original code did); they throw if a property is missing.
        LOG.info("Start Row Key" + Bytes.toStringBinary(
                org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.start"))));
        LOG.info("End Row Key" + Bytes.toStringBinary(
                org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.rowlim.end"))));
        LOG.info("Filter in my " + Bytes.toStringBinary(
                org.apache.commons.codec.binary.Base64.decodeBase64(conf.get("rhipe.hbase.filter"))));
        String[] x = conf.get("rhipe.hbase.mozilla.cacheblocks").split(":");
        LOG.info("cache " + Integer.parseInt(x[0]) + " block " + Integer.parseInt(x[1]));
        // Fix: removed a dead "new Scan[]{new Scan()}" assignment that was
        // immediately overwritten, and use the declared `batch` constant.
        setScans(Fun.generateScansTbl(
                Integer.parseInt(x[0]),
                Integer.parseInt(x[1]) == 1,
                Integer.parseInt(conf.get(batch))));
    }

    /**
     * Computes the approximate size of the "tsdb" table in kilobytes by
     * scanning every KeyValue client-side. Not an efficient way of
     * calculating — this is a full table scan performed on the client.
     *
     * @param caching scanner caching: rows fetched per RPC (Scan's default of
     *        1 would mean one round trip per row)
     * @param cacheBlocks whether the scan should populate the region server
     *        block cache (keep false for MR-style full scans)
     * @param batch maximum number of columns returned per Result
     * @return approximate table size in kilobytes
     */
    public static int getSizeofFile(int caching, boolean cacheBlocks, int batch) throws IOException {
        Scan s = new Scan();
        s.setBatch(batch);
        s.setCaching(caching);
        // Fix: the cacheBlocks parameter was previously ignored (hard-coded false).
        s.setCacheBlocks(cacheBlocks);
        @SuppressWarnings({ "deprecation", "static-access" })
        Configuration config = new HBaseConfiguration().create();
        HTable tt = new HTable(config, "tsdb");
        int size = 0;
        try {
            ResultScanner ss = tt.getScanner(s);
            try {
                for (Result r : ss) {
                    int col = 0;
                    for (KeyValue kv : r.raw()) {
                        col = col + kv.getLength();
                        // Fix: was a per-KeyValue System.out.print; demoted to debug logging.
                        LOG.debug("Length keyValue " + kv.getLength());
                    }
                    size = size + col / 1000;
                }
            } finally {
                ss.close(); // fix: scanner was never closed
            }
        } finally {
            tt.close(); // fix: table connection was never closed
        }
        LOG.info(" \n Size of HBase in kilo Bytes " + size);
        return size;
    }

    /**
     * Builds a Scan that explicitly selects every (family, qualifier) pair
     * present in the table, discovered by a full client-side scan.
     *
     * @param table the table to get the data from
     * @return a single-element array holding the populated scan
     */
    public static Scan[] getAllColumnQualifier(HTable table) {
        ArrayList<Scan> scans = new ArrayList<Scan>();
        Scan scans2 = new Scan();
        try {
            ResultScanner ss = table.getScanner(scans2);
            try {
                for (Result r : ss) {
                    for (KeyValue kv : r.raw()) {
                        // NOTE(review): addColumn is invoked repeatedly for
                        // recurring pairs; Scan keeps columns in a set, so the
                        // repeats should collapse — confirm against the HBase version in use.
                        scans2.addColumn(kv.getFamily(), kv.getQualifier());
                    }
                }
            } finally {
                ss.close(); // fix: scanner was never closed
            }
        } catch (IOException e) {
            // Fix: was e.printStackTrace(); use the class logger instead.
            LOG.error(StringUtils.stringifyException(e));
        }
        scans.add(scans2);
        return scans.toArray(new Scan[scans.size()]);
    }

    /** Allows subclasses to get the {@link HTable}. */
    protected HTable getHTable() {
        return this.table;
    }

    /**
     * Allows subclasses to set the {@link HTable}.
     *
     * @param table the table to get the data from
     */
    protected void setHTable(HTable table) {
        this.table = table;
    }

    /** @return the scans used as split boundaries, or null if not yet configured. */
    public Scan[] getScans() {
        return scans;
    }

    /** @param scans the scans to use as boundaries */
    public void setScans(Scan[] scans) {
        this.scans = scans;
    }

    /**
     * Iterates over the HBase rows of one split, returning (RHRaw row key,
     * RHResult row) pairs. On a scanner failure (e.g. lease timeout) the
     * reader reopens the scanner just past the last row it delivered.
     */
    protected class TableRecordReader extends RecordReader<RHRaw, RHResult> {

        private ResultScanner scanner = null;
        private Scan scan = null;
        private HTable htable = null;
        // Last row key handed to the caller; used to resume after a scanner failure.
        private byte[] lastRow = null;
        private RHRaw key = null;
        private RHResult value = null;
        private Result oldresult = null;

        /**
         * (Re)opens the scanner starting at the given row, keeping every other
         * setting of the configured scan.
         */
        public void restart(byte[] firstRow) throws IOException {
            Scan newScan = new Scan(scan);
            newScan.setStartRow(firstRow);
            this.scanner = this.htable.getScanner(newScan);
        }

        /** Opens the scanner at the configured scan's start row. */
        public void init() throws IOException {
            restart(scan.getStartRow());
        }

        public void setHTable(HTable htable) {
            this.htable = htable;
        }

        public void setScan(Scan scan) {
            this.scan = scan;
        }

        @Override
        public void close() {
            // Fix: guard against close() being called before init().
            if (this.scanner != null) {
                this.scanner.close();
            }
        }

        @Override
        public RHRaw getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        @Override
        public RHResult getCurrentValue() throws IOException, InterruptedException {
            return value;
        }

        @Override
        public void initialize(InputSplit inputsplit, TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Intentionally empty: the outer class wires scan/table and calls init().
        }

        /**
         * Advances to the next row. If the scanner throws, it is reopened at
         * lastRow and the presumed-already-mapped row is skipped once.
         *
         * @return true if a non-empty row was read
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (key == null) {
                key = new RHRaw();
            }
            if (value == null) {
                value = new RHResult();
                oldresult = new Result();
            }
            try {
                oldresult = this.scanner.next();
                if (oldresult != null) {
                    value.set(oldresult);
                }
            } catch (IOException e) {
                LOG.debug("recovered from " + StringUtils.stringifyException(e));
                restart(lastRow);
                scanner.next(); // skip presumed already mapped row
                oldresult = scanner.next();
                // Fix: the recovery path dereferenced a possibly-null result,
                // which NPE'd when the retry hit the end of the scan.
                if (oldresult != null) {
                    value.set(oldresult);
                }
            }
            if (oldresult != null && oldresult.size() > 0) {
                byte[] _b = oldresult.getRow();
                key.set(_b);
                lastRow = _b;
                return true;
            }
            return false;
        }

        /** Progress is not tracked; that would require the total row count. */
        @Override
        public float getProgress() {
            return 0;
        }
    }
}