/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2015 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.hbase.mapred;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapred.TableInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
import org.pentaho.di.core.Const;
import org.pentaho.hadoop.shim.api.process.RequiredCredentialsToken;
import org.pentaho.hbase.factory.HBaseClientFactoryLocator;

import java.io.IOException;

/**
 * Extends the mapred TableInputFormat and adds the ability to specify the table to read from via a property (rather
 * than abusing the input path). Also adds more configuration properties (like those in the mapreduce package's
 * implementation).<p>
 * <p/>
 * The following properties can be set in a Pentaho MR job to configure the split:<br><br>
 * <p/>
 * <code>
 * hbase.mapred.inputtable              // name of the HBase table to read from
 * hbase.mapred.tablecolumns            // space-delimited list of columns in ColFam:ColName format
 *                                      // (ColName can be omitted to read all columns from a family)
 * hbase.mapreduce.scan.cachedrows      // number of rows for caching that will be passed to scanners
 * hbase.mapreduce.scan.timestamp       // timestamp used to filter columns with a specific timestamp
 * hbase.mapreduce.scan.timerange.start // starting timestamp to filter in a given timestamp range
 * hbase.mapreduce.scan.timerange.end   // end timestamp to filter in a given timestamp range
 * </code>
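 * <p/>
 * A minimal sketch of wiring these properties into a driver; the table name, column list, and cache size below are
 * hypothetical values chosen purely for illustration:
 * <pre>
 * JobConf job = new JobConf();
 * job.set( "hbase.mapred.inputtable", "weblogs" );               // hypothetical table name
 * job.set( "hbase.mapred.tablecolumns", "metrics:views info:" ); // one column, plus every column in 'info'
 * job.set( "hbase.mapreduce.scan.cachedrows", "500" );           // scanner row caching
 * job.setInputFormat( PentahoTableInputFormat.class );
 * </pre>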
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 */
@RequiredCredentialsToken( RequiredCredentialsToken.Type.HBASE )
public class PentahoTableInputFormat extends TableInputFormat {

  // Note that the hbase.mapred.tablecolumns property is inherited
  // from TableInputFormat. This property expects a space-delimited list
  // of column names to read in the format "ColumnFamily:ColumnName". The
  // ColumnName may be omitted in order to read *all* columns from the
  // specified family

  /**
   * The name of the table to read from
   */
  public static final String INPUT_TABLE = "hbase.mapred.inputtable";

  /**
   * The number of rows (integer) for caching that will be passed to scanners.
   */
  public static final String SCAN_CACHEDROWS = "hbase.mapreduce.scan.cachedrows";

  /**
   * The timestamp (long) used to filter columns with a specific timestamp.
   */
  public static final String SCAN_TIMESTAMP = "hbase.mapreduce.scan.timestamp";

  /**
   * The starting timestamp (long) used to filter columns with a specific range of versions.
   */
  public static final String SCAN_TIMERANGE_START = "hbase.mapreduce.scan.timerange.start";

  /**
   * The ending timestamp (long) used to filter columns with a specific range of versions.
   */
  public static final String SCAN_TIMERANGE_END = "hbase.mapreduce.scan.timerange.end";

  protected final Log PLOG = LogFactory.getLog( PentahoTableInputFormat.class );

  private PentahoTableInputFormat delegate;

  public void configure( JobConf job ) {
    String tableName = job.get( INPUT_TABLE );

    // columns can be colFam:colName or colFam: -
    // the latter form can be used to set up a scan that reads all columns from the family
    String colArg = job.get( COLUMN_LIST );
    if ( !Const.isEmpty( colArg ) ) {
      String[] colNames = colArg.split( " " );
      byte[][] m_cols = new byte[ colNames.length ][];
      for ( int i = 0; i < m_cols.length; i++ ) {
        String colN = colNames[ i ];
        m_cols[ i ] = Bytes.toBytes( colN );
      }
      setInputColumns( m_cols );
    }

    Configuration conf = HBaseConfiguration.create( job );
    delegate = HBaseClientFactoryLocator.getHBaseClientFactory( conf ).getTableInputFormatImpl( this, conf );

    try {
      setHBaseTable( conf, tableName );
    } catch ( Exception e ) {
      PLOG.error( StringUtils.stringifyException( e ) );
    }

    // set our table record reader
    PentahoTableRecordReader rr = createRecordReader( conf );

    String cacheSize = job.get( SCAN_CACHEDROWS );
    if ( !Const.isEmpty( cacheSize ) ) {
      rr.setScanCacheRowSize( Integer.parseInt( cacheSize ) );
    }

    String ts = job.get( SCAN_TIMESTAMP );
    if ( !Const.isEmpty( ts ) ) {
      rr.setTimestamp( Long.parseLong( ts ) );
    }

    String tsStart = job.get( SCAN_TIMERANGE_START );
    String tsEnd = job.get( SCAN_TIMERANGE_END );
    if ( !Const.isEmpty( tsStart ) && !Const.isEmpty( tsEnd ) ) {
      rr.setTimeStampRange( Long.parseLong( tsStart ), Long.parseLong( tsEnd ) );
    }

    setTableRecordReader( rr );
  }

  public void validateInput( JobConf job ) throws IOException {
    // expecting a table name
    String tableName = job.get( INPUT_TABLE );
    if ( Const.isEmpty( tableName ) ) {
      throw new IOException( "expecting one table name" );
    }

    // connected to table?
    if ( !checkHBaseTable() ) {
      throw new IOException( "could not connect to table '" + tableName + "'" );
    }

    // expecting at least one column/column family
    String colArg = job.get( COLUMN_LIST );
    if ( colArg == null || colArg.length() == 0 ) {
      throw new IOException( "expecting at least one column/column family" );
    }
  }

  protected void setHBaseTable( Configuration conf, String tableName ) throws IOException {
    delegate.setHBaseTable( conf, tableName );
  }

  protected boolean checkHBaseTable() {
    return delegate.checkHBaseTable();
  }

  protected PentahoTableRecordReader createRecordReader( Configuration conf ) {
    return delegate.createRecordReader( conf );
  }
}