/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2015 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.hbase.mapred;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapred.TableInputFormat;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
import org.pentaho.di.core.Const;
import org.pentaho.hadoop.shim.api.process.RequiredCredentialsToken;
import org.pentaho.hbase.factory.HBaseClientFactoryLocator;

import java.io.IOException;

/**
 * Extends the mapred TableInputFormat and adds the ability to specify the table to read from via a property (rather
 * than abusing the input path). Also adds more configuration properties (like those in the mapreduce package's
 * implementation).<p>
 * <p/>
 * The following properties can be set in a Pentaho MR job to configure the split:<br><br>
 * <p/>
 * <code>
 * hbase.mapred.inputtable              // name of the HBase table to read from
 * hbase.mapred.tablecolumns            // space-delimited list of columns in ColFam:ColName format
 *                                      // (ColName can be omitted to read all columns from a family)
 * hbase.mapreduce.scan.cachedrows      // number of rows for caching that will be passed to scanners
 * hbase.mapreduce.scan.timestamp       // timestamp used to filter columns with a specific timestamp
 * hbase.mapreduce.scan.timerange.start // starting timestamp to filter in a given timestamp range
 * hbase.mapreduce.scan.timerange.end   // end timestamp to filter in a given timestamp range
 * </code>
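 * <p/>
 * A minimal sketch of wiring these properties into a driver; the table name, column list, and cache size below are
 * hypothetical values chosen purely for illustration:
 * <pre>
 * JobConf job = new JobConf();
 * job.set( "hbase.mapred.inputtable", "weblogs" );               // hypothetical table name
 * job.set( "hbase.mapred.tablecolumns", "metrics:views info:" ); // one column, plus every column in 'info'
 * job.set( "hbase.mapreduce.scan.cachedrows", "500" );           // scanner row caching
 * job.setInputFormat( PentahoTableInputFormat.class );
 * </pre>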
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 */
@RequiredCredentialsToken( RequiredCredentialsToken.Type.HBASE )
public class PentahoTableInputFormat extends TableInputFormat {

  // Note that the hbase.mapred.tablecolumns property is inherited
  // from TableInputFormat. This property expects a space-delimited list
  // of column names to read in the format "ColumnFamily:ColumnName". The
  // ColumnName may be omitted in order to read *all* columns from the
  // specified family

  /**
   * The name of the table to read from
   */
  public static final String INPUT_TABLE = "hbase.mapred.inputtable";

  /**
   * The number of rows (integer) for caching that will be passed to scanners.
   */
  public static final String SCAN_CACHEDROWS = "hbase.mapreduce.scan.cachedrows";

  /**
   * The timestamp (long) used to filter columns with a specific timestamp.
   */
  public static final String SCAN_TIMESTAMP = "hbase.mapreduce.scan.timestamp";

  /**
   * The starting timestamp (long) used to filter columns with a specific range of versions.
   */
  public static final String SCAN_TIMERANGE_START = "hbase.mapreduce.scan.timerange.start";

  /**
   * The ending timestamp (long) used to filter columns with a specific range of versions.
   */
  public static final String SCAN_TIMERANGE_END = "hbase.mapreduce.scan.timerange.end";

  protected final Log PLOG = LogFactory.getLog( PentahoTableInputFormat.class );

  private PentahoTableInputFormat delegate;

  public void configure( JobConf job ) {
    String tableName = job.get( INPUT_TABLE );

    // columns can be colFam:colName or colFam: -
    // the latter form can be used to set up a scan that reads all columns from the family
    String colArg = job.get( COLUMN_LIST );
    if ( !Const.isEmpty( colArg ) ) {
      String[] colNames = colArg.split( " " );
      byte[][] m_cols = new byte[ colNames.length ][];
      for ( int i = 0; i < m_cols.length; i++ ) {
        String colN = colNames[ i ];
        m_cols[ i ] = Bytes.toBytes( colN );
      }
      setInputColumns( m_cols );
    }

    Configuration conf = HBaseConfiguration.create( job );
    delegate = HBaseClientFactoryLocator.getHBaseClientFactory( conf ).getTableInputFormatImpl( this, conf );

    try {
      setHBaseTable( conf, tableName );
    } catch ( Exception e ) {
      PLOG.error( StringUtils.stringifyException( e ) );
    }

    // set our table record reader
    PentahoTableRecordReader rr = createRecordReader( conf );

    String cacheSize = job.get( SCAN_CACHEDROWS );
    if ( !Const.isEmpty( cacheSize ) ) {
      rr.setScanCacheRowSize( Integer.parseInt( cacheSize ) );
    }

    String ts = job.get( SCAN_TIMESTAMP );
    if ( !Const.isEmpty( ts ) ) {
      rr.setTimestamp( Long.parseLong( ts ) );
    }

    String tsStart = job.get( SCAN_TIMERANGE_START );
    String tsEnd = job.get( SCAN_TIMERANGE_END );
    if ( !Const.isEmpty( tsStart ) && !Const.isEmpty( tsEnd ) ) {
      rr.setTimeStampRange( Long.parseLong( tsStart ), Long.parseLong( tsEnd ) );
    }

    setTableRecordReader( rr );
  }

  public void validateInput( JobConf job ) throws IOException {
    // expecting a table name
    String tableName = job.get( INPUT_TABLE );
    if ( Const.isEmpty( tableName ) ) {
      throw new IOException( "expecting one table name" );
    }

    // connected to table?
    if ( !checkHBaseTable() ) {
      throw new IOException( "could not connect to table '" + tableName + "'" );
    }

    // expecting at least one column/column family
    String colArg = job.get( COLUMN_LIST );
    if ( colArg == null || colArg.length() == 0 ) {
      throw new IOException( "expecting at least one column/column family" );
    }
  }

  protected void setHBaseTable( Configuration conf, String tableName ) throws IOException {
    delegate.setHBaseTable( conf, tableName );
  }

  protected boolean checkHBaseTable() {
    return delegate.checkHBaseTable();
  }

  protected PentahoTableRecordReader createRecordReader( Configuration conf ) {
    return delegate.createRecordReader( conf );
  }
}