TableMapReduceUtil.java example

Explorer

CCIndex_HBase_0.90.0-master
- src
  - main
    - java
      - org
        apache
        hadoop
        hbase
        Abortable.java
        CCIndexTestCase.java
        Chore.java
        ClockOutOfSyncException.java
        ClusterStatus.java
        DoNotRetryIOException.java
        DroppedSnapshotException.java
        HBaseConfiguration.java
        HColumnDescriptor.java
        HConstants.java
        HMsg.java
        HRegionInfo.java
        HRegionLocation.java
        HServerAddress.java
        HServerInfo.java
        HServerLoad.java
        HTableDescriptor.java
        InvalidFamilyOperationException.java
        KeyValue.java
        LocalHBaseCluster.java
        MasterAddressTracker.java
        MasterNotRunningException.java
        NotAllMetaRegionsOnlineException.java
        NotServingRegionException.java
        PleaseHoldException.java
        RegionException.java
        RemoteExceptionHandler.java
        Server.java
        Stoppable.java
        TableExistsException.java
        TableNotDisabledException.java
        TableNotFoundException.java
        UnknownRegionException.java
        UnknownRowLockException.java
        UnknownScannerException.java
        VersionAnnotation.java
        YouAreDeadException.java
        ZooKeeperConnectionException.java
        avro
        AvroServer.java
        AvroUtil.java
        generated
        AAlreadyExists.java
        AClusterStatus.java
        AColumn.java
        AColumnFamilyDescriptor.java
        AColumnValue.java
        ACompressionAlgorithm.java
        ADelete.java
        AFamilyDescriptor.java
        AGet.java
        AIOError.java
        AIllegalArgument.java
        AMasterNotRunning.java
        APut.java
        ARegionLoad.java
        AResult.java
        AResultEntry.java
        AScan.java
        AServerAddress.java
        AServerInfo.java
        AServerLoad.java
        ATableDescriptor.java
        ATableExists.java
        ATimeRange.java
        HBase.java
        IOError.java
        TCell.java
        catalog
        CatalogTracker.java
        MetaEditor.java
        MetaReader.java
        RootLocationEditor.java
        client
        Action.java
        Delete.java
        Get.java
        HBaseAdmin.java
        HConnection.java
        HConnectionManager.java
        HTable.java
        HTableFactory.java
        HTableInterface.java
        HTableInterfaceFactory.java
        HTablePool.java
        Increment.java
        MetaScanner.java
        MultiAction.java
        MultiPut.java
        MultiPutResponse.java
        MultiResponse.java
        NoServerForRegionException.java
        Put.java
        RegionOfflineException.java
        Result.java
        ResultScanner.java
        RetriesExhaustedException.java
        RetriesExhaustedWithDetailsException.java
        Row.java
        RowLock.java
        Scan.java
        ScannerCallable.java
        ScannerTimeoutException.java
        ServerCallable.java
        UnmodifyableHColumnDescriptor.java
        UnmodifyableHRegionInfo.java
        UnmodifyableHTableDescriptor.java
        ccindex
        CCIndexAdmin.java
        CCIndexConstants.java
        CCIndexDescriptor.java
        HTable.java
        IndexKeyGenerator.java
        IndexNotFoundException.java
        IndexSpecification.java
        IndexSpecificationArray.java
        IndexedTable.java
        Optimizer.java
        Range.java
        ResultReader.java
        SimpleIndexKeyGenerator.java
        SimpleOptimizer.java
        SingleReader.java
        Utilities.java
        test.java
        package-info.java
        replication
        ReplicationAdmin.java
        executor
        EventHandler.java
        ExecutorService.java
        RegionTransitionData.java
        filter
        BinaryComparator.java
        BinaryPrefixComparator.java
        ColumnCountGetFilter.java
        ColumnPaginationFilter.java
        ColumnPrefixFilter.java
        CompareFilter.java
        DependentColumnFilter.java
        FamilyFilter.java
        Filter.java
        FilterBase.java
        FilterList.java
        FirstKeyOnlyFilter.java
        InclusiveStopFilter.java
        IncompatibleFilterException.java
        InvalidRowFilterException.java
        KeyOnlyFilter.java
        PageFilter.java
        PrefixFilter.java
        QualifierFilter.java
        RegexStringComparator.java
        RowFilter.java
        SingleColumnValueExcludeFilter.java
        SingleColumnValueFilter.java
        SkipFilter.java
        SubstringComparator.java
        TimestampsFilter.java
        ValueFilter.java
        WhileMatchFilter.java
        WritableByteArrayComparable.java
        package-info.java
        io
        CodeToClassAndBack.java
        HalfStoreFileReader.java
        HbaseMapWritable.java
        HbaseObjectWritable.java
        HeapSize.java
        ImmutableBytesWritable.java
        Reference.java
        TimeRange.java
        WritableWithSize.java
        hfile
        BlockCache.java
        BoundedRangeFileInputStream.java
        CachedBlock.java
        CachedBlockQueue.java
        Compression.java
        HFile.java
        HFileScanner.java
        LruBlockCache.java
        SimpleBlockCache.java
        ipc
        ByteBufferOutputStream.java
        HBaseClient.java
        HBaseRPC.java
        HBaseRPCErrorHandler.java
        HBaseRPCProtocolVersion.java
        HBaseRPCStatistics.java
        HBaseRpcMetrics.java
        HBaseServer.java
        HMasterInterface.java
        HMasterRegionInterface.java
        HRegionInterface.java
        ServerNotRunningException.java
        mapred
        Driver.java
        GroupingTableMap.java
        HRegionPartitioner.java
        IdentityTableMap.java
        IdentityTableReduce.java
        RowCounter.java
        TableInputFormat.java
        TableInputFormatBase.java
        TableMap.java
        TableMapReduceUtil.java
        TableOutputFormat.java
        TableRecordReader.java
        TableRecordReaderImpl.java
        TableReduce.java
        TableSplit.java
        package-info.java
        mapreduce
        CopyTable.java
        Driver.java
        Export.java
        GroupingTableMapper.java
        HFileOutputFormat.java
        HRegionPartitioner.java
        IdentityTableMapper.java
        IdentityTableReducer.java
        Import.java
        ImportTsv.java
        KeyValueSortReducer.java
        LoadIncrementalHFiles.java
        MultiTableOutputFormat.java
        PutSortReducer.java
        RowCounter.java
        SimpleTotalOrderPartitioner.java
        TableInputFormat.java
        TableInputFormatBase.java
        TableMapReduceUtil.java
        TableMapper.java
        TableOutputCommitter.java
        TableOutputFormat.java
        TableRecordReader.java
        TableRecordReaderImpl.java
        TableReducer.java
        TableSplit.java
        hadoopbackport
        InputSampler.java
        TotalOrderPartitioner.java
        package-info.java
        replication
        VerifyReplication.java
        master
        ActiveMasterManager.java
        AssignmentManager.java
        BulkAssigner.java
        CatalogJanitor.java
        DeadServer.java
        HMaster.java
        HMasterCommandLine.java
        LoadBalancer.java
        LogCleaner.java
        LogCleanerDelegate.java
        MasterFileSystem.java
        MasterServices.java
        ServerManager.java
        TimeToLiveLogCleaner.java
        handler
        ClosedRegionHandler.java
        DeleteTableHandler.java
        DisableTableHandler.java
        EnableTableHandler.java
        MetaServerShutdownHandler.java
        ModifyTableHandler.java
        OpenedRegionHandler.java
        ServerShutdownHandler.java
        TableAddFamilyHandler.java
        TableDeleteFamilyHandler.java
        TableEventHandler.java
        TableModifyFamilyHandler.java
        TotesHRegionInfo.java
        metrics
        MasterMetrics.java
        MasterStatistics.java
        metrics
        HBaseInfo.java
        MetricsMBeanBase.java
        MetricsRate.java
        MetricsString.java
        PersistentMetricsTimeVaryingRate.java
        file
        TimeStampingFileContext.java
        regionserver
        ChangedReadersObserver.java
        ColumnCount.java
        ColumnTracker.java
        CompactSplitThread.java
        CompactionRequestor.java
        DebugPrint.java
        DeleteTracker.java
        ExplicitColumnTracker.java
        FlushRequester.java
        GetClosestRowBeforeTracker.java
        HRegion.java
        HRegionServer.java
        HRegionServerCommandLine.java
        InternalScan.java
        InternalScanner.java
        KeyValueHeap.java
        KeyValueScanner.java
        KeyValueSkipListSet.java
        LeaseException.java
        LeaseListener.java
        Leases.java
        LogRoller.java
        LruHashMap.java
        MemStore.java
        MemStoreFlusher.java
        NoSuchColumnFamilyException.java
        OnlineRegions.java
        PriorityCompactionQueue.java
        ReadWriteConsistencyControl.java
        RegionServerRunningException.java
        RegionServerServices.java
        RegionServerStoppedException.java
        ScanDeleteTracker.java
        ScanQueryMatcher.java
        ScanWildcardColumnTracker.java
        ShutdownHook.java
        SplitTransaction.java
        Store.java
        StoreFile.java
        StoreFileScanner.java
        StoreFlusher.java
        StoreScanner.java
        TimeRangeTracker.java
        WrongRegionException.java
        ccindex
        ByteUtil.java
        Checker.java
        CheckerMaster.java
        Flusher.java
        IndexMaintenanceException.java
        IndexMaintenanceUtils.java
        IndexedRegion.java
        IndexedRegionServer.java
        handler
        CloseMetaHandler.java
        CloseRegionHandler.java
        CloseRootHandler.java
        OpenMetaHandler.java
        OpenRegionHandler.java
        OpenRootHandler.java
        metrics
        RegionServerMetrics.java
        RegionServerStatistics.java
        wal
        FailedLogCloseException.java
        HLog.java
        HLogKey.java
        HLogSplitter.java
        OrphanHLogAfterSplitException.java
        SequenceFileLogReader.java
        SequenceFileLogWriter.java
        WALEdit.java
        WALObserver.java
        replication
        ReplicationPeer.java
        ReplicationZookeeper.java
        master
        ReplicationLogCleaner.java
        regionserver
        Replication.java
        ReplicationSink.java
        ReplicationSinkMetrics.java
        ReplicationSource.java
        ReplicationSourceInterface.java
        ReplicationSourceManager.java
        ReplicationSourceMetrics.java
        ReplicationStatistics.java
        rest
        Constants.java
        ExistsResource.java
        Main.java
        ProtobufMessageHandler.java
        RESTServlet.java
        RegionsResource.java
        ResourceBase.java
        ResourceConfig.java
        ResultGenerator.java
        RootResource.java
        RowResource.java
        RowResultGenerator.java
        RowSpec.java
        ScannerInstanceResource.java
        ScannerResource.java
        ScannerResultGenerator.java
        SchemaResource.java
        StorageClusterStatusResource.java
        StorageClusterVersionResource.java
        TableResource.java
        VersionResource.java
        client
        Client.java
        Cluster.java
        RemoteAdmin.java
        RemoteHTable.java
        Response.java
        filter
        GZIPRequestStream.java
        GZIPRequestWrapper.java
        GZIPResponseStream.java
        GZIPResponseWrapper.java
        GzipFilter.java
        metrics
        RESTMetrics.java
        RESTStatistics.java
        model
        CellModel.java
        CellSetModel.java
        ColumnSchemaModel.java
        RowModel.java
        ScannerModel.java
        StorageClusterStatusModel.java
        StorageClusterVersionModel.java
        TableInfoModel.java
        TableListModel.java
        TableModel.java
        TableRegionModel.java
        TableSchemaModel.java
        VersionModel.java
        protobuf
        generated
        CellMessage.java
        CellSetMessage.java
        ColumnSchemaMessage.java
        ScannerMessage.java
        StorageClusterStatusMessage.java
        TableInfoMessage.java
        TableListMessage.java
        TableSchemaMessage.java
        VersionMessage.java
        provider
        JAXBContextResolver.java
        consumer
        ProtobufMessageBodyConsumer.java
        producer
        PlainTextMessageBodyProducer.java
        ProtobufMessageBodyProducer.java
        transform
        Base64.java
        NullTransform.java
        Transform.java
        security
        User.java
        thrift
        ThriftServer.java
        ThriftUtilities.java
        generated
        AlreadyExists.java
        BatchMutation.java
        ColumnDescriptor.java
        Hbase.java
        IOError.java
        IllegalArgument.java
        Mutation.java
        TCell.java
        TRegionInfo.java
        TRowResult.java
        util
        Base64.java
        BloomFilter.java
        ByteBloomFilter.java
        Bytes.java
        ClassSize.java
        CompressionTest.java
        DefaultEnvironmentEdge.java
        DynamicByteBloomFilter.java
        EnvironmentEdge.java
        EnvironmentEdgeManager.java
        FSUtils.java
        FileSystemVersionException.java
        HBaseConfTool.java
        HBaseFsck.java
        HBaseFsckRepair.java
        HMerge.java
        Hash.java
        IncrementingEnvironmentEdge.java
        InfoServer.java
        JVMClusterUtil.java
        JenkinsHash.java
        JvmVersion.java
        Keying.java
        MD5Hash.java
        ManualEnvironmentEdge.java
        Merge.java
        MetaUtils.java
        MurmurHash.java
        Pair.java
        PairOfSameType.java
        ServerCommandLine.java
        Sleeper.java
        SoftValueSortedMap.java
        Strings.java
        Threads.java
        VersionInfo.java
        Writables.java
        zookeeper
        ClusterStatusTracker.java
        HQuorumPeer.java
        MetaNodeTracker.java
        MiniZooKeeperCluster.java
        RegionServerTracker.java
        RootRegionTracker.java
        ZKAssign.java
        ZKConfig.java
        ZKServerTool.java
        ZKTable.java
        ZKTableDisable.java
        ZKUtil.java
        ZooKeeperListener.java
        ZooKeeperMainServerArg.java
        ZooKeeperNodeTracker.java
        ZooKeeperWatcher.java

/**
 * Copyright 2008 The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;

/**
 * Utility for {@link TableMapper} and {@link TableReducer}
 */
@SuppressWarnings("unchecked")
public class TableMapReduceUtil {
  /** Class-wide logger; declared final so it cannot be reassigned at runtime. */
  static final Log LOG = LogFactory.getLog(TableMapReduceUtil.class);
  
  /**
   * Prepares a job whose map phase reads from an HBase table. Call this
   * before submitting the job.
   * <p>
   * Convenience overload of
   * {@link #initTableMapperJob(String, Scan, Class, Class, Class, Job, boolean)}
   * that always requests the dependency jars to be added.
   *
   * @param table  Name of the table to read from.
   * @param scan  Scan describing the columns, time range etc. to read.
   * @param mapper  Mapper class to run.
   * @param outputKeyClass  Class of the map output key.
   * @param outputValueClass  Class of the map output value.
   * @param job  Job to configure.  It must already carry all necessary
   * HBase configuration.
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job)
  throws IOException {
    final boolean shipDependencyJars = true;
    initTableMapperJob(table, scan, mapper, outputKeyClass, outputValueClass,
        job, shipDependencyJars);
  }

  /**
   * Prepares a job whose map phase reads from an HBase table. Call this
   * before submitting the job.
   * <p>
   * Sets {@link TableInputFormat} as the input format, registers the mapper,
   * and stores the table name and the Base64-serialized scan in the job
   * configuration.  The map output key and value classes are only set when
   * the corresponding argument is not <code>null</code>.
   *
   * @param table  Name of the table to read from.
   * @param scan  Scan describing the columns, time range etc. to read.
   * @param mapper  Mapper class to run.
   * @param outputKeyClass  Class of the map output key; may be
   * <code>null</code> to leave the job's setting untouched.
   * @param outputValueClass  Class of the map output value; may be
   * <code>null</code> to leave the job's setting untouched.
   * @param job  Job to configure.  It must already carry all necessary
   * HBase configuration.
   * @param addDependencyJars  When <code>true</code>, upload HBase jars and
   * jars for any of the configured job classes via the distributed cache
   * (tmpjars).
   * @throws IOException When setting up the details fails.
   */
  public static void initTableMapperJob(String table, Scan scan,
      Class<? extends TableMapper> mapper,
      Class<? extends WritableComparable> outputKeyClass,
      Class<? extends Writable> outputValueClass, Job job,
      boolean addDependencyJars)
  throws IOException {
    job.setInputFormatClass(TableInputFormat.class);
    if (outputValueClass != null) {
      job.setMapOutputValueClass(outputValueClass);
    }
    if (outputKeyClass != null) {
      job.setMapOutputKeyClass(outputKeyClass);
    }
    job.setMapperClass(mapper);

    // The input format picks the table and the scan up from the configuration.
    job.getConfiguration().set(TableInputFormat.INPUT_TABLE, table);
    String encodedScan = convertScanToString(scan);
    job.getConfiguration().set(TableInputFormat.SCAN, encodedScan);

    if (!addDependencyJars) {
      return;
    }
    addDependencyJars(job);
  }

  /**
   * Serializes a scan and encodes the resulting bytes as a Base64 string.
   *
   * @param scan  The scan to serialize.
   * @return The Base64 text holding the serialized scan.
   * @throws IOException When writing the scan fails.
   */
  static String convertScanToString(Scan scan) throws IOException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    scan.write(new DataOutputStream(buffer));
    byte[] serialized = buffer.toByteArray();
    return Base64.encodeBytes(serialized);
  }

  /**
   * Rebuilds a Scan from the Base64 text holding its serialized form.
   *
   * @param base64  The Base64 encoded scan details.
   * @return A new Scan instance populated from the decoded bytes.
   * @throws IOException When reading the scan instance fails.
   */
  static Scan convertStringToScan(String base64) throws IOException {
    byte[] serialized = Base64.decode(base64);
    Scan decoded = new Scan();
    decoded.readFields(
        new DataInputStream(new ByteArrayInputStream(serialized)));
    return decoded;
  }

  /**
   * Prepares a job whose reduce phase writes to an HBase table. Call this
   * before submitting the job.
   * <p>
   * Convenience overload that passes <code>null</code> as the partitioner
   * to {@link #initTableReducerJob(String, Class, Job, Class)}.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
    Class<? extends TableReducer> reducer, Job job)
  throws IOException {
    Class defaultPartitioner = null;
    initTableReducerJob(table, reducer, job, defaultPartitioner);
  }

  /**
   * Prepares a job whose reduce phase writes to an HBase table. Call this
   * before submitting the job.
   * <p>
   * Convenience overload that passes <code>null</code> for the quorum
   * address, the region server class and the region server implementation.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
    Class<? extends TableReducer> reducer, Job job,
    Class partitioner) throws IOException {
    String noQuorumAddress = null;
    String noServerClass = null;
    String noServerImpl = null;
    initTableReducerJob(table, reducer, job, partitioner, noQuorumAddress,
        noServerClass, noServerImpl);
  }

  /**
   * Prepares a job whose reduce phase writes to an HBase table. Call this
   * before submitting the job.
   * <p>
   * Convenience overload that always requests the dependency jars to be
   * added.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   * output to the cluster that is designated in <code>hbase-site.xml</code>.
   * Set this String to the zookeeper ensemble of an alternate remote cluster
   * when the reduce should write to a cluster other than the default; e.g.
   * when copying tables between clusters, the source would be designated by
   * <code>hbase-site.xml</code> and this param would have the ensemble
   * address of the remote cluster.  The format to pass is particular.  Pass
   * <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>
   * such as <code>server,server2,server3:2181:/hbase</code>.
   * @param serverClass redefined hbase.regionserver.class
   * @param serverImpl redefined hbase.regionserver.impl
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
    Class<? extends TableReducer> reducer, Job job,
    Class partitioner, String quorumAddress, String serverClass,
    String serverImpl) throws IOException {
    final boolean shipDependencyJars = true;
    initTableReducerJob(table, reducer, job, partitioner, quorumAddress,
        serverClass, serverImpl, shipDependencyJars);
  }

  /**
   * Use this before submitting a TableReduce job. It will appropriately set
   * up the passed job: {@link TableOutputFormat} becomes the output format,
   * the output table name is stored in the job configuration, and the output
   * key/value classes are set to {@link ImmutableBytesWritable} and
   * {@link Writable}.
   * <p>
   * When the partitioner is {@link HRegionPartitioner}, the number of reduce
   * tasks is capped at the number of regions of the output table.
   *
   * @param table  The output table.
   * @param reducer  The reducer class to use; <code>null</code> leaves the
   * job's reducer untouched.
   * @param job  The current job to adjust.  Make sure the passed job is
   * carrying all necessary HBase configuration.
   * @param partitioner  Partitioner to use. Pass <code>null</code> to use
   * default partitioner.
   * @param quorumAddress Distant cluster to write to; default is null for
   * output to the cluster that is designated in <code>hbase-site.xml</code>.
   * Set this String to the zookeeper ensemble of an alternate remote cluster
   * when the reduce should write to a cluster other than the default; e.g.
   * when copying tables between clusters, the source would be designated by
   * <code>hbase-site.xml</code> and this param would have the ensemble
   * address of the remote cluster.  The format to pass is particular.  Pass
   * <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>
   * such as <code>server,server2,server3:2181:/hbase</code>.
   * @param serverClass redefined hbase.regionserver.class; only applied when
   * <code>serverImpl</code> is given as well.
   * @param serverImpl redefined hbase.regionserver.impl; only applied when
   * <code>serverClass</code> is given as well.
   * @param addDependencyJars upload HBase jars and jars for any of the configured
   *           job classes via the distributed cache (tmpjars).
   * @throws IOException When determining the region count fails.
   */
  public static void initTableReducerJob(String table,
    Class<? extends TableReducer> reducer, Job job,
    Class partitioner, String quorumAddress, String serverClass,
    String serverImpl, boolean addDependencyJars) throws IOException {

    Configuration conf = job.getConfiguration();
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null) job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
      // Calling this will validate the format
      ZKUtil.transformClusterKey(quorumAddress);
      conf.set(TableOutputFormat.QUORUM_ADDRESS,quorumAddress);
    }
    // The two region server overrides are only applied as a pair.
    if (serverClass != null && serverImpl != null) {
      conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
      conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
      job.setPartitionerClass(HRegionPartitioner.class);
      // NOTE(review): the table is opened with the job configuration itself,
      // so when quorumAddress names another cluster the region count
      // presumably comes from the local cluster -- confirm.
      HTable outputTable = new HTable(conf, table);
      try {
        // Look the region count up once and reuse it for the cap.
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
          job.setNumReduceTasks(regions);
        }
      } finally {
        // The table is only needed for the lookup; release it right away.
        outputTable.close();
      }
    } else if (partitioner != null) {
      job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
      addDependencyJars(job);
    }
  }

  /**
   * Ensures that the given number of reduce tasks for the given job
   * configuration does not exceed the number of regions for the given table.
   * The table handle opened for the lookup is closed before returning.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void limitNumReduceTasks(String table, Job job)
  throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    try {
      int regions = outputTable.getRegionsInfo().size();
      if (job.getNumReduceTasks() > regions) {
        job.setNumReduceTasks(regions);
      }
    } finally {
      // The table is only needed for the lookup; release it right away.
      outputTable.close();
    }
  }

  /**
   * Sets the number of reduce tasks for the given job configuration to the
   * number of regions the given table has.
   * The table handle opened for the lookup is closed before returning.
   *
   * @param table  The table to get the region count for.
   * @param job  The current job to adjust.
   * @throws IOException When retrieving the table details fails.
   */
  public static void setNumReduceTasks(String table, Job job)
  throws IOException {
    HTable outputTable = new HTable(job.getConfiguration(), table);
    try {
      int regions = outputTable.getRegionsInfo().size();
      job.setNumReduceTasks(regions);
    } finally {
      // The table is only needed for the lookup; release it right away.
      outputTable.close();
    }
  }

  /**
   * Stores the scanner caching size, i.e. the number of rows to return and
   * cache with each scanner iteration, in the job configuration.
   * Higher caching values will enable faster mapreduce jobs at the expense of
   * requiring more heap to contain the cached rows.
   *
   * @param job The current job to adjust.
   * @param batchSize The number of rows to return in batch with each scanner
   * iteration.
   */
  public static void setScannerCaching(Job job, int batchSize) {
    final String cachingKey = "hbase.client.scanner.caching";
    job.getConfiguration().setInt(cachingKey, batchSize);
  }

  /**
   * Registers the jars holding ZooKeeper and the classes configured on the
   * job (map output key/value, input format, output key/value, output
   * format, partitioner and combiner) with the job configuration, so that
   * JobClient will ship them to the cluster and add them to the
   * DistributedCache.
   *
   * @param job  The job whose configured classes should be shipped.
   * @throws IOException When one of the configured classes cannot be loaded
   * or the jars cannot be registered.
   */
  public static void addDependencyJars(Job job) throws IOException {
    Class[] shippedClasses;
    try {
      shippedClasses = new Class[] {
          org.apache.zookeeper.ZooKeeper.class,
          job.getMapOutputKeyClass(),
          job.getMapOutputValueClass(),
          job.getInputFormatClass(),
          job.getOutputKeyClass(),
          job.getOutputValueClass(),
          job.getOutputFormatClass(),
          job.getPartitionerClass(),
          job.getCombinerClass()
      };
    } catch (ClassNotFoundException e) {
      throw new IOException(e);
    }
    addDependencyJars(job.getConfiguration(), shippedClasses);
  }
  
  /**
   * Registers the jars containing the given classes in the
   * <code>tmpjars</code> entry of the configuration, such that JobClient
   * will ship them to the cluster and add them to the DistributedCache.
   * <p>
   * Jars already listed in <code>tmpjars</code> are kept.  Entries of
   * <code>classes</code> that are <code>null</code> are skipped; classes
   * whose jar cannot be located, or whose jar does not exist on the local
   * file system, are skipped with a warning.
   *
   * @param conf  The configuration to update.
   * @param classes  The classes whose jars should be shipped.
   * @throws IOException When accessing the local file system fails.
   */
  public static void addDependencyJars(Configuration conf,
      Class... classes) throws IOException {

    FileSystem localFs = FileSystem.getLocal(conf);

    // Seed the set with the jars that are already in the tmpjars variable.
    Set<String> jarPaths = new HashSet<String>();
    jarPaths.addAll(conf.getStringCollection("tmpjars"));

    // Resolve each class to the jar it was loaded from.
    for (int i = 0; i < classes.length; i++) {
      Class candidate = classes[i];
      if (candidate != null) {
        String jarLocation = findContainingJar(candidate);
        if (jarLocation == null) {
          LOG.warn("Could not find jar for class " + candidate +
                   " in order to ship it to the cluster.");
        } else {
          Path jarPath = new Path(jarLocation);
          if (localFs.exists(jarPath)) {
            jarPaths.add(jarPath.makeQualified(localFs).toString());
          } else {
            LOG.warn("Could not validate jar file " + jarPath + " for class "
                     + candidate);
          }
        }
      }
    }

    // Leave the configuration untouched when there is nothing to ship.
    if (!jarPaths.isEmpty()) {
      String[] jarArray = jarPaths.toArray(new String[0]);
      conf.set("tmpjars", StringUtils.arrayToString(jarArray));
    }
  }

  /**
   * Find a jar that contains a class of the same name, if any.
   * It will return a jar file, even if that is not the first thing
   * on the class path that has a class with the same name.
   *
   * This is shamelessly copied from JobConf
   *
   * @param my_class the class to find.
   * @return the path of a jar file that contains the class, or null when
   * the class has no class loader (bootstrap classes) or none of the
   * locations reported by its class loader is a jar.
   * @throws RuntimeException wrapping the IOException raised while listing
   * or decoding the class locations.
   */
  private static String findContainingJar(Class my_class) {
    ClassLoader loader = my_class.getClassLoader();
    if (loader == null) {
      // Class.getClassLoader() may return null for classes loaded by the
      // bootstrap loader; there is no jar to ship for those, and calling
      // getResources on the null loader would fail with an NPE.
      return null;
    }
    String class_file = my_class.getName().replace('.', '/') + ".class";
    try {
      Enumeration<URL> itr = loader.getResources(class_file);
      while (itr.hasMoreElements()) {
        URL url = itr.nextElement();
        if (!"jar".equals(url.getProtocol())) {
          continue;
        }
        String toReturn = url.getPath();
        if (toReturn.startsWith("file:")) {
          toReturn = toReturn.substring("file:".length());
        }
        // URLDecoder is a misnamed class, since it actually decodes
        // x-www-form-urlencoded MIME type rather than actual
        // URL encoding (which the file path has). Therefore it would
        // decode +s to ' 's which is incorrect (spaces are actually
        // either unencoded or encoded as "%20"). Replace +s first, so
        // that they are kept sacred during the decoding process.
        toReturn = toReturn.replace("+", "%2B");
        toReturn = URLDecoder.decode(toReturn, "UTF-8");
        // Strip the "!/path/inside/jar" suffix so only the jar path remains.
        return toReturn.replaceAll("!.*$", "");
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return null;
  }


}