/**
 * (c) Copyright 2012 WibiData, Inc.
 *
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.kiji.mapreduce.output;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.kiji.annotations.ApiAudience;
import org.kiji.annotations.ApiStability;
import org.kiji.mapreduce.JobConfigurationException;
import org.kiji.mapreduce.KijiTableContext;
import org.kiji.mapreduce.framework.HFileKeyValue;
import org.kiji.mapreduce.framework.KijiConfKeys;
import org.kiji.mapreduce.impl.HFileWriterContext;
import org.kiji.mapreduce.output.framework.KijiHFileOutputFormat;
import org.kiji.mapreduce.platform.KijiMRPlatformBridge;
import org.kiji.mapreduce.tools.framework.JobIOConfKeys;
import org.kiji.schema.Kiji;
import org.kiji.schema.KijiRegion;
import org.kiji.schema.KijiRowKeySplitter;
import org.kiji.schema.KijiTable;
import org.kiji.schema.KijiURI;
import org.kiji.schema.layout.KijiTableLayout;
import org.kiji.schema.util.ResourceUtils;

/**
 * MapReduce job output that writes HFiles formatted according to the layout of a Kiji table.
 *
 * <p>
 * This job output writes the output of MapReduce jobs to HFiles. The generated HFiles
 * can be directly loaded into the regions of an existing HTable. Use a
 * {@link org.kiji.mapreduce.HFileLoader} to load HFiles into a Kiji table.
 * </p>
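 *
 * <p>
 * Once the job completes, the generated HFiles can be bulk-loaded. The following is
 * a minimal sketch, assuming an open {@code KijiTable} handle <code>table</code> and
 * the HFile output path <code>hfilePath</code> configured for this job output:
 * </p>
 * <pre>
 * <code>
 * // Bulk-load the HFiles written by this job output into the target table:
 * final HFileLoader loader = HFileLoader.create(conf);
 * loader.load(hfilePath, table);
 * </code>
 * </pre>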
 *
 * <h2>Configuring an output:</h2>
 * <p>
 * HFileMapReduceJobOutput must be configured with the address of the Kiji table to
 * write to, as well as a location to write the output HFiles to:
 * </p>
 * <pre>
 * <code>
 * final KijiURI tableURI = KijiURI.newBuilder("kiji://.env/default/mytable").build();
 * final Path hfileLocation = new Path("/path/to/hfile/output");
 *
 * final MapReduceJobOutput kijiTableOutput =
 *     MapReduceJobOutputs.newHFileMapReduceJobOutput(tableURI, hfileLocation);
 * </code>
 * </pre>
 *
 * @see DirectKijiTableMapReduceJobOutput
 */
@ApiAudience.Public
@ApiStability.Evolving
public final class HFileMapReduceJobOutput extends KijiTableMapReduceJobOutput {
  private static final Logger LOG = LoggerFactory.getLogger(HFileMapReduceJobOutput.class);

  /**
   * If <code>mNumSplits</code> has this special value, the number of splits should be
   * set equal to the number of existing regions in the target Kiji table.
   */
  private static final int NUM_SPLITS_AUTO = 0;

  /** The path to the directory to create the HFiles in. */
  private Path mPath;

  /** Default constructor. Accessible via {@link MapReduceJobOutputs}. */
  HFileMapReduceJobOutput() {
  }

  /** {@inheritDoc} */
  @Override
  public void initialize(Map<String, String> params) throws IOException {
    super.initialize(params);
    mPath = new Path(params.get(JobIOConfKeys.FILE_PATH_KEY));
  }

  /**
   * Creates job output of HFiles that can be efficiently loaded into a Kiji table.
   * The number of HFiles created (which determines the number of reduce tasks) will
   * match the number of existing regions in the target Kiji table.
   *
   * @param tableURI The Kiji table the resulting HFiles are intended for.
   * @param path The directory path to output the HFiles to.
   */
  HFileMapReduceJobOutput(KijiURI tableURI, Path path) {
    this(tableURI, path, NUM_SPLITS_AUTO);
  }

  /**
   * Creates job output of HFiles that can be efficiently loaded into a Kiji table.
   * The number of HFiles created (which determines the number of reduce tasks) is
   * specified with the <code>numSplits</code> argument. Controlling the number of
   * splits is only possible when targeting a Kiji table with <code>hashRowKeys</code>
   * enabled. Typically, you should let the system match the number of splits to the
   * number of regions in the table by using the
   * {@link #HFileMapReduceJobOutput(KijiURI, Path)} constructor instead.
   *
   * @param table The Kiji table the resulting HFiles are intended for.
   * @param path The directory path to output the HFiles to.
   * @param numSplits Number of splits (determines the number of reduce tasks).
   */
  HFileMapReduceJobOutput(KijiURI table, Path path, int numSplits) {
    super(table, numSplits);
    mPath = path;
  }

  /** {@inheritDoc} */
  @Override
  public void configure(Job job) throws IOException {
    // Sets the Hadoop output format, the Kiji output table and the number of reducers:
    super.configure(job);
    final Configuration conf = job.getConfiguration();

    // Kiji table context:
    conf.setClass(
        KijiConfKeys.KIJI_TABLE_CONTEXT_CLASS,
        HFileWriterContext.class,
        KijiTableContext.class);

    // Set the output path.
    FileOutputFormat.setOutputPath(job, mPath);

    // Configure the total order partitioner so generated HFile shards are contiguous and sorted.
    configurePartitioner(job, makeTableKeySplit(getOutputTableURI(), getNumReduceTasks(), conf));

    // Note: the HFile job output requires the reducer of the MapReduce job to be IdentityReducer.
    // This is enforced externally.
  }

  /** {@inheritDoc} */
  @Override
  protected Class<? extends OutputFormat> getOutputFormatClass() {
    return KijiHFileOutputFormat.class;
  }

  /**
   * Generates the list of split start keys for a given table.
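   *
   * <p>
   * For example, the splits can be fed to {@link #configurePartitioner(Job, List)},
   * mirroring what {@link #configure(Job)} does internally. A minimal sketch, assuming
   * {@code job}, {@code tableURI} and {@code conf} are already set up:
   * </p>
   * <pre>
   * <code>
   * // nsplits == 0 (NUM_SPLITS_AUTO) yields one split per existing table region.
   * final List&lt;HFileKeyValue&gt; splits =
   *     HFileMapReduceJobOutput.makeTableKeySplit(tableURI, 0, conf);
   * HFileMapReduceJobOutput.configurePartitioner(job, splits);
   * </code>
   * </pre>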
   *
   * @param tableURI URI of the Kiji table to split.
   * @param nsplits Number of splits.
   * @param conf Base Hadoop configuration used to open the Kiji instance.
   * @return a list of split start keys, as HFileKeyValue (with no value, just the keys).
   * @throws IOException on I/O error.
   */
  public static List<HFileKeyValue> makeTableKeySplit(
      KijiURI tableURI, int nsplits, Configuration conf) throws IOException {
    final Kiji kiji = Kiji.Factory.open(tableURI, conf);
    try {
      final KijiTable table = kiji.openTable(tableURI.getTable());
      try {
        if (NUM_SPLITS_AUTO == nsplits) {
          final List<HFileKeyValue> startKeys = Lists.newArrayList();
          for (KijiRegion region : table.getRegions()) {
            startKeys.add(HFileKeyValue.createFromRowKey(region.getStartKey()));
          }
          return startKeys;

        } else {
          switch (KijiTableLayout.getEncoding(table.getLayout().getDesc().getKeysFormat())) {
            case RAW: {
              // The user has explicitly specified how many HFiles to create, but this is not
              // possible when row key hashing is disabled.
              throw new JobConfigurationException(String.format(
                  "Table '%s' has row key hashing disabled, so the number of HFile splits must be "
                  + "determined by the number of HRegions in the HTable. "
                  + "Use an HFileMapReduceJobOutput constructor that enables auto splitting.",
                  table.getName()));
            }
            case FORMATTED:
            case HASH:
            case HASH_PREFIX: {
              // These cases are supported:
              break;
            }
            default:
              throw new RuntimeException("Unhandled row key encoding: "
                  + KijiTableLayout.getEncoding(table.getLayout().getDesc().getKeysFormat()));
          }
          return generateEvenStartKeys(nsplits);
        }
      } finally {
        ResourceUtils.releaseOrLog(table);
      }
    } finally {
      ResourceUtils.releaseOrLog(kiji);
    }
  }

  /** @return the path where to write HFiles. */
  public Path getPath() {
    return mPath;
  }

  /**
   * Configures the partitioner for generating HFiles.
   *
   * <p>Each generated HFile should fit within a region of the target table.
   * Additionally, it's optimal to have only one HFile to load into each region, since a
   * read from that region will require reading from each HFile under management (until
   * compaction happens and merges them all back into one HFile).</p>
   *
   * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
   * records output from the Mapper based on their rank in a total ordering of the
   * keys. The <code>startKeys</code> argument should contain a list of the first key in
   * each of those partitions.</p>
   *
   * @param job The job to configure.
   * @param startKeys A list of keys that will mark the boundaries between the partitions
   *     for the sorted map output records.
   * @throws IOException If there is an error.
   */
  public static void configurePartitioner(Job job, List<HFileKeyValue> startKeys)
      throws IOException {
    KijiMRPlatformBridge.get().setTotalOrderPartitionerClass(job);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath =
        new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
      final URI cacheUri =
          new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
      DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
      throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
  }

  /**
   * <p>Writes a SequenceFile, readable by a TotalOrderPartitioner, that contains the
   * split points in <code>startKeys</code>.</p>
   *
   * <p>This method was copied from HFileOutputFormat in hbase-0.90.1-cdh3u0. It had to
   * be copied because the original is private.</p>
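   *
   * <p>
   * Each record in the resulting file is one partition boundary: an
   * {@code HFileKeyValue} key with a {@code NullWritable} value. A minimal sketch of
   * reading the file back for inspection, assuming {@code fs}, {@code partitionsPath}
   * and {@code conf} are available and that {@code HFileKeyValue} has a no-arg
   * constructor (required in any case for SequenceFile keys):
   * </p>
   * <pre>
   * <code>
   * final SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionsPath, conf);
   * try {
   *   final HFileKeyValue key = new HFileKeyValue();
   *   while (reader.next(key, NullWritable.get())) {
   *     // Print each split start key in HBase's escaped-binary notation.
   *     System.out.println(Bytes.toStringBinary(key.getRowKey()));
   *   }
   * } finally {
   *   reader.close();
   * }
   * </code>
   * </pre>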
   *
   * @param conf The job configuration.
   * @param partitionsPath Output path for the SequenceFile.
   * @param startKeys The region start keys to use as the partitions.
   * @throws IOException If there is an error.
   */
  public static void writePartitionFile(
      Configuration conf, Path partitionsPath, List<HFileKeyValue> startKeys)
      throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<HFileKeyValue> sorted = new TreeSet<HFileKeyValue>();
    sorted.addAll(startKeys);

    HFileKeyValue first = sorted.first();
    if (0 != first.getRowKey().length) {
      throw new IllegalArgumentException(
          "First region of table should have empty start row key. Instead has: "
          + Bytes.toStringBinary(first.getRowKey()));
    }
    sorted.remove(first);

    // Write the actual file
    final SequenceFile.Writer writer = KijiMRPlatformBridge.get().newSeqFileWriter(
        conf, partitionsPath, HFileKeyValue.class, NullWritable.class);

    try {
      for (HFileKeyValue startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }

  /**
   * <p>Generates a list of start keys (one per region). Since the row keys in Kiji
   * are byte strings of length 16, we can reliably split them evenly.</p>
   *
   * @param numRegions The number of regions to generate start keys for.
   * @return A list of start keys with size equal to <code>numRegions</code>.
   */
  private static List<HFileKeyValue> generateEvenStartKeys(int numRegions) {
    List<HFileKeyValue> startKeys = new ArrayList<HFileKeyValue>(numRegions);

    // The first key is a special case, it must be empty.
    startKeys.add(HFileKeyValue.createFromRowKey(HConstants.EMPTY_BYTE_ARRAY));

    if (numRegions > 1) {
      byte[][] splitKeys = KijiRowKeySplitter.get().getSplitKeys(numRegions);
      for (byte[] hbaseRowKey : splitKeys) {
        startKeys.add(HFileKeyValue.createFromRowKey(hbaseRowKey));
      }
    }
    return startKeys;
  }
}