/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.mapreduce;

import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.IntegrationTestBase;
import org.apache.hadoop.hbase.IntegrationTestingUtility;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.testclassification.IntegrationTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.util.ToolRunner;
import org.junit.After;
import org.junit.Before;
import org.junit.experimental.categories.Category;

/**
 * An integration test for {@link TableSnapshotInputFormat}, which enables reading directly from
 * snapshot files without going through the HBase servers.
 *
 * This test creates a table and loads it with rows ranging from 'aaa' to 'zzz'. For each row, it
 * sets the columns f1:(null) and f2:(null) to the same value as the row key:
 * <pre>
 * aaa, f1: => aaa
 * aaa, f2: => aaa
 * aab, f1: => aab
 * ....
 * zzz, f2: => zzz
 * </pre>
 *
 * The test then creates a snapshot of this table and overwrites the values in the original table
 * with the value 'after_snapshot_value'. The test then runs a mapreduce job over the snapshot
 * with a scan start row 'bbb' and stop row 'yyy'. The data is saved in a single reduce output
 * file and inspected later to verify that the MR job has seen all the values from the snapshot.
 *
 * <p>These parameters can be used to configure the job:
 * <br>"IntegrationTestTableSnapshotInputFormat.table" => the name of the table
 * <br>"IntegrationTestTableSnapshotInputFormat.snapshot" => the name of the snapshot
 * <br>"IntegrationTestTableSnapshotInputFormat.numRegions" => number of regions in the table
 * to be created (default: 32).
 * <br>"IntegrationTestTableSnapshotInputFormat.tableDir" => temporary directory to restore the
 * snapshot files into
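 *
 * <p>As a minimal sketch of how these properties can be supplied (the property keys are the ones
 * read by this class; the table and snapshot names below are placeholder values), the test can be
 * driven programmatically the same way {@code main} does:
 * <pre>
 * // Placeholder names; substitute values appropriate for your cluster.
 * Configuration conf = HBaseConfiguration.create();
 * conf.set("IntegrationTestTableSnapshotInputFormat.table", "MyTable");
 * conf.set("IntegrationTestTableSnapshotInputFormat.snapshot", "MyTable_snapshot");
 * conf.setInt("IntegrationTestTableSnapshotInputFormat.numRegions", 32);
 * IntegrationTestingUtility.setUseDistributedCluster(conf);
 * int ret = ToolRunner.run(conf, new IntegrationTestTableSnapshotInputFormat(), new String[0]);
 * </pre>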
 */
@Category(IntegrationTests.class)
// Not runnable as a unit test. See TestTableSnapshotInputFormat
public class IntegrationTestTableSnapshotInputFormat extends IntegrationTestBase {

  private static final Log LOG =
      LogFactory.getLog(IntegrationTestTableSnapshotInputFormat.class);

  private static final String TABLE_NAME_KEY = "IntegrationTestTableSnapshotInputFormat.table";
  private static final String DEFAULT_TABLE_NAME = "IntegrationTestTableSnapshotInputFormat";

  private static final String SNAPSHOT_NAME_KEY =
      "IntegrationTestTableSnapshotInputFormat.snapshot";
  private static final String NUM_REGIONS_KEY =
      "IntegrationTestTableSnapshotInputFormat.numRegions";

  private static final String MR_IMPLEMENTATION_KEY =
      "IntegrationTestTableSnapshotInputFormat.API";
  private static final String MAPRED_IMPLEMENTATION = "mapred";
  private static final String MAPREDUCE_IMPLEMENTATION = "mapreduce";

  private static final int DEFAULT_NUM_REGIONS = 32;
  private static final String TABLE_DIR_KEY = "IntegrationTestTableSnapshotInputFormat.tableDir";

  private static final byte[] START_ROW = Bytes.toBytes("bbb");
  private static final byte[] END_ROW = Bytes.toBytes("yyy");

  // The mapred API is missing feature parity with mapreduce. See comments in
  // mapred.TestTableSnapshotInputFormat
  private static final byte[] MAPRED_START_ROW = Bytes.toBytes("aaa");
  private static final byte[] MAPRED_END_ROW = Bytes.toBytes("zz{"); // 'z' + 1 => '{'

  private IntegrationTestingUtility util;

  @Override
  public void setConf(Configuration conf) {
    super.setConf(conf);
    util = getTestingUtil(conf);
  }

  @Override
  @Before
  public void setUp() throws Exception {
    super.setUp();
    util = getTestingUtil(getConf());
    util.initializeCluster(1);
    this.setConf(util.getConfiguration());
  }

  @Override
  @After
  public void cleanUp() throws Exception {
    util.restoreCluster();
  }

  @Override
  public void setUpCluster() throws Exception {
    // No-op: the cluster is initialized in setUp() instead.
  }

  @Override
  public int runTestFromCommandLine() throws Exception {
    Configuration conf = getConf();
    TableName tableName = TableName.valueOf(conf.get(TABLE_NAME_KEY, DEFAULT_TABLE_NAME));
    String snapshotName = conf.get(SNAPSHOT_NAME_KEY, tableName.getQualifierAsString()
      + "_snapshot_" + System.currentTimeMillis());
    int numRegions = conf.getInt(NUM_REGIONS_KEY, DEFAULT_NUM_REGIONS);
    String tableDirStr = conf.get(TABLE_DIR_KEY);
    Path tableDir;
    if (tableDirStr == null) {
      tableDir = util.getDataTestDirOnTestFS(tableName.getQualifierAsString());
    } else {
      tableDir = new Path(tableDirStr);
    }

    final String mr = conf.get(MR_IMPLEMENTATION_KEY, MAPREDUCE_IMPLEMENTATION);
    if (mr.equalsIgnoreCase(MAPREDUCE_IMPLEMENTATION)) {
      /*
       * We create the table using HBaseAdmin#createTable(), which creates the table with the
       * desired number of regions. We pass bbb as the startKey and yyy as the endKey, so if
       * desiredNumRegions is > 2, we create the boundary regions (empty, bbb) and (yyy, empty),
       * plus numRegions - 2 regions between bbb and yyy. The test uses a Scan with startRow bbb
       * and endRow yyy, so we expect the first and last region to be filtered out in the input
       * format, and we expect numRegions - 2 splits between bbb and yyy.
       */
      LOG.debug("Running job with mapreduce API.");
      int expectedNumSplits = numRegions > 2 ? numRegions - 2 : numRegions;

      org.apache.hadoop.hbase.mapreduce.TestTableSnapshotInputFormat.doTestWithMapReduce(util,
        tableName, snapshotName, START_ROW, END_ROW, tableDir, numRegions,
        expectedNumSplits, false);
    } else if (mr.equalsIgnoreCase(MAPRED_IMPLEMENTATION)) {
      /*
       * Similar considerations to above. The difference is that the mapred API does not support
       * specifying start/end rows (or a Scan object at all).
       * Thus the first and last regions are not omitted. See comments in
       * mapred.TestTableSnapshotInputFormat for details of how that test works around the
       * problem. This feature should be added in follow-on work.
       */
      LOG.debug("Running job with mapred API.");
      int expectedNumSplits = numRegions;

      org.apache.hadoop.hbase.mapred.TestTableSnapshotInputFormat.doTestWithMapReduce(util,
        tableName, snapshotName, MAPRED_START_ROW, MAPRED_END_ROW, tableDir, numRegions,
        expectedNumSplits, false);
    } else {
      throw new IllegalArgumentException("Unrecognized mapreduce implementation: " + mr + ".");
    }

    return 0;
  }

  @Override // Chaos Monkey is not intended to be run with this test
  public TableName getTablename() {
    return null;
  }

  @Override // Chaos Monkey is not intended to be run with this test
  protected Set<String> getColumnFamilies() {
    return null;
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    IntegrationTestingUtility.setUseDistributedCluster(conf);
    int ret = ToolRunner.run(conf, new IntegrationTestTableSnapshotInputFormat(), args);
    System.exit(ret);
  }
}