CompactorTest.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.txn.compactor;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.ValidTxnList;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.AbortTxnRequest;
import org.apache.hadoop.hive.metastore.api.CommitTxnRequest;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchTxnException;
import org.apache.hadoop.hive.metastore.api.OpenTxnRequest;
import org.apache.hadoop.hive.metastore.api.OpenTxnsResponse;
import org.apache.hadoop.hive.metastore.api.Order;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.metastore.api.SerDeInfo;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.metastore.api.TxnAbortedException;
import org.apache.hadoop.hive.metastore.txn.TxnDbUtil;
import org.apache.hadoop.hive.metastore.txn.TxnStore;
import org.apache.hadoop.hive.metastore.txn.TxnUtils;
import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.Stack;
import java.util.concurrent.atomic.AtomicBoolean;

/**
 * Super class for all of the compactor test modules.
 */
public abstract class CompactorTest {
  static final private String CLASS_NAME = CompactorTest.class.getName();
  static final private Logger LOG = LoggerFactory.getLogger(CLASS_NAME);

  protected TxnStore txnHandler;
  protected IMetaStoreClient ms;
  protected long sleepTime = 1000;
  protected HiveConf conf;

  private final AtomicBoolean stop = new AtomicBoolean();
  private final File tmpdir;

  protected CompactorTest() throws Exception {
    conf = new HiveConf();
    TxnDbUtil.setConfValues(conf);
    TxnDbUtil.cleanDb();
    ms = new HiveMetaStoreClient(conf);
    txnHandler = TxnUtils.getTxnStore(conf);
    tmpdir = new File (Files.createTempDirectory("compactor_test_table_").toString());
  }

  protected void compactorTestCleanup() throws IOException {
    FileUtils.deleteDirectory(tmpdir);
  }

  protected void startInitiator() throws Exception {
    startThread('i', true);
  }

  protected void startWorker() throws Exception {
    startThread('w', true);
  }

  protected void startCleaner() throws Exception {
    startThread('c', true);
  }

  protected void startCleaner(AtomicBoolean looped) throws Exception {
    startThread('c', false, looped);
  }

  protected Table newTable(String dbName, String tableName, boolean partitioned) throws TException {
    return newTable(dbName, tableName, partitioned, new HashMap<String, String>(), null, false);
  }

  protected Table newTable(String dbName, String tableName, boolean partitioned,
                           Map<String, String> parameters)  throws TException {
    return newTable(dbName, tableName, partitioned, parameters, null, false);

  }

  protected Table newTempTable(String tableName) throws TException {
    return newTable("default", tableName, false, null, null, true);
  }

  protected Table newTable(String dbName, String tableName, boolean partitioned,
                           Map<String, String> parameters, List<Order> sortCols,
                           boolean  isTemporary)
      throws  TException {
    Table table = new Table();
    table.setTableName(tableName);
    table.setDbName(dbName);
    table.setOwner("me");
    table.setSd(newStorageDescriptor(getLocation(tableName, null), sortCols));
    List<FieldSchema> partKeys = new ArrayList<FieldSchema>(1);
    if (partitioned) {
      partKeys.add(new FieldSchema("ds", "string", "no comment"));
      table.setPartitionKeys(partKeys);
    }

    table.setParameters(parameters);
    if (isTemporary) table.setTemporary(true);

    // drop the table first, in case some previous test created it
    ms.dropTable(dbName, tableName);

    ms.createTable(table);
    return table;
  }

  protected Partition newPartition(Table t, String value) throws Exception {
    return newPartition(t, value, null);
  }

  protected Partition newPartition(Table t, String value, List<Order> sortCols) throws Exception {
    Partition part = new Partition();
    part.addToValues(value);
    part.setDbName(t.getDbName());
    part.setTableName(t.getTableName());
    part.setSd(newStorageDescriptor(getLocation(t.getTableName(), value), sortCols));
    part.setParameters(new HashMap<String, String>());
    ms.add_partition(part);
    return part;
  }

  protected long openTxn() throws MetaException {
    List<Long> txns = txnHandler.openTxns(new OpenTxnRequest(1, System.getProperty("user.name"),
        Worker.hostname())).getTxn_ids();
    return txns.get(0);
  }

  protected void addDeltaFile(Table t, Partition p, long minTxn, long maxTxn, int numRecords)
      throws Exception {
    addFile(t, p, minTxn, maxTxn, numRecords, FileType.DELTA, 2, true);
  }

  protected void addLengthFile(Table t, Partition p, long minTxn, long maxTxn, int numRecords)
    throws Exception {
    addFile(t, p, minTxn, maxTxn, numRecords, FileType.LENGTH_FILE, 2, true);
  }

  protected void addBaseFile(Table t, Partition p, long maxTxn, int numRecords) throws Exception {
    addFile(t, p, 0, maxTxn, numRecords, FileType.BASE, 2, true);
  }

  protected void addLegacyFile(Table t, Partition p, int numRecords) throws Exception {
    addFile(t, p, 0, 0, numRecords, FileType.LEGACY, 2, true);
  }

  protected void addDeltaFile(Table t, Partition p, long minTxn, long maxTxn, int numRecords,
                              int numBuckets, boolean allBucketsPresent) throws Exception {
    addFile(t, p, minTxn, maxTxn, numRecords, FileType.DELTA, numBuckets, allBucketsPresent);
  }

  protected void addBaseFile(Table t, Partition p, long maxTxn, int numRecords, int numBuckets,
                             boolean allBucketsPresent) throws Exception {
    addFile(t, p, 0, maxTxn, numRecords, FileType.BASE, numBuckets, allBucketsPresent);
  }

  protected List<Path> getDirectories(HiveConf conf, Table t, Partition p) throws Exception {
    String partValue = (p == null) ? null : p.getValues().get(0);
    String location = getLocation(t.getTableName(), partValue);
    Path dir = new Path(location);
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] stats = fs.listStatus(dir);
    List<Path> paths = new ArrayList<Path>(stats.length);
    for (int i = 0; i < stats.length; i++) paths.add(stats[i].getPath());
    return paths;
  }

  protected void burnThroughTransactions(int num)
      throws MetaException, NoSuchTxnException, TxnAbortedException {
    burnThroughTransactions(num, null, null);
  }

  protected void burnThroughTransactions(int num, Set<Long> open, Set<Long> aborted)
      throws MetaException, NoSuchTxnException, TxnAbortedException {
    OpenTxnsResponse rsp = txnHandler.openTxns(new OpenTxnRequest(num, "me", "localhost"));
    for (long tid : rsp.getTxn_ids()) {
      if (aborted != null && aborted.contains(tid)) {
        txnHandler.abortTxn(new AbortTxnRequest(tid));
      } else if (open == null || (open != null && !open.contains(tid))) {
        txnHandler.commitTxn(new CommitTxnRequest(tid));
      }
    }
  }

  protected void stopThread() {
    stop.set(true);
  }

  private StorageDescriptor newStorageDescriptor(String location, List<Order> sortCols) {
    StorageDescriptor sd = new StorageDescriptor();
    List<FieldSchema> cols = new ArrayList<FieldSchema>(2);
    cols.add(new FieldSchema("a", "varchar(25)", "still no comment"));
    cols.add(new FieldSchema("b", "int", "comment"));
    sd.setCols(cols);
    sd.setLocation(location);
    sd.setInputFormat(MockInputFormat.class.getName());
    sd.setOutputFormat(MockOutputFormat.class.getName());
    sd.setNumBuckets(1);
    SerDeInfo serde = new SerDeInfo();
    serde.setSerializationLib(LazySimpleSerDe.class.getName());
    sd.setSerdeInfo(serde);
    List<String> bucketCols = new ArrayList<String>(1);
    bucketCols.add("a");
    sd.setBucketCols(bucketCols);

    if (sortCols != null) {
      sd.setSortCols(sortCols);
    }
    return sd;
  }

  // I can't do this with @Before because I want to be able to control when the thread starts
  private void startThread(char type, boolean stopAfterOne) throws Exception {
    startThread(type, stopAfterOne, new AtomicBoolean());
  }

  private void startThread(char type, boolean stopAfterOne, AtomicBoolean looped)
    throws Exception {
    TxnDbUtil.setConfValues(conf);
    CompactorThread t = null;
    switch (type) {
      case 'i': t = new Initiator(); break;
      case 'w': t = new Worker(); break;
      case 'c': t = new Cleaner(); break;
      default: throw new RuntimeException("Huh? Unknown thread type.");
    }
    t.setThreadId((int) t.getId());
    t.setHiveConf(conf);
    stop.set(stopAfterOne);
    t.init(stop, looped);
    if (stopAfterOne) t.run();
    else t.start();
  }

  private String getLocation(String tableName, String partValue) {
    String location =  tmpdir.getAbsolutePath() +
      System.getProperty("file.separator") + tableName;
    if (partValue != null) {
      location += System.getProperty("file.separator") + "ds=" + partValue;
    }
    return location;
  }

  private enum FileType {BASE, DELTA, LEGACY, LENGTH_FILE};

  private void addFile(Table t, Partition p, long minTxn, long maxTxn,
                       int numRecords,  FileType type, int numBuckets,
                       boolean allBucketsPresent) throws Exception {
    String partValue = (p == null) ? null : p.getValues().get(0);
    Path location = new Path(getLocation(t.getTableName(), partValue));
    String filename = null;
    switch (type) {
      case BASE: filename = "base_" + maxTxn; break;
      case LENGTH_FILE: // Fall through to delta
      case DELTA: filename = makeDeltaDirName(minTxn, maxTxn); break;
      case LEGACY: break; // handled below
    }

    FileSystem fs = FileSystem.get(conf);
    for (int bucket = 0; bucket < numBuckets; bucket++) {
      if (bucket == 0 && !allBucketsPresent) continue; // skip one
      Path partFile = null;
      if (type == FileType.LEGACY) {
        partFile = new Path(location, String.format(AcidUtils.LEGACY_FILE_BUCKET_DIGITS, bucket) + "_0");
      } else {
        Path dir = new Path(location, filename);
        fs.mkdirs(dir);
        partFile = AcidUtils.createBucketFile(dir, bucket);
        if (type == FileType.LENGTH_FILE) {
          partFile = new Path(partFile.toString() + AcidUtils.DELTA_SIDE_FILE_SUFFIX);
        }
      }
      FSDataOutputStream out = fs.create(partFile);
      if (type == FileType.LENGTH_FILE) {
        out.writeInt(numRecords);
      } else {
        for (int i = 0; i < numRecords; i++) {
          RecordIdentifier ri = new RecordIdentifier(maxTxn - 1, bucket, i);
          ri.write(out);
          out.writeBytes("mary had a little lamb its fleece was white as snow\n");
        }
      }
      out.close();
    }
  }

  static class MockInputFormat implements AcidInputFormat<WritableComparable,Text> {

    @Override
    public AcidInputFormat.RowReader<Text> getReader(InputSplit split,
                                                          Options options) throws
        IOException {
      return null;
    }

    @Override
    public RawReader<Text> getRawReader(Configuration conf, boolean collapseEvents, int bucket,
                                        ValidTxnList validTxnList,
                                        Path baseDirectory, Path... deltaDirectory) throws IOException {

      List<Path> filesToRead = new ArrayList<Path>();
      if (baseDirectory != null) {
        if (baseDirectory.getName().startsWith(AcidUtils.BASE_PREFIX)) {
          Path p = AcidUtils.createBucketFile(baseDirectory, bucket);
          FileSystem fs = p.getFileSystem(conf);
          if (fs.exists(p)) filesToRead.add(p);
        } else {
          filesToRead.add(new Path(baseDirectory, "000000_0"));

        }
      }
      for (int i = 0; i < deltaDirectory.length; i++) {
        Path p = AcidUtils.createBucketFile(deltaDirectory[i], bucket);
        FileSystem fs = p.getFileSystem(conf);
        if (fs.exists(p)) filesToRead.add(p);
      }
      return new MockRawReader(conf, filesToRead);
    }

    @Override
    public InputSplit[] getSplits(JobConf entries, int i) throws IOException {
      return new InputSplit[0];
    }

    @Override
    public RecordReader<WritableComparable, Text> getRecordReader(InputSplit inputSplit, JobConf entries,
                                                            Reporter reporter) throws IOException {
      return null;
    }

    @Override
    public boolean validateInput(FileSystem fs, HiveConf conf, List<FileStatus> files) throws
        IOException {
      return false;
    }
  }

  static class MockRawReader implements AcidInputFormat.RawReader<Text> {
    private final Stack<Path> filesToRead;
    private final Configuration conf;
    private FSDataInputStream is = null;
    private final FileSystem fs;
    private boolean lastWasDelete = true;

    MockRawReader(Configuration conf, List<Path> files) throws IOException {
      filesToRead = new Stack<Path>();
      for (Path file : files) filesToRead.push(file);
      this.conf = conf;
      fs = FileSystem.get(conf);
    }

    @Override
    public ObjectInspector getObjectInspector() {
      return null;
    }

    @Override
    public boolean isDelete(Text value) {
      // Alternate between returning deleted and not.  This is easier than actually
      // tracking operations. We test that this is getting properly called by checking that only
      // half the records show up in base files after major compactions.
      lastWasDelete = !lastWasDelete;
      return lastWasDelete;
    }

    @Override
    public boolean next(RecordIdentifier identifier, Text text) throws IOException {
      if (is == null) {
        // Open the next file
        if (filesToRead.empty()) return false;
        Path p = filesToRead.pop();
        LOG.debug("Reading records from " + p.toString());
        is = fs.open(p);
      }
      String line = null;
      try {
        identifier.readFields(is);
        line = is.readLine();
      } catch (EOFException e) {
      }
      if (line == null) {
        // Set our current entry to null (since it's done) and try again.
        is = null;
        return next(identifier, text);
      }
      text.set(line);
      return true;
    }

    @Override
    public RecordIdentifier createKey() {
      return new RecordIdentifier();
    }

    @Override
    public Text createValue() {
      return new Text();
    }

    @Override
    public long getPos() throws IOException {
      return 0;
    }

    @Override
    public void close() throws IOException {

    }

    @Override
    public float getProgress() throws IOException {
      return 0;
    }
  }

  // This class isn't used and I suspect does totally the wrong thing.  It's only here so that I
  // can provide some output format to the tables and partitions I create.  I actually write to
  // those tables directory.
  static class MockOutputFormat implements AcidOutputFormat<WritableComparable, Text> {

    @Override
    public RecordUpdater getRecordUpdater(Path path, Options options) throws
        IOException {
      return null;
    }

    @Override
    public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getRawRecordWriter(Path path, Options options) throws IOException {
      return new MockRecordWriter(path, options);
    }

    @Override
    public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath,
                                              Class<? extends Writable> valueClass,
                                              boolean isCompressed, Properties tableProperties,
                                              Progressable progress) throws IOException {
      return null;
    }

    @Override
    public RecordWriter<WritableComparable, Text> getRecordWriter(FileSystem fileSystem, JobConf entries,
                                                            String s,
                                                            Progressable progressable) throws
        IOException {
      return null;
    }

    @Override
    public void checkOutputSpecs(FileSystem fileSystem, JobConf entries) throws IOException {

    }
  }

  // This class isn't used and I suspect does totally the wrong thing.  It's only here so that I
  // can provide some output format to the tables and partitions I create.  I actually write to
  // those tables directory.
  static class MockRecordWriter implements org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter {
    private final FSDataOutputStream os;

    MockRecordWriter(Path basedir, AcidOutputFormat.Options options) throws IOException {
      FileSystem fs = FileSystem.get(options.getConfiguration());
      Path p = AcidUtils.createFilename(basedir, options);
      os = fs.create(p);
    }

    @Override
    public void write(Writable w) throws IOException {
      Text t = (Text)w;
      os.writeBytes(t.toString());
      os.writeBytes("\n");
    }

    @Override
    public void close(boolean abort) throws IOException {
      os.close();
    }
  }

  /**
   * in Hive 1.3.0 delta file names changed to delta_xxxx_yyyy_zzzz; prior to that
   * the name was delta_xxxx_yyyy.  We want to run compaction tests such that both formats
   * are used since new (1.3) code has to be able to read old files.
   */
  abstract boolean useHive130DeltaDirName();

  String makeDeltaDirName(long minTxnId, long maxTxnId) {
    if(minTxnId != maxTxnId) {
      //covers both streaming api and post compaction style.
      return makeDeltaDirNameCompacted(minTxnId, maxTxnId);
    }
    return useHive130DeltaDirName() ?
      AcidUtils.deltaSubdir(minTxnId, maxTxnId, 0) : AcidUtils.deltaSubdir(minTxnId, maxTxnId);
  }
  /**
   * delta dir name after compaction
   */
  String makeDeltaDirNameCompacted(long minTxnId, long maxTxnId) {
    return AcidUtils.deltaSubdir(minTxnId, maxTxnId);
  }
}