/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse;

import com.google.common.base.Predicate;
import com.google.common.collect.Collections2;
import com.google.common.primitives.Ints;
import org.antlr.runtime.tree.Tree;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.Function;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.NotificationEvent;
import org.apache.hadoop.hive.metastore.messaging.EventUtils;
import org.apache.hadoop.hive.metastore.messaging.MessageFactory;
import org.apache.hadoop.hive.metastore.messaging.event.filters.AndFilter;
import org.apache.hadoop.hive.metastore.messaging.event.filters.DatabaseAndTableFilter;
import org.apache.hadoop.hive.metastore.messaging.event.filters.EventBoundaryFilter;
import org.apache.hadoop.hive.metastore.messaging.event.filters.MessageFormatFilter;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.InvalidTableException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.repl.DumpType;
import org.apache.hadoop.hive.ql.parse.repl.dump.HiveWrapper;
import org.apache.hadoop.hive.ql.parse.repl.dump.events.EventHandler;
import org.apache.hadoop.hive.ql.parse.repl.dump.events.EventHandlerFactory;
import org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData;
import org.apache.hadoop.hive.ql.parse.repl.dump.io.FunctionSerializer;
import org.apache.hadoop.hive.ql.parse.repl.dump.io.JsonWriter;
import org.apache.hadoop.hive.ql.parse.repl.dump.Utils;
import org.apache.hadoop.hive.ql.parse.repl.load.MetaData;
import org.apache.hadoop.hive.ql.parse.repl.load.message.MessageHandler;
import org.apache.hadoop.hive.ql.plan.AlterDatabaseDesc;
import org.apache.hadoop.hive.ql.plan.AlterTableDesc;
import org.apache.hadoop.hive.ql.plan.CreateDatabaseDesc;
import org.apache.hadoop.hive.ql.plan.CreateFunctionDesc;
import org.apache.hadoop.hive.ql.plan.DDLWork;
import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork;
import org.apache.hadoop.hive.ql.plan.FunctionWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_FROM;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_LIMIT;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_REPL_DUMP;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_REPL_LOAD;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_REPL_STATUS;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_TO;
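/**
 * Semantic analyzer for the replication statements: REPL DUMP, REPL LOAD and REPL STATUS.
 * analyzeInternal() dispatches on the root token to an init*() method that parses the AST,
 * followed by an analyze*() method that generates the corresponding task DAG.
 */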
public class ReplicationSemanticAnalyzer extends BaseSemanticAnalyzer {
  // Database name or pattern
  private String dbNameOrPattern;
  // Table name or pattern
  private String tblNameOrPattern;
  private Long eventFrom;
  private Long eventTo;
  private Integer maxEventLimit;
  // Base path for REPL LOAD
  private String path;

  private static String testInjectDumpDir = null; // unit tests can overwrite this to affect default dump behaviour
  private static final String dumpSchema = "dump_dir,last_repl_id#string,string";

  private static final String FUNCTIONS_ROOT_DIR_NAME = "_functions";
  private static final String FUNCTION_METADATA_DIR_NAME = "_metadata";
  private final static Logger REPL_STATE_LOG = LoggerFactory.getLogger("ReplState");

  ReplicationSemanticAnalyzer(QueryState queryState) throws SemanticException {
    super(queryState);
  }

  @Override
  public void analyzeInternal(ASTNode ast) throws SemanticException {
    LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal");
    LOG.debug(ast.getName() + ":" + ast.getToken().getText() + "=" + ast.getText());
    switch (ast.getToken().getType()) {
      case TOK_REPL_DUMP: {
        LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal: dump");
        initReplDump(ast);
        analyzeReplDump(ast);
        break;
      }
      case TOK_REPL_LOAD: {
        LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal: load");
        initReplLoad(ast);
        analyzeReplLoad(ast);
        break;
      }
      case TOK_REPL_STATUS: {
        LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal: status");
        initReplStatus(ast);
        analyzeReplStatus(ast);
        break;
      }
      default: {
        throw new SemanticException("Unexpected root token");
      }
    }
  }

  private void initReplDump(ASTNode ast) {
    int numChildren = ast.getChildCount();
    dbNameOrPattern = PlanUtils.stripQuotes(ast.getChild(0).getText());
    // skip the first node, which is always required
    int currNode = 1;
    while (currNode < numChildren) {
      if (ast.getChild(currNode).getType() != TOK_FROM) {
        // optional tblName was specified
        tblNameOrPattern = PlanUtils.stripQuotes(ast.getChild(currNode).getText());
      } else {
        // TOK_FROM subtree
        Tree fromNode = ast.getChild(currNode);
        eventFrom = Long.parseLong(PlanUtils.stripQuotes(fromNode.getChild(0).getText()));
        // skip the first child, which is always required
        int numChild = 1;
        while (numChild < fromNode.getChildCount()) {
          if (fromNode.getChild(numChild).getType() == TOK_TO) {
            eventTo =
                Long.parseLong(PlanUtils.stripQuotes(fromNode.getChild(numChild + 1).getText()));
            // skip the next child, since we already took care of it
            numChild++;
          } else if (fromNode.getChild(numChild).getType() == TOK_LIMIT) {
            maxEventLimit =
                Integer.parseInt(PlanUtils.stripQuotes(fromNode.getChild(numChild + 1).getText()));
            // skip the next child, since we already took care of it
            numChild++;
          }
          // move to the next child in the FROM tree
          numChild++;
        }
        // FROM node is always the last
        break;
      }
      // move to the next root node
      currNode++;
    }
  }
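  // For readability, a sketch of the statement forms initReplDump() above accepts
  // (illustrative only - the REPL grammar in HiveParser is authoritative):
  //   REPL DUMP mydb;                          -- bootstrap dump of a database
  //   REPL DUMP mydb.mytbl;                    -- bootstrap dump of a single table
  //   REPL DUMP mydb FROM 100 TO 200 LIMIT 25; -- incremental dump of events 100..200, capped at 25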
  // REPL DUMP
  private void analyzeReplDump(ASTNode ast) throws SemanticException {
    LOG.debug("ReplicationSemanticAnalyzer.analyzeReplDump: " + String.valueOf(dbNameOrPattern)
        + "." + String.valueOf(tblNameOrPattern) + " from " + String.valueOf(eventFrom)
        + " to " + String.valueOf(eventTo) + " maxEventLimit " + String.valueOf(maxEventLimit));
    String replRoot = conf.getVar(HiveConf.ConfVars.REPLDIR);
    Path dumpRoot = new Path(replRoot, getNextDumpDir());
    DumpMetaData dmd = new DumpMetaData(dumpRoot, conf);
    Path cmRoot = new Path(conf.getVar(HiveConf.ConfVars.REPLCMDIR));
    Long lastReplId;
    try {
      if (eventFrom == null) {
        // bootstrap case
        Long bootDumpBeginReplId = db.getMSC().getCurrentNotificationEventId().getEventId();
        for (String dbName : matchesDb(dbNameOrPattern)) {
          REPL_STATE_LOG.info("Repl Dump: Started analyzing Repl Dump for DB: {}, Dump Type: BOOTSTRAP", dbName);
          LOG.debug("ReplicationSemanticAnalyzer: analyzeReplDump dumping db: " + dbName);
          Path dbRoot = dumpDbMetadata(dbName, dumpRoot);
          dumpFunctionMetadata(dbName, dumpRoot);
          for (String tblName : matchesTbl(dbName, tblNameOrPattern)) {
            LOG.debug("ReplicationSemanticAnalyzer: analyzeReplDump dumping table: " + tblName
                + " to db root " + dbRoot.toUri());
            dumpTbl(ast, dbName, tblName, dbRoot);
          }
          REPL_STATE_LOG.info("Repl Dump: Completed analyzing Repl Dump for DB: {} and created {} COPY tasks to dump "
              + "metadata and data", dbName, rootTasks.size());
        }
        Long bootDumpEndReplId = db.getMSC().getCurrentNotificationEventId().getEventId();
        LOG.info("Bootstrap object dump phase took from {} to {}", bootDumpBeginReplId, bootDumpEndReplId);

        // Now that bootstrap has dumped all objects related, we have to account for the changes
        // that occurred while bootstrap was happening - i.e. we have to look through all events
        // during the bootstrap period and consolidate them with our dump.
        IMetaStoreClient.NotificationFilter evFilter =
            new DatabaseAndTableFilter(dbNameOrPattern, tblNameOrPattern);
        EventUtils.MSClientNotificationFetcher evFetcher =
            new EventUtils.MSClientNotificationFetcher(db.getMSC());
        EventUtils.NotificationEventIterator evIter = new EventUtils.NotificationEventIterator(
            evFetcher, bootDumpBeginReplId,
            Ints.checkedCast(bootDumpEndReplId - bootDumpBeginReplId) + 1,
            evFilter);

        // Now we consolidate all the events that happened during the objdump into the objdump
        while (evIter.hasNext()) {
          NotificationEvent ev = evIter.next();
          Path evRoot = new Path(dumpRoot, String.valueOf(ev.getEventId()));
          // FIXME : implement consolidateEvent(..) similar to dumpEvent(ev,evRoot)
        }
        LOG.info("Consolidation done, preparing to return {},{}->{}",
            dumpRoot.toUri(), bootDumpBeginReplId, bootDumpEndReplId);
        dmd.setDump(DumpType.BOOTSTRAP, bootDumpBeginReplId, bootDumpEndReplId, cmRoot);
        dmd.write();

        // Set the correct last repl id to return to the user
        lastReplId = bootDumpEndReplId;
      } else {
        // get a list of events matching dbPattern & tblPattern, then
        // go through each event, and dump out each event to an event-level dump dir inside dumpRoot
        if (eventTo == null) {
          eventTo = db.getMSC().getCurrentNotificationEventId().getEventId();
          LOG.debug("eventTo not specified, using current event id : {}", eventTo);
        } else if (eventTo < eventFrom) {
          throw new Exception("Invalid event ID input received in TO clause");
        }

        Integer maxRange = Ints.checkedCast(eventTo - eventFrom + 1);
        if ((maxEventLimit == null) || (maxEventLimit > maxRange)) {
          maxEventLimit = maxRange;
        }

        // TODO : instead of simply restricting by message format, we should eventually
        // move to a jdbc-driver-style registering of message formats, picking a message
        // factory per event to decode. For now, however, since all messages have the
        // same factory, restricting by message format is effectively a guard against
        // older leftover data that would cause us problems.
        IMetaStoreClient.NotificationFilter evFilter = new AndFilter(
            new DatabaseAndTableFilter(dbNameOrPattern, tblNameOrPattern),
            new EventBoundaryFilter(eventFrom, eventTo),
            new MessageFormatFilter(MessageFactory.getInstance().getMessageFormat()));

        EventUtils.MSClientNotificationFetcher evFetcher =
            new EventUtils.MSClientNotificationFetcher(db.getMSC());
        EventUtils.NotificationEventIterator evIter = new EventUtils.NotificationEventIterator(
            evFetcher, eventFrom, maxEventLimit, evFilter);

        lastReplId = eventTo;
        REPL_STATE_LOG.info("Repl Dump: Started Repl Dump for DB: {}, Dump Type: INCREMENTAL",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?");
        while (evIter.hasNext()) {
          NotificationEvent ev = evIter.next();
          lastReplId = ev.getEventId();
          Path evRoot = new Path(dumpRoot, String.valueOf(lastReplId));
          dumpEvent(ev, evRoot, cmRoot);
        }

        REPL_STATE_LOG.info("Repl Dump: Completed Repl Dump for DB: {}",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?");

        LOG.info("Done dumping events, preparing to return {},{}", dumpRoot.toUri(), lastReplId);
        Utils.writeOutput(
            Arrays.asList(
                "incremental",
                String.valueOf(eventFrom),
                String.valueOf(lastReplId)
            ),
            dmd.getDumpFilePath(), conf);
        dmd.setDump(DumpType.INCREMENTAL, eventFrom, lastReplId, cmRoot);
        dmd.write();
      }
      prepareReturnValues(Arrays.asList(dumpRoot.toUri().toString(), String.valueOf(lastReplId)),
          dumpSchema);
      setFetchTask(createFetchTask(dumpSchema));
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      LOG.warn("Error during analyzeReplDump", e);
      throw new SemanticException(e);
    }
  }
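  // The fetch task set up above returns a single row shaped by dumpSchema, i.e. a
  // (dump_dir, last_repl_id) pair; for example (values are hypothetical):
  //   hdfs://repl/staging/next    10234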
  private void dumpEvent(NotificationEvent ev, Path evRoot, Path cmRoot) throws Exception {
    EventHandler.Context context = new EventHandler.Context(
        evRoot,
        cmRoot,
        db,
        conf,
        getNewEventOnlyReplicationSpec(ev.getEventId())
    );
    EventHandlerFactory.handlerFor(ev).handle(context);
    REPL_STATE_LOG.info("Repl Dump: Dumped event with ID: {}, Type: {} and dumped metadata and data to path {}",
        String.valueOf(ev.getEventId()), ev.getEventType(), evRoot.toUri().toString());
  }

  public static void injectNextDumpDirForTest(String dumpdir) {
    testInjectDumpDir = dumpdir;
  }

  private String getNextDumpDir() {
    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_IN_TEST)) {
      // make it easy to write .q unit tests, instead of unique id generation.
      // however, this does mean that in writing tests, we have to be aware that
      // repl dump will clash with prior dumps, and thus have to clean up properly.
      if (testInjectDumpDir == null) {
        return "next";
      } else {
        return testInjectDumpDir;
      }
    } else {
      return String.valueOf(System.currentTimeMillis());
      // TODO: time is good enough for now - we'll likely improve this.
      // We may also work in something equivalent to a pid or thread id, and move to nanos,
      // to ensure uniqueness.
    }
  }

  /**
   * @param dbName database to dump
   * @param dumpRoot root directory of the dump
   * @return db dumped path
   * @throws SemanticException
   */
  private Path dumpDbMetadata(String dbName, Path dumpRoot) throws SemanticException {
    Path dbRoot = new Path(dumpRoot, dbName);
    try {
      // TODO : instantiating FS objects is generally costly. Refactor
      FileSystem fs = dbRoot.getFileSystem(conf);
      Path dumpPath = new Path(dbRoot, EximUtil.METADATA_NAME);
      HiveWrapper.Tuple<Database> database = new HiveWrapper(db, dbName).database();
      EximUtil.createDbExportDump(fs, dumpPath, database.object, database.replicationSpec);
      REPL_STATE_LOG.info("Repl Dump: Dumped DB metadata");
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
    return dbRoot;
  }

  private void dumpFunctionMetadata(String dbName, Path dumpRoot) throws SemanticException {
    Path functionsRoot = new Path(new Path(dumpRoot, dbName), FUNCTIONS_ROOT_DIR_NAME);
    try {
      // TODO : This should ideally return the Function objects rather than the function names;
      // resolving names to objects should be done by the caller. Look at this separately.
      List<String> functionNames = db.getFunctions(dbName, "*");
      for (String functionName : functionNames) {
        HiveWrapper.Tuple<Function> tuple;
        try {
          tuple = new HiveWrapper(db, dbName).function(functionName);
        } catch (HiveException e) {
          // This can happen because we query getFunctions() before fetching the actual function:
          // in between, a user may have dropped the function, in which case our call will fail.
          LOG.info("Function " + functionName
              + " could not be found, we are ignoring it as it can be a valid state", e);
          continue;
        }
        if (tuple.object.getResourceUris().isEmpty()) {
          REPL_STATE_LOG.warn("Not replicating function: " + functionName
              + " as it seems to have been created without a USING clause");
          continue;
        }
        Path functionMetadataRoot =
            new Path(new Path(functionsRoot, functionName), FUNCTION_METADATA_DIR_NAME);
        try (JsonWriter jsonWriter =
            new JsonWriter(functionMetadataRoot.getFileSystem(conf), functionMetadataRoot)) {
          new FunctionSerializer(tuple.object).writeTo(jsonWriter, tuple.replicationSpec);
        }
        REPL_STATE_LOG.info("Repl Dump: Dumped metadata for function: {}", functionName);
      }
    } catch (Exception e) {
      throw new SemanticException(e);
    }
  }
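  // Layout note: the loop above writes each function's metadata under
  // <dumpRoot>/<dbName>/_functions/<functionName>/_metadata
  // (see FUNCTIONS_ROOT_DIR_NAME and FUNCTION_METADATA_DIR_NAME).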
  /**
   * @param ast dump AST node
   * @param dbName database the table belongs to
   * @param tblName table to dump
   * @param dbRoot dump directory of the database
   * @return tbl dumped path
   * @throws SemanticException
   */
  private Path dumpTbl(ASTNode ast, String dbName, String tblName, Path dbRoot)
      throws SemanticException {
    Path tableRoot = new Path(dbRoot, tblName);
    try {
      URI toURI = EximUtil.getValidatedURI(conf, tableRoot.toUri().toString());
      TableSpec ts = new TableSpec(db, conf, dbName + "." + tblName, null);
      ExportSemanticAnalyzer.prepareExport(ast, toURI, ts, getNewReplicationSpec(), db, conf, ctx,
          rootTasks, inputs, outputs, LOG);
      REPL_STATE_LOG.info("Repl Dump: Analyzed dump for table/view: {}.{} and created copy tasks to dump metadata "
          + "and data to path {}", dbName, tblName, toURI.toString());
    } catch (InvalidTableException te) {
      // Bootstrap dump shouldn't fail if the table is dropped/renamed while dumping it.
      // Just log a debug message and skip it.
      LOG.debug(te.getMessage());
      return null;
    } catch (HiveException e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
    return tableRoot;
  }

  // REPL LOAD
  private void initReplLoad(ASTNode ast) {
    int numChildren = ast.getChildCount();
    path = PlanUtils.stripQuotes(ast.getChild(0).getText());
    if (numChildren > 1) {
      dbNameOrPattern = PlanUtils.stripQuotes(ast.getChild(1).getText());
    }
    if (numChildren > 2) {
      tblNameOrPattern = PlanUtils.stripQuotes(ast.getChild(2).getText());
    }
  }
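  // A sketch of the statement forms initReplLoad() above accepts (illustrative only -
  // the first AST child is always the dump path, the optional children name the destination):
  //   REPL LOAD FROM '/staging/blah12345';        -- load using the db name(s) recorded in the dump
  //   REPL LOAD newdb FROM '/staging/blah12345';  -- load a single-db dump into 'newdb'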
  /*
   * Example dump dirs we need to be able to handle :
   *
   * for: hive.repl.rootdir = staging/
   * Then, repl dumps will be created in staging/<dumpdir>
   *
   * single-db-dump: staging/blah12345 will contain a db dir for the db specified
   *  blah12345/
   *   default/
   *    _metadata
   *    tbl1/
   *      _metadata
   *      dt=20160907/
   *        _files
   *    tbl2/
   *    tbl3/
   *    unptn_tbl/
   *      _metadata
   *      _files
   *
   * multi-db-dump: staging/bar12347 will contain dirs for each db covered
   * staging/
   *  bar12347/
   *   default/
   *     ...
   *   sales/
   *     ...
   *
   * single table-dump: staging/baz123 will contain a table object dump inside
   * staging/
   *  baz123/
   *    _metadata
   *    dt=20150931/
   *      _files
   *
   * incremental dump : staging/blue123 will contain dirs for each event inside.
   * staging/
   *  blue123/
   *    34/
   *    35/
   *    36/
   */
  private void analyzeReplLoad(ASTNode ast) throws SemanticException {
    LOG.debug("ReplSemanticAnalyzer.analyzeReplLoad: " + String.valueOf(dbNameOrPattern) + "."
        + String.valueOf(tblNameOrPattern) + " from " + String.valueOf(path));

    // for analyze repl load, we walk through the dir structure available in the path,
    // looking at each db, and then each table, and then setting up the appropriate
    // import job in its place.
    try {
      Path loadPath = new Path(path);
      final FileSystem fs = loadPath.getFileSystem(conf);

      if (!fs.exists(loadPath)) {
        // supposed dump path does not exist
        throw new FileNotFoundException(loadPath.toUri().toString());
      }

      // Now, the dumped path can be one of three things:
      // a) It can be a db dump, in which case we expect a set of dirs, each with a
      //    db name, and with a _metadata file in each, and table dirs inside that.
      // b) It can be a table dump dir, in which case we expect a _metadata dump of
      //    the table in question in the dir, and an individual ptn dir hierarchy.
      // c) A dump can be an incremental dump, which means we have several subdirs,
      //    each of which has the evid as the dir name, and each of which corresponds
      //    to an event-level dump. Currently, only CREATE_TABLE and ADD_PARTITION are
      //    handled, so all of these dumps will be at a table/ptn level. For incremental
      //    repl, we will have individual events which can be other things like roles
      //    and fns as well.
      // At this point, all dump dirs should contain a _dumpmetadata file that
      // tells us what is inside that dumpdir.

      DumpMetaData dmd = new DumpMetaData(loadPath, conf);

      boolean evDump = false;
      if (dmd.isIncrementalDump()) {
        LOG.debug("{} contains an incremental dump", loadPath);
        evDump = true;
      } else {
        LOG.debug("{} contains a bootstrap dump", loadPath);
      }

      if ((!evDump) && (tblNameOrPattern != null) && !(tblNameOrPattern.isEmpty())) {
        // not an event dump, and table name pattern specified, this has to be a tbl-level dump
        rootTasks.addAll(analyzeTableLoad(dbNameOrPattern, tblNameOrPattern, path, null, null, null));
        return;
      }

      FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
      if (srcs == null || (srcs.length == 0)) {
        LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
        return;
      }

      FileStatus[] dirsInLoadPath = fs.listStatus(loadPath, EximUtil.getDirectoryFilter(fs));

      if ((dirsInLoadPath == null) || (dirsInLoadPath.length == 0)) {
        throw new IllegalArgumentException("No data to load in path " + loadPath.toUri().toString());
      }

      if (!evDump) {
        // not an event dump, not a table dump - thus, a db dump
        if ((dbNameOrPattern != null) && (dirsInLoadPath.length > 1)) {
          LOG.debug("Found multiple dirs when we expected 1:");
          for (FileStatus d : dirsInLoadPath) {
            LOG.debug("> " + d.getPath().toUri().toString());
          }
          throw new IllegalArgumentException(
              "Multiple dirs in " + loadPath.toUri().toString()
                  + " do not correspond to a REPL LOAD expecting to load to a singular destination point.");
        }

        for (FileStatus dir : dirsInLoadPath) {
          analyzeDatabaseLoad(dbNameOrPattern, fs, dir);
        }
      } else {
        // event dump, each subdir is an individual event dump.
        // we need to guarantee that the directory listing we get is in order of evid.
        Arrays.sort(dirsInLoadPath);

        Task<? extends Serializable> evTaskRoot = TaskFactory.get(new DependencyCollectionWork(), conf);
        Task<? extends Serializable> taskChainTail = evTaskRoot;

        int evstage = 0;
        int evIter = 0;
        Long lastEvid = null;
        Map<String, Long> dbsUpdated = new ReplicationSpec.ReplStateMap<String, Long>();
        Map<String, Long> tablesUpdated = new ReplicationSpec.ReplStateMap<String, Long>();

        REPL_STATE_LOG.info("Repl Load: Started analyzing Repl load for DB: {} from path {}, Dump Type: INCREMENTAL",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?",
            loadPath.toUri().toString());

        for (FileStatus dir : dirsInLoadPath) {
          LOG.debug("Loading event from {} to {}.{}", dir.getPath().toUri(), dbNameOrPattern, tblNameOrPattern);
          // event loads will behave similar to table loads, with one crucial difference:
          // precursor order is strict, and each event must be processed after the previous one.
          // The way we handle this strict order is as follows:
          // First, we start with a taskChainTail which is a dummy noop task (a DependencyCollectionTask)
          // at the head of our event chain. For each event we process, we tell analyzeTableLoad to
          // create tasks that use the taskChainTail as a dependency. Then, we collect all those tasks
          // and introduce a new barrier task (also a DependencyCollectionTask) which depends on all
          // these tasks. Then, this barrier task becomes our new taskChainTail. Thus, we get a set of
          // tasks as follows:
          //
          //                 --->ev1.task1--                          --->ev2.task1--
          //                /               \                        /               \
          //  evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
          //                \               /
          //                 --->ev1.task3--
          //
          // Once this entire chain is generated, we add evTaskRoot to rootTasks, so as to execute the
          // entire chain.

          String locn = dir.getPath().toUri().toString();
          DumpMetaData eventDmd = new DumpMetaData(new Path(locn), conf);
          List<Task<? extends Serializable>> evTasks = analyzeEventLoad(
              dbNameOrPattern, tblNameOrPattern, locn, taskChainTail,
              dbsUpdated, tablesUpdated, eventDmd);
          evIter++;
          REPL_STATE_LOG.info("Repl Load: Analyzed load for event {}/{} "
                  + "with ID: {}, Type: {}, Path: {}",
              evIter, dirsInLoadPath.length,
              dir.getPath().getName(), eventDmd.getDumpType().toString(), locn);

          LOG.debug("evstage#{} got {} tasks", evstage, evTasks != null ? evTasks.size() : 0);
          if ((evTasks != null) && (!evTasks.isEmpty())) {
            Task<? extends Serializable> barrierTask = TaskFactory.get(new DependencyCollectionWork(), conf);
            for (Task<? extends Serializable> t : evTasks) {
              t.addDependentTask(barrierTask);
              LOG.debug("Added {}:{} as a precursor of barrier task {}:{}",
                  t.getClass(), t.getId(), barrierTask.getClass(), barrierTask.getId());
            }
            LOG.debug("Updated taskChainTail from {}{} to {}{}",
                taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
            taskChainTail = barrierTask;
            evstage++;
            lastEvid = dmd.getEventTo();
          }
        }

        // Now, we need to update repl.last.id for the various parent objects that were updated.
        // This update logic will work differently based on what "level" REPL LOAD was run on.
        // a) If this was a REPL LOAD at a table level, i.e. both dbNameOrPattern and
        //    tblNameOrPattern were specified, then the table is the only thing we should
        //    update the repl.last.id for.
        // b) If this was a db-level REPL LOAD, then we should update the db, as well as any
        //    tables affected by partition level operations. (Any table level ops will
        //    automatically be updated as the table gets updated.) Note - renames will need
        //    careful handling.
        // c) If this was a wh-level REPL LOAD, then we should update every db for which there
        //    were events occurring, as well as tables for which there were ptn-level ops
        //    happening. Again, renames must be taken care of.
        //
        // So, what we're going to do is have each event load update dbsUpdated and tablesUpdated
        // accordingly, but ignore updates to tablesUpdated & dbsUpdated in the case of a
        // table-level REPL LOAD, using only the table itself. In the case of a db-level REPL
        // LOAD, we ignore dbsUpdated, but inject our own, and do not ignore tblsUpdated.
        // And for wh-level, we do no special processing, and use all of dbsUpdated and
        // tblsUpdated as-is.

        // Additional Note - although this var says "dbNameOrPattern", on the REPL LOAD side
        // we do not support a pattern. It can be null or empty, in which case
        // we re-use the existing name from the dump, or it can be specified,
        // in which case we honour it. However, having this be a pattern is an error.
        // Ditto for tblNameOrPattern.
        if (evstage > 0) {
          if ((tblNameOrPattern != null) && (!tblNameOrPattern.isEmpty())) {
            // if tblNameOrPattern is specified, then dbNameOrPattern will be too, and
            // thus, this is a table-level REPL LOAD - only the table needs updating.
            // If any of the individual events logged any other dbs as having changed,
            // null them out.
            dbsUpdated.clear();
            tablesUpdated.clear();
            tablesUpdated.put(dbNameOrPattern + "." + tblNameOrPattern, lastEvid);
          } else if ((dbNameOrPattern != null) && (!dbNameOrPattern.isEmpty())) {
            // if dbNameOrPattern is specified and tblNameOrPattern isn't, this is a
            // db-level update, and thus, the database needs updating. In addition,
            // any tables the events logged in tablesUpdated are honoured as-is.
            dbsUpdated.clear();
            dbsUpdated.put(dbNameOrPattern, lastEvid);
          }
        }

        for (String tableName : tablesUpdated.keySet()) {
          // weird - AlterTableDesc requires a HashMap to update props instead of a Map.
          HashMap<String, String> mapProp = new HashMap<String, String>();
          mapProp.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(),
              tablesUpdated.get(tableName).toString());
          AlterTableDesc alterTblDesc = new AlterTableDesc(
              AlterTableDesc.AlterTableTypes.ADDPROPS, null, false);
          alterTblDesc.setProps(mapProp);
          alterTblDesc.setOldName(tableName);
          Task<? extends Serializable> updateReplIdTask = TaskFactory.get(
              new DDLWork(inputs, outputs, alterTblDesc), conf);
          taskChainTail.addDependentTask(updateReplIdTask);
          taskChainTail = updateReplIdTask;
        }
        for (String dbName : dbsUpdated.keySet()) {
          Map<String, String> mapProp = new HashMap<String, String>();
          mapProp.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(),
              dbsUpdated.get(dbName).toString());
          AlterDatabaseDesc alterDbDesc = new AlterDatabaseDesc(dbName, mapProp);
          Task<? extends Serializable> updateReplIdTask = TaskFactory.get(
              new DDLWork(inputs, outputs, alterDbDesc), conf);
          taskChainTail.addDependentTask(updateReplIdTask);
          taskChainTail = updateReplIdTask;
        }
        rootTasks.add(evTaskRoot);
        REPL_STATE_LOG.info("Repl Load: Completed analyzing Repl load for DB: {} from path {} and created import "
                + "(DDL/COPY/MOVE) tasks",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?",
            loadPath.toUri().toString());
      }
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
  }

  private List<Task<? extends Serializable>> analyzeEventLoad(
      String dbName, String tblName, String location, Task<? extends Serializable> precursor,
      Map<String, Long> dbsUpdated, Map<String, Long> tablesUpdated, DumpMetaData dmd)
      throws SemanticException {
    MessageHandler.Context context =
        new MessageHandler.Context(dbName, tblName, location, precursor, dmd, conf, db, ctx, LOG);
    MessageHandler messageHandler = dmd.getDumpType().handler();
    List<Task<? extends Serializable>> tasks = messageHandler.handle(context);

    if (precursor != null) {
      for (Task<? extends Serializable> t : tasks) {
        precursor.addDependentTask(t);
        LOG.debug("Added {}:{} as a precursor of {}:{}",
            precursor.getClass(), precursor.getId(), t.getClass(), t.getId());
      }
    }

    dbsUpdated.putAll(messageHandler.databasesUpdated());
    tablesUpdated.putAll(messageHandler.tablesUpdated());
    inputs.addAll(messageHandler.readEntities());
    outputs.addAll(messageHandler.writeEntities());
    return tasks;
  }
  private boolean existEmptyDb(String dbName) throws InvalidOperationException, HiveException {
    Hive hiveDb = Hive.get();
    Database db = hiveDb.getDatabase(dbName);
    if (null != db) {
      List<String> allTables = hiveDb.getAllTables(dbName);
      List<String> allFunctions = hiveDb.getFunctions(dbName, "*");
      if (!allTables.isEmpty()) {
        throw new InvalidOperationException(
            "Database " + db.getName() + " is not empty. One or more tables exist.");
      }
      if (!allFunctions.isEmpty()) {
        throw new InvalidOperationException(
            "Database " + db.getName() + " is not empty. One or more functions exist.");
      }
      return true;
    }
    return false;
  }

  private void analyzeDatabaseLoad(String dbName, FileSystem fs, FileStatus dir)
      throws SemanticException {
    try {
      // Path being passed to us is a db dump location. We go ahead and load as needed.
      // dbName might be null or empty, in which case we keep the original db name for the new
      // database creation.

      // Two steps here - first, we read the _metadata file here, and create a CreateDatabaseDesc
      // associated with that. Then, we iterate over all subdirs, and create table imports for each.

      MetaData rv = new MetaData();
      try {
        rv = EximUtil.readMetaData(fs, new Path(dir.getPath(), EximUtil.METADATA_NAME));
      } catch (IOException e) {
        throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(), e);
      }

      Database dbObj = rv.getDatabase();

      if (dbObj == null) {
        throw new IllegalArgumentException(
            "_metadata file read did not contain a db object - invalid dump.");
      }

      if ((dbName == null) || (dbName.isEmpty())) {
        // We use the dbName specified as long as it is not null/empty. Otherwise, we use the
        // original name recorded in the thrift object.
        dbName = dbObj.getName();
      }

      REPL_STATE_LOG.info("Repl Load: Started analyzing Repl Load for DB: {} from Dump Dir: {}, Dump Type: BOOTSTRAP",
          dbName, dir.getPath().toUri().toString());

      Task<? extends Serializable> dbRootTask = null;
      if (existEmptyDb(dbName)) {
        AlterDatabaseDesc alterDbDesc = new AlterDatabaseDesc(dbName, dbObj.getParameters());
        dbRootTask = TaskFactory.get(new DDLWork(inputs, outputs, alterDbDesc), conf);
      } else {
        CreateDatabaseDesc createDbDesc = new CreateDatabaseDesc();
        createDbDesc.setName(dbName);
        createDbDesc.setComment(dbObj.getDescription());
        createDbDesc.setDatabaseProperties(dbObj.getParameters());
        // note that we do not set location - for repl load, we want that auto-created.

        createDbDesc.setIfNotExists(false);
        // If it exists, we want this to be an error condition. Repl Load is not intended to
        // replace a db.
        // TODO: we might revisit this in create-drop-recreate cases; needs some thinking.
        dbRootTask = TaskFactory.get(new DDLWork(inputs, outputs, createDbDesc), conf);
      }
      rootTasks.add(dbRootTask);

      FileStatus[] dirsInDbPath = fs.listStatus(dir.getPath(), EximUtil.getDirectoryFilter(fs));

      for (FileStatus tableDir : Collections2.filter(Arrays.asList(dirsInDbPath), new TableDirPredicate())) {
        analyzeTableLoad(
            dbName, null, tableDir.getPath().toUri().toString(), dbRootTask, null, null);
        REPL_STATE_LOG.info("Repl Load: Analyzed table/view/partition load from path {}",
            tableDir.getPath().toUri().toString());
      }

      // Function load
      Path functionMetaDataRoot = new Path(dir.getPath(), FUNCTIONS_ROOT_DIR_NAME);
      if (fs.exists(functionMetaDataRoot)) {
        List<FileStatus> functionDirectories =
            Arrays.asList(fs.listStatus(functionMetaDataRoot, EximUtil.getDirectoryFilter(fs)));
        for (FileStatus functionDir : functionDirectories) {
          analyzeFunctionLoad(dbName, functionDir, dbRootTask);
          REPL_STATE_LOG.info("Repl Load: Analyzed function load from path {}",
              functionDir.getPath().toUri().toString());
        }
      }
      REPL_STATE_LOG.info("Repl Load: Completed analyzing Repl Load for DB: {} and created import (DDL/COPY/MOVE) tasks",
          dbName);
    } catch (Exception e) {
      throw new SemanticException(e);
    }
  }

  private static class TableDirPredicate implements Predicate<FileStatus> {
    @Override
    public boolean apply(FileStatus fileStatus) {
      return !fileStatus.getPath().getName().contains(FUNCTIONS_ROOT_DIR_NAME);
    }
  }

  private void analyzeFunctionLoad(String dbName, FileStatus functionDir,
      Task<? extends Serializable> createDbTask) throws IOException, SemanticException {
    URI fromURI = EximUtil
        .getValidatedURI(conf, stripQuotes(functionDir.getPath().toUri().toString()));
    Path fromPath = new Path(fromURI.getScheme(), fromURI.getAuthority(), fromURI.getPath());

    FileSystem fs = FileSystem.get(fromURI, conf);
    inputs.add(toReadEntity(fromPath, conf));

    try {
      MetaData metaData = EximUtil.readMetaData(fs, new Path(fromPath, EximUtil.METADATA_NAME));
      ReplicationSpec replicationSpec = metaData.getReplicationSpec();
      if (replicationSpec.isNoop()) {
        // nothing to do here, silently return.
        return;
      }
      CreateFunctionDesc desc = new CreateFunctionDesc(
          dbName + "." + metaData.function.getFunctionName(),
          false,
          metaData.function.getClassName(),
          metaData.function.getResourceUris()
      );

      Task<FunctionWork> currentTask = TaskFactory.get(new FunctionWork(desc), conf);
      if (createDbTask != null) {
        createDbTask.addDependentTask(currentTask);
        LOG.debug("Added {}:{} as a precursor of {}:{}",
            createDbTask.getClass(), createDbTask.getId(),
            currentTask.getClass(), currentTask.getId());
      }
    } catch (IOException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(), e);
    }
  }
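  // Table-level loads below delegate to ImportSemanticAnalyzer.prepareImport(), with the
  // location/external/partition-spec arguments pinned to the values a replication-scoped
  // import expects (see the local flags at the top of analyzeTableLoad).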
  private List<Task<? extends Serializable>> analyzeTableLoad(
      String dbName, String tblName, String locn,
      Task<? extends Serializable> precursor,
      Map<String, Long> dbsUpdated, Map<String, Long> tablesUpdated) throws SemanticException {
    // Path being passed to us is a table dump location. We go ahead and load it in as needed.
    // If tblName is null, then we default to the table name specified in _metadata. If both
    // dbName and tblName are specified, that's what we are intended to create the new table as.
    if (dbName == null || dbName.isEmpty()) {
      throw new SemanticException("Database name cannot be null for a table load");
    }
    try {
      // no location set on repl loads
      boolean isLocationSet = false;
      // all repl imports are non-external
      boolean isExternalSet = false;
      // bootstrap loads are not partition level
      boolean isPartSpecSet = false;
      // repl loads are not partition level
      LinkedHashMap<String, String> parsedPartSpec = null;
      // no location for repl imports
      String parsedLocation = null;
      List<Task<? extends Serializable>> importTasks = new ArrayList<Task<? extends Serializable>>();

      EximUtil.SemanticAnalyzerWrapperContext x =
          new EximUtil.SemanticAnalyzerWrapperContext(conf, db, inputs, outputs, importTasks, LOG, ctx);
      ImportSemanticAnalyzer.prepareImport(isLocationSet, isExternalSet, isPartSpecSet,
          (precursor != null), parsedLocation, tblName, dbName, parsedPartSpec, locn, x,
          dbsUpdated, tablesUpdated);

      if (precursor != null) {
        for (Task<? extends Serializable> t : importTasks) {
          precursor.addDependentTask(t);
          LOG.debug("Added {}:{} as a precursor of {}:{}",
              precursor.getClass(), precursor.getId(), t.getClass(), t.getId());
        }
      }

      return importTasks;
    } catch (Exception e) {
      throw new SemanticException(e);
    }
  }

  // REPL STATUS
  private void initReplStatus(ASTNode ast) {
    int numChildren = ast.getChildCount();
    dbNameOrPattern = PlanUtils.stripQuotes(ast.getChild(0).getText());
    if (numChildren > 1) {
      tblNameOrPattern = PlanUtils.stripQuotes(ast.getChild(1).getText());
    }
  }
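  // Illustrative statement forms parsed by initReplStatus() above:
  //   REPL STATUS mydb;        -- report repl.last.id recorded on the database
  //   REPL STATUS mydb.mytbl;  -- report repl.last.id recorded on the table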
  private void analyzeReplStatus(ASTNode ast) throws SemanticException {
    LOG.debug("ReplicationSemanticAnalyzer.analyzeReplStatus: " + String.valueOf(dbNameOrPattern)
        + "." + String.valueOf(tblNameOrPattern));

    String replLastId = null;

    try {
      if (tblNameOrPattern != null) {
        // Checking for status of a table
        Table tbl = db.getTable(dbNameOrPattern, tblNameOrPattern);
        if (tbl != null) {
          inputs.add(new ReadEntity(tbl));
          Map<String, String> params = tbl.getParameters();
          if (params != null && (params.containsKey(ReplicationSpec.KEY.CURR_STATE_ID.toString()))) {
            replLastId = params.get(ReplicationSpec.KEY.CURR_STATE_ID.toString());
          }
        }
      } else {
        // Checking for status of a db
        Database database = db.getDatabase(dbNameOrPattern);
        if (database != null) {
          inputs.add(new ReadEntity(database));
          Map<String, String> params = database.getParameters();
          if (params != null && (params.containsKey(ReplicationSpec.KEY.CURR_STATE_ID.toString()))) {
            replLastId = params.get(ReplicationSpec.KEY.CURR_STATE_ID.toString());
          }
        }
      }
    } catch (HiveException e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }

    prepareReturnValues(Collections.singletonList(replLastId), "last_repl_id#string");
    setFetchTask(createFetchTask("last_repl_id#string"));
    LOG.debug("ReplicationSemanticAnalyzer.analyzeReplStatus: writing repl.last.id={} out to {}",
        String.valueOf(replLastId), ctx.getResFile());
  }

  private void prepareReturnValues(List<String> values, String schema) throws SemanticException {
    LOG.debug("prepareReturnValues : " + schema);
    for (String s : values) {
      LOG.debug("  > " + s);
    }
    ctx.setResFile(ctx.getLocalTmpPath());
    Utils.writeOutput(values, ctx.getResFile(), conf);
  }

  private ReplicationSpec getNewReplicationSpec() throws SemanticException {
    try {
      ReplicationSpec rspec = getNewReplicationSpec("replv2", "will-be-set");
      rspec.setCurrentReplicationState(String.valueOf(db.getMSC()
          .getCurrentNotificationEventId().getEventId()));
      return rspec;
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
  }

  // Use for specifying object state as well as event state
  private ReplicationSpec getNewReplicationSpec(String evState, String objState)
      throws SemanticException {
    return new ReplicationSpec(true, false, evState, objState, false, true, true);
  }

  // Use for replication states focused on event only, where the obj state will be the event state
  private ReplicationSpec getNewEventOnlyReplicationSpec(Long eventId) throws SemanticException {
    return getNewReplicationSpec(eventId.toString(), eventId.toString());
  }

  private Iterable<? extends String> matchesTbl(String dbName, String tblPattern)
      throws HiveException {
    if (tblPattern == null) {
      return removeValuesTemporaryTables(db.getAllTables(dbName));
    } else {
      return db.getTablesByPattern(dbName, tblPattern);
    }
  }

  private final static String TMP_TABLE_PREFIX =
      SemanticAnalyzer.VALUES_TMP_TABLE_NAME_PREFIX.toLowerCase();

  static Iterable<String> removeValuesTemporaryTables(List<String> tableNames) {
    return Collections2.filter(tableNames, tableName -> {
      assert tableName != null;
      return !tableName.toLowerCase().startsWith(TMP_TABLE_PREFIX);
    });
  }

  private Iterable<? extends String> matchesDb(String dbPattern) throws HiveException {
    if (dbPattern == null) {
      return db.getAllDatabases();
    } else {
      return db.getDatabasesByPattern(dbPattern);
    }
  }
}