/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse;

import com.google.common.base.Predicate;
import com.google.common.collect.Collections2;
import com.google.common.primitives.Ints;
import org.antlr.runtime.tree.Tree;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.Database;
import org.apache.hadoop.hive.metastore.api.Function;
import org.apache.hadoop.hive.metastore.api.InvalidOperationException;
import org.apache.hadoop.hive.metastore.api.NotificationEvent;
import org.apache.hadoop.hive.metastore.messaging.EventUtils;
import org.apache.hadoop.hive.metastore.messaging.MessageFactory;
import org.apache.hadoop.hive.metastore.messaging.event.filters.AndFilter;
import org.apache.hadoop.hive.metastore.messaging.event.filters.DatabaseAndTableFilter;
import org.apache.hadoop.hive.metastore.messaging.event.filters.EventBoundaryFilter;
import org.apache.hadoop.hive.metastore.messaging.event.filters.MessageFormatFilter;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.InvalidTableException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.repl.DumpType;
import org.apache.hadoop.hive.ql.parse.repl.dump.HiveWrapper;
import org.apache.hadoop.hive.ql.parse.repl.dump.events.EventHandler;
import org.apache.hadoop.hive.ql.parse.repl.dump.events.EventHandlerFactory;
import org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData;
import org.apache.hadoop.hive.ql.parse.repl.dump.io.FunctionSerializer;
import org.apache.hadoop.hive.ql.parse.repl.dump.io.JsonWriter;
import org.apache.hadoop.hive.ql.parse.repl.dump.Utils;
import org.apache.hadoop.hive.ql.parse.repl.load.MetaData;
import org.apache.hadoop.hive.ql.parse.repl.load.message.MessageHandler;
import org.apache.hadoop.hive.ql.plan.AlterDatabaseDesc;
import org.apache.hadoop.hive.ql.plan.AlterTableDesc;
import org.apache.hadoop.hive.ql.plan.CreateDatabaseDesc;
import org.apache.hadoop.hive.ql.plan.CreateFunctionDesc;
import org.apache.hadoop.hive.ql.plan.DDLWork;
import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork;
import org.apache.hadoop.hive.ql.plan.FunctionWork;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_FROM;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_LIMIT;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_REPL_DUMP;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_REPL_LOAD;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_REPL_STATUS;
import static org.apache.hadoop.hive.ql.parse.HiveParser.TOK_TO;
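/**
 * Semantic analyzer for the replication statements: REPL DUMP, REPL LOAD and REPL STATUS.
 * analyzeInternal() dispatches on the root token to an init*() method that parses the AST,
 * followed by an analyze*() method that generates the corresponding task DAG.
 */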
public class ReplicationSemanticAnalyzer extends BaseSemanticAnalyzer {
  // Database name or pattern
  private String dbNameOrPattern;
  // Table name or pattern
  private String tblNameOrPattern;
  private Long eventFrom;
  private Long eventTo;
  private Integer maxEventLimit;
  // Base path for REPL LOAD
  private String path;

  private static String testInjectDumpDir = null; // unit tests can overwrite this to affect default dump behaviour
  private static final String dumpSchema = "dump_dir,last_repl_id#string,string";

  private static final String FUNCTIONS_ROOT_DIR_NAME = "_functions";
  private static final String FUNCTION_METADATA_DIR_NAME = "_metadata";
  private final static Logger REPL_STATE_LOG = LoggerFactory.getLogger("ReplState");

  ReplicationSemanticAnalyzer(QueryState queryState) throws SemanticException {
    super(queryState);
  }

  @Override
  public void analyzeInternal(ASTNode ast) throws SemanticException {
    LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal");
    LOG.debug(ast.getName() + ":" + ast.getToken().getText() + "=" + ast.getText());
    switch (ast.getToken().getType()) {
      case TOK_REPL_DUMP: {
        LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal: dump");
        initReplDump(ast);
        analyzeReplDump(ast);
        break;
      }
      case TOK_REPL_LOAD: {
        LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal: load");
        initReplLoad(ast);
        analyzeReplLoad(ast);
        break;
      }
      case TOK_REPL_STATUS: {
        LOG.debug("ReplicationSemanticAnalyzer: analyzeInternal: status");
        initReplStatus(ast);
        analyzeReplStatus(ast);
        break;
      }
      default: {
        throw new SemanticException("Unexpected root token");
      }
    }
  }

  private void initReplDump(ASTNode ast) {
    int numChildren = ast.getChildCount();
    dbNameOrPattern = PlanUtils.stripQuotes(ast.getChild(0).getText());
    // skip the first node, which is always required
    int currNode = 1;
    while (currNode < numChildren) {
      if (ast.getChild(currNode).getType() != TOK_FROM) {
        // optional tblName was specified
        tblNameOrPattern = PlanUtils.stripQuotes(ast.getChild(currNode).getText());
      } else {
        // TOK_FROM subtree
        Tree fromNode = ast.getChild(currNode);
        eventFrom = Long.parseLong(PlanUtils.stripQuotes(fromNode.getChild(0).getText()));
        // skip the first child, which is always required
        int numChild = 1;
        while (numChild < fromNode.getChildCount()) {
          if (fromNode.getChild(numChild).getType() == TOK_TO) {
            eventTo =
                Long.parseLong(PlanUtils.stripQuotes(fromNode.getChild(numChild + 1).getText()));
            // skip the next child, since we already took care of it
            numChild++;
          } else if (fromNode.getChild(numChild).getType() == TOK_LIMIT) {
            maxEventLimit =
                Integer.parseInt(PlanUtils.stripQuotes(fromNode.getChild(numChild + 1).getText()));
            // skip the next child, since we already took care of it
            numChild++;
          }
          // move to the next child in the FROM tree
          numChild++;
        }
        // FROM node is always the last
        break;
      }
      // move to the next root node
      currNode++;
    }
  }
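  // For readability, a sketch of the statement forms initReplDump() above accepts
  // (illustrative only - the REPL grammar in HiveParser is authoritative):
  //   REPL DUMP mydb;                          -- bootstrap dump of a database
  //   REPL DUMP mydb.mytbl;                    -- bootstrap dump of a single table
  //   REPL DUMP mydb FROM 100 TO 200 LIMIT 25; -- incremental dump of events 100..200, capped at 25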
  // REPL DUMP
  private void analyzeReplDump(ASTNode ast) throws SemanticException {
    LOG.debug("ReplicationSemanticAnalyzer.analyzeReplDump: " + String.valueOf(dbNameOrPattern)
        + "." + String.valueOf(tblNameOrPattern) + " from " + String.valueOf(eventFrom)
        + " to " + String.valueOf(eventTo) + " maxEventLimit " + String.valueOf(maxEventLimit));
    String replRoot = conf.getVar(HiveConf.ConfVars.REPLDIR);
    Path dumpRoot = new Path(replRoot, getNextDumpDir());
    DumpMetaData dmd = new DumpMetaData(dumpRoot, conf);
    Path cmRoot = new Path(conf.getVar(HiveConf.ConfVars.REPLCMDIR));
    Long lastReplId;
    try {
      if (eventFrom == null) {
        // bootstrap case
        Long bootDumpBeginReplId = db.getMSC().getCurrentNotificationEventId().getEventId();
        for (String dbName : matchesDb(dbNameOrPattern)) {
          REPL_STATE_LOG.info("Repl Dump: Started analyzing Repl Dump for DB: {}, Dump Type: BOOTSTRAP", dbName);
          LOG.debug("ReplicationSemanticAnalyzer: analyzeReplDump dumping db: " + dbName);
          Path dbRoot = dumpDbMetadata(dbName, dumpRoot);
          dumpFunctionMetadata(dbName, dumpRoot);
          for (String tblName : matchesTbl(dbName, tblNameOrPattern)) {
            LOG.debug("ReplicationSemanticAnalyzer: analyzeReplDump dumping table: " + tblName
                + " to db root " + dbRoot.toUri());
            dumpTbl(ast, dbName, tblName, dbRoot);
          }
          REPL_STATE_LOG.info("Repl Dump: Completed analyzing Repl Dump for DB: {} and created {} COPY tasks to dump "
              + "metadata and data", dbName, rootTasks.size());
        }
        Long bootDumpEndReplId = db.getMSC().getCurrentNotificationEventId().getEventId();
        LOG.info("Bootstrap object dump phase took from {} to {}", bootDumpBeginReplId, bootDumpEndReplId);

        // Now that bootstrap has dumped all objects related, we have to account for the changes
        // that occurred while bootstrap was happening - i.e. we have to look through all events
        // during the bootstrap period and consolidate them with our dump.
        IMetaStoreClient.NotificationFilter evFilter =
            new DatabaseAndTableFilter(dbNameOrPattern, tblNameOrPattern);
        EventUtils.MSClientNotificationFetcher evFetcher =
            new EventUtils.MSClientNotificationFetcher(db.getMSC());
        EventUtils.NotificationEventIterator evIter = new EventUtils.NotificationEventIterator(
            evFetcher, bootDumpBeginReplId,
            Ints.checkedCast(bootDumpEndReplId - bootDumpBeginReplId) + 1,
            evFilter);

        // Now we consolidate all the events that happened during the objdump into the objdump
        while (evIter.hasNext()) {
          NotificationEvent ev = evIter.next();
          Path evRoot = new Path(dumpRoot, String.valueOf(ev.getEventId()));
          // FIXME : implement consolidateEvent(..) similar to dumpEvent(ev,evRoot)
        }
        LOG.info("Consolidation done, preparing to return {},{}->{}",
            dumpRoot.toUri(), bootDumpBeginReplId, bootDumpEndReplId);
        dmd.setDump(DumpType.BOOTSTRAP, bootDumpBeginReplId, bootDumpEndReplId, cmRoot);
        dmd.write();

        // Set the correct last repl id to return to the user
        lastReplId = bootDumpEndReplId;
      } else {
        // get a list of events matching dbPattern & tblPattern, then
        // go through each event, and dump out each event to an event-level dump dir inside dumpRoot
        if (eventTo == null) {
          eventTo = db.getMSC().getCurrentNotificationEventId().getEventId();
          LOG.debug("eventTo not specified, using current event id : {}", eventTo);
        } else if (eventTo < eventFrom) {
          throw new Exception("Invalid event ID input received in TO clause");
        }

        Integer maxRange = Ints.checkedCast(eventTo - eventFrom + 1);
        if ((maxEventLimit == null) || (maxEventLimit > maxRange)) {
          maxEventLimit = maxRange;
        }

        // TODO : instead of simply restricting by message format, we should eventually
        // move to a jdbc-driver-style registering of message formats, picking a message
        // factory per event to decode. For now, however, since all messages have the
        // same factory, restricting by message format is effectively a guard against
        // older leftover data that would cause us problems.
        IMetaStoreClient.NotificationFilter evFilter = new AndFilter(
            new DatabaseAndTableFilter(dbNameOrPattern, tblNameOrPattern),
            new EventBoundaryFilter(eventFrom, eventTo),
            new MessageFormatFilter(MessageFactory.getInstance().getMessageFormat()));

        EventUtils.MSClientNotificationFetcher evFetcher =
            new EventUtils.MSClientNotificationFetcher(db.getMSC());
        EventUtils.NotificationEventIterator evIter = new EventUtils.NotificationEventIterator(
            evFetcher, eventFrom, maxEventLimit, evFilter);

        lastReplId = eventTo;
        REPL_STATE_LOG.info("Repl Dump: Started Repl Dump for DB: {}, Dump Type: INCREMENTAL",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?");
        while (evIter.hasNext()) {
          NotificationEvent ev = evIter.next();
          lastReplId = ev.getEventId();
          Path evRoot = new Path(dumpRoot, String.valueOf(lastReplId));
          dumpEvent(ev, evRoot, cmRoot);
        }

        REPL_STATE_LOG.info("Repl Dump: Completed Repl Dump for DB: {}",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?");

        LOG.info("Done dumping events, preparing to return {},{}", dumpRoot.toUri(), lastReplId);
        Utils.writeOutput(
            Arrays.asList(
                "incremental",
                String.valueOf(eventFrom),
                String.valueOf(lastReplId)
            ),
            dmd.getDumpFilePath(), conf);
        dmd.setDump(DumpType.INCREMENTAL, eventFrom, lastReplId, cmRoot);
        dmd.write();
      }
      prepareReturnValues(Arrays.asList(dumpRoot.toUri().toString(), String.valueOf(lastReplId)),
          dumpSchema);
      setFetchTask(createFetchTask(dumpSchema));
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      LOG.warn("Error during analyzeReplDump", e);
      throw new SemanticException(e);
    }
  }
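  // The fetch task set up above returns a single row shaped by dumpSchema, i.e. a
  // (dump_dir, last_repl_id) pair; for example (values are hypothetical):
  //   hdfs://repl/staging/next    10234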
  private void dumpEvent(NotificationEvent ev, Path evRoot, Path cmRoot) throws Exception {
    EventHandler.Context context = new EventHandler.Context(
        evRoot,
        cmRoot,
        db,
        conf,
        getNewEventOnlyReplicationSpec(ev.getEventId())
    );
    EventHandlerFactory.handlerFor(ev).handle(context);
    REPL_STATE_LOG.info("Repl Dump: Dumped event with ID: {}, Type: {} and dumped metadata and data to path {}",
        String.valueOf(ev.getEventId()), ev.getEventType(), evRoot.toUri().toString());
  }

  public static void injectNextDumpDirForTest(String dumpdir) {
    testInjectDumpDir = dumpdir;
  }

  private String getNextDumpDir() {
    if (conf.getBoolVar(HiveConf.ConfVars.HIVE_IN_TEST)) {
      // make it easy to write .q unit tests, instead of unique id generation.
      // however, this does mean that in writing tests, we have to be aware that
      // repl dump will clash with prior dumps, and thus have to clean up properly.
      if (testInjectDumpDir == null) {
        return "next";
      } else {
        return testInjectDumpDir;
      }
    } else {
      return String.valueOf(System.currentTimeMillis());
      // TODO: time is good enough for now - we'll likely improve this.
      // We may also work in something equivalent to a pid or thread id, and move to nanos,
      // to ensure uniqueness.
    }
  }

  /**
   * @param dbName database to dump
   * @param dumpRoot root directory of the dump
   * @return db dumped path
   * @throws SemanticException
   */
  private Path dumpDbMetadata(String dbName, Path dumpRoot) throws SemanticException {
    Path dbRoot = new Path(dumpRoot, dbName);
    try {
      // TODO : instantiating FS objects is generally costly. Refactor
      FileSystem fs = dbRoot.getFileSystem(conf);
      Path dumpPath = new Path(dbRoot, EximUtil.METADATA_NAME);
      HiveWrapper.Tuple<Database> database = new HiveWrapper(db, dbName).database();
      EximUtil.createDbExportDump(fs, dumpPath, database.object, database.replicationSpec);
      REPL_STATE_LOG.info("Repl Dump: Dumped DB metadata");
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
    return dbRoot;
  }

  private void dumpFunctionMetadata(String dbName, Path dumpRoot) throws SemanticException {
    Path functionsRoot = new Path(new Path(dumpRoot, dbName), FUNCTIONS_ROOT_DIR_NAME);
    try {
      // TODO : This should ideally return the Function objects rather than the function names;
      // resolving names to objects should be done by the caller. Look at this separately.
      List<String> functionNames = db.getFunctions(dbName, "*");
      for (String functionName : functionNames) {
        HiveWrapper.Tuple<Function> tuple;
        try {
          tuple = new HiveWrapper(db, dbName).function(functionName);
        } catch (HiveException e) {
          // This can happen because we query getFunctions() before fetching the actual function:
          // in between, a user may have dropped the function, in which case our call will fail.
          LOG.info("Function " + functionName
              + " could not be found, we are ignoring it as it can be a valid state", e);
          continue;
        }
        if (tuple.object.getResourceUris().isEmpty()) {
          REPL_STATE_LOG.warn("Not replicating function: " + functionName
              + " as it seems to have been created without a USING clause");
          continue;
        }
        Path functionMetadataRoot =
            new Path(new Path(functionsRoot, functionName), FUNCTION_METADATA_DIR_NAME);
        try (JsonWriter jsonWriter =
            new JsonWriter(functionMetadataRoot.getFileSystem(conf), functionMetadataRoot)) {
          new FunctionSerializer(tuple.object).writeTo(jsonWriter, tuple.replicationSpec);
        }
        REPL_STATE_LOG.info("Repl Dump: Dumped metadata for function: {}", functionName);
      }
    } catch (Exception e) {
      throw new SemanticException(e);
    }
  }
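  // Layout note: the loop above writes each function's metadata under
  // <dumpRoot>/<dbName>/_functions/<functionName>/_metadata
  // (see FUNCTIONS_ROOT_DIR_NAME and FUNCTION_METADATA_DIR_NAME).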
  /**
   * @param ast dump AST node
   * @param dbName database the table belongs to
   * @param tblName table to dump
   * @param dbRoot dump directory of the database
   * @return tbl dumped path
   * @throws SemanticException
   */
  private Path dumpTbl(ASTNode ast, String dbName, String tblName, Path dbRoot)
      throws SemanticException {
    Path tableRoot = new Path(dbRoot, tblName);
    try {
      URI toURI = EximUtil.getValidatedURI(conf, tableRoot.toUri().toString());
      TableSpec ts = new TableSpec(db, conf, dbName + "." + tblName, null);
      ExportSemanticAnalyzer.prepareExport(ast, toURI, ts, getNewReplicationSpec(), db, conf, ctx,
          rootTasks, inputs, outputs, LOG);
      REPL_STATE_LOG.info("Repl Dump: Analyzed dump for table/view: {}.{} and created copy tasks to dump metadata "
          + "and data to path {}", dbName, tblName, toURI.toString());
    } catch (InvalidTableException te) {
      // Bootstrap dump shouldn't fail if the table is dropped/renamed while dumping it.
      // Just log a debug message and skip it.
      LOG.debug(te.getMessage());
      return null;
    } catch (HiveException e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
    return tableRoot;
  }

  // REPL LOAD
  private void initReplLoad(ASTNode ast) {
    int numChildren = ast.getChildCount();
    path = PlanUtils.stripQuotes(ast.getChild(0).getText());
    if (numChildren > 1) {
      dbNameOrPattern = PlanUtils.stripQuotes(ast.getChild(1).getText());
    }
    if (numChildren > 2) {
      tblNameOrPattern = PlanUtils.stripQuotes(ast.getChild(2).getText());
    }
  }
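  // A sketch of the statement forms initReplLoad() above accepts (illustrative only -
  // the first AST child is always the dump path, the optional children name the destination):
  //   REPL LOAD FROM '/staging/blah12345';        -- load using the db name(s) recorded in the dump
  //   REPL LOAD newdb FROM '/staging/blah12345';  -- load a single-db dump into 'newdb'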
  /*
   * Example dump dirs we need to be able to handle :
   *
   * for: hive.repl.rootdir = staging/
   * Then, repl dumps will be created in staging/<dumpdir>
   *
   * single-db-dump: staging/blah12345 will contain a db dir for the db specified
   *  blah12345/
   *   default/
   *    _metadata
   *    tbl1/
   *      _metadata
   *      dt=20160907/
   *        _files
   *    tbl2/
   *    tbl3/
   *    unptn_tbl/
   *      _metadata
   *      _files
   *
   * multi-db-dump: staging/bar12347 will contain dirs for each db covered
   * staging/
   *  bar12347/
   *   default/
   *     ...
   *   sales/
   *     ...
   *
   * single table-dump: staging/baz123 will contain a table object dump inside
   * staging/
   *  baz123/
   *    _metadata
   *    dt=20150931/
   *      _files
   *
   * incremental dump : staging/blue123 will contain dirs for each event inside.
   * staging/
   *  blue123/
   *    34/
   *    35/
   *    36/
   */
  private void analyzeReplLoad(ASTNode ast) throws SemanticException {
    LOG.debug("ReplSemanticAnalyzer.analyzeReplLoad: " + String.valueOf(dbNameOrPattern) + "."
        + String.valueOf(tblNameOrPattern) + " from " + String.valueOf(path));

    // for analyze repl load, we walk through the dir structure available in the path,
    // looking at each db, and then each table, and then setting up the appropriate
    // import job in its place.
    try {
      Path loadPath = new Path(path);
      final FileSystem fs = loadPath.getFileSystem(conf);

      if (!fs.exists(loadPath)) {
        // supposed dump path does not exist
        throw new FileNotFoundException(loadPath.toUri().toString());
      }

      // Now, the dumped path can be one of three things:
      // a) It can be a db dump, in which case we expect a set of dirs, each with a
      //    db name, and with a _metadata file in each, and table dirs inside that.
      // b) It can be a table dump dir, in which case we expect a _metadata dump of
      //    the table in question in the dir, and an individual ptn dir hierarchy.
      // c) A dump can be an incremental dump, which means we have several subdirs,
      //    each of which has the evid as the dir name, and each of which corresponds
      //    to an event-level dump. Currently, only CREATE_TABLE and ADD_PARTITION are
      //    handled, so all of these dumps will be at a table/ptn level. For incremental
      //    repl, we will have individual events which can be other things like roles
      //    and fns as well.
      // At this point, all dump dirs should contain a _dumpmetadata file that
      // tells us what is inside that dumpdir.

      DumpMetaData dmd = new DumpMetaData(loadPath, conf);

      boolean evDump = false;
      if (dmd.isIncrementalDump()) {
        LOG.debug("{} contains an incremental dump", loadPath);
        evDump = true;
      } else {
        LOG.debug("{} contains a bootstrap dump", loadPath);
      }

      if ((!evDump) && (tblNameOrPattern != null) && !(tblNameOrPattern.isEmpty())) {
        // not an event dump, and table name pattern specified, this has to be a tbl-level dump
        rootTasks.addAll(analyzeTableLoad(dbNameOrPattern, tblNameOrPattern, path, null, null, null));
        return;
      }

      FileStatus[] srcs = LoadSemanticAnalyzer.matchFilesOrDir(fs, loadPath);
      if (srcs == null || (srcs.length == 0)) {
        LOG.warn("Nothing to load at {}", loadPath.toUri().toString());
        return;
      }

      FileStatus[] dirsInLoadPath = fs.listStatus(loadPath, EximUtil.getDirectoryFilter(fs));

      if ((dirsInLoadPath == null) || (dirsInLoadPath.length == 0)) {
        throw new IllegalArgumentException("No data to load in path " + loadPath.toUri().toString());
      }

      if (!evDump) {
        // not an event dump, not a table dump - thus, a db dump
        if ((dbNameOrPattern != null) && (dirsInLoadPath.length > 1)) {
          LOG.debug("Found multiple dirs when we expected 1:");
          for (FileStatus d : dirsInLoadPath) {
            LOG.debug("> " + d.getPath().toUri().toString());
          }
          throw new IllegalArgumentException(
              "Multiple dirs in " + loadPath.toUri().toString()
                  + " do not correspond to a REPL LOAD expecting to load to a singular destination point.");
        }

        for (FileStatus dir : dirsInLoadPath) {
          analyzeDatabaseLoad(dbNameOrPattern, fs, dir);
        }
      } else {
        // event dump, each subdir is an individual event dump.
        // we need to guarantee that the directory listing we get is in order of evid.
        Arrays.sort(dirsInLoadPath);

        Task<? extends Serializable> evTaskRoot = TaskFactory.get(new DependencyCollectionWork(), conf);
        Task<? extends Serializable> taskChainTail = evTaskRoot;

        int evstage = 0;
        int evIter = 0;
        Long lastEvid = null;
        Map<String, Long> dbsUpdated = new ReplicationSpec.ReplStateMap<String, Long>();
        Map<String, Long> tablesUpdated = new ReplicationSpec.ReplStateMap<String, Long>();

        REPL_STATE_LOG.info("Repl Load: Started analyzing Repl load for DB: {} from path {}, Dump Type: INCREMENTAL",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?",
            loadPath.toUri().toString());

        for (FileStatus dir : dirsInLoadPath) {
          LOG.debug("Loading event from {} to {}.{}", dir.getPath().toUri(), dbNameOrPattern, tblNameOrPattern);
          // event loads will behave similar to table loads, with one crucial difference:
          // precursor order is strict, and each event must be processed after the previous one.
          // The way we handle this strict order is as follows:
          // First, we start with a taskChainTail which is a dummy noop task (a DependencyCollectionTask)
          // at the head of our event chain. For each event we process, we tell analyzeTableLoad to
          // create tasks that use the taskChainTail as a dependency. Then, we collect all those tasks
          // and introduce a new barrier task (also a DependencyCollectionTask) which depends on all
          // these tasks. Then, this barrier task becomes our new taskChainTail. Thus, we get a set of
          // tasks as follows:
          //
          //                 --->ev1.task1--                          --->ev2.task1--
          //                /               \                        /               \
          //  evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
          //                \               /
          //                 --->ev1.task3--
          //
          // Once this entire chain is generated, we add evTaskRoot to rootTasks, so as to execute the
          // entire chain.

          String locn = dir.getPath().toUri().toString();
          DumpMetaData eventDmd = new DumpMetaData(new Path(locn), conf);
          List<Task<? extends Serializable>> evTasks = analyzeEventLoad(
              dbNameOrPattern, tblNameOrPattern, locn, taskChainTail,
              dbsUpdated, tablesUpdated, eventDmd);
          evIter++;
          REPL_STATE_LOG.info("Repl Load: Analyzed load for event {}/{} "
                  + "with ID: {}, Type: {}, Path: {}",
              evIter, dirsInLoadPath.length,
              dir.getPath().getName(), eventDmd.getDumpType().toString(), locn);

          LOG.debug("evstage#{} got {} tasks", evstage, evTasks != null ? evTasks.size() : 0);
          if ((evTasks != null) && (!evTasks.isEmpty())) {
            Task<? extends Serializable> barrierTask = TaskFactory.get(new DependencyCollectionWork(), conf);
            for (Task<? extends Serializable> t : evTasks) {
              t.addDependentTask(barrierTask);
              LOG.debug("Added {}:{} as a precursor of barrier task {}:{}",
                  t.getClass(), t.getId(), barrierTask.getClass(), barrierTask.getId());
            }
            LOG.debug("Updated taskChainTail from {}{} to {}{}",
                taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
            taskChainTail = barrierTask;
            evstage++;
            lastEvid = dmd.getEventTo();
          }
        }

        // Now, we need to update repl.last.id for the various parent objects that were updated.
        // This update logic will work differently based on what "level" REPL LOAD was run on.
        // a) If this was a REPL LOAD at a table level, i.e. both dbNameOrPattern and
        //    tblNameOrPattern were specified, then the table is the only thing we should
        //    update the repl.last.id for.
        // b) If this was a db-level REPL LOAD, then we should update the db, as well as any
        //    tables affected by partition level operations. (Any table level ops will
        //    automatically be updated as the table gets updated.) Note - renames will need
        //    careful handling.
        // c) If this was a wh-level REPL LOAD, then we should update every db for which there
        //    were events occurring, as well as tables for which there were ptn-level ops
        //    happening. Again, renames must be taken care of.
        //
        // So, what we're going to do is have each event load update dbsUpdated and tablesUpdated
        // accordingly, but ignore updates to tablesUpdated & dbsUpdated in the case of a
        // table-level REPL LOAD, using only the table itself. In the case of a db-level REPL
        // LOAD, we ignore dbsUpdated, but inject our own, and do not ignore tblsUpdated.
        // And for wh-level, we do no special processing, and use all of dbsUpdated and
        // tblsUpdated as-is.

        // Additional Note - although this var says "dbNameOrPattern", on the REPL LOAD side
        // we do not support a pattern. It can be null or empty, in which case
        // we re-use the existing name from the dump, or it can be specified,
        // in which case we honour it. However, having this be a pattern is an error.
        // Ditto for tblNameOrPattern.
        if (evstage > 0) {
          if ((tblNameOrPattern != null) && (!tblNameOrPattern.isEmpty())) {
            // if tblNameOrPattern is specified, then dbNameOrPattern will be too, and
            // thus, this is a table-level REPL LOAD - only the table needs updating.
            // If any of the individual events logged any other dbs as having changed,
            // null them out.
            dbsUpdated.clear();
            tablesUpdated.clear();
            tablesUpdated.put(dbNameOrPattern + "." + tblNameOrPattern, lastEvid);
          } else if ((dbNameOrPattern != null) && (!dbNameOrPattern.isEmpty())) {
            // if dbNameOrPattern is specified and tblNameOrPattern isn't, this is a
            // db-level update, and thus, the database needs updating. In addition,
            // any tables the events logged in tablesUpdated are honoured as-is.
            dbsUpdated.clear();
            dbsUpdated.put(dbNameOrPattern, lastEvid);
          }
        }

        for (String tableName : tablesUpdated.keySet()) {
          // weird - AlterTableDesc requires a HashMap to update props instead of a Map.
          HashMap<String, String> mapProp = new HashMap<String, String>();
          mapProp.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(),
              tablesUpdated.get(tableName).toString());
          AlterTableDesc alterTblDesc = new AlterTableDesc(
              AlterTableDesc.AlterTableTypes.ADDPROPS, null, false);
          alterTblDesc.setProps(mapProp);
          alterTblDesc.setOldName(tableName);
          Task<? extends Serializable> updateReplIdTask = TaskFactory.get(
              new DDLWork(inputs, outputs, alterTblDesc), conf);
          taskChainTail.addDependentTask(updateReplIdTask);
          taskChainTail = updateReplIdTask;
        }
        for (String dbName : dbsUpdated.keySet()) {
          Map<String, String> mapProp = new HashMap<String, String>();
          mapProp.put(ReplicationSpec.KEY.CURR_STATE_ID.toString(),
              dbsUpdated.get(dbName).toString());
          AlterDatabaseDesc alterDbDesc = new AlterDatabaseDesc(dbName, mapProp);
          Task<? extends Serializable> updateReplIdTask = TaskFactory.get(
              new DDLWork(inputs, outputs, alterDbDesc), conf);
          taskChainTail.addDependentTask(updateReplIdTask);
          taskChainTail = updateReplIdTask;
        }
        rootTasks.add(evTaskRoot);
        REPL_STATE_LOG.info("Repl Load: Completed analyzing Repl load for DB: {} from path {} and created import "
                + "(DDL/COPY/MOVE) tasks",
            (null != dbNameOrPattern && !dbNameOrPattern.isEmpty()) ? dbNameOrPattern : "?",
            loadPath.toUri().toString());
      }
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
  }

  private List<Task<? extends Serializable>> analyzeEventLoad(
      String dbName, String tblName, String location, Task<? extends Serializable> precursor,
      Map<String, Long> dbsUpdated, Map<String, Long> tablesUpdated, DumpMetaData dmd)
      throws SemanticException {
    MessageHandler.Context context =
        new MessageHandler.Context(dbName, tblName, location, precursor, dmd, conf, db, ctx, LOG);
    MessageHandler messageHandler = dmd.getDumpType().handler();
    List<Task<? extends Serializable>> tasks = messageHandler.handle(context);

    if (precursor != null) {
      for (Task<? extends Serializable> t : tasks) {
        precursor.addDependentTask(t);
        LOG.debug("Added {}:{} as a precursor of {}:{}",
            precursor.getClass(), precursor.getId(), t.getClass(), t.getId());
      }
    }

    dbsUpdated.putAll(messageHandler.databasesUpdated());
    tablesUpdated.putAll(messageHandler.tablesUpdated());
    inputs.addAll(messageHandler.readEntities());
    outputs.addAll(messageHandler.writeEntities());
    return tasks;
  }
  private boolean existEmptyDb(String dbName) throws InvalidOperationException, HiveException {
    Hive hiveDb = Hive.get();
    Database db = hiveDb.getDatabase(dbName);
    if (null != db) {
      List<String> allTables = hiveDb.getAllTables(dbName);
      List<String> allFunctions = hiveDb.getFunctions(dbName, "*");
      if (!allTables.isEmpty()) {
        throw new InvalidOperationException(
            "Database " + db.getName() + " is not empty. One or more tables exist.");
      }
      if (!allFunctions.isEmpty()) {
        throw new InvalidOperationException(
            "Database " + db.getName() + " is not empty. One or more functions exist.");
      }
      return true;
    }
    return false;
  }

  private void analyzeDatabaseLoad(String dbName, FileSystem fs, FileStatus dir)
      throws SemanticException {
    try {
      // Path being passed to us is a db dump location. We go ahead and load as needed.
      // dbName might be null or empty, in which case we keep the original db name for the new
      // database creation.

      // Two steps here - first, we read the _metadata file here, and create a CreateDatabaseDesc
      // associated with that. Then, we iterate over all subdirs, and create table imports for each.

      MetaData rv = new MetaData();
      try {
        rv = EximUtil.readMetaData(fs, new Path(dir.getPath(), EximUtil.METADATA_NAME));
      } catch (IOException e) {
        throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(), e);
      }

      Database dbObj = rv.getDatabase();

      if (dbObj == null) {
        throw new IllegalArgumentException(
            "_metadata file read did not contain a db object - invalid dump.");
      }

      if ((dbName == null) || (dbName.isEmpty())) {
        // We use the dbName specified as long as it is not null/empty. Otherwise, we use the
        // original name recorded in the thrift object.
        dbName = dbObj.getName();
      }

      REPL_STATE_LOG.info("Repl Load: Started analyzing Repl Load for DB: {} from Dump Dir: {}, Dump Type: BOOTSTRAP",
          dbName, dir.getPath().toUri().toString());

      Task<? extends Serializable> dbRootTask = null;
      if (existEmptyDb(dbName)) {
        AlterDatabaseDesc alterDbDesc = new AlterDatabaseDesc(dbName, dbObj.getParameters());
        dbRootTask = TaskFactory.get(new DDLWork(inputs, outputs, alterDbDesc), conf);
      } else {
        CreateDatabaseDesc createDbDesc = new CreateDatabaseDesc();
        createDbDesc.setName(dbName);
        createDbDesc.setComment(dbObj.getDescription());
        createDbDesc.setDatabaseProperties(dbObj.getParameters());
        // note that we do not set location - for repl load, we want that auto-created.

        createDbDesc.setIfNotExists(false);
        // If it exists, we want this to be an error condition. Repl Load is not intended to
        // replace a db.
        // TODO: we might revisit this in create-drop-recreate cases; needs some thinking.
        dbRootTask = TaskFactory.get(new DDLWork(inputs, outputs, createDbDesc), conf);
      }
      rootTasks.add(dbRootTask);

      FileStatus[] dirsInDbPath = fs.listStatus(dir.getPath(), EximUtil.getDirectoryFilter(fs));

      for (FileStatus tableDir : Collections2.filter(Arrays.asList(dirsInDbPath), new TableDirPredicate())) {
        analyzeTableLoad(
            dbName, null, tableDir.getPath().toUri().toString(), dbRootTask, null, null);
        REPL_STATE_LOG.info("Repl Load: Analyzed table/view/partition load from path {}",
            tableDir.getPath().toUri().toString());
      }

      // Function load
      Path functionMetaDataRoot = new Path(dir.getPath(), FUNCTIONS_ROOT_DIR_NAME);
      if (fs.exists(functionMetaDataRoot)) {
        List<FileStatus> functionDirectories =
            Arrays.asList(fs.listStatus(functionMetaDataRoot, EximUtil.getDirectoryFilter(fs)));
        for (FileStatus functionDir : functionDirectories) {
          analyzeFunctionLoad(dbName, functionDir, dbRootTask);
          REPL_STATE_LOG.info("Repl Load: Analyzed function load from path {}",
              functionDir.getPath().toUri().toString());
        }
      }
      REPL_STATE_LOG.info("Repl Load: Completed analyzing Repl Load for DB: {} and created import (DDL/COPY/MOVE) tasks",
          dbName);
    } catch (Exception e) {
      throw new SemanticException(e);
    }
  }

  private static class TableDirPredicate implements Predicate<FileStatus> {
    @Override
    public boolean apply(FileStatus fileStatus) {
      return !fileStatus.getPath().getName().contains(FUNCTIONS_ROOT_DIR_NAME);
    }
  }

  private void analyzeFunctionLoad(String dbName, FileStatus functionDir,
      Task<? extends Serializable> createDbTask) throws IOException, SemanticException {
    URI fromURI = EximUtil
        .getValidatedURI(conf, stripQuotes(functionDir.getPath().toUri().toString()));
    Path fromPath = new Path(fromURI.getScheme(), fromURI.getAuthority(), fromURI.getPath());

    FileSystem fs = FileSystem.get(fromURI, conf);
    inputs.add(toReadEntity(fromPath, conf));

    try {
      MetaData metaData = EximUtil.readMetaData(fs, new Path(fromPath, EximUtil.METADATA_NAME));
      ReplicationSpec replicationSpec = metaData.getReplicationSpec();
      if (replicationSpec.isNoop()) {
        // nothing to do here, silently return.
        return;
      }
      CreateFunctionDesc desc = new CreateFunctionDesc(
          dbName + "." + metaData.function.getFunctionName(),
          false,
          metaData.function.getClassName(),
          metaData.function.getResourceUris()
      );

      Task<FunctionWork> currentTask = TaskFactory.get(new FunctionWork(desc), conf);
      if (createDbTask != null) {
        createDbTask.addDependentTask(currentTask);
        LOG.debug("Added {}:{} as a precursor of {}:{}",
            createDbTask.getClass(), createDbTask.getId(),
            currentTask.getClass(), currentTask.getId());
      }
    } catch (IOException e) {
      throw new SemanticException(ErrorMsg.INVALID_PATH.getMsg(), e);
    }
  }
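  // Table-level loads below delegate to ImportSemanticAnalyzer.prepareImport(), with the
  // location/external/partition-spec arguments pinned to the values a replication-scoped
  // import expects (see the local flags at the top of analyzeTableLoad).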
  private List<Task<? extends Serializable>> analyzeTableLoad(
      String dbName, String tblName, String locn,
      Task<? extends Serializable> precursor,
      Map<String, Long> dbsUpdated, Map<String, Long> tablesUpdated) throws SemanticException {
    // Path being passed to us is a table dump location. We go ahead and load it in as needed.
    // If tblName is null, then we default to the table name specified in _metadata. If both
    // dbName and tblName are specified, that's what we are intended to create the new table as.
    if (dbName == null || dbName.isEmpty()) {
      throw new SemanticException("Database name cannot be null for a table load");
    }
    try {
      // no location set on repl loads
      boolean isLocationSet = false;
      // all repl imports are non-external
      boolean isExternalSet = false;
      // bootstrap loads are not partition level
      boolean isPartSpecSet = false;
      // repl loads are not partition level
      LinkedHashMap<String, String> parsedPartSpec = null;
      // no location for repl imports
      String parsedLocation = null;
      List<Task<? extends Serializable>> importTasks = new ArrayList<Task<? extends Serializable>>();

      EximUtil.SemanticAnalyzerWrapperContext x =
          new EximUtil.SemanticAnalyzerWrapperContext(conf, db, inputs, outputs, importTasks, LOG, ctx);
      ImportSemanticAnalyzer.prepareImport(isLocationSet, isExternalSet, isPartSpecSet,
          (precursor != null), parsedLocation, tblName, dbName, parsedPartSpec, locn, x,
          dbsUpdated, tablesUpdated);

      if (precursor != null) {
        for (Task<? extends Serializable> t : importTasks) {
          precursor.addDependentTask(t);
          LOG.debug("Added {}:{} as a precursor of {}:{}",
              precursor.getClass(), precursor.getId(), t.getClass(), t.getId());
        }
      }

      return importTasks;
    } catch (Exception e) {
      throw new SemanticException(e);
    }
  }

  // REPL STATUS
  private void initReplStatus(ASTNode ast) {
    int numChildren = ast.getChildCount();
    dbNameOrPattern = PlanUtils.stripQuotes(ast.getChild(0).getText());
    if (numChildren > 1) {
      tblNameOrPattern = PlanUtils.stripQuotes(ast.getChild(1).getText());
    }
  }
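  // Illustrative statement forms parsed by initReplStatus() above:
  //   REPL STATUS mydb;        -- report repl.last.id recorded on the database
  //   REPL STATUS mydb.mytbl;  -- report repl.last.id recorded on the table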
  private void analyzeReplStatus(ASTNode ast) throws SemanticException {
    LOG.debug("ReplicationSemanticAnalyzer.analyzeReplStatus: " + String.valueOf(dbNameOrPattern)
        + "." + String.valueOf(tblNameOrPattern));

    String replLastId = null;

    try {
      if (tblNameOrPattern != null) {
        // Checking for status of a table
        Table tbl = db.getTable(dbNameOrPattern, tblNameOrPattern);
        if (tbl != null) {
          inputs.add(new ReadEntity(tbl));
          Map<String, String> params = tbl.getParameters();
          if (params != null && (params.containsKey(ReplicationSpec.KEY.CURR_STATE_ID.toString()))) {
            replLastId = params.get(ReplicationSpec.KEY.CURR_STATE_ID.toString());
          }
        }
      } else {
        // Checking for status of a db
        Database database = db.getDatabase(dbNameOrPattern);
        if (database != null) {
          inputs.add(new ReadEntity(database));
          Map<String, String> params = database.getParameters();
          if (params != null && (params.containsKey(ReplicationSpec.KEY.CURR_STATE_ID.toString()))) {
            replLastId = params.get(ReplicationSpec.KEY.CURR_STATE_ID.toString());
          }
        }
      }
    } catch (HiveException e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }

    prepareReturnValues(Collections.singletonList(replLastId), "last_repl_id#string");
    setFetchTask(createFetchTask("last_repl_id#string"));
    LOG.debug("ReplicationSemanticAnalyzer.analyzeReplStatus: writing repl.last.id={} out to {}",
        String.valueOf(replLastId), ctx.getResFile());
  }

  private void prepareReturnValues(List<String> values, String schema) throws SemanticException {
    LOG.debug("prepareReturnValues : " + schema);
    for (String s : values) {
      LOG.debug("  > " + s);
    }
    ctx.setResFile(ctx.getLocalTmpPath());
    Utils.writeOutput(values, ctx.getResFile(), conf);
  }

  private ReplicationSpec getNewReplicationSpec() throws SemanticException {
    try {
      ReplicationSpec rspec = getNewReplicationSpec("replv2", "will-be-set");
      rspec.setCurrentReplicationState(String.valueOf(db.getMSC()
          .getCurrentNotificationEventId().getEventId()));
      return rspec;
    } catch (Exception e) {
      // TODO : simple wrap & rethrow for now, clean up with error codes
      throw new SemanticException(e);
    }
  }

  // Use for specifying object state as well as event state
  private ReplicationSpec getNewReplicationSpec(String evState, String objState)
      throws SemanticException {
    return new ReplicationSpec(true, false, evState, objState, false, true, true);
  }

  // Use for replication states focused on event only, where the obj state will be the event state
  private ReplicationSpec getNewEventOnlyReplicationSpec(Long eventId) throws SemanticException {
    return getNewReplicationSpec(eventId.toString(), eventId.toString());
  }

  private Iterable<? extends String> matchesTbl(String dbName, String tblPattern)
      throws HiveException {
    if (tblPattern == null) {
      return removeValuesTemporaryTables(db.getAllTables(dbName));
    } else {
      return db.getTablesByPattern(dbName, tblPattern);
    }
  }

  private final static String TMP_TABLE_PREFIX =
      SemanticAnalyzer.VALUES_TMP_TABLE_NAME_PREFIX.toLowerCase();

  static Iterable<String> removeValuesTemporaryTables(List<String> tableNames) {
    return Collections2.filter(tableNames, tableName -> {
      assert tableName != null;
      return !tableName.toLowerCase().startsWith(TMP_TABLE_PREFIX);
    });
  }

  private Iterable<? extends String> matchesDb(String dbPattern) throws HiveException {
    if (dbPattern == null) {
      return db.getAllDatabases();
    } else {
      return db.getDatabasesByPattern(dbPattern);
    }
  }
}