/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.antlr.runtime.TokenRewriteStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.TableType;
import org.apache.hadoop.hive.metastore.TransactionalValidationListener;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.hooks.Entity;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.InvalidTableException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.session.SessionState;

/**
 * A subclass of the {@link org.apache.hadoop.hive.ql.parse.SemanticAnalyzer} that just handles
 * update, delete and merge statements.  It works by rewriting the updates and deletes into insert
 * statements (since they are actually inserts) and then doing some patch up to make them work as
 * updates and deletes instead.
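 *
 * For illustration only (the exact strings are assembled in reparseAndSuperAnalyze()): a statement
 * such as "DELETE FROM t WHERE a = 1" is reparsed as roughly
 * "INSERT INTO TABLE t SELECT ROW__ID FROM t SORT BY ROW__ID", with the original WHERE clause
 * grafted back onto the new AST afterwards.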
 */
public class UpdateDeleteSemanticAnalyzer extends SemanticAnalyzer {

  private boolean useSuper = false;

  UpdateDeleteSemanticAnalyzer(QueryState queryState) throws SemanticException {
    super(queryState);
  }

  @Override
  public void analyzeInternal(ASTNode tree) throws SemanticException {
    if (useSuper) {
      super.analyzeInternal(tree);
    } else {
      if (!SessionState.get().getTxnMgr().supportsAcid()) {
        throw new SemanticException(ErrorMsg.ACID_OP_ON_NONACID_TXNMGR.getMsg());
      }
      switch (tree.getToken().getType()) {
        case HiveParser.TOK_DELETE_FROM:
          analyzeDelete(tree);
          break;
        case HiveParser.TOK_UPDATE_TABLE:
          analyzeUpdate(tree);
          break;
        case HiveParser.TOK_MERGE:
          analyzeMerge(tree);
          break;
        default:
          throw new RuntimeException("Asked to parse token " + tree.getName() + " in " +
              "UpdateDeleteSemanticAnalyzer");
      }
      cleanUpMetaColumnAccessControl();
    }
  }

  private boolean updating() {
    return currentOperation == Context.Operation.UPDATE;
  }
  private boolean deleting() {
    return currentOperation == Context.Operation.DELETE;
  }

  private void analyzeUpdate(ASTNode tree) throws SemanticException {
    currentOperation = Context.Operation.UPDATE;
    reparseAndSuperAnalyze(tree);
  }
  private void analyzeDelete(ASTNode tree) throws SemanticException {
    currentOperation = Context.Operation.DELETE;
    reparseAndSuperAnalyze(tree);
  }

  /**
   * Append list of partition columns to Insert statement, i.e. the 1st set of partCol1,partCol2
   * INSERT INTO T PARTITION(partCol1,partCol2...) SELECT col1, ... partCol1,partCol2...
   */
  private void addPartitionColsToInsert(List<FieldSchema> partCols, StringBuilder rewrittenQueryStr) {
    // If the table is partitioned we have to put the partition() clause in
    if (partCols != null && partCols.size() > 0) {
      rewrittenQueryStr.append(" partition (");
      boolean first = true;
      for (FieldSchema fschema : partCols) {
        if (first) {
          first = false;
        } else {
          rewrittenQueryStr.append(", ");
        }
        //would be nice if there was a way to determine if quotes are needed
        rewrittenQueryStr.append(HiveUtils.unparseIdentifier(fschema.getName(), this.conf));
      }
      rewrittenQueryStr.append(")");
    }
  }

  /**
   * Append list of partition columns to Insert statement, i.e. the 2nd set of partCol1,partCol2
   * INSERT INTO T PARTITION(partCol1,partCol2...) SELECT col1, ... partCol1,partCol2...
   * @param target target table
   */
  private void addPartitionColsToSelect(List<FieldSchema> partCols, StringBuilder rewrittenQueryStr,
      ASTNode target) throws SemanticException {
    String targetName = target != null ? getSimpleTableName(target) : null;

    // If the table is partitioned, we need to select the partition columns as well.
    if (partCols != null) {
      for (FieldSchema fschema : partCols) {
        rewrittenQueryStr.append(", ");
        //would be nice if there was a way to determine if quotes are needed
        if (targetName != null) {
          rewrittenQueryStr.append(targetName).append('.');
        }
        rewrittenQueryStr.append(HiveUtils.unparseIdentifier(fschema.getName(), this.conf));
      }
    }
  }

  /**
   * Assert that we are not asked to update a bucketing column or partition column
   * @param colName it's the A in "SET A = B"
   */
  private void checkValidSetClauseTarget(ASTNode colName, Table targetTable) throws SemanticException {
    String columnName = normalizeColName(colName.getText());

    // Make sure this isn't one of the partitioning columns, that's not supported.
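    // For example (illustrative): with a table PARTITIONED BY (ds string),
    // "UPDATE t SET ds = '2017-01-01'" is rejected here with UPDATE_CANNOT_UPDATE_PART_VALUE.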
    for (FieldSchema fschema : targetTable.getPartCols()) {
      if (fschema.getName().equalsIgnoreCase(columnName)) {
        throw new SemanticException(ErrorMsg.UPDATE_CANNOT_UPDATE_PART_VALUE.getMsg());
      }
    }
    //updating a bucket column would mean moving the row from one file to another - not supported
    if (targetTable.getBucketCols() != null && targetTable.getBucketCols().contains(columnName)) {
      throw new SemanticException(ErrorMsg.UPDATE_CANNOT_UPDATE_BUCKET_VALUE, columnName);
    }
    boolean foundColumnInTargetTable = false;
    for (FieldSchema col : targetTable.getCols()) {
      if (columnName.equalsIgnoreCase(col.getName())) {
        foundColumnInTargetTable = true;
        break;
      }
    }
    if (!foundColumnInTargetTable) {
      throw new SemanticException(ErrorMsg.INVALID_TARGET_COLUMN_IN_SET_CLAUSE, colName.getText(),
          getDotName(new String[] {targetTable.getDbName(), targetTable.getTableName()}));
    }
  }

  private ASTNode findLHSofAssignment(ASTNode assignment) {
    assert assignment.getToken().getType() == HiveParser.EQUAL :
        "Expected set assignments to use equals operator but found " + assignment.getName();
    ASTNode tableOrColTok = (ASTNode)assignment.getChildren().get(0);
    assert tableOrColTok.getToken().getType() == HiveParser.TOK_TABLE_OR_COL :
        "Expected left side of assignment to be table or column";
    ASTNode colName = (ASTNode)tableOrColTok.getChildren().get(0);
    assert colName.getToken().getType() == HiveParser.Identifier :
        "Expected column name";
    return colName;
  }

  private Map<String, ASTNode> collectSetColumnsAndExpressions(ASTNode setClause,
      Set<String> setRCols, Table targetTable) throws SemanticException {
    // An update needs to select all of the columns, as we rewrite the entire row.  Also,
    // we need to figure out which columns we are going to replace.
    assert setClause.getToken().getType() == HiveParser.TOK_SET_COLUMNS_CLAUSE :
        "Expected second child of update token to be set token";

    // Get the children of the set clause, each of which should be a column assignment
    List<? extends Node> assignments = setClause.getChildren();
    // Must be deterministic order map for consistent q-test output across Java versions
    Map<String, ASTNode> setCols = new LinkedHashMap<String, ASTNode>(assignments.size());
    for (Node a : assignments) {
      ASTNode assignment = (ASTNode)a;
      ASTNode colName = findLHSofAssignment(assignment);
      if (setRCols != null) {
        addSetRCols((ASTNode) assignment.getChildren().get(1), setRCols);
      }
      checkValidSetClauseTarget(colName, targetTable);

      String columnName = normalizeColName(colName.getText());
      // This means that in UPDATE T SET x = _something_
      // _something_ can be whatever is supported in SELECT _something_
      setCols.put(columnName, (ASTNode)assignment.getChildren().get(1));
    }
    return setCols;
  }

  /**
   * @return the Metastore representation of the target table
   */
  private Table getTargetTable(ASTNode tabRef) throws SemanticException {
    String[] tableName;
    Table mTable;
    switch (tabRef.getType()) {
      case HiveParser.TOK_TABREF:
        tableName = getQualifiedTableName((ASTNode) tabRef.getChild(0));
        break;
      case HiveParser.TOK_TABNAME:
        tableName = getQualifiedTableName(tabRef);
        break;
      default:
        throw raiseWrongType("TOK_TABREF|TOK_TABNAME", tabRef);
    }
    try {
      mTable = db.getTable(tableName[0], tableName[1]);
    } catch (InvalidTableException e) {
      LOG.error("Failed to find table " + getDotName(tableName) + " got exception " + e.getMessage());
      throw new SemanticException(ErrorMsg.INVALID_TABLE.getMsg(getDotName(tableName)), e);
    } catch (HiveException e) {
      LOG.error("Failed to find table " + getDotName(tableName) + " got exception " + e.getMessage());
      throw new SemanticException(e.getMessage(), e);
    }
    return mTable;
  }

  // Walk through all our inputs and set them to note that this read is part of an update or a
  // delete.
  private void markReadEntityForUpdate() {
    for (ReadEntity input : inputs) {
      if (isWritten(input)) {
        //todo: this is actually not adding anything since LockComponent uses a Trie to "promote" a lock
        //except by accident - when we have a partitioned target table we have a ReadEntity and WriteEntity
        //for the table, so we mark ReadEntity and then delete WriteEntity (replace with Partition entries)
        //so DbTxnManager skips Read lock on the ReadEntity....
        input.setUpdateOrDelete(true);//input.noLockNeeded()?
      }
    }
  }

  /**
   * For updates, we need to set the column access info so that it contains information on
   * the columns we are updating.
   * (But not all the columns of the target table, even though the rewritten query writes
   * all columns of the target table, since that is an implementation detail.)
   */
  private void setUpAccessControlInfoForUpdate(Table mTable, Map<String, ASTNode> setCols) {
    ColumnAccessInfo cai = new ColumnAccessInfo();
    for (String colName : setCols.keySet()) {
      cai.add(Table.getCompleteName(mTable.getDbName(), mTable.getTableName()), colName);
    }
    setUpdateColumnAccessInfo(cai);
  }

  /**
   * We need to weed ROW__ID out of the input column info, as it doesn't make any sense to
   * require the user to have authorization on that column.
   */
  private void cleanUpMetaColumnAccessControl() {
    //we do this for Update/Delete (incl Merge) because we introduce this column into the query
    //as part of rewrite
    if (columnAccessInfo != null) {
      columnAccessInfo.stripVirtualColumn(VirtualColumn.ROWID);
    }
  }

  /**
   * Parse the newly generated SQL statement to get a new AST
   */
  private ReparseResult parseRewrittenQuery(StringBuilder rewrittenQueryStr, String originalQuery)
      throws SemanticException {
    // Parse the rewritten query string
    Context rewrittenCtx;
    try {
      // Set dynamic partitioning to nonstrict so that queries do not need any partition
      // references.
      // todo: this may be a perf issue as it prevents the optimizer.. or not
      HiveConf.setVar(conf, HiveConf.ConfVars.DYNAMICPARTITIONINGMODE, "nonstrict");
      rewrittenCtx = new Context(conf);
      rewrittenCtx.setExplainConfig(ctx.getExplainConfig());
      rewrittenCtx.setIsUpdateDeleteMerge(true);
    } catch (IOException e) {
      throw new SemanticException(ErrorMsg.UPDATEDELETE_IO_ERROR.getMsg());
    }
    rewrittenCtx.setCmd(rewrittenQueryStr.toString());

    ASTNode rewrittenTree;
    try {
      LOG.info("Going to reparse <" + originalQuery + "> as \n<" + rewrittenQueryStr.toString() + ">");
      rewrittenTree = ParseUtils.parse(rewrittenQueryStr.toString(), rewrittenCtx);
    } catch (ParseException e) {
      throw new SemanticException(ErrorMsg.UPDATEDELETE_PARSE_ERROR.getMsg(), e);
    }
    return new ReparseResult(rewrittenTree, rewrittenCtx);
  }

  /**
   * Assert that the target table supports Acid write
   */
  private void validateTargetTable(Table mTable) throws SemanticException {
    if (mTable.getTableType() == TableType.VIRTUAL_VIEW ||
        mTable.getTableType() == TableType.MATERIALIZED_VIEW) {
      LOG.error("Table " + getDotName(new String[] {mTable.getDbName(), mTable.getTableName()}) +
          " is a view or materialized view");
      throw new SemanticException(ErrorMsg.UPDATE_DELETE_VIEW.getMsg());
    }
  }

  /**
   * This supports update and delete statements
   */
  private void reparseAndSuperAnalyze(ASTNode tree) throws SemanticException {
    List<? extends Node> children = tree.getChildren();
    // The first child should be the table we are updating or deleting from
    ASTNode tabName = (ASTNode)children.get(0);
    assert tabName.getToken().getType() == HiveParser.TOK_TABNAME :
        "Expected tablename as first child of " + operation() + " but found " + tabName.getName();

    // Rewrite the delete or update into an insert.  Crazy, but it works as deletes and updates
    // actually are inserts into the delta file in Hive.  A delete
    //    DELETE FROM _tablename_ [WHERE ...]
    // will be rewritten as
    //    INSERT INTO TABLE _tablename_ [PARTITION (_partcols_)] SELECT ROW__ID[,
    //    _partcols_] from _tablename_ SORT BY ROW__ID
    // An update
    //    UPDATE _tablename_ SET x = _expr_ [WHERE...]
    // will be rewritten as
    //    INSERT INTO TABLE _tablename_ [PARTITION (_partcols_)] SELECT _all_,
    //    _partcols_ from _tablename_ SORT BY ROW__ID
    // where _all_ is all the non-partition columns.  The expressions from the set clause will be
    // re-attached later.
    // The where clause will also be re-attached later.
    // The sort by clause is put in there so that records come out in the right order to enable
    // merge on read.
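    // Concrete example (illustrative only; the exact text is assembled below): for a table
    // acidtbl(a int, b int), "UPDATE acidtbl SET b = 7 WHERE a = 1" is reparsed as roughly
    //    INSERT INTO TABLE acidtbl SELECT ROW__ID, a, b FROM acidtbl SORT BY ROW__ID
    // after which the select expression for b is swapped for 7 and the original WHERE clause is
    // grafted onto the new AST.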
    StringBuilder rewrittenQueryStr = new StringBuilder();
    Table mTable = getTargetTable(tabName);
    validateTargetTable(mTable);

    rewrittenQueryStr.append("insert into table ");
    rewrittenQueryStr.append(getFullTableNameForSQL(tabName));
    addPartitionColsToInsert(mTable.getPartCols(), rewrittenQueryStr);
    rewrittenQueryStr.append(" select ROW__ID");

    Map<Integer, ASTNode> setColExprs = null;
    Map<String, ASTNode> setCols = null;
    // Must be deterministic order set for consistent q-test output across Java versions
    Set<String> setRCols = new LinkedHashSet<String>();
    if (updating()) {
      // We won't write the set expressions in the rewritten query.  We'll patch that up later.
      // The set list from update should be the second child (index 1)
      assert children.size() >= 2 : "Expected update token to have at least two children";
      ASTNode setClause = (ASTNode)children.get(1);
      setCols = collectSetColumnsAndExpressions(setClause, setRCols, mTable);
      setColExprs = new HashMap<>(setClause.getChildCount());

      List<FieldSchema> nonPartCols = mTable.getCols();
      for (int i = 0; i < nonPartCols.size(); i++) {
        rewrittenQueryStr.append(',');
        String name = nonPartCols.get(i).getName();
        ASTNode setCol = setCols.get(name);
        rewrittenQueryStr.append(HiveUtils.unparseIdentifier(name, this.conf));
        if (setCol != null) {
          // This is one of the columns we're setting, record its position so we can come back
          // later and patch it up.
          // Add one to the index because the select has the ROW__ID as the first column.
          setColExprs.put(i + 1, setCol);
        }
      }
    }
    addPartitionColsToSelect(mTable.getPartCols(), rewrittenQueryStr, null);
    rewrittenQueryStr.append(" from ");
    rewrittenQueryStr.append(getFullTableNameForSQL(tabName));

    ASTNode where = null;
    int whereIndex = deleting() ? 1 : 2;
    if (children.size() > whereIndex) {
      where = (ASTNode)children.get(whereIndex);
      assert where.getToken().getType() == HiveParser.TOK_WHERE :
          "Expected where clause, but found " + where.getName();
    }

    // Add a sort by clause so that the row ids come out in the correct order
    rewrittenQueryStr.append(" sort by ROW__ID ");

    ReparseResult rr = parseRewrittenQuery(rewrittenQueryStr, ctx.getCmd());
    Context rewrittenCtx = rr.rewrittenCtx;
    ASTNode rewrittenTree = rr.rewrittenTree;

    ASTNode rewrittenInsert = (ASTNode)rewrittenTree.getChildren().get(1);
    assert rewrittenInsert.getToken().getType() == HiveParser.TOK_INSERT :
        "Expected TOK_INSERT as second child of TOK_QUERY but found " + rewrittenInsert.getName();

    if (updating()) {
      rewrittenCtx.setOperation(Context.Operation.UPDATE);
      rewrittenCtx.addDestNamePrefix(1, Context.DestClausePrefix.UPDATE);
    } else if (deleting()) {
      rewrittenCtx.setOperation(Context.Operation.DELETE);
      rewrittenCtx.addDestNamePrefix(1, Context.DestClausePrefix.DELETE);
    }

    if (where != null) {
      // The structure of the AST for the rewritten insert statement is:
      // TOK_QUERY -> TOK_FROM
      //           \-> TOK_INSERT -> TOK_INSERT_INTO
      //                          \-> TOK_SELECT
      //                          \-> TOK_SORTBY
      // The following adds the TOK_WHERE and its subtree from the original query as a child of
      // TOK_INSERT, which is where it would have landed if it had been there originally in the
      // string.  We do it this way because it's easier than turning the original AST back into a
      // string and reparsing it.
      // We have to move the SORT_BY over by one: grab it, re-append it at the end, and put the
      // WHERE in the slot where the SORT_BY was.
      ASTNode sortBy = (ASTNode)rewrittenInsert.getChildren().get(2);
      assert sortBy.getToken().getType() == HiveParser.TOK_SORTBY :
          "Expected TOK_SORTBY to be first child of TOK_SELECT, but found " + sortBy.getName();
      rewrittenInsert.addChild(sortBy);
      rewrittenInsert.setChild(2, where);
    }

    // Patch up the projection list for updates, putting back the original set expressions.
    if (updating() && setColExprs != null) {
      // Walk through the projection list and replace the column names with the
      // expressions from the original update.  Under the TOK_SELECT (see above) the structure
      // looks like:
      // TOK_SELECT -> TOK_SELEXPR -> expr
      //           \-> TOK_SELEXPR -> expr ...
      ASTNode rewrittenSelect = (ASTNode)rewrittenInsert.getChildren().get(1);
      assert rewrittenSelect.getToken().getType() == HiveParser.TOK_SELECT :
          "Expected TOK_SELECT as second child of TOK_INSERT but found " + rewrittenSelect.getName();
      for (Map.Entry<Integer, ASTNode> entry : setColExprs.entrySet()) {
        ASTNode selExpr = (ASTNode)rewrittenSelect.getChildren().get(entry.getKey());
        assert selExpr.getToken().getType() == HiveParser.TOK_SELEXPR :
            "Expected child of TOK_SELECT to be TOK_SELEXPR but was " + selExpr.getName();
        // Now, change its child
        selExpr.setChild(0, entry.getValue());
      }
    }

    try {
      useSuper = true;
      super.analyze(rewrittenTree, rewrittenCtx);
    } finally {
      useSuper = false;
    }

    updateOutputs(mTable);

    if (updating()) {
      setUpAccessControlInfoForUpdate(mTable, setCols);

      // Add the setRCols to the input list
      for (String colName : setRCols) {
        if (columnAccessInfo != null) {//assuming this means we are not doing Auth
          columnAccessInfo.add(Table.getCompleteName(mTable.getDbName(), mTable.getTableName()),
              colName);
        }
      }
    }
  }

  /**
   * Check that {@code readEntity} is also being written
   */
  private boolean isWritten(Entity readEntity) {
    for (Entity writeEntity : outputs) {
      //make sure to compare them as Entity, i.e. that it's the same table or partition, etc
      if (writeEntity.toString().equalsIgnoreCase(readEntity.toString())) {
        return true;
      }
    }
    return false;
  }

  private String operation() {
    if (currentOperation == Context.Operation.OTHER) {
      throw new IllegalStateException("UpdateDeleteSemanticAnalyzer neither updating nor " +
          "deleting, operation not known.");
    }
    return currentOperation.toString();
  }

  // This method finds any columns on the right side of a set statement (thus rcols) and puts them
  // in a set so we can add them to the list of input cols to check.
  private void addSetRCols(ASTNode node, Set<String> setRCols) {
    // See if this node is a TOK_TABLE_OR_COL.  If so, find the value and put it in the list.  If
    // not, recurse on any children
    if (node.getToken().getType() == HiveParser.TOK_TABLE_OR_COL) {
      ASTNode colName = (ASTNode)node.getChildren().get(0);
      assert colName.getToken().getType() == HiveParser.Identifier :
          "Expected column name";
      setRCols.add(normalizeColName(colName.getText()));
    } else if (node.getChildren() != null) {
      for (Node n : node.getChildren()) {
        addSetRCols((ASTNode)n, setRCols);
      }
    }
  }

  /**
   * Column names are stored in metastore in lower case, regardless of the CREATE TABLE statement.
   * Unfortunately there is no single place that normalizes the input query.
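   * For example (illustrative), "SET Col1 = 5" has to resolve against the metastore column name
   * "col1", so both the SET targets and the columns referenced on the right-hand side are
   * lowercased via this method.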
   * @param colName not null
   */
  private static String normalizeColName(String colName) {
    return colName.toLowerCase();
  }

  private Context.Operation currentOperation = Context.Operation.OTHER;
  private static final String Indent = " ";
  private IdentifierQuoter quotedIdenfierHelper;

  /**
   * This allows us to take an arbitrary ASTNode and turn it back into SQL that produced it.
   * Since HiveLexer.g is written such that it strips away any ` (back ticks) around
   * quoted identifiers we need to add those back to generated SQL.
   * Additionally, the parser only produces tokens of type Identifier and never
   * QuotedIdentifier (HIVE-6013).  So here we just quote all identifiers.
   * (') around String literals are retained w/o issues
   */
  private static class IdentifierQuoter {
    private final TokenRewriteStream trs;
    private final IdentityHashMap<ASTNode, ASTNode> visitedNodes = new IdentityHashMap<>();

    IdentifierQuoter(TokenRewriteStream trs) {
      this.trs = trs;
      if (trs == null) {
        throw new IllegalArgumentException("Must have a TokenRewriteStream");
      }
    }

    private void visit(ASTNode n) {
      if (n.getType() == HiveParser.Identifier) {
        if (visitedNodes.containsKey(n)) {
          /**
           * Since we are modifying the stream, it's not idempotent.  Ideally, the caller would take
           * care to only quote Identifiers in each subtree once, but this makes it safe
           */
          return;
        }
        visitedNodes.put(n, n);
        trs.insertBefore(n.getToken(), "`");
        trs.insertAfter(n.getToken(), "`");
      }
      if (n.getChildCount() <= 0) {
        return;
      }
      for (Node c : n.getChildren()) {
        visit((ASTNode)c);
      }
    }
  }

  /**
   * This allows us to take an arbitrary ASTNode and turn it back into SQL that produced it without
   * needing to understand what it is (except for QuotedIdentifiers)
   */
  private String getMatchedText(ASTNode n) {
    quotedIdenfierHelper.visit(n);
    return ctx.getTokenRewriteStream().toString(n.getTokenStartIndex(),
        n.getTokenStopIndex() + 1).trim();
  }

  /**
   * Here we take a Merge statement AST and generate a semantically equivalent multi-insert
   * statement to execute.  Each Insert leg represents a single WHEN clause.  As much as possible,
   * the new SQL statement is made to look like the input SQL statement so that it's easier to map
   * Query Compiler errors from the generated SQL back to the original one.
   * The generated SQL is a complete representation of the original input for the same reason.
   * In many places SemanticAnalyzer throws exceptions that contain (line, position) coordinates.
   * If generated SQL doesn't have everything and is patched up later, these coordinates point to
   * the wrong place.
   *
   * @throws SemanticException
   */
  private void analyzeMerge(ASTNode tree) throws SemanticException {
    currentOperation = Context.Operation.MERGE;
    quotedIdenfierHelper = new IdentifierQuoter(ctx.getTokenRewriteStream());
    /*
     * See org.apache.hadoop.hive.ql.parse.TestMergeStatement for some examples of the merge AST.
     * For example, given:
     *   merge into acidTbl using nonAcidPart2 source ON acidTbl.a = source.a2
     *   WHEN MATCHED THEN UPDATE set b = source.b2
     *   WHEN NOT MATCHED THEN INSERT VALUES(source.a2, source.b2)
     * We get an AST like this:
     *   "(tok_merge " +
     *   "(tok_tabname acidtbl) (tok_tabref (tok_tabname nonacidpart2) source) " +
     *   "(= (. (tok_table_or_col acidtbl) a) (. (tok_table_or_col source) a2)) " +
     *   "(tok_matched " +
     *   "(tok_update " +
     *   "(tok_set_columns_clause (= (tok_table_or_col b) (. (tok_table_or_col source) b2))))) " +
     *   "(tok_not_matched " +
     *   "tok_insert " +
     *   "(tok_value_row (. (tok_table_or_col source) a2) (. (tok_table_or_col source) b2))))");
     * And we need to produce a multi-insert like this to execute:
     *   FROM acidTbl right outer join nonAcidPart2 ON acidTbl.a = source.a2
     *   Insert into table acidTbl select nonAcidPart2.a2, nonAcidPart2.b2 where acidTbl.a is null
     *   INSERT INTO TABLE acidTbl select target.ROW__ID, nonAcidPart2.a2, nonAcidPart2.b2
     *     where nonAcidPart2.a2=acidTbl.a sort by acidTbl.ROW__ID
     */
    /* todo: we need some sort of validation phase over the original AST to make things user
     *   friendly; for example, if the original command refers to a column that doesn't exist,
     *   this will be caught when processing the rewritten query but the errors will point at
     *   locations that the user can't map to anything
     * - VALUES clause must have the same number of values as the target table (including partition
     *   cols).  Part cols go last in the Select clause of Insert as Select
     * todo: do we care to preserve comments in the original SQL?
     * todo: check if identifiers are properly escaped/quoted in the generated SQL - it's currently
     *   inconsistent.  Look at UnparseTranslator.addIdentifierTranslation() - it does unescape +
     *   unparse...
     * todo: consider "WHEN NOT MATCHED BY SOURCE THEN UPDATE SET TargetTable.Col1 = SourceTable.Col1";
     *   what happens when source is empty?  This should be a runtime error - maybe not: the outer
     *   side of the ROJ is empty => the join produces 0 rows.  If supporting WHEN NOT MATCHED BY
     *   SOURCE, then this should be a runtime error
     */
    ASTNode target = (ASTNode)tree.getChild(0);
    ASTNode source = (ASTNode)tree.getChild(1);
    String targetName = getSimpleTableName(target);
    String sourceName = getSimpleTableName(source);
    ASTNode onClause = (ASTNode) tree.getChild(2);
    String onClauseAsText = getMatchedText(onClause);

    Table targetTable = getTargetTable(target);
    validateTargetTable(targetTable);
    List<ASTNode> whenClauses = findWhenClauses(tree);

    StringBuilder rewrittenQueryStr = new StringBuilder("FROM\n");
    rewrittenQueryStr.append(Indent).append(getFullTableNameForSQL(target));
    if (isAliased(target)) {
      rewrittenQueryStr.append(" ").append(targetName);
    }
    rewrittenQueryStr.append('\n');
    rewrittenQueryStr.append(Indent).append(chooseJoinType(whenClauses)).append("\n");
    if (source.getType() == HiveParser.TOK_SUBQUERY) {
      //this includes the mandatory alias
      rewrittenQueryStr.append(Indent).append(getMatchedText(source));
    } else {
      rewrittenQueryStr.append(Indent).append(getFullTableNameForSQL(source));
      if (isAliased(source)) {
        rewrittenQueryStr.append(" ").append(sourceName);
      }
    }
    rewrittenQueryStr.append('\n');
    rewrittenQueryStr.append(Indent).append("ON ").append(onClauseAsText).append('\n');

    /**
     * We allow at most 2 WHEN MATCHED clauses, in which case one must be Update and the other Delete.
     * If we have both update and delete, the 1st one (in SQL code) must have "AND <extra predicate>"
     * so that the 2nd one can ensure not to process the same rows.
     * Update and Delete may be in any order.  (Insert is always last.)
     */
    String extraPredicate = null;
    int numWhenMatchedUpdateClauses = 0, numWhenMatchedDeleteClauses = 0;
    int numInsertClauses = 0;
    for (ASTNode whenClause : whenClauses) {
      switch (getWhenClauseOperation(whenClause).getType()) {
        case HiveParser.TOK_INSERT:
          numInsertClauses++;
          handleInsert(whenClause, rewrittenQueryStr, target, onClause, targetTable, targetName,
              onClauseAsText);
          break;
        case HiveParser.TOK_UPDATE:
          numWhenMatchedUpdateClauses++;
          String s = handleUpdate(whenClause, rewrittenQueryStr, target, onClauseAsText,
              targetTable, extraPredicate);
          if (numWhenMatchedUpdateClauses + numWhenMatchedDeleteClauses == 1) {
            extraPredicate = s; //i.e. it's the 1st WHEN MATCHED
          }
          break;
        case HiveParser.TOK_DELETE:
          numWhenMatchedDeleteClauses++;
          String s1 = handleDelete(whenClause, rewrittenQueryStr, target, onClauseAsText,
              targetTable, extraPredicate);
          if (numWhenMatchedUpdateClauses + numWhenMatchedDeleteClauses == 1) {
            extraPredicate = s1; //i.e. it's the 1st WHEN MATCHED
          }
          break;
        default:
          throw new IllegalStateException("Unexpected WHEN clause type: " + whenClause.getType() +
              addParseInfo(whenClause));
      }
      if (numWhenMatchedDeleteClauses > 1) {
        throw new SemanticException(ErrorMsg.MERGE_TOO_MANY_DELETE, ctx.getCmd());
      }
      if (numWhenMatchedUpdateClauses > 1) {
        throw new SemanticException(ErrorMsg.MERGE_TOO_MANY_UPDATE, ctx.getCmd());
      }
      assert numInsertClauses < 2;
      if (numInsertClauses == 1 && numWhenMatchedUpdateClauses == 1) {
        if (AcidUtils.getAcidOperationalProperties(targetTable).isSplitUpdate()) {
          throw new IllegalStateException("Tables with " +
              hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES + "=" +
              TransactionalValidationListener.DEFAULT_TRANSACTIONAL_PROPERTY + " currently do not " +
              "support MERGE with both Insert and Update clauses.");
        }
      }
    }
    if (numWhenMatchedDeleteClauses + numWhenMatchedUpdateClauses == 2 && extraPredicate == null) {
      throw new SemanticException(ErrorMsg.MERGE_PREDIACTE_REQUIRED, ctx.getCmd());
    }
    boolean validating = handleCardinalityViolation(rewrittenQueryStr, target, onClauseAsText,
        targetTable, numWhenMatchedDeleteClauses == 0 && numWhenMatchedUpdateClauses == 0);
    ReparseResult rr = parseRewrittenQuery(rewrittenQueryStr, ctx.getCmd());
    Context rewrittenCtx = rr.rewrittenCtx;
    ASTNode rewrittenTree = rr.rewrittenTree;
    rewrittenCtx.setOperation(Context.Operation.MERGE);

    //set dest name mapping on new context; 1st child is TOK_FROM
    for (int insClauseIdx = 1, whenClauseIdx = 0;
        insClauseIdx < rewrittenTree.getChildCount() -
            (validating ? 1 : 0/*skip cardinality violation clause*/);
        insClauseIdx++, whenClauseIdx++) {
      //we've added Insert clauses in order of WHEN items in whenClauses
      ASTNode insertClause = (ASTNode) rewrittenTree.getChild(insClauseIdx);
      switch (getWhenClauseOperation(whenClauses.get(whenClauseIdx)).getType()) {
        case HiveParser.TOK_INSERT:
          rewrittenCtx.addDestNamePrefix(insClauseIdx, Context.DestClausePrefix.INSERT);
          break;
        case HiveParser.TOK_UPDATE:
          rewrittenCtx.addDestNamePrefix(insClauseIdx, Context.DestClausePrefix.UPDATE);
          break;
        case HiveParser.TOK_DELETE:
          rewrittenCtx.addDestNamePrefix(insClauseIdx, Context.DestClausePrefix.DELETE);
          break;
        default:
          assert false;
      }
    }
    if (validating) {
      //here means the last branch of the multi-insert is Cardinality Validation
      rewrittenCtx.addDestNamePrefix(rewrittenTree.getChildCount() - 1,
          Context.DestClausePrefix.INSERT);
    }
    try {
      useSuper = true;
      super.analyze(rewrittenTree, rewrittenCtx);
    } finally {
      useSuper = false;
    }
    updateOutputs(targetTable);
  }

  /**
   * SemanticAnalyzer will generate a WriteEntity for the target table since it doesn't know/check
   * if the read and write are of the same table in "insert ... select ....".  Since DbTxnManager
   * uses Read/WriteEntity objects to decide which locks to acquire, we get more concurrency if we
   * change the table WriteEntity to a set of partition WriteEntity objects based on
   * ReadEntity objects computed for this table.
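   * For example (illustrative): if the optimizer restricted the read to partitions p=1 and p=2 of
   * the target table, the single table-level UPDATE/DELETE WriteEntity is replaced below with one
   * WriteEntity per such partition, so DbTxnManager only has to lock those partitions.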
   */
  private void updateOutputs(Table targetTable) {
    markReadEntityForUpdate();

    if (targetTable.isPartitioned()) {
      List<ReadEntity> partitionsRead = getRestrictedPartitionSet(targetTable);
      if (!partitionsRead.isEmpty()) {
        //if there is a WriteEntity with WriteType=UPDATE/DELETE for the target table, replace it
        //with a WriteEntity for each partition
        List<WriteEntity> toRemove = new ArrayList<>();
        for (WriteEntity we : outputs) {
          WriteEntity.WriteType wt = we.getWriteType();
          if (isTargetTable(we, targetTable) &&
              (wt == WriteEntity.WriteType.UPDATE || wt == WriteEntity.WriteType.DELETE)) {
            /**
             * The assumption here is that SemanticAnalyzer will generate a ReadEntity for each
             * partition that exists and is matched by the WHERE clause (which may be all of them).
             * Since we don't allow updating the value of a partition column, we know that we always
             * write the same (or fewer) partitions than we read.  Still, the write is a Dynamic
             * Partition write - see HIVE-15032.
             */
            toRemove.add(we);
          }
        }
        outputs.removeAll(toRemove);
        for (ReadEntity re : partitionsRead) {
          for (WriteEntity original : toRemove) {
            //since we may have both Update and Delete branches, Auth needs to know
            WriteEntity we = new WriteEntity(re.getPartition(), original.getWriteType());
            we.setDynamicPartitionWrite(original.isDynamicPartitionWrite());
            outputs.add(we);
          }
        }
      }
    }
  }

  /**
   * If the optimizer has determined that it only has to read some of the partitions of the
   * target table to satisfy the query, then we know that the write side of update/delete
   * (and update/delete parts of merge)
   * can only write (at most) that set of partitions (since we currently don't allow updating
   * partition (or bucket) columns).  So we want to replace the table level
   * WriteEntity in the outputs with a WriteEntity for each of these partitions.
   * ToDo: see if this should be moved to SemanticAnalyzer itself since it applies to any
   * insert which does a select against the same table.  Then SemanticAnalyzer would also
   * be able to not use DP for the Insert...
   *
   * Note that the Insert of Merge may be creating new partitions and writing to partitions
   * which were not read (WHEN NOT MATCHED...).  WriteEntity for that should be created
   * in MoveTask (or some other task after the query is complete).
   */
  private List<ReadEntity> getRestrictedPartitionSet(Table targetTable) {
    List<ReadEntity> partitionsRead = new ArrayList<>();
    for (ReadEntity re : inputs) {
      if (re.isFromTopLevelQuery && re.getType() == Entity.Type.PARTITION &&
          isTargetTable(re, targetTable)) {
        partitionsRead.add(re);
      }
    }
    return partitionsRead;
  }

  /**
   * if there is no WHEN NOT MATCHED THEN INSERT, we don't outer join
   */
  private String chooseJoinType(List<ASTNode> whenClauses) {
    for (ASTNode whenClause : whenClauses) {
      if (getWhenClauseOperation(whenClause).getType() == HiveParser.TOK_INSERT) {
        return "RIGHT OUTER JOIN";
      }
    }
    return "INNER JOIN";
  }

  /**
   * does this Entity belong to the target table (partition)
   */
  private boolean isTargetTable(Entity entity, Table targetTable) {
    //todo: https://issues.apache.org/jira/browse/HIVE-15048
    /**
     * is this the right way to compare?  Should it just compare paths?
     * equals() impl looks heavy weight
     */
    return targetTable.equals(entity.getTable());
  }

  /**
   * Per SQL Spec ISO/IEC 9075-2:2011(E) Section 14.2 under "General Rules" Item 6/Subitem a/Subitem 2/Subitem B,
   * an error should be raised if > 1 row of "source" matches the same row in "target".
   * This should not affect the runtime of the query as it's running in parallel with other
   * branches of the multi-insert.
   * It won't actually write any data to merge_tmp_table since the
   * cardinality_violation() UDF throws an error whenever it's called, killing the query.
   * @return true if another Insert clause was added
   */
  private boolean handleCardinalityViolation(StringBuilder rewrittenQueryStr, ASTNode target,
      String onClauseAsString, Table targetTable, boolean onlyHaveWhenNotMatchedClause)
      throws SemanticException {
    if (!conf.getBoolVar(HiveConf.ConfVars.MERGE_CARDINALITY_VIOLATION_CHECK)) {
      LOG.info("Merge statement cardinality violation check is disabled: " +
          HiveConf.ConfVars.MERGE_CARDINALITY_VIOLATION_CHECK.varname);
      return false;
    }
    if (onlyHaveWhenNotMatchedClause) {
      //if there is no update or delete in the Merge, there is no need to do a cardinality check
      return false;
    }
    //this is a tmp table and thus Session scoped and acid requires SQL statement to be serial in a
    // given session, i.e. the name can be fixed across all invocations
    String tableName = "merge_tmp_table";
    rewrittenQueryStr.append("\nINSERT INTO ").append(tableName)
      .append("\n SELECT cardinality_violation(")
      .append(getSimpleTableName(target)).append(".ROW__ID");
    addPartitionColsToSelect(targetTable.getPartCols(), rewrittenQueryStr, target);

    rewrittenQueryStr.append(")\n WHERE ").append(onClauseAsString)
      .append(" GROUP BY ").append(getSimpleTableName(target)).append(".ROW__ID");
    addPartitionColsToSelect(targetTable.getPartCols(), rewrittenQueryStr, target);

    rewrittenQueryStr.append(" HAVING count(*) > 1");
    //say table T has partition column p; we are generating
    //  select cardinality_violation(ROW__ID, p) WHERE ... GROUP BY ROW__ID, p
    //the Group By args are passed to cardinality_violation to add the violating value to the error msg
    try {
      if (null == db.getTable(tableName, false)) {
        StorageFormat format = new StorageFormat(conf);
        format.processStorageFormat("TextFile");
        Table table = db.newTable(tableName);
        table.setSerializationLib(format.getSerde());
        List<FieldSchema> fields = new ArrayList<FieldSchema>();
        fields.add(new FieldSchema("val", "int", null));
        table.setFields(fields);
        table.setDataLocation(Warehouse.getDnsPath(new Path(SessionState.get().getTempTableSpace(),
            tableName), conf));
        table.getTTable().setTemporary(true);
        table.setStoredAsSubDirectories(false);
        table.setInputFormatClass(format.getInputFormat());
        table.setOutputFormatClass(format.getOutputFormat());
        db.createTable(table, true);
      }
    } catch(HiveException|MetaException e) {
      throw new SemanticException(e.getMessage(), e);
    }
    return true;
  }

  /**
   * @param onClauseAsString - because there is no clone() and we need to use in multiple places
   * @param deleteExtraPredicate - see notes at caller
   */
  private String handleUpdate(ASTNode whenMatchedUpdateClause, StringBuilder rewrittenQueryStr,
      ASTNode target, String onClauseAsString, Table targetTable, String deleteExtraPredicate)
      throws SemanticException {
    assert whenMatchedUpdateClause.getType() == HiveParser.TOK_MATCHED;
    assert getWhenClauseOperation(whenMatchedUpdateClause).getType() == HiveParser.TOK_UPDATE;
    String targetName = getSimpleTableName(target);
    rewrittenQueryStr.append("INSERT INTO ").append(getFullTableNameForSQL(target));
    addPartitionColsToInsert(targetTable.getPartCols(), rewrittenQueryStr);
    rewrittenQueryStr.append(" -- update clause\n select ").append(targetName).append(".ROW__ID");

    ASTNode setClause = (ASTNode)getWhenClauseOperation(whenMatchedUpdateClause).getChild(0);
    //columns being updated -> update expressions; "setRCols" (last param) is null because we use
    //the actual expressions before reparsing, i.e. they are known to SemanticAnalyzer logic
    Map<String, ASTNode> setColsExprs = collectSetColumnsAndExpressions(setClause, null, targetTable);
    //if the target table has cols c1,c2,c3 and partition col p1 and we had "SET c2 = 5, c1 = current_date()"
    //we want to end up with
    //  insert into target (p1) select current_date(), 5, c3, p1 where ....
    //since we take the RHS of set exactly as it was in the input, we don't need to deal with
    //quoting/escaping column/table names
    List<FieldSchema> nonPartCols = targetTable.getCols();
    for (FieldSchema fs : nonPartCols) {
      rewrittenQueryStr.append(", ");
      String name = fs.getName();
      if (setColsExprs.containsKey(name)) {
        String rhsExp = getMatchedText(setColsExprs.get(name));
        //"set a=5, b=8" - rhsExp picks up the next char (e.g. ',') from the token stream
        switch (rhsExp.charAt(rhsExp.length() - 1)) {
          case ',':
          case '\n':
            rhsExp = rhsExp.substring(0, rhsExp.length() - 1);
        }
        rewrittenQueryStr.append(rhsExp);
      } else {
        rewrittenQueryStr.append(getSimpleTableName(target)).append(".")
            .append(HiveUtils.unparseIdentifier(name, this.conf));
      }
    }
    addPartitionColsToSelect(targetTable.getPartCols(), rewrittenQueryStr, target);
    rewrittenQueryStr.append("\n WHERE ").append(onClauseAsString);
    String extraPredicate = getWhenClausePredicate(whenMatchedUpdateClause);
    if (extraPredicate != null) {
      //we have WHEN MATCHED AND <boolean expr> THEN UPDATE
      rewrittenQueryStr.append(" AND ").append(extraPredicate);
    }
    if (deleteExtraPredicate != null) {
      rewrittenQueryStr.append(" AND NOT(").append(deleteExtraPredicate).append(")");
    }
    rewrittenQueryStr.append("\n sort by ");
    rewrittenQueryStr.append(targetName).append(".ROW__ID \n");

    setUpAccessControlInfoForUpdate(targetTable, setColsExprs);
    //we don't deal with columns on the RHS of a SET expression since the whole expr is part of the
    //rewritten SQL statement and is thus handled by SemanticAnalyzer.  Nor do we have to
    //figure out which cols on the RHS are from source and which from target
    return extraPredicate;
  }

  /**
   * @param onClauseAsString - because there is no clone() and we need to use in multiple places
   * @param updateExtraPredicate - see notes at caller
   */
  private String handleDelete(ASTNode whenMatchedDeleteClause, StringBuilder rewrittenQueryStr,
      ASTNode target, String onClauseAsString, Table targetTable, String updateExtraPredicate)
      throws SemanticException {
    assert whenMatchedDeleteClause.getType() == HiveParser.TOK_MATCHED;
    assert getWhenClauseOperation(whenMatchedDeleteClause).getType() == HiveParser.TOK_DELETE;
    List<FieldSchema> partCols = targetTable.getPartCols();
    String targetName = getSimpleTableName(target);
    rewrittenQueryStr.append("INSERT INTO ").append(getFullTableNameForSQL(target));
    addPartitionColsToInsert(partCols, rewrittenQueryStr);

    rewrittenQueryStr.append(" -- delete clause\n select ").append(targetName).append(".ROW__ID ");
    addPartitionColsToSelect(partCols, rewrittenQueryStr, target);
    rewrittenQueryStr.append("\n WHERE ").append(onClauseAsString);
    String extraPredicate = getWhenClausePredicate(whenMatchedDeleteClause);
    if (extraPredicate != null) {
      //we have WHEN MATCHED AND <boolean expr> THEN DELETE
      rewrittenQueryStr.append(" AND ").append(extraPredicate);
    }
    if (updateExtraPredicate != null) {
      rewrittenQueryStr.append(" AND NOT(").append(updateExtraPredicate).append(")");
    }
    rewrittenQueryStr.append("\n sort by ");
    rewrittenQueryStr.append(targetName).append(".ROW__ID \n");
    return extraPredicate;
  }

  private static String addParseInfo(ASTNode n) {
    return " at " + ErrorMsg.renderPosition(n);
  }

  /**
   * Returns the table name to use in the generated query preserving original quotes/escapes if any
   * @see #getFullTableNameForSQL(ASTNode)
   */
  private String getSimpleTableName(ASTNode n) throws SemanticException {
    return HiveUtils.unparseIdentifier(getSimpleTableNameBase(n), this.conf);
  }

  private String getSimpleTableNameBase(ASTNode n) throws SemanticException {
    switch (n.getType()) {
      case HiveParser.TOK_TABREF:
        int aliasIndex = findTabRefIdxs(n)[0];
        if (aliasIndex != 0) {
          return n.getChild(aliasIndex).getText();//the alias
        }
        return getSimpleTableNameBase((ASTNode) n.getChild(0));
      case HiveParser.TOK_TABNAME:
        if (n.getChildCount() == 2) {
          //db.table -> return table
          return n.getChild(1).getText();
        }
        return n.getChild(0).getText();
      case HiveParser.TOK_SUBQUERY:
        return n.getChild(1).getText();//the alias
      default:
        throw raiseWrongType("TOK_TABREF|TOK_TABNAME|TOK_SUBQUERY", n);
    }
  }

  /**
   * @return table name in db.table form with proper quoting/escaping to be used in a SQL statement
   */
  private String getFullTableNameForSQL(ASTNode n) throws SemanticException {
    switch (n.getType()) {
      case HiveParser.TOK_TABNAME:
        String[] tableName = getQualifiedTableName(n);
        return getDotName(new String[] {
            HiveUtils.unparseIdentifier(tableName[0], this.conf),
            HiveUtils.unparseIdentifier(tableName[1], this.conf) });
      case HiveParser.TOK_TABREF:
        return getFullTableNameForSQL((ASTNode) n.getChild(0));
      default:
        throw raiseWrongType("TOK_TABNAME", n);
    }
  }

  private static final class ReparseResult {
    private final ASTNode rewrittenTree;
    private final Context rewrittenCtx;
    ReparseResult(ASTNode n, Context c) {
      rewrittenTree = n;
      rewrittenCtx = c;
    }
  }

  private static IllegalArgumentException raiseWrongType(String expectedTokName, ASTNode n) {
    return new IllegalArgumentException("Expected " + expectedTokName + "; got " + n.getType());
  }

  private boolean isAliased(ASTNode n) {
    switch (n.getType()) {
      case HiveParser.TOK_TABREF:
        return findTabRefIdxs(n)[0] != 0;
      case HiveParser.TOK_TABNAME:
        return false;
      case HiveParser.TOK_SUBQUERY:
        assert n.getChildCount() > 1 : "Expected Derived Table to be aliased";
        return true;
      default:
        throw raiseWrongType("TOK_TABREF|TOK_TABNAME", n);
    }
  }

  /**
   * Collect WHEN clauses from the Merge statement AST
   */
  private List<ASTNode> findWhenClauses(ASTNode tree) throws SemanticException {
    assert tree.getType() == HiveParser.TOK_MERGE;
    List<ASTNode> whenClauses = new ArrayList<>();
    for (int idx = 3; idx < tree.getChildCount(); idx++) {
      ASTNode whenClause = (ASTNode)tree.getChild(idx);
      assert whenClause.getType() == HiveParser.TOK_MATCHED ||
          whenClause.getType() == HiveParser.TOK_NOT_MATCHED :
          "Unexpected node type found: " + whenClause.getType() + addParseInfo(whenClause);
      whenClauses.add(whenClause);
    }
    if (whenClauses.size() <= 0) {
      //Futureproofing: the parser will actually not allow this
      throw new SemanticException("Must have at least 1 WHEN clause in MERGE statement");
    }
    return whenClauses;
  }

  private ASTNode getWhenClauseOperation(ASTNode whenClause) {
    if (!(whenClause.getType() == HiveParser.TOK_MATCHED ||
        whenClause.getType() == HiveParser.TOK_NOT_MATCHED)) {
      throw raiseWrongType("Expected TOK_MATCHED|TOK_NOT_MATCHED", whenClause);
    }
    return (ASTNode) whenClause.getChild(0);
  }

  /**
   * returns the <boolean predicate> as in WHEN MATCHED AND <boolean predicate> THEN...
   * @return may be null
   */
  private String getWhenClausePredicate(ASTNode whenClause) {
    if (!(whenClause.getType() == HiveParser.TOK_MATCHED ||
        whenClause.getType() == HiveParser.TOK_NOT_MATCHED)) {
      throw raiseWrongType("Expected TOK_MATCHED|TOK_NOT_MATCHED", whenClause);
    }
    if (whenClause.getChildCount() == 2) {
      return getMatchedText((ASTNode)whenClause.getChild(1));
    }
    return null;
  }

  /**
   * Generates the Insert leg of the multi-insert SQL to represent the WHEN NOT MATCHED THEN INSERT clause
   * @param targetTableNameInSourceQuery - simple name/alias
   * @throws SemanticException
   */
  private void handleInsert(ASTNode whenNotMatchedClause, StringBuilder rewrittenQueryStr,
      ASTNode target, ASTNode onClause, Table targetTable, String targetTableNameInSourceQuery,
      String onClauseAsString) throws SemanticException {
    assert whenNotMatchedClause.getType() == HiveParser.TOK_NOT_MATCHED;
    assert getWhenClauseOperation(whenNotMatchedClause).getType() == HiveParser.TOK_INSERT;
    List<FieldSchema> partCols = targetTable.getPartCols();
    String valuesClause = getMatchedText(
        (ASTNode)getWhenClauseOperation(whenNotMatchedClause).getChild(0));
    valuesClause = valuesClause.substring(1, valuesClause.length() - 1);//strip '(' and ')'
    rewrittenQueryStr.append("INSERT INTO ").append(getFullTableNameForSQL(target));
    addPartitionColsToInsert(partCols, rewrittenQueryStr);

    OnClauseAnalyzer oca = new OnClauseAnalyzer(onClause, targetTable, targetTableNameInSourceQuery,
        conf, onClauseAsString);
    oca.analyze();
    rewrittenQueryStr.append(" -- insert clause\n select ")
      .append(valuesClause).append("\n WHERE ").append(oca.getPredicate());
    String extraPredicate = getWhenClausePredicate(whenNotMatchedClause);
    if (extraPredicate != null) {
      //we have WHEN NOT MATCHED AND <boolean expr> THEN INSERT
      rewrittenQueryStr.append(" AND ")
        .append(getMatchedText(((ASTNode)whenNotMatchedClause.getChild(1)))).append('\n');
    }
  }

  /**
   * Suppose the input Merge statement has ON target.a = source.b and c = d.  Assume that 'c' is from
   * the target table and 'd' is from the source expression.
   * In order to properly
   * generate the Insert for WHEN NOT MATCHED THEN INSERT, we need to make sure that the Where
   * clause of this Insert contains "target.a is null and target.c is null".  This ensures that this
   * Insert leg does not receive any rows that are processed by the Inserts corresponding to
   * WHEN MATCHED THEN ... clauses.  (Implicit in this is a mini resolver that figures out if an
   * unqualified column is part of the target table.  We can get away with this simple logic because
   * we know that the target is always a table, as opposed to some derived table.)
   * The job of this class is to generate this predicate.
   *
   * Note that this predicate cannot simply be NOT(on-clause-expr).  If on-clause-expr evaluates
   * to Unknown, it will be treated as False in the WHEN MATCHED Inserts, but NOT(Unknown) = Unknown,
   * and so it will be False for the WHEN NOT MATCHED Insert...
   */
  private static final class OnClauseAnalyzer {
    private final ASTNode onClause;
    private final Map<String, List<String>> table2column = new HashMap<>();
    private final List<String> unresolvedColumns = new ArrayList<>();
    private final List<FieldSchema> allTargetTableColumns = new ArrayList<>();
    private final Set<String> tableNamesFound = new HashSet<>();
    private final String targetTableNameInSourceQuery;
    private final HiveConf conf;
    private final String onClauseAsString;

    /**
     * @param targetTableNameInSourceQuery alias or simple name
     */
    OnClauseAnalyzer(ASTNode onClause, Table targetTable, String targetTableNameInSourceQuery,
        HiveConf conf, String onClauseAsString) {
      this.onClause = onClause;
      allTargetTableColumns.addAll(targetTable.getCols());
      allTargetTableColumns.addAll(targetTable.getPartCols());
      this.targetTableNameInSourceQuery = unescapeIdentifier(targetTableNameInSourceQuery);
      this.conf = conf;
      this.onClauseAsString = onClauseAsString;
    }

    /**
     * finds all columns and groups them by table ref (if there is one)
     */
    private void visit(ASTNode n) {
      if (n.getType() == HiveParser.TOK_TABLE_OR_COL) {
        ASTNode parent = (ASTNode) n.getParent();
        if (parent != null && parent.getType() == HiveParser.DOT) {
          //the ref must be a table, so look for the column name as the right child of DOT
          if (parent.getParent() != null && parent.getParent().getType() == HiveParser.DOT) {
            //I don't think this can happen... but just in case
            throw new IllegalArgumentException("Found unexpected db.table.col reference in " +
                onClauseAsString);
          }
          addColumn2Table(n.getChild(0).getText(), parent.getChild(1).getText());
        } else {
          //must be just a column name
          unresolvedColumns.add(n.getChild(0).getText());
        }
      }
      if (n.getChildCount() == 0) {
        return;
      }
      for (Node child : n.getChildren()) {
        visit((ASTNode)child);
      }
    }

    private void analyze() {
      visit(onClause);
      if (tableNamesFound.size() > 2) {
        throw new IllegalArgumentException("Found > 2 table refs in ON clause.  Found " +
            tableNamesFound + " in " + onClauseAsString);
      }
      handleUnresolvedColumns();
      if (tableNamesFound.size() > 2) {
        throw new IllegalArgumentException("Found > 2 table refs in ON clause (incl unresolved). " +
" + "Found " + tableNamesFound + " in " + onClauseAsString); } } /** * Find those that belong to target table */ private void handleUnresolvedColumns() { if(unresolvedColumns.isEmpty()) { return; } for(String c : unresolvedColumns) { for(FieldSchema fs : allTargetTableColumns) { if(c.equalsIgnoreCase(fs.getName())) { //c belongs to target table; strictly speaking there maybe an ambiguous ref but //this will be caught later when multi-insert is parsed addColumn2Table(targetTableNameInSourceQuery.toLowerCase(), c); break; } } } } private void addColumn2Table(String tableName, String columnName) { tableName = tableName.toLowerCase();//normalize name for mapping tableNamesFound.add(tableName); List<String> cols = table2column.get(tableName); if(cols == null) { cols = new ArrayList<>(); table2column.put(tableName, cols); } //we want to preserve 'columnName' as it was in original input query so that rewrite //looks as much as possible like original query cols.add(columnName); } /** * Now generate the predicate for Where clause */ private String getPredicate() { //normilize table name for mapping List<String> targetCols = table2column.get(targetTableNameInSourceQuery.toLowerCase()); if(targetCols == null) { /*e.g. ON source.t=1 * this is not strictly speaking invlaid but it does ensure that all columns from target * table are all NULL for every row. This would make any WHEN MATCHED clause invalid since * we don't have a ROW__ID. The WHEN NOT MATCHED could be meaningful but it's just data from * source satisfying source.t=1... not worth the effort to support this*/ throw new IllegalArgumentException(ErrorMsg.INVALID_TABLE_IN_ON_CLAUSE_OF_MERGE .format(targetTableNameInSourceQuery, onClauseAsString)); } StringBuilder sb = new StringBuilder(); for(String col : targetCols) { if(sb.length() > 0) { sb.append(" AND "); } //but preserve table name in SQL sb.append(HiveUtils.unparseIdentifier(targetTableNameInSourceQuery, conf)).append(".").append(HiveUtils.unparseIdentifier(col, conf)).append(" IS NULL"); } return sb.toString(); } } }