// RowResolver.java example
//
// Explorer
// hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.parse;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.RowSchema;

/**
 * Implementation of the Row Resolver.
 *
 * <p>A RowResolver maps {@code (table alias, column alias)} pairs to
 * {@link ColumnInfo} objects and keeps the inverse mapping from a column's
 * internal name back to the alias pair(s) it was registered under. Table
 * aliases are stored lower-cased; a {@code null} table alias is a legal key.
 */
public class RowResolver implements Serializable {
  private static final long serialVersionUID = 1L;

  /** Schema whose signature collects every column registered through {@link #put}. */
  private RowSchema rowSchema;

  /** Table alias (lower-cased, may be null) -> column alias -> column info. */
  private LinkedHashMap<String, LinkedHashMap<String, ColumnInfo>> rslvMap;

  /** Internal column name -> {table alias, column alias} of its first mapping. */
  private HashMap<String, String[]> invRslvMap;
  /*
   * now a Column can have an alternate mapping.
   * This captures the alternate mapping.
   * The primary(first) mapping is still only held in
   * invRslvMap.
   */
  private final Map<String, String[]> altInvRslvMap;

  /** String rendering of an expression AST -> the AST node it was registered with. */
  private Map<String, ASTNode> expressionMap;

  // TODO: Refactor this and do in a more object oriented manner
  private boolean isExprResolver;

  private static final Logger LOG = LoggerFactory.getLogger(RowResolver.class.getName());

  private NamedJoinInfo namedJoinInfo;

  /** Creates an empty resolver with an empty row schema. */
  public RowResolver() {
    rowSchema = new RowSchema();
    rslvMap = new LinkedHashMap<String, LinkedHashMap<String, ColumnInfo>>();
    invRslvMap = new HashMap<String, String[]>();
    altInvRslvMap = new HashMap<String, String[]>();
    expressionMap = new HashMap<String, ASTNode>();
    isExprResolver = false;
  }

  /**
   * Puts a resolver entry corresponding to a source expression which is to be
   * used for identical expression recognition (e.g. for matching expressions
   * in the SELECT list with the GROUP BY clause).  The convention for such
   * entries is an empty-string ("") as the table alias together with the
   * string rendering of the ASTNode as the column alias.
   *
   * @param node the source expression
   * @param colInfo the column info the expression resolves to
   */
  public void putExpression(ASTNode node, ColumnInfo colInfo) {
    String treeAsString = node.toStringTree();
    expressionMap.put(treeAsString, node);
    put("", treeAsString, colInfo);
  }

  /**
   * Retrieves the ColumnInfo corresponding to a source expression which
   * exactly matches the string rendering of the given ASTNode.
   *
   * @param node the expression to look up
   * @return the matching column info, or null if none is registered
   * @throws SemanticException propagated from {@link #get(String, String)}
   */
  public ColumnInfo getExpression(ASTNode node) throws SemanticException {
    return get("", node.toStringTree());
  }

  /**
   * Retrieves the source expression matching a given ASTNode's
   * string rendering exactly.
   *
   * @param node the expression to look up
   * @return the originally registered node, or null if none is registered
   */
  public ASTNode getExpressionSource(ASTNode node) {
    return expressionMap.get(node.toStringTree());
  }

  /**
   * Registers a mapping from {@code tab_alias.col_alias} to the given column.
   * If the column's internal name has not been mapped before, the lower-cased
   * aliases are also stored on the column info and the column is appended to
   * the row schema; otherwise only the additional mapping is recorded.
   *
   * @param tab_alias table alias, may be null
   * @param col_alias column alias, may be null
   * @param colInfo the column being registered
   */
  public void put(String tab_alias, String col_alias, ColumnInfo colInfo) {
    if (!addMappingOnly(tab_alias, col_alias, colInfo)) {
      //Make sure that the table alias and column alias are stored
      //in the column info
      if (tab_alias != null) {
        colInfo.setTabAlias(tab_alias.toLowerCase());
      }
      if (col_alias != null) {
        colInfo.setAlias(col_alias.toLowerCase());
      }
      rowSchema.getSignature().add(colInfo);
    }
  }

  /**
   * Records a forward and inverse mapping for the column without touching the
   * row schema or the column info itself.
   *
   * @param tab_alias table alias, may be null; lower-cased before use
   * @param col_alias column alias, used as given
   * @param colInfo the column being mapped
   * @return true if the column's internal name already had an inverse mapping
   *         before this call (in which case this mapping is stored as the
   *         alternate mapping), false if this is its first mapping
   */
  public boolean addMappingOnly(String tab_alias, String col_alias, ColumnInfo colInfo) {
    if (tab_alias != null) {
      tab_alias = tab_alias.toLowerCase();
    }

    /*
     * allow multiple mappings to the same ColumnInfo.
     * When a ColumnInfo is mapped multiple times, only the
     * first inverse mapping is captured.
     */
    boolean colPresent = invRslvMap.containsKey(colInfo.getInternalName());

    LinkedHashMap<String, ColumnInfo> f_map = rslvMap.get(tab_alias);
    if (f_map == null) {
      f_map = new LinkedHashMap<String, ColumnInfo>();
      rslvMap.put(tab_alias, f_map);
    }
    ColumnInfo oldColInfo = f_map.put(col_alias, colInfo);
    if (oldColInfo != null) {
      LOG.warn("Duplicate column info for {}.{} was overwritten in RowResolver map: {} by {}",
          tab_alias, col_alias, oldColInfo, colInfo);
    }

    String[] qualifiedAlias = new String[] {tab_alias, col_alias};
    if (!colPresent) {
      invRslvMap.put(colInfo.getInternalName(), qualifiedAlias);
    } else {
      // Each later mapping replaces the previously stored alternate mapping.
      altInvRslvMap.put(colInfo.getInternalName(), qualifiedAlias);
    }

    return colPresent;
  }

  /**
   * Tells whether any column has been registered under the given table alias.
   *
   * @param tab_alias table alias to test (case-insensitive); null tests the
   *          null-alias entry
   * @return true if a column map exists for the alias
   */
  public boolean hasTableAlias(String tab_alias) {
    // null is a legal table-alias key, so look it up the same way getFieldMap does
    // instead of failing on toLowerCase().
    return getFieldMap(tab_alias) != null;
  }

  /**
   * Gets the column Info to tab_alias.col_alias type of a column reference. If
   * the tab_alias is not provided as can be the case with a non aliased
   * column, this function looks up the column in all the table aliases in this
   * row resolver and returns the match. It also throws an exception if the
   * column is found in multiple table aliases. If no match is found a null
   * value is returned.
   *
   * This allows us to interpret both select t.c1 type of references and select
   * c1 kind of references. The latter kind are what we call non aliased column
   * references in the query.
   *
   * @param tab_alias
   *          The table alias to match (this is null if the column reference is
   *          non aliased)
   * @param col_alias
   *          The column name that is being searched for
   * @return ColumnInfo
   * @throws SemanticException
   *           if a non aliased column matches under more than one non-null
   *           table alias
   */
  public ColumnInfo get(String tab_alias, String col_alias) throws SemanticException {
    ColumnInfo ret = null;

    if (tab_alias != null) {
      tab_alias = tab_alias.toLowerCase();
      Map<String, ColumnInfo> f_map = rslvMap.get(tab_alias);
      if (f_map == null) {
        return null;
      }
      // Qualified lookups match the column alias exactly as it was stored.
      ret = f_map.get(col_alias);
    } else {
      boolean found = false;
      String foundTbl = null;
      for (Map.Entry<String, LinkedHashMap<String, ColumnInfo>> rslvEntry: rslvMap.entrySet()) {
        String rslvKey = rslvEntry.getKey();
        LinkedHashMap<String, ColumnInfo> cmap = rslvEntry.getValue();
        for (Map.Entry<String, ColumnInfo> cmapEnt : cmap.entrySet()) {
          if (col_alias.equalsIgnoreCase(cmapEnt.getKey())) {
            /*
             * We can have an unaliased and one aliased mapping to a Column.
             */
            if (found && foundTbl != null && rslvKey != null) {
              throw new SemanticException("Column " + col_alias
                  + " Found in more than One Tables/Subqueries");
            }
            found = true;
            foundTbl = rslvKey == null ? foundTbl : rslvKey;
            ret = cmapEnt.getValue();
          }
        }
      }
    }

    return ret;
  }

  /**
   * @return the signature list of the underlying row schema (not a copy)
   */
  public ArrayList<ColumnInfo> getColumnInfos() {
    return rowSchema.getSignature();
  }

  /**
   * Get a list of aliases for non-hidden columns
   * @param tableAlias table alias whose columns are wanted (case-insensitive);
   *          if no columns are registered under it, the columns of the whole
   *          row schema are listed instead
   * @param max the maximum number of columns to return; values &lt;= 0 mean no limit
   * @return a list of non-hidden column names no greater in size than max
   */
  public List<String> getReferenceableColumnAliases(String tableAlias, int max) {
    int count = 0;
    Set<String> columnNames = new LinkedHashSet<String> ();

    int tables = rslvMap.size();

    // Table aliases are stored lower-cased, so normalise the lookup key the same way;
    // otherwise a mixed-case alias would never match its own columns.
    Map<String, ColumnInfo> mapping = getFieldMap(tableAlias);
    if (mapping != null) {
      for (Map.Entry<String, ColumnInfo> entry : mapping.entrySet()) {
        if (max > 0 && count >= max) {
          break;
        }
        ColumnInfo columnInfo = entry.getValue();
        if (!columnInfo.isHiddenVirtualCol()) {
          columnNames.add(entry.getKey());
          count++;
        }
      }
    } else {
      for (ColumnInfo columnInfo : getColumnInfos()) {
        if (max > 0 && count >= max) {
          break;
        }
        if (!columnInfo.isHiddenVirtualCol()) {
          String[] inverse = !isExprResolver ? reverseLookup(columnInfo.getInternalName()) : null;
          if (inverse != null) {
            // Qualify with the table alias only when there is one and more than one table.
            columnNames.add(inverse[0] == null || tables <= 1 ? inverse[1] :
                inverse[0] + "." + inverse[1]);
          } else {
            columnNames.add(columnInfo.getAlias());
          }
          count++;
        }
      }
    }
    return new ArrayList<String>(columnNames);
  }

  /**
   * Returns the column map registered under a table alias.
   *
   * @param tabAlias table alias (case-insensitive), or null for the null-alias entry
   * @return the live column-alias map for that table, or null if there is none
   */
  public LinkedHashMap<String, ColumnInfo> getFieldMap(String tabAlias) {
    return rslvMap.get(tabAlias == null ? null : tabAlias.toLowerCase());
  }

  /**
   * Finds the position of a column in the row schema signature.
   *
   * @param internalName internal name of the column
   * @return zero-based index of the first column with that internal name, or -1
   */
  public int getPosition(String internalName) {
    int pos = -1;

    for (ColumnInfo var : rowSchema.getSignature()) {
      ++pos;
      if (var.getInternalName().equals(internalName)) {
        return pos;
      }
    }

    return -1;
  }

  /**
   * @return the key-set view of the table aliases known to this resolver
   */
  public Set<String> getTableNames() {
    return rslvMap.keySet();
  }

  /**
   * Looks up the primary (first registered) alias pair of a column.
   *
   * @param internalName internal name of the column
   * @return {@code {table alias, column alias}}, or null if the column is unknown
   */
  public String[] reverseLookup(String internalName) {
    return invRslvMap.get(internalName);
  }

  public void setIsExprResolver(boolean isExprResolver) {
    this.isExprResolver = isExprResolver;
  }

  public boolean getIsExprResolver() {
    return isExprResolver;
  }

  /**
   * Looks up the alternate alias pair of a column, if it was mapped more than once.
   *
   * @param internalName internal name of the column
   * @return {@code {table alias, column alias}} of the alternate mapping, or null
   */
  public String[] getAlternateMappings(String internalName) {
    return altInvRslvMap.get(internalName);
  }

  /**
   * Renders every table alias followed by its (column alias, column info) pairs.
   */
  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();

    for (Map.Entry<String, LinkedHashMap<String, ColumnInfo>> e : rslvMap.entrySet()) {
      sb.append(e.getKey()).append("{");
      Map<String, ColumnInfo> f_map = e.getValue();
      if (f_map != null) {
        for (Map.Entry<String, ColumnInfo> entry : f_map.entrySet()) {
          sb.append("(").append(entry.getKey()).append(",")
              .append(entry.getValue().toString()).append(")");
        }
      }
      sb.append("} ");
    }
    return sb.toString();
  }

  public RowSchema getRowSchema() {
    return rowSchema;
  }

  /**
   * @return the live forward map (table alias -> column alias -> column info)
   */
  public LinkedHashMap<String, LinkedHashMap<String, ColumnInfo>> getRslvMap() {
    return rslvMap;
  }

  public Map<String, ASTNode> getExpressionMap() {
    return expressionMap;
  }

  /** Same effect as {@link #setIsExprResolver(boolean)}. */
  public void setExprResolver(boolean isExprResolver) {
    this.isExprResolver = isExprResolver;
  }

  /**
   * @param column internal column name
   * @return true if the column has a primary inverse mapping in this resolver
   */
  public boolean doesInvRslvMapContain(String column) {
    return getInvRslvMap().containsKey(column);
  }

  public void setRowSchema(RowSchema rowSchema) {
    this.rowSchema = rowSchema;
  }

  public void setExpressionMap(Map<String, ASTNode> expressionMap) {
    this.expressionMap = expressionMap;
  }

  /** Mutable int holder used to carry the next output column position across add() calls. */
  private static class IntRef {
    public int val = 0;
  }

  /**
   * Adds at most {@code numColumns} columns of one resolver to another,
   * numbering the new internal names from position 0.
   *
   * @param rrToAddTo resolver that receives the columns
   * @param rrToAddFrom resolver whose schema columns are copied
   * @param numColumns maximum number of columns to copy; negative means all
   * @return true if no duplicate column mappings were encountered
   * @throws SemanticException on ambiguous lookups or a column without an alias mapping
   */
  public static boolean add(RowResolver rrToAddTo, RowResolver rrToAddFrom, int numColumns)
      throws SemanticException {
    return add(rrToAddTo, rrToAddFrom, null, numColumns);
  }

  // TODO: 1) How to handle collisions? 2) Should we be cloning ColumnInfo or not?
  /**
   * Copies columns of {@code rrToAddFrom} (in schema order) into
   * {@code rrToAddTo}. Each column is cloned and given a fresh internal name
   * derived from the running output position.
   *
   * @param rrToAddTo resolver that receives the columns
   * @param rrToAddFrom resolver whose schema columns are copied
   * @param outputColPosRef in/out holder of the next output position; null starts at 0
   * @param numColumns maximum number of columns to copy; negative means all
   * @return true if no duplicate column mappings were encountered
   * @throws SemanticException on ambiguous lookups, or if a schema column of
   *           {@code rrToAddFrom} has no alias mapping
   */
  private static boolean add(RowResolver rrToAddTo, RowResolver rrToAddFrom,
      IntRef outputColPosRef, int numColumns) throws SemanticException {
    boolean hasDuplicates = false;
    int i = 0;

    int outputColPos = outputColPosRef == null ? 0 : outputColPosRef.val;
    for (ColumnInfo cInfoFrmInput : rrToAddFrom.getRowSchema().getSignature()) {
      if (numColumns >= 0 && i == numColumns) {
        break;
      }
      String internalName = cInfoFrmInput.getInternalName();
      String[] qualifiedColName = rrToAddFrom.reverseLookup(internalName);
      if (qualifiedColName == null) {
        // The schema can hold columns that were never mapped (e.g. after setRowSchema);
        // report that explicitly rather than failing with a bare NullPointerException.
        throw new SemanticException("Column " + internalName
            + " has no alias mapping in the RowResolver being added from");
      }
      String tabAlias = qualifiedColName[0];
      String colAlias = qualifiedColName[1];

      ColumnInfo newCI = new ColumnInfo(cInfoFrmInput);
      newCI.setInternalName(SemanticAnalyzer.getColumnInternalName(outputColPos));

      outputColPos++;

      boolean isUnique = rrToAddTo.putWithCheck(tabAlias, colAlias, internalName, newCI);
      hasDuplicates |= (!isUnique);

      // Carry over the alternate mapping, if the source column had one.
      qualifiedColName = rrToAddFrom.getAlternateMappings(internalName);
      if (qualifiedColName != null) {
        tabAlias = qualifiedColName[0];
        colAlias = qualifiedColName[1];
        rrToAddTo.put(tabAlias, colAlias, newCI);
      }
      i++;
    }

    if (outputColPosRef != null) {
      outputColPosRef.val = outputColPos;
    }
    return !hasDuplicates;
  }

  /**
   * Adds column to RR, checking for duplicate columns. Needed because CBO cannot handle the Hive
   * behavior of blindly overwriting old mapping in RR and still somehow working after that.
   * If the alias is already taken by a different column, the internal name (when given) is
   * tried as a fallback column alias.
   *
   * @param tabAlias table alias, may be null
   * @param colAlias column alias to register under
   * @param internalName fallback column alias to use on a collision, may be null
   * @param newCI the column to add
   * @return True if mapping was added without duplicates.
   * @throws SemanticException propagated from {@link #get(String, String)}
   */
  public boolean putWithCheck(String tabAlias, String colAlias,
      String internalName, ColumnInfo newCI) throws SemanticException {
    ColumnInfo existing = get(tabAlias, colAlias);
    // Hive adds the same mapping twice... I wish we could fix stuff like that.
    if (existing == null) {
      put(tabAlias, colAlias, newCI);
      return true;
    } else if (existing.isSameColumnForRR(newCI)) {
      return true;
    }
    LOG.warn("Found duplicate column alias in RR: {} adding {}",
        existing.toMappingString(tabAlias, colAlias),
        newCI.toMappingString(tabAlias, colAlias));
    if (internalName != null) {
      existing = get(tabAlias, internalName);
      if (existing == null) {
        put(tabAlias, internalName, newCI);
        return true;
      } else if (existing.isSameColumnForRR(newCI)) {
        return true;
      }
      LOG.warn("Failed to use internal name after finding a duplicate: {}",
          existing.toMappingString(tabAlias, internalName));
    }
    return false;
  }

  /** Adds all columns, continuing the output position numbering held in {@code outputColPosRef}. */
  private static boolean add(RowResolver rrToAddTo, RowResolver rrToAddFrom,
      IntRef outputColPosRef) throws SemanticException {
    return add(rrToAddTo, rrToAddFrom, outputColPosRef, -1);
  }

  /**
   * Adds all columns of one resolver to another, numbering the new internal
   * names from position 0.
   *
   * @param rrToAddTo resolver that receives the columns
   * @param rrToAddFrom resolver whose schema columns are copied
   * @return true if no duplicate column mappings were encountered
   * @throws SemanticException on ambiguous lookups or a column without an alias mapping
   */
  public static boolean add(RowResolver rrToAddTo, RowResolver rrToAddFrom)
      throws SemanticException {
    return add(rrToAddTo, rrToAddFrom, null, -1);
  }

  /**
   * Return a new row resolver that is combination of left RR and right RR.
   * The schema will be schema of left, schema of right
   *
   * @param leftRR resolver whose columns come first
   * @param rightRR resolver whose columns follow
   * @return a new resolver holding the columns of both inputs; duplicates are
   *         only logged, not rejected
   * @throws SemanticException on ambiguous lookups or a column without an alias mapping
   */
  public static RowResolver getCombinedRR(RowResolver leftRR,
      RowResolver rightRR) throws SemanticException {
    RowResolver combinedRR = new RowResolver();
    IntRef outputColPos = new IntRef();
    if (!add(combinedRR, leftRR, outputColPos)) {
      LOG.warn("Duplicates detected when adding columns to RR: see previous message");
    }
    if (!add(combinedRR, rightRR, outputColPos)) {
      LOG.warn("Duplicates detected when adding columns to RR: see previous message");
    }
    return combinedRR;
  }

  /**
   * Creates a copy of this resolver whose maps can be modified independently.
   * The ColumnInfo, ASTNode and alias-pair objects themselves are shared.
   *
   * @return the copy
   */
  public RowResolver duplicate() {
    RowResolver resolver = new RowResolver();
    resolver.rowSchema = new RowSchema(rowSchema);
    // Copy each per-table map as well: a plain putAll would share the inner maps, so a
    // later put() on either resolver would silently change the other one.
    for (Map.Entry<String, LinkedHashMap<String, ColumnInfo>> e : rslvMap.entrySet()) {
      LinkedHashMap<String, ColumnInfo> cols = e.getValue();
      resolver.rslvMap.put(e.getKey(),
          cols == null ? null : new LinkedHashMap<String, ColumnInfo>(cols));
    }
    resolver.invRslvMap.putAll(invRslvMap);
    resolver.altInvRslvMap.putAll(altInvRslvMap);
    resolver.expressionMap.putAll(expressionMap);
    resolver.isExprResolver = isExprResolver;
    // NOTE(review): namedJoinInfo is not carried over to the copy - confirm this is intended.
    return resolver;
  }

  private HashMap<String, String[]> getInvRslvMap() {
    return invRslvMap; // If making this public, note that its ordering is undefined.
  }

  public NamedJoinInfo getNamedJoinInfo() {
    return namedJoinInfo;
  }

  public void setNamedJoinInfo(NamedJoinInfo namedJoinInfo) {
    this.namedJoinInfo = namedJoinInfo;
  }
}