/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.FileUtils;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.QB;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.TableAccessAnalyzer;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

/**
 * This transformation does bucket map join optimization.
 */
abstract public class AbstractBucketJoinProc implements NodeProcessor {

  protected ParseContext pGraphContext;

  public AbstractBucketJoinProc(ParseContext pGraphContext) {
    this.pGraphContext = pGraphContext;
  }

  public AbstractBucketJoinProc() {
  }

  @Override
  abstract public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException;

  public static List<String> getBucketFilePathsOfPartition(
      Path location, ParseContext pGraphContext) throws SemanticException {
    List<String> fileNames = new ArrayList<String>();
    try {
      FileSystem fs = location.getFileSystem(pGraphContext.getConf());
      FileStatus[] files = fs.listStatus(new Path(location.toString()),
          FileUtils.HIDDEN_FILES_PATH_FILTER);
      if (files != null) {
        for (FileStatus file : files) {
          fileNames.add(file.getPath().toString());
        }
      }
    } catch (IOException e) {
      throw new SemanticException(e);
    }
    return fileNames;
  }

  // This function checks whether all bucketing columns are also in join keys and are in same order
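  // The joinKeyOrders array is shared across every alias in the join, so each table (or each of
  // its partitions) must map a given join key to the same bucket-column position; e.g. if one
  // table records join key 0 at bucket-column index 0, another table that places the same key
  // at index 1 makes the check fail.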
  private boolean checkBucketColumns(List<String> bucketColumns,
      List<String> joinKeys,
      Integer[] joinKeyOrders) {
    if (joinKeys == null || bucketColumns == null || bucketColumns.isEmpty()) {
      return false;
    }
    for (int i = 0; i < joinKeys.size(); i++) {
      int index = bucketColumns.indexOf(joinKeys.get(i));
      if (joinKeyOrders[i] != null && joinKeyOrders[i] != index) {
        return false;
      }
      joinKeyOrders[i] = index;
    }

    // Check if the join columns contains all bucket columns.
    // If a table is bucketized on column B, but the join key is A and B,
    // it is easy to see joining on different buckets yield empty results.
    return joinKeys.containsAll(bucketColumns);
  }
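  // The bucket count of each small table partition must either divide or be a multiple of the
  // big table partition's bucket count; e.g. a big table partition with 8 buckets is compatible
  // with small tables of 2, 4, 8 or 16 buckets, but not with one of 3 buckets.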
  private boolean checkNumberOfBucketsAgainstBigTable(
      Map<String, List<Integer>> tblAliasToNumberOfBucketsInEachPartition,
      int numberOfBucketsInPartitionOfBigTable) {
    for (List<Integer> bucketNums : tblAliasToNumberOfBucketsInEachPartition.values()) {
      for (int nxt : bucketNums) {
        boolean ok = (nxt >= numberOfBucketsInPartitionOfBigTable)
            ? nxt % numberOfBucketsInPartitionOfBigTable == 0
            : numberOfBucketsInPartitionOfBigTable % nxt == 0;
        if (!ok) {
          return false;
        }
      }
    }
    return true;
  }

  protected boolean canConvertMapJoinToBucketMapJoin(
      MapJoinOperator mapJoinOp,
      BucketJoinProcCtx context) throws SemanticException {

    if (!this.pGraphContext.getMapJoinOps().contains(mapJoinOp)) {
      return false;
    }

    List<String> joinAliases = new ArrayList<String>();
    String[] srcs = mapJoinOp.getConf().getBaseSrc();
    String[] left = mapJoinOp.getConf().getLeftAliases();
    List<String> mapAlias = mapJoinOp.getConf().getMapAliases();
    String baseBigAlias = null;

    for (String s : left) {
      if (s != null) {
        String subQueryAlias = QB.getAppendedAliasFromId(mapJoinOp.getConf().getId(), s);
        if (!joinAliases.contains(subQueryAlias)) {
          joinAliases.add(subQueryAlias);
          if (!mapAlias.contains(s)) {
            baseBigAlias = subQueryAlias;
          }
        }
      }
    }

    for (String s : srcs) {
      if (s != null) {
        String subQueryAlias = QB.getAppendedAliasFromId(mapJoinOp.getConf().getId(), s);
        if (!joinAliases.contains(subQueryAlias)) {
          joinAliases.add(subQueryAlias);
          if (!mapAlias.contains(s)) {
            baseBigAlias = subQueryAlias;
          }
        }
      }
    }

    Map<Byte, List<ExprNodeDesc>> keysMap = mapJoinOp.getConf().getKeys();

    return checkConvertBucketMapJoin(
        context,
        mapJoinOp.getConf().getAliasToOpInfo(),
        keysMap,
        baseBigAlias,
        joinAliases);
  }

  /*
   * Can this mapjoin be converted to a bucketed mapjoin?
   * The following checks are performed:
   * a. The join columns contain all the bucket columns.
   * b. The join keys are not transformed in the sub-query.
   * c. All partitions contain the expected number of files (number of buckets).
   * d. The number of buckets in the big table can be divided by (or divides) the number of
   *    buckets in the small tables.
   */
  protected boolean checkConvertBucketMapJoin(
      BucketJoinProcCtx context,
      Map<String, Operator<? extends OperatorDesc>> aliasToOpInfo,
      Map<Byte, List<ExprNodeDesc>> keysMap,
      String baseBigAlias,
      List<String> joinAliases) throws SemanticException {

    LinkedHashMap<String, List<Integer>> tblAliasToNumberOfBucketsInEachPartition =
        new LinkedHashMap<String, List<Integer>>();
    LinkedHashMap<String, List<List<String>>> tblAliasToBucketedFilePathsInEachPartition =
        new LinkedHashMap<String, List<List<String>>>();

    HashMap<String, TableScanOperator> topOps = pGraphContext.getTopOps();
    HashMap<String, String> aliasToNewAliasMap = new HashMap<String, String>();

    // (partition to bucket file names) and (partition to bucket number) for
    // the big table
    LinkedHashMap<Partition, List<String>> bigTblPartsToBucketFileNames =
        new LinkedHashMap<Partition, List<String>>();
    LinkedHashMap<Partition, Integer> bigTblPartsToBucketNumber =
        new LinkedHashMap<Partition, Integer>();

    Integer[] joinKeyOrder = null; // accessing order of join cols to bucket cols, should be same
    boolean bigTablePartitioned = true;
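    // For each alias in the join, walk back to its root TableScan, verify that the underlying
    // table (or each pruned partition) is bucketed on the join keys, and record bucket counts
    // and bucket file paths: the big table's entries are keyed by Partition, the small tables'
    // by alias.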
    for (int index = 0; index < joinAliases.size(); index++) {
      String alias = joinAliases.get(index);
      Operator<? extends OperatorDesc> topOp = aliasToOpInfo.get(alias);
      // The alias may not be present in case of a sub-query
      if (topOp == null) {
        return false;
      }
      List<String> keys = toColumns(keysMap.get((byte) index));
      if (keys == null || keys.isEmpty()) {
        return false;
      }
      int oldKeySize = keys.size();
      TableScanOperator tso = TableAccessAnalyzer.genRootTableScan(topOp, keys);
      if (tso == null) {
        // We cannot get to the root TableScan operator, likely because there is a join or
        // group-by between topOp and the root TableScan operator. We don't handle that case,
        // and simply return.
        return false;
      }

      // For nested sub-queries, the alias mapping is not maintained in QB currently.
      if (topOps.containsValue(tso)) {
        for (Map.Entry<String, TableScanOperator> topOpEntry : topOps.entrySet()) {
          if (topOpEntry.getValue() == tso) {
            String newAlias = topOpEntry.getKey();
            if (!newAlias.equals(alias)) {
              joinAliases.set(index, newAlias);
              if (baseBigAlias.equals(alias)) {
                baseBigAlias = newAlias;
              }
              aliasToNewAliasMap.put(alias, newAlias);
              alias = newAlias;
            }
            break;
          }
        }
      } else {
        // Ideally, this should never happen, and this should be an assert.
        return false;
      }

      // The join keys cannot be transformed in the sub-query currently.
      // TableAccessAnalyzer.genRootTableScan will only return the base table scan
      // if the join keys are constants or a column. Even a simple cast of the join keys
      // will result in a null table scan operator. In case of constant join keys, they would
      // be removed, and the size before and after genRootTableScan will be different.
      if (keys.size() != oldKeySize) {
        return false;
      }

      if (joinKeyOrder == null) {
        joinKeyOrder = new Integer[keys.size()];
      }

      Table tbl = tso.getConf().getTableMetadata();
      if (tbl.isPartitioned()) {
        PrunedPartitionList prunedParts = pGraphContext.getPrunedPartitions(alias, tso);
        List<Partition> partitions = prunedParts.getNotDeniedPartns();
        // construct a mapping of (Partition -> bucket file names) and (Partition -> bucket number)
        if (partitions.isEmpty()) {
          if (!alias.equals(baseBigAlias)) {
            tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.<Integer> asList());
            tblAliasToBucketedFilePathsInEachPartition.put(alias, new ArrayList<List<String>>());
          }
        } else {
          List<Integer> buckets = new ArrayList<Integer>();
          List<List<String>> files = new ArrayList<List<String>>();
          for (Partition p : partitions) {
            if (!checkBucketColumns(p.getBucketCols(), keys, joinKeyOrder)) {
              return false;
            }
            List<String> fileNames =
                getBucketFilePathsOfPartition(p.getDataLocation(), pGraphContext);
            // The number of files for the partition should be the same as the number of buckets.
            int bucketCount = p.getBucketCount();

            if (fileNames.size() != 0 && fileNames.size() != bucketCount) {
              String msg = "The number of buckets for table " +
                  tbl.getTableName() + " partition " + p.getName() + " is " +
                  p.getBucketCount() + ", whereas the number of files is " + fileNames.size();
              throw new SemanticException(
                  ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
            }

            if (alias.equals(baseBigAlias)) {
              bigTblPartsToBucketFileNames.put(p, fileNames);
              bigTblPartsToBucketNumber.put(p, bucketCount);
            } else {
              files.add(fileNames);
              buckets.add(bucketCount);
            }
          }
          if (!alias.equals(baseBigAlias)) {
            tblAliasToNumberOfBucketsInEachPartition.put(alias, buckets);
            tblAliasToBucketedFilePathsInEachPartition.put(alias, files);
          }
        }
      } else {
        if (!checkBucketColumns(tbl.getBucketCols(), keys, joinKeyOrder)) {
          return false;
        }
        List<String> fileNames =
            getBucketFilePathsOfPartition(tbl.getDataLocation(), pGraphContext);
        Integer num = Integer.valueOf(tbl.getNumBuckets());

        // The number of files for the table should be the same as the number of buckets.
        if (fileNames.size() != 0 && fileNames.size() != num) {
          String msg = "The number of buckets for table " +
              tbl.getTableName() + " is " + tbl.getNumBuckets() +
              ", whereas the number of files is " + fileNames.size();
          throw new SemanticException(
              ErrorMsg.BUCKETED_TABLE_METADATA_INCORRECT.getMsg(msg));
        }

        if (alias.equals(baseBigAlias)) {
          bigTblPartsToBucketFileNames.put(null, fileNames);
          bigTblPartsToBucketNumber.put(null, tbl.getNumBuckets());
          bigTablePartitioned = false;
        } else {
          tblAliasToNumberOfBucketsInEachPartition.put(alias, Arrays.asList(num));
          tblAliasToBucketedFilePathsInEachPartition.put(alias, Arrays.asList(fileNames));
        }
      }
    }
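    // Note that when the big table is unpartitioned, its single entry above is keyed by a
    // null Partition.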
    // All tables or partitions are bucketed and their bucket counts have been collected above;
    // now check that the number of buckets in the big table can be divided by (or divides) the
    // number of buckets in each small table.
    for (Integer numBucketsInPartitionOfBigTable : bigTblPartsToBucketNumber.values()) {
      if (!checkNumberOfBucketsAgainstBigTable(
          tblAliasToNumberOfBucketsInEachPartition, numBucketsInPartitionOfBigTable)) {
        return false;
      }
    }

    context.setTblAliasToNumberOfBucketsInEachPartition(tblAliasToNumberOfBucketsInEachPartition);
    context.setTblAliasToBucketedFilePathsInEachPartition(
        tblAliasToBucketedFilePathsInEachPartition);
    context.setBigTblPartsToBucketFileNames(bigTblPartsToBucketFileNames);
    context.setBigTblPartsToBucketNumber(bigTblPartsToBucketNumber);
    context.setJoinAliases(joinAliases);
    context.setBaseBigAlias(baseBigAlias);
    context.setBigTablePartitioned(bigTablePartitioned);
    if (!aliasToNewAliasMap.isEmpty()) {
      context.setAliasToNewAliasMap(aliasToNewAliasMap);
    }
    return true;
  }

  /*
   * Convert the mapjoin to a bucketed mapjoin.
   * The operator tree is not changed, but the mapjoin descriptor in the big table is
   * enhanced to keep the big table bucket -> small table buckets mapping.
   */
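  // The mapping built here is keyed by small table alias and maps each big table bucket file
  // to the list of small table bucket files that must be loaded alongside it, e.g.
  // {smallAlias -> {big_bucket_file_0 -> [small_bucket_file_0], ...}} (illustrative names).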
  protected void convertMapJoinToBucketMapJoin(
      MapJoinOperator mapJoinOp,
      BucketJoinProcCtx context) throws SemanticException {
    MapJoinDesc desc = mapJoinOp.getConf();

    Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
        new LinkedHashMap<String, Map<String, List<String>>>();

    Map<String, List<Integer>> tblAliasToNumberOfBucketsInEachPartition =
        context.getTblAliasToNumberOfBucketsInEachPartition();
    Map<String, List<List<String>>> tblAliasToBucketedFilePathsInEachPartition =
        context.getTblAliasToBucketedFilePathsInEachPartition();
    Map<Partition, List<String>> bigTblPartsToBucketFileNames =
        context.getBigTblPartsToBucketFileNames();
    Map<Partition, Integer> bigTblPartsToBucketNumber =
        context.getBigTblPartsToBucketNumber();
    List<String> joinAliases = context.getJoinAliases();
    String baseBigAlias = context.getBaseBigAlias();

    // sort bucket names for the big table
    for (List<String> partBucketNames : bigTblPartsToBucketFileNames.values()) {
      Collections.sort(partBucketNames);
    }

    // go through all small tables and get the mapping from bucket file name
    // in the big table to bucket file names in the small tables.
    for (int j = 0; j < joinAliases.size(); j++) {
      String alias = joinAliases.get(j);
      if (alias.equals(baseBigAlias)) {
        continue;
      }
      for (List<String> names : tblAliasToBucketedFilePathsInEachPartition.get(alias)) {
        Collections.sort(names);
      }
      List<Integer> smallTblBucketNums = tblAliasToNumberOfBucketsInEachPartition.get(alias);
      List<List<String>> smallTblFilesList = tblAliasToBucketedFilePathsInEachPartition.get(alias);

      Map<String, List<String>> mappingBigTableBucketFileNameToSmallTableBucketFileNames =
          new LinkedHashMap<String, List<String>>();
      aliasBucketFileNameMapping.put(alias,
          mappingBigTableBucketFileNameToSmallTableBucketFileNames);

      // For each bucket file in the big table, get the corresponding bucket file names in the
      // small table. If there is more than one partition in the big table, do the mapping for
      // each partition.
      Iterator<Entry<Partition, List<String>>> bigTblPartToBucketNames =
          bigTblPartsToBucketFileNames.entrySet().iterator();
      Iterator<Entry<Partition, Integer>> bigTblPartToBucketNum =
          bigTblPartsToBucketNumber.entrySet().iterator();
      while (bigTblPartToBucketNames.hasNext()) {
        assert bigTblPartToBucketNum.hasNext();
        int bigTblBucketNum = bigTblPartToBucketNum.next().getValue();
        List<String> bigTblBucketNameList = bigTblPartToBucketNames.next().getValue();
        fillMappingBigTableBucketFileNameToSmallTableBucketFileNames(smallTblBucketNums,
            smallTblFilesList,
            mappingBigTableBucketFileNameToSmallTableBucketFileNames,
            bigTblBucketNum, bigTblBucketNameList,
            desc.getBigTableBucketNumMapping());
      }
    }
    desc.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
    desc.setBigTableAlias(baseBigAlias);
    boolean bigTablePartitioned = context.isBigTablePartitioned();
    if (bigTablePartitioned) {
      desc.setBigTablePartSpecToFileMapping(convert(bigTblPartsToBucketFileNames));
    }

    // Rename any aliases that were remapped during checkConvertBucketMapJoin in the
    // position-to-alias map of the mapjoin operator.
    Map<Integer, Set<String>> posToAliasMap = mapJoinOp.getPosToAliasMap();
    Map<String, String> aliasToNewAliasMap = context.getAliasToNewAliasMap();
    if (aliasToNewAliasMap != null && posToAliasMap != null) {
      for (Map.Entry<String, String> entry : aliasToNewAliasMap.entrySet()) {
        for (Set<String> aliases : posToAliasMap.values()) {
          if (aliases.remove(entry.getKey())) {
            aliases.add(entry.getValue());
          }
        }
      }
    }

    // successfully converted to a bucket map join
    desc.setBucketMapJoin(true);
  }

  // convert a partition to its partition spec string
  private Map<String, List<String>> convert(Map<Partition, List<String>> mapping) {
    Map<String, List<String>> converted = new HashMap<String, List<String>>();
    for (Map.Entry<Partition, List<String>> entry : mapping.entrySet()) {
      converted.put(entry.getKey().getName(), entry.getValue());
    }
    return converted;
  }

  public static List<String> toColumns(List<ExprNodeDesc> keys) {
    List<String> columns = new ArrayList<String>();
    for (ExprNodeDesc key : keys) {
      if (key instanceof ExprNodeColumnDesc) {
        columns.add(((ExprNodeColumnDesc) key).getColumn());
      } else if (key instanceof ExprNodeConstantDesc) {
        ExprNodeConstantDesc constant = (ExprNodeConstantDesc) key;
        String colName = constant.getFoldedFromCol();
        if (colName == null) {
          return null;
        } else {
          columns.add(colName);
        }
      } else {
        return null;
      }
    }
    return columns;
  }

  // called for each partition of big table and populates mapping for each file in the partition
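  // The mapping scheme: when the big table has at least as many buckets as the small table,
  // big bucket i pairs with small bucket (i % smallBucketCount), e.g. 4 big buckets and 2 small
  // buckets give 0->0, 1->1, 2->0, 3->1. When the small table has more buckets, big bucket i
  // pairs with small buckets i, i+jump, i+2*jump, ... where jump = smallBucketCount /
  // bigBucketCount, e.g. 2 big buckets and 4 small buckets give 0->{0, 2} and 1->{1, 3}.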
  private void fillMappingBigTableBucketFileNameToSmallTableBucketFileNames(
      List<Integer> smallTblBucketNums,
      List<List<String>> smallTblFilesList,
      Map<String, List<String>> bigTableBucketFileNameToSmallTableBucketFileNames,
      int bigTblBucketNum, List<String> bigTblBucketNameList,
      Map<String, Integer> bucketFileNameMapping) {

    for (int bindex = 0; bindex < bigTblBucketNameList.size(); bindex++) {
      ArrayList<String> resultFileNames = new ArrayList<String>();
      for (int sindex = 0; sindex < smallTblBucketNums.size(); sindex++) {
        int smallTblBucketNum = smallTblBucketNums.get(sindex);
        List<String> smallTblFileNames = smallTblFilesList.get(sindex);
        if (smallTblFileNames.size() > 0) {
          if (bigTblBucketNum >= smallTblBucketNum) {
            // If the big table has more buckets than the current small table,
            // use "MOD" to get the small table bucket names. For example, if the big
            // table has 4 buckets and the small table has 2 buckets, then the
            // mapping should be 0->0, 1->1, 2->0, 3->1.
            int toAddSmallIndex = bindex % smallTblBucketNum;
            resultFileNames.add(smallTblFileNames.get(toAddSmallIndex));
          } else {
            // Otherwise the small table has more buckets; pick every jump-th small table
            // bucket starting from the big table bucket index.
            int jump = smallTblBucketNum / bigTblBucketNum;
            for (int i = bindex; i < smallTblFileNames.size(); i = i + jump) {
              resultFileNames.add(smallTblFileNames.get(i));
            }
          }
        }
      }
      String inputBigTBLBucket = bigTblBucketNameList.get(bindex);
      bigTableBucketFileNameToSmallTableBucketFileNames.put(inputBigTBLBucket, resultFileNames);
      bucketFileNameMapping.put(inputBigTBLBucket, bindex);
    }
  }
}