HashTableLoader.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec.spark;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.HashTableSinkOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.MapredContext;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TemporaryHashSinkOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.exec.mr.MapredLocalTask;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SparkBucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.VectorMapJoinDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.mapred.JobConf;

/**
 * HashTableLoader for Spark to load the hashtable for MapJoins.
 */
public class HashTableLoader implements org.apache.hadoop.hive.ql.exec.HashTableLoader {

  private static final Logger LOG = LoggerFactory.getLogger(HashTableLoader.class.getName());

  private ExecMapperContext context;
  private Configuration hconf;

  private MapJoinOperator joinOp;
  private MapJoinDesc desc;

  private boolean useFastContainer = false;

  @Override
  public void init(ExecMapperContext context, MapredContext mrContext, Configuration hconf,
      MapJoinOperator joinOp) {
    this.context = context;
    this.hconf = hconf;
    this.joinOp = joinOp;
    this.desc = joinOp.getConf();
    if (desc.getVectorMode() && HiveConf.getBoolVar(
        hconf, HiveConf.ConfVars.HIVE_VECTORIZATION_MAPJOIN_NATIVE_FAST_HASHTABLE_ENABLED)) {
      VectorMapJoinDesc vectorDesc = (VectorMapJoinDesc) desc.getVectorDesc();
      useFastContainer = vectorDesc != null && vectorDesc.hashTableImplementationType() ==
          VectorMapJoinDesc.HashTableImplementationType.FAST;
    }
  }

  @Override
  public void load(MapJoinTableContainer[] mapJoinTables,
      MapJoinTableContainerSerDe[] mapJoinTableSerdes)
      throws HiveException {

    // Note: it's possible that a MJ operator is in a ReduceWork, in which case the
    // currentInputPath will be null. But, since currentInputPath is only interesting
    // for bucket join case, and for bucket join the MJ operator will always be in
    // a MapWork, this should be OK.
    String currentInputPath =
        context.getCurrentInputPath() == null ? null : context.getCurrentInputPath().toString();

    LOG.info("******* Load from HashTable for input file: " + currentInputPath);
    MapredLocalWork localWork = context.getLocalWork();
    try {
      if (localWork.getDirectFetchOp() != null) {
        loadDirectly(mapJoinTables, currentInputPath);
      }
      // All HashTables share the same base dir,
      // which is passed in as the tmp path
      Path baseDir = localWork.getTmpPath();
      if (baseDir == null) {
        return;
      }
      FileSystem fs = FileSystem.get(baseDir.toUri(), hconf);
      BucketMapJoinContext mapJoinCtx = localWork.getBucketMapjoinContext();
      boolean firstContainer = true;
      boolean useOptimizedContainer = !useFastContainer && HiveConf.getBoolVar(
          hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE);
      for (int pos = 0; pos < mapJoinTables.length; pos++) {
        if (pos == desc.getPosBigTable() || mapJoinTables[pos] != null) {
          continue;
        }
        if (useOptimizedContainer) {
          MapJoinObjectSerDeContext keyCtx = mapJoinTableSerdes[pos].getKeyContext();
          ObjectInspector keyOI = keyCtx.getSerDe().getObjectInspector();
          if (!MapJoinBytesTableContainer.isSupportedKey(keyOI)) {
            if (firstContainer) {
              LOG.warn("Not using optimized table container." +
                  "Only a subset of mapjoin keys is supported.");
              useOptimizedContainer = false;
              HiveConf.setBoolVar(hconf, HiveConf.ConfVars.HIVEMAPJOINUSEOPTIMIZEDTABLE, false);
            } else {
              throw new HiveException("Only a subset of mapjoin keys is supported.");
            }
          }
        }
        firstContainer = false;
        String bigInputPath = currentInputPath;
        if (currentInputPath != null && mapJoinCtx != null) {
          if (!desc.isBucketMapJoin()) {
            bigInputPath = null;
          } else {
            Set<String> aliases =
              ((SparkBucketMapJoinContext) mapJoinCtx).getPosToAliasMap().get(pos);
            String alias = aliases.iterator().next();
            // Any one small table input path
            String smallInputPath =
              mapJoinCtx.getAliasBucketFileNameMapping().get(alias).get(bigInputPath).get(0);
            bigInputPath = mapJoinCtx.getMappingBigFile(alias, smallInputPath);
          }
        }
        String fileName = localWork.getBucketFileName(bigInputPath);
        Path path = Utilities.generatePath(baseDir, desc.getDumpFilePrefix(), (byte) pos, fileName);
        mapJoinTables[pos] = load(fs, path, mapJoinTableSerdes[pos]);
      }
    } catch (Exception e) {
      throw new HiveException(e);
    }
  }

  private MapJoinTableContainer load(FileSystem fs, Path path,
      MapJoinTableContainerSerDe mapJoinTableSerde) throws HiveException {
    LOG.info("\tLoad back all hashtable files from tmp folder uri:" + path);
    if (!SparkUtilities.isDedicatedCluster(hconf)) {
      return useFastContainer ? mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) :
          mapJoinTableSerde.load(fs, path, hconf);
    }
    MapJoinTableContainer mapJoinTable = SmallTableCache.get(path);
    if (mapJoinTable == null) {
      synchronized (path.toString().intern()) {
        mapJoinTable = SmallTableCache.get(path);
        if (mapJoinTable == null) {
          mapJoinTable = useFastContainer ?
              mapJoinTableSerde.loadFastContainer(desc, fs, path, hconf) :
              mapJoinTableSerde.load(fs, path, hconf);
          SmallTableCache.cache(path, mapJoinTable);
        }
      }
    }
    return mapJoinTable;
  }

  private void loadDirectly(MapJoinTableContainer[] mapJoinTables, String inputFileName)
      throws Exception {
    MapredLocalWork localWork = context.getLocalWork();
    List<Operator<?>> directWorks = localWork.getDirectFetchOp().get(joinOp);
    if (directWorks == null || directWorks.isEmpty()) {
      return;
    }
    JobConf job = new JobConf(hconf);
    MapredLocalTask localTask = new MapredLocalTask(localWork, job, false);

    HashTableSinkOperator sink = new TemporaryHashSinkOperator(new CompilationOpContext(), desc);
    sink.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(directWorks));

    for (Operator<?> operator : directWorks) {
      if (operator != null) {
        operator.setChildOperators(Arrays.<Operator<? extends OperatorDesc>>asList(sink));
      }
    }
    localTask.setExecContext(context);
    localTask.startForward(inputFileName);

    MapJoinTableContainer[] tables = sink.getMapJoinTables();
    for (int i = 0; i < sink.getNumParent(); i++) {
      if (sink.getParentOperators().get(i) != null) {
        mapJoinTables[i] = tables[i];
      }
    }

    Arrays.fill(tables, null);
  }
}