/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;

import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.ql.stats.StatsUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.CommonJoinOperator;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.ListSinkOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.ScriptOperator;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.io.ContentSummaryInputFormat;
import org.apache.hadoop.hive.ql.io.HiveInputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.InputEstimator;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.FetchWork;
import org.apache.hadoop.hive.ql.plan.ListSinkDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.SelectDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToChar;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDate;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDecimal;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToVarchar;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;

/**
 * Tries to convert a simple fetch query to a single fetch task, which fetches rows directly
 * from the location of the table/partition.
 */
public class SimpleFetchOptimizer extends Transform {

  private final Logger LOG = LoggerFactory.getLogger(SimpleFetchOptimizer.class.getName());

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    if (pctx.getQueryProperties().isQuery() && !pctx.getQueryProperties().isAnalyzeCommand()
        && topOps.size() == 1) {
      // no join, no groupby, no distinct, no lateral view, no subq,
      // no CTAS or insert, not analyze command, and single sourced.
      String alias = (String) pctx.getTopOps().keySet().toArray()[0];
      TableScanOperator topOp = pctx.getTopOps().values().iterator().next();
      try {
        FetchTask fetchTask = optimize(pctx, alias, topOp);
        if (fetchTask != null) {
          pctx.setFetchTask(fetchTask);
        }
      } catch (Exception e) {
        // Has to use full name to make sure it does not conflict with
        // org.apache.commons.lang.StringUtils
        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
        if (e instanceof SemanticException) {
          throw (SemanticException) e;
        }
        throw new SemanticException(e.getMessage(), e);
      }
    }
    return pctx;
  }
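
  // Illustrative example (the table name "src" is assumed for illustration only):
  // HiveConf.ConfVars.HIVEFETCHTASKCONVERSION corresponds to hive.fetch.task.conversion, which is
  // expected to be "none", "minimal" or "more"; only "more" is treated as aggressive below.
  // A query such as "SELECT * FROM src LIMIT 10" on an unpartitioned table can be converted in
  // minimal mode, whereas "SELECT concat(key, value) FROM src" is only considered in aggressive
  // mode (where isConvertible() is used instead of checkOperators()), because concat() is not
  // among the column/constant/cast expressions accepted by checkExpressions().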

  // returns a non-null FetchTask instance on success
  @SuppressWarnings("unchecked")
  private FetchTask optimize(ParseContext pctx, String alias, TableScanOperator source)
      throws Exception {
    String mode = HiveConf.getVar(
        pctx.getConf(), HiveConf.ConfVars.HIVEFETCHTASKCONVERSION);

    boolean aggressive = "more".equals(mode);
    final int limit = pctx.getQueryProperties().getOuterQueryLimit();
    // limit = 0 means that we do not need any task.
    if (limit == 0) {
      return null;
    }

    FetchData fetch = checkTree(aggressive, pctx, alias, source);
    if (fetch != null && checkThreshold(fetch, limit, pctx)) {
      FetchWork fetchWork = fetch.convertToWork();
      FetchTask fetchTask = (FetchTask) TaskFactory.get(fetchWork, pctx.getConf());
      fetchWork.setSink(fetch.completed(pctx, fetchWork));
      fetchWork.setSource(source);
      fetchWork.setLimit(limit);
      return fetchTask;
    }
    return null;
  }

  private boolean checkThreshold(FetchData data, int limit, ParseContext pctx) throws Exception {
    if (limit > 0) {
      if (data.hasOnlyPruningFilter()) {
        /* partitioned table + query has only pruning filters */
        return true;
      } else if (!data.isPartitioned() && !data.isFiltered()) {
        /* unpartitioned table + no filters */
        return true;
      }
      /* fall through */
    }
    long threshold = HiveConf.getLongVar(pctx.getConf(),
        HiveConf.ConfVars.HIVEFETCHTASKCONVERSIONTHRESHOLD);
    if (threshold < 0) {
      return true;
    }
    Operator child = data.scanOp.getChildOperators().get(0);
    if (child instanceof SelectOperator) {
      // select *, constants and casts can be allowed without a threshold check
      if (checkExpressions((SelectOperator) child)) {
        return true;
      }
    }
    return data.isDataLengthWithInThreshold(pctx, threshold);
  }

  // all we can handle is LimitOperator, FilterOperator, SelectOperator and the final FS
  //
  // for non-aggressive mode (minimal)
  // 1. sampling is not allowed
  // 2. for a partitioned table, all filters should target partition columns only
  // 3. SelectOperator should use only simple cast/column access
  private FetchData checkTree(boolean aggressive, ParseContext pctx, String alias,
      TableScanOperator ts) throws HiveException {
    SplitSample splitSample = pctx.getNameToSplitSample().get(alias);
    if (!aggressive && splitSample != null) {
      return null;
    }
    if (!aggressive && ts.getConf().getTableSample() != null) {
      return null;
    }
    Table table = ts.getConf().getTableMetadata();
    if (table == null) {
      return null;
    }
    ReadEntity parent = PlanUtils.getParentViewInfo(alias, pctx.getViewAliasToInput());
    if (!table.isPartitioned()) {
      FetchData fetch = new FetchData(ts, parent, table, splitSample);
      return checkOperators(fetch, aggressive, false);
    }

    boolean bypassFilter = false;
    if (HiveConf.getBoolVar(pctx.getConf(), HiveConf.ConfVars.HIVEOPTPPD)) {
      ExprNodeDesc pruner = pctx.getOpToPartPruner().get(ts);
      if (PartitionPruner.onlyContainsPartnCols(table, pruner)) {
        bypassFilter = !pctx.getPrunedPartitions(alias, ts).hasUnknownPartitions();
      }
    }
    if (!aggressive && !bypassFilter) {
      return null;
    }
    PrunedPartitionList partitions = pctx.getPrunedPartitions(alias, ts);
    FetchData fetch = new FetchData(ts, parent, table, partitions, splitSample, bypassFilter);
    return checkOperators(fetch, aggressive, bypassFilter);
  }
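
  // Illustrative example (table and column names are assumed): for a table partitioned by ds,
  // a predicate such as ds = '2008-04-08' references only partition columns, so with
  // hive.optimize.ppd enabled it is fully evaluated by the partition pruner and bypassFilter
  // becomes true. A predicate on a non-partition column (e.g. key > 10) leaves bypassFilter
  // false, and in minimal mode the conversion is rejected above.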

  private FetchData checkOperators(FetchData fetch, boolean aggressive, boolean bypassFilter) {
    if (aggressive) {
      return isConvertible(fetch) ? fetch : null;
    }
    return checkOperators(fetch, fetch.scanOp, bypassFilter);
  }

  private FetchData checkOperators(FetchData fetch, TableScanOperator ts, boolean bypassFilter) {
    if (ts.getChildOperators().size() != 1) {
      return null;
    }
    Operator<?> op = ts.getChildOperators().get(0);
    for (; ; op = op.getChildOperators().get(0)) {
      if (op instanceof SelectOperator) {
        if (!checkExpressions((SelectOperator) op)) {
          return null;
        }
        continue;
      }

      if (!(op instanceof LimitOperator || (op instanceof FilterOperator && bypassFilter))) {
        break;
      }

      if (op.getChildOperators() == null || op.getChildOperators().size() != 1) {
        return null;
      }

      if (op instanceof FilterOperator) {
        fetch.setFiltered(true);
      }
    }

    if (op instanceof FileSinkOperator) {
      fetch.fileSink = op;
      return fetch;
    }
    return null;
  }

  private boolean checkExpressions(SelectOperator op) {
    SelectDesc desc = op.getConf();
    if (desc.isSelectStar() || desc.isSelStarNoCompute()) {
      return true;
    }
    for (ExprNodeDesc expr : desc.getColList()) {
      if (!checkExpression(expr)) {
        return false;
      }
    }
    return true;
  }

  private boolean checkExpression(ExprNodeDesc expr) {
    if (expr instanceof ExprNodeConstantDesc || expr instanceof ExprNodeColumnDesc) {
      return true;
    }

    if (expr instanceof ExprNodeGenericFuncDesc) {
      GenericUDF udf = ((ExprNodeGenericFuncDesc) expr).getGenericUDF();
      if (udf instanceof GenericUDFToBinary || udf instanceof GenericUDFToChar
          || udf instanceof GenericUDFToDate || udf instanceof GenericUDFToDecimal
          || udf instanceof GenericUDFToUnixTimeStamp || udf instanceof GenericUDFToUtcTimestamp
          || udf instanceof GenericUDFToVarchar) {
        return expr.getChildren().size() == 1 && checkExpression(expr.getChildren().get(0));
      }
    }
    return false;
  }

  private boolean isConvertible(FetchData fetch) {
    return isConvertible(fetch, fetch.scanOp, new HashSet<Operator<?>>());
  }

  private boolean isConvertible(FetchData fetch, Operator<?> operator, Set<Operator<?>> traversed) {
    if (operator instanceof ReduceSinkOperator || operator instanceof CommonJoinOperator
        || operator instanceof ScriptOperator) {
      return false;
    }

    if (operator instanceof FilterOperator) {
      fetch.setFiltered(true);
    }

    if (!traversed.add(operator)) {
      return true;
    }
    if (operator.getNumChild() == 0) {
      if (operator instanceof FileSinkOperator) {
        fetch.fileSink = operator;
        return true;
      }
      return false;
    }
    for (Operator<?> child : operator.getChildOperators()) {
      if (!traversed.containsAll(child.getParentOperators())) {
        continue;
      }
      if (!isConvertible(fetch, child, traversed)) {
        return false;
      }
    }
    return true;
  }

  enum Status { PASS, FAIL, UNAVAILABLE }

  private class FetchData {

    // source table scan
    private final TableScanOperator scanOp;
    private final ReadEntity parent;

    private final Table table;
    private final SplitSample splitSample;
    private final PrunedPartitionList partsList;
    private final Set<ReadEntity> inputs = new LinkedHashSet<ReadEntity>();
    private final boolean onlyPruningFilter;

    // this is always non-null when conversion is completed
    private Operator<?> fileSink;
    private boolean filtered;

    private FetchData(TableScanOperator scanOp, ReadEntity parent, Table table,
        SplitSample splitSample) {
      this.scanOp = scanOp;
      this.parent = parent;
      this.table = table;
      this.partsList = null;
      this.splitSample = splitSample;
      this.onlyPruningFilter = false;
    }

    private FetchData(TableScanOperator scanOp, ReadEntity parent, Table table,
        PrunedPartitionList partsList, SplitSample splitSample, boolean bypassFilter) {
      this.scanOp = scanOp;
      this.parent = parent;
      this.table = table;
      this.partsList = partsList;
      this.splitSample = splitSample;
      this.onlyPruningFilter = bypassFilter;
    }
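
    // Descriptive note: the first constructor is used by checkTree() for unpartitioned tables
    // (partsList stays null and onlyPruningFilter is false); the second is used for partitioned
    // tables, with bypassFilter recording whether every filter was consumed by partition pruning.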

    /* all filters were executed during partition pruning */
    public final boolean hasOnlyPruningFilter() {
      return this.onlyPruningFilter;
    }

    public final boolean isPartitioned() {
      return this.table.isPartitioned();
    }

    /* there are filter operators in the pipeline */
    public final boolean isFiltered() {
      return this.filtered;
    }

    public final void setFiltered(boolean filtered) {
      this.filtered = filtered;
    }

    private FetchWork convertToWork() throws HiveException {
      inputs.clear();
      Utilities.addSchemaEvolutionToTableScanOperator(table, scanOp);
      TableDesc tableDesc = Utilities.getTableDesc(table);
      if (!table.isPartitioned()) {
        inputs.add(new ReadEntity(table, parent, !table.isView() && parent == null));
        FetchWork work = new FetchWork(table.getPath(), tableDesc);
        PlanUtils.configureInputJobPropertiesForStorageHandler(work.getTblDesc());
        work.setSplitSample(splitSample);
        return work;
      }
      List<Path> listP = new ArrayList<Path>();
      List<PartitionDesc> partP = new ArrayList<PartitionDesc>();

      for (Partition partition : partsList.getNotDeniedPartns()) {
        inputs.add(new ReadEntity(partition, parent, parent == null));
        listP.add(partition.getDataLocation());
        partP.add(Utilities.getPartitionDescFromTableDesc(tableDesc, partition, true));
      }
      Table sourceTable = partsList.getSourceTable();
      inputs.add(new ReadEntity(sourceTable, parent, parent == null));
      TableDesc table = Utilities.getTableDesc(sourceTable);
      FetchWork work = new FetchWork(listP, partP, table);
      if (!work.getPartDesc().isEmpty()) {
        PartitionDesc part0 = work.getPartDesc().get(0);
        PlanUtils.configureInputJobPropertiesForStorageHandler(part0.getTableDesc());
        work.setSplitSample(splitSample);
      }
      return work;
    }

    // This optimizer replaces the FS (writing to a temp location and then fetching from it)
    // with a single direct fetch, which means the FS is no longer needed once the conversion
    // is completed. The forwarded rows will be received by a ListSinkOperator, which replaces
    // the FS.
    private ListSinkOperator completed(ParseContext pctx, FetchWork work) {
      for (ReadEntity input : inputs) {
        PlanUtils.addInput(pctx.getSemanticInputs(), input);
      }
      return replaceFSwithLS(fileSink, work.getSerializationNullFormat());
    }
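
    // Descriptive note: the size check below proceeds from the cheapest source to the most
    // expensive one: a split sample's total length if one was specified, then basic stats from
    // the metastore (checkThresholdWithMetastoreStats()), and only then a filesystem scan of the
    // table or partition locations.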

    private boolean isDataLengthWithInThreshold(ParseContext pctx, final long threshold)
        throws Exception {
      if (splitSample != null && splitSample.getTotalLength() != null) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Threshold " + splitSample.getTotalLength() + " exceeded for pseudoMR mode");
        }
        return (threshold - splitSample.getTotalLength()) > 0;
      }

      Status status = checkThresholdWithMetastoreStats(table, partsList, threshold);
      if (status.equals(Status.PASS)) {
        return true;
      } else if (status.equals(Status.FAIL)) {
        return false;
      } else {
        LOG.info("Cannot fetch stats from metastore for table: {}. Falling back to filesystem scan..",
            table.getCompleteName());
        // metastore stats are unavailable, fall back to the old way
        final JobConf jobConf = new JobConf(pctx.getConf());
        Utilities.setColumnNameList(jobConf, scanOp, true);
        Utilities.setColumnTypeList(jobConf, scanOp, true);
        HiveStorageHandler handler = table.getStorageHandler();
        if (handler instanceof InputEstimator) {
          InputEstimator estimator = (InputEstimator) handler;
          TableDesc tableDesc = Utilities.getTableDesc(table);
          PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
          Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
          long len = estimator.estimate(jobConf, scanOp, threshold).getTotalLength();
          if (LOG.isDebugEnabled()) {
            LOG.debug("Threshold " + len + " exceeded for pseudoMR mode");
          }
          return (threshold - len) > 0;
        }

        if (table.isNonNative()) {
          return true; // nothing can be done
        }

        if (!table.isPartitioned()) {
          long len = getPathLength(jobConf, table.getPath(), table.getInputFormatClass(), threshold);
          if (LOG.isDebugEnabled()) {
            LOG.debug("Threshold " + len + " exceeded for pseudoMR mode");
          }
          return (threshold - len) > 0;
        }

        final AtomicLong total = new AtomicLong(0);
        //TODO: use common thread pool later?
        int threadCount = HiveConf.getIntVar(pctx.getConf(),
            HiveConf.ConfVars.HIVE_STATS_GATHER_NUM_THREADS);
        final ExecutorService pool = (threadCount > 0) ?
            Executors.newFixedThreadPool(threadCount,
                new ThreadFactoryBuilder()
                    .setDaemon(true)
                    .setNameFormat("SimpleFetchOptimizer-FileLength-%d").build()) : null;
        try {
          List<Future> futures = Lists.newLinkedList();
          for (final Partition partition : partsList.getNotDeniedPartns()) {
            final Path path = partition.getDataLocation();
            if (pool != null) {
              futures.add(pool.submit(new Callable<Long>() {
                @Override
                public Long call() throws Exception {
                  long len = getPathLength(jobConf, path, partition.getInputFormatClass(), threshold);
                  LOG.trace(path + ", length=" + len);
                  return total.addAndGet(len);
                }
              }));
            } else {
              total.addAndGet(getPathLength(jobConf, path, partition.getInputFormatClass(), threshold));
            }
          }

          if (pool != null) {
            pool.shutdown();
            for (Future<Long> future : futures) {
              long totalLen = future.get();
              if ((threshold - totalLen) <= 0) {
                // early exit, as getting file lengths can be expensive in object stores.
                return false;
              }
            }
          }
          return (threshold - total.get()) >= 0;
        } finally {
          LOG.info("Data set size=" + total.get() + ", threshold=" + threshold);
          if (pool != null) {
            pool.shutdownNow();
          }
        }
      }
    }
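
    // Illustrative example (the threshold value is assumed, not taken from this file): with
    // hive.fetch.task.conversion.threshold set to 1073741824 (1GB), selected partitions whose
    // totalSize adds up to 800MB yield Status.PASS, 1.5GB yields Status.FAIL, and missing or
    // non-positive totalSize stats yield Status.UNAVAILABLE, which makes the caller above fall
    // back to listing files on the filesystem.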

    // This method gets the basic stats from the metastore for the table/partitions. It makes use
    // of the statistics from the AnnotateWithStatistics optimizer when available. If the execution
    // engine is tez or spark, the AnnotateWithStatistics optimization is applied only during
    // physical compilation because of DPP changing the stats. In such cases, we will get the
    // basic stats from the metastore. When statistics are absent in the metastore, we fall back
    // to scanning the filesystem to get file lengths.
    private Status checkThresholdWithMetastoreStats(final Table table,
        final PrunedPartitionList partsList, final long threshold) {
      if (table != null && !table.isPartitioned()) {
        long dataSize = StatsUtils.getTotalSize(table);
        if (dataSize <= 0) {
          LOG.warn("Cannot determine basic stats for table: {} from metastore. Falling back.",
              table.getCompleteName());
          return Status.UNAVAILABLE;
        }
        return (threshold - dataSize) >= 0 ? Status.PASS : Status.FAIL;
      } else if (table != null && table.isPartitioned() && partsList != null) {
        List<Long> dataSizes = StatsUtils.getBasicStatForPartitions(table,
            partsList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
        long totalDataSize = StatsUtils.getSumIgnoreNegatives(dataSizes);
        if (totalDataSize <= 0) {
          LOG.warn("Cannot determine basic stats for partitioned table: {} from metastore. Falling back.",
              table.getCompleteName());
          return Status.UNAVAILABLE;
        }
        return (threshold - totalDataSize) >= 0 ? Status.PASS : Status.FAIL;
      }
      return Status.UNAVAILABLE;
    }

    private long getPathLength(JobConf conf, Path path,
        Class<? extends InputFormat> clazz, long threshold) throws IOException {
      if (ContentSummaryInputFormat.class.isAssignableFrom(clazz)) {
        InputFormat input = HiveInputFormat.getInputFormatFromCache(clazz, conf);
        return ((ContentSummaryInputFormat) input).getContentSummary(path, conf).getLength();
      } else {
        FileSystem fs = path.getFileSystem(conf);
        try {
          long length = 0;
          RemoteIterator<LocatedFileStatus> results = fs.listFiles(path, true);
          // No need to iterate further once the threshold is reached
          // (beneficial especially for object stores)
          while (length <= threshold && results.hasNext()) {
            length += results.next().getLen();
          }
          LOG.trace("length=" + length + ", threshold=" + threshold);
          return length;
        } catch (FileNotFoundException e) {
          return 0;
        }
      }
    }
  }

  public static ListSinkOperator replaceFSwithLS(Operator<?> fileSink, String nullFormat) {
    ListSinkDesc desc = new ListSinkDesc(nullFormat);
    ListSinkOperator sink = (ListSinkOperator) OperatorFactory.get(
        fileSink.getCompilationOpContext(), desc);

    sink.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>());
    Operator<? extends OperatorDesc> parent = fileSink.getParentOperators().get(0);
    sink.getParentOperators().add(parent);
    parent.replaceChild(fileSink, sink);
    fileSink.setParentOperators(null);
    return sink;
  }
}