/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.parse;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.Stack;

import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.DriverContext;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.rcfile.stats.PartialScanWork;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.StatsNoJobWork;
import org.apache.hadoop.hive.ql.plan.StatsWork;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.mapred.InputFormat;

/**
 * ProcessAnalyzeTable sets up work for the several variants of analyze table
 * (normal, no scan, partial scan.) The plan at this point will be a single
 * table scan operator.
 */
public class ProcessAnalyzeTable implements NodeProcessor {
  private static final Logger LOG =
      LoggerFactory.getLogger(ProcessAnalyzeTable.class.getName());

  // shared plan utils for tez
  private GenTezUtils utils = null;

  /**
   * Injecting the utils in the constructor facilitates testing
   */
  public ProcessAnalyzeTable(GenTezUtils utils) {
    this.utils = utils;
  }

  @SuppressWarnings("unchecked")
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext,
      Object... nodeOutputs) throws SemanticException {
    GenTezProcContext context = (GenTezProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
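
    // For a plain ANALYZE command, ORC and Parquet tables are handled below with a
    // StatsNoJobWork (no scan job is launched); other formats get a table scan
    // followed by a StatsTask.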
    Class<? extends InputFormat> inputFormat =
        tableScan.getConf().getTableMetadata().getInputFormatClass();

    if (parseContext.getQueryProperties().isAnalyzeCommand()) {
      assert tableScan.getChildOperators() == null
          || tableScan.getChildOperators().size() == 0;

      String alias = null;
      for (String a : parseContext.getTopOps().keySet()) {
        if (tableScan == parseContext.getTopOps().get(a)) {
          alias = a;
        }
      }
      assert alias != null;

      TezWork tezWork = context.currentTask.getWork();
      if (OrcInputFormat.class.isAssignableFrom(inputFormat)
          || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
        // For ORC & Parquet, all the following statements are the same
        // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
        // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
        // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
        // There will not be any Tez job above this task
        StatsNoJobWork snjWork =
            new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
        snjWork.setStatsReliable(
            parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
        // If partition is specified, get pruned partition list
        Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
        if (confirmedParts.size() > 0) {
          Table source = tableScan.getConf().getTableMetadata();
          List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
          PrunedPartitionList partList =
              new PrunedPartitionList(source, confirmedParts, partCols, false);
          snjWork.setPrunedPartitionList(partList);
        }
        Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
        snjTask.setParentTasks(null);
        context.rootTasks.remove(context.currentTask);
        context.rootTasks.add(snjTask);
        return true;
      } else {
        // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
        // The plan consists of a simple TezTask followed by a StatsTask.
        // The Tez task is just a simple TableScanOperator
        StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
        statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
        statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
        statsWork.setSourceTask(context.currentTask);
        statsWork.setStatsReliable(
            parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
        Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
        context.currentTask.addDependentTask(statsTask);
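
        // Both special variants below remove the full-scan Tez task from the root
        // tasks: noscan promotes the StatsTask itself to a root task, while
        // partialscan puts a PartialScanTask in front of it.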

        // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
        // The plan consists of a StatsTask only.
        if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
          statsTask.setParentTasks(null);
          statsWork.setNoScanAnalyzeCommand(true);
          context.rootTasks.remove(context.currentTask);
          context.rootTasks.add(statsTask);
        }

        // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
        if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
          handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
        }

        // NOTE: here we should use the new partition predicate pushdown API to
        // get the pruned partition list and pass it to setTaskPlan as the last
        // parameter
        Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
        PrunedPartitionList partitions = null;
        if (confirmedPartns.size() > 0) {
          Table source = tableScan.getConf().getTableMetadata();
          List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
          partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
        }

        MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
        w.setGatheringStats(true);

        return true;
      }
    } else if (parseContext.getAnalyzeRewrite() != null) {
      // we need to collect table stats while collecting column stats.
      try {
        context.currentTask.addDependentTask(genTableStats(context, tableScan));
      } catch (HiveException e) {
        throw new SemanticException(e);
      }
    }

    return null;
  }

  /**
   * Generates the task that gathers basic table/partition statistics. Used on the
   * analyze-rewrite path above so table stats are collected alongside column stats.
   */
  private Task<?> genTableStats(GenTezProcContext context, TableScanOperator tableScan)
      throws HiveException {
    Class<? extends InputFormat> inputFormat =
        tableScan.getConf().getTableMetadata().getInputFormatClass();
    ParseContext parseContext = context.parseContext;
    Table table = tableScan.getConf().getTableMetadata();
    List<Partition> partitions = new ArrayList<>();
    if (table.isPartitioned()) {
      partitions.addAll(parseContext.getPrunedPartitions(tableScan).getPartitions());
      for (Partition partn : partitions) {
        LOG.debug("XXX: adding part: " + partn);
        context.outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK));
      }
    }
    TableSpec tableSpec = new TableSpec(table, partitions);
    tableScan.getConf().getTableMetadata().setTableSpec(tableSpec);

    if (inputFormat.equals(OrcInputFormat.class)) {
      // For ORC, there is no Tez Job for table stats.
      StatsNoJobWork snjWork =
          new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
      snjWork.setStatsReliable(
          parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      // If partition is specified, get pruned partition list
      if (partitions.size() > 0) {
        snjWork.setPrunedPartitionList(parseContext.getPrunedPartitions(tableScan));
      }
      return TaskFactory.get(snjWork, parseContext.getConf());
    } else {
      StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
      statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
      statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
      statsWork.setSourceTask(context.currentTask);
      statsWork.setStatsReliable(
          parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
      return TaskFactory.get(statsWork, parseContext.getConf());
    }
  }

  /**
   * Handles the partial scan command.
   *
   * It is composed of PartialScanTask followed by StatsTask.
   */
  private void handlePartialScanCommand(TableScanOperator tableScan, ParseContext parseContext,
      StatsWork statsWork, GenTezProcContext context, Task<StatsWork> statsTask)
      throws SemanticException {
    String aggregationKey = tableScan.getConf().getStatsAggPrefix();
    StringBuilder aggregationKeyBuffer = new StringBuilder(aggregationKey);
    List<Path> inputPaths =
        GenMapRedUtils.getInputPathsForPartialScan(tableScan, aggregationKeyBuffer);
    aggregationKey = aggregationKeyBuffer.toString();

    // scan work
    PartialScanWork scanWork = new PartialScanWork(inputPaths);
    scanWork.setMapperCannotSpanPartns(true);
    scanWork.setAggKey(aggregationKey);
    scanWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir(), parseContext.getConf());

    // stats work
    statsWork.setPartialScanAnalyzeCommand(true);

    // partial scan task
    DriverContext driverCxt = new DriverContext();
    Task<PartialScanWork> partialScanTask = TaskFactory.get(scanWork, parseContext.getConf());
    partialScanTask.initialize(parseContext.getQueryState(), null, driverCxt,
        tableScan.getCompilationOpContext());
    partialScanTask.setWork(scanWork);
    statsWork.setSourceTask(partialScanTask);

    // task dependency
    context.rootTasks.remove(context.currentTask);
    context.rootTasks.add(partialScanTask);
    partialScanTask.addDependentTask(statsTask);
  }
}