/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.util.Collection;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.LimitOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartitionPruner;
import org.apache.hadoop.hive.ql.parse.GlobalLimitCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.parse.SplitSample;
import org.apache.hadoop.hive.ql.plan.FilterDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.LimitDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;

/**
 * This optimizer reduces the input size for queries that specify a limit.
 * <p/>
 * E.g. for a query of the form:
 * <p/>
 * select expr from T where <filter> limit 100;
 * <p/>
 * Most probably the whole table T does not need to be scanned:
 * chances are that scanning just the first file of T already yields the 100 rows
 * needed by this query.
 * This optimizer step populates the GlobalLimitCtx, which is used later on to prune the inputs.
 */
public class GlobalLimitOptimizer extends Transform {

  private final Logger LOG = LoggerFactory.getLogger(GlobalLimitOptimizer.class.getName());

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    Context ctx = pctx.getContext();
    Map<String, TableScanOperator> topOps = pctx.getTopOps();
    GlobalLimitCtx globalLimitCtx = pctx.getGlobalLimitCtx();
    Map<String, SplitSample> nameToSplitSample = pctx.getNameToSplitSample();

    // Determine whether the query qualifies for reducing the input size for LIMIT.
    // The query only qualifies when there is exactly one top operator
    // and neither a transform script nor a UDTF nor block sampling is used.
    if (ctx.getTryCount() == 0 && topOps.size() == 1
        && !globalLimitCtx.ifHasTransformOrUDTF()
        && nameToSplitSample.isEmpty()) {

      // Here we recursively check:
      // 1. whether there is exactly one LIMIT in the query
      // 2. whether there is no aggregation, group-by, distinct, sort by,
      //    distribute by, or table sampling in any of the sub-queries.
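      // (Both checks exist because the optimization feeds the query only a
      // prefix of the input files; any operator that must see every row,
      // e.g. an aggregate or a global sort, would be incorrect on such a prefix.)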
      // The query only qualifies if both conditions are satisfied.
      //
      // Example qualified queries:
      //    CREATE TABLE ... AS SELECT col1, col2 FROM tbl LIMIT ..
      //    INSERT OVERWRITE TABLE ... SELECT col1, hash(col2), split(col1)
      //                               FROM ... LIMIT ...
      //    SELECT * FROM (SELECT col1 AS col2 FROM (SELECT * FROM ...) t1 LIMIT ...) t2;
      //
      TableScanOperator ts = topOps.values().iterator().next();
      LimitOperator tempGlobalLimit = checkQbpForGlobalLimit(ts);

      // The query qualifies for the optimization.
      if (tempGlobalLimit != null) {
        LimitDesc tempGlobalLimitDesc = tempGlobalLimit.getConf();
        Table tab = ts.getConf().getTableMetadata();
        Set<FilterOperator> filterOps = OperatorUtils.findOperators(ts, FilterOperator.class);

        if (!tab.isPartitioned()) {
          if (filterOps.size() == 0) {
            Integer tempOffset = tempGlobalLimitDesc.getOffset();
            globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(),
                (tempOffset == null) ? 0 : tempOffset);
          }
        } else {
          // Check if the pruner only contains partition columns.
          if (onlyContainsPartnCols(tab, filterOps)) {
            String alias = (String) topOps.keySet().toArray()[0];
            PrunedPartitionList partsList = pctx.getPrunedPartitions(alias, ts);

            // If there is any unknown partition, create a map-reduce job for
            // the filter to prune correctly.
            if (!partsList.hasUnknownPartitions()) {
              Integer tempOffset = tempGlobalLimitDesc.getOffset();
              globalLimitCtx.enableOpt(tempGlobalLimitDesc.getLimit(),
                  (tempOffset == null) ? 0 : tempOffset);
            }
          }
        }
        if (globalLimitCtx.isEnable()) {
          LOG.info("Query qualifies for the input-size reduction optimization with offset "
              + globalLimitCtx.getGlobalOffset());
          LOG.info("Query qualifies for the input-size reduction optimization with limit "
              + globalLimitCtx.getGlobalLimit());
        }
      }
    }
    return pctx;
  }

  private boolean onlyContainsPartnCols(Table table, Set<FilterOperator> filters) {
    for (FilterOperator filter : filters) {
      if (!PartitionPruner.onlyContainsPartnCols(table, filter.getConf().getPredicate())) {
        return false;
      }
    }
    return true;
  }

  /**
   * Check the limit number in all sub-queries.
   *
   * @return the LIMIT operator if there is one and only one limit across all sub-queries
   *         and no disqualifying operator (ordering, partitioning, aggregation, distinct,
   *         or sampling) is present; null otherwise
   */
  private static LimitOperator checkQbpForGlobalLimit(TableScanOperator ts) {
    Set<Class<? extends Operator<?>>> searchedClasses =
        new ImmutableSet.Builder<Class<? extends Operator<?>>>()
            .add(ReduceSinkOperator.class)
            .add(GroupByOperator.class)
            .add(FilterOperator.class)
            .add(LimitOperator.class)
            .build();
    Multimap<Class<? extends Operator<?>>, Operator<?>> ops =
        OperatorUtils.classifyOperators(ts, searchedClasses);

    // To apply this optimization, in the input query:
    // - There cannot exist any order by/sort by clause,
    //   thus existsOrdering should be false.
    // - There cannot exist any distribute by clause, thus
    //   existsPartitioning should be false.
    // - There cannot exist any cluster by clause, thus
    //   existsOrdering AND existsPartitioning should be false.
    for (Operator<?> op : ops.get(ReduceSinkOperator.class)) {
      ReduceSinkDesc reduceSinkConf = ((ReduceSinkOperator) op).getConf();
      if (reduceSinkConf.isOrdering() || reduceSinkConf.isPartitioning()) {
        return null;
      }
    }
    // - There cannot exist any (distinct) aggregate.
    for (Operator<?> op : ops.get(GroupByOperator.class)) {
      GroupByDesc groupByConf = ((GroupByOperator) op).getConf();
      if (groupByConf.isAggregate() || groupByConf.isDistinct()) {
        return null;
      }
    }
    // - There cannot exist any sampling predicate.
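    //   (A sampling predicate is the filter Hive generates for a TABLESAMPLE
    //   clause; FilterDesc flags such filters via getIsSamplingPred().)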
    for (Operator<?> op : ops.get(FilterOperator.class)) {
      FilterDesc filterConf = ((FilterOperator) op).getConf();
      if (filterConf.getIsSamplingPred()) {
        return null;
      }
    }

    // If there is one and only one limit starting at ts, return that limit;
    // otherwise (no limit at all, or more than one) return null.
    Collection<Operator<?>> limitOps = ops.get(LimitOperator.class);
    if (limitOps.size() == 1) {
      return (LimitOperator) limitOps.iterator().next();
    }
    return null;
  }
}