/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.physical.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.hive.metastore.api.Index;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.mr.MapRedTask;
import org.apache.hadoop.hive.ql.hooks.ReadEntity;
import org.apache.hadoop.hive.ql.index.HiveIndexHandler;
import org.apache.hadoop.hive.ql.index.HiveIndexQueryContext;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.optimizer.IndexUtils;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;

/**
 * IndexWhereProcessor.
 * Processes Operator Nodes to look for WHERE queries with a predicate column
 * on which we have an index.  Creates an index subquery Task for these
 * WHERE queries to use the index automatically.
 */
public class IndexWhereProcessor implements NodeProcessor {

  private static final Logger LOG = LoggerFactory.getLogger(IndexWhereProcessor.class.getName());
  private final Map<TableScanOperator, List<Index>> tsToIndices;

  public IndexWhereProcessor(Map<TableScanOperator, List<Index>> tsToIndices) {
    super();
    this.tsToIndices = tsToIndices;
  }
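  /*
   * Usage sketch (an illustration only, not copied from IndexWhereTaskDispatcher): this
   * processor is driven through Hive's standard rule/dispatcher machinery in
   * org.apache.hadoop.hive.ql.lib, keyed on TableScanOperator nodes. The rule name and the
   * indexWhereCtx/topNodes variables below are hypothetical placeholders.
   *
   *   Map<Rule, NodeProcessor> operatorRules = new LinkedHashMap<Rule, NodeProcessor>();
   *   operatorRules.put(
   *       new RuleRegExp("RULEIND", TableScanOperator.getOperatorName() + "%"),
   *       new IndexWhereProcessor(tsToIndices));
   *   Dispatcher dispatcher = new DefaultRuleDispatcher(null, operatorRules, indexWhereCtx);
   *   GraphWalker walker = new DefaultGraphWalker(dispatcher);
   *   walker.startWalking(topNodes, null);
   */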
  /**
   * Process a node of the operator tree. This matches on the rule in IndexWhereTaskDispatcher.
   */
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {

    TableScanOperator operator = (TableScanOperator) nd;
    List<Node> opChildren = operator.getChildren();
    TableScanDesc operatorDesc = operator.getConf();
    if (operatorDesc == null || !tsToIndices.containsKey(operator)) {
      return null;
    }
    List<Index> indexes = tsToIndices.get(operator);

    ExprNodeDesc predicate = operatorDesc.getFilterExpr();

    IndexWhereProcCtx context = (IndexWhereProcCtx) procCtx;
    ParseContext pctx = context.getParseContext();
    LOG.info("Processing predicate for index optimization");

    if (predicate == null) {
      LOG.info("null predicate pushed down");
      return null;
    }
    LOG.info(predicate.getExprString());

    // check that we have indexes on all partitions in this table scan
    Set<Partition> queryPartitions;
    try {
      queryPartitions = IndexUtils.checkPartitionsCoveredByIndex(operator, pctx, indexes);
      if (queryPartitions == null) { // partitions not covered
        return null;
      }
    } catch (HiveException e) {
      LOG.error("Fatal Error: problem accessing metastore", e);
      throw new SemanticException(e);
    }

    // we can only process MapReduce tasks to check input size
    if (!context.getCurrentTask().isMapRedTask()) {
      return null;
    }
    MapRedTask currentTask = (MapRedTask) context.getCurrentTask();

    // get potential reentrant index queries from each index
    Map<Index, HiveIndexQueryContext> queryContexts = new HashMap<Index, HiveIndexQueryContext>();
    // make sure we have an index on the table being scanned
    TableDesc tblDesc = operator.getTableDesc();

    Map<String, List<Index>> indexesByType = new HashMap<String, List<Index>>();
    for (Index indexOnTable : indexes) {
      if (indexesByType.get(indexOnTable.getIndexHandlerClass()) == null) {
        List<Index> newType = new ArrayList<Index>();
        newType.add(indexOnTable);
        indexesByType.put(indexOnTable.getIndexHandlerClass(), newType);
      } else {
        indexesByType.get(indexOnTable.getIndexHandlerClass()).add(indexOnTable);
      }
    }

    // choose the index type with the most indexes of the same type on the table
    // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
    List<Index> bestIndexes = indexesByType.values().iterator().next();
    for (List<Index> indexTypes : indexesByType.values()) {
      if (bestIndexes.size() < indexTypes.size()) {
        bestIndexes = indexTypes;
      }
    }

    // rewrite index queries for the chosen index type
    HiveIndexQueryContext tmpQueryContext = new HiveIndexQueryContext();
    tmpQueryContext.setQueryPartitions(queryPartitions);
    rewriteForIndexes(predicate, bestIndexes, pctx, currentTask, tmpQueryContext);
    List<Task<?>> indexTasks = tmpQueryContext.getQueryTasks();

    if (indexTasks != null && indexTasks.size() > 0) {
      queryContexts.put(bestIndexes.get(0), tmpQueryContext);
    }

    // choose an index rewrite to use
    if (queryContexts.size() > 0) {
      // TODO HIVE-2130 This would be a good place for some sort of cost based choice?
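      // Note: as populated above, queryContexts holds at most one entry (only the rewrite for
      // the chosen index type is generated), so taking the first key is effectively the only
      // possible choice until a cost-based selection is added.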
      Index chosenIndex = queryContexts.keySet().iterator().next();

      // modify the parse context to use indexing
      // we need to delay this until we choose one index so that we don't attempt to modify pctx multiple times
      HiveIndexQueryContext queryContext = queryContexts.get(chosenIndex);

      // prepare the map reduce job to use indexing
      MapWork work = currentTask.getWork().getMapWork();
      work.setInputformat(queryContext.getIndexInputFormat());
      work.addIndexIntermediateFile(queryContext.getIndexIntermediateFile());
      // modify inputs based on index query
      Set<ReadEntity> inputs = pctx.getSemanticInputs();
      inputs.addAll(queryContext.getAdditionalSemanticInputs());
      List<Task<?>> chosenRewrite = queryContext.getQueryTasks();
      // add dependencies so the index query runs first
      insertIndexQuery(pctx, context, chosenRewrite);
    }

    return null;
  }

  /**
   * Get a list of Tasks to activate use of indexes.
   * Generate the tasks for the index query (where we store results of
   * querying the index in a tmp file) inside the IndexHandler.
   * @param predicate Predicate of query to rewrite
   * @param indexes Indexes to use for the rewrite (all of the same type)
   * @param pctx
   * @param task original task before rewrite
   * @param queryContext stores return values
   */
  private void rewriteForIndexes(ExprNodeDesc predicate, List<Index> indexes,
                                 ParseContext pctx, Task<MapredWork> task,
                                 HiveIndexQueryContext queryContext)
      throws SemanticException {
    HiveIndexHandler indexHandler;
    // All indexes in the list are of the same type, and therefore can use the
    // same handler to generate the index query tasks
    Index index = indexes.get(0);
    try {
      indexHandler = HiveUtils.getIndexHandler(pctx.getConf(), index.getIndexHandlerClass());
    } catch (HiveException e) {
      LOG.error("Exception while loading IndexHandler: " + index.getIndexHandlerClass(), e);
      throw new SemanticException("Failed to load indexHandler: " + index.getIndexHandlerClass(), e);
    }

    // check the size
    try {
      ContentSummary inputSummary =
          Utilities.getInputSummary(pctx.getContext(), task.getWork().getMapWork(), null);
      long inputSize = inputSummary.getLength();
      if (!indexHandler.checkQuerySize(inputSize, pctx.getConf())) {
        queryContext.setQueryTasks(null);
        return;
      }
    } catch (IOException e) {
      throw new SemanticException("Failed to get task size", e);
    }

    // use the IndexHandler to generate the index query
    indexHandler.generateIndexQuery(indexes, predicate, pctx, queryContext);
    // TODO HIVE-2115 use queryContext.residualPredicate to process residual predicate
    return;
  }

  /**
   * Insert the rewrite tasks at the head of the pctx task tree
   * @param pctx
   * @param context
   * @param chosenRewrite
   */
  private void insertIndexQuery(ParseContext pctx, IndexWhereProcCtx context,
      List<Task<?>> chosenRewrite) {
    Task<?> wholeTableScan = context.getCurrentTask();
    LinkedHashSet<Task<?>> rewriteLeaves = new LinkedHashSet<Task<?>>();
    findLeaves(chosenRewrite, rewriteLeaves);

    for (Task<?> leaf : rewriteLeaves) {
      leaf.addDependentTask(wholeTableScan); // add the full scan task as a child of every index query task
    }

    // replace the original with the index sub-query as a root task
    pctx.replaceRootTask(wholeTableScan, chosenRewrite);
  }

  /**
   * Find the leaves of the task tree
   */
  private void findLeaves(List<Task<?>> tasks, Set<Task<?>> leaves) {
    for (Task<?> t : tasks) {
      if (t.getDependentTasks() == null) {
        leaves.add(t);
      } else {
        findLeaves(t.getDependentTasks(), leaves);
      }
    }
  }
}