/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.optimizer.physical; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.TaskGraphWalker; import org.apache.hadoop.hive.ql.parse.SemanticException; /* * Convert tasks involving JOIN into MAPJOIN. * If hive.auto.convert.join is true, the tasks involving join are converted. * Consider the query: * select .... from T1 join T2 on T1.key = T2.key join T3 on T1.key = T3.key * * There is a map-reduce task which performs a 3-way join (T1, T2, T3). * The task would be converted to a conditional task which would have 4 children * a. Mapjoin considering T1 as the big table * b. Mapjoin considering T2 as the big table * c. Mapjoin considering T3 as the big table * d. Map-reduce join (the original task). * * Note that the sizes of all the inputs may not be available at compile time. At runtime, it is * determined which branch we want to pick up from the above. * * However, if hive.auto.convert.join.noconditionaltask is set to true, and * the sum of any n-1 tables is smaller than hive.auto.convert.join.noconditionaltask.size, * then a mapjoin is created instead of the conditional task. For the above, if the size of * T1 + T2 is less than the threshold, then the task is converted to a mapjoin task with T3 as * the big table. * * In this case, further optimization is performed by merging 2 consecutive map-only jobs. * Consider the query: * select ... from T1 join T2 on T1.key1 = T2.key1 join T3 on T1.key2 = T3.key2 * * Initially, the plan would consist of 2 Map-reduce jobs (1 to perform join for T1 and T2) * followed by another map-reduce job (to perform join of the result with T3). After the * optimization, both these tasks would be converted to map-only tasks. These 2 map-only jobs * are then merged into a single map-only job. As a followup (HIVE-3952), it would be possible to * merge a map-only task with a map-reduce task. * Consider the query: * select T1.key2, count(*) from T1 join T2 on T1.key1 = T2.key1 group by T1.key2; * Initially, the plan would consist of 2 Map-reduce jobs (1 to perform join for T1 and T2) * followed by another map-reduce job (to perform groupby of the result). After the * optimization, the join task would be converted to map-only tasks. After HIVE-3952, the map-only * task would be merged with the map-reduce task to create a single map-reduce task. */ public class CommonJoinResolver implements PhysicalPlanResolver { @Override public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException { // create dispatcher and graph walker Dispatcher disp = new CommonJoinTaskDispatcher(pctx); TaskGraphWalker ogw = new TaskGraphWalker(disp); // get all the tasks nodes from root task List<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(pctx.rootTasks); // begin to walk through the task tree. ogw.startWalking(topNodes, null); return pctx; } }