/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.planner; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Set; import org.voltdb.catalog.Database; import org.voltdb.expressions.AbstractExpression; import org.voltdb.expressions.ExpressionUtil; import org.voltdb.expressions.TupleValueExpression; import org.voltdb.planner.parseinfo.BranchNode; import org.voltdb.planner.parseinfo.JoinNode; import org.voltdb.planner.parseinfo.StmtSubqueryScan; import org.voltdb.planner.parseinfo.StmtTableScan; import org.voltdb.planner.parseinfo.SubqueryLeafNode; import org.voltdb.plannodes.IndexSortablePlanNode; import org.voltdb.plannodes.AbstractJoinPlanNode; import org.voltdb.plannodes.AbstractPlanNode; import org.voltdb.plannodes.AbstractReceivePlanNode; import org.voltdb.plannodes.IndexScanPlanNode; import org.voltdb.plannodes.IndexUseForOrderBy; import org.voltdb.plannodes.MaterializedScanPlanNode; import org.voltdb.plannodes.NestLoopIndexPlanNode; import org.voltdb.plannodes.NestLoopPlanNode; import org.voltdb.types.JoinType; import org.voltdb.utils.PermutationGenerator; /** * For a select, delete or update plan, this class builds the part of the plan * which collects tuples from relations. Given the tables and the predicate * (and sometimes the output columns), this will build a plan that will output * matching tuples to a temp table. A delete, update or send plan node can then * be glued on top of it. In selects, aggregation and other projections are also * done on top of the result from this class. * */ public class SelectSubPlanAssembler extends SubPlanAssembler { /** The list of generated plans. This allows their generation in batches.*/ ArrayDeque<AbstractPlanNode> m_plans = new ArrayDeque<>(); /** The list of all possible join orders, assembled by queueAllJoinOrders */ private ArrayDeque<JoinNode> m_joinOrders = new ArrayDeque<>(); /** * * @param db The catalog's Database object. * @param selectStmt The parsed and dissected statement object describing the sql to execute. * @param partitioning in/out param first element is partition key value, forcing a single-partition statement if non-null, * second may be an inferred partition key if no explicit single-partitioning was specified */ SelectSubPlanAssembler(Database db, ParsedSelectStmt selectStmt, StatementPartitioning partitioning) { super(db, selectStmt, partitioning); if (selectStmt.hasJoinOrder()) { // If a join order was provided or large number of tables join m_joinOrders.addAll(selectStmt.getJoinOrder()); } else { assert(m_parsedStmt.m_noTableSelectionList.size() == 0); m_joinOrders = queueJoinOrders(m_parsedStmt.m_joinTree, true); } } /** * Compute every permutation of the list of involved tables and put them in a deque. * TODO(XIN): takes at least 3.3% cpu of planner. Optimize it when possible. */ public static ArrayDeque<JoinNode> queueJoinOrders(JoinNode joinNode, boolean findAll) { assert(joinNode != null); // Clone the original JoinNode clonedTree = (JoinNode) joinNode.clone(); // Split join tree into a set of subtrees. The join type for all nodes in a subtree is the same List<JoinNode> subTrees = clonedTree.extractSubTrees(); assert(!subTrees.isEmpty()); // Generate possible join orders for each sub-tree separately ArrayList<List<JoinNode>> joinOrderList = generateJoinOrders(subTrees); // Reassemble the all possible combinations of the sub-tree and queue them ArrayDeque<JoinNode> joinOrders = new ArrayDeque<>(); queueSubJoinOrders(joinOrderList, 0, new ArrayList<JoinNode>(), joinOrders, findAll); return joinOrders; } private static void queueSubJoinOrders(List<List<JoinNode>> joinOrderList, int joinOrderListIdx, ArrayList<JoinNode> currentJoinOrder, ArrayDeque<JoinNode> joinOrders, boolean findAll) { if (!findAll && joinOrders.size() > 0) { // At least find one valid join order return; } if (joinOrderListIdx == joinOrderList.size()) { // End of recursion assert(!currentJoinOrder.isEmpty()); JoinNode joinTree = JoinNode.reconstructJoinTreeFromSubTrees(currentJoinOrder); joinOrders.add(joinTree); return; } // Recursive step List<JoinNode> nextTrees = joinOrderList.get(joinOrderListIdx); for (JoinNode headTree: nextTrees) { ArrayList<JoinNode> updatedJoinOrder = new ArrayList<>(); // Order is important: The top sub-trees must be first for (JoinNode node : currentJoinOrder) { updatedJoinOrder.add((JoinNode)node.clone()); } updatedJoinOrder.add((JoinNode)headTree.clone()); queueSubJoinOrders(joinOrderList, joinOrderListIdx + 1, updatedJoinOrder, joinOrders, findAll); } } /** * For each join tree from the input list generate a set of the join trees by permuting the leafs * (table nodes) of the tree without breaking the joins semantic. * * @param subTrees the list of join trees. * @return The list containing the list of trees of all possible permutations of the input trees */ private static ArrayList<List<JoinNode>> generateJoinOrders(List<JoinNode> subTrees) { ArrayList<List<JoinNode>> permutations = new ArrayList<>(); for (JoinNode subTree : subTrees) { permutations.add(generateJoinOrdersForTree(subTree)); } return permutations; } private static List<JoinNode> generateJoinOrdersForTree(JoinNode subTree) { if (subTree instanceof BranchNode) { BranchNode branchSubTree = (BranchNode) subTree; JoinType joinType = branchSubTree.getJoinType(); if (joinType == JoinType.INNER) { return generateInnerJoinOrdersForTree(subTree); } else if (joinType == JoinType.LEFT) { return generateOuterJoinOrdersForTree(subTree); } else if (joinType == JoinType.FULL) { return generateFullJoinOrdersForTree(subTree); } else { // Shouldn't get there assert(false); return null; } } else { // Single tables and subqueries return generateInnerJoinOrdersForTree(subTree); } } /** * Helper method to generate join orders for a join tree containing only INNER joins that * can be obtained by the permutation of the original tables. * * @param subTree join tree * @return list of valid join orders */ private static List<JoinNode> generateInnerJoinOrdersForTree(JoinNode subTree) { // Get a list of the leaf nodes(tables) to permute them List<JoinNode> tableNodes = subTree.generateLeafNodesJoinOrder(); List<List<JoinNode>> joinOrders = PermutationGenerator.generatePurmutations(tableNodes); List<JoinNode> newTrees = new ArrayList<>(); for (List<JoinNode> joinOrder: joinOrders) { newTrees.add(JoinNode.reconstructJoinTreeFromTableNodes(joinOrder, JoinType.INNER)); } //Collect all the join/where conditions to reassign them later AbstractExpression combinedWhereExpr = subTree.getAllFilters(); List<JoinNode> treePermutations = new ArrayList<>(); for (JoinNode newTree : newTrees) { if (combinedWhereExpr != null) { newTree.setWhereExpression(combinedWhereExpr.clone()); } // The new tree root node id must match the original one to be able to reconnect the // subtrees newTree.setId(subTree.getId()); treePermutations.add(newTree); } return treePermutations; } /** * Helper method to generate join orders for an OUTER join tree. * At the moment, permutations for LEFT Joins are not supported yet * * @param subTree join tree * @return list of valid join orders */ private static List<JoinNode> generateOuterJoinOrdersForTree(JoinNode subTree) { List<JoinNode> treePermutations = new ArrayList<>(); treePermutations.add(subTree); return treePermutations; } /** * Helper method to generate join orders for a join tree containing only FULL joins. * The only allowed permutation is a join order that has original left and right nodes * swapped. * * @param subTree join tree * @return list of valid join orders */ private static List<JoinNode> generateFullJoinOrdersForTree(JoinNode subTree) { assert(subTree != null); List<JoinNode> joinOrders = new ArrayList<>(); if (!(subTree instanceof BranchNode)) { // End of recursion joinOrders.add(subTree); return joinOrders; } BranchNode branchNode = (BranchNode) subTree; // Descend to the left branch assert(branchNode.getLeftNode() != null); List<JoinNode> leftJoinOrders = generateFullJoinOrdersForTree(branchNode.getLeftNode()); assert(!leftJoinOrders.isEmpty()); // Descend to the right branch assert(branchNode.getRightNode() != null); List<JoinNode> rightJoinOrders = generateFullJoinOrdersForTree(branchNode.getRightNode()); assert(!rightJoinOrders.isEmpty()); // Create permutation pairing left and right nodes and the revere variant for (JoinNode leftNode : leftJoinOrders) { for (JoinNode rightNode : rightJoinOrders) { JoinNode resultOne = new BranchNode(branchNode.getId(), branchNode.getJoinType(), (JoinNode) leftNode.clone(), (JoinNode) rightNode.clone()); JoinNode resultTwo = new BranchNode(branchNode.getId(), branchNode.getJoinType(), (JoinNode) rightNode.clone(), (JoinNode) leftNode.clone()); if (branchNode.getJoinExpression() != null) { resultOne.setJoinExpression(branchNode.getJoinExpression().clone()); resultTwo.setJoinExpression(branchNode.getJoinExpression().clone()); } if (branchNode.getWhereExpression() != null) { resultOne.setWhereExpression(branchNode.getWhereExpression().clone()); resultTwo.setWhereExpression(branchNode.getWhereExpression().clone()); } joinOrders.add(resultOne); joinOrders.add(resultTwo); } } return joinOrders; } /** * Pull a join order out of the join orders deque, compute all possible plans * for that join order, then append them to the computed plans deque. */ @Override protected AbstractPlanNode nextPlan() { // repeat (usually run once) until plans are created // or no more plans can be created while (m_plans.size() == 0) { // get the join order for us to make plans out of JoinNode joinTree = m_joinOrders.poll(); // no more join orders => no more plans to generate if (joinTree == null) { return null; } // Analyze join and filter conditions joinTree.analyzeJoinExpressions(m_parsedStmt.m_noTableSelectionList); // a query that is a little too quirky or complicated. if (!m_parsedStmt.m_noTableSelectionList.isEmpty()) { throw new PlanningErrorException("Join with filters that do not depend on joined tables is not supported in VoltDB"); } if ( ! m_partitioning.wasSpecifiedAsSingle()) { // Now that analyzeJoinExpressions has done its job of properly categorizing // and placing the various filters that the HSQL parser tends to leave in the strangest // configuration, this is the first opportunity to analyze WHERE and JOIN filters' // effects on statement partitioning. // But this causes the analysis to be run based on a particular join order. // Which join orders does this analysis actually need to be run on? // Can it be run on the first join order and be assumed to hold for all join orders? // If there is a join order that fails to generate a single viable plan, is its // determination of partitioning (or partitioning failure) always valid for other // join orders, or should the analysis be repeated on a viable join order // in that case? // For now, analyze each join order independently and when an invalid partitioning is // detected, skip the plan generation for that particular ordering. // If this causes all plans to be skipped, commonly the case, the PlanAssembler // should propagate an error message identifying partitioning as the problem. HashMap<AbstractExpression, Set<AbstractExpression>> valueEquivalence = joinTree.getAllEquivalenceFilters(); Collection<StmtTableScan> scans = m_parsedStmt.allScans(); m_partitioning.analyzeForMultiPartitionAccess(scans, valueEquivalence); if ( ! m_partitioning.isJoinValid() ) { // The case of more than one independent partitioned table // would result in an illegal plan with more than two fragments. // Don't throw a planning error here, in case the problem is just with this // particular join order, but do leave a hint to the PlanAssembler in case // the failure is unanimous -- a common case. m_recentErrorMsg = m_partitioning.getJoinInvalidReason(); // This join order, at least, is not worth trying to plan. continue; } } generateMorePlansForJoinTree(joinTree); } return m_plans.poll(); } /** * Given a specific join order, compute all possible sub-plan-graphs for that * join order and add them to the deque of plans. If this doesn't add plans, * it doesn't mean no more plans can be generated. It's possible that the * particular join order it got had no reasonable plans. * * @param joinTree An array of tables in the join order. */ private void generateMorePlansForJoinTree(JoinNode joinTree) { assert(joinTree != null); // generate the access paths for all nodes generateAccessPaths(joinTree); List<JoinNode> nodes = joinTree.generateAllNodesJoinOrder(); generateSubPlanForJoinNodeRecursively(joinTree, 0, nodes); } /** * Generate all possible access paths for all nodes in the tree. * The join and filter expressions are kept at the parent node * 1- The OUTER-only join conditions - Testing the outer-only conditions COULD be considered as an * optimal first step to processing each outer tuple - PreJoin predicate for NLJ or NLIJ * 2 -The INNER-only and INNER_OUTER join conditions are used for finding a matching inner tuple(s) for a * given outer tuple. Index and end-Index expressions for NLIJ and join predicate for NLJ. * 3 -The OUTER-only filter conditions. - Can be pushed down to pre-qualify the outer tuples before they enter * the join - Where condition for the left child * 4. The INNER-only and INNER_OUTER where conditions are used for filtering joined tuples. - * Post join predicate for NLIJ and NLJ . Possible optimization - * if INNER-only condition is NULL-rejecting (inner_tuple is NOT NULL or inner_tuple > 0) * it can be pushed down as a filter expression to the inner child * * @param joinNode A root to the join tree to generate access paths to all nodes in that tree. */ private void generateAccessPaths(JoinNode joinNode) { assert(joinNode != null); if (joinNode instanceof BranchNode) { generateOuterAccessPaths((BranchNode)joinNode); generateInnerAccessPaths((BranchNode)joinNode); // An empty access path for the root joinNode.m_accessPaths.add(new AccessPath()); return; } // This is a select from a single table joinNode.m_accessPaths.addAll(getRelevantAccessPathsForTable(joinNode.getTableScan(), joinNode.m_joinInnerList, joinNode.m_whereInnerList, null)); } /** * Generate all possible access paths for an outer node in a join. * The outer table and/or join can have the naive access path and possible index path(s) * Optimizations - outer-table-only where expressions can be pushed down to the child node * to pre-qualify the outer tuples before they enter the join. * For inner joins outer-table-only join expressions can be pushed down as well * * @param parentNode A parent node to the node to generate paths to. */ private void generateOuterAccessPaths(BranchNode parentNode) { JoinNode outerChildNode = parentNode.getLeftNode(); assert(outerChildNode != null); JoinType joinType = parentNode.getJoinType(); // For LEFT and FULL join types, the outer join expressions are kept as a pre-join predicate // at the join node to pre-qualify the outer rows List<AbstractExpression> joinOuterList = (joinType == JoinType.INNER) ? parentNode.m_joinOuterList : null; if (outerChildNode instanceof BranchNode) { generateOuterAccessPaths((BranchNode)outerChildNode); generateInnerAccessPaths((BranchNode)outerChildNode); // The join node can have only sequential scan access outerChildNode.m_accessPaths.add(getRelevantNaivePath(joinOuterList, parentNode.m_whereOuterList)); assert(outerChildNode.m_accessPaths.size() > 0); return; } // WHERE Outer expressions must stay at the join node for the FULL joins // They will be added later as part of the WHERE predicate of the join node List<AbstractExpression> parentWhereList = null; if (joinType != JoinType.FULL) { parentWhereList = parentNode.m_whereOuterList; } outerChildNode.m_accessPaths.addAll( getRelevantAccessPathsForTable(outerChildNode.getTableScan(), joinOuterList, parentWhereList, null)); } /** * Generate all possible access paths for an inner node in a join. * The set of potential index expressions depends whether the inner node can be inlined * with the NLIJ or not. In the former case, inner and inner-outer join expressions can * be considered for the index access. In the latter, only inner join expressions qualifies. * * @param parentNode A parent node to the node to generate paths to. */ private void generateInnerAccessPaths(BranchNode parentNode) { JoinNode innerChildNode = parentNode.getRightNode(); assert(innerChildNode != null); // In case of inner join WHERE and JOIN expressions can be merged if (parentNode.getJoinType() == JoinType.INNER) { parentNode.m_joinInnerOuterList.addAll(parentNode.m_whereInnerOuterList); parentNode.m_whereInnerOuterList.clear(); parentNode.m_joinInnerList.addAll(parentNode.m_whereInnerList); parentNode.m_whereInnerList.clear(); } if (innerChildNode instanceof BranchNode) { generateOuterAccessPaths((BranchNode)innerChildNode); generateInnerAccessPaths((BranchNode)innerChildNode); // The inner node is a join node itself. Only naive access path is possible innerChildNode.m_accessPaths.add( getRelevantNaivePath(parentNode.m_joinInnerOuterList, parentNode.m_joinInnerList)); return; } // The inner table can have multiple index access paths based on // inner and inner-outer join expressions plus the naive one. List<AbstractExpression> filterExprs = null; List<AbstractExpression> postExprs = null; // For the FULL join type, the inner join expressions must stay at the join node and // not go down to the inner node as filters (as predicates for SeqScan nodes and/or // index expressions for Index Scan). The latter case (IndexScan) won't work for NLJ because // the inner join expression will effectively filter out inner tuple prior to the NLJ. if (parentNode.getJoinType() != JoinType.FULL) { filterExprs = parentNode.m_joinInnerList; } else { postExprs = parentNode.m_joinInnerList; } StmtTableScan innerTable = innerChildNode.getTableScan(); assert(innerTable != null); innerChildNode.m_accessPaths.addAll( getRelevantAccessPathsForTable(innerTable, parentNode.m_joinInnerOuterList, filterExprs, postExprs)); // If there are inner expressions AND inner-outer expressions, it could be that there // are indexed access paths that use elements of both in the indexing expressions, // especially in the case of a compound index. // These access paths can not be considered for use with an NLJ because they rely on // inner-outer expressions. // If there is a possibility that NLIJ will not be an option due to the // "special case" processing that puts a send/receive plan between the join node // and its inner child node, other access paths need to be considered that use the // same indexes as those identified so far but in a simpler, less effective way // that does not rely on inner-outer expressions. // The following simplistic method of finding these access paths is to force // inner-outer expressions to be handled as NLJ-compatible post-filters and repeat // the search for access paths. // This will typically generate some duplicate access paths, including the naive // sequential scan path and any indexed paths that happened to use only the inner // expressions. // For now, we deal with this redundancy by dropping (and re-generating) all // access paths EXCPT those that reference the inner-outer expressions. // TODO: implementing access path hash and equality and possibly using a "Set" // would allow deduping as new access paths are added OR // the simplified access path search process could be based on // the existing indexed access paths -- for each access path that "hasInnerOuterIndexExpression" // try to generate and add a simpler access path using the same index, // this time with the inner-outer expressions used only as non-indexable post-filters. // Don't bother generating these redundant or inferior access paths unless there is // an inner-outer expression and a chance that NLIJ will be taken out of the running. boolean mayNeedInnerSendReceive = ( ! m_partitioning.wasSpecifiedAsSingle()) && (m_partitioning.getCountOfPartitionedTables() > 0) && (parentNode.getJoinType() != JoinType.INNER) && ! innerTable.getIsReplicated(); // too expensive/complicated to test here? (parentNode.m_leftNode has a replicated result?) && if (mayNeedInnerSendReceive && ! parentNode.m_joinInnerOuterList.isEmpty()) { List<AccessPath> innerOuterAccessPaths = new ArrayList<>(); for (AccessPath innerAccessPath : innerChildNode.m_accessPaths) { if ((innerAccessPath.index != null) && hasInnerOuterIndexExpression(innerChildNode.getTableAlias(), innerAccessPath.indexExprs, innerAccessPath.initialExpr, innerAccessPath.endExprs)) { innerOuterAccessPaths.add(innerAccessPath); } } if (parentNode.getJoinType() != JoinType.FULL) { filterExprs = parentNode.m_joinInnerList; postExprs = parentNode.m_joinInnerOuterList; } else { // For FULL join type the inner join expressions must be part of the post predicate // in order to stay at the join node and not be pushed down to the inner node filterExprs = null; postExprs = new ArrayList<>(parentNode.m_joinInnerList); postExprs.addAll(parentNode.m_joinInnerOuterList); } Collection<AccessPath> nljAccessPaths = getRelevantAccessPathsForTable( innerTable, null, filterExprs, postExprs); innerChildNode.m_accessPaths.clear(); innerChildNode.m_accessPaths.addAll(nljAccessPaths); innerChildNode.m_accessPaths.addAll(innerOuterAccessPaths); } assert(innerChildNode.m_accessPaths.size() > 0); } /** * generate all possible plans for the tree. * * @param rootNode The root node for the whole join tree. * @param nodes The node list to iterate over. */ private void generateSubPlanForJoinNodeRecursively(JoinNode rootNode, int nextNode, List<JoinNode> nodes) { assert(nodes.size() > nextNode); JoinNode joinNode = nodes.get(nextNode); if (nodes.size() == nextNode + 1) { for (AccessPath path : joinNode.m_accessPaths) { joinNode.m_currentAccessPath = path; AbstractPlanNode plan = getSelectSubPlanForJoinNode(rootNode); if (plan == null) { continue; } m_plans.add(plan); } return; } for (AccessPath path : joinNode.m_accessPaths) { joinNode.m_currentAccessPath = path; generateSubPlanForJoinNodeRecursively(rootNode, nextNode+1, nodes); } } /** * Given a specific join node and access path set for inner and outer tables, construct the plan * that gives the right tuples. * * @param joinNode The join node to build the plan for. * @param isInnerTable True if the join node is the inner node in the join * @return A completed plan-sub-graph that should match the correct tuples from the * correct tables. */ private AbstractPlanNode getSelectSubPlanForJoinNode(JoinNode joinNode) { assert(joinNode != null); if (joinNode instanceof BranchNode) { BranchNode branchJoinNode = (BranchNode)joinNode; // Outer node AbstractPlanNode outerScanPlan = getSelectSubPlanForJoinNode(branchJoinNode.getLeftNode()); if (outerScanPlan == null) { return null; } // Inner Node. AbstractPlanNode innerScanPlan = getSelectSubPlanForJoinNode((branchJoinNode).getRightNode()); if (innerScanPlan == null) { return null; } // Join Node IndexSortablePlanNode answer = getSelectSubPlanForJoin(branchJoinNode, outerScanPlan, innerScanPlan); // Propagate information used for order by clauses in window functions // and the statement level order by clause. This is only if the // branch node is an inner join. if ((answer != null) && (branchJoinNode.getJoinType() == JoinType.INNER) && outerScanPlan instanceof IndexSortablePlanNode) { IndexUseForOrderBy indexUseForJoin = answer.indexUse(); IndexUseForOrderBy indexUseFromScan = ((IndexSortablePlanNode)outerScanPlan).indexUse(); indexUseForJoin.setWindowFunctionUsesIndex(indexUseFromScan.getWindowFunctionUsesIndex()); indexUseForJoin.setWindowFunctionIsCompatibleWithOrderBy(indexUseFromScan.isWindowFunctionCompatibleWithOrderBy()); indexUseForJoin.setFinalExpressionOrderFromIndexScan(indexUseFromScan.getFinalExpressionOrderFromIndexScan()); indexUseForJoin.setSortOrderFromIndexScan(indexUseFromScan.getSortOrderFromIndexScan()); } if (answer == null) { return null; } return answer.planNode(); } // End of recursion AbstractPlanNode scanNode = getAccessPlanForTable(joinNode); // Connect the sub-query tree if any if (joinNode instanceof SubqueryLeafNode) { StmtSubqueryScan tableScan = ((SubqueryLeafNode)joinNode).getSubqueryScan(); CompiledPlan subQueryPlan = tableScan.getBestCostPlan(); assert(subQueryPlan != null); assert(subQueryPlan.rootPlanGraph != null); // The sub-query best cost plan needs to be un-linked from the previous parent plan // it's the same child plan that gets re-attached to many parents one at a time subQueryPlan.rootPlanGraph.disconnectParents(); scanNode.addAndLinkChild(subQueryPlan.rootPlanGraph); } return scanNode; } /** * Given a join node and plan-sub-graph for outer and inner sub-nodes, * construct the plan-sub-graph for that node. * * @param joinNode A parent join node. * @param outerPlan The outer node plan-sub-graph. * @param innerPlan The inner node plan-sub-graph. * @return A completed plan-sub-graph * or null if a valid plan can not be produced for given access paths. */ private IndexSortablePlanNode getSelectSubPlanForJoin(BranchNode joinNode, AbstractPlanNode outerPlan, AbstractPlanNode innerPlan) { // Filter (post-join) expressions ArrayList<AbstractExpression> whereClauses = new ArrayList<>(); whereClauses.addAll(joinNode.m_whereInnerList); whereClauses.addAll(joinNode.m_whereInnerOuterList); if (joinNode.getJoinType() == JoinType.FULL) { // For all other join types, the whereOuterList expressions were pushed down to the outer node whereClauses.addAll(joinNode.m_whereOuterList); } // The usual approach of calculating a local (partial) join result on each partition, // then sending and merging them with other partial results on the coordinator does not // ensure correct answers for some queries like: // SELECT * FROM replicated LEFT JOIN partitioned ON ... // They require a "global view" of the partitioned working set in order to // properly identify which replicated rows need to be null-padded, // and to ensure that the same replicated row is not null-padded redundantly on multiple partitions. // Many queries using this pattern impractically require redistribution and caching of a considerable // subset of a partitioned table in preparation for a "coordinated" join. // Yet, there may be useful cases with sufficient constant-based partitioned table filtering // in the "ON clause" to keep the distributed working set size under control, like // SELECT * FROM replicated R LEFT JOIN partitioned P // ON R.key == P.non_partition_key AND P.non_partition_key BETWEEN ? and ?; // // Such queries need to be prohibited by the planner if it can not guarantee the // correct results that require a "send before join" plan. // This could be the case if the replicated-to-partition join in these examples // were subject to another join with a partitioned table, like // SELECT * FROM replicated R LEFT JOIN partitioned P1 ON ... // LEFT JOIN also_partitioned P2 ON ... // assert(joinNode.getRightNode() != null); JoinNode innerJoinNode = joinNode.getRightNode(); AccessPath innerAccessPath = innerJoinNode.m_currentAccessPath; // We may need to add a send/receive pair to the inner plan for the special case. // This trick only works once per plan, BUT once the partitioned data has been // received on the coordinator, it can be treated as replicated data in later // joins, which MAY help with later outer joins with replicated data. boolean needInnerSendReceive = m_partitioning.requiresTwoFragments() && ! innerPlan.hasReplicatedResult() && outerPlan.hasReplicatedResult() && joinNode.getJoinType() != JoinType.INNER; // When the inner plan is an IndexScan, there MAY be a choice of whether to join using a // NestLoopJoin (NLJ) or a NestLoopIndexJoin (NLIJ). The NLJ will have an advantage over the // NLIJ in the cases where it applies, since it does a single access or iteration over the index // and caches the result, where the NLIJ does an index access or iteration for each outer row. // The NestLoopJoin applies when the inner IndexScan is driven only by parameter and constant // expressions determined at the start of the query. That requires that none of the IndexScan's // various expressions that drive the index access may reference columns from the outer row // -- they can only reference columns of the index's base table (the indexed expressions) // as well as constants and parameters. The IndexScan's "otherExprs" expressions that only // drive post-filtering are not an issue since the NestLoopJoin does feature per-outer-tuple // post-filtering on each pass over the cached index scan result. // The special case of an OUTER JOIN of replicated outer row data with a partitioned inner // table requires that the partitioned data be sent to the coordinator prior to the join. // This limits the join option to NLJ. The index scan must make a single index access on // each partition and cache the result at the coordinator for post-filtering. // This requires that the index access be based on parameters or constants only // -- the replicated outer row data will only be available later at the coordinator, // so it can not drive the per-partition index scan. // If the NLJ option is precluded for the usual reason (outer-row-based indexing) AND // the NLIJ is precluded by the special case (OUTER JOIN of replicated outer rows and // partitioned inner rows) this method returns null, effectively rejecting this indexed // access path for the inner node. Other access paths or join orders may prove more successful. boolean canHaveNLJ = true; boolean canHaveNLIJ = true; if (innerPlan instanceof IndexScanPlanNode) { if (hasInnerOuterIndexExpression(joinNode.getRightNode().getTableAlias(), innerAccessPath.indexExprs, innerAccessPath.initialExpr, innerAccessPath.endExprs)) { canHaveNLJ = false; } } else { canHaveNLIJ = false; } if (needInnerSendReceive) { canHaveNLIJ = false; } // Prohibit FULL join plans with distributed outer and replicated inner branches - // The join must happen on either a coordinator node or both tables must be joined on // partition columns if (joinNode.getJoinType() == JoinType.FULL && m_partitioning.requiresTwoFragments() && !outerPlan.hasReplicatedResult() && innerPlan.hasReplicatedResult()) { canHaveNLIJ = false; canHaveNLJ = false; } AbstractJoinPlanNode ajNode = null; if (canHaveNLJ) { NestLoopPlanNode nljNode = new NestLoopPlanNode(); // get all the clauses that join the applicable two tables // Copy innerAccessPath.joinExprs to leave it unchanged, // avoiding accumulation of redundant expressions when // joinClauses gets built up for various alternative plans. ArrayList<AbstractExpression> joinClauses = new ArrayList<>(innerAccessPath.joinExprs); if ((innerPlan instanceof IndexScanPlanNode) || (innerPlan instanceof NestLoopIndexPlanNode && innerPlan.getChild(0) instanceof MaterializedScanPlanNode)) { // InnerPlan is an IndexScan OR an NLIJ of a MaterializedScan // (IN LIST) and an IndexScan. In this case, the inner and // inner-outer non-index join expressions (if any) are in the // indexScan's otherExpr. The former should stay as IndexScanPlan // predicates but the latter need to be pulled up into NLJ // predicates because the IndexScan is executed once, not once // per outer tuple. ArrayList<AbstractExpression> otherExprs = new ArrayList<>(); // PLEASE do not update the "innerAccessPath.otherExprs", it may be reused // for other path evaluation on the other outer side join. List<AbstractExpression> innerExpr = filterSingleTVEExpressions(innerAccessPath.otherExprs, otherExprs); joinClauses.addAll(otherExprs); IndexScanPlanNode scanNode = null; if (innerPlan instanceof IndexScanPlanNode) { scanNode = (IndexScanPlanNode)innerPlan; } else { assert(innerPlan instanceof NestLoopIndexPlanNode); scanNode = ((NestLoopIndexPlanNode) innerPlan).getInlineIndexScan(); } scanNode.setPredicate(innerExpr); } else if (innerJoinNode instanceof BranchNode && joinNode.getJoinType() != JoinType.INNER) { // If the innerJoinNode is a LEAF node OR if the join type is an INNER join, // the conditions that apply to the inner side // have been applied as predicates to the inner scan node already. // otherExpr of innerAccessPath comes from its parentNode's joinInnerList. // For Outer join (LEFT or FULL), it could mean a join predicate on the table of // the inner node ONLY, that can not be pushed down. joinClauses.addAll(innerAccessPath.otherExprs); } nljNode.setJoinPredicate(ExpressionUtil.combinePredicates(joinClauses)); // combine the tails plan graph with the new head node nljNode.addAndLinkChild(outerPlan); // If successful in the special case, the NLJ plan must be modified to cause the // partitioned inner data to be sent to the coordinator prior to the join. // This is done by adding send and receive plan nodes between the NLJ and its // right child node. if (needInnerSendReceive) { // This trick only works once per plan. if (outerPlan.hasAnyNodeOfClass(AbstractReceivePlanNode.class) || innerPlan.hasAnyNodeOfClass(AbstractReceivePlanNode.class)) { return null; } innerPlan = addSendReceivePair(innerPlan); } nljNode.addAndLinkChild(innerPlan); ajNode = nljNode; } else if (canHaveNLIJ) { NestLoopIndexPlanNode nlijNode = new NestLoopIndexPlanNode(); IndexScanPlanNode innerNode = (IndexScanPlanNode) innerPlan; // Set IndexScan predicate. The INNER join expressions for a FULL join come from // the innerAccessPath.joinExprs and need to be combined with the other join expressions innerNode.setPredicate(innerAccessPath.joinExprs, innerAccessPath.otherExprs); nlijNode.addInlinePlanNode(innerPlan); // combine the tails plan graph with the new head node nlijNode.addAndLinkChild(outerPlan); ajNode = nlijNode; } else { m_recentErrorMsg = "Unsupported special case of complex OUTER JOIN between replicated outer table and partitioned inner table."; return null; } ajNode.setJoinType(joinNode.getJoinType()); ajNode.setPreJoinPredicate(ExpressionUtil.combinePredicates(joinNode.m_joinOuterList)); ajNode.setWherePredicate(ExpressionUtil.combinePredicates(whereClauses)); ajNode.resolveSortDirection(); return ajNode; } /** * A method to filter out single-TVE expressions. * * @param expr List of single-TVE expressions. * @param otherExprs List of multi-TVE expressions. * @return List of single-TVE expressions from the input collection. */ private static List<AbstractExpression> filterSingleTVEExpressions(List<AbstractExpression> exprs, List<AbstractExpression> otherExprs) { List<AbstractExpression> singleTVEExprs = new ArrayList<>(); for (AbstractExpression expr : exprs) { List<TupleValueExpression> tves = ExpressionUtil.getTupleValueExpressions(expr); if (tves.size() == 1) { singleTVEExprs.add(expr); } else { otherExprs.add(expr); } } return singleTVEExprs; } /** * For a join node, determines whether any of the inner-outer expressions were used * for an inner index access -- this requires joining with a NestLoopIndexJoin. * These are detected as TVE references in the various clauses that drive the indexing * -- as opposed to TVE references in post-filters that pose no problem with either * NLIJ or the more efficient (one-pass through the index) NestLoopJoin. * * @param innerTable - the Table of all inner TVEs that are exempt from the check. * @param indexExprs - a list of expressions used in the indexing * @param initialExpr - a list of expressions used in the indexing * @param endExprs - a list of expressions used in the indexing * @return true if at least one of the expression lists references a TVE. */ private static boolean hasInnerOuterIndexExpression(String innerTableAlias, Collection<AbstractExpression> indexExprs, Collection<AbstractExpression> initialExpr, Collection<AbstractExpression> endExprs) { HashSet<AbstractExpression> indexedExprs = new HashSet<>(); indexedExprs.addAll(indexExprs); indexedExprs.addAll(initialExpr); indexedExprs.addAll(endExprs); // Find an outer TVE by ignoring any TVEs based on the inner table. for (AbstractExpression indexed : indexedExprs) { Collection<TupleValueExpression> indexedTVEs = indexed.findAllTupleValueSubexpressions(); for (AbstractExpression indexedTVExpr : indexedTVEs) { if ( ! TupleValueExpression.isOperandDependentOnTable(indexedTVExpr, innerTableAlias)) { return true; } } } return false; } }