/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.planner; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.NavigableSet; import java.util.Set; import org.json_voltpatches.JSONException; import org.voltdb.VoltType; import org.voltdb.catalog.CatalogMap; import org.voltdb.catalog.Column; import org.voltdb.catalog.ColumnRef; import org.voltdb.catalog.Constraint; import org.voltdb.catalog.Database; import org.voltdb.catalog.Index; import org.voltdb.catalog.Table; import org.voltdb.expressions.AbstractExpression; import org.voltdb.expressions.AggregateExpression; import org.voltdb.expressions.ConstantValueExpression; import org.voltdb.expressions.ExpressionUtil; import org.voltdb.expressions.OperatorExpression; import org.voltdb.expressions.ParameterValueExpression; import org.voltdb.expressions.SelectSubqueryExpression; import org.voltdb.expressions.TupleAddressExpression; import org.voltdb.expressions.TupleValueExpression; import org.voltdb.expressions.WindowFunctionExpression; import org.voltdb.planner.microoptimizations.MicroOptimizationRunner; import org.voltdb.planner.parseinfo.BranchNode; import org.voltdb.planner.parseinfo.JoinNode; import org.voltdb.planner.parseinfo.StmtSubqueryScan; import org.voltdb.planner.parseinfo.StmtTableScan; import org.voltdb.plannodes.AbstractJoinPlanNode; import org.voltdb.plannodes.AbstractPlanNode; import org.voltdb.plannodes.AbstractReceivePlanNode; import org.voltdb.plannodes.AbstractScanPlanNode; import org.voltdb.plannodes.AggregatePlanNode; import org.voltdb.plannodes.DeletePlanNode; import org.voltdb.plannodes.HashAggregatePlanNode; import org.voltdb.plannodes.IndexScanPlanNode; import org.voltdb.plannodes.IndexSortablePlanNode; import org.voltdb.plannodes.IndexUseForOrderBy; import org.voltdb.plannodes.InsertPlanNode; import org.voltdb.plannodes.LimitPlanNode; import org.voltdb.plannodes.MaterializePlanNode; import org.voltdb.plannodes.MergeReceivePlanNode; import org.voltdb.plannodes.NestLoopPlanNode; import org.voltdb.plannodes.NodeSchema; import org.voltdb.plannodes.OrderByPlanNode; import org.voltdb.plannodes.PartialAggregatePlanNode; import org.voltdb.plannodes.ProjectionPlanNode; import org.voltdb.plannodes.ReceivePlanNode; import org.voltdb.plannodes.SchemaColumn; import org.voltdb.plannodes.SendPlanNode; import org.voltdb.plannodes.SeqScanPlanNode; import org.voltdb.plannodes.SwapTablesPlanNode; import org.voltdb.plannodes.UnionPlanNode; import org.voltdb.plannodes.UpdatePlanNode; import org.voltdb.plannodes.WindowFunctionPlanNode; import org.voltdb.types.ConstraintType; import org.voltdb.types.ExpressionType; import org.voltdb.types.IndexType; import org.voltdb.types.JoinType; import org.voltdb.types.PlanNodeType; 
import org.voltdb.types.SortDirectionType;
import org.voltdb.utils.CatalogUtil;

/**
 * The query planner accepts catalog data, SQL statements from the catalog, then
 * outputs a set of complete and correct query plans. It will output MANY plans
 * and some of them will be stupid. The best plan will be selected by computing
 * resource usage statistics for the plans, then using those statistics to
 * compute the cost of a specific plan. The plan with the lowest cost wins.
 */
public class PlanAssembler {

    // A convenience struct to accumulate results after parsing multiple statements
    private static class ParsedResultAccumulator {
        public final boolean m_orderIsDeterministic;
        public final boolean m_hasLimitOrOffset;
        public final String m_isContentDeterministic;

        public ParsedResultAccumulator(boolean orderIsDeterministic,
                boolean hasLimitOrOffset, String isContentDeterministic) {
            m_orderIsDeterministic = orderIsDeterministic;
            m_hasLimitOrOffset = hasLimitOrOffset;
            m_isContentDeterministic = isContentDeterministic;
        }
    }

    /** convenience pointer to the database object in the catalog */
    private final Database m_catalogDb;

    /** parsed statement for an insert */
    private ParsedInsertStmt m_parsedInsert = null;
    /** parsed statement for an update */
    private ParsedUpdateStmt m_parsedUpdate = null;
    /** parsed statement for a delete */
    private ParsedDeleteStmt m_parsedDelete = null;
    /** parsed statement for a swap */
    private ParsedSwapStmt m_parsedSwap = null;
    /** parsed statement for a select */
    private ParsedSelectStmt m_parsedSelect = null;
    /** parsed statement for a union */
    private ParsedUnionStmt m_parsedUnion = null;

    /** plan selector */
    private final PlanSelector m_planSelector;

    /** Describes the specified and inferred partition context. */
    private StatementPartitioning m_partitioning;

    /** Error message */
    private String m_recentErrorMsg;

    /**
     * Used to generate the table-touching parts of a plan. All join-order and
     * access path selection stuff is done by the SelectSubPlanAssembler.
     */
    private SubPlanAssembler m_subAssembler = null;

    /**
     * Flag set when the only expected plan for a statement has already been generated.
     */
    private boolean m_bestAndOnlyPlanWasGenerated = false;

    /**
     * @param catalogDb
     *            Catalog info about schema, metadata and procedures.
     * @param partitioning
     *            Describes the specified and inferred partition context.
     * @param planSelector
     *            Accumulates candidate plans and tracks the best one.
     */
    PlanAssembler(Database catalogDb, StatementPartitioning partitioning, PlanSelector planSelector) {
        m_catalogDb = catalogDb;
        m_partitioning = partitioning;
        m_planSelector = planSelector;
    }

    String getSQLText() {
        if (m_parsedDelete != null) {
            return m_parsedDelete.m_sql;
        }
        if (m_parsedInsert != null) {
            return m_parsedInsert.m_sql;
        }
        if (m_parsedUpdate != null) {
            return m_parsedUpdate.m_sql;
        }
        if (m_parsedSelect != null) {
            return m_parsedSelect.m_sql;
        }
        assert(false);
        return null;
    }

    /**
     * Return true if tableList includes at least one read-only materialized view
     * (a matview whose source table is not an export stream).
     */
    private boolean tableListIncludesReadOnlyView(List<Table> tableList) {
        NavigableSet<String> exportTables = CatalogUtil.getExportTableNames(m_catalogDb);
        for (Table table : tableList) {
            if (table.getMaterializer() != null &&
                    !exportTables.contains(table.getMaterializer().getTypeName())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Return true if tableList includes at least one export table.
*/ private boolean tableListIncludesExportOnly(List<Table> tableList) { // list of all export tables (assume uppercase) NavigableSet<String> exportTables = CatalogUtil.getExportTableNames(m_catalogDb); // this loop is O(number-of-joins * number-of-export-tables) // which seems acceptable if not great. Probably faster than // re-hashing the export only tables for faster lookup. for (Table table : tableList) { if (exportTables.contains(table.getTypeName())) { return true; } } return false; } private boolean isPartitionColumnInGroupbyList(List<ParsedColInfo> groupbyColumns) { assert(m_parsedSelect != null); if (groupbyColumns == null) { return false; } for (ParsedColInfo groupbyCol : groupbyColumns) { StmtTableScan scanTable = m_parsedSelect.getStmtTableScanByAlias(groupbyCol.tableAlias); // table alias may be from AbstractParsedStmt.TEMP_TABLE_NAME. if (scanTable != null && scanTable.getPartitioningColumns() != null) { for (SchemaColumn pcol : scanTable.getPartitioningColumns()) { if (pcol != null && pcol.getColumnName().equals(groupbyCol.columnName) ) { return true; } } } } return false; } private boolean canPushDownDistinctAggregation(AggregateExpression aggExpr) { assert(m_parsedSelect != null); assert(aggExpr != null); assert(aggExpr.isDistinct()); if ( aggExpr.getExpressionType() == ExpressionType.AGGREGATE_COUNT_STAR ) { return true; } AbstractExpression aggArg = aggExpr.getLeft(); // constant if (aggArg instanceof ConstantValueExpression || aggArg instanceof ParameterValueExpression) { return true; } if ( ! (aggArg instanceof TupleValueExpression)) { return false; } TupleValueExpression tve = (TupleValueExpression) aggArg; String tableAlias = tve.getTableAlias(); StmtTableScan scanTable = m_parsedSelect.getStmtTableScanByAlias(tableAlias); // table alias may be from AbstractParsedStmt.TEMP_TABLE_NAME. if (scanTable == null || scanTable.getPartitioningColumns() == null) { return false; } for (SchemaColumn pcol : scanTable.getPartitioningColumns()) { if (pcol != null && pcol.getColumnName().equals(tve.getColumnName()) ) { return true; } } return false; } /** * Clear any old state and get ready to plan a new plan. The next call to * getNextPlan() will return the first candidate plan for these parameters. * */ private void setupForNewPlans(AbstractParsedStmt parsedStmt) { m_bestAndOnlyPlanWasGenerated = false; m_partitioning.analyzeTablePartitioning(parsedStmt.allScans()); if (parsedStmt instanceof ParsedUnionStmt) { m_parsedUnion = (ParsedUnionStmt) parsedStmt; return; } if (parsedStmt instanceof ParsedSelectStmt) { if (tableListIncludesExportOnly(parsedStmt.m_tableList)) { throw new PlanningErrorException( "Illegal to read a stream."); } m_parsedSelect = (ParsedSelectStmt) parsedStmt; // Simplify the outer join if possible if (m_parsedSelect.m_joinTree instanceof BranchNode) { if (! m_parsedSelect.hasJoinOrder()) { simplifyOuterJoin((BranchNode)m_parsedSelect.m_joinTree); } // Convert RIGHT joins to the LEFT ones ((BranchNode)m_parsedSelect.m_joinTree).toLeftJoin(); } m_subAssembler = new SelectSubPlanAssembler(m_catalogDb, m_parsedSelect, m_partitioning); // Process the GROUP BY information, decide whether it is group by the partition column if (isPartitionColumnInGroupbyList(m_parsedSelect.groupByColumns())) { m_parsedSelect.setHasPartitionColumnInGroupby(); } if (isPartitionColumnInWindowedAggregatePartitionByList()) { m_parsedSelect.setHasPartitionColumnInWindowedAggregate(); } // FIXME: is the following scheme/comment obsolete? 
            // FIXME: turn it on when we are able to push down DISTINCT
            // if (isPartitionColumnInGroupbyList(m_parsedSelect.m_distinctGroupByColumns)) {
            //     m_parsedSelect.setHasPartitionColumnInDistinctGroupby();
            // }
            return;
        }

        // @TODO
        // Need to use StmtTableScan instead
        // check that no modification happens to views
        if (tableListIncludesReadOnlyView(parsedStmt.m_tableList)) {
            throw new PlanningErrorException("Illegal to modify a materialized view.");
        }

        m_partitioning.setIsDML();

        // Check that only multi-partition writes are made to replicated tables.
        // figure out which table we're updating/deleting
        if (parsedStmt instanceof ParsedSwapStmt) {
            assert (parsedStmt.m_tableList.size() == 2);
            if (tableListIncludesExportOnly(parsedStmt.m_tableList)) {
                throw new PlanningErrorException("Illegal to swap a stream.");
            }
            m_parsedSwap = (ParsedSwapStmt) parsedStmt;
            return;
        }

        Table targetTable = parsedStmt.m_tableList.get(0);
        if (targetTable.getIsreplicated()) {
            if (m_partitioning.wasSpecifiedAsSingle()
                    && !m_partitioning.isReplicatedDmlToRunOnAllPartitions()) {
                String msg = "Trying to write to replicated table '"
                        + targetTable.getTypeName() + "' in a single-partition procedure.";
                throw new PlanningErrorException(msg);
            }
        }
        else if (m_partitioning.wasSpecifiedAsSingle() == false) {
            m_partitioning.setPartitioningColumnForDML(targetTable.getPartitioncolumn());
        }

        if (parsedStmt instanceof ParsedInsertStmt) {
            m_parsedInsert = (ParsedInsertStmt) parsedStmt;
            // The currently handled inserts are too simple to even require a subplan assembler. So, done.
            return;
        }

        if (parsedStmt instanceof ParsedUpdateStmt) {
            if (tableListIncludesExportOnly(parsedStmt.m_tableList)) {
                throw new PlanningErrorException("Illegal to update a stream.");
            }
            m_parsedUpdate = (ParsedUpdateStmt) parsedStmt;
        }
        else if (parsedStmt instanceof ParsedDeleteStmt) {
            if (tableListIncludesExportOnly(parsedStmt.m_tableList)) {
                throw new PlanningErrorException("Illegal to delete from a stream.");
            }
            m_parsedDelete = (ParsedDeleteStmt) parsedStmt;
        }
        else {
            throw new RuntimeException("Unknown subclass of AbstractParsedStmt.");
        }

        if ( ! m_partitioning.wasSpecifiedAsSingle()) {
            //TODO: When updates and deletes can contain joins, this step may have to be
            // deferred so that the valueEquivalence set can be analyzed per join order.
            // This appears to be an unfortunate side effect of how the HSQL interface
            // misleadingly organizes the placement of join/where filters on the statement tree.
            // This throws off the accounting of equivalence join filters until they can be
            // normalized in analyzeJoinFilters, but that normalization process happens on a
            // per-join-order basis, and so must this analysis.
            HashMap<AbstractExpression, Set<AbstractExpression>> valueEquivalence =
                    parsedStmt.analyzeValueEquivalence();
            Collection<StmtTableScan> scans = parsedStmt.allScans();
            m_partitioning.analyzeForMultiPartitionAccess(scans, valueEquivalence);
        }
        m_subAssembler = new WriterSubPlanAssembler(m_catalogDb, parsedStmt, m_partitioning);
    }

    private boolean isPartitionColumnInWindowedAggregatePartitionByList() {
        assert (m_parsedSelect != null);
        return (m_parsedSelect.isPartitionColumnInWindowedAggregatePartitionByList());
    }

    private static void failIfNonDeterministicDml(AbstractParsedStmt parsedStmt, CompiledPlan plan) {
        // If we have content non-determinism on DML, then fail planning.
        // This can happen if:
        //   INSERT INTO ... SELECT ... where the select statement has a limit on unordered data.
        //   UPSERT INTO ... SELECT has the same issue, but no limit is required because
        //     order may determine which rows are updated and which are inserted.
        //   DELETE ... ORDER BY <n> LIMIT <n> also has this issue.
        // UPDATE doesn't have this issue yet (but having ORDER BY and LIMIT there doesn't seem out
        // of the question).
        // When subqueries in WHERE clauses of DML are allowed, we will need to make sure the
        // subqueries are content-deterministic too.
        if (plan == null || plan.isReadOnly()) {
            return;
        }

        boolean contentDeterministic = plan.isContentDeterministic();
        if (parsedStmt instanceof ParsedInsertStmt
                && !(plan.isOrderDeterministic() && contentDeterministic)) {
            ParsedInsertStmt parsedInsert = (ParsedInsertStmt)parsedStmt;
            boolean targetHasLimitRowsTrigger = parsedInsert.targetTableHasLimitRowsTrigger();
            String contentDeterministicMsg = "";
            if (!contentDeterministic) {
                contentDeterministicMsg = " " + plan.nondeterminismDetail();
            }

            if (parsedStmt.m_isUpsert) {
                throw new PlanningErrorException(
                        "UPSERT statement manipulates data in a non-deterministic way. "
                        + "Adding an ORDER BY clause to UPSERT INTO ... SELECT may address this issue."
                        + contentDeterministicMsg);
            }

            if (targetHasLimitRowsTrigger) {
                throw new PlanningErrorException(
                        "Order of rows produced by SELECT statement in INSERT INTO ... SELECT is "
                        + "non-deterministic. Since the table being inserted into has a row limit "
                        + "trigger, the SELECT output must be ordered. Add an ORDER BY clause "
                        + "to address this issue."
                        + contentDeterministicMsg);
            }

            if (plan.hasLimitOrOffset()) {
                throw new PlanningErrorException(
                        "INSERT statement manipulates data in a content non-deterministic way. "
                        + "Adding an ORDER BY clause to INSERT INTO ... SELECT may address this issue."
                        + contentDeterministicMsg);
            }

            if (!contentDeterministic) {
                throw new PlanningErrorException(
                        "INSERT statement manipulates data in a non-deterministic way."
                        + contentDeterministicMsg);
            }
        }

        if (parsedStmt instanceof ParsedDeleteStmt
                && !((ParsedDeleteStmt)parsedStmt).sideEffectsAreDeterministic()) {
            throw new PlanningErrorException(
                    "DELETE statement manipulates data in a non-deterministic way. This may happen "
                    + "when the DELETE has an ORDER BY clause with a LIMIT, but the order is not "
                    + "well-defined.");
        }
    }

    static String IN_EXISTS_SCALAR_ERROR_MESSAGE = "Subquery expressions are only supported for "
            + "single partition procedures and AdHoc queries referencing only replicated tables.";

    /**
     * Generate the best cost plan for the current SQL statement context.
     *
     * @param parsedStmt Current SQL statement to generate plan for
     * @return The best cost plan or null.
     */
    CompiledPlan getBestCostPlan(AbstractParsedStmt parsedStmt) {
        // parse any subqueries that the statement contains
        List<StmtSubqueryScan> subqueryNodes = parsedStmt.getSubqueryScans();
        ParsedResultAccumulator fromSubqueryResult = null;
        if (! subqueryNodes.isEmpty()) {
            fromSubqueryResult = getBestCostPlanForFromSubQueries(subqueryNodes);
            if (fromSubqueryResult == null) {
                // There was at least one sub-query and we should have a compiled plan for it
                return null;
            }
        }

        // Get the best plans for the expression subqueries ( IN/EXISTS (SELECT...) )
        Set<AbstractExpression> subqueryExprs = parsedStmt.findSubquerySubexpressions();
        if ( ! subqueryExprs.isEmpty() ) {
            // guards against IN/EXISTS/Scalar subqueries
            if ( ! m_partitioning.wasSpecifiedAsSingle() ) {
                // Don't allow partitioned tables in subqueries.
// This restriction stems from the lack of confidence that the // planner can reliably identify all cases of adequate and // inadequate partition key join criteria across different // levels of correlated subqueries. for (AbstractExpression e : subqueryExprs) { assert(e instanceof SelectSubqueryExpression); SelectSubqueryExpression subExpr = (SelectSubqueryExpression)e; if (! subExpr.getSubqueryScan().getIsReplicated()) { m_recentErrorMsg = IN_EXISTS_SCALAR_ERROR_MESSAGE; return null; } } } if (!getBestCostPlanForExpressionSubQueries(subqueryExprs)) { // There was at least one sub-query and we should have a compiled plan for it return null; } } // set up the plan assembler for this statement setupForNewPlans(parsedStmt); // get ready to find the plan with minimal cost CompiledPlan rawplan = null; // loop over all possible plans while (true) { rawplan = getNextPlan(); // stop this while loop when no more plans are generated if (rawplan == null) { break; } // Update the best cost plan so far m_planSelector.considerCandidatePlan(rawplan, parsedStmt); } CompiledPlan retval = m_planSelector.m_bestPlan; if (retval == null) { return null; } if (fromSubqueryResult != null) { // Calculate the combined state of determinism for the parent and child statements boolean orderIsDeterministic = retval.isOrderDeterministic(); String contentDeterminismDetail = fromSubqueryResult.m_isContentDeterministic; if (orderIsDeterministic && ! fromSubqueryResult.m_orderIsDeterministic) { //TODO: this reliance on the vague isOrderDeterministicInSpiteOfUnorderedSubqueries test // is subject to false negatives for determinism. It misses the subtlety of parent // queries that surgically add orderings for specific "key" columns of a subquery result // or a subquery-based join for an effectively deterministic result. // The first step towards repairing this would involve detecting deterministic and // non-deterministic subquery results IN CONTEXT where they are scanned in the parent // query, so that the parent query can ensure that ALL the columns from a // non-deterministic subquery are later sorted. // The next step would be to extend the model for "subquery scans" // to identify dependencies / uniqueness constraints in subquery results // that can be exploited to impose determinism with fewer parent order by columns // -- like just the keys. orderIsDeterministic = parsedStmt.isOrderDeterministicInSpiteOfUnorderedSubqueries(); } boolean hasLimitOrOffset = fromSubqueryResult.m_hasLimitOrOffset || retval.hasLimitOrOffset(); retval.statementGuaranteesDeterminism(hasLimitOrOffset, orderIsDeterministic, contentDeterminismDetail); // Need to re-attach the sub-queries plans to the best parent plan. The same best plan for each // sub-query is reused with all parent candidate plans and needs to be reconnected with // the final best parent plan retval.rootPlanGraph = connectChildrenBestPlans(retval.rootPlanGraph); } /* * Find out if the query is inherently content deterministic and * remember it. */ String contentDeterminismMessage = parsedStmt.getContentDeterminismMessage(); if (contentDeterminismMessage != null) { retval.setNondeterminismDetail(contentDeterminismMessage); } failIfNonDeterministicDml(parsedStmt, retval); if (m_partitioning != null) { retval.setStatementPartitioning(m_partitioning); } return retval; } /** * Output the best cost plan. * */ void finalizeBestCostPlan() { m_planSelector.finalizeOutput(); } /** * Generate best cost plans for a list of FROM sub-queries. 
     * @param subqueryNodes - list of FROM sub-queries.
     * @return ParsedResultAccumulator
     */
    private ParsedResultAccumulator getBestCostPlanForFromSubQueries(List<StmtSubqueryScan> subqueryNodes) {
        int nextPlanId = m_planSelector.m_planId;
        boolean orderIsDeterministic = true;
        boolean hasSignificantOffsetOrLimit = false;
        String isContentDeterministic = null;
        for (StmtSubqueryScan subqueryScan : subqueryNodes) {
            nextPlanId = planForParsedSubquery(subqueryScan, nextPlanId);
            CompiledPlan subqueryBestPlan = subqueryScan.getBestCostPlan();
            if (subqueryBestPlan == null) {
                throw new PlanningErrorException(m_recentErrorMsg);
            }
            orderIsDeterministic &= subqueryBestPlan.isOrderDeterministic();
            // Remember the first content non-determinism detail we find.
            if (isContentDeterministic == null && !subqueryBestPlan.isContentDeterministic()) {
                isContentDeterministic = subqueryBestPlan.nondeterminismDetail();
            }
            // Offsets or limits in subqueries are only significant (only affect content determinism)
            // when they apply to un-ordered subquery contents.
            hasSignificantOffsetOrLimit |=
                    (( ! subqueryBestPlan.isOrderDeterministic() ) && subqueryBestPlan.hasLimitOrOffset());
        }

        // need to reset plan id for the entire SQL
        m_planSelector.m_planId = nextPlanId;

        return new ParsedResultAccumulator(orderIsDeterministic,
                hasSignificantOffsetOrLimit, isContentDeterministic);
    }

    /**
     * Generate best cost plans for each Subquery expression from the list
     * @param subqueryExprs - list of subquery expressions
     * @return true if a best plan was generated for each subquery, false otherwise
     */
    private boolean getBestCostPlanForExpressionSubQueries(Set<AbstractExpression> subqueryExprs) {
        int nextPlanId = m_planSelector.m_planId;

        for (AbstractExpression expr : subqueryExprs) {
            assert(expr instanceof SelectSubqueryExpression);
            if (!(expr instanceof SelectSubqueryExpression)) {
                continue; // DEAD CODE?
            }
            SelectSubqueryExpression subqueryExpr = (SelectSubqueryExpression) expr;
            StmtSubqueryScan subqueryScan = subqueryExpr.getSubqueryScan();
            nextPlanId = planForParsedSubquery(subqueryScan, nextPlanId);
            CompiledPlan bestPlan = subqueryScan.getBestCostPlan();
            if (bestPlan == null) {
                return false;
            }
            subqueryExpr.setSubqueryNode(bestPlan.rootPlanGraph);
            // The subquery plan must not contain Receive/Send nodes because it will be executed
            // multiple times during the parent statement execution.
            if (bestPlan.rootPlanGraph.hasAnyNodeOfType(PlanNodeType.SEND)) {
                // fail the whole plan
                m_recentErrorMsg = IN_EXISTS_SCALAR_ERROR_MESSAGE;
                return false;
            }
        }
        // need to reset plan id for the entire SQL
        m_planSelector.m_planId = nextPlanId;
        return true;
    }

    /**
     * Generate a unique and correct plan for the current SQL statement context.
     * This method gets called repeatedly until it returns null, meaning there
     * are no more plans.
     *
     * @return A not-previously returned query plan or null if no more
     *         computable plans.
     */
    private CompiledPlan getNextPlan() {
        CompiledPlan retval;
        AbstractParsedStmt nextStmt = null;
        if (m_parsedSelect != null) {
            nextStmt = m_parsedSelect;
            retval = getNextSelectPlan();
        }
        else if (m_parsedInsert != null) {
            nextStmt = m_parsedInsert;
            retval = getNextInsertPlan();
        }
        else if (m_parsedDelete != null) {
            nextStmt = m_parsedDelete;
            retval = getNextDeletePlan();
            // note that for replicated tables, multi-fragment plans
            // need to divide the result by the number of partitions
        }
        else if (m_parsedUpdate != null) {
            nextStmt = m_parsedUpdate;
            retval = getNextUpdatePlan();
        }
        else if (m_parsedUnion != null) {
            nextStmt = m_parsedUnion;
            retval = getNextUnionPlan();
        }
        else if (m_parsedSwap != null) {
            nextStmt = m_parsedSwap;
            retval = getNextSwapPlan();
        }
        else {
            throw new RuntimeException(
                    "setupForNewPlans encountered unsupported statement type.");
        }

        if (retval == null || retval.rootPlanGraph == null) {
            return null;
        }

        assert (nextStmt != null);
        retval.parameters = nextStmt.getParameters();
        return retval;
    }

    /**
     * This is a UNION specific method. Generate a unique and correct plan
     * for the current SQL UNION statement by building the best plans for each individual statement
     * within the UNION.
     *
     * @return A union plan or null.
     */
    private CompiledPlan getNextUnionPlan() {
        String isContentDeterministic = null;
        // Since only the one "best" plan is considered,
        // this method should be called only once.
        if (m_bestAndOnlyPlanWasGenerated) {
            return null;
        }
        m_bestAndOnlyPlanWasGenerated = true;
        // Simply return a union plan node with a corresponding union type set
        AbstractPlanNode subUnionRoot = new UnionPlanNode(m_parsedUnion.m_unionType);
        m_recentErrorMsg = null;

        ArrayList<CompiledPlan> childrenPlans = new ArrayList<>();
        StatementPartitioning commonPartitioning = null;

        // Build best plans for the children first
        int planId = 0;
        for (AbstractParsedStmt parsedChildStmt : m_parsedUnion.m_children) {
            StatementPartitioning partitioning = (StatementPartitioning)m_partitioning.clone();
            PlanSelector planSelector = (PlanSelector) m_planSelector.clone();
            planSelector.m_planId = planId;
            PlanAssembler assembler = new PlanAssembler(m_catalogDb, partitioning, planSelector);
            CompiledPlan bestChildPlan = assembler.getBestCostPlan(parsedChildStmt);
            partitioning = assembler.m_partitioning;

            // make sure we got a winner
            if (bestChildPlan == null) {
                m_recentErrorMsg = assembler.getErrorMessage();
                if (m_recentErrorMsg == null) {
                    m_recentErrorMsg = "Unable to plan for statement. Error unknown.";
                }
                return null;
            }
            childrenPlans.add(bestChildPlan);
            // Remember the content non-determinism message for the
            // first non-deterministic child we find.
            if (isContentDeterministic == null && !bestChildPlan.isContentDeterministic()) {
                isContentDeterministic = bestChildPlan.nondeterminismDetail();
            }

            // Make sure that next child's plans won't override current ones.
            planId = planSelector.m_planId;

            // Decide whether child statements' partitioning is compatible.
            if (commonPartitioning == null) {
                commonPartitioning = partitioning;
                continue;
            }

            AbstractExpression statementPartitionExpression = partitioning.singlePartitioningExpression();
            if (commonPartitioning.requiresTwoFragments()) {
                if (partitioning.requiresTwoFragments() || statementPartitionExpression != null) {
                    // If two child statements need to use a second fragment,
                    // it can't currently be a two-fragment plan.
                    // The coordinator expects a single-table result from each partition.
// Also, currently the coordinator of a two-fragment plan is not allowed to // target a particular partition, so neither can the union of the coordinator // and a statement that wants to run single-partition. throw new PlanningErrorException( "Statements are too complex in set operation using multiple partitioned tables."); } // the new statement is apparently a replicated read and has no effect on partitioning continue; } AbstractExpression commonPartitionExpression = commonPartitioning.singlePartitioningExpression(); if (commonPartitionExpression == null) { // the prior statement(s) were apparently replicated reads // and have no effect on partitioning commonPartitioning = partitioning; continue; } if (partitioning.requiresTwoFragments()) { // Again, currently the coordinator of a two-fragment plan is not allowed to // target a particular partition, so neither can the union of the coordinator // and a statement that wants to run single-partition. throw new PlanningErrorException( "Statements are too complex in set operation using multiple partitioned tables."); } if (statementPartitionExpression == null) { // the new statement is apparently a replicated read and has no effect on partitioning continue; } if ( ! commonPartitionExpression.equals(statementPartitionExpression)) { throw new PlanningErrorException( "Statements use conflicting partitioned table filters in set operation or sub-query."); } } if (commonPartitioning != null) { m_partitioning = commonPartitioning; } // need to reset plan id for the entire UNION m_planSelector.m_planId = planId; // Add and link children plans for (CompiledPlan selectPlan : childrenPlans) { subUnionRoot.addAndLinkChild(selectPlan.rootPlanGraph); } // order by if (m_parsedUnion.hasOrderByColumns()) { subUnionRoot = handleOrderBy(m_parsedUnion, subUnionRoot); } // limit/offset if (m_parsedUnion.hasLimitOrOffset()) { subUnionRoot = handleUnionLimitOperator(subUnionRoot); } CompiledPlan retval = new CompiledPlan(); retval.rootPlanGraph = subUnionRoot; retval.setReadOnly(true); retval.sql = m_planSelector.m_sql; boolean orderIsDeterministic = m_parsedUnion.isOrderDeterministic(); boolean hasLimitOrOffset = m_parsedUnion.hasLimitOrOffset(); retval.statementGuaranteesDeterminism(hasLimitOrOffset, orderIsDeterministic, isContentDeterministic); // compute the cost - total of all children retval.cost = 0.0; for (CompiledPlan bestChildPlan : childrenPlans) { retval.cost += bestChildPlan.cost; } return retval; } private int planForParsedSubquery(StmtSubqueryScan subqueryScan, int planId) { AbstractParsedStmt subQuery = subqueryScan.getSubqueryStmt(); assert(subQuery != null); PlanSelector planSelector = (PlanSelector) m_planSelector.clone(); planSelector.m_planId = planId; StatementPartitioning currentPartitioning = (StatementPartitioning)m_partitioning.clone(); PlanAssembler assembler = new PlanAssembler(m_catalogDb, currentPartitioning, planSelector); CompiledPlan compiledPlan = assembler.getBestCostPlan(subQuery); // make sure we got a winner if (compiledPlan == null) { String tbAlias = subqueryScan.getTableAlias(); m_recentErrorMsg = "Subquery statement for table " + tbAlias + " has error: " + assembler.getErrorMessage(); return planSelector.m_planId; } subqueryScan.setSubqueriesPartitioning(currentPartitioning); // Remove the coordinator send/receive pair. // It will be added later for the whole plan. //TODO: It may make more sense to plan ahead and not generate the send/receive pair // at all for subquery contexts where it is not needed. 
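        // Illustrative (assumed) example, not from the original comments: for a
        // parent statement like
        //     SELECT T.C1 FROM (SELECT C1 FROM R) T WHERE T.C1 > 0;
        // the subquery's best plan may end in its own coordinator Send/Receive
        // pair; stripping that pair here lets the parent statement place a single
        // pair at the top of the whole plan instead of stacking coordinator
        // fragments.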
        if (subqueryScan.canRunInOneFragment()) {
            // The MergeReceivePlanNode always has an inline ORDER BY node and may have
            // LIMIT/OFFSET and aggregation node(s). Removing the MergeReceivePlanNode will
            // also remove its inline node(s) which may produce an invalid access plan.
            // For example,
            //     SELECT TC1 FROM (SELECT C1 AS TC1 FROM P ORDER BY C1) PT LIMIT 4;
            // where P is partitioned and C1 is a non-partitioned index column.
            // Removing the subquery MergeReceivePlanNode and its ORDER BY node results
            // in an invalid access plan, because the subquery result order is significant
            // in this case.
            // The concern with generally keeping the (Merge)Receive node in the subquery is
            // that it would needlessly generate more-than-2-fragment plans in cases
            // where 2 fragments could have done the job.
            if ( ! compiledPlan.rootPlanGraph.hasAnyNodeOfClass(MergeReceivePlanNode.class)) {
                compiledPlan.rootPlanGraph = removeCoordinatorSendReceivePair(compiledPlan.rootPlanGraph);
            }
        }

        subqueryScan.setBestCostPlan(compiledPlan);
        return planSelector.m_planId;
    }

    /**
     * Remove the coordinator send/receive pair, if any, from the graph.
     *
     * @param root the complete plan node.
     * @return the plan without the send/receive pair.
     */
    public static AbstractPlanNode removeCoordinatorSendReceivePair(AbstractPlanNode root) {
        assert(root != null);
        return removeCoordinatorSendReceivePairRecursive(root, root);
    }

    private static AbstractPlanNode removeCoordinatorSendReceivePairRecursive(
            AbstractPlanNode root, AbstractPlanNode current) {
        if (current instanceof AbstractReceivePlanNode) {
            assert(current.getChildCount() == 1);

            AbstractPlanNode child = current.getChild(0);
            assert(child instanceof SendPlanNode);

            assert(child.getChildCount() == 1);
            child = child.getChild(0);
            child.clearParents();
            if (current == root) {
                return child;
            }
            assert(current.getParentCount() == 1);
            AbstractPlanNode parent = current.getParent(0);
            parent.unlinkChild(current);
            parent.addAndLinkChild(child);
            return root;
        }

        if (current.getChildCount() == 1) {
            // This is still a coordinator node
            return removeCoordinatorSendReceivePairRecursive(root, current.getChild(0));
        }

        // We have hit a multi-child plan node -- a nestloop join or a union.
        // Can we really assume that there is no send/receive below this point?
        // TODO: It seems to me (--paul) that for a replicated-to-partitioned
        // left outer join, we should be following the second (partitioned)
        // child node of a nestloop join.
        // I'm not sure what the correct behavior is for a union.
        return root;
    }

    /**
     * For each Subquery node in the plan tree attach the subquery plan to the parent node.
     * @param parentPlan the initial plan
     * @return A complete plan tree for the entire SQL.
*/ private AbstractPlanNode connectChildrenBestPlans(AbstractPlanNode parentPlan) { if (parentPlan instanceof AbstractScanPlanNode) { AbstractScanPlanNode scanNode = (AbstractScanPlanNode) parentPlan; StmtTableScan tableScan = scanNode.getTableScan(); if (tableScan instanceof StmtSubqueryScan) { CompiledPlan bestCostPlan = ((StmtSubqueryScan)tableScan).getBestCostPlan(); assert (bestCostPlan != null); AbstractPlanNode subQueryRoot = bestCostPlan.rootPlanGraph; subQueryRoot.disconnectParents(); scanNode.clearChildren(); scanNode.addAndLinkChild(subQueryRoot); } } else { for (int i = 0; i < parentPlan.getChildCount(); ++i) { connectChildrenBestPlans(parentPlan.getChild(i)); } } return parentPlan; } private CompiledPlan getNextSelectPlan() { assert (m_subAssembler != null); // A matview reaggregation template plan may have been initialized // with a post-predicate expression moved from the statement's // join tree prior to any subquery planning. // Since normally subquery planning is driven from the join tree, // any subqueries that are moved out of the join tree would need // to be planned separately. // This planning would need to be done prior to calling // m_subAssembler.nextPlan() // because it can have query partitioning implications. // Under the current query limitations, the partitioning implications // are very simple -- subqueries are not allowed in multipartition // queries against partitioned data, so detection of a subquery in // the same query as a matview reaggregation can just return an error, // without any need for subquery planning here. HashAggregatePlanNode reAggNode = null; HashAggregatePlanNode mvReAggTemplate = m_parsedSelect.m_mvFixInfo.getReAggregationPlanNode(); if (mvReAggTemplate != null) { reAggNode = new HashAggregatePlanNode(mvReAggTemplate); AbstractExpression postPredicate = reAggNode.getPostPredicate(); if (postPredicate != null && postPredicate.hasSubquerySubexpression()) { // For now, this is just a special case violation of the limitation on // use of subquery expressions in MP queries on partitioned data. // That special case was going undetected when we didn't flag it here. m_recentErrorMsg = IN_EXISTS_SCALAR_ERROR_MESSAGE; return null; } // // Something more along these lines would have to be enabled // // to allow expression subqueries to be used in multi-partition // // matview queries. // if (!getBestCostPlanForExpressionSubQueries(subqueryExprs)) { // // There was at least one sub-query and we should have a compiled plan for it // return null; // } } AbstractPlanNode subSelectRoot = m_subAssembler.nextPlan(); if (subSelectRoot == null) { m_recentErrorMsg = m_subAssembler.m_recentErrorMsg; return null; } AbstractPlanNode root = subSelectRoot; boolean mvFixNeedsProjection = false; /* * If the access plan for the table in the join order was for a * distributed table scan there must be a send/receive pair at the top * EXCEPT for the special outer join case in which a replicated table * was on the OUTER side of an outer join across from the (joined) scan * of the partitioned table(s) (all of them) in the query. In that case, * the one required send/receive pair is already in the plan below the * inner side of a NestLoop join. 
         */
        if (m_partitioning.requiresTwoFragments()) {
            boolean mvFixInfoCoordinatorNeeded = true;
            boolean mvFixInfoEdgeCaseOuterJoin = false;

            ArrayList<AbstractPlanNode> receivers = root.findAllNodesOfClass(AbstractReceivePlanNode.class);
            if (receivers.size() == 1) {
                // The subplan SHOULD be good to go, but just make sure that it doesn't
                // scan a partitioned table except under the ReceivePlanNode that was just found.

                // Edge cases: left outer join with replicated table.
                if (m_parsedSelect.m_mvFixInfo.needed()) {
                    mvFixInfoCoordinatorNeeded = false;
                    AbstractPlanNode receiveNode = receivers.get(0);
                    if (receiveNode.getParent(0) instanceof NestLoopPlanNode) {
                        if (subSelectRoot.hasInlinedIndexScanOfTable(m_parsedSelect.m_mvFixInfo.getMVTableName())) {
                            return getNextSelectPlan();
                        }

                        List<AbstractPlanNode> nljs = receiveNode.findAllNodesOfType(PlanNodeType.NESTLOOP);
                        List<AbstractPlanNode> nlijs = receiveNode.findAllNodesOfType(PlanNodeType.NESTLOOPINDEX);

                        // outer join edge case does not have any join plan node under receive node.
                        // This is like a single table case.
                        if (nljs.size() + nlijs.size() == 0) {
                            mvFixInfoEdgeCaseOuterJoin = true;
                        }
                        root = handleMVBasedMultiPartQuery(reAggNode, root, mvFixInfoEdgeCaseOuterJoin);
                    }
                }
            }
            else {
                if (receivers.size() > 0) {
                    throw new PlanningErrorException(
                            "This special case join between an outer replicated table and " +
                            "an inner partitioned table is too complex and is not supported.");
                }
                root = SubPlanAssembler.addSendReceivePair(root);
                // Root is a receive node here.
                assert(root instanceof ReceivePlanNode);

                if (m_parsedSelect.mayNeedAvgPushdown()) {
                    m_parsedSelect.switchOptimalSuiteForAvgPushdown();
                }
                if (m_parsedSelect.m_tableList.size() > 1 && m_parsedSelect.m_mvFixInfo.needed()
                        && subSelectRoot.hasInlinedIndexScanOfTable(m_parsedSelect.m_mvFixInfo.getMVTableName())) {
                    // A join query on a partitioned materialized view needs re-aggregation
                    // work on the coordinator. An index scan on the MV table cannot be
                    // supported here, so neither can an inlined index scan under a
                    // nested-loop index join.
                    return getNextSelectPlan();
                }
            }

            root = handleAggregationOperators(root);

            // Process the re-aggregate plan node and insert it into the plan.
            if (m_parsedSelect.m_mvFixInfo.needed() && mvFixInfoCoordinatorNeeded) {
                AbstractPlanNode tmpRoot = root;
                root = handleMVBasedMultiPartQuery(reAggNode, root, mvFixInfoEdgeCaseOuterJoin);
                if (root != tmpRoot) {
                    mvFixNeedsProjection = true;
                }
            }
        }
        else {
            /*
             * There is no receive node and root is a single partition plan.
             */
            // If there is no receive plan node and no distributed plan has been generated,
            // the fix set for MV is not needed.
            m_parsedSelect.m_mvFixInfo.setNeeded(false);
            root = handleAggregationOperators(root);
        }

        // If we have a windowed expression in the display list we want to
        // add a PartitionByPlanNode here.
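        // An illustrative (assumed) example: a query like
        //     SELECT A, RANK() OVER (PARTITION BY A ORDER BY B) FROM T;
        // takes this branch; handleWindowedOperators() places the
        // WindowFunctionPlanNode (with whatever ordering it needs) above the scan.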
        if (m_parsedSelect.hasWindowFunctionExpression()) {
            root = handleWindowedOperators(root);
        }

        if (m_parsedSelect.hasOrderByColumns()) {
            root = handleOrderBy(m_parsedSelect, root);
            if (m_parsedSelect.isComplexOrderBy() && root instanceof OrderByPlanNode) {
                AbstractPlanNode child = root.getChild(0);
                AbstractPlanNode grandChild = child.getChild(0);

                // swap the ORDER BY and complex aggregate Projection node
                if (child instanceof ProjectionPlanNode) {
                    root.unlinkChild(child);
                    child.unlinkChild(grandChild);

                    child.addAndLinkChild(root);
                    root.addAndLinkChild(grandChild);

                    // update the new root
                    root = child;
                }
                else if (m_parsedSelect.hasDistinctWithGroupBy() &&
                        child.getPlanNodeType() == PlanNodeType.HASHAGGREGATE &&
                        grandChild.getPlanNodeType() == PlanNodeType.PROJECTION) {
                    AbstractPlanNode grandGrandChild = grandChild.getChild(0);

                    child.clearParents();
                    root.clearChildren();
                    grandGrandChild.clearParents();
                    grandChild.clearChildren();

                    grandChild.addAndLinkChild(root);
                    root.addAndLinkChild(grandGrandChild);

                    root = child;
                }
            }
        }

        // Add a project node if we need one. Some types of nodes can have their
        // own inline projection nodes, while others need an out-of-line projection
        // node.
        if (mvFixNeedsProjection || needProjectionNode(root)) {
            root = addProjection(root);
        }

        if (m_parsedSelect.hasLimitOrOffset()) {
            root = handleSelectLimitOperator(root);
        }

        CompiledPlan plan = new CompiledPlan();
        plan.rootPlanGraph = root;
        plan.setReadOnly(true);
        boolean orderIsDeterministic = m_parsedSelect.isOrderDeterministic();
        boolean hasLimitOrOffset = m_parsedSelect.hasLimitOrOffset();
        String contentDeterminismMessage = m_parsedSelect.getContentDeterminismMessage();
        plan.statementGuaranteesDeterminism(hasLimitOrOffset, orderIsDeterministic, contentDeterminismMessage);

        // Apply the micro-optimizations:
        // LIMIT push down, Table count / Counting Index, Optimized Min/Max
        MicroOptimizationRunner.applyAll(plan, m_parsedSelect);

        return plan;
    }

    /**
     * Return true if the plan referenced by root node needs a
     * projection node appended to the top.
     *
     * This method does a lot of "if this node is an
     * instance of this class.... else if this node is an
     * instance of this other class..." Perhaps it could be replaced
     * by a virtual method on AbstractPlanNode?
     *
     * @param root The root node of a plan
     * @return true if a project node is required
     */
    private boolean needProjectionNode(AbstractPlanNode root) {
        if (!root.planNodeClassNeedsProjectionNode()) {
            return false;
        }

        // If there is a complex group by at this point, the display columns
        // contain all the order by columns, so another projection node is not
        // required on top of the sort node.
        // If there is a complex aggregation case, the projection plan node is already added
        // right above the group by plan node. In the future, we may inline that projection node.
        if (m_parsedSelect.hasComplexGroupby() || m_parsedSelect.hasComplexAgg()) {
            return false;
        }

        if (root instanceof AbstractReceivePlanNode &&
                m_parsedSelect.hasPartitionColumnInGroupby()) {
            // The top aggregate has been removed; its schema is exactly the same as
            // its local aggregate node.
return false; } return true; } // ENG-4909 Bug: currently disable NESTLOOPINDEX plan for IN private static boolean disableNestedLoopIndexJoinForInComparison (AbstractPlanNode root, AbstractParsedStmt parsedStmt) { if (root.getPlanNodeType() == PlanNodeType.NESTLOOPINDEX) { assert(parsedStmt != null); return true; } return false; } /** Returns true if this DELETE can be executed in the EE as a truncate operation */ static private boolean deleteIsTruncate(ParsedDeleteStmt stmt, AbstractPlanNode plan) { if (!(plan instanceof SeqScanPlanNode)) { return false; } // Assume all index scans have filters in this context, so only consider seq scans. SeqScanPlanNode seqScanNode = (SeqScanPlanNode)plan; if (seqScanNode.getPredicate() != null) { return false; } if (stmt.hasLimitOrOffset()) { return false; } return true; } private CompiledPlan getNextDeletePlan() { assert (m_subAssembler != null); // figure out which table we're deleting from assert (m_parsedDelete.m_tableList.size() == 1); Table targetTable = m_parsedDelete.m_tableList.get(0); AbstractPlanNode subSelectRoot = m_subAssembler.nextPlan(); if (subSelectRoot == null) { return null; } // ENG-4909 Bug: currently disable NESTLOOPINDEX plan for IN if (disableNestedLoopIndexJoinForInComparison(subSelectRoot, m_parsedDelete)) { // Recursion here, now that subAssembler.nextPlan() has been called, // simply jumps ahead to the next plan (if any). return getNextDeletePlan(); } boolean isSinglePartitionPlan = m_partitioning.wasSpecifiedAsSingle() || m_partitioning.isInferredSingle(); // generate the delete node with the right target table DeletePlanNode deleteNode = new DeletePlanNode(); deleteNode.setTargetTableName(targetTable.getTypeName()); assert(subSelectRoot instanceof AbstractScanPlanNode); // If the scan matches all rows, we can throw away the scan // nodes and use a truncate delete node. if (deleteIsTruncate(m_parsedDelete, subSelectRoot)) { deleteNode.setTruncate(true); } else { // User may have specified an ORDER BY ... LIMIT clause if (m_parsedDelete.orderByColumns().size() > 0 && !isSinglePartitionPlan && !targetTable.getIsreplicated()) { throw new PlanningErrorException( "DELETE statements affecting partitioned tables must " + "be able to execute on one partition " + "when ORDER BY and LIMIT or OFFSET clauses " + "are present."); } boolean needsOrderByNode = isOrderByNodeRequired(m_parsedDelete, subSelectRoot); AbstractExpression addressExpr = new TupleAddressExpression(); NodeSchema proj_schema = new NodeSchema(); // This planner-created column is magic. 
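            // (An added best-effort explanation: the TupleAddressExpression column
            // below carries each matching row's storage address, which the delete
            // executor uses to locate the exact rows to remove without re-running
            // the scan's predicate.)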
proj_schema.addColumn( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "tuple_address", "tuple_address", addressExpr); if (needsOrderByNode) { // Projection will need to pass the sort keys to the order by node for (ParsedColInfo col : m_parsedDelete.orderByColumns()) { proj_schema.addColumn(col.asSchemaColumn()); } } ProjectionPlanNode projectionNode = new ProjectionPlanNode(proj_schema); subSelectRoot.addInlinePlanNode(projectionNode); AbstractPlanNode root = subSelectRoot; if (needsOrderByNode) { OrderByPlanNode ob = buildOrderByPlanNode(m_parsedDelete.orderByColumns()); ob.addAndLinkChild(root); root = ob; } if (m_parsedDelete.hasLimitOrOffset()) { assert(m_parsedDelete.orderByColumns().size() > 0); root.addInlinePlanNode(m_parsedDelete.limitPlanNode()); } deleteNode.addAndLinkChild(root); } CompiledPlan plan = new CompiledPlan(); plan.setReadOnly(false); // check non-determinism status // treat this as deterministic for reporting purposes: // delete statements produce just one row that is the // number of rows affected boolean orderIsDeterministic = true; boolean hasLimitOrOffset = m_parsedDelete.hasLimitOrOffset(); // The delete statement cannot be inherently content non-deterministic. // So, the last parameter is always null. plan.statementGuaranteesDeterminism(hasLimitOrOffset, orderIsDeterministic, null); if (isSinglePartitionPlan) { plan.rootPlanGraph = deleteNode; return plan; } // Add a compensating sum of modified tuple counts or a limit 1 // AND a send on top of the union-like receive node. boolean isReplicated = targetTable.getIsreplicated(); plan.rootPlanGraph = addCoordinatorToDMLNode(deleteNode, isReplicated); return plan; } /** * Get the next (only) plan for a VoltDB SWAP TABLE statement. * These are pretty simple and will only generate a single plan. * * @return The next (only) plan for a given SWAP TABLE statement, then null. */ private CompiledPlan getNextSwapPlan() { // there's really only one way to do a swap, so just // plan it the right way once, then return null after that if (m_bestAndOnlyPlanWasGenerated) { return null; } m_bestAndOnlyPlanWasGenerated = true; // figure out which tables we're swapping assert (m_parsedSwap.m_tableList.size() == 2); Table theTable = m_parsedSwap.m_tableList.get(0); Table otherTable = m_parsedSwap.m_tableList.get(1); CompiledPlan retval = new CompiledPlan(); retval.setReadOnly(false); // the root of the SWAP TABLE plan is always a SwapPlanNode SwapTablesPlanNode swapNode = new SwapTablesPlanNode(); swapNode.initializeSwapTablesPlanNode(theTable, otherTable); // SWAP commands are only run single-partition when invoked from // an explicitly declared single-partition stored procedure. if (m_partitioning.wasSpecifiedAsSingle()) { retval.rootPlanGraph = swapNode; return retval; } // Add a compensating sum of modified tuple counts or a limit 1 // AND a send on top of the union-like receive node. boolean isReplicated = theTable.getIsreplicated(); retval.rootPlanGraph = addCoordinatorToDMLNode(swapNode, isReplicated); return retval; } private CompiledPlan getNextUpdatePlan() { assert (m_subAssembler != null); AbstractPlanNode subSelectRoot = m_subAssembler.nextPlan(); if (subSelectRoot == null) { return null; } if (disableNestedLoopIndexJoinForInComparison(subSelectRoot, m_parsedUpdate)) { // Recursion here, now that subAssembler.nextPlan() has been called, // simply jumps ahead to the next plan (if any). 
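            // (A hedged clarification: each m_subAssembler.nextPlan() call yields a
            // different join order / access path, so this tail call simply discards
            // the rejected candidate and evaluates the next one.)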
return getNextUpdatePlan(); } UpdatePlanNode updateNode = new UpdatePlanNode(); //FIXME: does this assert need to be relaxed in the face of non-from-clause subquery support? // It was not in Mike A's original branch. assert (m_parsedUpdate.m_tableList.size() == 1); Table targetTable = m_parsedUpdate.m_tableList.get(0); updateNode.setTargetTableName(targetTable.getTypeName()); // set this to false until proven otherwise updateNode.setUpdateIndexes(false); TupleAddressExpression tae = new TupleAddressExpression(); NodeSchema proj_schema = new NodeSchema(); // This planner-generated column is magic. proj_schema.addColumn( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "tuple_address", "tuple_address", tae); // get the set of columns affected by indexes Set<String> affectedColumns = getIndexedColumnSetForTable(targetTable); // add the output columns we need to the projection // // Right now, the EE is going to use the original column names // and compare these to the persistent table column names in the // update executor in order to figure out which table columns get // updated. We'll associate the actual values with VOLT_TEMP_TABLE // to avoid any false schema/column matches with the actual table. for (Entry<Column, AbstractExpression> colEntry : m_parsedUpdate.columns.entrySet()) { Column col = colEntry.getKey(); String colName = col.getTypeName(); AbstractExpression expr = colEntry.getValue(); expr.setInBytes(colEntry.getKey().getInbytes()); proj_schema.addColumn( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, colName, colName, expr); // check if this column is an indexed column if (affectedColumns.contains(colName)) { updateNode.setUpdateIndexes(true); } } ProjectionPlanNode projectionNode = new ProjectionPlanNode(proj_schema); // add the projection inline (TODO: this will break if more than one // layer is below this) // // When we inline this projection into the scan, we're going // to overwrite any original projection that we might have inlined // in order to simply cull the columns from the persistent table. assert(subSelectRoot instanceof AbstractScanPlanNode); subSelectRoot.addInlinePlanNode(projectionNode); // connect the nodes to build the graph updateNode.addAndLinkChild(subSelectRoot); CompiledPlan retval = new CompiledPlan(); retval.setReadOnly (false); if (targetTable.getIsreplicated()) { retval.replicatedTableDML = true; } //FIXME: This assumption was only safe when we didn't support updates // w/ possibly non-deterministic subqueries. // Is there some way to integrate a "subquery determinism" check here? // because we didn't support updates with limits, either. // Since the update cannot be inherently non-deterministic, there is // no message, and the last parameter is null. retval.statementGuaranteesDeterminism(false, true, null); if (m_partitioning.wasSpecifiedAsSingle() || m_partitioning.isInferredSingle()) { retval.rootPlanGraph = updateNode; return retval; } // Send the local result counts to the coordinator. // Add a compensating sum of modified tuple counts or a limit 1 // AND a send on top of the union-like receive node. 
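        // Sketch of the resulting coordinator fragment (an assumed shape, for
        // illustration only):
        //     Send <- SUM(modified_tuples) <- Receive <- [distributed DML fragment]
        // For a replicated target, a LIMIT 1 node replaces the SUM, since every
        // partition reports the same modified-tuple count.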
        boolean isReplicated = targetTable.getIsreplicated();
        retval.rootPlanGraph = addCoordinatorToDMLNode(updateNode, isReplicated);
        return retval;
    }

    private static AbstractExpression castExprIfNeeded(
            AbstractExpression expr, Column column) {
        if (expr.getValueType().getValue() != column.getType() ||
                expr.getValueSize() != column.getSize()) {
            expr = new OperatorExpression(ExpressionType.OPERATOR_CAST, expr, null);
            expr.setValueType(VoltType.get((byte) column.getType()));
            // We don't really support parameterized casting, such as specifically to "VARCHAR(3)"
            // vs. just VARCHAR, but set the size parameter anyway in this case to make sure that
            // the tuple that gets the result of the cast can be properly formatted as inline.
            // A too-wide value survives the cast (to generic VARCHAR of any length) but the
            // attempt to cache the result in the inline temp tuple storage will throw an early
            // runtime error on behalf of the target table column.
            // The important thing here is to leave the formatting hint in the output schema that
            // drives the temp tuple layout.
            expr.setValueSize(column.getSize());
        }
        return expr;
    }

    /**
     * Get the next (only) plan for a SQL insertion. Inserts are pretty simple
     * and this will only generate a single plan.
     *
     * @return The next (only) plan for a given insert statement, then null.
     */
    private CompiledPlan getNextInsertPlan() {
        // there's really only one way to do an insert, so just
        // do it the right way once, then return null after that
        if (m_bestAndOnlyPlanWasGenerated) {
            return null;
        }
        m_bestAndOnlyPlanWasGenerated = true;

        // The child of the insert node produces rows containing values
        // from one of
        // - A VALUES clause. In this case the child node is a MaterializeNode
        // - a SELECT statement as in "INSERT INTO ... SELECT ...". In this case
        //   the child node is the root of an arbitrary subplan.

        // figure out which table we're inserting into
        assert (m_parsedInsert.m_tableList.size() == 1);
        Table targetTable = m_parsedInsert.m_tableList.get(0);
        StmtSubqueryScan subquery = m_parsedInsert.getSubqueryScan();
        CompiledPlan retval = null;
        String isContentDeterministic = null;
        if (subquery != null) {
            isContentDeterministic = subquery.calculateContentDeterminismMessage();
            if (subquery.getBestCostPlan() == null) {
                // Seems like this should really be caught earlier
                // in getBestCostPlan, above.
                throw new PlanningErrorException("INSERT INTO ... SELECT subquery could not be planned: "
                        + m_recentErrorMsg);
            }

            boolean targetIsExportTable = tableListIncludesExportOnly(m_parsedInsert.m_tableList);
            InsertSubPlanAssembler subPlanAssembler =
                    new InsertSubPlanAssembler(m_catalogDb, m_parsedInsert, m_partitioning,
                            targetIsExportTable);
            AbstractPlanNode subplan = subPlanAssembler.nextPlan();
            if (subplan == null) {
                throw new PlanningErrorException(subPlanAssembler.m_recentErrorMsg);
            }
            assert(m_partitioning.isJoinValid());

            // Use the subquery's plan as the basis for the insert plan.
            retval = subquery.getBestCostPlan();
        }
        else {
            retval = new CompiledPlan();
        }
        retval.setReadOnly(false);

        // Iterate over each column in the table we're inserting into:
        // - Make sure we're supplying values for columns that require it.
        //   For a normal INSERT, these are the usual non-nullable values that
        //   don't have a default value.
        //   For an UPSERT, the (only) required values are the primary key
        //   components. Other required values can be supplied from the
        //   existing row in "UPDATE mode".
        //   If some other value is required
        //   for an INSERT, UPSERT's "INSERT mode" will throw a runtime
        //   constraint violation as the INSERT operation tries to set the
        //   non-nullable column to null.
        // - Set partitioning expressions for VALUES (...) case.
        //   TODO: it would be good someday to do the same kind of processing
        //   for the INSERT ... SELECT ... case, by analyzing the subquery.
        if (m_parsedInsert.m_isUpsert) {
            boolean hasPrimaryKey = false;
            for (Constraint constraint : targetTable.getConstraints()) {
                if (constraint.getType() != ConstraintType.PRIMARY_KEY.getValue()) {
                    continue;
                }
                hasPrimaryKey = true;
                for (ColumnRef colRef : constraint.getIndex().getColumns()) {
                    int primary = colRef.getColumn().getIndex();
                    // Every primary key column must be supplied explicitly,
                    // so reset this flag for each key column checked.
                    boolean targetsPrimaryKey = false;
                    for (Column targetCol : m_parsedInsert.m_columns.keySet()) {
                        if (targetCol.getIndex() == primary) {
                            targetsPrimaryKey = true;
                            break;
                        }
                    }
                    if (! targetsPrimaryKey) {
                        throw new PlanningErrorException("UPSERT on table \"" +
                                targetTable.getTypeName() +
                                "\" must specify a value for primary key \"" +
                                colRef.getColumn().getTypeName() + "\".");
                    }
                }
            }
            if (! hasPrimaryKey) {
                throw new PlanningErrorException("UPSERT is not allowed on table \"" +
                        targetTable.getTypeName() + "\" that has no primary key.");
            }
        }

        CatalogMap<Column> targetTableColumns = targetTable.getColumns();
        for (Column col : targetTableColumns) {
            boolean needsValue = (!m_parsedInsert.m_isUpsert) &&
                    (col.getNullable() == false) && (col.getDefaulttype() == 0);
            if (needsValue && !m_parsedInsert.m_columns.containsKey(col)) {
                // This check could be done during parsing?
                throw new PlanningErrorException("Column " + col.getName()
                        + " has no default and is not nullable.");
            }

            // hint that this statement can be executed SP.
            if (col.equals(m_partitioning.getPartitionColForDML()) && subquery == null) {
                // When AdHoc insert-into-select is supported, we'll need to be able to infer
                // partitioning of the sub-select
                AbstractExpression expr = m_parsedInsert.getExpressionForPartitioning(col);
                String fullColumnName = targetTable.getTypeName() + "." + col.getTypeName();
                m_partitioning.addPartitioningExpression(fullColumnName, expr, expr.getValueType());
            }
        }

        NodeSchema matSchema = null;
        if (subquery == null) {
            matSchema = new NodeSchema();
        }

        int[] fieldMap = new int[m_parsedInsert.m_columns.size()];
        int i = 0;

        // The insert statement's set of columns is contained in a LinkedHashMap,
        // meaning that we'll iterate over the columns here in the order that the user
        // specified them in the original SQL. (If the statement didn't specify any
        // columns, then all the columns will be in the map in schema order.)
        // - Build the field map, used by insert executor to build tuple to execute
        // - For VALUES(...) insert statements, build the materialize node's schema
        for (Map.Entry<Column, AbstractExpression> e : m_parsedInsert.m_columns.entrySet()) {
            Column col = e.getKey();
            fieldMap[i] = col.getIndex();

            if (matSchema != null) {
                AbstractExpression valExpr = e.getValue();
                valExpr.setInBytes(col.getInbytes());

                // Patch over any mismatched expressions with an explicit cast.
                // Most impossible-to-cast type combinations should have already been caught by the
                // parser, but there are also runtime checks in the casting code
                // -- such as for out of range values.
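                // Hypothetical example (table and column names assumed): with
                // int_col declared INTEGER,
                //     INSERT INTO T (int_col) VALUES (40 + 2);
                // a BIGINT-typed value expression would be wrapped in an explicit
                // CAST to INTEGER here so the materialized temp tuple matches the
                // target column's type and size exactly.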
valExpr = castExprIfNeeded(valExpr, col); matSchema.addColumn( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, col.getTypeName(), col.getTypeName(), valExpr); } i++; } // the root of the insert plan is always an InsertPlanNode InsertPlanNode insertNode = new InsertPlanNode(); insertNode.setTargetTableName(targetTable.getTypeName()); if (subquery != null) { insertNode.setSourceIsPartitioned(! subquery.getIsReplicated()); } // The field map tells the insert node // where to put values produced by child into the row to be inserted. insertNode.setFieldMap(fieldMap); if (matSchema != null) { MaterializePlanNode matNode = new MaterializePlanNode(matSchema); // connect the insert and the materialize nodes together insertNode.addAndLinkChild(matNode); retval.statementGuaranteesDeterminism(false, true, isContentDeterministic); } else { insertNode.addAndLinkChild(retval.rootPlanGraph); } if (m_partitioning.wasSpecifiedAsSingle() || m_partitioning.isInferredSingle()) { insertNode.setMultiPartition(false); retval.rootPlanGraph = insertNode; return retval; } insertNode.setMultiPartition(true); // Add a compensating sum of modified tuple counts or a limit 1 // AND a send on top of a union-like receive node. boolean isReplicated = targetTable.getIsreplicated(); retval.rootPlanGraph = addCoordinatorToDMLNode(insertNode, isReplicated); return retval; } /** * Add a receive node, a sum or limit node, and a send node to the given DML node. * If the DML target is a replicated table, it will add a limit node; * otherwise it adds a sum node. * * @param dmlRoot * @param isReplicated Whether or not the target table is a replicated table. * @return */ private static AbstractPlanNode addCoordinatorToDMLNode( AbstractPlanNode dmlRoot, boolean isReplicated) { dmlRoot = SubPlanAssembler.addSendReceivePair(dmlRoot); AbstractPlanNode sumOrLimitNode; if (isReplicated) { // Replicated table DML result doesn't need to be summed. All partitions should // modify the same number of tuples in a replicated table, so just pick the result from // any partition. LimitPlanNode limitNode = new LimitPlanNode(); sumOrLimitNode = limitNode; limitNode.setLimit(1); } else { // create the nodes being pushed on top of dmlRoot. AggregatePlanNode countNode = new AggregatePlanNode(); sumOrLimitNode = countNode; // configure the count aggregate (sum) node to produce a single // output column containing the result of the sum. // Create a TVE that should match the tuple count input column // This TVE is magic. // really really need to make this less hard-wired TupleValueExpression count_tve = new TupleValueExpression( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "modified_tuples", "modified_tuples", 0); count_tve.setValueType(VoltType.BIGINT); count_tve.setValueSize(VoltType.BIGINT.getLengthInBytesForFixedTypes()); countNode.addAggregate(ExpressionType.AGGREGATE_SUM, false, 0, count_tve); // The output column. Not really based on a TVE (it is really the // count expression represented by the count configured above). But // this is sufficient for now. This looks identical to the above // TVE but it's logically different so we'll create a fresh one.
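// For orientation, the coordinator fragment assembled by this method ends up // shaped roughly as (sketch): SEND <- SUM(modified_tuples) <- RECEIVE <- dmlRoot // for a partitioned target, or SEND <- LIMIT 1 <- RECEIVE <- dmlRoot for a replicated one.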
TupleValueExpression tve = new TupleValueExpression( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "modified_tuples", "modified_tuples", 0); tve.setValueType(VoltType.BIGINT); tve.setValueSize(VoltType.BIGINT.getLengthInBytesForFixedTypes()); NodeSchema count_schema = new NodeSchema(); count_schema.addColumn( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "modified_tuples", "modified_tuples", tve); countNode.setOutputSchema(count_schema); } // connect the nodes to build the graph sumOrLimitNode.addAndLinkChild(dmlRoot); SendPlanNode sendNode = new SendPlanNode(); sendNode.addAndLinkChild(sumOrLimitNode); return sendNode; } /** * Given a relatively complete plan-sub-graph, apply a trivial projection * (filter) to it. If the root node can embed the projection do so. If not, * add a new projection node. * * @param rootNode * The root of the plan-sub-graph to add the projection to. * @return The new root of the plan-sub-graph (might be the same as the * input). */ private AbstractPlanNode addProjection(AbstractPlanNode rootNode) { assert (m_parsedSelect != null); assert (m_parsedSelect.m_displayColumns != null); // Build the output schema for the projection based on the display columns NodeSchema proj_schema = m_parsedSelect.getFinalProjectionSchema(); for (SchemaColumn col : proj_schema.getColumns()) { // Adjust the differentiator fields of TVEs, since they need to // reflect the inlined projection node in scan nodes. AbstractExpression colExpr = col.getExpression(); Collection<TupleValueExpression> allTves = ExpressionUtil.getTupleValueExpressions(colExpr); for (TupleValueExpression tve : allTves) { if ( ! tve.needsDifferentiation()) { // PartitionByPlanNode and a following OrderByPlanNode // can have an internally generated RANK column. // These do not need to have their differentiator updated, // since it's only used for disambiguation in some // combinations of "SELECT *" and subqueries. // In fact attempting to adjust this special column will // cause failed assertions. The tve for this expression // will be marked as not needing differentiation, // so we just ignore it here. continue; } rootNode.adjustDifferentiatorField(tve); } } ProjectionPlanNode projectionNode = new ProjectionPlanNode(); projectionNode.setOutputSchemaWithoutClone(proj_schema); // If the projection can be done inline, then add the // projection node inline. Even if the rootNode is a // scan, if we have a windowed expression we need to // add it out of line. if (rootNode instanceof AbstractScanPlanNode) { rootNode.addInlinePlanNode(projectionNode); return rootNode; } projectionNode.addAndLinkChild(rootNode); return projectionNode; } /** Given a list of ORDER BY columns, construct and return an OrderByPlanNode. */ private static OrderByPlanNode buildOrderByPlanNode(List<ParsedColInfo> cols) { OrderByPlanNode n = new OrderByPlanNode(); for (ParsedColInfo col : cols) { n.addSort(col.expression, col.ascending ? SortDirectionType.ASC : SortDirectionType.DESC); } return n; } /** * Determine if an OrderByPlanNode is needed. This may return false if the * statement has no ORDER BY clause, or if the subtree is already producing * rows in the correct order. Note that a hash or partial aggregate node will * cause this to return true, since it invalidates the index ordering, while * a serial aggregate node does not.
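* For example (illustrative, assuming an index on T(a)): "SELECT * FROM T ORDER BY a" * needs no OrderByPlanNode when the index scan already reports a valid sort direction, * while "SELECT * FROM T ORDER BY b" does.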
* * @param parsedStmt The statement whose plan may need an OrderByPlanNode * @param root The subtree which may need its output tuples ordered * @return true if the plan needs an OrderByPlanNode, false otherwise */ private static boolean isOrderByNodeRequired(AbstractParsedStmt parsedStmt, AbstractPlanNode root) { // Only sort when the statement has an ORDER BY. if ( ! parsedStmt.hasOrderByColumns()) { return false; } // Skip the explicit ORDER BY plan step if an IndexScan is already providing the equivalent ordering. // Note that even tree index scans that produce values in their own "key order" only report // their sort direction != SortDirectionType.INVALID // when they enforce an ordering equivalent to the one requested in the ORDER BY // or window function clause. Even an intervening non-hash aggregate will not interfere // in this optimization. // Is there a window function between the root and the // scan or join nodes? Also, does this window function // use the index? int numberWindowFunctions = 0; int numberReceiveNodes = 0; int numberHashAggregates = 0; // EE keeps the insertion ORDER so that ORDER BY could apply before DISTINCT. // However, this probably is not optimal if there are low cardinality results. // Again, we have to replace the TVEs for ORDER BY clause for these cases in planning. // // Find the scan or join node. AbstractPlanNode probe; for (probe = root; ! ((probe instanceof AbstractJoinPlanNode) || (probe instanceof AbstractScanPlanNode)) && (probe != null); probe = (probe.getChildCount() > 0) ? probe.getChild(0) : null) { // Count the number of window functions between the // root and the join/scan node. Note that we know we // have a statement level order by (SLOB) here. If the SLOB // can use the index for ordering the scan or join node, // we will have recorded it in the scan or join node. if (probe.getPlanNodeType() == PlanNodeType.WINDOWFUNCTION) { numberWindowFunctions += 1; } // Also, see if there are receive nodes. We need to // generate an ORDERBY node if there are RECEIVE nodes, // because the RECEIVE->MERGERECEIVE microoptimization // needs them. if (probe.getPlanNodeType() == PlanNodeType.RECEIVE) { numberReceiveNodes += 1; } // Finally, count the number of non-serial aggregate // nodes. A hash or partial aggregate operation invalidates // the ordering, but a serial aggregation does not. if ((probe.getPlanNodeType() == PlanNodeType.HASHAGGREGATE) || (probe.getPlanNodeType() == PlanNodeType.PARTIALAGGREGATE)) { numberHashAggregates += 1; } } if (probe == null) { // No idea what happened here. We can't find a // scan or join node at all. This seems unlikely // to be right. Maybe this should be an assert? return true; } // // o If the SLOB cannot use the index, then we // need an order by node always. // o If there are zero window functions, then // - If the SLOB cannot use the index then we // need an order by node. // - If the SLOB can use the index, then // = If the statement is a single fragment // statement then we don't need an order by // node. // = If the statement is a two fragment // statement then we need an order by node. // This is because we will convert the RECEIVE // node into a MERGERECEIVE node in the // microoptimizer, and the MERGERECEIVE // node needs an inline order by node to do // the merge. // o If there is only one window function, then // - If the window function does not use the index // then we always need an order by node.
// - If the window function can use the index but // the SLOB can't use the index, then we need an // order by node. // - If both the SLOB and the window function can // use the index, then we don't need an order // by, no matter how many fragments this statement // has. This is because any RECEIVE node will be // a descendent of the window function node. So // the RECEIVE to MERGERECEIVE conversion happens // in the window function and not the order by. // o If there is more than one window function then // we always need an order by node. The second // window function will invalidate the ordering of // the first one. (Actually, if the SLOB order is // compatible with the last window function then // the situation is like the one-window function // below.) // if ( ! (probe instanceof IndexSortablePlanNode)) { return true; } IndexUseForOrderBy indexUse = ((IndexSortablePlanNode)probe).indexUse(); if (indexUse.getSortOrderFromIndexScan() == SortDirectionType.INVALID) { return true; } // Hash aggregates and partial aggregates // invalidate the index ordering. So, we will need // an ORDERBY node. if (numberHashAggregates > 0) { return true; } if ( numberWindowFunctions == 0 ) { if ( indexUse.getWindowFunctionUsesIndex() == SubPlanAssembler.NO_INDEX_USE ) { return true; } assert( indexUse.getWindowFunctionUsesIndex() == SubPlanAssembler.STATEMENT_LEVEL_ORDER_BY_INDEX ); // Return true for MP (numberReceiveNodes > 0) and // false for SP (numberReceiveNodes == 0); return numberReceiveNodes > 0; } if (numberWindowFunctions == 1) { // If the WF uses the index then getWindowFunctionUsesIndex() // will return 0. if ( ( indexUse.getWindowFunctionUsesIndex() != 0 ) || ( ! indexUse.isWindowFunctionCompatibleWithOrderBy() ) ) { return true; } // Both the WF and the SLOB can use the index. Since the // window function will have the order by node, the SLOB // does not need one. So this is a false. return false; } // This can actually never happen now, // because we only support one window function. return true; } /** * Create an order by node as required by the statement and make it a parent of root. * @param parsedStmt Parsed statement, for context * @param root The root of the plan needing ordering * @return new orderByNode (the new root) or the original root if no orderByNode was required. */ private static AbstractPlanNode handleOrderBy(AbstractParsedStmt parsedStmt, AbstractPlanNode root) { assert (parsedStmt instanceof ParsedSelectStmt || parsedStmt instanceof ParsedUnionStmt || parsedStmt instanceof ParsedDeleteStmt); if (! isOrderByNodeRequired(parsedStmt, root)) { return root; } OrderByPlanNode orderByNode = buildOrderByPlanNode(parsedStmt.orderByColumns()); orderByNode.addAndLinkChild(root); return orderByNode; } /** * Add a limit, pushed-down if possible, and return the new root. * @param root top of the original plan * @return new plan's root node */ private AbstractPlanNode handleSelectLimitOperator(AbstractPlanNode root) { // The coordinator's top limit graph fragment for a MP plan. // If planning "order by ... limit", getNextSelectPlan() // will have already added an order by to the coordinator frag. // This is the only limit node in a SP plan LimitPlanNode topLimit = m_parsedSelect.getLimitNodeTop(); assert(topLimit != null); /* * TODO: allow push down limit with distinct (select distinct C from T limit 5) * , DISTINCT in aggregates and DISTINCT PUSH DOWN with partition column included. 
*/ AbstractPlanNode sendNode = null; // Whether or not we can push the limit node down boolean canPushDown = ! m_parsedSelect.hasDistinctWithGroupBy(); if (canPushDown) { sendNode = checkLimitPushDownViability(root); if (sendNode == null) { canPushDown = false; } else { canPushDown = m_parsedSelect.getCanPushdownLimit(); } } if (m_parsedSelect.m_mvFixInfo.needed()) { // Do not push down limit for mv based distributed query. canPushDown = false; } /* * Push down the limit plan node when possible even if offset is set. If * the plan is for a partitioned table, do the push down. Otherwise, * there is no need to do the push down work; the limit plan node will * be run in the partition. */ if (canPushDown) { /* * For partitioned table, the pushed-down limit plan node has a limit based * on the combined limit and offset, which may require an expression if either of these * was not a hard-coded constant and didn't get parameterized. * The top level limit plan node remains the same, with the original limit and offset values. */ LimitPlanNode distLimit = m_parsedSelect.getLimitNodeDist(); // Disconnect the distributed parts of the plan below the SEND node AbstractPlanNode distributedPlan = sendNode.getChild(0); distributedPlan.clearParents(); sendNode.clearChildren(); // If the distributed limit must be performed on ordered input, // ensure the order of the data on each partition. if (m_parsedSelect.hasOrderByColumns()) { distributedPlan = handleOrderBy(m_parsedSelect, distributedPlan); } if (isInlineLimitPlanNodePossible(distributedPlan)) { // Inline the distributed limit. distributedPlan.addInlinePlanNode(distLimit); sendNode.addAndLinkChild(distributedPlan); } else { distLimit.addAndLinkChild(distributedPlan); // Add the distributed work back to the plan sendNode.addAndLinkChild(distLimit); } } // In future, inline LIMIT for join, Receive. Then we do not need // to distinguish the order by node. return inlineLimitOperator(root, topLimit); } /** * Add a limit, and return the new root. * @param root top of the original plan * @return new plan's root node */ private AbstractPlanNode handleUnionLimitOperator(AbstractPlanNode root) { // The coordinator's top limit graph fragment for a MP plan. // If planning "order by ... limit", getNextUnionPlan() // will have already added an order by to the coordinator frag. // This is the only limit node in a SP plan LimitPlanNode topLimit = m_parsedUnion.getLimitNodeTop(); assert(topLimit != null); return inlineLimitOperator(root, topLimit); } /** * Inline Limit plan node if possible * @param root * @param topLimit * @return */ private AbstractPlanNode inlineLimitOperator(AbstractPlanNode root, LimitPlanNode topLimit) { if (isInlineLimitPlanNodePossible(root)) { root.addInlinePlanNode(topLimit); } else if (root instanceof ProjectionPlanNode && isInlineLimitPlanNodePossible(root.getChild(0)) ) { // In future, inline this projection node for OrderBy and Aggregate. // Then we could delete this ELSE IF block.
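// (Sketch of the shape handled by this branch: PROJECTION <- ORDERBY/AGG, // where the limit gets inlined one level below the projection. The final // else below falls back to a standalone LimitPlanNode above root.)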
root.getChild(0).addInlinePlanNode(topLimit); } else { topLimit.addAndLinkChild(root); root = topLimit; } return root; } /** * An inline limit plan node can be applied to an ORDER BY node * or a serial aggregation node * @param pn * @return */ static private boolean isInlineLimitPlanNodePossible(AbstractPlanNode pn) { if (pn instanceof OrderByPlanNode || pn.getPlanNodeType() == PlanNodeType.AGGREGATE) { return true; } return false; } private AbstractPlanNode handleMVBasedMultiPartQuery( HashAggregatePlanNode reAggNode, AbstractPlanNode root, boolean edgeCaseOuterJoin) { MaterializedViewFixInfo mvFixInfo = m_parsedSelect.m_mvFixInfo; AbstractPlanNode receiveNode = root; AbstractPlanNode reAggParent = null; // Find receive plan node and insert the constructed // re-aggregation plan node. if (root instanceof AbstractReceivePlanNode) { root = reAggNode; } else { List<AbstractPlanNode> recList = root.findAllNodesOfClass(AbstractReceivePlanNode.class); assert(recList.size() == 1); receiveNode = recList.get(0); reAggParent = receiveNode.getParent(0); boolean result = reAggParent.replaceChild(receiveNode, reAggNode); assert(result); } reAggNode.addAndLinkChild(receiveNode); reAggNode.m_isCoordinatingAggregator = true; assert(receiveNode instanceof ReceivePlanNode); AbstractPlanNode sendNode = receiveNode.getChild(0); assert(sendNode instanceof SendPlanNode); AbstractPlanNode sendNodeChild = sendNode.getChild(0); HashAggregatePlanNode reAggNodeForReplace = null; if (m_parsedSelect.m_tableList.size() > 1 && !edgeCaseOuterJoin) { reAggNodeForReplace = reAggNode; } boolean find = mvFixInfo.processScanNodeWithReAggNode(sendNode, reAggNodeForReplace); assert(find); // If it is a normal joined query, replace the node under the // receive node with materialized view scan node. if (m_parsedSelect.m_tableList.size() > 1 && !edgeCaseOuterJoin) { AbstractPlanNode joinNode = sendNodeChild; // No agg, limit pushed down at this point. assert(joinNode instanceof AbstractJoinPlanNode); // Fix the node after Re-aggregation node. joinNode.clearParents(); assert(mvFixInfo.m_scanNode != null); mvFixInfo.m_scanNode.clearParents(); // replace joinNode with MV scan node on each partition. sendNode.clearChildren(); sendNode.addAndLinkChild(mvFixInfo.m_scanNode); // If reAggNode has parent node before we put it under join node, // its parent will be the parent of the new join node. Update the root node. if (reAggParent != null) { reAggParent.replaceChild(reAggNode, joinNode); root = reAggParent; } else { root = joinNode; } } return root; } private static class IndexGroupByInfo { boolean m_multiPartition = false; List<Integer> m_coveredGroupByColumns; boolean m_canBeFullySerialized = false; AbstractPlanNode m_indexAccess = null; boolean isChangedToSerialAggregate() { return m_canBeFullySerialized && m_indexAccess != null; } boolean isChangedToPartialAggregate() { return !m_canBeFullySerialized && m_indexAccess != null; } boolean needHashAggregator(AbstractPlanNode root, ParsedSelectStmt parsedSelect) { // A hash is required to build up per-group aggregates in parallel, // as opposed to when there is only one aggregation over the entire table OR when the // per-group aggregates are being built serially from the ordered output // of an index scan. // Currently, an index scan only claims to have a sort direction when its output // matches the order demanded by the ORDER BY clause. if (! parsedSelect.isGrouped()) { return false; } if (isChangedToSerialAggregate() && !
m_multiPartition) { return false; } boolean predeterminedOrdering = false; if (root instanceof IndexScanPlanNode) { if (((IndexScanPlanNode)root).getSortDirection() != SortDirectionType.INVALID) { predeterminedOrdering = true; } } else if (root instanceof AbstractJoinPlanNode) { if (((AbstractJoinPlanNode)root).getSortDirection() != SortDirectionType.INVALID) { predeterminedOrdering = true; } } if (predeterminedOrdering) { // The ordering predetermined by indexed access is known // to cover (at least) the ORDER BY columns. // Yet, any additional non-ORDER-BY columns in the GROUP BY // clause will need partial aggregation. if (parsedSelect.groupByIsAnOrderByPermutation()) { return false; } } return true; } } private static AbstractPlanNode findSeqScanCandidateForGroupBy( AbstractPlanNode candidate) { if (candidate.getPlanNodeType() == PlanNodeType.SEQSCAN && ! candidate.isSubQuery()) { // scan on sub-query does not support index, early exit here // In future, support sub-query edge cases. return candidate; } // For join node, find outer sequential scan plan node if (candidate.getPlanNodeType() == PlanNodeType.NESTLOOP) { assert(candidate.getChildCount() == 2); return findSeqScanCandidateForGroupBy(candidate.getChild(0)); } if (candidate.getPlanNodeType() == PlanNodeType.NESTLOOPINDEX) { return findSeqScanCandidateForGroupBy(candidate.getChild(0)); } return null; } /** * For a seqscan feeding a GROUP BY, consider substituting an IndexScan * that pre-sorts by the GROUP BY keys. * If a candidate is already an indexscan, * simply calculate GROUP BY column coverage * * @param candidate * @param gbInfo * @return true when planner can switch to index scan * from a sequential scan, and when the index scan * has no parent plan node or the candidate is already * an indexscan and covers all or some GROUP BY columns */ private boolean switchToIndexScanForGroupBy(AbstractPlanNode candidate, IndexGroupByInfo gbInfo) { if (! m_parsedSelect.isGrouped()) { return false; } if (candidate instanceof IndexScanPlanNode) { calculateIndexGroupByInfo((IndexScanPlanNode) candidate, gbInfo); if (gbInfo.m_coveredGroupByColumns != null && !gbInfo.m_coveredGroupByColumns.isEmpty()) { // The candidate index does cover all or some // of the GROUP BY columns and can be serialized gbInfo.m_indexAccess = candidate; return true; } return false; } AbstractPlanNode sourceSeqScan = findSeqScanCandidateForGroupBy(candidate); if (sourceSeqScan == null) { return false; } assert(sourceSeqScan instanceof SeqScanPlanNode); AbstractPlanNode parent = null; if (sourceSeqScan.getParentCount() > 0) { parent = sourceSeqScan.getParent(0); } AbstractPlanNode indexAccess = indexAccessForGroupByExprs( (SeqScanPlanNode)sourceSeqScan, gbInfo); if (indexAccess.getPlanNodeType() != PlanNodeType.INDEXSCAN) { // does not find proper index to replace sequential scan return false; } gbInfo.m_indexAccess = indexAccess; if (parent != null) { // have a parent and would like to replace // the sequential scan with an index scan indexAccess.clearParents(); // For two children join node, index 0 is its outer side parent.replaceChild(0, indexAccess); return false; } // parent is null and switched to index scan from sequential scan return true; } /** * Create nodes for windowed operations. * * @param root * @return */ private AbstractPlanNode handleWindowedOperators(AbstractPlanNode root) { // Get the windowed expression. We need to set its output // schema from the display list. 
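// An illustrative statement that reaches this path (hypothetical schema): // SELECT a, RANK() OVER (PARTITION BY b ORDER BY c) FROM t; // Only one window function per statement is supported at this point, hence the get(0) below.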
WindowFunctionExpression winExpr = m_parsedSelect.getWindowFunctionExpressions().get(0); assert(winExpr != null); // This will set the output schema to contain the // windowed schema column only. In generateOutputSchema // we will add the input columns. WindowFunctionPlanNode pnode = new WindowFunctionPlanNode(); pnode.setWindowFunctionExpression(winExpr); // We always need an order by plan node, even if the sort // is optimized away by an index. This may be turned // into an inline order by in a MergeReceivePlanNode. IndexUseForOrderBy scanNode = findScanNodeForWindowFunction(root); AbstractPlanNode cnode = null; int winfunc = (scanNode == null) ? SubPlanAssembler.NO_INDEX_USE : scanNode.getWindowFunctionUsesIndex(); // If we have an index which is compatible with the statement // level order by, and we have a window function which can't // use the index, we have to ignore the statement level order by // index use. We will need to order the input according to the // window function first, and that will in general invalidate the // statement level order by ordering. if ((SubPlanAssembler.STATEMENT_LEVEL_ORDER_BY_INDEX == winfunc) || (SubPlanAssembler.NO_INDEX_USE == winfunc)) { // No index. Calculate the expression order here and stuff it into // the order by node. Note that if we support more than one window // function this would be the case when scanNode.getWindowFunctionUsesIndex() // returns a window function number which is different from the number // of winExpr. List<AbstractExpression> partitionByExpressions = winExpr.getPartitionByExpressions(); // If the order by expression list contains a partition by expression then // we won't have to sort by it twice. We sort by the partition by expressions // first, and we don't care what order we sort by them. So, find the // sort direction in the order by list and use that in the partition by // list, and then mark that it was deleted in the order by // list. // // We choose to make this dontsort rather than dosort because the // Java default value for boolean is false, and we want to sort by // default. boolean dontsort[] = new boolean[winExpr.getOrderbySize()]; List<AbstractExpression> orderByExpressions = winExpr.getOrderByExpressions(); List<SortDirectionType> orderByDirections = winExpr.getOrderByDirections(); OrderByPlanNode onode = new OrderByPlanNode(); for (int idx = 0; idx < winExpr.getPartitionbySize(); ++idx) { SortDirectionType pdir = SortDirectionType.ASC; AbstractExpression partitionByExpression = partitionByExpressions.get(idx); int sidx = winExpr.getSortIndexOfOrderByExpression(partitionByExpression); if (0 <= sidx) { pdir = orderByDirections.get(sidx); dontsort[sidx] = true; } onode.addSort(partitionByExpression, pdir); } for (int idx = 0; idx < winExpr.getOrderbySize(); ++idx) { if (!dontsort[idx]) { AbstractExpression orderByExpr = orderByExpressions.get(idx); SortDirectionType orderByDir = orderByDirections.get(idx); onode.addSort(orderByExpr, orderByDir); } } onode.addAndLinkChild(root); cnode = onode; } else { assert(scanNode != null); // This means the index is good for this window function. // If this is an MP statement we still need to generate the // order by node, because we may need to turn it into an // inline order by node of a MergeReceive node.
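// (The 0 checked below is the index of the one-and-only supported window // function; SubPlanAssembler.NO_INDEX_USE and // SubPlanAssembler.STATEMENT_LEVEL_ORDER_BY_INDEX are the sentinel values // handled in the branch above.)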
assert( 0 == scanNode.getWindowFunctionUsesIndex() ); if (m_partitioning.requiresTwoFragments()) { OrderByPlanNode onode = new OrderByPlanNode(); SortDirectionType dir = scanNode.getSortOrderFromIndexScan(); assert(dir != SortDirectionType.INVALID); // This was created when the index was determined. // We cached it in the scan node. List<AbstractExpression> orderExprs = scanNode.getFinalExpressionOrderFromIndexScan(); assert(orderExprs != null); for (AbstractExpression ae : orderExprs) { onode.addSort(ae, dir); } // Link in the OrderByNode. onode.addAndLinkChild(root); cnode = onode; } else { // Don't create and link in the order by node. cnode = root; } } pnode.addAndLinkChild(cnode); return pnode; } private IndexUseForOrderBy findScanNodeForWindowFunction(AbstractPlanNode root) { while (root != null) { if (root instanceof IndexSortablePlanNode) { return ((IndexSortablePlanNode) root).indexUse(); } // Any other kind of scan or join plan // node cannot have a useful index. if ((root instanceof AbstractScanPlanNode) || (root instanceof AbstractJoinPlanNode)) { return null; } if (root.getChildCount() == 0) { break; } root = root.getChild(0); } return null; } private AbstractPlanNode handleAggregationOperators(AbstractPlanNode root) { /* Check if any aggregate expressions are present */ /* * "Select A from T group by A" is grouped but has no aggregate operator * expressions. Catch that case by checking the grouped flag */ if (m_parsedSelect.hasAggregateOrGroupby()) { AggregatePlanNode aggNode = null; AggregatePlanNode topAggNode = null; // i.e., on the coordinator IndexGroupByInfo gbInfo = new IndexGroupByInfo(); if (root instanceof AbstractReceivePlanNode) { // do not apply index scan for serial/partial aggregation // for distinct that does not group by partition column if ( ! m_parsedSelect.hasAggregateDistinct() || m_parsedSelect.hasPartitionColumnInGroupby()) { AbstractPlanNode candidate = root.getChild(0).getChild(0); gbInfo.m_multiPartition = true; switchToIndexScanForGroupBy(candidate, gbInfo); } } else if (switchToIndexScanForGroupBy(root, gbInfo)) { root = gbInfo.m_indexAccess; } boolean needHashAgg = gbInfo.needHashAggregator(root, m_parsedSelect); // Construct the aggregate nodes if (needHashAgg) { if ( m_parsedSelect.m_mvFixInfo.needed() ) { // TODO: may optimize this edge case in future aggNode = new HashAggregatePlanNode(); } else { if (gbInfo.isChangedToSerialAggregate()) { assert(root instanceof ReceivePlanNode); aggNode = new AggregatePlanNode(); } else if (gbInfo.isChangedToPartialAggregate()) { aggNode = new PartialAggregatePlanNode(gbInfo.m_coveredGroupByColumns); } else { aggNode = new HashAggregatePlanNode(); } topAggNode = new HashAggregatePlanNode(); } } else { aggNode = new AggregatePlanNode(); if ( ! 
m_parsedSelect.m_mvFixInfo.needed()) { topAggNode = new AggregatePlanNode(); } } NodeSchema agg_schema = new NodeSchema(); NodeSchema top_agg_schema = new NodeSchema(); for ( int outputColumnIndex = 0; outputColumnIndex < m_parsedSelect.m_aggResultColumns.size(); outputColumnIndex += 1) { ParsedColInfo col = m_parsedSelect.m_aggResultColumns.get(outputColumnIndex); AbstractExpression rootExpr = col.expression; AbstractExpression agg_input_expr = null; SchemaColumn schema_col = null; SchemaColumn top_schema_col = null; if (rootExpr instanceof AggregateExpression) { ExpressionType agg_expression_type = rootExpr.getExpressionType(); agg_input_expr = rootExpr.getLeft(); // A bit of a hack: ProjectionNodes after the // aggregate node need the output columns here to // contain TupleValueExpressions (effectively on a temp table). // So we construct one based on the output of the // aggregate expression, the column alias provided by HSQL, // and the offset into the output table schema for the // aggregate node that we're computing. // Oh, oh, it's magic, you know.. TupleValueExpression tve = new TupleValueExpression( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "", col.alias, rootExpr, outputColumnIndex); tve.setDifferentiator(col.differentiator); boolean is_distinct = ((AggregateExpression)rootExpr).isDistinct(); aggNode.addAggregate(agg_expression_type, is_distinct, outputColumnIndex, agg_input_expr); schema_col = new SchemaColumn( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "", col.alias, tve, outputColumnIndex); top_schema_col = new SchemaColumn( AbstractParsedStmt.TEMP_TABLE_NAME, AbstractParsedStmt.TEMP_TABLE_NAME, "", col.alias, tve, outputColumnIndex); /* * Special case count(*), count(), sum(), min() and max() to * push them down to each partition. It will do the * push-down if the select columns only contain the listed * aggregate operators and other group-by columns. If the * select columns include any other aggregates, it will not * do the push-down. - nshi */ if (topAggNode != null) { ExpressionType top_expression_type = agg_expression_type; /* * For count(*), count() and sum(), the pushed-down * aggregate node doesn't change. An extra sum() * aggregate node is added to the coordinator to sum up * the numbers from all the partitions. The input schema * and the output schema of the sum() aggregate node are * the same as the output schema of the push-down * aggregate node. * * If DISTINCT is specified, don't do push-down for * count() and sum() when not grouping by the partition column. * An exception is when the aggregation arguments are the * partition column (ENG-4980). */ if (agg_expression_type == ExpressionType.AGGREGATE_COUNT_STAR || agg_expression_type == ExpressionType.AGGREGATE_COUNT || agg_expression_type == ExpressionType.AGGREGATE_SUM) { if (is_distinct && ! (m_parsedSelect.hasPartitionColumnInGroupby() || canPushDownDistinctAggregation((AggregateExpression)rootExpr) ) ) { topAggNode = null; } else { // for aggregate distinct when group by // partition column, the top aggregate node // will be dropped later, thus there is no // effect to assign the top_expression_type. top_expression_type = ExpressionType.AGGREGATE_SUM; } } /* * For min() and max(), the pushed-down aggregate node * doesn't change. An extra aggregate node of the same * type is added to the coordinator. The input schema * and the output schema of the top aggregate node are * the same as the output schema of the pushed-down * aggregate node.
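* For example (illustrative): MAX(c) is pushed down as a per-partition * MAX with a coordinator MAX over the partial results, whereas AVG has * no such single-function split and disables the top node below.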
* * APPROX_COUNT_DISTINCT can be similarly pushed down, but * must be split into two different functions, which is * done later, from pushDownAggregate(). */ else if (agg_expression_type != ExpressionType.AGGREGATE_MIN && agg_expression_type != ExpressionType.AGGREGATE_MAX && agg_expression_type != ExpressionType.AGGREGATE_APPROX_COUNT_DISTINCT) { /* * Unsupported aggregate for push-down (AVG for example). */ topAggNode = null; } if (topAggNode != null) { /* * Input column of the top aggregate node is the * output column of the push-down aggregate node */ boolean topDistinctFalse = false; topAggNode.addAggregate(top_expression_type, topDistinctFalse, outputColumnIndex, tve); } }// end if we have a top agg node } else { // All complex aggregations have been simplified; // cases like "MAX(counter)+1" or "MAX(col)/MIN(col)" // have already been broken down. assert( ! rootExpr.hasAnySubexpressionOfClass(AggregateExpression.class)); /* * These columns are the pass through columns that are not being * aggregated on. These are the ones from the SELECT list. They * MUST already exist in the child node's output. Find them and * add them to the aggregate's output. */ schema_col = new SchemaColumn( col.tableName, col.tableAlias, col.columnName, col.alias, col.expression, outputColumnIndex); AbstractExpression topExpr = null; if (col.groupBy) { topExpr = m_parsedSelect.m_groupByExpressions.get(col.alias); } else { topExpr = col.expression; } top_schema_col = new SchemaColumn( col.tableName, col.tableAlias, col.columnName, col.alias, topExpr, outputColumnIndex); } agg_schema.addColumn(schema_col); top_agg_schema.addColumn(top_schema_col); }// end for each ParsedColInfo in m_aggResultColumns for (ParsedColInfo col : m_parsedSelect.groupByColumns()) { aggNode.addGroupByExpression(col.expression); if (topAggNode != null) { topAggNode.addGroupByExpression(m_parsedSelect.m_groupByExpressions.get(col.alias)); } } aggNode.setOutputSchema(agg_schema); if (topAggNode != null) { if (m_parsedSelect.hasComplexGroupby()) { topAggNode.setOutputSchema(top_agg_schema); } else { topAggNode.setOutputSchema(agg_schema); } } // Never push down aggregation for MV fix case. root = pushDownAggregate(root, aggNode, topAggNode, m_parsedSelect); } return handleDistinctWithGroupby(root); } // Sets IndexGroupByInfo for an IndexScan private void calculateIndexGroupByInfo(IndexScanPlanNode root, IndexGroupByInfo gbInfo) { String fromTableAlias = root.getTargetTableAlias(); assert(fromTableAlias != null); Index index = root.getCatalogIndex(); if ( !
IndexType.isScannable(index.getType())) { return; } ArrayList<AbstractExpression> bindings = new ArrayList<>(); gbInfo.m_coveredGroupByColumns = calculateGroupbyColumnsCovered( index, fromTableAlias, bindings); gbInfo.m_canBeFullySerialized = (gbInfo.m_coveredGroupByColumns.size() == m_parsedSelect.groupByColumns().size()); } // Turn sequential scan to index scan for group by if possible private AbstractPlanNode indexAccessForGroupByExprs(SeqScanPlanNode root, IndexGroupByInfo gbInfo) { if (root.isSubQuery()) { // sub-query edge case will not be handled now return root; } String fromTableAlias = root.getTargetTableAlias(); assert(fromTableAlias != null); List<ParsedColInfo> groupBys = m_parsedSelect.groupByColumns(); Table targetTable = m_catalogDb.getTables().get(root.getTargetTableName()); assert(targetTable != null); CatalogMap<Index> allIndexes = targetTable.getIndexes(); List<Integer> maxCoveredGroupByColumns = new ArrayList<>(); ArrayList<AbstractExpression> maxCoveredBindings = null; Index pickedUpIndex = null; boolean foundAllGroupByCoveredIndex = false; for (Index index : allIndexes) { if ( ! IndexType.isScannable(index.getType())) { continue; } if ( ! index.getPredicatejson().isEmpty()) { // do not try to look at Partial/Sparse index continue; } ArrayList<AbstractExpression> bindings = new ArrayList<>(); List<Integer> coveredGroupByColumns = calculateGroupbyColumnsCovered( index, fromTableAlias, bindings); if (coveredGroupByColumns.size() > maxCoveredGroupByColumns.size()) { maxCoveredGroupByColumns = coveredGroupByColumns; pickedUpIndex = index; maxCoveredBindings = bindings; if (maxCoveredGroupByColumns.size() == groupBys.size()) { foundAllGroupByCoveredIndex = true; break; } } } if (pickedUpIndex == null) { return root; } IndexScanPlanNode indexScanNode = new IndexScanPlanNode( root, null, pickedUpIndex, SortDirectionType.INVALID); indexScanNode.setForGroupingOnly(); indexScanNode.setBindings(maxCoveredBindings); gbInfo.m_coveredGroupByColumns = maxCoveredGroupByColumns; gbInfo.m_canBeFullySerialized = foundAllGroupByCoveredIndex; return indexScanNode; } private List<Integer> calculateGroupbyColumnsCovered(Index index, String fromTableAlias, List<AbstractExpression> bindings) { List<Integer> coveredGroupByColumns = new ArrayList<>(); List<ParsedColInfo> groupBys = m_parsedSelect.groupByColumns(); String exprsjson = index.getExpressionsjson(); if (exprsjson.isEmpty()) { List<ColumnRef> indexedColRefs = CatalogUtil.getSortedCatalogItems(index.getColumns(), "index"); for (int j = 0; j < indexedColRefs.size(); j++) { String indexColumnName = indexedColRefs.get(j).getColumn().getName(); // ignore order of keys in GROUP BY expr int ithCovered = 0; boolean foundPrefixedColumn = false; for (; ithCovered < groupBys.size(); ithCovered++) { AbstractExpression gbExpr = groupBys.get(ithCovered).expression; if ( ! (gbExpr instanceof TupleValueExpression)) { continue; } TupleValueExpression gbTVE = (TupleValueExpression) gbExpr; // TVE column index has not been resolved currently if (fromTableAlias.equals(gbTVE.getTableAlias()) && indexColumnName.equals(gbTVE.getColumnName())) { foundPrefixedColumn = true; break; } } if ( ! 
foundPrefixedColumn) { // no prefix match any more break; } coveredGroupByColumns.add(ithCovered); if (coveredGroupByColumns.size() == groupBys.size()) { // covered all group by columns already break; } } } else { StmtTableScan fromTableScan = m_parsedSelect.getStmtTableScanByAlias(fromTableAlias); // either pure expression index or mix of expressions and simple columns List<AbstractExpression> indexedExprs = null; try { indexedExprs = AbstractExpression.fromJSONArrayString(exprsjson, fromTableScan); } catch (JSONException e) { e.printStackTrace(); // This case sounds impossible return coveredGroupByColumns; } for (AbstractExpression indexExpr : indexedExprs) { // ignore order of keys in GROUP BY expr List<AbstractExpression> binding = null; for (int ithCovered = 0; ithCovered < groupBys.size(); ithCovered++) { AbstractExpression gbExpr = groupBys.get(ithCovered).expression; binding = gbExpr.bindingToIndexedExpression(indexExpr); if (binding != null) { bindings.addAll(binding); coveredGroupByColumns.add(ithCovered); break; } } // no prefix match any more or covered all group by columns already if (binding == null || coveredGroupByColumns.size() == groupBys.size()) { break; } } } return coveredGroupByColumns; } /** * This function is called once it's been determined that we can push down * an aggregation plan node. * * If an APPROX_COUNT_DISTINCT aggregate is distributed, then we need to * convert the distributed aggregate function to VALS_TO_HYPERLOGLOG, * and the coordinating aggregate function to HYPERLOGLOGS_TO_CARD. * * @param distNode The aggregate node executed on each partition * @param coordNode The aggregate node executed on the coordinator */ private static void fixDistributedApproxCountDistinct( AggregatePlanNode distNode, AggregatePlanNode coordNode) { assert (distNode != null); assert (coordNode != null); // Patch up any APPROX_COUNT_DISTINCT on the distributed node. List<ExpressionType> distAggTypes = distNode.getAggregateTypes(); boolean hasApproxCountDistinct = false; for (int i = 0; i < distAggTypes.size(); ++i) { ExpressionType et = distAggTypes.get(i); if (et == ExpressionType.AGGREGATE_APPROX_COUNT_DISTINCT) { hasApproxCountDistinct = true; distNode.updateAggregate(i, ExpressionType.AGGREGATE_VALS_TO_HYPERLOGLOG); } } if (hasApproxCountDistinct) { // Now, patch up any APPROX_COUNT_DISTINCT on the coordinating node. List<ExpressionType> coordAggTypes = coordNode.getAggregateTypes(); for (int i = 0; i < coordAggTypes.size(); ++i) { ExpressionType et = coordAggTypes.get(i); if (et == ExpressionType.AGGREGATE_APPROX_COUNT_DISTINCT) { coordNode.updateAggregate(i, ExpressionType.AGGREGATE_HYPERLOGLOGS_TO_CARD); } } } } /** * Push the given aggregate if the plan is distributed, then add the * coordinator node on top of the send/receive pair. If the plan * is not distributed, or coordNode is not provided, the distNode * is added at the top of the plan. * * Note: this works in part because the push-down node is also an acceptable * top level node if the plan is not distributed. This wouldn't be true * if we started pushing down something like (sum, count) to calculate * a distributed average. (We already do something like this for * APPROX_COUNT_DISTINCT, which must be split into two different functions * for the pushed-down case.) * * @param root * The root node * @param distNode * The node to push down * @param coordNode [may be null] * The top node to put on top of the send/receive pair after * push-down. If this is null, no push-down will be performed. 
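* For example (sketch): a pushed-down SUM over a partitioned table * becomes SUM(coordinator) <- RECEIVE <- SEND <- SUM(partition) <- scan, * where the coordinator sums the per-partition partial sums.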
* @return The new root node. */ private static AbstractPlanNode pushDownAggregate(AbstractPlanNode root, AggregatePlanNode distNode, AggregatePlanNode coordNode, ParsedSelectStmt selectStmt) { AggregatePlanNode rootAggNode; // remember that coordinating aggregation has a pushed-down // counterpart deeper in the plan. this allows other operators // to be pushed down past the receive as well. if (coordNode != null) { coordNode.m_isCoordinatingAggregator = true; } /* * Push this node down to partition if it's distributed. First remove * the send/receive pair, add the node, then put the send/receive pair * back on top of the node, followed by another top node at the * coordinator. */ if (coordNode != null && root instanceof ReceivePlanNode) { AbstractPlanNode accessPlanTemp = root; root = accessPlanTemp.getChild(0).getChild(0); root.clearParents(); accessPlanTemp.getChild(0).clearChildren(); distNode.addAndLinkChild(root); if (selectStmt.hasPartitionColumnInGroupby()) { // Set post predicate for final distributed Aggregation node distNode.setPostPredicate(selectStmt.getHavingPredicate()); // Edge case: GROUP BY clause contains the partition column // No related GROUP BY or even Re-agg will apply on coordinator // Projection plan node can just be pushed down also except for // a very edge ORDER BY case. if (selectStmt.isComplexOrderBy()) { // Put the send/receive pair back into place accessPlanTemp.getChild(0).addAndLinkChild(distNode); root = processComplexAggProjectionNode(selectStmt, accessPlanTemp); return root; } root = processComplexAggProjectionNode(selectStmt, distNode); // Put the send/receive pair back into place accessPlanTemp.getChild(0).addAndLinkChild(root); return accessPlanTemp; } // Without including partition column in GROUP BY clause, // there has to be a top GROUP BY plan node on coordinator. // // Now that we're certain the aggregate will be pushed down // (no turning back now!), fix any APPROX_COUNT_DISTINCT aggregates. fixDistributedApproxCountDistinct(distNode, coordNode); // Put the send/receive pair back into place accessPlanTemp.getChild(0).addAndLinkChild(distNode); // Add the top node coordNode.addAndLinkChild(accessPlanTemp); rootAggNode = coordNode; } else { distNode.addAndLinkChild(root); rootAggNode = distNode; } // Set post predicate for final Aggregation node. rootAggNode.setPostPredicate(selectStmt.getHavingPredicate()); root = processComplexAggProjectionNode(selectStmt, rootAggNode); return root; } private static AbstractPlanNode processComplexAggProjectionNode( ParsedSelectStmt selectStmt, AbstractPlanNode root) { if ( ! selectStmt.hasComplexAgg()) { return root; } ProjectionPlanNode proj = new ProjectionPlanNode(selectStmt.getFinalProjectionSchema()); proj.addAndLinkChild(root); return proj; } /** * Check if we can push the limit node down. * * Return a mid-plan send node, if one exists and can host a * distributed limit node. * There is guaranteed to be at most a single receive/send pair. * Abort the search if a node that a "limit" can't be pushed past * is found before its receive node. * * Can only push past: * * coordinatingAggregator: a distributed aggregator * a copy of which has already been pushed down. * Distributing a LIMIT to just above that aggregator is correct. * (I've got some doubts that this is correct??? --paul) * * * order by: if the plan requires a sort, getNextSelectPlan() * will have already added an ORDER BY. * A distributed LIMIT will be added above a copy * of that ORDER BY node. 
* * projection: these have no effect on the application of limits. * * @param root * @return If we can push the limit down, the send plan node is returned. * Otherwise null -- when the plan is single-partition or when * its "coordinator" part contains a push-blocking node type. */ protected AbstractPlanNode checkLimitPushDownViability( AbstractPlanNode root) { AbstractPlanNode receiveNode = root; List<ParsedColInfo> orderBys = m_parsedSelect.orderByColumns(); boolean orderByCoversAllGroupBy = m_parsedSelect.groupByIsAnOrderByPermutation(); while ( ! (receiveNode instanceof ReceivePlanNode)) { // Limitation: can only push past some nodes (see above comment) // Delete the aggregate node case to handle ENG-6485; // in other words, we don't push down when meeting an aggregate node // TODO: We might want to optimize/push down "limit" for some cases if ( ! (receiveNode instanceof OrderByPlanNode) && ! (receiveNode instanceof ProjectionPlanNode) && ! isValidAggregateNodeForLimitPushdown(receiveNode, orderBys, orderByCoversAllGroupBy) ) { return null; } if (receiveNode instanceof OrderByPlanNode) { // if grouping by the partition key, // limit can still push down if ordered by aggregate values. if (! m_parsedSelect.hasPartitionColumnInGroupby() && isOrderByAggregationValue(m_parsedSelect.orderByColumns())) { return null; } } // Traverse... if (receiveNode.getChildCount() == 0) { return null; } // nothing that allows pushing past has multiple inputs assert(receiveNode.getChildCount() == 1); receiveNode = receiveNode.getChild(0); } return receiveNode.getChild(0); } private static boolean isOrderByAggregationValue(List<ParsedColInfo> orderBys) { for (ParsedColInfo col : orderBys) { AbstractExpression rootExpr = col.expression; // Fix ENG-3487: can't usually push down limits // when results are ordered by aggregate values. for (AbstractExpression tve : rootExpr.findAllTupleValueSubexpressions()) { if (((TupleValueExpression) tve).hasAggregate()) { return true; } } } return false; } private static boolean isValidAggregateNodeForLimitPushdown( AbstractPlanNode aggregateNode, List<ParsedColInfo> orderBys, boolean orderByCoversAllGroupBy) { if (aggregateNode instanceof AggregatePlanNode == false) { return false; } if (aggregateNode.getParentCount() == 0) { return false; } // Limitation: can only push past coordinating aggregation nodes if ( ! ((AggregatePlanNode)aggregateNode).m_isCoordinatingAggregator) { return false; } AbstractPlanNode parent = aggregateNode.getParent(0); AbstractPlanNode orderByNode = null; if (parent instanceof OrderByPlanNode) { orderByNode = parent; } else if ( parent instanceof ProjectionPlanNode && parent.getParentCount() > 0 && parent.getParent(0) instanceof OrderByPlanNode) { // Xin really wants inline project with aggregation orderByNode = parent.getParent(0); } if (orderByNode == null) { // When an aggregate without order by and group by columns // does not contain the partition column, // the limit should not be pushed down. return false; } if (( ! orderByCoversAllGroupBy) || isOrderByAggregationValue(orderBys)) { return false; } return true; } /** * Handle DISTINCT with GROUP BY if it is not redundant with the * aggregation/grouping. * DISTINCT is basically rewritten with GROUP BY to benefit from * all kinds of GROUP BY optimizations. * Trivial case DISTINCT in a statement with no GROUP BY has been * rewritten very early at query parsing time.
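* (For example, "SELECT DISTINCT a FROM t" is simply treated as * "SELECT a FROM t GROUP BY a" by that early rewrite.)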
* In the non-trivial case, where an existing GROUP BY column is NOT * in the select list, DISTINCT can be implemented via a final aggregation * (never pushed down) added to the top of the plan. * @param root can be an aggregate plan node or projection plan node * @return */ private AbstractPlanNode handleDistinctWithGroupby(AbstractPlanNode root) { if (! m_parsedSelect.hasDistinctWithGroupBy()) { return root; } assert(m_parsedSelect.isGrouped()); // DISTINCT is redundant with GROUP BY IFF // all of the grouping columns are present in the display columns. if (m_parsedSelect.displayColumnsContainAllGroupByColumns()) { return root; } // The non-complex aggregation cases have already been handled by now assert(m_parsedSelect.hasComplexAgg()); AggregatePlanNode distinctAggNode = new HashAggregatePlanNode(); distinctAggNode.setOutputSchema(m_parsedSelect.getDistinctProjectionSchema()); for (ParsedColInfo col : m_parsedSelect.distinctGroupByColumns()) { distinctAggNode.addGroupByExpression(col.expression); } // TODO(xin): push down the DISTINCT for certain cases // Ticket: ENG-7360 /* boolean pushedDown = false; boolean canPushdownDistinctAgg = m_parsedSelect.hasPartitionColumnInDistinctGroupby(); // // disable pushdown, DISTINCT push down turns out complex // canPushdownDistinctAgg = false; if (canPushdownDistinctAgg && !m_parsedSelect.m_mvFixInfo.needed()) { assert(m_parsedSelect.hasPartitionColumnInGroupby()); AbstractPlanNode receive = root; if (receive instanceof ReceivePlanNode) { // Temporarily strip send/receive pair AbstractPlanNode distNode = receive.getChild(0).getChild(0); receive.getChild(0).unlinkChild(distNode); distinctAggNode.addAndLinkChild(distNode); receive.getChild(0).addAndLinkChild(distinctAggNode); pushedDown = true; } }*/ distinctAggNode.addAndLinkChild(root); root = distinctAggNode; return root; } /** * Get the unique set of names of all columns that are part of an index on * the given table. * * @param table * The table to build the list of index-affected columns with. * @return The set of column names affected by indexes with duplicates * removed. */ private static Set<String> getIndexedColumnSetForTable(Table table) { HashSet<String> columns = new HashSet<>(); for (Index index : table.getIndexes()) { for (ColumnRef colRef : index.getColumns()) { columns.add(colRef.getColumn().getTypeName()); } } return columns; } String getErrorMessage() { return m_recentErrorMsg; } /** * Outer join simplification using null rejection. * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.43.2531 * Outerjoin Simplification and Reordering for Query Optimization * by Cesar A. Galindo-Legaria, Arnon Rosenthal * Algorithm: * Traverse the join tree top-down: * For each join node n1 do: * For each expression expr (join and where) at the node n1 * For each join node n2 descended from n1 do: * If expr rejects nulls introduced by n2 inner table, then * - convert LEFT OUTER n2 to an INNER join. * - convert FULL OUTER n2 to RIGHT OUTER join * If expr rejects nulls introduced by n2 outer table, then * - convert RIGHT OUTER n2 to an INNER join.
* - convert FULL OUTER n2 to LEFT OUTER join */ private static void simplifyOuterJoin(BranchNode joinTree) { assert(joinTree != null); List<AbstractExpression> exprs = new ArrayList<>(); JoinNode leftNode = joinTree.getLeftNode(); JoinNode rightNode = joinTree.getRightNode(); // For the top level node only, // WHERE expressions need to be evaluated for NULL-rejection if (leftNode.getWhereExpression() != null) { exprs.add(leftNode.getWhereExpression()); } if (rightNode.getWhereExpression() != null) { exprs.add(rightNode.getWhereExpression()); } simplifyOuterJoinRecursively(joinTree, exprs); } private static void simplifyOuterJoinRecursively(BranchNode joinNode, List<AbstractExpression> exprs) { assert (joinNode != null); JoinNode leftNode = joinNode.getLeftNode(); JoinNode rightNode = joinNode.getRightNode(); if (joinNode.getJoinType() == JoinType.LEFT) { // Get all the inner tables underneath this node and // see if the expression is NULL-rejecting for any of them if (isNullRejecting(rightNode.generateTableJoinOrder(), exprs)) { joinNode.setJoinType(JoinType.INNER); } } else if (joinNode.getJoinType() == JoinType.RIGHT) { // Get all the outer tables underneath this node and // see if the expression is NULL-rejecting for any of them if (isNullRejecting(leftNode.generateTableJoinOrder(), exprs)) { joinNode.setJoinType(JoinType.INNER); } } else if (joinNode.getJoinType() == JoinType.FULL) { // Get all the outer tables underneath this node and // see if the expression is NULL-rejecting for any of them if (isNullRejecting(leftNode.generateTableJoinOrder(), exprs)) { joinNode.setJoinType(JoinType.LEFT); } // Get all the inner tables underneath this node and // see if the expression is NULL-rejecting for any of them if (isNullRejecting(rightNode.generateTableJoinOrder(), exprs)) { if (JoinType.FULL == joinNode.getJoinType()) { joinNode.setJoinType(JoinType.RIGHT); } else { // LEFT join was just removed joinNode.setJoinType(JoinType.INNER); } } } // Now add this node expression to the list and descend. // The WHERE expressions can be combined with the input list // because they simplify both inner and outer nodes. if (leftNode.getWhereExpression() != null) { exprs.add(leftNode.getWhereExpression()); } if (rightNode.getWhereExpression() != null) { exprs.add(rightNode.getWhereExpression()); } // The JOIN expressions (ON) are only applicable // to the INNER node of an outer join. 
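// Illustrative example: for "t1 LEFT JOIN t2 ON t1.x = t2.x", the ON // expression may be used to simplify joins under t2 (the inner side), // but not under t1 (the outer side), since ON conditions never filter // outer-side rows.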
List<AbstractExpression> exprsForInnerNode = new ArrayList<>(exprs); if (leftNode.getJoinExpression() != null) { exprsForInnerNode.add(leftNode.getJoinExpression()); } if (rightNode.getJoinExpression() != null) { exprsForInnerNode.add(rightNode.getJoinExpression()); } List<AbstractExpression> leftNodeExprs; List<AbstractExpression> rightNodeExprs; switch (joinNode.getJoinType()) { case INNER: leftNodeExprs = exprsForInnerNode; rightNodeExprs = exprsForInnerNode; break; case LEFT: leftNodeExprs = exprs; rightNodeExprs = exprsForInnerNode; break; case RIGHT: leftNodeExprs = exprsForInnerNode; rightNodeExprs = exprs; break; case FULL: leftNodeExprs = exprs; rightNodeExprs = exprs; break; default: // shouldn't get here leftNodeExprs = null; rightNodeExprs = null; assert(false); } if (leftNode instanceof BranchNode) { simplifyOuterJoinRecursively((BranchNode)leftNode, leftNodeExprs); } if (rightNode instanceof BranchNode) { simplifyOuterJoinRecursively((BranchNode)rightNode, rightNodeExprs); } } /** * Verify if an expression from the input list is NULL-rejecting * for any of the tables from the list * @param tableAliases list of tables * @param exprs list of expressions * @return TRUE if there is a NULL-rejecting expression */ private static boolean isNullRejecting(Collection<String> tableAliases, List<AbstractExpression> exprs) { for (AbstractExpression expr : exprs) { for (String tableAlias : tableAliases) { if (ExpressionUtil.isNullRejectingExpression(expr, tableAlias)) { // We are done at this level return true; } } } return false; } }