StatementPartitioning.java example

Explorer
voltdb-master
/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.planner;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.voltdb.VoltType;
import org.voltdb.catalog.Column;
import org.voltdb.expressions.AbstractExpression;
import org.voltdb.expressions.ConstantValueExpression;
import org.voltdb.expressions.ParameterValueExpression;
import org.voltdb.expressions.TupleValueExpression;
import org.voltdb.planner.parseinfo.StmtSubqueryScan;
import org.voltdb.planner.parseinfo.StmtTableScan;
import org.voltdb.plannodes.AbstractReceivePlanNode;
import org.voltdb.plannodes.SchemaColumn;

/**
 * Represents the partitioning of the data underlying a statement.
 * In the simplest case, this is pre-determined by the single-partition context of the statement
 * from a stored procedure annotation or a single-statement procedure attribute.
 * In the more interesting ad hoc case, a user can specify that a statement be run on all partitions,
 * but the semantics of the statement may indicate that the same result could be produced more optimally
 * by running it on a single partition selected based on the hash of some partition key value,
 * whether a statement parameter or a constant in the text of the statement.
 * These cases arise both in queries and in (partitioned table) DML.
 * As a multi-partition statement is analyzed in the planner, this object is filled in with details
 * regarding its suitability for running correctly on a single partition.
 *
 * For a multi-fragment plan that contains a join,
 * is it better to send partitioned tuples and join them on the coordinator
 * or is it better to join them before sending?
 * If bandwidth (or capacity of the receiving temp table) were the primary concern,
 * a decision could be based on
 * A) how much wider the joined rows are than the pre-joined rows.
 * B) the expected yield of the join filtering -- does each pre-joined row typically
 *    match and get joined with multiple partner rows or does it typically fail to match
 *    any row.
 * The statistics required to determine "B" are not generally available.
 * In any case, there are two over-arching concerns.
 * One is the correct handling of a special case
 * -- a join of partitioned tables on their partition keys.
 * In this case, the join MUST happen on each partition prior to sending any tuples.
 * This restriction stems directly from the limitation that there can only be two fragments in a plan,
 * and that a fragment produces a single (intermediate or final) result table.
 * The "coordinator" receives the (one) intermediate result table and produces
 * the final result table. It can not receive tuples from two different partitioned tables.
 * The second over-arching consideration is that there is an optimization available to the
 * transaction processor for the special case in which a coordinator fragment does not need to
 * access any persistent local data (I learned this second hand from Izzy. --paul).
 * This provides further motivation to do all scanning and joining in the collector fragment
 * prior to sending tuples.
 *
 * These two considerations normally override all others,
 * so that all multi-partition plans only "send after all joins", regardless of bandwidth/capacity
 * considerations, but there remain some edge cases in which the decision MUST go the other way,
 * that is, sending tuples prior to joining on the coordinator.
 * This occurs for some OUTER JOINS between a replicated OUTER table and a partitioned INNER table as in:
 *
 * SELECT * FROM replicated R LEFT JOIN partitioned P ON ...;
 *
 * See the comment in SelectSubPlanAssembler.getSelectSubPlanForJoin
 */
public class StatementPartitioning implements Cloneable {
    /**
     * This value is only meaningful if m_inferPartitioning is false.
     * It can be set true to force single-partition statement planning and
     * to forbid single-partition planning/execution of replicated table DML.
     * Since that would corrupt the replication, it is flagged as an error.
     * Otherwise, no attempt is made to validate that a single partition statement would
     * have the same result as the same query run on all partitions.
     * It is up to the user to decide whether that is an issue.
     * It can be set to false to force multi-partition statement planning.
     * This MAY involve sub-optimal dispatch of fragments to partitions with no matching data.
     * Currently, even inserts into partitioned tables are allowed to successfully execute
     * on "wrong" partitions, but they are prevented at the lowest level from taking effect there.
     */
    private final boolean m_forceSP;

    /**
     * Enables inference of single partitioning from statement.
     */
    private final boolean m_inferPartitioning;

    /*
     * For partitioned table DML, caches the partitioning column for later matching with its prospective value.
     * If that value is constant or a parameter, SP is an option.
     * Only recorded when partitioning is being inferred (see setPartitioningColumnForDML).
     */
    private Column m_partitionColForDML; // Not used in SELECT plans.

    /*
     * For a multi-partition statement that can definitely be run SP, this is a constant partitioning key value
     * inferred from the analysis (suitable for hashinating).
     * If null, SP may not be safe, or the partitioning may be based on something less obvious like a parameter or constant expression.
     */
    private Object m_inferredValue = null;

    /*
     * Index of the statement parameter found to equality-filter a partitioning column,
     * or -1 if no such parameter has been recorded.
     */
    private int m_inferredParameterIndex = -1;

    /*
     * Any constant/parameter-based expressions found to be equality-filtering partitioning columns.
     */
    private final Set<AbstractExpression> m_inferredExpression = new HashSet<AbstractExpression>();

    /*
     * The actual number of partitioned table scans in the query (when supported, self-joins should count as multiple).
     * -1 until analyzeTablePartitioning has run.
     */
    private int m_countOfPartitionedTables = -1;

    /*
     * The number of independently partitioned table scans in the query. This is initially the same as
     * m_countOfPartitionedTables, but gets reduced by 1 each time a partitioned table (scan)'s partitioning column
     * is seen to be filtered by equality to a constant value or to a previously scanned partition column.
     * When the count is 0, the statement can be executed single-partition.
     * When the count is 1, multi-partition execution can join any number of tables in the collector plan fragment.
     * When the count is 2 or greater, the statement would require three or more fragments to execute, so is disallowed.
     */
    private int m_countOfIndependentlyPartitionedTables = -1;

    /*
     * If true, and the target table is replicated,
     * SP execution is strictly forbidden, even if requested.
     */
    private boolean m_isDML = false;

    /*
     * The table and column name of a partitioning column, typically the first scanned, if there are more than one,
     * proposed in feedback messages for possible use in single-partitioning annotations and attributes.
     */
    private String m_fullColumnName;

    // Result of the most recent analyzeForMultiPartitionAccess call; true until proven otherwise.
    private boolean m_joinValid = true;

    // If m_joinValid is set to false, we also set
    // this string to a hint telling why it is false.
    private String m_recentInvalidReason = null;

    /** Most of the time DML on a replicated table for a plan that is executed
     * as single-partition is a bad idea, and the planner will refuse to do it.
     * However, sometimes we want to bypass this rule; for example, when planning
     * the DELETE statement executed when LIMIT PARTITION ROWS is about to be violated.
     * In this special case, the statement is being planned, for simplicity, as if for
     * single-partition execution, since it never requires a coordinator fragment,
     * but it will only ever be executed in the context of a replicated table MP insert
     * on ALL partitions.*/
    private boolean m_isReplicatedDmlToRunOnAllPartitions = false;

    /**
     * Instances are created through the static factories below.
     *
     * @param inferPartitioning true if single partitioning should be inferred from the statement
     * @param forceSP only meaningful when inferPartitioning is false: true to force
     *        single-partition planning, false to force multi-partition planning
     */
    private StatementPartitioning(boolean inferPartitioning, boolean forceSP) {
        m_inferPartitioning = inferPartitioning;
        m_forceSP = forceSP;
        //* enable to debug */ System.out.println("DEBUG: StatementPartitioning(" + m_inferPartitioning + ", " + m_forceSP + ")");
    }

    /**
     * @return a partitioning that forces single-partition planning without inference
     */
    public static StatementPartitioning forceSP() {
        return new StatementPartitioning(false, true);
    }

    /**
     * @return a partitioning that forces multi-partition planning without inference
     */
    public static StatementPartitioning forceMP() {
        return new StatementPartitioning(false, false);
    }

    /**
     * @return a partitioning that infers single-partition suitability from the statement
     */
    public static StatementPartitioning inferPartitioning() {
        return new StatementPartitioning(true, /* default to MP */ false);
    }

    /**
     * Creates a forced-SP partitioning with the replicated-DML exemption enabled.
     * See comment for m_isReplicatedDmlToRunOnAllPartitions, above.
     *
     * @return a forced-SP partitioning flagged as replicated DML to run on all partitions
     */
    public static StatementPartitioning partitioningForRowLimitDelete() {
        StatementPartitioning partitioning = forceSP();
        partitioning.m_isReplicatedDmlToRunOnAllPartitions = true;
        return partitioning;
    }

    /**
     * @return true if this object was created to infer partitioning from the statement
     */
    public boolean isInferred() {
        return m_inferPartitioning;
    }

    /**
     * Creates a fresh StatementPartitioning that shares this object's planning context
     * (inference mode, forced-SP mode and the row-limit-delete exemption) but carries
     * none of its cached analysis results.
     *
     * @return A new StatementPartitioning
     */
    @Override
    public Object clone() {
        StatementPartitioning copy = new StatementPartitioning(m_inferPartitioning, m_forceSP);
        // The row-limit-delete exemption is set once by its factory and is part of the
        // planning context rather than an analysis result, so it has to survive cloning
        // just like the two constructor flags do.
        copy.m_isReplicatedDmlToRunOnAllPartitions = m_isReplicatedDmlToRunOnAllPartitions;
        return copy;
    }

    /**
     * @return true if single-partition planning was explicitly forced (not inferred)
     */
    public boolean wasSpecifiedAsSingle() {
        return m_forceSP && ! m_inferPartitioning;
    }

    /**
     * Returns true if the expression can be used to restrict plan execution to a single partition.
     * For now this is only a constant or a parameter.  (In the future, one could
     * imagine evaluating expressions like sqrt(8 * 8) and the like during planning)
     *
     * @param expr  The expression to consider
     * @return      true if expr is a ParameterValueExpression or a ConstantValueExpression
     */
    private static boolean isUsefulPartitioningExpression(AbstractExpression expr) {
        return (expr instanceof ParameterValueExpression)
                || (expr instanceof ConstantValueExpression);
    }

    /**
     * Records an expression that equality-filters a partitioning column.
     * The first column name reported is remembered for feedback messages; later ones are ignored.
     * A parameter expression records its parameter index; any other expression has its
     * partitioning value extracted via ConstantValueExpression.extractPartitioningValue.
     *
     * @param fullColumnName table.column name of a(nother) equality-filtered partitioning column
     * @param constExpr -- a constant/parameter-based expression that equality-filters the partitioning column
     * @param valueType the type handed to the value extraction when constExpr is not a parameter
     */
    public void addPartitioningExpression(String fullColumnName, AbstractExpression constExpr,
            VoltType valueType) {
        //* enable to debug */ System.out.println("DEBUG: addPartitioningExpression(" + fullColumnName + ", " + constExpr + ")");
        if (m_fullColumnName == null) {
            m_fullColumnName = fullColumnName;
        }
        m_inferredExpression.add(constExpr);
        if (constExpr instanceof ParameterValueExpression) {
            ParameterValueExpression pve = (ParameterValueExpression)constExpr;
            m_inferredParameterIndex = pve.getParameterIndex();
        } else {
            m_inferredValue = ConstantValueExpression.extractPartitioningValue(valueType, constExpr);
        }
    }

    /**
     * For a multi-partition statement that can definitely be run SP, this is a constant partitioning key value
     * inferred from the analysis (suitable for hashinating).
     * If null, SP may not be safe, or the partitioning may be based on something less obvious like a parameter or constant expression.
     *
     * @return  an instance of String or an instance of container class Long
     */
    public Object getInferredPartitioningValue() {
        return m_inferredValue;
    }

    /**
     * @return the index of the parameter recorded as the partitioning value, or -1 if none was recorded
     */
    public int getInferredParameterIndex() {
        return m_inferredParameterIndex;
    }


    /**
     * @return the number of partitioned table scans counted by analyzeTablePartitioning
     */
    public int getCountOfPartitionedTables() {
        // Should always have been set, early on.
        assert(m_countOfPartitionedTables != -1);
        return m_countOfPartitionedTables;
    }

    /**
     * @return the current count of independently partitioned table scans,
     *         or -1 if no analysis has been run since construction or the last reset
     */
    public int getCountOfIndependentlyPartitionedTables() {
        return m_countOfIndependentlyPartitionedTables;
    }

    /**
     * Returns true if partitioning inference has been requested, and
     * at least one of the following is true:
     *    - There are no independently partitioned tables and the statement is not DML, OR
     *    - There is a single useful partitioning expression
     */
    public boolean isInferredSingle() {
        return m_inferPartitioning &&
                (((m_countOfIndependentlyPartitionedTables == 0) && ! m_isDML)  ||
                        (singlePartitioningExpression() != null));
    }

    /**
     * Returns true if the statement will require two fragments.
     * When inferring, that is whenever the statement is not inferred single-partition.
     * Otherwise, it is whenever SP was not forced and at least one partitioned table is scanned.
     */
    public boolean requiresTwoFragments() {
        if (m_inferPartitioning) {
            if (isInferredSingle()) {
                return false;
            }
        } else {
            if (m_forceSP || (m_countOfPartitionedTables == 0)) {
                return false;
            }
        }
        return true;
    }

    /**
     * smart accessor - only returns a value if it was unique and is useful
     * @return the single recorded partitioning expression if it is a constant or parameter, else null
     */
    public AbstractExpression singlePartitioningExpression() {
        AbstractExpression e = singlePartitioningExpressionForReport();
        if (e != null && isUsefulPartitioningExpression(e)) {
            return e;
        }
        return null;
    }

    /**
     * smart accessor - only returns a value if it was unique.
     * @return the recorded partitioning expression if exactly one was recorded, else null
     */
    public AbstractExpression singlePartitioningExpressionForReport() {
        if (m_inferredExpression.size() == 1) {
            return m_inferredExpression.iterator().next();
        }
        return null;
    }

    /**
     * @return true if the statement was flagged as DML and no independently partitioned tables were counted
     */
    public boolean getIsReplicatedTableDML() {
        return m_isDML && (m_countOfIndependentlyPartitionedTables == 0);
    }

    /**
     * Flags the statement as DML, potentially enabling the replicated table DML check.
     */
    public void setIsDML() { m_isDML = true; }

    /**
     * @return the table.column name first recorded by addPartitioningExpression, or null if none
     */
    public String getFullColumnName() {
        return m_fullColumnName;
    }

    /**
     * Caches the partitioning column of the DML target table.
     * The column is only recorded when partitioning is being inferred.
     *
     * @param partitioncolumn the partitioning column of the DML target
     */
    public void setPartitioningColumnForDML(Column partitioncolumn) {
        if (m_inferPartitioning) {
            m_partitionColForDML = partitioncolumn; // Not used in SELECT plans.
        }
    }

    /**
     * @return the partitioning column cached for DML, or null if none was recorded
     */
    public Column getPartitionColForDML() {
        return m_partitionColForDML;
    }

    /**
     * @return true if this partitioning was created by partitioningForRowLimitDelete
     */
    public boolean isReplicatedDmlToRunOnAllPartitions() {
        return m_isReplicatedDmlToRunOnAllPartitions;
    }

    /**
     * Given the query's list of tables and its collection(s) of equality-filtered columns and their equivalents,
     * determine whether all joins involving partitioned tables can be executed locally on a single partition.
     * This is only the case when they include equality comparisons between partition key columns.
     * VoltDB will reject joins of multiple partitioned tables unless all their partition keys are
     * constrained to be equal to each other.
     * Example: select * from T1, T2 where T1.ID = T2.ID
     * Additionally, in this case, there may be a constant equality filter on any of the columns,
     * which we want to extract as our SP partitioning parameter.
     *
     * This method returns nothing; its results are stored on this object:
     * the number of independently partitioned tables
     * -- partitioned tables that aren't joined or filtered by the same value --
     * is available from getCountOfIndependentlyPartitionedTables, and the caller can check
     * isJoinValid / getJoinInvalidReason to raise an alarm if there is more than one.
     *
     * @param scans The table scans of the statement.
     * @param valueEquivalence Their column equality filters, keyed by expression
     */
    public void analyzeForMultiPartitionAccess(Collection<StmtTableScan> scans,
            HashMap<AbstractExpression, Set<AbstractExpression>> valueEquivalence) {
        //* enable to debug */ System.out.println("DEBUG: analyze4MPAccess w/ scans:" + scans.size() + " filters:" + valueEquivalence.size());
        TupleValueExpression tokenPartitionKey = null;
        Set< Set<AbstractExpression> > eqSets = new HashSet< Set<AbstractExpression> >();
        int unfilteredPartitionKeyCount = 0;

        // reset this flag to forget the last result of the multiple partition access path.
        // AdHoc with parameters will call this function at least two times
        // By default this flag should be true.
        setJoinValid(true);
        setJoinInvalidReason(null);
        boolean subqueryHasReceiveNode = false;
        boolean hasPartitionedTableJoin = false;
        // Iterate over the tables to collect partition columns.
        for (StmtTableScan tableScan : scans) {
            // Replicated tables don't need filter coverage.
            if (tableScan.getIsReplicated()) {
                continue;
            }

            // The partition column can be null in an obscure edge case.
            // The table is declared non-replicated yet specifies no partitioning column.
            // This can occur legitimately when views based on partitioned tables neglect to group by the partition column.
            // The interpretation of this edge case is that the table has "randomly distributed data".
            // In such a case, the table is valid for use by MP queries only and can only be joined with replicated tables
            // because it has no recognized partitioning join key.
            List<SchemaColumn> columnsNeedingCoverage = tableScan.getPartitioningColumns();

            if (tableScan instanceof StmtSubqueryScan) {
                StmtSubqueryScan subScan = (StmtSubqueryScan) tableScan;
                subScan.promoteSinglePartitionInfo(valueEquivalence, eqSets);
                CompiledPlan subqueryPlan = subScan.getBestCostPlan();
                if (( ! subScan.canRunInOneFragment()) ||
                        ((subqueryPlan != null) &&
                         subqueryPlan.rootPlanGraph.hasAnyNodeOfClass(AbstractReceivePlanNode.class))) {
                    if (subqueryHasReceiveNode) {
                        // Has found another subquery with receive node on the same level
                        // Not going to support this kind of subquery join with 2 fragment plan.
                        setJoinValid(false);
                        setJoinInvalidReason("This multipartition query is not plannable.  "
                                             + "It has a subquery which cannot be single partition.");
                        // Still needs to count the independent partition tables
                        // NOTE(review): this break leaves the scan loop, so any remaining scans
                        // are not examined; the count computed below only reflects the scans
                        // seen so far. Confirm that is the intent.
                        break;
                    }
                    subqueryHasReceiveNode = true;

                    if (subScan.isTableAggregate()) {
                        // Partition Table Aggregate only return one aggregate row.
                        // It has been marked with receive node, any join or processing based on
                        // this table aggregate subquery should be done on coordinator.
                        // Joins: has to be replicated table
                        // Any process based on this subquery should require 1 fragment only.
                        continue;
                    }
                } else {
                    // this subquery partition table without receive node
                    hasPartitionedTableJoin = true;
                }
            } else {
                // This table is a partition table
                hasPartitionedTableJoin = true;
            }

            // Look for equality filters that cover one of this scan's partitioning columns.
            boolean unfiltered = true;
            for (AbstractExpression candidateColumn : valueEquivalence.keySet()) {
                if ( ! (candidateColumn instanceof TupleValueExpression)) {
                    continue;
                }
                TupleValueExpression candidatePartitionKey = (TupleValueExpression) candidateColumn;
                if (! canCoverPartitioningColumn(candidatePartitionKey, columnsNeedingCoverage)) {
                    continue;
                }
                unfiltered = false;
                // Remember the first covering key; it supplies the column name and value type
                // when a partitioning expression is recorded below.
                if (tokenPartitionKey == null) {
                    tokenPartitionKey = candidatePartitionKey;
                }
                // Scans whose keys share an equivalence set collapse into one entry of eqSets.
                eqSets.add(valueEquivalence.get(candidatePartitionKey));
            }

            if (unfiltered) {
                ++unfilteredPartitionKeyCount;
            }
        } // end for each table StmtTableScan in the collection

        m_countOfIndependentlyPartitionedTables = eqSets.size() + unfilteredPartitionKeyCount;
        //* enable to debug */ System.out.println("DEBUG: analyze4MPAccess found: " + m_countOfIndependentlyPartitionedTables + " = " + eqSets.size() + " + " + unfilteredPartitionKeyCount);
        if (m_countOfIndependentlyPartitionedTables > 1) {
            setJoinValid(false);
            setJoinInvalidReason("This query is not plannable.  "
                                 + "The planner cannot guarantee that all rows would be in a single partition.");
        }

        // This is the case that subquery with receive node join with another partition table
        // on outer level. Not going to support this kind of join.
        if (subqueryHasReceiveNode && hasPartitionedTableJoin) {
            setJoinValid(false);
            setJoinInvalidReason("This query is not plannable.  It has a subquery which needs cross-partition access.");
        }

        // Every partitioned scan is covered by the same equivalence set:
        // look in that set for a non-column expression to use as the partitioning value.
        if ((unfilteredPartitionKeyCount == 0) && (eqSets.size() == 1)) {
            for (Set<AbstractExpression> partitioningValues : eqSets) {
                for (AbstractExpression constExpr : partitioningValues) {
                    if (constExpr instanceof TupleValueExpression) {
                        continue;
                    }
                    // NOTE(review): tokenPartitionKey is only assigned in the coverage loop above.
                    // This dereference assumes the single set in eqSets was accompanied by a
                    // covering key -- confirm for sets contributed solely by promoteSinglePartitionInfo.
                    VoltType valueType = tokenPartitionKey.getValueType();
                    addPartitioningExpression(tokenPartitionKey.getTableName() +
                            '.' + tokenPartitionKey.getColumnName(), constExpr, valueType);
                    // Only need one constant value.
                    break;
                }
            }
        }
    }

    /**
     * @return false if the most recent multi-partition analysis found the join unplannable
     */
    public boolean isJoinValid() {
        return m_joinValid;
    }

    /**
     * @return the hint recorded when the join was marked invalid, or null
     */
    public String getJoinInvalidReason() {
        return m_recentInvalidReason;
    }

    /**
     * @param isValid the new join validity flag
     */
    public void setJoinValid(boolean isValid) {
        m_joinValid = isValid;
    }

    /**
     * @param why a hint explaining why the join is invalid, or null to clear it
     */
    public void setJoinInvalidReason(String why) {
        m_recentInvalidReason = why;
    }

    /**
     * Checks whether a candidate column matches one of a scan's partitioning columns,
     * comparing the candidate's table alias and column name against each
     * partitioning column's table alias and column alias.
     *
     * @param candidatePartitionKey the equality-filtered column to test
     * @param columnsNeedingCoverage the scan's partitioning columns, possibly null
     * @return true if the candidate matches any partitioning column; false if none match
     *         or columnsNeedingCoverage is null
     */
    private static boolean canCoverPartitioningColumn(TupleValueExpression candidatePartitionKey,
            List<SchemaColumn> columnsNeedingCoverage) {
        if (columnsNeedingCoverage == null) {
            return false;
        }

        for (SchemaColumn col: columnsNeedingCoverage) {
            String partitionedTableAlias = col.getTableAlias();
            String columnNeedingCoverage = col.getColumnAlias();

            assert(candidatePartitionKey.getTableAlias() != null);
            if ( ! candidatePartitionKey.getTableAlias().equals(partitionedTableAlias)) {
                continue;
            }
            String candidateColumnName = candidatePartitionKey.getColumnName();
            if ( ! candidateColumnName.equals(columnNeedingCoverage)) {
                continue;
            }

            // Maybe need more checkings
            return true;
        }

        return false;
    }

    /**
     * This simple analysis counts the number of partitioned tables in the join tree
     * of a query, and initializes a guess for the count of independently partitioned tables.
     *
     * @param collection the table scans of the statement
     * @throws PlanningErrorException declared for callers; not thrown by the code visible here
     */
    void analyzeTablePartitioning(Collection<StmtTableScan> collection)
            throws PlanningErrorException
    {
        m_countOfPartitionedTables = 0;
        // Do we have a need for a distributed scan at all?
        // Iterate over the tables to collect partition columns.
        for (StmtTableScan tableScan : collection) {
            if ( ! tableScan.getIsReplicated()) {
                ++m_countOfPartitionedTables;
            }
        }
        // Initial guess -- as if no equality filters.
        m_countOfIndependentlyPartitionedTables = m_countOfPartitionedTables;
    }

    /**
     * Sometimes when we fail to plan a statement, we try again with different inputs
     * using the same StatementPartitioning object.  In this case, it's incumbent on
     * callers to reset the cached analysis state set by calling this method.
     * The planning context (inference mode, forced-SP mode and the row-limit-delete
     * exemption) is left untouched.
     *
     * TODO: one could imagine separating this class into two classes:
     * - One for partitioning context (such as AdHoc, stored proc, row limit delete
     *   trigger), which is immutable
     * - One to capture the results of partitioning analysis, which can be GC'd when no
     *   longer needed
     * This might avoid some of the pitfalls of reused stateful objects.
     *   */
    public void resetAnalysisState() {
        m_countOfIndependentlyPartitionedTables = -1;
        m_countOfPartitionedTables = -1;
        m_fullColumnName = null;
        m_inferredExpression.clear();
        m_inferredParameterIndex = -1;
        m_inferredValue = null;
        m_isDML = false;
        setJoinValid(true);
        setJoinInvalidReason(null);
        m_partitionColForDML = null;
    }

}