VerifyAggregates.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Jul 27, 2011
 */

package com.bigdata.rdf.sail.sparql;

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.concurrent.atomic.AtomicBoolean;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.Bind;
import com.bigdata.bop.IBind;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IValueExpression;
import com.bigdata.bop.IValueExpressionConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.aggregate.AggregateBase;
import com.bigdata.bop.aggregate.IAggregate;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.constraints.SPARQLConstraint;
import com.bigdata.rdf.internal.impl.literal.XSDBooleanIV;
import com.bigdata.rdf.model.BigdataLiteral;
import com.bigdata.rdf.sail.sparql.ast.VisitorException;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.FunctionNode;
import com.bigdata.rdf.sparql.ast.FunctionRegistry;
import com.bigdata.rdf.sparql.ast.FunctionRegistry.UnknownFunctionBOp;
import com.bigdata.rdf.sparql.ast.GroupByNode;
import com.bigdata.rdf.sparql.ast.HavingNode;
import com.bigdata.rdf.sparql.ast.IValueExpressionNode;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import cutthecrap.utils.striterators.Striterator;

/**
 * An object which encapsulates the validation and state of an aggregation
 * operation with an optional GROUP BY clause, SELECT expressions, and an
 * optional HAVING clause. The SELECT expressions MUST be aggregates (if the
 * SELECT expressions do not involve aggregates then you should not be using an
 * aggregation operator to compute the select expressions).
 * <p>
 * Note: This is a port of {@link com.bigdata.bop.solutions.GroupByState} that
 * does not depend on the blazegraph operator model. It was developed as part of
 * BLZG-1176 to decouple the SPARQL parser from the database.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 *
 * @see https://jira.blazegraph.com/browse/BLZG-1176
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public class VerifyAggregates {

    /** Logger used for the debug and trace output emitted during validation. */
    private static final Logger log = Logger.getLogger(VerifyAggregates.class);
    
    /**
     * The SELECT expressions. The constructor builds one {@link Bind} per
     * projection element.
     */
    private final IValueExpression<?>[] select;
    /**
     * The GROUP BY value expressions, or <code>null</code> when no GROUP BY
     * clause was given or it has no children.
     */
    private final IValueExpression<?>[] groupBy;
    /**
     * One constraint per HAVING expression, or <code>null</code> when no HAVING
     * clause was given.
     */
    private final IConstraint[] having;
    /**
     * Variables declared by the GROUP BY clause (bare variables and variables
     * bound using "AS"), in declaration order.
     */
    private final LinkedHashSet<IVariable<?>> groupByVars = new LinkedHashSet<IVariable<?>>();
    /**
     * Variables declared by the SELECT expressions validated so far, in
     * declaration order.
     */
    private final LinkedHashSet<IVariable<?>> selectVars = new LinkedHashSet<IVariable<?>>();
    /**
     * Variables observed inside an aggregation context which were not declared
     * by the GROUP BY or SELECT clauses. Note: within this class the set is
     * only written, never read.
     */
    private final LinkedHashSet<IVariable<?>> columnVars = new LinkedHashSet<IVariable<?>>();

    /**
     * Human readable representation of the validated clauses, rendered as
     * <code>SimpleClassName{select=[...],groupBy=[...],having=[...]}</code>. A
     * clause which is absent is rendered as <code>null</code>.
     */
    @Override
    public String toString() {
        return getClass().getSimpleName()
                + "{select=" + Arrays.toString(select)
                + ",groupBy=" + Arrays.toString(groupBy)
                + ",having=" + Arrays.toString(having)
                + "}";
    }
    
    /**
     * Validates an aggregation query described by its SELECT, GROUP BY and
     * HAVING clauses. The clauses are processed in the order GROUP BY, SELECT,
     * HAVING since the validity of a later expression depends on the variables
     * declared by the earlier ones.
     * 
     * @param projection
     *            The SELECT clause (required, must have at least one element).
     * @param groupBy
     *            The GROUP BY clause (optional; an empty clause is treated
     *            like <code>null</code>).
     * @param having
     *            The HAVING clause (optional).
     * 
     * @throws IllegalArgumentException
     *             if the projection is <code>null</code> or empty, if an
     *             aggregate appears in the GROUP BY clause, or if a SELECT or
     *             HAVING expression is not a valid aggregate expression.
     */
    public VerifyAggregates(final ProjectionNode projection,
            final GroupByNode groupBy, final HavingNode having) {

        // Fail fast: the projection is required before anything else is built.
        if (projection == null)
            throw new IllegalArgumentException("projection is required");

        if (projection.arity() == 0)
            throw new IllegalArgumentException("projection is empty");

        // Normalize a missing or empty GROUP BY clause to null.
        this.groupBy = groupBy == null || groupBy.arity() == 0 ? null : groupBy.getValueExpressions();

        /*
         * Build the SELECT expressions.
         * 
         * Function nodes whose value expression has not been set yet are
         * replaced by placeholders (see convertAggregates()) so that
         * isAggregate() is able to recognize aggregates.
         */
        {
            final IValueExpression<?>[] selectExprs = new IValueExpression[projection.arity()];
            int n = 0;
            final Striterator projectionNodes = new Striterator(projection.iterator());
            while (projectionNodes.hasMoreElements()) {
                final AssignmentNode assignment = (AssignmentNode) projectionNodes.nextElement();
                IValueExpression expr = assignment.getValueExpression();
                final IValueExpressionNode exprNode = assignment.getValueExpressionNode();
                if (expr == null && exprNode instanceof FunctionNode) {
                    expr = convertAggregates((FunctionNode) exprNode);
                }
                selectExprs[n++] = new Bind(assignment.getVar(), expr);
            }
            this.select = selectExprs;
        }

        /*
         * Build the HAVING constraints.
         * 
         * Note: An entry is left as null when the node has no value expression
         * and is not a function node. Such an entry is rejected below as
         * "Not an aggregate".
         */
        if (having != null) {
            final IConstraint[] constraints = new IConstraint[having.arity()];
            int n = 0;
            for (final IValueExpressionNode node : having) {

                final IValueExpression<? extends IV> ve = node.getValueExpression();
                if (ve != null) {
                    constraints[n] = new SPARQLConstraint<XSDBooleanIV<BigdataLiteral>>(
                            ve);
                } else if (node instanceof FunctionNode) {

                    final BOp expr = convertAggregates((FunctionNode) node);
                    constraints[n] = new SPARQLConstraint<XSDBooleanIV<BigdataLiteral>>(new BOp[]{
                            expr}, null);

                    if (log.isDebugEnabled())
                        log.debug("Unknown node "+node);
                }
                n++;

            }
            this.having = constraints;
        } else {
            this.having = null;
        }

        // Set by isAggregate() iff any aggregate expression uses DISTINCT.
        final AtomicBoolean anyDistinct = new AtomicBoolean(false);

        // Set by isAggregate() iff any aggregate nests another aggregate.
        final AtomicBoolean nestedAggregates = new AtomicBoolean(false);

        /*
         * Validate GROUP_BY value expressions.
         * 
         * Note: The GROUP BY clause may include bare variables such as "?x",
         * non-aggregate expressions such as "STR(?x)" and declarations of
         * variables for non-aggregate expressions such as "STR(?x) as strX".
         * However, only bare variables or variables declared using "AS" may
         * appear in the SELECT clause. Those variables are collected in
         * [groupByVars].
         * 
         * Note: Aggregate functions MAY NOT appear in the GROUP_BY clause.
         * 
         * Note: This tests the normalized field rather than the constructor
         * argument. An empty (but non-null) GROUP BY node was normalized to
         * null above and must not be iterated.
         */
        if (this.groupBy != null) {

            // Collect top-level variables from GROUP_BY value expressions.
            for (final IValueExpression<?> expr : this.groupBy) {
                if (expr instanceof IVariable<?>) {
                    groupByVars.add((IVariable<?>) expr);
                } else if (expr instanceof IBind<?>) {
                    final IBind<?> bindExpr = (IBind<?>) expr;
                    final IValueExpression<?> e = bindExpr.getExpr();
                    if (isAggregate(e, false/* isSelectClause */,
                            null/* isSelectDependency */, nestedAggregates,
                            anyDistinct)) {
                        throw new IllegalArgumentException(
                                "Aggregate expression not allowed in GROUP_BY: "
                                        + expr);
                    }
                    groupByVars.add(bindExpr.getVar());
                }
            }

        }

        /*
         * Validate SELECT value expressions.
         * 
         * Note: Each SELECT value expression must be either a top-level
         * IVariable in the GROUP BY clause or an IBind wrapping a value
         * expression consisting solely of aggregates (which may of course wrap
         * bare variables) and constants.
         */
        {
            // Set by isAggregate() iff an aggregate expression references a
            // variable declared by another expression in the select clause.
            final AtomicBoolean selectDependency = new AtomicBoolean(false);
            for (final IValueExpression<?> expr : this.select) {
                if (expr instanceof IVariable<?>) {
                    final IVariable<?> var = (IVariable<?>) expr;
                    if (!groupByVars.contains(var)) {
                        throw new IllegalArgumentException(
                                "Bare variable not declared by GROUP_BY clause: "
                                        + var);
                    }
                    selectVars.add(var);
                } else if (expr instanceof IBind<?>) {
                    /*
                     * Child of IBind must be a valid aggregate expression
                     * consisting solely of aggregates (which may wrap bare
                     * variables declared in the GROUP_BY clause) and constants.
                     * 
                     * Note: Top-level variables already declared in a GROUP_BY
                     * or SELECT clause MAY appear within other value
                     * expressions in the SELECT clause. The variable is
                     * therefore added to [selectVars] only after its own
                     * expression has been validated.
                     */
                    final IBind<?> bindExpr = (IBind<?>) expr;
                    final IValueExpression<?> e = bindExpr.getExpr();
                    if (!isAggregate(e, true/* isSelectClause */,
                            selectDependency, nestedAggregates, anyDistinct))
                        throw new IllegalArgumentException("Not an aggregate: "
                                + bindExpr);
                    selectVars.add(bindExpr.getVar());
                } else {
                    throw new IllegalArgumentException(
                            "Top-level of SELECT expression must be IVariable or IBind: "
                                    + expr);
                }
            }
        }

        /*
         * Validate the HAVING clause.
         * 
         * Any value expressions used within the IConstraint[] must be
         * aggregates (as defined for SELECT expressions).
         * 
         * Note: Top-level variables already declared in a GROUP_BY or SELECT
         * clause MAY appear within value expressions in the HAVING clause.
         */
        if (this.having != null) {

            for (final IConstraint c : this.having) {

                if (!isAggregate(c, false/* isSelectClause */,
                        null/* isSelectDependency */, nestedAggregates,
                        anyDistinct))
                    throw new IllegalArgumentException("Not an aggregate: " + c);

            }
        }

    }

    /**
     * Build a placeholder value expression for a function node whose value
     * expression has not been set. The placeholder mirrors the shape of the
     * function node: nested function nodes are converted recursively while all
     * other arguments are carried over unchanged.
     * 
     * @param exprNode
     *            The node to convert (may be <code>null</code>).
     * 
     * @return A dummy {@link AggregateBase} when the function is registered as
     *         an aggregate, an {@link UnknownFunctionBOp} for any other
     *         function, and <code>null</code> when <i>exprNode</i> is
     *         <code>null</code> or is not a {@link FunctionNode}.
     */
    private IValueExpression convertAggregates(final BOp exprNode) {

        if (!(exprNode instanceof FunctionNode)) {
            /*
             * Nothing to convert (this also covers null). Returning before the
             * arguments are visited avoids converting children whose result
             * would be discarded anyway.
             */
            return null;
        }

        final int nargs = exprNode.args().size();
        final BOp[] args = new BOp[nargs];
        for (int i = 0; i < nargs; i++) {
            final BOp arg = exprNode.args().get(i);
            final IValueExpression converted = convertAggregates(arg);
            if (converted != null) {
                args[i] = converted;
            } else {
                // Not a function node: keep the original argument.
                args[i] = arg;
            }
        }

        if (FunctionRegistry.isAggregate(((FunctionNode) exprNode).getFunctionURI())) {
            // Placeholder which is only used to answer "is this an aggregate?".
            return new AggregateBase(args, null) {
                @Override public void reset() {}
                @Override public IV done() { return null; }
            };
        }

        return new UnknownFunctionBOp(args, null);
    }

    /**
     * Decide whether an expression is an aggregate.
     * <p>
     * An aggregate is composed of constants, {@link IAggregate} functions, and
     * references to {@link IVariable}s which are themselves aggregates. An
     * {@link IVariable} is an aggregate if it appears as a bare variable in a
     * GROUP_BY clause or if it is declared by a prior value expression in a
     * GROUP_BY or SELECT clause. The answer therefore depends on the set of
     * aggregates known so far: the value expressions of the GROUP_BY clause
     * must be processed first (in order), followed by the value expressions of
     * the SELECT clause (in order).
     * <p>
     * A non-aggregate variable is only allowed within an {@link IAggregate}
     * function. For example, given <code>SUM(?x) as ?y</code>, <code>?x</code>
     * must be a non-aggregate variable and <code>?y</code> will be an
     * aggregate variable.
     * <p>
     * Aggregate variables may be used both inside and outside of an
     * {@link IAggregate} function as long as the variable was declared before
     * it was used. For example, the following are legal:
     * 
     * <pre>
     * SELECT SUM(?x) as ?y, SUM(?x + ?y) as ?z, SUM(?x)+AVG(?x) as ?z2
     * 
     * SELECT SUM(?x) as ?y, SUM(?x + COUNT(?y)) as ?z
     * </pre>
     * 
     * Patterns where an aggregate depends on a prior aggregate prevent certain
     * optimizations, notably you have to evaluate each aggregate in turn rather
     * than evaluating them in parallel over the solutions in a group. If any
     * such pattern is observed in the SELECT clause then this method sets
     * <code>isSelectDependency := true</code> as a side-effect.
     * 
     * @param op
     *            An {@link IValueExpression} or {@link IConstraint}. A
     *            <code>null</code> reference is reported as not being an
     *            aggregate.
     * @param isSelectClause
     *            <code>true</code> if the <i>op</i> appears in a SELECT clause.
     * @param isSelectDependency
     *            Set as a side-effect when an {@link IValueExpression}
     *            appearing in a SELECT clause has a dependency on an
     *            {@link IVariable} declared earlier in the SELECT clause. This
     *            argument is optional unless <i>isSelectClause</i> is
     *            <code>true</code>.
     * @param isNestedAggregates
     *            Set as a side-effect when an {@link IValueExpression}
     *            containing an {@link IAggregate} nests another
     *            {@link IAggregate} within it.
     * @param isAnyDistinct
     *            Set as a side-effect if an {@link IAggregate} function is
     *            encountered which reports <code>true</code> for
     *            {@link IAggregate#isDistinct()}.
     * 
     * @return <code>true</code> iff the operator is an aggregate.
     */
    protected boolean isAggregate(final BOp op,
            final boolean isSelectClause,
            final AtomicBoolean isSelectDependency,
            final AtomicBoolean isNestedAggregates,
            final AtomicBoolean isAnyDistinct) {

        // A missing operator has not been prepared yet. It can not be an
        // aggregate since a dummy aggregate BOp would have been provided for
        // one.
        if (op == null) {
            return false;
        }

        // A constant appearing in the root of a SELECT expression counts as an
        // aggregate.
        final boolean constantAtSelectRoot = isSelectClause
                && op instanceof IConstant;

        return constantAtSelectRoot
                || isAggregate(op, isSelectClause, isSelectDependency,
                        isNestedAggregates, isAnyDistinct,
                        false/* withinAggregateFunction */);

    }

    /**
     * Recursive worker for
     * {@link #isAggregate(BOp, boolean, AtomicBoolean, AtomicBoolean, AtomicBoolean)}
     * which also tracks whether the operator is nested within an aggregate
     * function.
     * 
     * @param op
     *            The operator to inspect (must not be <code>null</code>).
     * @param isSelectClause
     *            <code>true</code> if the <i>op</i> appears in a SELECT clause.
     * @param isSelectDependency
     *            Set as a side-effect when an expression in the SELECT clause
     *            references a variable declared earlier in the SELECT clause
     *            (optional).
     * @param isNestedAggregates
     *            Set as a side-effect when an aggregate is found within
     *            another aggregate.
     * @param isAnyDistinct
     *            Set as a side-effect when an {@link IAggregate} reports
     *            <code>true</code> for {@link IAggregate#isDistinct()}.
     * @param withinAggregateFunction
     *            <code>true</code> iff some parent of <i>op</i> is an
     *            aggregate.
     * 
     * @return <code>true</code> iff the operator is an aggregate.
     * 
     * @throws IllegalArgumentException
     *             if a variable which is not an aggregate is used outside of
     *             an aggregate function in a SELECT expression.
     */
    private boolean isAggregate(final BOp op,
            final boolean isSelectClause,
            final AtomicBoolean isSelectDependency,
            final AtomicBoolean isNestedAggregates,
            final AtomicBoolean isAnyDistinct,
            final boolean withinAggregateFunction) {

        /*
         * An aggregate is either a resolved IAggregate operator or a function
         * node whose function URI is registered as an aggregate.
         */
        final boolean aggregateOp = op instanceof IAggregate<?>
                || ((op instanceof FunctionNode) && FunctionRegistry
                        .isAggregate(((FunctionNode) op).getFunctionURI()));

        if (aggregateOp) {
            if (withinAggregateFunction) {
                isNestedAggregates.set(true);
            }
            /*
             * Note: Only an IAggregate can report DISTINCT. The type must be
             * tested before the cast since a function node is not an
             * IAggregate and the cast would fail for it.
             */
            if (op instanceof IAggregate<?>
                    && ((IAggregate<?>) op).isDistinct()) {
                isAnyDistinct.set(true);
            }
        }

        // Everything below an aggregate is evaluated in an aggregation context.
        final boolean aggregationContext = withinAggregateFunction
                || aggregateOp;

        boolean isAggregate = aggregationContext;

        if (op instanceof IVariable<?>) {
            final IVariable<?> v = (IVariable<?>) op;
            if (aggregationContext) {
                /*
                 * Decide if a variable appearing within an aggregation
                 * context is a reference to a previously observed aggregate.
                 * If not, then we presume it to be a variable in the detail
                 * records and aggregation will (at least logically) form a
                 * column projection of that variable for each group.
                 */
                if (!groupByVars.contains(v) && !selectVars.contains(v)) {
                    columnVars.add(v);
                }
                return false;
            }
            if (groupByVars.contains(v)) {
                return true;
            }
            if (selectVars.contains(v)) {
                if (isSelectClause && isSelectDependency != null)
                    isSelectDependency.set(true);
                return true;
            }
            if (isSelectClause) {
                /*
                 * Note: This is also thrown when there is a forward reference
                 * to a variable in the select expression which we have not
                 * yet seen.
                 * 
                 * Note: This situation does not arise for the GROUP_BY clause
                 * because it may only reference non-aggregate variables.
                 * 
                 * Note: This situation does not arise for the HAVING clause
                 * because it can not define new variables using "AS".
                 */
                throw new IllegalArgumentException(
                        "Non-aggregate variable in select expression: " + v);
            }
        }

        final Iterator<BOp> itr = op.argIterator();
        while (itr.hasNext()) {
            final BOp arg = itr.next();
            if (selectVars.contains(arg)) {
                // Reference to a variable declared earlier in the SELECT.
                if (isSelectClause && isSelectDependency != null)
                    isSelectDependency.set(true);
                return true;
            }
            if (log.isTraceEnabled())
                log.trace("op=" + op.getClass()
                        + //
                        ", isSelectClause="
                        + isSelectClause //
                        + ", isSelectDependency="
                        + isSelectDependency //
                        + ", isNestedAggregates="
                        + isNestedAggregates//
                        + ", isAnyDistinct="
                        + isAnyDistinct //
                        + ", withinAggregateFunction="
                        + withinAggregateFunction //
                        + ", aggregationContext=" + aggregationContext //
                        + ", groupByVars=" + groupByVars//
                        + ", selectVars=" + selectVars //
                        + ", arg=" + arg//
                );
            // recursion through child value expression.
            isAggregate |= isAggregate(arg, isSelectClause, isSelectDependency,
                    isNestedAggregates, isAnyDistinct, aggregationContext/* withinAggregateFunction */);
        }

        return isAggregate;

    }

    /**
     * Verify the various conditions that must be met when a query uses GROUP BY
     * or when a query uses aggregates in a PROJECTION. Nothing is checked when
     * the query is not an aggregation query.
     * 
     * @param queryBase
     *            The query.
     * 
     * @throws VisitorException
     *             if the query is an aggregation query and either uses a
     *             wildcard projection or fails the validation performed by
     *             {@link VerifyAggregates#VerifyAggregates(ProjectionNode, GroupByNode, HavingNode)}.
     */
    public static void verifyAggregate(final QueryBase queryBase)
            throws VisitorException {

        /*
         * The following code has some dependencies on whether or not the
         * value expressions have been cached. That is not done until we get
         * into AST2BOpUtility. I have worked some hacks to support this in
         * FunctionRegistry.isAggregate() and StaticAnalysis.isAggregate().
         * However, the code is still hitting some edge cases.
         * 
         * MP: I fixed this by running the ASTSetValueOptimizer earlier in the
         * parsing process - ie. in Bigdata2ASTSPARQLParser.parseQuery2.
         * 
         * There is some commented out code from openrdf that depends on setting
         * a flag for the expression if an AggregationCollector reports at least
         * one aggregation in a projection element. We could do this same thing
         * here but we still need to have the logic to figure out what is an
         * invalid aggregate.
         * 
         * MP: I think the place to go look for reference is Sesame's
         * TupleExprBuilder, especially:
         * 
         * public TupleExpr visit(ASTSelect node, Object data)
         * 
         * And also look at the AggregateCollector.
         */

        // Normalize each missing or empty clause to null.
        final ProjectionNode declaredProjection = queryBase.getProjection();
        final ProjectionNode projection = declaredProjection == null
                || declaredProjection.isEmpty() ? null : declaredProjection;

        final GroupByNode declaredGroupBy = queryBase.getGroupBy();
        final GroupByNode groupBy = declaredGroupBy == null
                || declaredGroupBy.isEmpty() ? null : declaredGroupBy;

        final HavingNode declaredHaving = queryBase.getHaving();
        final HavingNode having = declaredHaving == null
                || declaredHaving.isEmpty() ? null : declaredHaving;

        // true if this is an aggregation query.
        final boolean isAggregate = StaticAnalysis.isAggregate(projection,
                groupBy, having);

        if (!isAggregate) {
            // Nothing to verify.
            return;
        }

        /*
         * Note: The projection was normalized to null above when it is missing
         * or empty, so it must be tested before it is dereferenced. A missing
         * projection is rejected by the constructor below and reported as a
         * "Bad aggregate".
         */
        if (projection != null && projection.isWildcard())
            throw new VisitorException(
                    "Wildcard not allowed with aggregate.");

        try {

            /*
             * Delegate logic to validate the aggregate query.
             */

            new VerifyAggregates(projection, groupBy, having);

        } catch (final IllegalArgumentException ex) {

            throw new VisitorException("Bad aggregate", ex);

        }

    }
}