/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jul 27, 2011
*/
package com.bigdata.rdf.sail.sparql;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.Bind;
import com.bigdata.bop.IBind;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IValueExpression;
import com.bigdata.bop.IValueExpressionConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.aggregate.AggregateBase;
import com.bigdata.bop.aggregate.IAggregate;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.internal.constraints.SPARQLConstraint;
import com.bigdata.rdf.internal.impl.literal.XSDBooleanIV;
import com.bigdata.rdf.model.BigdataLiteral;
import com.bigdata.rdf.sail.sparql.ast.VisitorException;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.FunctionNode;
import com.bigdata.rdf.sparql.ast.FunctionRegistry;
import com.bigdata.rdf.sparql.ast.FunctionRegistry.UnknownFunctionBOp;
import com.bigdata.rdf.sparql.ast.GroupByNode;
import com.bigdata.rdf.sparql.ast.HavingNode;
import com.bigdata.rdf.sparql.ast.IValueExpressionNode;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import cutthecrap.utils.striterators.Striterator;
/**
* An object which encapsulates the validation and state of an aggregation
* operation with an optional GROUP BY clause, SELECT expressions, and an
* optional HAVING clause. The SELECT expressions MUST be aggregates (if the
* SELECT expressions do not involve aggregates then you should not be using an
* aggregation operator to compute the select expressions).
* <p>
* Note: This is a port of {@link com.bigdata.bop.solutions.GroupByState} that
* does not depend on the blazegraph operator model. It was developed as part of
* BLZG-1176 to decouple the SPARQL parser from the database.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*
* @see https://jira.blazegraph.com/browse/BLZG-1176
*/
@SuppressWarnings({ "rawtypes", "unchecked" })
public class VerifyAggregates {
/** Logger for this class. */
private static final Logger log = Logger.getLogger(VerifyAggregates.class);
/**
 * The SELECT expressions. The constructor stores one {@link Bind} per
 * projection element, in iteration order of the projection.
 */
private final IValueExpression<?>[] select;
/**
 * The GROUP BY value expressions, or <code>null</code> when no GROUP BY
 * node was given or the node has no arguments.
 */
private final IValueExpression<?>[] groupBy;
/**
 * One constraint slot per HAVING argument, or <code>null</code> when no
 * HAVING node was given. A slot is left <code>null</code> when the HAVING
 * argument has no value expression and is not a {@link FunctionNode}.
 */
private final IConstraint[] having;
/**
 * Variables declared by the GROUP BY clause: bare variables and the
 * variables bound by {@link IBind} expressions, in declaration order.
 */
private final LinkedHashSet<IVariable<?>> groupByVars = new LinkedHashSet<IVariable<?>>();
/**
 * Variables declared by the SELECT clause so far, in declaration order.
 * Filled while the SELECT expressions are validated.
 */
private final LinkedHashSet<IVariable<?>> selectVars = new LinkedHashSet<IVariable<?>>();
/**
 * Variables observed inside an aggregation context that are neither in
 * {@link #groupByVars} nor in {@link #selectVars}. Only written by
 * isAggregate(); nothing in this class reads it.
 */
private final LinkedHashSet<IVariable<?>> columnVars = new LinkedHashSet<IVariable<?>>();
/**
 * Render the SELECT, GROUP BY and HAVING expressions held by this object,
 * prefixed by the simple name of the runtime class. A clause whose array is
 * <code>null</code> is rendered as the text <code>null</code>.
 */
@Override
public String toString() {
    return getClass().getSimpleName()
            + "{select=" + Arrays.toString(select)
            + ",groupBy=" + Arrays.toString(groupBy)
            + ",having=" + Arrays.toString(having)
            + "}";
}
/**
 * Validate an aggregation query described by its SELECT, GROUP BY and
 * HAVING clauses. The constructor either completes normally (the query is
 * a well-formed aggregation) or throws.
 * <p>
 * Processing order matters: the GROUP BY expressions are examined first so
 * that the variables they declare are known, then the SELECT expressions
 * (in order, so that later expressions may reference earlier ones), then
 * the HAVING constraints.
 *
 * @param projection
 *            The SELECT clause. Required and must have at least one
 *            argument.
 * @param groupBy
 *            The optional GROUP BY clause. A clause without arguments is
 *            treated like a missing clause.
 * @param having
 *            The optional HAVING clause.
 *
 * @throws IllegalArgumentException
 *             if the projection is <code>null</code> or empty, if an
 *             aggregate appears in the GROUP BY clause, if a bare SELECT
 *             variable is not declared by the GROUP BY clause, if a SELECT
 *             expression or HAVING constraint is not an aggregate, or if a
 *             SELECT expression is neither an {@link IVariable} nor an
 *             {@link IBind}.
 */
public VerifyAggregates(final ProjectionNode projection,
        final GroupByNode groupBy, final HavingNode having) {

    // Normalize a missing or empty GROUP BY to null.
    this.groupBy = groupBy == null || groupBy.arity() == 0 ? null
            : groupBy.getValueExpressions();

    // Reject an unusable projection before it is dereferenced.
    if (projection == null)
        throw new IllegalArgumentException("projection is required");
    if (projection.arity() == 0)
        throw new IllegalArgumentException("projection is empty");

    /*
     * Wrap each projection element as Bind(var, expr). When the value
     * expression has not been resolved yet and the element is a function
     * node, substitute a placeholder operator tree so that aggregates can
     * still be recognized by isAggregate().
     */
    {
        final IValueExpression<?>[] selectExprs = new IValueExpression[projection
                .arity()];
        int n = 0;
        final Striterator projectionNodes = new Striterator(
                projection.iterator());
        while (projectionNodes.hasMoreElements()) {
            final AssignmentNode assignment = (AssignmentNode) projectionNodes
                    .nextElement();
            IValueExpression expr = assignment.getValueExpression();
            final IValueExpressionNode exprNode = assignment
                    .getValueExpressionNode();
            if (expr == null && exprNode instanceof FunctionNode) {
                expr = convertAggregates((FunctionNode) exprNode);
            }
            selectExprs[n++] = new Bind(assignment.getVar(), expr);
        }
        this.select = selectExprs;
    }

    /*
     * Wrap each HAVING argument as a constraint. A slot stays null when the
     * argument has no value expression and is not a function node; such a
     * slot is rejected below as "Not an aggregate".
     */
    if (having != null) {
        final IConstraint[] constraints = new IConstraint[having.arity()];
        int i = 0;
        for (final IValueExpressionNode node : having) {
            final IValueExpression<? extends IV> ve = node
                    .getValueExpression();
            if (ve != null) {
                constraints[i] = new SPARQLConstraint<XSDBooleanIV<BigdataLiteral>>(
                        ve);
            } else if (node instanceof FunctionNode) {
                final BOp expr = convertAggregates((FunctionNode) node);
                constraints[i] = new SPARQLConstraint<XSDBooleanIV<BigdataLiteral>>(
                        new BOp[] { expr }, null);
                // NOTE(review): this message is logged for function nodes
                // that were converted successfully; it may have been
                // intended for the unhandled case. Left unchanged.
                log.debug("Unknown node "+node);
            }
            i++;
        }
        this.having = constraints;
    } else {
        this.having = null;
    }

    /*
     * Flags set as side-effects by isAggregate(). They must be non-null
     * because isAggregate() writes to them; their values are not consulted
     * by this class.
     */
    // true iff any aggregate expression uses DISTINCT.
    final AtomicBoolean anyDistinct = new AtomicBoolean(false);
    // true iff any aggregate expression nests another aggregate expression.
    final AtomicBoolean nestedAggregates = new AtomicBoolean(false);

    /*
     * Validate GROUP_BY value expressions.
     *
     * Note: The GROUP BY clause may include bare variables such as "?x",
     * non-aggregate expressions such as "STR(?x)" and declarations of
     * variables for non-aggregate expressions such as "STR(?x) as strX".
     * However, only bare variables or variables declared using "AS" may
     * appear in the SELECT clause. Those variables are collected in
     * [groupByVars].
     *
     * Note: Aggregate functions MAY NOT appear in the GROUP_BY clause.
     *
     * Note: The guard tests the normalized field rather than the
     * constructor argument. The field is null for a GROUP BY node without
     * arguments, and iterating it in that case would fail with a
     * NullPointerException.
     */
    if (this.groupBy != null) {
        for (final IValueExpression<?> expr : this.groupBy) {
            if (expr instanceof IVariable<?>) {
                groupByVars.add((IVariable<?>) expr);
            } else if (expr instanceof IBind<?>) {
                final IBind<?> bindExpr = (IBind<?>) expr;
                final IValueExpression<?> e = bindExpr.getExpr();
                if (isAggregate(e, false/* isSelectClause */,
                        null/* isSelectDependency */, nestedAggregates,
                        anyDistinct)) {
                    throw new IllegalArgumentException(
                            "Aggregate expression not allowed in GROUP_BY: "
                                    + expr);
                }
                groupByVars.add(bindExpr.getVar());
            }
        }
    }

    /*
     * Validate SELECT value expressions.
     *
     * Each SELECT value expression must be either a top-level IVariable
     * declared by the GROUP BY clause or an IBind wrapping a value
     * expression consisting solely of aggregates (which may of course wrap
     * bare variables) and constants.
     */
    {
        // true iff any aggregate expression uses a reference to another
        // aggregate expression in the select clause.
        final AtomicBoolean selectDependency = new AtomicBoolean(false);
        for (final IValueExpression<?> expr : this.select) {
            if (expr instanceof IVariable<?>) {
                final IVariable<?> var = (IVariable<?>) expr;
                if (!groupByVars.contains(var)) {
                    throw new IllegalArgumentException(
                            "Bare variable not declared by GROUP_BY clause: "
                                    + var);
                }
                selectVars.add(var);
            } else if (expr instanceof IBind<?>) {
                /*
                 * Note: Top-level variables already declared in a GROUP_BY
                 * or SELECT clause MAY appear within other value
                 * expressions in the SELECT clause. The variable bound
                 * here is added to [selectVars] only after its expression
                 * was validated, so forward references are rejected.
                 */
                final IBind<?> bindExpr = (IBind<?>) expr;
                final IValueExpression<?> e = bindExpr.getExpr();
                if (!isAggregate(e, true/* isSelectClause */,
                        selectDependency, nestedAggregates, anyDistinct))
                    throw new IllegalArgumentException("Not an aggregate: "
                            + bindExpr);
                selectVars.add(bindExpr.getVar());
            } else {
                throw new IllegalArgumentException(
                        "Top-level of SELECT expression must be IVariable or IBind: "
                                + expr);
            }
        }
    }

    /*
     * Validate the HAVING clause.
     *
     * Every constraint must be an aggregate expression (as defined for
     * SELECT expressions). Top-level variables already declared in a
     * GROUP_BY or SELECT clause MAY appear within value expressions in the
     * HAVING clause.
     */
    if (this.having != null) {
        for (final IConstraint c : this.having) {
            if (!isAggregate(c, false/* isSelectClause */,
                    null/* isSelectDependency */, nestedAggregates,
                    anyDistinct))
                throw new IllegalArgumentException("Not an aggregate: " + c);
        }
    }
}
/**
 * Build a placeholder operator tree for a function node whose value
 * expression has not been resolved yet.
 * <p>
 * A {@link FunctionNode} whose function URI is an aggregate according to
 * {@link FunctionRegistry#isAggregate} becomes an anonymous
 * {@link AggregateBase} (so that <code>instanceof IAggregate</code> tests
 * succeed); any other {@link FunctionNode} becomes an
 * {@link UnknownFunctionBOp}. Arguments which are themselves function
 * nodes are converted recursively; all other arguments are passed through
 * unchanged.
 * <p>
 * The placeholders exist only for structural validation: the aggregate
 * stand-in does nothing on <code>reset()</code> and returns
 * <code>null</code> from <code>done()</code>.
 *
 * @param exprNode
 *            The operator to convert (may be <code>null</code>).
 *
 * @return The placeholder, or <code>null</code> when <i>exprNode</i> is
 *         <code>null</code> or is not a {@link FunctionNode}.
 */
private IValueExpression convertAggregates(final BOp exprNode) {
    if (!(exprNode instanceof FunctionNode)) {
        /*
         * Nothing to convert. This also covers null, and avoids walking
         * the children of an operator whose conversion result would be
         * discarded anyway.
         */
        return null;
    }
    final FunctionNode functionNode = (FunctionNode) exprNode;
    final BOp[] args = new BOp[functionNode.args().size()];
    for (int i = 0; i < args.length; i++) {
        final BOp arg = functionNode.args().get(i);
        final IValueExpression converted = convertAggregates(arg);
        // Keep the original argument when it needs no conversion.
        args[i] = converted != null ? converted : arg;
    }
    if (FunctionRegistry.isAggregate(functionNode.getFunctionURI())) {
        return new AggregateBase(args, null) {
            @Override public void reset() {}
            @Override public IV done() { return null; }
        };
    }
    return new UnknownFunctionBOp(args, null);
}
/**
 * Return <code>true</code> iff the expression is an aggregate.
 * <p>
 * An aggregate is built from constants, {@link IAggregate} functions, and
 * references to {@link IVariable}s which are themselves aggregates. An
 * {@link IVariable} is an aggregate if it appears as a bare variable in
 * the GROUP_BY clause or if it is declared by a prior value expression in
 * the GROUP_BY or SELECT clause. The answer therefore depends on the set
 * of aggregates known so far: the GROUP_BY value expressions must be
 * processed first (in order), followed by the SELECT value expressions (in
 * order).
 * <p>
 * A non-aggregate variable is only allowed inside an {@link IAggregate}
 * function. For example, in <code>SUM(?x) as ?y</code> the variable
 * <code>?x</code> is a non-aggregate variable and <code>?y</code> becomes
 * an aggregate variable.
 * <p>
 * Aggregate variables may be used both inside and outside of an
 * {@link IAggregate} function, provided the variable was declared before
 * it is used. For example, the following are legal:
 *
 * <pre>
 * SELECT SUM(?x) as ?y, SUM(?x + ?y) as ?z, SUM(?x)+AVG(?x) as ?z2
 *
 * SELECT SUM(?x) as ?y, SUM(?x + COUNT(?y)) as ?z
 * </pre>
 *
 * When an aggregate depends on a prior aggregate, certain optimizations
 * are not possible: each aggregate has to be evaluated in turn rather than
 * evaluating all of them in parallel over the solutions in a group. If
 * such a pattern is observed in the SELECT clause, this method sets
 * <code>isSelectDependency := true</code> as a side-effect.
 *
 * @param op
 *            An {@link IValueExpression} or {@link IConstraint}. A
 *            <code>null</code> operator is reported as "not an aggregate".
 * @param isSelectClause
 *            <code>true</code> if the <i>op</i> appears in a SELECT
 *            clause.
 * @param isSelectDependency
 *            Set as a side-effect when an {@link IValueExpression}
 *            appearing in a SELECT clause depends on an {@link IVariable}
 *            declared earlier in the SELECT clause. May be
 *            <code>null</code> unless <i>isSelectClause</i> is
 *            <code>true</code>.
 * @param isNestedAggregates
 *            Set as a side-effect when an {@link IAggregate} is found
 *            nested within another {@link IAggregate}.
 * @param isAnyDistinct
 *            Set as a side-effect when an {@link IAggregate} is
 *            encountered which reports <code>true</code> for
 *            {@link IAggregate#isDistinct()}.
 *
 * @return <code>true</code> iff the operator is an aggregate.
 */
protected boolean isAggregate(final BOp op,
        final boolean isSelectClause,
        final AtomicBoolean isSelectDependency,
        final AtomicBoolean isNestedAggregates,
        final AtomicBoolean isAnyDistinct) {
    if (op == null) {
        /*
         * The value expression has not been prepared yet. It cannot be an
         * aggregate, since otherwise a placeholder aggregate operator
         * would have been supplied in its place.
         */
        return false;
    }
    // A constant at the root of a SELECT expression counts as an aggregate.
    final boolean rootConstantInSelect = isSelectClause
            && op instanceof IConstant;
    return rootConstantInSelect
            || isAggregate(op, isSelectClause, isSelectDependency,
                    isNestedAggregates, isAnyDistinct,
                    false/* withinAggregateFunction */);
}
/**
 * Recursive implementation of the aggregate test. Visits <i>op</i> and
 * then each of its arguments, carrying along whether the visit is
 * currently inside an aggregate function.
 * <p>
 * An operator is treated as an aggregate function when it is an
 * {@link IAggregate} or when it is a {@link FunctionNode} whose function
 * URI is an aggregate according to {@link FunctionRegistry#isAggregate}.
 * Only {@link IAggregate} instances can report DISTINCT; for an unresolved
 * {@link FunctionNode} the <i>isAnyDistinct</i> flag is left untouched.
 *
 * @param op
 *            The operator to test (must not be <code>null</code>).
 * @param isSelectClause
 *            <code>true</code> if <i>op</i> appears in a SELECT clause.
 * @param isSelectDependency
 *            Set when a SELECT expression references a variable declared
 *            earlier in the SELECT clause. Must be non-<code>null</code>
 *            when <i>isSelectClause</i> is <code>true</code>.
 * @param isNestedAggregates
 *            Set when an aggregate function is found inside another
 *            aggregate function.
 * @param isAnyDistinct
 *            Set when an {@link IAggregate} reports
 *            {@link IAggregate#isDistinct()}.
 * @param withinAggregateFunction
 *            <code>true</code> iff an ancestor of <i>op</i> is an
 *            aggregate function.
 *
 * @return <code>true</code> iff the operator is an aggregate.
 *
 * @throws IllegalArgumentException
 *             if a variable which is neither declared by the GROUP_BY
 *             clause nor earlier in the SELECT clause is used outside of
 *             an aggregate function in a SELECT expression.
 */
private boolean isAggregate(final BOp op,
        final boolean isSelectClause,
        final AtomicBoolean isSelectDependency,
        final AtomicBoolean isNestedAggregates,
        final AtomicBoolean isAnyDistinct,
        final boolean withinAggregateFunction) {

    final boolean isAggregateOp = op instanceof IAggregate<?>;
    final boolean isAggregateFunction = isAggregateOp
            || ((op instanceof FunctionNode) && FunctionRegistry
                    .isAggregate(((FunctionNode) op).getFunctionURI()));

    if (isAggregateFunction) {
        if (withinAggregateFunction) {
            isNestedAggregates.set(true);
        }
        // Only a real IAggregate can be asked about DISTINCT. Casting a
        // FunctionNode which is not an IAggregate would throw a
        // ClassCastException.
        if (isAggregateOp && ((IAggregate<?>) op).isDistinct()) {
            isAnyDistinct.set(true);
        }
    }

    final boolean aggregationContext = withinAggregateFunction
            || isAggregateFunction;

    if (op instanceof IVariable<?>) {
        final IVariable<?> v = (IVariable<?>) op;
        if (aggregationContext) {
            /*
             * Decide if a variable appearing within an aggregation context
             * is a reference to a previously observed aggregate. If not,
             * then we presume it to be a variable in the detail records
             * and aggregation will (at least logically) form a column
             * projection of that variable for each group.
             */
            if (!groupByVars.contains(v) && !selectVars.contains(v)) {
                columnVars.add(v);
            }
            return false;
        }
        if (groupByVars.contains(v)) {
            return true;
        }
        if (selectVars.contains(v)) {
            if (isSelectClause)
                isSelectDependency.set(true);
            return true;
        }
        if (isSelectClause) {
            /*
             * Note: This is also thrown when there is a forward reference
             * to a variable in the select expression which we have not yet
             * seen.
             *
             * Note: This situation does not arise for the GROUP_BY clause
             * because it may only reference non-aggregate variables.
             *
             * Note: This situation does not arise for the HAVING clause
             * because it can not define new variables using "AS".
             */
            throw new IllegalArgumentException(
                    "Non-aggregate variable in select expression: " + v);
        }
    }

    boolean result = aggregationContext;
    final Iterator<BOp> itr = op.argIterator();
    while (itr.hasNext()) {
        final BOp arg = itr.next();
        if (selectVars.contains(arg)) {
            // Direct reference to a variable declared earlier in SELECT.
            if (isSelectClause)
                isSelectDependency.set(true);
            return true;
        }
        if (log.isTraceEnabled())
            log.trace("op=" + op.getClass()
                    + ", isSelectClause=" + isSelectClause
                    + ", isSelectDependency=" + isSelectDependency
                    + ", isNestedAggregates=" + isNestedAggregates
                    + ", isAnyDistinct=" + isAnyDistinct
                    + ", withinAggregateFunction=" + withinAggregateFunction
                    + ", aggregationContext=" + aggregationContext
                    + ", groupByVars=" + groupByVars
                    + ", selectVars=" + selectVars
                    + ", arg=" + arg);
        // Recurse through the child. The non-short-circuit |= is
        // deliberate: every child is visited for its side-effects and
        // validation even once the result is already true.
        result |= isAggregate(arg, isSelectClause, isSelectDependency,
                isNestedAggregates, isAnyDistinct,
                aggregationContext/* withinAggregateFunction */);
    }
    return result;
}
/**
 * Verify the various conditions that must be met when a query uses GROUP BY
 * or when a query uses aggregates in a PROJECTION. Queries which are not
 * aggregation queries (as decided by {@link StaticAnalysis#isAggregate})
 * are accepted without further checks.
 *
 * @param queryBase
 *            The query.
 *
 * @throws VisitorException
 *             if the query is an aggregation query and either uses a
 *             wildcard projection or fails the validation performed by the
 *             {@link VerifyAggregates} constructor.
 */
public static void verifyAggregate(final QueryBase queryBase)
        throws VisitorException {
    /*
     * The following code has some dependencies on whether or not the value
     * expressions have been cached. That is not done until we get into
     * AST2BOpUtility. I have worked some hacks to support this in
     * FunctionRegistry.isAggregate() and StaticAnalysis.isAggregate().
     * However, the code is still hitting some edge cases.
     *
     * MP: I fixed this by running the ASTSetValueOptimizer earlier in the
     * parsing process - ie. in Bigdata2ASTSPARQLParser.parseQuery2.
     *
     * There is some commented out code from openrdf that depends on setting
     * a flag for the expression if an AggregationCollector reports at least
     * one aggregation in a projection element. We could do this same thing
     * here but we still need to have the logic to figure out what is an
     * invalid aggregate.
     *
     * MP: I think the place to go look for reference is Sesame's
     * TupleExprBuilder, especially:
     *
     * public TupleExpr visit(ASTSelect node, Object data)
     *
     * And also look at the AggregateCollector.
     */

    // Normalize missing or empty clauses to null.
    final ProjectionNode rawProjection = queryBase.getProjection();
    final ProjectionNode projection = rawProjection == null
            || rawProjection.isEmpty() ? null : rawProjection;

    final GroupByNode rawGroupBy = queryBase.getGroupBy();
    final GroupByNode groupBy = rawGroupBy == null || rawGroupBy.isEmpty() ? null
            : rawGroupBy;

    final HavingNode rawHaving = queryBase.getHaving();
    final HavingNode having = rawHaving == null || rawHaving.isEmpty() ? null
            : rawHaving;

    // true if this is an aggregation query.
    final boolean isAggregate = StaticAnalysis.isAggregate(projection,
            groupBy, having);

    if (!isAggregate) {
        return;
    }

    /*
     * The projection was normalized to null above when it is missing or
     * empty, so it must be null-checked before use. A null projection is
     * rejected by the VerifyAggregates constructor below with an
     * IllegalArgumentException, which is reported as a VisitorException.
     */
    if (projection != null && projection.isWildcard())
        throw new VisitorException(
                "Wildcard not allowed with aggregate.");

    try {
        // Delegate the validation of the aggregate query. The instance is
        // only constructed for its checks and is not retained.
        new VerifyAggregates(projection, groupBy, having);
    } catch (final IllegalArgumentException ex) {
        throw new VisitorException("Bad aggregate", ex);
    }
}
}