/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Jan 9, 2015
*/
package com.bigdata.rdf.sparql.ast.optimizers;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.aggregate.AggregateBase;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.DatasetNode;
import com.bigdata.rdf.sparql.ast.FunctionNode;
import com.bigdata.rdf.sparql.ast.FunctionRegistry;
import com.bigdata.rdf.sparql.ast.GraphPatternGroup;
import com.bigdata.rdf.sparql.ast.GroupByNode;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.IValueExpressionNode;
import com.bigdata.rdf.sparql.ast.JoinGroupNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.QueryType;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.SubqueryRoot;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;
/**
* Optimizes <code>
SELECT (COUNT(*) as ?count) ?z WHERE { ?x rdf:type ?z } GROUP BY ?z
* </code> and similar patterns using an O(N) algorithm, where N is
* the number of distinct solutions.
* <p>
* The optimizer aims at establishing an execution plan that applies a
* combination of distinct term scan pattern (to efficiently compute the
* distinct values for the group variable) and fast range count pattern to
* efficiently calculate the COUNT, without materialization of the variables on
* which the COUNT operation is performed.
*
* <p>
* The basic idea is to (i) replace the GROUP BY pattern by a SELECT DISTINCT
* subquery that calculates the distinct bindings for variable ?z first, and
* (ii) use a fast range count operator to efficiently calculate the COUNT.
*
* Note that the sub query in step (i) may (where possible) be optimized by the
* {@link ASTDistinctTermScanOptimizer}, i.e. if possible the subquery producing
* the ?z bindings will be replaced by a distinct term scan in a later
* optimization step.
*
* @see <a href="http://trac.blazegraph.com/ticket/1059"> GROUP BY optimization
* using distinct-term-scan and fast-range-count</a>
*
* @author <a href="mailto:ms@metaphacts.com">Michael Schmidt</a>
*/
public class ASTSimpleGroupByAndCountOptimizer implements IASTOptimizer {

    private static final Logger log = Logger.getLogger(ASTSimpleGroupByAndCountOptimizer.class);

    public ASTSimpleGroupByAndCountOptimizer() {
    }

    /**
     * Applies the GROUP BY + COUNT rewrite to each named subquery of the
     * query root and then to the top-level query itself.
     * <p>
     * The input is returned unmodified when
     * <ul>
     * <li>the query node is not a {@link QueryRoot},</li>
     * <li>the SPO relation's indices have delete markers, or</li>
     * <li>the store is in quads mode and the query declares named graphs.</li>
     * </ul>
     * The AST is modified in place; the returned object wraps the same query
     * node and binding sets that were passed in.
     *
     * @param context
     *            The evaluation context.
     * @param input
     *            The query node plus the exogenous binding sets.
     *
     * @return The (possibly rewritten) query node with the unchanged binding
     *         sets.
     */
    @Override
    public QueryNodeWithBindingSet optimize(
            final AST2BOpContext context, final QueryNodeWithBindingSet input) {

        final IQueryNode queryNode = input.getQueryNode();
        final IBindingSet[] bindingSets = input.getBindingSets();

        // This optimizer only knows how to handle a complete query; anything
        // else is passed through untouched rather than failing on the cast.
        if (!(queryNode instanceof QueryRoot)) {
            return new QueryNodeWithBindingSet(queryNode, bindingSets);
        }

        if (context.getAbstractTripleStore().
                getSPORelation().indicesHaveDeleteMarkers()) {
            /*
             * Do not optimize when using delete markers.
             *
             * The presence of delete markers means that the fast-range count
             * will be turned into a key-range scan, which is not desired.
             *
             * While AccessPath.rangeCountExact(true) will do the right thing
             * even if the index has delete markers (it will convert to a scan
             * for either delete markers or if there is a FILTER attached to
             * the index), converting to a scan defeats the purpose of the
             * ASTFastRangeCountOptimizer. In this case, the cost would have
             * been the same if we had not rewritten the AST. Hence we do not
             * rewrite the query.
             */
            return new QueryNodeWithBindingSet(queryNode, bindingSets);
        }

        final QueryRoot queryRoot = (QueryRoot) queryNode;
        final StaticAnalysis sa = new StaticAnalysis(queryRoot, context);

        if (context.getAbstractTripleStore().isQuads()) {
            // In quads mode the rewrite is only attempted when the dataset
            // is all graphs, i.e. no named graphs are declared.
            final DatasetNode dataset = queryRoot.getDataset();
            final boolean allGraphs =
                    dataset == null || dataset.getNamedGraphs() == null;
            if (!allGraphs) {
                return new QueryNodeWithBindingSet(queryNode, bindingSets);
            }
        }

        // First, process any pre-existing named subqueries.
        final NamedSubqueriesNode namedSubqueries = queryRoot
                .getNamedSubqueries();
        if (namedSubqueries != null) {
            // Note: iterating over a copy works around a concurrent
            // modification error.
            final List<NamedSubqueryRoot> list = BOpUtility.toList(
                    namedSubqueries, NamedSubqueryRoot.class);
            for (NamedSubqueryRoot namedSubquery : list) {
                // Rewrite the named sub-select
                doSelectQuery(context, sa, queryRoot, namedSubquery);
            }
        }

        // Rewrite the top-level select. The HAVING restriction is enforced
        // inside doSelectQuery so it applies to named subqueries as well.
        doSelectQuery(context, sa, queryRoot, queryRoot);

        return new QueryNodeWithBindingSet(queryNode, bindingSets);
    }

    /**
     * Attempt to rewrite the SELECT. The query is left untouched unless all
     * preconditions listed in the method body hold.
     *
     * @param context
     *            The evaluation context.
     * @param sa
     *            Static analysis for the query (currently unused here).
     * @param queryRoot
     *            The top-level of the query (currently unused here).
     * @param queryBase
     *            Either a top-level query or a sub-query.
     */
    private void doSelectQuery(final AST2BOpContext context,
            final StaticAnalysis sa, final QueryRoot queryRoot,
            final QueryBase queryBase) {

        /*
         * The prerequisites for the optimizer, which we check in the
         * following, are as follows:
         *
         * (C0) Query must not have a HAVING clause
         *
         * (C1) Query must be a SELECT query
         *
         * (C2) Single triple pattern in body
         *
         * (C3) Single GROUP BY variable that is part of the triple pattern
         *
         * (C4) Projection for the GROUP BY variable plus a COUNT operation,
         * which operates over star or any other variable in the triple
         * pattern, plus the COUNT must not be DISTINCT
         */

        /*
         * Check for condition (C0).
         *
         * https://jira.blazegraph.com/browse/BLZG-618, failure of
         * http://www.w3.org/2009/sparql/docs/tests/data-sparql11/aggregates/manifest#agg06
         * mentioned in one of the comments: we are not allowed to do the fast
         * range count optimization if the query exhibits an additional having
         * clause (which might apply any [possibly other] kind of aggregation
         * on the query body).
         */
        if (queryBase.getHaving() != null) {
            return;
        }

        final GroupByNode groupByNode = queryBase.getGroupBy();
        if (groupByNode == null || groupByNode.arity() != 1
                || !(groupByNode.get(0) instanceof AssignmentNode)) {
            return;
        }

        final AssignmentNode assignmentNodeInGroupBy =
                (AssignmentNode) groupByNode.get(0);
        if (assignmentNodeInGroupBy.arity() != 2
                || !(assignmentNodeInGroupBy.get(0) instanceof VarNode)
                || !(assignmentNodeInGroupBy.get(1) instanceof VarNode)) {
            return; // something's wrong here
        }

        // the variable which is grouped, and the name it is bound to
        // NOTE(review): if these two differ (GROUP BY (?z AS ?y)), the
        // rewrite below projects groupingVar from the subquery while the
        // outer projection still refers to the renamed variable -- confirm
        // whether that case needs to be excluded.
        final VarNode groupingVar = (VarNode) assignmentNodeInGroupBy.get(1);
        final VarNode groupingVarRenamed =
                (VarNode) assignmentNodeInGroupBy.get(0);

        // Check for condition (C1)
        if (!QueryType.SELECT.equals(queryBase.getQueryType())) {
            return; // optimization not applicable
        }

        // Check for condition (C2)
        final GraphPatternGroup<IGroupMemberNode> graphPattern =
                queryBase.getGraphPattern();
        if (graphPattern == null || graphPattern.args().size() != 1) {
            return;
        }

        final BOp potentialStmtPattern = graphPattern.get(0);
        if (!(potentialStmtPattern instanceof StatementPatternNode)) {
            return;
        }
        final StatementPatternNode stmtPattern =
                (StatementPatternNode) potentialStmtPattern;

        /*
         * When in history mode, we can't do this optimization because neither
         * a distinct term scan nor a fast range count is possible, except if
         * the StatementPatternNode has been marked to read history.
         */
        if (context.getAbstractTripleStore().isRDRHistory()
                && !stmtPattern.getQueryHintAsBoolean(QueryHints.HISTORY, false)) {
            if (log.isDebugEnabled()) {
                log.debug("nope");
            }
            // Can not rewrite.
            return;
        }

        if (log.isDebugEnabled()) {
            log.debug("yep");
        }

        // Collect the variables of the statement pattern; the variable in
        // position 3 (if any) is remembered separately as the graph variable.
        final Set<VarNode> varNodesInStmtPattern = new HashSet<VarNode>();
        VarNode graphVarNode = null;
        for (int i = 0; i < stmtPattern.arity(); i++) {
            final BOp arg = stmtPattern.get(i);
            if (arg instanceof VarNode) {
                varNodesInStmtPattern.add((VarNode) arg);
                if (i == 3) {
                    graphVarNode = (VarNode) arg;
                }
            }
        }

        // Check for condition (C3)
        if (!varNodesInStmtPattern.contains(groupingVar)) {
            return; // illegal grouping, optimization not applicable
        }

        // Check for condition (C4)
        final ProjectionNode projectionNode = queryBase.getProjection();
        if (projectionNode == null || projectionNode.size() != 2) {
            return;
        }

        boolean groupingVarProjected = false;
        int indexOfCountNode = -1; // index of COUNT node in projectionNode
        VarNode countNodeVar = null; // VarNode bound through COUNT operation

        for (int i = 0; i < 2; i++) {

            final AssignmentNode curNode = projectionNode.getExpr(i);
            final IValueExpressionNode valNode =
                    curNode.getValueExpressionNode();

            if (valNode instanceof FunctionNode && indexOfCountNode < 0) {

                final FunctionNode fNode = (FunctionNode) valNode;
                if (!FunctionRegistry.COUNT.equals(fNode.getFunctionURI())) {
                    return; // NOT COUNT
                }

                // check for problematic COUNT(DISTINCT ...)) pattern
                final Map<String, Object> scalarVals = fNode.getScalarValues();
                final Object isDistinct = scalarVals == null ? null
                        : scalarVals.get(AggregateBase.Annotations.DISTINCT);
                if (Boolean.TRUE.equals(isDistinct)) {
                    return; // COUNT (DISTINCT ...) cannot be optimized
                }

                if (fNode.args().size() != 1) {
                    return;
                }

                final BOp inner = fNode.args().get(0);
                if (!(inner instanceof VarNode)) {
                    return;
                }
                final VarNode innerVarNode = (VarNode) inner;

                // the count operation must be performed on a variable of
                // the statement pattern that is distinct from the named
                // graph variable, if any (as the latter is not necessarily
                // bound); alternatively, a COUNT(*) is also ok
                final boolean countsPatternVar =
                        varNodesInStmtPattern.contains(innerVarNode)
                                && !innerVarNode.equals(graphVarNode);
                final boolean isWildcard = innerVarNode.isWildcard();
                if (!(countsPatternVar || isWildcard)) {
                    return; // optimization in overall not applicable
                }

                if (!(curNode.get(0) instanceof VarNode)) {
                    return; // first exp of assignment node is assigned var
                }

                countNodeVar = (VarNode) curNode.get(0);
                indexOfCountNode = i;

            } else if (valNode instanceof VarNode && !groupingVarProjected) {

                if (!valNode.equals(groupingVarRenamed)) {
                    return; // optimization not applicable
                }
                groupingVarProjected = true;

            } else {
                return; // invalid
            }
        }

        /*
         * Once we reach this point, we're sure that the optimization is
         * applicable, we now rewrite the query plan accordingly; the following
         * needs to be done:
         *
         * (O1) Wrap the existing statement pattern into a SELECT DISTINCT
         * subquery, which could (potentially) be optimized to a distinct term
         * scan by the {@link ASTDistinctTermScanOptimizer}
         *
         * (O2) Duplicate the statement pattern and append a fast range count
         * annotation, binding the count var of the SELECT clause
         *
         * (O3) Transform the SELECT clause: the COUNT expression is replaced by
         * a projection for the variable introduced in O2
         *
         * (O4) Eliminate the group by clause
         */

        // apply optimization step (O1)
        final SubqueryRoot selectDistinct = new SubqueryRoot(QueryType.SELECT);
        final ProjectionNode projection = new ProjectionNode();
        final AssignmentNode assignmentNode =
                new AssignmentNode(groupingVar, groupingVar);
        projection.addArg(assignmentNode);
        projection.setDistinct(true);
        selectDistinct.setProjection(projection);

        final JoinGroupNode join = new JoinGroupNode();
        join.addArg(stmtPattern);
        selectDistinct.setWhereClause(join);
        graphPattern.setArg(0, selectDistinct);

        // apply optimization step (O2)
        final StatementPatternNode stmtPatternClone =
                new StatementPatternNode(stmtPattern);
        stmtPatternClone.setFastRangeCount(countNodeVar);
        graphPattern.addArg(stmtPatternClone);

        // apply optimization step (O3)
        projectionNode.setArg(indexOfCountNode, new AssignmentNode(
                new VarNode(countNodeVar), new VarNode(countNodeVar)));

        // apply optimization step (O4)
        queryBase.setGroupBy(null);
    }
}