/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Sep 14, 2011
*/
package com.bigdata.rdf.sparql.ast.optimizers;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.openrdf.query.algebra.StatementPattern.Scope;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.aggregate.AggregateBase;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.DatasetNode;
import com.bigdata.rdf.sparql.ast.FunctionNode;
import com.bigdata.rdf.sparql.ast.FunctionRegistry;
import com.bigdata.rdf.sparql.ast.GraphPatternGroup;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.QueryType;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.SubqueryBase;
import com.bigdata.rdf.sparql.ast.SubqueryRoot;
import com.bigdata.rdf.sparql.ast.ValueExpressionNode;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpBase;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;
import com.bigdata.rdf.sparql.ast.service.ServiceNode;
/**
* Optimizes SELECT COUNT(*) { triple-pattern } using the fast range count
* mechanisms when that feature would produce exact results for the KB instance.
*
* <h2>Cases handled by this optimizer</h2>
*
* Basic combinations with identical semantics:
* <pre>SELECT COUNT(DISTINCT *) {?s ?p ?o}</pre>
* <pre>SELECT COUNT(REDUCED *) {?s ?p ?o}</pre>
* <pre>SELECT COUNT(*) {?s ?p ?o}</pre>
*
* Combinations using a constrained range-count.
* <pre>SELECT COUNT(*) {:s ?p ?o}</pre>
* <pre>SELECT COUNT(*) {?s :p ?o}</pre>
* <pre>SELECT COUNT(*) {?s ?p :o}</pre>
* <pre>SELECT COUNT(*) {:s ?p :o}</pre>
* Combinations using a constrained range-count where the triple pattern is
* 1-unbound and the COUNT() references the unbound variable.
* <pre>SELECT COUNT(?s) {?s :p :o}</pre>
* <pre>SELECT COUNT(?p) {:s ?p :o}</pre>
* <pre>SELECT COUNT(?o) {:s :p ?o}</pre>
* Combinations using a constrained range-count with a QUADS mode access path.
* <pre>SELECT COUNT(*) { GRAPH ?g {?s ?p ?o} }</pre>
* <pre>SELECT COUNT(*) { GRAPH :g {:s ?p ?o} }</pre>
*
* Combinations using a constrained range-count with a QUADS mode access path
* where the triple pattern is 1-unbound and the COUNT() references the unbound variable.
* <pre>SELECT COUNT(?s) { GRAPH :g {?s :p :o} }</pre>
* <pre>SELECT COUNT(?g) { GRAPH ?g {:s :p :o} }</pre>
*
* Combinations using a sub-select with nothing projected in:
* <pre>SELECT * { { SELECT COUNT(*) {?s ?p ?o} } }</pre>
* <pre>SELECT * { { SELECT COUNT(*) {?s ?p ?o} } :s :p :o .}</pre>
*
* Combinations using a sub-select with something projected in:
* <pre>SELECT * { ?s a :b . { SELECT COUNT(*) {?s ?p ?o} } }</pre>
*
* <h2>Correct rejection cases NOT handled by this optimizer</h2>
*
* Combinations using DISTINCT/REDUCED and a constrained range-count,
* explicitly naming the variables, and having variables that are not in
* the COUNT() aggregate and not projected in are NOT handled here. These
* are covered by the {@link ASTDistinctTermScanOptimizer} instead:
*
* <pre>SELECT COUNT(?p) {:s ?p ?o}</pre>
* <pre>SELECT COUNT(DISTINCT ?p) {:s ?p ?o}</pre>
* <pre>SELECT COUNT(REDUCED ?p) {:s ?p ?o}</pre>
*
* Sub-select that would be handled as a distinct term scan with something
* projected in.
* <pre>SELECT * { ?s a :b . { SELECT COUNT(?p) {?s ?p ?o} } }</pre>
*
* @see <a href="http://trac.blazegraph.com/ticket/1037" > Rewrite SELECT
* COUNT(...) (DISTINCT|REDUCED) {single-triple-pattern} as ESTCARD </a>
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*/
public class ASTFastRangeCountOptimizer implements IASTOptimizer {
/**
* Creates the optimizer. The constructor takes no arguments and performs no
* initialization of its own.
*/
public ASTFastRangeCountOptimizer() {
}
/**
 * Applies the fast range count rewrite to the named subqueries of the query
 * root (if any) and then to the top-level query itself.
 * <p>
 * The AST is modified in place. The input is returned without any rewrite
 * when the SPO relation's indices have delete markers, or when the query
 * node is not a {@link QueryRoot}.
 *
 * @param context
 *            The evaluation context (gives access to the triple store).
 * @param input
 *            The query node together with its binding sets.
 *
 * @return A new {@link QueryNodeWithBindingSet} wrapping the (possibly
 *         rewritten) query node and the original binding sets.
 */
@Override
public QueryNodeWithBindingSet optimize(
        final AST2BOpContext context, final QueryNodeWithBindingSet input) {

    final IQueryNode queryNode = input.getQueryNode();
    final IBindingSet[] bindingSets = input.getBindingSets();

    if (context.getAbstractTripleStore().
            getSPORelation().indicesHaveDeleteMarkers()) {
        /*
         * Disallow for optimization when using delete markers.
         *
         * The presence of deleteMarkers means that the fast-range count will
         * be turned into a key-range scan, which is not desired.
         *
         * While AccessPath.rangeCountExact(true) method will do the right
         * thing even if the index has delete markers (it will convert to a
         * scan for either delete markers or if there is a FILTER attached
         * to the index), converting to a scan defeats the purpose of the
         * ASTFastRangeCountOptimizer. In this case, the cost would have
         * been the same if we had not rewritten the AST. Hence we do not
         * rewrite the query.
         */
        return new QueryNodeWithBindingSet(queryNode, bindingSets);
    }

    if (!(queryNode instanceof QueryRoot)) {
        // Only a query root can be rewritten; anything else (including
        // null) is passed through instead of failing on the cast below.
        return new QueryNodeWithBindingSet(queryNode, bindingSets);
    }

    final QueryRoot queryRoot = (QueryRoot) queryNode;
    final StaticAnalysis sa = new StaticAnalysis(queryRoot, context);

    // First, process any pre-existing named subqueries.
    final NamedSubqueriesNode namedSubqueries = queryRoot
            .getNamedSubqueries();
    if (namedSubqueries != null) {
        // Note: iterating over a copy works around a concurrent
        // modification error.
        final List<NamedSubqueryRoot> list = BOpUtility.toList(
                namedSubqueries, NamedSubqueryRoot.class);
        for (NamedSubqueryRoot namedSubquery : list) {
            // Rewrite the named sub-select
            doSelectQuery(context, sa, namedSubquery);
        }
    }

    // rewrite the top-level select
    doSelectQuery(context, sa, queryRoot);

    return new QueryNodeWithBindingSet(queryNode, bindingSets);
}
/**
 * Visits the children of a graph pattern group, descending into nested
 * groups and attempting the rewrite on every sub-select that is found.
 * Children of any other type, including SERVICE nodes, are not visited.
 *
 * @param context
 *            The evaluation context.
 * @param sa
 *            The static analysis object for the query.
 * @param group
 *            The group to visit. May be <code>null</code> (callers pass the
 *            result of <code>getWhereClause()</code>), in which case there
 *            is nothing to do.
 */
private void doRecursiveRewrite(final AST2BOpContext context,
        final StaticAnalysis sa,
        final GraphPatternGroup<IGroupMemberNode> group) {

    if (group == null) {
        // No WHERE clause: nothing to visit.
        return;
    }

    final int arity = group.arity();
    for (int i = 0; i < arity; i++) {
        final BOp child = (BOp) group.get(i);
        if (child instanceof GraphPatternGroup<?>) {
            // Recursion into groups.
            @SuppressWarnings("unchecked")
            final GraphPatternGroup<IGroupMemberNode> childGroup =
                    (GraphPatternGroup<IGroupMemberNode>) child;
            doRecursiveRewrite(context, sa, childGroup);
        } else if (child instanceof SubqueryRoot) {
            /*
             * Rewrite the sub-select.
             *
             * Note: doSelectQuery() walks the WHERE clause of the
             * sub-select itself before it attempts the rewrite, so the
             * WHERE clause is not walked separately here. Doing both would
             * visit every nested sub-select twice per nesting level.
             */
            doSelectQuery(context, sa, (SubqueryBase) child);
        } else if (child instanceof ServiceNode) {
            // Do not rewrite things inside of a SERVICE node.
            continue;
        }
    }
}
/**
 * Attempt to rewrite the SELECT.
 * <p>
 * Nested groups and sub-selects in the WHERE clause are processed first.
 * The query itself is then rewritten only when all of the following hold:
 * it is a SELECT whose projection consists of a single COUNT(...)
 * expression, its WHERE clause consists of a single statement pattern, the
 * quads-mode and RDR-history checks below pass, and the COUNT is either
 * COUNT(*) or names every variable produced by the statement pattern. On
 * success the statement pattern is marked for a fast range count and the
 * projection is replaced by a projection of the COUNT's variable.
 *
 * @param context
 *            The evaluation context (used to inspect the triple store).
 * @param sa
 *            The static analysis object (used to reach the query root's
 *            dataset).
 * @param queryBase
 *            The top-level query, named subquery or sub-select to examine.
 *            It is modified in place when the rewrite applies.
 */
private void doSelectQuery(final AST2BOpContext context,
        final StaticAnalysis sa, final QueryBase queryBase) {

    final GraphPatternGroup<IGroupMemberNode> whereClause = queryBase
            .getWhereClause();

    if (whereClause == null) {
        // Nothing to recurse into and no triple pattern to rewrite.
        return;
    }

    // recursion first.
    doRecursiveRewrite(context, sa, whereClause);

    if (queryBase.getQueryType() != QueryType.SELECT) {
        return;
    }

    /*
     * Looking for COUNT([DISTINCT|REDUCED]? "*")
     *
     * The DISTINCT and REDUCED are optional for triples mode APs and for
     * quads mode APs where all 4 components of the quad are captured in the
     * COUNT( expression-list ). In both cases the fast range count will
     * automatically give us the DISTINCT triples / quads.
     *
     * NOTE(review): GROUP BY / HAVING on the query are not inspected here.
     * Confirm that a grouped COUNT over a single triple pattern can not
     * reach the rewrite below.
     */
    final ProjectionNode projection = queryBase.getProjection();

    if (projection == null || projection.isEmpty())
        return;

    if (projection.arity() > 1)
        return;

    final AssignmentNode assignmentNode = projection.getExpr(0);

    if (!(assignmentNode.getValueExpressionNode() instanceof FunctionNode))
        return;

    final FunctionNode functionNode = (FunctionNode) assignmentNode
            .getValueExpressionNode();

    if (!functionNode.getFunctionURI().equals(FunctionRegistry.COUNT))
        // Not COUNT
        return;

    /*
     * Extract the single triple pattern from the WHERE clause.
     */
    if (whereClause.arity() != 1) {
        // Not simple triple pattern.
        return;
    }

    if (!(whereClause.get(0) instanceof StatementPatternNode)) {
        // Not simple triple pattern.
        return;
    }

    // The single triple pattern.
    final StatementPatternNode sp = (StatementPatternNode) whereClause
            .get(0);

    if (context.getAbstractTripleStore().isQuads()) {

        final DatasetNode dataset = sa.getQueryRoot().getDataset();

        boolean ok = false;

        if (dataset == null || dataset.getNamedGraphs() == null) {
            /*
             * The dataset is all graphs.
             */
            ok = true;
        }

        if (sp.getScope() == Scope.DEFAULT_CONTEXTS) {
            final Map<String, Object> scalarValues = functionNode
                    .getScalarValues();
            if (scalarValues != null) {
                final Boolean isDistinct = (Boolean) scalarValues
                        .get(AggregateBase.Annotations.DISTINCT);
                if (isDistinct != null && isDistinct) {
                    /*
                     * We can not use the fast-range-count for a quads-mode
                     * default graph query. If there are multiple graphs in
                     * the default graph query, then we need to take the RDF
                     * merge of those named graphs. This is done by feeding
                     * the quads into a filter that strips off the context
                     * position and then imposes a DISTINCT-SPO filter. The
                     * result of that DISTINCT-SPO filter are then the
                     * distinct triples (vs distinct quads). The count of
                     * those distinct triples is what is required for
                     * COUNT(DISTINCT) for a quads mode default graph query.
                     *
                     * TODO We can do this for the case where there is only
                     * a single named graph that is being considered by the
                     * default graph query since that case reduces to the
                     * same as having the graph be a constant.
                     */
                    ok = false;
                }
            }
        }

        if (!ok) {
            // Can not rewrite.
            return;
        }
    }

    /*
     * When in history mode, we can't do fast range count with two key-
     * probes, unless the StatementPatternNode has been marked to read
     * history. Without that a scan+filter is necessary.
     */
    if (context.getAbstractTripleStore().isRDRHistory()) {
        if (!sp.getQueryHintAsBoolean(QueryHints.HISTORY, false)) {
            // Can not rewrite.
            return;
        }
    }

    /*
     * Figure out if this is COUNT(*) or semantically equivalent to
     * COUNT(*).
     *
     * Note: COUNT(x y z) is semantically equivalent to COUNT(*) if x, y,
     * and z are the names of the variables in the triple pattern.
     *
     * Note: A simple BPG does not declare the graph variable. The graph
     * variable is ONLY declared by GRAPH ?g {BPG}. For quads mode APs, the
     * graph variable needs to be declared (using GRAPH ?g {BPG}) and used
     * in the COUNT( expression-list ) in order for the rewrite to have the
     * correct semantics.
     *
     * FIXME We also need to handle named graph vs default graph.
     *
     * Thus any of the following can be converted:
     *
     * - COUNT(*) {GRAPH ?g {?s ?p ?o}}
     *
     * - COUNT(*) {GRAPH ?g {:s ?p ?o}}
     *
     * - COUNT(*) {GRAPH :g {?s ?p ?o}}
     *
     * - COUNT(?s ?p ?o ?g) {GRAPH ?g {?s ?p ?o}}
     *
     * However, in quads mode the following MAY NOT be converted (unless
     * there is a single named graph) because the hidden graph variable is
     * not part of the expression-list for COUNT.
     *
     * - COUNT(*) {?s ?p ?o}
     *
     * - COUNT(?s ?p ?o) {?s ?p ?o}
     *
     * In particular, we would get the WRONG answer if we converted the
     * following in quads mode since the COUNT(DISTINCT ?s ?p ?o) is just
     * distinct TRIPLES but the AP fast range count would report distinct
     * QUADS.
     *
     * - COUNT(DISTINCT ?s ?p ?o) {?s ?p ?o} where defaultGraph=ALL
     *
     * TODO Another possibility for this last case is to explicitly compute
     * the sum of the range counts over the triple pattern for the set of
     * named or default graphs.
     */
    boolean isCountStar = false;

    if (functionNode.arity() == 1
            && (functionNode.get(0) instanceof VarNode)
            && (((VarNode) functionNode.get(0)).isWildcard())) {
        /*
         * COUNT(*)
         */
        isCountStar = true;
    }

    if (!isCountStar
            && functionNode.arity() == context.getAbstractTripleStore()
                    .getSPOKeyArity()) {
        /*
         * There are as many function arguments as the arity of the KB.
         *
         * Check to see if all variables in the associated triple pattern
         * are declared in the COUNT( expression-list ).
         */
        final Set<IVariable<?>> boundVars = sp.getProducedBindings();

        // Cleared as soon as an argument turns out not to be a plain
        // variable. Such an expression list is never treated as COUNT(*),
        // even if the variables seen before it already cover the pattern.
        boolean onlySimpleVars = true;

        for (int i = 0; i < functionNode.arity(); i++) {
            final ValueExpressionNode arg = (ValueExpressionNode) functionNode
                    .get(i);
            if (!(arg instanceof VarNode)) {
                // Not a simple variable.
                onlySimpleVars = false;
                break;
            }
            // remove any variable in the COUNT( expression-list )
            boundVars.remove(((VarNode) arg).getValueExpression());
        }

        if (onlySimpleVars && boundVars.isEmpty()) {
            /*
             * If boundVars is now empty then all variables appearing in the
             * triple pattern also appear in the COUNT( expression-list ).
             * So this is effectively equivalent to a COUNT(*) expression.
             */
            isCountStar = true;
        }
    }

    if (!isCountStar) {
        /*
         * Neither explicit nor implicit COUNT(*).
         */
        return;
    }

    /*
     * Rewrite the (sub-)SELECT.
     */
    final VarNode theVar = assignmentNode.getVarNode();

    // Mark the triple pattern for fast range count.
    if (markForFastRangeCount(context, sp, theVar)) {
        // The triple pattern has been marked (the method returned true):
        // rewrite the projection as SELECT ?var.
        final ProjectionNode newProjection = new ProjectionNode();
        newProjection.addProjectionVar(theVar);
        queryBase.setProjection(newProjection);
    } // else: nothing to do
}
/**
 * Flags a statement pattern so that it is evaluated as a fast range count.
 * <p>
 * This implementation always marks the pattern and returns
 * <code>true</code>. Per the original note, a subclass for which fast range
 * counts are not supported (see the ASTGPUFastRangeCountOptimizer override)
 * must return <code>false</code>; the caller only rewrites the projection
 * when <code>true</code> is returned.
 *
 * @param context
 *            The evaluation context (not used by this implementation).
 * @param sp
 *            The statement pattern to be marked.
 * @param fastRangeCountVariable
 *            The variable set as the fast range count variable of the
 *            statement pattern.
 *
 * @return <code>true</code> iff the statement pattern has been marked.
 */
protected boolean markForFastRangeCount(final AST2BOpContext context,
        final StatementPatternNode sp,
        final VarNode fastRangeCountVariable) {

    // Attach the FAST-RANGE-COUNT attribute to the triple pattern.
    sp.setFastRangeCount(fastRangeCountVariable);

    /*
     * The COUNT(*) for the triple pattern is computed using two key probes,
     * so the estimated cardinality is set to the minimum, ONE.
     */
    final Long minimumCardinality = Long.valueOf(1L);
    sp.setProperty(AST2BOpBase.Annotations.ESTIMATED_CARDINALITY,
            minimumCardinality);

    return true;
}
}