/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Sep 14, 2011
*/
package com.bigdata.rdf.sparql.ast.optimizers;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstant;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.IVariableOrConstant;
import com.bigdata.bop.Var;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.DatasetNode;
import com.bigdata.rdf.sparql.ast.GraphPatternGroup;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.QueryType;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.SubqueryBase;
import com.bigdata.rdf.sparql.ast.SubqueryRoot;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpBase;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;
import com.bigdata.rdf.sparql.ast.service.ServiceNode;
import com.bigdata.rdf.spo.ISPO;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.striterator.IKeyOrder;
/**
* Optimizes
* <code>SELECT (DISTINCT|REDUCED) ?property WHERE { ?x ?property ?y . }</code>
* and similar patterns using an O(N) algorithm, where N is the number of
* distinct solutions.
* <p>
* The main advantage here is to turn an access path that is fully unbound into
* a distinct-term scan. If the access path would be evaluated as-bound with at
* least one variable bound, then the distinct term scan might not have any
* advantage over the pipeline join (and the semantics of DISTINCT would be
* violated with multiple as-bound evaluations of the distinct-term-scan without
* a hash index to impose the DISTINCT constraint).
*
* TODO We are doing something very similar for <code>GRAPH ?g {}</code>. It
* would be worthwhile to look at that code in the light of this optimizer.
*
* @see <a href="http://trac.blazegraph.com/ticket/1035" > DISTINCT PREDICATEs
* query is slow </a>
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
*/
public class ASTDistinctTermScanOptimizer implements IASTOptimizer {

    /**
     * Default constructor. The optimizer holds no state; all work is done in
     * {@link #optimize(AST2BOpContext, QueryNodeWithBindingSet)}.
     */
    public ASTDistinctTermScanOptimizer() {
    }
@Override
public QueryNodeWithBindingSet optimize(
final AST2BOpContext context, final QueryNodeWithBindingSet input) {
final IQueryNode queryNode = input.getQueryNode();
final IBindingSet[] bindingSets = input.getBindingSets();
final QueryRoot queryRoot = (QueryRoot) queryNode;
final StaticAnalysis sa = new StaticAnalysis(queryRoot, context);
final DatasetNode dataset = queryRoot.getDataset();
if (context.getAbstractTripleStore().isQuads()) {
boolean ok = false;
if (dataset == null || dataset.getNamedGraphs() == null) {
/*
* The dataset is all graphs.
*/
ok = true;
}
if (!ok) {
return new QueryNodeWithBindingSet(queryNode, bindingSets);
}
}
// First, process any pre-existing named subqueries.
{
final NamedSubqueriesNode namedSubqueries = queryRoot
.getNamedSubqueries();
if (namedSubqueries != null) {
// Note: works around concurrent modification error.
final List<NamedSubqueryRoot> list = BOpUtility.toList(
namedSubqueries, NamedSubqueryRoot.class);
for (NamedSubqueryRoot namedSubquery : list) {
// Rewrite the named sub-select
doSelectQuery(context, sa, (QueryRoot) queryNode, namedSubquery);
}
}
}
// rewrite the top-level select
doSelectQuery(context, sa, (QueryRoot) queryNode, (QueryBase) queryNode);
return new QueryNodeWithBindingSet(queryNode, bindingSets);
}
private void doRecursiveRewrite(final AST2BOpContext context,
final StaticAnalysis sa, final QueryRoot queryRoot,
final GraphPatternGroup<IGroupMemberNode> group) {
final int arity = group.arity();
for (int i = 0; i < arity; i++) {
final BOp child = (BOp) group.get(i);
if (child instanceof GraphPatternGroup<?>) {
// Recursion into groups.
doRecursiveRewrite(context, sa, queryRoot,
((GraphPatternGroup<IGroupMemberNode>) child));
} else if (child instanceof SubqueryRoot) {
// Recursion into subqueries.
final SubqueryRoot subqueryRoot = (SubqueryRoot) child;
doRecursiveRewrite(context, sa, queryRoot,
subqueryRoot.getWhereClause());
// rewrite the sub-select
doSelectQuery(context, sa, queryRoot, (SubqueryBase) child);
} else if (child instanceof ServiceNode) {
// Do not rewrite things inside of a SERVICE node.
continue;
}
}
}
    /**
     * Attempt to rewrite the SELECT as a distinct-term scan.
     * <p>
     * The rewrite only fires for
     * <code>SELECT (DISTINCT|REDUCED) ?var { triple-or-quad-pattern }</code>
     * where <code>?var</code> is one of the variables bound by that pattern.
     * When it fires, the DISTINCT/REDUCED modifier is cleared (the scan
     * enforces it natively) and the statement pattern node is annotated with
     * the scan variable, the chosen key order, and a reduced cardinality
     * estimate. Otherwise the query is left unmodified.
     *
     * @param context
     *            The evaluation context.
     * @param sa
     *            The static analysis helper for the query.
     * @param queryRoot
     *            The top-level of the query.
     * @param queryBase
     *            Either a top-level query or a sub-query.
     */
    private void doSelectQuery(final AST2BOpContext context,
            final StaticAnalysis sa, final QueryRoot queryRoot,
            final QueryBase queryBase) {

        // Depth-first recursion: rewrite nested groups/sub-selects first.
        doRecursiveRewrite(context, sa, queryRoot, queryBase.getWhereClause());

        if (queryBase.getQueryType() != QueryType.SELECT) {
            // Only plain SELECT queries are eligible.
            return;
        }

        /*
         * Looking for SELECT ?var { triple-or-quads-pattern }
         *
         * where ?var is one of the variables in that triple or quads pattern.
         */
        final ProjectionNode projection = queryBase.getProjection();

        if (!projection.isDistinct() && !projection.isReduced()) {
            /*
             * The distinct term scan automatically eliminates duplicates.
             * Therefore it is only allowable with SELECT DISTINCT or SELECT
             * REDUCED.
             */
            return;
        }

        if (projection.isEmpty())
            return;

        if (projection.arity() > 1)
            // More than one projected expression: not a single-var projection.
            return;

        final AssignmentNode assignmentNode = projection.getExpr(0);

        if (!(assignmentNode.getValueExpressionNode() instanceof VarNode)) {
            /*
             * The projection needs to be a single, simple variable (not a
             * computed expression such as BIND/aggregate).
             */
            return;
        }

        // The sole projected variable.
        final IVariable<?> projectedVar = assignmentNode.getVar();

        /**
         * Looking for a single triple or quad pattern in the WHERE clause.
         */
        final GraphPatternGroup<IGroupMemberNode> whereClause =
                queryBase.getWhereClause();

        if (whereClause == null || whereClause.arity() != 1) {
            // Not a simple triple pattern.
            return;
        }

        if (!(whereClause.get(0) instanceof StatementPatternNode)) {
            // Not a simple triple pattern.
            return;
        }

        // The single triple pattern.
        final StatementPatternNode sp = (StatementPatternNode) whereClause.get(0);

        /*
         * When in history mode, we can't do a distinct term scan unless the
         * StatementPatternNode has been marked to read history. The distinct
         * term scan will visit terms that might have been deleted.
         */
        if (context.getAbstractTripleStore().isRDRHistory()) {
            if (!sp.getQueryHintAsBoolean(QueryHints.HISTORY, false)) {
                // Can not rewrite.
                return;
            }
        }

        // Find an index whose key order supports the scan (null if none).
        IKeyOrder<ISPO> keyOrder = getApplicableKeyOrderIfExists(sp,
                projectedVar, context);
        if (keyOrder == null) {
            return;
        }

        /*
         * Make sure that there are no correlated variables in the SP, i.e. no
         * variable occurs in more than one position of the pattern.
         */
        {
            final Set<VarNode> vars = new LinkedHashSet<VarNode>();
            for (VarNode varNode : BOpUtility.toList(sp, VarNode.class)) {
                if (!vars.add(varNode)) {
                    // This variable appears more than once.
                    return;
                }
            }
        }

        // The variables bound by the triple/quad pattern.
        final Set<IVariable<?>> producedBindings = sp.getProducedBindings();

        if (!producedBindings.contains(projectedVar)) {
            /*
             * The projected variable is not any of the variables used in the
             * triple pattern.
             *
             * Note: This rewrite is only advantageous when a single variable is
             * bound by the triple pattern is projected out of the query (or
             * perhaps when 2 variables are bound by a quad pattern and projected
             * out of the query). The benefit of this rewrite is that we can avoid
             * binding the variables that are NOT projected out of the query.
             *
             * TODO Does this already handle named graph APs?
             */
            return;
        }

        if (queryBase instanceof SubqueryRoot) {

            /*
             * The pattern was detected in a sub-select.
             *
             * Mark the SELECT as "run-once". This will cause it to be lifted out
             * as a named subquery. This is how we enforce bottom-up evaluation. In
             * this case it is REQUIRED since the semantics of DISTINCT / REDUCED
             * would not be enforced across multiple as-bound invocations of the
             * rewritten sub-SELECT.
             *
             * FIXME Make sure that we have a correctness test for the case of an
             * embedded sub-select where the DISTINCT semantics would otherwise
             * break. (We are REQUIRED to use bottom-up evaluation in order to have
             * the semantics of DISTINCT/REDUCED across the SELECT if this clause
             * is appearing as a sub-SELECT. Otherwise the sub-SELECT could be
             * evaluated for multiple source bindings leading to multiple
             * applications of the distinct-term-scan and that would break the
             * DISTINCT/REDUCED semantics of the operator.)
             */
            ((SubqueryRoot) queryBase).setRunOnce(true/* runOnce */);

        }

        /*
         * Disable DISTINCT/REDUCED. The distinct-term-scan will automatically
         * enforce this.
         */
        projection.setDistinct(false);
        projection.setReduced(false);

        /*
         * Setup the distinct-term-scan annotation with the variable that will
         * be projected out of the SELECT, and pin the key order chosen above.
         */
        final VarNode distinctTermScanVar = new VarNode(projectedVar.getName());
        sp.setDistinctTermScanVar(distinctTermScanVar);
        sp.setQueryHint(IPredicate.Annotations.KEY_ORDER, keyOrder.toString());

        /**
         * Change the estimated cardinality.
         *
         * The new cardinality is:
         *
         * <pre>
         * newCard = oldCard * 1.0 / arity(context, sp)
         * </pre>
         *
         * where arity() is 3 for triples and 4 for quads.
         */
        final Long oldCard = (Long) sp
                .getProperty(AST2BOpBase.Annotations.ESTIMATED_CARDINALITY);

        if (oldCard == null) {
            // Earlier optimizers are expected to have attached the estimate.
            throw new AssertionError(
                    "Expecting estimated-cardinality to be bound: sp=" + sp);
        }

        final int arity = context.isQuads() ? 4 : 3;

        final long newCard = (long) (oldCard * 1.0 / arity);

        sp.setProperty(AST2BOpBase.Annotations.ESTIMATED_CARDINALITY, newCard);

    }
/**
* Computes an applicable key order for performing a distinct range term
* scan, if exists. Such a key order must be formed out of a prefix
* [ConstList + DistinctVar], where ConstList is the list of constants in the
* triple pattern.
*
* @param sp
* @param context
* @return matching key order, if exists, null if not (indicating failure)
*/
private IKeyOrder<ISPO> getApplicableKeyOrderIfExists(
StatementPatternNode sp, IVariable<?> termScanVar,
AST2BOpContext context) {
boolean isQuads = context.getAbstractTripleStore().isQuads();
// first, construct a predicate for index probing
IVariableOrConstant[] args = new IVariableOrConstant[isQuads ? 4 : 3];
args[0] = sp.s().getValueExpression();
args[1] = sp.p().getValueExpression();
args[2] = sp.o().getValueExpression();
if (isQuads) {
args[3] = sp.c() == null ? Var.var("--anon-" + context.nextId()) : sp
.c().getValueExpression();
}
Set<SPOKeyOrder> candidateKeyOrder = getCandidateKeyOrders(sp,
termScanVar, context, isQuads);
if (candidateKeyOrder.isEmpty()) {
return null;
} else {
return candidateKeyOrder.iterator().next();
}
}
/**
* Return all candidate key orders. Candidate key orders must satisfy the
* condition that the constants in the triple pattern form a prefix, followed
* by the term scan variable in the position right behind the constant
* positions.
*
* @param sp
* @param termScanVar
* @param context
* @param isQuads
* @return
*/
private Set<SPOKeyOrder> getCandidateKeyOrders(StatementPatternNode sp,
IVariable<?> termScanVar, AST2BOpContext context, boolean isQuads) {
/**
* Constraints on the positions are as follows:
*
* 2 - constant 1 - the distinct term scan var 0 - unconstrained
*/
final StringBuffer constantPosBuf = new StringBuffer();
Character distinctTermScanPos = null;
final StringBuffer unconstrainedPosBuf = new StringBuffer();
final int pcS = getPositionConstraint(sp.s().getValueExpression(), termScanVar);
final int pcP = getPositionConstraint(sp.p().getValueExpression(), termScanVar);
final int pcO = getPositionConstraint(sp.o().getValueExpression(), termScanVar);
if (pcS == 2)
constantPosBuf.append("S");
if (pcP == 2)
constantPosBuf.append("P");
if (pcO == 2)
constantPosBuf.append("O");
if (pcS == 1)
distinctTermScanPos = 'S';
if (pcP == 1)
distinctTermScanPos = 'P';
if (pcO == 1)
distinctTermScanPos = 'O';
if (pcS == 0)
unconstrainedPosBuf.append("S");
if (pcP == 0)
unconstrainedPosBuf.append("P");
if (pcO == 0)
unconstrainedPosBuf.append("O");
if (isQuads) {
if (sp.c() == null || sp.c().getValueExpression() == null) {
unconstrainedPosBuf.append("C");
} else {
int pcC = getPositionConstraint(sp.c().getValueExpression(),
termScanVar);
if (pcC == 2)
constantPosBuf.append("C");
if (pcC == 1)
distinctTermScanPos = 'C';
if (pcC == 0)
unconstrainedPosBuf.append("C");
}
}
/**
* There's some bug if all but one of the positions are constrained,
* we do *not* want to apply the optimization in that case (though it
* should be possible). For now, the fix is to not optimize, while in
* future we may want to address the root cause of the issue.
*
* See https://jira.blazegraph.com/browse/BLZG-1346.
*/
final int maxLength = isQuads ? 2 : 1;
if (constantPosBuf.length() > maxLength) {
return new LinkedHashSet<SPOKeyOrder>();
}
final String prefix = constantPosBuf.toString();
final Set<String> allPossibleConstPrefixes = new LinkedHashSet<String>();
getPermutations(prefix, allPossibleConstPrefixes);
if (allPossibleConstPrefixes.isEmpty())
allPossibleConstPrefixes.add(""); // neutral element
final String suffix = unconstrainedPosBuf.toString();
final Set<String> allPossibleConstSuffixes = new LinkedHashSet<String>();
getPermutations(suffix, allPossibleConstSuffixes);
if (allPossibleConstSuffixes.isEmpty())
allPossibleConstSuffixes.add(""); // neutral element
// calculate set of all key order candidates
final Set<SPOKeyOrder> allPossiblePrefixes =
new LinkedHashSet<SPOKeyOrder>();
for (String constPrefix : allPossibleConstPrefixes) {
for (String constSuffix : allPossibleConstSuffixes) {
final String index = constPrefix + distinctTermScanPos + constSuffix;
try {
allPossiblePrefixes.add(SPOKeyOrder.fromString(index));
} catch (IllegalArgumentException e) {
// key order does not exist, ignoring
}
}
}
return allPossiblePrefixes;
}
/**
* Returns a constraint ID defined as follows:
*
* 2 - constant 1 - the distinct term scan var 0 - unconstrained
*/
private int getPositionConstraint(IVariableOrConstant val,
IVariable<?> termScanVar) {
if (val instanceof IConstant) {
return 2;
} else if (val instanceof IVariable) {
return val.equals(termScanVar) ? 1 : 0;
} else {
return 0; // val == null
}
}
private void getPermutations(String str, Set<String> collector) {
getPermutations("", str, collector);
}
private void getPermutations(
String prefix, String str, Set<String> collector) {
final int n = str.length();
if (n == 0)
collector.add(prefix);
else {
for (int i = 0; i < n; i++)
getPermutations(prefix + str.charAt(i),
str.substring(0, i) + str.substring(i + 1, n), collector);
}
}
}