/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Oct 19, 2011 */ package com.bigdata.rdf.sparql.ast.optimizers; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import com.bigdata.bop.BOp; import com.bigdata.bop.IBindingSet; import com.bigdata.bop.IVariable; import com.bigdata.rdf.sparql.ast.ArbitraryLengthPathNode; import com.bigdata.rdf.sparql.ast.AssignmentNode; import com.bigdata.rdf.sparql.ast.BindingsClause; import com.bigdata.rdf.sparql.ast.FilterNode; import com.bigdata.rdf.sparql.ast.GraphPatternGroup; import com.bigdata.rdf.sparql.ast.IBindingProducerNode; import com.bigdata.rdf.sparql.ast.IGroupMemberNode; import com.bigdata.rdf.sparql.ast.IQueryNode; import com.bigdata.rdf.sparql.ast.JoinGroupNode; import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode; import com.bigdata.rdf.sparql.ast.NamedSubqueryInclude; import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot; import com.bigdata.rdf.sparql.ast.ProjectionNode; import com.bigdata.rdf.sparql.ast.QueryBase; import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet; import com.bigdata.rdf.sparql.ast.QueryRoot; import com.bigdata.rdf.sparql.ast.QueryType; import com.bigdata.rdf.sparql.ast.StatementPatternNode; import com.bigdata.rdf.sparql.ast.StaticAnalysis; import com.bigdata.rdf.sparql.ast.SubqueryRoot; import com.bigdata.rdf.sparql.ast.UnionNode; import com.bigdata.rdf.sparql.ast.VarNode; import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext; import com.bigdata.rdf.sparql.ast.eval.AST2BOpUtility; import com.bigdata.rdf.sparql.ast.service.ServiceNode; /** * NOTE: this optimizer was not sound from a correctness perspective in previous * versions (cf. http://trac.blazegraph.com/ticket/1071); the rewritten form should * be fine from a correctness perspective, but does not really optimize queries * from a performance perspective, so it has been (temporarily) disabled in the * {@link DefaultOptimizerList}. The description below describes the original * idea, the actual implementation slightly differs in (i) that it passes around * intermediate solutions (rather than computing all joins over the original * outer mapping set) and (ii) does not compute a final join over all * intermediate solutions but simply returns the last computed result (which * is now possible due to (i)). * * Also note that the {@link AST2BOpUtility} has been refactored (as part of * ticket #1071 and #1118) and now implements intelligent pushing of projection * variables into OPTIONAL blocks, so there's probably no more need for this * optimizer. * * * Rewrite a join group using two or more complex OPTIONAL groups using a hash * join pattern. * <p> * Note: this optimization is not required if there is only one complex optional * in the join group. It is only when there are multiple complex optional groups * that we need to lift those groups out as named subqueries (since they need to * feed each other). If there is only one complex optional group, then we can * run it as a sub-group instead. * <p> * NOte: This optimization presumes that simple optional groups were already * translated into optional {@link StatementPatternNode}s. * <p> * Queries with multiple complex optional groups can be rewritten into the hash * join of solution sets as follows. * * <ol> * * <li>First, create a hash index from the required joins and any simple * optional joins. Given the modeling abilities of the AST, this is most easily * achieved by converting the required joins into a named subquery. The join * variable(s) for that named subquery will the subset of variable(s) which are * shared by each of the complex OPTIONAL groups. The * {@link ASTNamedSubqueryOptimizer} already handles the assignment of join * variables, so we do not need to consider it further here.</li> * * <li>For each complex optional group, use the solution step generated in (1) * and run the optional group as a named subquery producing a new solution set. * The WHERE clause of the named subquery should look like: * * <pre> * {INCLUDE %set . OPTIONAL {...}} * </pre> * * </li> * * <li>Join all of the named solution sets in (2) back together. For example, if * there were two complex optional groups and the required joins resulted in * known bound variables for var1 and var2, then those result sets might be * combined as follows in the main WHERE clause of the rewritten query. * * <pre> * INCLUDE %set1 . * INCLUDE %set2 JOIN ON (?var1, ?var2) . * </pre> * * Note: The join variables for those INCLUDEs MUST be identified through static * analysis. Failure to use available join variables will result in an extremely * inefficient query plan as the full cross product of the solutions will be * compared to identify solutions which join.</li> * * </ol> * * TODO The rewrite into named subquery includes means that we wind up building * more hash indices than we strictly require as a hash index will also be built * at the start of each optional group. However, since the hash index at the * start of the optional group has exactly the same data as the named subquery * include's hash index, we should elide the step which builds the extra hash * index. * * TODO This optimzer should not be strictly necessary at all. The same behavior * should arise from running the complex optionals as sub-groups. Based on a few * govtrack CI queries, it looks like we do better when the complex optional * groups are lifted out as named subqueries. We need to go back and investigate * whether or not this is true and why. * * @see https://sourceforge.net/apps/trac/bigdata/ticket/397 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @author <a href="mailto:ms@metaphacts.com">Michael Schmidt</a> * */ @Deprecated public class ASTComplexOptionalOptimizer implements IASTOptimizer { // private static final Logger log = Logger // .getLogger(ASTComplexOptionalOptimizer.class); /** * Set this to <code>false</code> to only optimize the top level group. * <p> * Note: We have confirm that this change does not have a negative impact on * either govtrack or BSBM. The change also fixes a known problem with some * customer queries. * * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/668" > Join * Group Optimization </a> */ private static boolean recurse = false; @Override public QueryNodeWithBindingSet optimize( final AST2BOpContext context, final QueryNodeWithBindingSet input) { final IQueryNode queryNode = input.getQueryNode(); final IBindingSet[] bindingSets = input.getBindingSets(); final QueryRoot queryRoot = (QueryRoot) queryNode; final StaticAnalysis sa = new StaticAnalysis(queryRoot, context); final Set<IVariable<?>> exogenousVars = context.getSolutionSetStats() .getUsedVars(); // First, process any pre-existing named subqueries. { final NamedSubqueriesNode namedSubqueries = queryRoot .getNamedSubqueries(); if (namedSubqueries != null) { final List<NamedSubqueryRoot> roots = new LinkedList<NamedSubqueryRoot>(); for (NamedSubqueryRoot namedSubquery : namedSubqueries) { roots.add(namedSubquery); } for (NamedSubqueryRoot namedSubquery : roots) { convertComplexOptionalGroups(context, sa, namedSubquery, namedSubquery.getWhereClause(), exogenousVars); } } } // Now process the main where clause. convertComplexOptionalGroups(context, sa, queryRoot, queryRoot.getWhereClause(), exogenousVars); return new QueryNodeWithBindingSet(queryNode, bindingSets); } /** * Using a depth-first recursion, examine each join group. If a join group * contains two or more complex optional groups (that is, an optional group * which was not translated into a optional statement pattern node) then * convert the join group having those complex optional child groups. */ private void convertComplexOptionalGroups(final AST2BOpContext context, final StaticAnalysis sa, final QueryBase query, final GraphPatternGroup<IGroupMemberNode> group, final Set<IVariable<?>> exogenousVars) { final int arity = group.arity(); int complexOptionalGroupCount = 0; for (int i = 0; i < arity; i++) { final BOp child = (BOp) group.get(i); if (child instanceof GraphPatternGroup<?>) { /* * Note: Do recursion *before* we do the rewrite. */ @SuppressWarnings("unchecked") final GraphPatternGroup<IGroupMemberNode> childGroup = (GraphPatternGroup<IGroupMemberNode>) child; if (recurse) { convertComplexOptionalGroups(context, sa, query, childGroup, exogenousVars); } if (childGroup.isOptional() && (!(childGroup.arity() == 1 && childGroup.get(0) instanceof NamedSubqueryInclude))) { /* * Note: Do NOT translate a child group which consists * solely of a named subquery include. That is the target * output for this optimizer! * * Note: This presumes that simple optionals were already * translated into optional statement patterns. */ complexOptionalGroupCount++; } } else if (child instanceof SubqueryRoot) { // Recursion into subqueries. final SubqueryRoot subqueryRoot = (SubqueryRoot) child; if (recurse) { convertComplexOptionalGroups(context, sa, query, subqueryRoot.getWhereClause(), exogenousVars); } } } if (complexOptionalGroupCount >= 2 && (group instanceof JoinGroupNode)) { /* * Convert a join group having more than one direct child which is a * complex optional group. */ convertJoinGroup(context, sa, query, (JoinGroupNode) group, exogenousVars); } } /** * 1. Move the required joins (INCLUDEs, statement pattern nodes, and * required subqueries) plus any simple OPTIONALs (which are statement * pattern nodes) into a new named subquery (unless the required joins are * already a single INCLUDE). * <p> * 2. Lift each complex optional group into a new named subquery which * INCLUDEs the result from (1), replacing it with an INCLUDE of the named * subquery in this group. * <p> * Note: The named solution set from (1) is NOT directly INCLUDEd back into * this join group. Instead, it is INCLUDEd into each the named subquery for * each complex optional lifted out of this join group. * * @param query * The (sub-)query in which the join group appears. We need to * know what is being projected out of the (sub-)query in order * to compute the projections when converting sub-groups into * subqueries. * @param group * The join group to be rewritten. * @param exogenousVars * The exogenous variables for the query. */ private void convertJoinGroup(final AST2BOpContext context, final StaticAnalysis sa, final QueryBase query, final JoinGroupNode group, final Set<IVariable<?>> exogenousVars) { /* * Step 0: Precondition check * * Check if the optimization is indeed applicable: rewriting is only * sound if the WHERE clause of the body select query (as constructed * in Step 1 below) will contain one or more solution generating * expressions (triple patterns, join groups, etc.) */ { final IGroupMemberNode[] members = group .toArray(new IGroupMemberNode[] {}); boolean optimizationApplicable = false; for(int i=0; i<members.length && !optimizationApplicable; i++) { // the where clause will be non-empty if any of the group // members contains a solution-generating pattern (different // from optional, which will be optimized) IGroupMemberNode t = members[i]; optimizationApplicable |= t instanceof IBindingProducerNode && !(t instanceof AssignmentNode) && // assignments run last !(t instanceof JoinGroupNode && ((JoinGroupNode) t).isOptional()); } if (!optimizationApplicable) return; // not safe, can't do anything here } /* * Step 1. * * FIXME This MUST recognize the case where everything is already in a * single INCLUDE, which is the post-condition for Step 1. Test this * by feeding the post-condition of this step into a unit test. */ // The name of the solution set for this join group. final String mainSolutionSetName = "--nsr-" + context.nextId(); // The set of direct children which are complex optional groups. final List<JoinGroupNode> complexGroups = new LinkedList<JoinGroupNode>(); // // The list of direct children which were moved. // final List<IGroupMemberNode> move = new LinkedList<IGroupMemberNode>(); { final NamedSubqueryRoot nsr = new NamedSubqueryRoot( QueryType.SELECT, mainSolutionSetName); sa.getQueryRoot().getNamedSubqueriesNotNull().add(nsr); final JoinGroupNode whereClause = new JoinGroupNode(); nsr.setWhereClause(whereClause); // Move any pre-filters or join-filters to the named subquery. { for (FilterNode f : sa.getPreFilters(group)) { whereClause.addChild(f); } for (FilterNode f : sa.getJoinFilters(group)) { whereClause.addChild(f); } } // Move any required joins to the named subquery. final IGroupMemberNode[] members = group .toArray(new IGroupMemberNode[] {}); for(IGroupMemberNode t : members) { if (t instanceof StatementPatternNode || t instanceof NamedSubqueryInclude || t instanceof SubqueryRoot || t instanceof ServiceNode || t instanceof UnionNode || t instanceof ArbitraryLengthPathNode || t instanceof BindingsClause) { // Moved to the named subquery. // move.add(t); group.removeChild(t); whereClause.addChild(t); } else if (t instanceof FilterNode) { /* * Leave other filters in place. They depend on something in * the optional groups. */ } else if(t instanceof AssignmentNode) { /* * Leave assignment nodes in place. They run last. */ } else if (t instanceof JoinGroupNode) { final JoinGroupNode childGroup = (JoinGroupNode) t; if (childGroup.isOptional()) { /* * This will be moved into a different named subquery in * the next step (below). */ complexGroups.add(childGroup); } else { // Move non-optional child group to the named subquery. group.removeChild(t); whereClause.addChild(childGroup); } } else { /* * This is a catch all for things which might not have been * considered above. */ throw new AssertionError("Not expecting: "+t+" in "+group); } } /* * Create the PROJECTION for the lifted named subquery. * * Note: Everything which was lifted is no longer present in the * WHERE clause. Thus, when computing the projection of the lifted * subquery we want to project anything which appeared in the lifted * where clause IF it is referenced again by those things which * remain in the group (but paying attention to variable scoping for * sub-queries). * * TODO This would appear to ignore variables which are referenced * in a parent group. I.e., we should recursively apply the same * analysis to the downstream siblings in the parent group to make * sure that the variable is not reused. * * TODO Make this projection DISTINCT if that does not change the * query semantics. * * @see https://sourceforge.net/apps/trac/bigdata/ticket/368 (Prune * variables during query evaluation) */ { // All variables which are used within the WHERE clause of the lifted named subquery. final Set<IVariable<?>> groupVars = sa.getSpannedVariables(whereClause, new LinkedHashSet<IVariable<?>>()); // All variables still referenced in the joins or filters of // the group (after extracting the named subquery). final Set<IVariable<?>> afterVars = sa.getSpannedVariables( (BOp) group, new LinkedHashSet<IVariable<?>>()); if (query.getProjection() != null) { // Include anything that we must project out of the query. final ProjectionNode tmp = query.getProjection(); tmp.getSelectExprVars(afterVars); // tmp.getProjectionVars(afterVars);// FIXME This needs to be the variables USED in the SELECT expressions, NOT the variables projected out of the query. } final Set<IVariable<?>> projectedVars = new LinkedHashSet<IVariable<?>>(); projectedVars.addAll(groupVars); projectedVars.retainAll(afterVars); final ProjectionNode projection = new ProjectionNode(); for (IVariable<?> v : projectedVars) { projection.addProjectionVar(new VarNode(v.getName())); } nsr.setProjection(projection); } } /** * Extract maybe produced variables from the complex join groups, * making the accessible in an easy way for reuse in the subsequent * iteration */ final List<Set<IVariable<?>>> complexGroupsDefiniteVars = new ArrayList<Set<IVariable<?>>>(complexGroups.size()); for (int i=0; i<complexGroups.size(); i++) { final Set<IVariable<?>> cur = new HashSet<IVariable<?>>(); sa.getDefinitelyProducedBindings(complexGroups.get(i), cur, true); complexGroupsDefiniteVars.add(i,cur); } // Step 2 (for each direct child complex optional group). String precedingSolutionName = mainSolutionSetName; for (int i=0; i<complexGroups.size(); i++) { final JoinGroupNode childGroup = complexGroups.get(i); // log.error("Convert: " + childGroup); final String solutionSetName = "--nsr-" + context.nextId(); final NamedSubqueryRoot nsr = new NamedSubqueryRoot( QueryType.SELECT, solutionSetName); sa.getQueryRoot().getNamedSubqueriesNotNull().add(nsr); final JoinGroupNode whereClause = new JoinGroupNode(); nsr.setWhereClause(whereClause); final NamedSubqueryInclude mainInclude = new NamedSubqueryInclude( precedingSolutionName); whereClause.addChild(mainInclude); whereClause.addChild(childGroup); final NamedSubqueryInclude anInclude = new NamedSubqueryInclude( solutionSetName); /* * We substitute the current include into the main query. * * TODO: Note that it may be removed again at the end of the for * loop: actually, we only keep the final subquery, all others are * dropped again. We just need to add them temporarily, * to be able to reuse the static analysis (call * sa.getProjectedVars) below. We might try to change this to make * the code more readable. */ if (group.replaceWith(childGroup, anInclude) != 1) throw new AssertionError(); /* * Create the projection for the named subquery and replace the * query with the named subquery ID. */ { /* * sa.getProjectedVars computes required variables according * to the ancestor axis */ final Set<IVariable<?>> projectedVars = sa.getProjectedVars( anInclude, whereClause, query, exogenousVars, new LinkedHashSet<IVariable<?>>()); /* * In addition to the vars collected by sa.getProjectedVars, * we need to retain variables appearing in subsequent complex * join groups. This is necessary to avoid a blowup (duplicates) * in the number of results, see ticket #801, i.e. we need to * make sure that joins with subsequent join groups are * executed over *all* joint variables. * * To do so, we start up with the maybe vars of the group itself * and retain all maybe vars occurring in one of the following * join groups, and add them to the list of projected vars. */ final Set<IVariable<?>> joinVarCandidates = complexGroupsDefiniteVars.get(i); final Set<IVariable<?>> subsequentGroupDefiniteVars = new HashSet<IVariable<?>>(); for (int j=i+1; j<complexGroupsDefiniteVars.size(); j++) { subsequentGroupDefiniteVars.addAll(complexGroupsDefiniteVars.get(j)); } joinVarCandidates.retainAll(subsequentGroupDefiniteVars); projectedVars.addAll(joinVarCandidates); /* * Having computed the projection vars, we're now ready to * build the projection clause for the current named subquery. */ final ProjectionNode projection = new ProjectionNode(); for (IVariable<?> v : projectedVars) { projection.addProjectionVar(new VarNode(v.getName())); } nsr.setProjection(projection); // remove group again if (i!=complexGroups.size()-1) { if (!group.removeArg(anInclude)) throw new AssertionError(); } } precedingSolutionName = solutionSetName; } } }