ASTComplexOptionalOptimizer.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Oct 19, 2011
 */

package com.bigdata.rdf.sparql.ast.optimizers;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import com.bigdata.bop.BOp;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IVariable;
import com.bigdata.rdf.sparql.ast.ArbitraryLengthPathNode;
import com.bigdata.rdf.sparql.ast.AssignmentNode;
import com.bigdata.rdf.sparql.ast.BindingsClause;
import com.bigdata.rdf.sparql.ast.FilterNode;
import com.bigdata.rdf.sparql.ast.GraphPatternGroup;
import com.bigdata.rdf.sparql.ast.IBindingProducerNode;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.JoinGroupNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryInclude;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.ProjectionNode;
import com.bigdata.rdf.sparql.ast.QueryBase;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.QueryType;
import com.bigdata.rdf.sparql.ast.StatementPatternNode;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.SubqueryRoot;
import com.bigdata.rdf.sparql.ast.UnionNode;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpUtility;
import com.bigdata.rdf.sparql.ast.service.ServiceNode;

/**
 * NOTE: this optimizer was not sound from a correctness perspective in previous
 * versions (cf. http://trac.blazegraph.com/ticket/1071); the rewritten form should 
 * be fine from a correctness perspective, but does not really optimize queries
 * from a performance perspective, so it has been (temporarily) disabled in the
 * {@link DefaultOptimizerList}. The description below describes the original
 * idea, the actual implementation slightly differs in (i) that it passes around
 * intermediate solutions (rather than computing all joins over the original
 * outer mapping set) and (ii) does not compute a final join over all
 * intermediate solutions but simply returns the last computed result (which
 * is now possible due to (i)). 
 * 
 * Also note that the {@link AST2BOpUtility} has been refactored (as part of
 * ticket #1071 and #1118) and now implements intelligent pushing of projection
 * variables into OPTIONAL blocks, so there's probably no more need for this
 * optimizer.
 * 
 * 
 * Rewrite a join group using two or more complex OPTIONAL groups using a hash
 * join pattern.
 * <p>
 * Note: this optimization is not required if there is only one complex optional
 * in the join group. It is only when there are multiple complex optional groups
 * that we need to lift those groups out as named subqueries (since they need to
 * feed each other). If there is only one complex optional group, then we can
 * run it as a sub-group instead.
 * <p>
 * Note: This optimization presumes that simple optional groups were already
 * translated into optional {@link StatementPatternNode}s.
 * <p>
 * Queries with multiple complex optional groups can be rewritten into the hash
 * join of solution sets as follows.
 * 
 * <ol>
 * 
 * <li>First, create a hash index from the required joins and any simple
 * optional joins. Given the modeling abilities of the AST, this is most easily
 * achieved by converting the required joins into a named subquery. The join
 * variable(s) for that named subquery will be the subset of variable(s) which are
 * shared by each of the complex OPTIONAL groups. The
 * {@link ASTNamedSubqueryOptimizer} already handles the assignment of join
 * variables, so we do not need to consider it further here.</li>
 * 
 * <li>For each complex optional group, use the solution step generated in (1)
 * and run the optional group as a named subquery producing a new solution set.
 * The WHERE clause of the named subquery should look like:
 * 
 * <pre>
 * {INCLUDE %set . OPTIONAL {...}}
 * </pre>
 * 
 * </li>
 * 
 * <li>Join all of the named solution sets in (2) back together. For example, if
 * there were two complex optional groups and the required joins resulted in
 * known bound variables for var1 and var2, then those result sets might be
 * combined as follows in the main WHERE clause of the rewritten query.
 * 
 * <pre>
 * INCLUDE %set1 .
 * INCLUDE %set2 JOIN ON (?var1, ?var2) .
 * </pre>
 * 
 * Note: The join variables for those INCLUDEs MUST be identified through static
 * analysis. Failure to use available join variables will result in an extremely
 * inefficient query plan as the full cross product of the solutions will be
 * compared to identify solutions which join.</li>
 * 
 * </ol>
 * 
 * TODO The rewrite into named subquery includes means that we wind up building
 * more hash indices than we strictly require as a hash index will also be built
 * at the start of each optional group. However, since the hash index at the
 * start of the optional group has exactly the same data as the named subquery
 * include's hash index, we should elide the step which builds the extra hash
 * index.
 * 
 * TODO This optimizer should not be strictly necessary at all. The same behavior
 * should arise from running the complex optionals as sub-groups. Based on a few
 * govtrack CI queries, it looks like we do better when the complex optional
 * groups are lifted out as named subqueries. We need to go back and investigate
 * whether or not this is true and why.
 * 
 * @see https://sourceforge.net/apps/trac/bigdata/ticket/397
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @author <a href="mailto:ms@metaphacts.com">Michael Schmidt</a>
 * 
 */


@Deprecated
public class ASTComplexOptionalOptimizer implements IASTOptimizer {

    /**
     * Compile-time switch: when <code>false</code> only the direct children
     * of the group handed to
     * {@link #convertComplexOptionalGroups(AST2BOpContext, StaticAnalysis, QueryBase, GraphPatternGroup, Set)}
     * are inspected; when <code>true</code> child groups and subqueries are
     * processed recursively (depth-first) before the enclosing group.
     * <p>
     * Note: We have confirmed that this setting does not have a negative
     * impact on either govtrack or BSBM. The change also fixes a known problem
     * with some customer queries.
     * 
     * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/668" > Join
     *      Group Optimization </a>
     */
    private static final boolean RECURSE = false;

    /**
     * Prefix of the names generated for the lifted named solution sets; the
     * suffix is taken from {@link AST2BOpContext#nextId()}.
     */
    private static final String NSR_PREFIX = "--nsr-";

    /**
     * Applies the rewrite to the WHERE clause of every pre-existing named
     * subquery and then to the main WHERE clause of the query.
     * 
     * @param context
     *            The evaluation context; used for static analysis, for the
     *            exogenous variables and to generate solution set names.
     * @param input
     *            The query node together with its binding sets.
     * 
     * @return A new {@link QueryNodeWithBindingSet} wrapping the same query
     *         node (modified in place) and the unchanged binding sets. If the
     *         query node is not a {@link QueryRoot} the input is passed
     *         through without modification.
     */
    @Override
    public QueryNodeWithBindingSet optimize(
            final AST2BOpContext context, final QueryNodeWithBindingSet input) {

        final IQueryNode queryNode = input.getQueryNode();
        final IBindingSet[] bindingSets = input.getBindingSets();

        if (!(queryNode instanceof QueryRoot)) {
            // This optimizer only knows how to rewrite a top-level query;
            // pass anything else through rather than failing on the cast.
            return new QueryNodeWithBindingSet(queryNode, bindingSets);
        }

        final QueryRoot queryRoot = (QueryRoot) queryNode;

        final StaticAnalysis sa = new StaticAnalysis(queryRoot, context);

        final Set<IVariable<?>> exogenousVars = context.getSolutionSetStats()
                .getUsedVars();

        // First, process any pre-existing named subqueries.
        final NamedSubqueriesNode namedSubqueries = queryRoot
                .getNamedSubqueries();

        if (namedSubqueries != null) {

            /*
             * Iterate over a snapshot: the rewrite registers additional named
             * subqueries on the query root reachable from the static analysis
             * (presumably this same node), so we must not iterate the live
             * collection while converting.
             */
            final List<NamedSubqueryRoot> roots = new ArrayList<NamedSubqueryRoot>();

            for (NamedSubqueryRoot namedSubquery : namedSubqueries) {
                roots.add(namedSubquery);
            }

            for (NamedSubqueryRoot namedSubquery : roots) {
                convertComplexOptionalGroups(context, sa, namedSubquery,
                        namedSubquery.getWhereClause(), exogenousVars);
            }

        }

        // Now process the main where clause.
        convertComplexOptionalGroups(context, sa, queryRoot,
                queryRoot.getWhereClause(), exogenousVars);

        return new QueryNodeWithBindingSet(queryNode, bindingSets);

    }

    /**
     * Examine the direct children of a group (and, when {@link #RECURSE} is
     * enabled, its descendants in depth-first order). If the group is a
     * {@link JoinGroupNode} with two or more complex optional child groups
     * (that is, optional groups which were not translated into optional
     * statement pattern nodes) then the group is rewritten by
     * {@link #convertJoinGroup(AST2BOpContext, StaticAnalysis, QueryBase, JoinGroupNode, Set)}.
     * 
     * @param context
     *            The evaluation context.
     * @param sa
     *            The static analysis helper for the query.
     * @param query
     *            The (sub-)query in which the group appears.
     * @param group
     *            The group to inspect. A <code>null</code> group (no WHERE
     *            clause) is ignored.
     * @param exogenousVars
     *            The exogenous variables for the query.
     */
    private void convertComplexOptionalGroups(final AST2BOpContext context,
            final StaticAnalysis sa,
            final QueryBase query,
            final GraphPatternGroup<IGroupMemberNode> group,
            final Set<IVariable<?>> exogenousVars) {

        if (group == null) {
            // No WHERE clause: nothing to rewrite.
            return;
        }

        final int arity = group.arity();

        int complexOptionalGroupCount = 0;

        for (int i = 0; i < arity; i++) {

            final BOp child = (BOp) group.get(i);

            if (child instanceof GraphPatternGroup<?>) {

                @SuppressWarnings("unchecked")
                final GraphPatternGroup<IGroupMemberNode> childGroup = (GraphPatternGroup<IGroupMemberNode>) child;

                // Note: Do recursion *before* we do the rewrite.
                if (RECURSE) {
                    convertComplexOptionalGroups(context, sa, query,
                            childGroup, exogenousVars);
                }

                /*
                 * Note: Do NOT count a child group which consists solely of a
                 * named subquery include. That is the target output for this
                 * optimizer!
                 * 
                 * Note: This presumes that simple optionals were already
                 * translated into optional statement patterns.
                 */
                final boolean includeOnly = childGroup.arity() == 1
                        && childGroup.get(0) instanceof NamedSubqueryInclude;

                if (childGroup.isOptional() && !includeOnly) {
                    complexOptionalGroupCount++;
                }

            } else if (child instanceof SubqueryRoot) {

                // Recursion into subqueries.
                final SubqueryRoot subqueryRoot = (SubqueryRoot) child;

                if (RECURSE) {
                    convertComplexOptionalGroups(context, sa, query,
                            subqueryRoot.getWhereClause(), exogenousVars);
                }

            }

        }

        if (complexOptionalGroupCount >= 2 && (group instanceof JoinGroupNode)) {

            /*
             * Convert a join group having more than one direct child which is
             * a complex optional group.
             */
            convertJoinGroup(context, sa, query, (JoinGroupNode) group,
                    exogenousVars);

        }

    }

    /**
     * Step 0 precondition for
     * {@link #convertJoinGroup(AST2BOpContext, StaticAnalysis, QueryBase, JoinGroupNode, Set)}:
     * reports whether the group has at least one direct child which is an
     * {@link IBindingProducerNode} other than an {@link AssignmentNode}
     * (assignments run last) or an optional {@link JoinGroupNode} (those are
     * the groups being lifted).
     * 
     * @param group
     *            The join group that is a candidate for rewriting.
     * 
     * @return <code>true</code> iff such a child exists.
     */
    private static boolean hasRequiredBindingProducer(final JoinGroupNode group) {

        final IGroupMemberNode[] members = group
                .toArray(new IGroupMemberNode[] {});

        for (IGroupMemberNode t : members) {

            final boolean optionalGroup = t instanceof JoinGroupNode
                    && ((JoinGroupNode) t).isOptional();

            if (t instanceof IBindingProducerNode
                    && !(t instanceof AssignmentNode) && !optionalGroup) {
                return true;
            }

        }

        return false;

    }

    /**
     * Builds a projection which projects exactly the given variables, in the
     * iteration order of the set.
     * 
     * @param vars
     *            The variables to project.
     * 
     * @return The new projection node.
     */
    private static ProjectionNode newProjection(final Set<IVariable<?>> vars) {

        final ProjectionNode projection = new ProjectionNode();

        for (IVariable<?> v : vars) {
            projection.addProjectionVar(new VarNode(v.getName()));
        }

        return projection;

    }

    /**
     * 1. Move the required joins (INCLUDEs, statement pattern nodes, and
     * required subqueries) plus any simple OPTIONALs (which are statement
     * pattern nodes) into a new named subquery.
     * <p>
     * 2. Lift each complex optional group into its own new named subquery
     * which INCLUDEs the solution set produced by the preceding step (the
     * named subquery from (1) for the first optional, the named subquery of
     * the previous optional otherwise). Only the INCLUDE of the last named
     * subquery remains in this group when the method returns.
     * <p>
     * Note: The named solution set from (1) is NOT directly INCLUDEd back into
     * this join group. It is only INCLUDEd into the named subquery created for
     * the first complex optional lifted out of this join group.
     * <p>
     * The rewrite is skipped (the group is left untouched) when the group has
     * no required solution generating member, since the named subquery from
     * (1) would then have an empty WHERE clause.
     * 
     * @param context
     *            The evaluation context; used to generate solution set names.
     * @param sa
     *            The static analysis helper; new named subqueries are added to
     *            its query root.
     * @param query
     *            The (sub-)query in which the join group appears. We need to
     *            know what is being projected out of the (sub-)query in order
     *            to compute the projections when converting sub-groups into
     *            subqueries.
     * @param group
     *            The join group to be rewritten.
     * @param exogenousVars
     *            The exogenous variables for the query.
     * 
     * @throws AssertionError
     *             if the group contains a kind of child which is not handled
     *             by this rewrite, or if the AST does not have the expected
     *             shape while swapping an optional group for its INCLUDE.
     */
    private void convertJoinGroup(final AST2BOpContext context,
            final StaticAnalysis sa, final QueryBase query,
            final JoinGroupNode group, final Set<IVariable<?>> exogenousVars) {

        /*
         * Step 0: Precondition check
         * 
         * Check if the optimization is indeed applicable: rewriting is only
         * sound if the WHERE clause of the body select query (as constructed
         * in Step 1 below) will contain one or more solution generating
         * expressions (triple patterns, join groups, etc.)
         */
        if (!hasRequiredBindingProducer(group)) {
            return; // not safe, can't do anything here
        }

        /*
         * Step 1.
         * 
         * FIXME This MUST recognize the case where everything is already in a
         * single INCLUDE, which is the post-condition for Step 1.  Test this
         * by feeding the post-condition of this step into a unit test.
         */

        // The name of the solution set for this join group.
        final String mainSolutionSetName = NSR_PREFIX + context.nextId();

        /*
         * The direct children which are optional join groups, in group order.
         * 
         * NOTE(review): this also collects optional groups consisting solely
         * of a named subquery include, which the caller does not count as
         * "complex" - confirm that this is intended.
         */
        final List<JoinGroupNode> complexGroups = new ArrayList<JoinGroupNode>();

        {

            final NamedSubqueryRoot nsr = new NamedSubqueryRoot(
                    QueryType.SELECT, mainSolutionSetName);

            sa.getQueryRoot().getNamedSubqueriesNotNull().add(nsr);

            final JoinGroupNode whereClause = new JoinGroupNode();
            nsr.setWhereClause(whereClause);

            // Move any pre-filters or join-filters to the named subquery.
            for (FilterNode f : sa.getPreFilters(group)) {
                whereClause.addChild(f);
            }
            for (FilterNode f : sa.getJoinFilters(group)) {
                whereClause.addChild(f);
            }

            // Move any required joins to the named subquery. We iterate over
            // a snapshot since children are removed from the group as we go.
            final IGroupMemberNode[] members = group
                    .toArray(new IGroupMemberNode[] {});

            for (IGroupMemberNode t : members) {

                if (t instanceof StatementPatternNode
                        || t instanceof NamedSubqueryInclude
                        || t instanceof SubqueryRoot
                        || t instanceof ServiceNode
                        || t instanceof UnionNode
                        || t instanceof ArbitraryLengthPathNode
                        || t instanceof BindingsClause) {
                    // Moved to the named subquery.
                    group.removeChild(t);
                    whereClause.addChild(t);
                } else if (t instanceof FilterNode) {
                    /*
                     * Leave other filters in place. They depend on something in
                     * the optional groups.
                     */
                } else if (t instanceof AssignmentNode) {
                    /*
                     * Leave assignment nodes in place. They run last.
                     */
                } else if (t instanceof JoinGroupNode) {
                    final JoinGroupNode childGroup = (JoinGroupNode) t;
                    if (childGroup.isOptional()) {
                        /*
                         * This will be moved into a different named subquery in
                         * the next step (below).
                         */
                        complexGroups.add(childGroup);
                    } else {
                        // Move non-optional child group to the named subquery.
                        group.removeChild(t);
                        whereClause.addChild(childGroup);
                    }
                } else {
                    /*
                     * This is a catch all for things which might not have been
                     * considered above.
                     */
                    throw new AssertionError("Not expecting: "+t+" in "+group);
                }

            }

            /*
             * Create the PROJECTION for the lifted named subquery.
             * 
             * Note: Everything which was lifted is no longer present in the
             * WHERE clause. Thus, when computing the projection of the lifted
             * subquery we want to project anything which appeared in the lifted
             * where clause IF it is referenced again by those things which
             * remain in the group (but paying attention to variable scoping for
             * sub-queries).
             * 
             * TODO This would appear to ignore variables which are referenced
             * in a parent group. I.e., we should recursively apply the same
             * analysis to the downstream siblings in the parent group to make
             * sure that the variable is not reused.
             * 
             * TODO Make this projection DISTINCT if that does not change the
             * query semantics.
             * 
             * @see https://sourceforge.net/apps/trac/bigdata/ticket/368 (Prune
             * variables during query evaluation)
             */
            {

                // All variables which are used within the WHERE clause of the
                // lifted named subquery.
                final Set<IVariable<?>> groupVars = sa.getSpannedVariables(
                        whereClause, new LinkedHashSet<IVariable<?>>());

                // All variables still referenced in the joins or filters of
                // the group (after extracting the named subquery).
                final Set<IVariable<?>> afterVars = sa.getSpannedVariables(
                        (BOp) group, new LinkedHashSet<IVariable<?>>());

                if (query.getProjection() != null) {
                    /*
                     * Include anything that we must project out of the query.
                     * 
                     * FIXME This needs to be the variables USED in the SELECT
                     * expressions, NOT the variables projected out of the
                     * query (i.e., not getProjectionVars()).
                     */
                    final ProjectionNode tmp = query.getProjection();
                    tmp.getSelectExprVars(afterVars);
                }

                // Project the lifted variables which are still needed.
                final Set<IVariable<?>> projectedVars = new LinkedHashSet<IVariable<?>>();
                projectedVars.addAll(groupVars);
                projectedVars.retainAll(afterVars);

                nsr.setProjection(newProjection(projectedVars));

            }

        }

        /*
         * Collect the definitely produced variables of each optional group up
         * front (while the groups are still children of this group), making
         * them accessible in an easy way for reuse in the subsequent
         * iteration. The list is index-aligned with complexGroups.
         */
        final List<Set<IVariable<?>>> complexGroupsDefiniteVars = 
                new ArrayList<Set<IVariable<?>>>(complexGroups.size());

        for (int i = 0; i < complexGroups.size(); i++) {

            final Set<IVariable<?>> cur = new HashSet<IVariable<?>>();
            sa.getDefinitelyProducedBindings(complexGroups.get(i), cur, true);
            complexGroupsDefiniteVars.add(i, cur);

        }

        // Step 2 (for each direct child complex optional group).
        String precedingSolutionName = mainSolutionSetName;

        for (int i = 0; i < complexGroups.size(); i++) {

            final JoinGroupNode childGroup = complexGroups.get(i);

            final String solutionSetName = NSR_PREFIX + context.nextId();

            final NamedSubqueryRoot nsr = new NamedSubqueryRoot(
                    QueryType.SELECT, solutionSetName);

            sa.getQueryRoot().getNamedSubqueriesNotNull().add(nsr);

            final JoinGroupNode whereClause = new JoinGroupNode();
            nsr.setWhereClause(whereClause);

            // WHERE { INCLUDE %preceding . OPTIONAL {...} }
            final NamedSubqueryInclude mainInclude = new NamedSubqueryInclude(
                    precedingSolutionName);

            whereClause.addChild(mainInclude);
            whereClause.addChild(childGroup);

            final NamedSubqueryInclude anInclude = new NamedSubqueryInclude(
                    solutionSetName);

            /*
             * We substitute the current include into the main query.
             * 
             * TODO: Note that it may be removed again at the end of the for 
             * loop: actually, we only keep the final subquery, all others are
             * dropped again. We just need to add them temporarily,
             * to be able to reuse the static analysis (call 
             * sa.getProjectedVars) below. We might try to change this to make
             * the code more readable.
             */
            if (group.replaceWith(childGroup, anInclude) != 1)
                throw new AssertionError();

            /*
             * Create the projection for the named subquery and replace the
             * query with the named subquery ID.
             */
            {
                /*
                 * sa.getProjectedVars computes required variables according
                 * to the ancestor axis
                 */
                final Set<IVariable<?>> projectedVars = sa.getProjectedVars(
                        anInclude, whereClause, query, exogenousVars,
                        new LinkedHashSet<IVariable<?>>());

                /*
                 * In addition to the vars collected by sa.getProjectedVars,
                 * we need to retain variables appearing in subsequent complex
                 * join groups. This is necessary to avoid a blowup (duplicates)
                 * in the number of results, see ticket #801, i.e. we need to
                 * make sure that joins with subsequent join groups are
                 * executed over *all* joint variables.
                 * 
                 * To do so, we start with the definitely produced vars of the
                 * group itself, retain those which are also definitely
                 * produced by one of the following join groups, and add them
                 * to the list of projected vars.
                 * 
                 * Note: retainAll() modifies the set stored at index i in
                 * place. That entry is not read again since later iterations
                 * only look at indices greater than their own.
                 */
                final Set<IVariable<?>> joinVarCandidates = 
                        complexGroupsDefiniteVars.get(i);

                final Set<IVariable<?>> subsequentGroupDefiniteVars = 
                        new HashSet<IVariable<?>>();

                for (int j = i + 1; j < complexGroupsDefiniteVars.size(); j++) {
                    subsequentGroupDefiniteVars
                            .addAll(complexGroupsDefiniteVars.get(j));
                }

                joinVarCandidates.retainAll(subsequentGroupDefiniteVars);

                projectedVars.addAll(joinVarCandidates);

                /*
                 * Having computed the projection vars, we're now ready to 
                 * build the projection clause for the current named subquery. 
                 */
                nsr.setProjection(newProjection(projectedVars));

                // Drop the temporary include again, except for the last
                // optional group whose include stays in the main group.
                if (i != complexGroups.size() - 1) {
                    if (!group.removeArg(anInclude))
                        throw new AssertionError();
                }

            }

            precedingSolutionName = solutionSetName;

        }

    }

}