ASTHashJoinOptimizer.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Oct 20, 2011
 */

package com.bigdata.rdf.sparql.ast.optimizers;

import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IVariable;
import com.bigdata.rdf.sparql.ast.FilterNode;
import com.bigdata.rdf.sparql.ast.GraphPatternGroup;
import com.bigdata.rdf.sparql.ast.IGroupMemberNode;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.JoinSetUtil;
import com.bigdata.rdf.sparql.ast.JoinSetUtil.VertexJoinSet;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;
import com.bigdata.rdf.sparql.ast.eval.IEvaluationContext;
import com.bigdata.rdf.sparql.ast.service.ServiceNode;

/**
 * Rewrites join groups having one or more joins which would involve a full
 * cross product as hash joins of sub-groups where there is a constraint imposed
 * indirectly via a FILTER operating across the variables bound by the joins.
 * This handles queries such as BSBM Q5.
 * <p>
 * Note: As currently implemented, this class only <em>identifies</em> pairs of
 * join sets which are indirectly connected through a FILTER and reports them
 * through the logger. The AST is not modified; the actual rewrite is still
 * outstanding (see the FIXME in
 * {@link #optimizeJoinGroups(IEvaluationContext, StaticAnalysis, GraphPatternGroup)}).
 * 
 * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/253">ticket
 *      253</a>
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class ASTHashJoinOptimizer implements IASTOptimizer {

    private static final Logger log = Logger
            .getLogger(ASTHashJoinOptimizer.class);

    /**
     * Applies the analysis to each pre-existing named subquery and then to the
     * main where clause of the query.
     * 
     * @param context
     *            The evaluation context.
     * @param input
     *            The query node and the binding sets flowing into it.
     * 
     * @return The query root paired with the (unchanged) input binding sets.
     *         When the query node is not a {@link QueryRoot} the
     *         <code>input</code> is returned as-is.
     */
    @Override
    public QueryNodeWithBindingSet optimize(
        final AST2BOpContext context, final QueryNodeWithBindingSet input) {

        final IQueryNode queryNode = input.getQueryNode();
        final IBindingSet[] bindingSets = input.getBindingSets();

        if (!(queryNode instanceof QueryRoot)) {
            /*
             * This optimizer only knows how to process a top-level query.
             * Pass anything else through untouched rather than failing with
             * a ClassCastException.
             */
            return input;
        }

        final QueryRoot queryRoot = (QueryRoot) queryNode;

        final StaticAnalysis sa = new StaticAnalysis(queryRoot, context);

        // First, process any pre-existing named subqueries.
        final NamedSubqueriesNode namedSubqueries = queryRoot
                .getNamedSubqueries();

        if (namedSubqueries != null) {

            for (NamedSubqueryRoot namedSubquery : namedSubqueries) {

                optimizeJoinGroups(context, sa, namedSubquery.getWhereClause());

            }

        }

        // Now process the main where clause.
        optimizeJoinGroups(context, sa, queryRoot.getWhereClause());

        return new QueryNodeWithBindingSet(queryRoot, bindingSets);

    }

    /**
     * Identify sets of joins which share variables only indirectly through a
     * constraint (FILTER). Such joins are pushed down into a sub-group along
     * with the constraint. The sub-group can be efficiently joined back to the
     * parent group (using a hash join) as long as there is a shared variable
     * between the sub-group and the parent (this condition is satisfied if one
     * of the joins shares a variable with the parent group).
     * <p>
     * Note: The push down described above is not implemented yet. This method
     * currently recurses through the child groups and logs each pair of join
     * sets which is connected only through a join filter.
     * 
     * @param context
     *            The evaluation context (not used by the current analysis).
     * @param sa
     *            The static analysis helper for the query.
     * @param group
     *            The group to analyze. A <code>null</code> group (no where
     *            clause) is ignored.
     */
    private void optimizeJoinGroups(final IEvaluationContext context,
            final StaticAnalysis sa,
            final GraphPatternGroup<IGroupMemberNode> group) {

        if (group == null) {
            // Nothing to analyze (e.g., a query without a where clause).
            return;
        }

        final int arity = group.arity();

        for (int i = 0; i < arity; i++) {

            final BOp child = (BOp) group.get(i);

            if (child instanceof GraphPatternGroup<?>) {

                /*
                 * Note: Do recursion *before* we do the rewrite.
                 * 
                 * The cast is unchecked because the type parameter is erased
                 * at runtime; it mirrors the declared type of the parent
                 * group.
                 */
                @SuppressWarnings("unchecked")
                final GraphPatternGroup<IGroupMemberNode> childGroup =
                        (GraphPatternGroup<IGroupMemberNode>) child;

                optimizeJoinGroups(context, sa, childGroup);

            } else if (child instanceof ServiceNode) {

                // Do not rewrite things inside of a SERVICE node.
                continue;

            }

        }

        /*
         * Analyze the joins in the group.
         */
        final JoinSetUtil joinSets = new JoinSetUtil(sa, null/* knownBound */,
                group);

        if (joinSets.joinFilters.isEmpty()) {
            /*
             * All of the required joins in this group can be made using
             * directly shared variables.
             */
            return;
        }

        /*
         * When there is more than one join set: Join sets, by definition, are
         * sets of vertices with disjoint sets of variables (no variables are
         * shared between different join sets).
         * 
         * This means that we will have a full cross product between those join
         * sets unless there are some filters which indirectly share variables
         * and hence constrain the joins between those join sets.
         * 
         * FIXME Identify joins which depend on indirectly shared variables and
         * push them down into a sub-group. The largest join set stays in this
         * group. The other join sets get pushed down into subgroups. Any filter
         * which is fully bound in the subgroup moves into the subgroup. This
         * needs to be done with some awareness of a good join order (e.g.,
         * after the static optimizer) since we need to know which joins will
         * run before each subgroup that we push down in order to know what
         * variables are known bound on entry.
         * 
         * TODO Should we always push down a sub-group for a disjoint join set?
         * A hash join may very well be more efficient than a pipelined join as
         * the sub-group can run once (actually, it could be lifted out into a
         * named subquery) and the hash join will wind up doing less work than a
         * pipelined join.
         * 
         * TODO This process could use a recursive expansion in case we can
         * connect things transitively. It is really just like computing the
         * direct join sets. The inner loop would scan everything not yet "used"
         * and recurse if we are able to build up a larger join set.
         * 
         * TODO A DISTINCT projection into the sub-group would benefit BSBM Q5,
         * but we need to recognize that the DISTINCT projection is Ok based on
         * both the top-level projection and the fact that the join to pick up
         * the other variable (productLabel) would occur against the distinct
         * variable (product).
         */
        final int directJoinSetCount = joinSets.directJoinSets.size();

        if (directJoinSetCount <= 1) {
            // A single join set can not have a cross product with itself.
            return;
        }

        final VertexJoinSet[] a = joinSets.directJoinSets
                .toArray(new VertexJoinSet[directJoinSetCount]);

        /*
         * Sort into order by decreasing join set size (#of vertices). This
         * lets us handle the join groups with the most vertices first.
         */
        Arrays.sort(a, new VertexJoinSetComparator());

        // Consider each unordered pair (i,j) of join sets exactly once.
        for (int i = 0; i < a.length; i++) {

            final Set<IVariable<?>> set1 = a[i].joinvars;

            for (int j = i + 1; j < a.length; j++) {

                final Set<IVariable<?>> set2 = a[j].joinvars;

                // The variables available when both join sets are combined.
                final Set<IVariable<?>> joinvars = new LinkedHashSet<IVariable<?>>();
                joinvars.addAll(set1);
                joinvars.addAll(set2);

                for (FilterNode f : joinSets.joinFilters) {

                    if (sa.isFullyBound(f, set1)
                            || sa.isFullyBound(f, set2)) {
                        // filter runs with one of the join sets.
                        continue;
                    }

                    if (sa.isFullyBound(f, joinvars)) {
                        /*
                         * This join filter does not run with either of those
                         * join sets when considered by themselves, but it can
                         * run when we consider those join sets together. Thus
                         * the filter implicitly shares some variables across
                         * the join sets and provides a join which is at least
                         * somewhat constrained.
                         * 
                         * Note: This is a diagnostic finding, not a failure,
                         * so it is reported at INFO rather than ERROR.
                         */
                        if (log.isInfoEnabled()) {
                            log.info("indirect join: joinSet1=" + a[i]
                                    + ",joinSet2=" + a[j] + " on filter=" + f);
                        }
                    }

                }

            }

        }

    }

    /**
     * Place {@link VertexJoinSet}s into decreasing order by the #of vertices.
     */
    private static class VertexJoinSetComparator implements
            Comparator<VertexJoinSet> {

        @Override
        public int compare(final VertexJoinSet o1, final VertexJoinSet o2) {

            /*
             * Reversed operands give the descending order. The subtraction can
             * not overflow since collection sizes are never negative.
             */
            return o2.vertices.size() - o1.vertices.size();

        }

    }

}