/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Oct 20, 2011 */ package com.bigdata.rdf.sparql.ast.optimizers; import java.util.Arrays; import java.util.Comparator; import java.util.LinkedHashSet; import java.util.Set; import org.apache.log4j.Logger; import com.bigdata.bop.BOp; import com.bigdata.bop.IBindingSet; import com.bigdata.bop.IVariable; import com.bigdata.rdf.sparql.ast.FilterNode; import com.bigdata.rdf.sparql.ast.GraphPatternGroup; import com.bigdata.rdf.sparql.ast.IGroupMemberNode; import com.bigdata.rdf.sparql.ast.IQueryNode; import com.bigdata.rdf.sparql.ast.JoinSetUtil; import com.bigdata.rdf.sparql.ast.JoinSetUtil.VertexJoinSet; import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode; import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot; import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet; import com.bigdata.rdf.sparql.ast.QueryRoot; import com.bigdata.rdf.sparql.ast.StaticAnalysis; import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext; import com.bigdata.rdf.sparql.ast.eval.IEvaluationContext; import com.bigdata.rdf.sparql.ast.service.ServiceNode; /** * Rewrites join groups having one or more joins which would involve a full * cross product as hash joins of sub-groups where there is a constraint imposed * indirectly via a FILTER operating across the variables bound by the joins. * This handles queries such as BSBM Q5. * * @see https://sourceforge.net/apps/trac/bigdata/ticket/253 * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public class ASTHashJoinOptimizer implements IASTOptimizer { private static final Logger log = Logger .getLogger(ASTHashJoinOptimizer.class); @Override public QueryNodeWithBindingSet optimize( final AST2BOpContext context, final QueryNodeWithBindingSet input) { final IQueryNode queryNode = input.getQueryNode(); final IBindingSet[] bindingSets = input.getBindingSets(); final QueryRoot queryRoot = (QueryRoot) queryNode; final StaticAnalysis sa = new StaticAnalysis(queryRoot, context); // First, process any pre-existing named subqueries. { final NamedSubqueriesNode namedSubqueries = queryRoot .getNamedSubqueries(); if (namedSubqueries != null) { for (NamedSubqueryRoot namedSubquery : namedSubqueries) { optimizeJoinGroups(context, sa, namedSubquery.getWhereClause()); } } } // Now process the main where clause. optimizeJoinGroups(context, sa, queryRoot.getWhereClause()); return new QueryNodeWithBindingSet(queryRoot, bindingSets); } /** * Identify sets of joins which share variables only indirectly through a * constraint (FILTER). Such joins are pushed down into a sub-group along * with the constraint. The sub-group can be efficiently joined back to the * parent group (using a hash join) as long as there is a shared variable * between the sub-group and the parent (this condition is satisified if one * of the joins shares a variable with the parent group). */ private void optimizeJoinGroups(final IEvaluationContext context, final StaticAnalysis sa, final GraphPatternGroup<IGroupMemberNode> group) { final int arity = group.arity(); for (int i = 0; i < arity; i++) { final BOp child = (BOp) group.get(i); if (child instanceof GraphPatternGroup<?>) { /* * Note: Do recursion *before* we do the rewrite. */ optimizeJoinGroups(context, sa, ((GraphPatternGroup<IGroupMemberNode>) child)); } else if (child instanceof ServiceNode) { // Do not rewrite things inside of a SERVICE node. continue; } } /* * Analyze the joins in the group. */ final JoinSetUtil joinSets = new JoinSetUtil(sa, null/* knownBound */, group); if (joinSets.joinFilters.isEmpty()) { /* * All of the required joins in this group can be made using * directly shared variables. */ return; } /* * There is more than one join set. Join sets, by definition, are sets * of vertices with disjoint sets of variables (no variables are shared * between different join sets). * * This means that we will have a full cross product between those join * sets unless there are some filters which indirectly share variables * and hence constraint the joins between those join sets. * * FIXME Identify joins which depend on indirectly shared variables and * push them down into a sub-group. The largest join set stays in this * group. The other join sets get pushed down into subgroups. Any filter * which is fully bound in the subgroup moves into the subgroup. This * needs to be done with some awareness of a good join order (e.g., * after the static optimizer) since we need to know which joins will * run before each subgroup that we push down in order to know what * variables are known bound on entry. * * TODO Should we always push down a sub-group for a disjoint join set? * A hash join may very well be more efficient than a pipelined join as * the sub-group can run once (actually, it could be lifted out into a * named subquery) and the hash join will wind up doing less work than a * pipelined join. * * TODO This process could use a recursive expansion in case we can * connect things transitively. It is really just like computing the * direct join sets. The inner loop would scan everything not yet "used" * and recurse if we are able to build up a larger join set. * * TODO A DISTINCT projection into the sub-group would benefit BSBM Q5, * but we need to recognize that the DISTINCT projection is Ok based on * both the top-level projection and the fact that the join to pick up * the other variable (productLabel) would occur against the distinct * variable (product). */ final int directJoinSetCount = joinSets.directJoinSets.size(); if (directJoinSetCount > 1) { final VertexJoinSet[] a = joinSets.directJoinSets .toArray(new VertexJoinSet[directJoinSetCount]); /* * Sort into order by decreasing join set size (#of vertices). This * let's us handle the join groups with the most vertices first. */ Arrays.sort(a, new VertexJoinSetComparator()); for (int i = 0; i < a.length; i++) { for (int j = i + 1; j < a.length; j++) { final Set<IVariable<?>> set1 = a[i].joinvars; final Set<IVariable<?>> set2 = a[j].joinvars; final Set<IVariable<?>> joinvars = new LinkedHashSet<IVariable<?>>(); joinvars.addAll(set1); joinvars.addAll(set2); for (FilterNode f : joinSets.joinFilters) { if (sa.isFullyBound(f, set1) || sa.isFullyBound(f, set2)) { // filter runs with one of the join sets. continue; } if (sa.isFullyBound(f, joinvars)) { /* * This join filter does not run with either of * those join sets which considered by themselves, * but it can run when we consider those join sets * together. Thus the filter implicitly shares some * variables across the join sets and provides a * join which is at least somewhat constrained. */ log.error("indirect join: joinSet1=" + a[i] + ",joinSet2=" + a[j] + " on filter=" + f); } } } } } } /** * Place {@link VertexJoinSet}s into decreasing order by the #of vertices. */ private static class VertexJoinSetComparator implements Comparator<VertexJoinSet> { @Override public int compare(VertexJoinSet o1, VertexJoinSet o2) { return o2.vertices.size() - o1.vertices.size(); } } }