/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Aug 30, 2011 */ package com.bigdata.rdf.sparql.ast.optimizers; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import com.bigdata.bop.BOp; import com.bigdata.bop.BOpUtility; import com.bigdata.bop.IBindingSet; import com.bigdata.bop.IVariable; import com.bigdata.rdf.sparql.ast.ASTUtil; import com.bigdata.rdf.sparql.ast.IQueryNode; import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode; import com.bigdata.rdf.sparql.ast.NamedSubqueryInclude; import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot; import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet; import com.bigdata.rdf.sparql.ast.QueryRoot; import com.bigdata.rdf.sparql.ast.StaticAnalysis; import com.bigdata.rdf.sparql.ast.SubqueryBase; import com.bigdata.rdf.sparql.ast.SubqueryRoot; import com.bigdata.rdf.sparql.ast.VarNode; import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext; import cutthecrap.utils.striterators.Striterator; /** * Class identifies the join variables for each instance in which a named * subquery solution set is incorporated into the query plan. * * @see NamedSubqueryRoot * @see NamedSubqueryInclude * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public class ASTNamedSubqueryOptimizer implements IASTOptimizer { // private static final Logger log = Logger // .getLogger(ASTNamedSubqueryOptimizer.class); /** * * @throws RuntimeException * if there is an {@link NamedSubqueryInclude} for a named * solution set which is not generated by the query. * @throws RuntimeException * if there is an {@link NamedSubqueryRoot} for a named solution * set which is not consumed by the query. * @throws RuntimeException * if there is more than one {@link NamedSubqueryRoot} for a * given named solution set. */ @Override public QueryNodeWithBindingSet optimize( final AST2BOpContext context, final QueryNodeWithBindingSet input) { final IQueryNode queryNode = input.getQueryNode(); final IBindingSet[] bindingSet = input.getBindingSets(); final QueryRoot queryRoot = (QueryRoot) queryNode; final NamedSubqueriesNode namedSubqueries = queryRoot .getNamedSubqueries(); if (namedSubqueries == null || namedSubqueries.isEmpty()) { // NOP. return new QueryNodeWithBindingSet(queryRoot, bindingSet); } /* * Order the named subqueries in order to support nested includes. * * Note: The named subqueries must form an acyclic graph. They can * INCLUDE one another, but not in patterns which form cycles. This puts * them into an evaluation order. */ orderNamedSubqueries(queryRoot, namedSubqueries); // The set of all INCLUDEs in the query. final NamedSubqueryInclude[] allIncludes = findAllIncludes(queryRoot); // Verify that a named subquery or solution set exists for each INCLUDE. assertNamedSubqueryForEachInclude(context, namedSubqueries, allIncludes); /* * Verify that each named subquery is consumed by at least one include * somewhere in the WHERE clause of the query. */ assertEachNamedSubqueryIsUsed(namedSubqueries, allIncludes); /* * Figure out the join variables for each INCLUDE. */ assignJoinVars(queryRoot, context, namedSubqueries, allIncludes); return new QueryNodeWithBindingSet(queryRoot, bindingSet); } /** * Return all {@link NamedSubqueryInclude}s which appear in the WHERE clause * of the main query. */ static private NamedSubqueryInclude[] findAllIncludes(final QueryRoot queryRoot) { final Striterator itr = new Striterator( BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause())); itr.addTypeFilter(NamedSubqueryInclude.class); final List<NamedSubqueryInclude> list = new LinkedList<NamedSubqueryInclude>(); while (itr.hasNext()) { list.add((NamedSubqueryInclude) itr.next()); } final Striterator itr2 = new Striterator( BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause())); itr2.addTypeFilter(SubqueryRoot.class); while (itr2.hasNext()) { list.addAll(findSubqueryIncludes((SubqueryRoot) itr2.next())); } if (queryRoot.getNamedSubqueries() != null) { for(NamedSubqueryRoot root:queryRoot.getNamedSubqueries()){ list.addAll(findSubqueryIncludes(root)); } } return list.toArray(new NamedSubqueryInclude[] {}); } /** * TODO This seems to be inefficient. We do not need to proceed * {@link SubqueryBase} by {@link SubqueryBase}. * {@link BOpUtility#visitAll(BOp, Class)} can be used to locate all * INCLUDEs in the entire query and then we can build up whatever indices we * need in optimize() and use them elsewhere as required. */ static private List<NamedSubqueryInclude> findSubqueryIncludes(final SubqueryBase queryRoot){ final Striterator itr = new Striterator( BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause())); itr.addTypeFilter(NamedSubqueryInclude.class); final List<NamedSubqueryInclude> list = new LinkedList<NamedSubqueryInclude>(); while (itr.hasNext()) { list.add((NamedSubqueryInclude) itr.next()); } final Striterator itr2 = new Striterator( BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause())); itr2.addTypeFilter(SubqueryRoot.class); while (itr2.hasNext()) { list.addAll(findSubqueryIncludes((SubqueryRoot) itr2.next())); } return list; } /** * Verify that a named subquery of solution set exists for each INCLUDE. * * @param context For querying solution sets * @param namedSubqueries * @param allIncludes */ static private void assertNamedSubqueryForEachInclude( final AST2BOpContext context, final NamedSubqueriesNode namedSubqueries, final NamedSubqueryInclude[] allIncludes) { for (NamedSubqueryInclude anInclude : allIncludes) { final String namedSet = anInclude.getName(); if (namedSet == null || namedSet.trim().length() == 0) throw new RuntimeException( "Missing or illegal name for include."); boolean found = false; for (NamedSubqueryRoot aNamedSubquery : namedSubqueries) { if (aNamedSubquery.getName().equals(namedSet)) { found = true; break; } } if (!found) { try { context.getSolutionSetStats(namedSet); // There is a named solution set so we are OK. } catch (RuntimeException e) { throw new RuntimeException( "No subquery produces the solution set: " + namedSet, e); } } } } /** * Verify that each named subquery is consumed by at least one include * somewhere in the WHERE clause of the query. * * @param namedSubqueries * @param allIncludes */ static private void assertEachNamedSubqueryIsUsed( final NamedSubqueriesNode namedSubqueries, final NamedSubqueryInclude[] allIncludes) { // The set of all named solution sets produced by this query. final Set<String> namedSets = new LinkedHashSet<String>(); for (NamedSubqueryRoot aNamedSubquery : namedSubqueries) { final String namedSet = aNamedSubquery.getName(); if (!namedSets.add(namedSet)) { throw new RuntimeException("NamedSet declared more than once: " + namedSet); } if (namedSet == null || namedSet.trim().length() == 0) throw new RuntimeException( "Missing or illegal name for named subquery."); final List<NamedSubqueryInclude> includes = new LinkedList<NamedSubqueryInclude>(); for (NamedSubqueryInclude anInclude : allIncludes) { if (namedSet.equals(anInclude.getName())) { includes.add(anInclude); } } if (includes.isEmpty()) { throw new RuntimeException( "Named subquery results are not used by this query: " + namedSet); } } } /** * Figure out the join variables for each INCLUDE. If the join variables * were already assigned to a {@link NamedSubqueryInclude}, then we just * make sure that the {@link NamedSubqueryRoot} will produce a suitable hash * index. If an INCLUDE does not have its join variables pre-assigned, then * we do a static analysis of the query and figure out which shared * variables MUST be bound. The set of shared variables is assigned as the * join variables. Again, we verify that a suitable hash index will be * produced for that INCLUDE. * <p> * Note: If the join variables were not pre-assigned (by a query hint) and * no join variables are identified by a static analysis then a full N x M * cross product of the solutions must be tested and filtered for those * solutions which join. This is a lot of effort when compared with a hash * join. Having the right join variables is very important for performance. * * @param namedSubqueries * @param allIncludes * * @see https://sourceforge.net/apps/trac/bigdata/ticket/405 */ static private void assignJoinVars(// final QueryRoot queryRoot,// final AST2BOpContext context,// final NamedSubqueriesNode namedSubqueries,// final NamedSubqueryInclude[] allIncludes) { final StaticAnalysis sa = new StaticAnalysis(queryRoot, context); for (NamedSubqueryRoot aNamedSubquery : namedSubqueries) { final String namedSet = aNamedSubquery.getName(); // Collect each INCLUDE for this named subquery. final List<NamedSubqueryInclude> includes = new LinkedList<NamedSubqueryInclude>(); { for (NamedSubqueryInclude anInclude : allIncludes) { if (namedSet.equals(anInclude.getName())) { includes.add(anInclude); } } } /* * Collect each distinct joinvar[] combination for those includes. * * Note: Since having the distinct joinvar[] combinations is * important, we sort each joinvar[] to ensure that they have a * common order. */ final Set<JoinVars> distinctJoinVarsSet = new LinkedHashSet<JoinVars>(); for (NamedSubqueryInclude anInclude : includes) { @SuppressWarnings("rawtypes") final IVariable[] joinvars; if (anInclude.getJoinVars() == null) { /** * Since no query hint was used, then figure out the join * variables using a static analysis of the query. * * Note: Since the named subqueries run with only the * exogenous bindings as input, anything which is * exogenously bound plus anything which is known bound can * serve as a join variable. * * TODO There is a StaticAnalysis bug - it fails to consider * the exogenous bindings when computing the definitely * bound variables. * * @see <a * href="https://sourceforge.net/apps/trac/bigdata/ticket/412"> * getDefinatelyBound() ignores exogenous variables * </a> * * TODO Optimize case where there are no exogenous * bindings such that the sole source solution for the * named subquery is an empty solution set. * * @see <a * href="http://sourceforge.net/apps/trac/bigdata/ticket/535"> * Optimize JOIN VARS for Sub-Selects </a> */ final Set<IVariable<?>> set = new LinkedHashSet<IVariable<?>>(); sa.getJoinVars(aNamedSubquery, anInclude, set); joinvars = set.toArray(new IVariable[set.size()]); // Sort. Arrays.sort(joinvars); // Set those join variables on the include. anInclude.setJoinVars(ASTUtil.convert(joinvars)); } else { // Get the user specified join variables. joinvars = ASTUtil.convert(anInclude.getJoinVars()); // Sort. Arrays.sort(joinvars); // Set them back on the include in sorted order. anInclude.setJoinVars(ASTUtil.convert(joinvars)); } distinctJoinVarsSet.add(new JoinVars(joinvars)); } /* * Figure out the join variables for each place in the query where * the named result set is included and annotate the include * operator to specify the join variables for that include. */ final int nhashIndices = distinctJoinVarsSet.size(); if (nhashIndices > 1) { /* * Since there is more than one set of join variables required * by the INCLUDEs, we use the largest subset of the join * variables defined across all of the includes. */ // First, collect all join variables. final Set<IVariable<?>> sharedVariables = new LinkedHashSet<IVariable<?>>(); for (JoinVars joinVars : distinctJoinVarsSet) { sharedVariables.addAll(joinVars.vars()); } // Now, retain only those variables in scope for each include. for (JoinVars joinVars : distinctJoinVarsSet) { sharedVariables.retainAll(joinVars.vars()); } /* * The join variables which are shared across all contexts in * which this named solution set is joined back into the query. */ final VarNode[] sharedJoinVars = ASTUtil .convert(sharedVariables.toArray(new IVariable[] {})); // Set the shared join variables on the named subquery. aNamedSubquery.setJoinVars(sharedJoinVars); for (NamedSubqueryInclude anInclude : includes) { // Set the shared join variables on each subquery include. anInclude.setJoinVars(sharedJoinVars); } } else { /* * Since there is just one set of join variables we will use * that. */ final JoinVars joinVars = distinctJoinVarsSet.iterator().next(); aNamedSubquery.setJoinVars(ASTUtil.convert(joinVars.toArray())); } } } /** * Order the named subqueries based on nested includes. * * TODO This should reuse the same arrays/collections that are generated for * the other logic in this class. No need to repeatedly traverse the query * looking for INCLUDEs. * * TODO This should use some generic topological sort algorithm. e.g. it is not obvious that * this code covers the case where two named subqueries include each other. */ static private void orderNamedSubqueries(final QueryRoot queryRoot, final NamedSubqueriesNode namedSubqueries) { // Map from solution set name to named subquery root. final Map<String, NamedSubqueryRoot> nameToSubquery = new LinkedHashMap<String, NamedSubqueryRoot>(); { for (NamedSubqueryRoot aNamedSubquery : namedSubqueries) { nameToSubquery.put(aNamedSubquery.getName(), aNamedSubquery); } } /* * Map from named subquery root to the list of named solutions on which * each named subquery depends. Those named solutions must be computed * before any named subquery root which will consume them. */ final Map<NamedSubqueryRoot, List<String>> subqueryToIncludes = new LinkedHashMap<NamedSubqueryRoot, List<String>>(); { for (NamedSubqueryRoot aNamedSubquery : namedSubqueries) { final List<String> includes = new LinkedList<String>(); final List<String> includesNamedSubqueries = new LinkedList<String>(); subqueryToIncludes.put(aNamedSubquery, includesNamedSubqueries); for (NamedSubqueryInclude include : findSubqueryIncludes(aNamedSubquery)) { String name = include.getName(); includes.add(name); if ( nameToSubquery.containsKey(name) ) { includesNamedSubqueries.add(name); } // else name gives a named solution set. } // Set the DEPENDS_ON annotation: named subqueries and solution sets aNamedSubquery.setDependsOn(includes.toArray(new String[0])); } } /* * Create a new NamedSubqueriesNode which corresponds to a valid * evaluation order for the named subqueries. */ { final Set<String> processed = new HashSet<String>(); final NamedSubqueriesNode newNode = new NamedSubqueriesNode(); Iterator<Map.Entry<NamedSubqueryRoot, List<String>>> iter = subqueryToIncludes .entrySet().iterator(); while (iter.hasNext()) { final Map.Entry<NamedSubqueryRoot, List<String>> entry = iter .next(); final NamedSubqueryRoot namedSubquery = entry.getKey(); if (entry.getValue().size() == 0) { newNode.add(namedSubquery); processed.add(namedSubquery.getName()); iter.remove(); } } while (subqueryToIncludes.size() > 0) { iter = subqueryToIncludes.entrySet().iterator(); while (iter.hasNext()) { boolean ok = true; final Map.Entry<NamedSubqueryRoot, List<String>> entry = iter .next(); for (String dep : entry.getValue()) { if (!processed.contains(dep)) { ok = false; break; } } if (ok) { newNode.add(entry.getKey()); processed.add(entry.getKey().getName()); iter.remove(); } } } // Update the QueryRoot with the named subquery evaluation order. queryRoot.setNamedSubqueries(newNode); } } /** * Wrapper class used to inflict Arrays.equals() rather than Object.equals() * when an array is used in a Collection. */ private static class JoinVars { private final Set<IVariable<?>> vars; private final int hashCode; public Set<IVariable<?>> vars() { return Collections.unmodifiableSet(vars); } public IVariable<?>[] toArray() { return vars.toArray(new IVariable[vars.size()]); } public JoinVars(final IVariable<?>[] vars) { this.vars = new LinkedHashSet<IVariable<?>>(); for (int i = 0; i < vars.length; i++) { this.vars.add(vars[i]); } this.hashCode = Arrays.hashCode(vars); } @Override public int hashCode() { return hashCode; } @Override public boolean equals(final Object o) { if (this == o) return true; if (!(o instanceof JoinVars)) return false; final JoinVars t = (JoinVars) o; return vars.equals(t.vars); // return Arrays.equals(vars, t.vars); } } }