ASTNamedSubqueryOptimizer.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Aug 30, 2011
 */

package com.bigdata.rdf.sparql.ast.optimizers;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IVariable;
import com.bigdata.rdf.sparql.ast.ASTUtil;
import com.bigdata.rdf.sparql.ast.IQueryNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueriesNode;
import com.bigdata.rdf.sparql.ast.NamedSubqueryInclude;
import com.bigdata.rdf.sparql.ast.NamedSubqueryRoot;
import com.bigdata.rdf.sparql.ast.QueryNodeWithBindingSet;
import com.bigdata.rdf.sparql.ast.QueryRoot;
import com.bigdata.rdf.sparql.ast.StaticAnalysis;
import com.bigdata.rdf.sparql.ast.SubqueryBase;
import com.bigdata.rdf.sparql.ast.SubqueryRoot;
import com.bigdata.rdf.sparql.ast.VarNode;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpContext;

import cutthecrap.utils.striterators.Striterator;

/**
 * Class identifies the join variables for each instance in which a named
 * subquery solution set is incorporated into the query plan.
 *
 * @see NamedSubqueryRoot
 * @see NamedSubqueryInclude
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class ASTNamedSubqueryOptimizer implements IASTOptimizer {

//    private static final Logger log = Logger
//            .getLogger(ASTNamedSubqueryOptimizer.class);
    
    /**
     * Validates the named subqueries of the query against its INCLUDEs and
     * annotates both with the join variables to be used.
     * <p>
     * When the query declares no named subqueries this is a NOP. Otherwise the
     * named subqueries are put into an evaluation order, each INCLUDE is
     * checked against the available solution sets, each named subquery is
     * checked for at least one consumer, and finally the join variables are
     * assigned.
     *
     * @param context
     *            The evaluation context.
     * @param input
     *            The query node (which is cast to {@link QueryRoot}) together
     *            with its binding sets.
     *
     * @return The same {@link QueryRoot} paired with the same binding sets.
     *
     * @throws RuntimeException
     *             if there is an {@link NamedSubqueryInclude} for a named
     *             solution set which is not generated by the query.
     * @throws RuntimeException
     *             if there is an {@link NamedSubqueryRoot} for a named solution
     *             set which is not consumed by the query.
     * @throws RuntimeException
     *             if there is more than one {@link NamedSubqueryRoot} for a
     *             given named solution set.
     */
    @Override
    public QueryNodeWithBindingSet optimize(
        final AST2BOpContext context, final QueryNodeWithBindingSet input) {

        final IQueryNode node = input.getQueryNode();
        final IBindingSet[] bindings = input.getBindingSets();

        final QueryRoot root = (QueryRoot) node;

        final NamedSubqueriesNode declared = root.getNamedSubqueries();

        final boolean hasNamedSubqueries = declared != null
                && !declared.isEmpty();

        if (hasNamedSubqueries) {

            /*
             * Named subqueries may INCLUDE one another as long as they do not
             * form a cycle. Put them into an order in which each one runs
             * after the named subqueries on which it depends.
             */
            orderNamedSubqueries(root, declared);

            // Every INCLUDE appearing anywhere in the query.
            final NamedSubqueryInclude[] includes = findAllIncludes(root);

            // Each INCLUDE must be backed by a named subquery or solution set.
            assertNamedSubqueryForEachInclude(context, declared, includes);

            // Each named subquery must be consumed by at least one INCLUDE.
            assertEachNamedSubqueryIsUsed(declared, includes);

            // Decide the join variables for each INCLUDE.
            assignJoinVars(root, context, declared, includes);

        }

        // The query root is modified in place; without named subqueries
        // nothing was touched.
        return new QueryNodeWithBindingSet(root, bindings);

    }

    /**
     * Collect the {@link NamedSubqueryInclude}s of the query. The result is the
     * concatenation of:
     * <ol>
     * <li>the INCLUDEs visited by a post-order traversal of the top-level
     * WHERE clause;</li>
     * <li>the INCLUDEs reported by {@link #findSubqueryIncludes(SubqueryBase)}
     * for each {@link SubqueryRoot} visited by that same traversal; and</li>
     * <li>the INCLUDEs reported by {@link #findSubqueryIncludes(SubqueryBase)}
     * for each declared {@link NamedSubqueryRoot}, if any.</li>
     * </ol>
     * NOTE(review): the three passes are simply appended to one list. If the
     * post-order traversal already descends into sub-selects then the same
     * INCLUDE could be listed more than once -- confirm against
     * {@link BOpUtility#postOrderIterator(BOp)}.
     *
     * @param queryRoot
     *            The top-level query.
     *
     * @return The INCLUDEs in the order in which they were discovered.
     */
    static private NamedSubqueryInclude[] findAllIncludes(final QueryRoot queryRoot) {

        // Pass 1: INCLUDEs in the top-level WHERE clause.
        final Striterator includeItr = new Striterator(
                BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause()));

        includeItr.addTypeFilter(NamedSubqueryInclude.class);

        final List<NamedSubqueryInclude> found = new LinkedList<NamedSubqueryInclude>();

        while (includeItr.hasNext()) {
            found.add((NamedSubqueryInclude) includeItr.next());
        }

        // Pass 2: INCLUDEs inside of each sub-select of the WHERE clause.
        final Striterator subSelectItr = new Striterator(
                BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause()));

        subSelectItr.addTypeFilter(SubqueryRoot.class);

        while (subSelectItr.hasNext()) {
            final SubqueryRoot subSelect = (SubqueryRoot) subSelectItr.next();
            found.addAll(findSubqueryIncludes(subSelect));
        }

        // Pass 3: INCLUDEs inside of the named subqueries themselves.
        final NamedSubqueriesNode named = queryRoot.getNamedSubqueries();

        if (named != null) {
            for (final NamedSubqueryRoot declared : named) {
                found.addAll(findSubqueryIncludes(declared));
            }
        }

        return found.toArray(new NamedSubqueryInclude[found.size()]);

    }

    /**
     * Collect the {@link NamedSubqueryInclude}s of a subquery: first those
     * visited by a post-order traversal of its WHERE clause, then (recursively)
     * those of each {@link SubqueryRoot} visited by that same traversal.
     * <p>
     * TODO This seems to be inefficient. We do not need to proceed
     * {@link SubqueryBase} by {@link SubqueryBase}.
     * {@link BOpUtility#visitAll(BOp, Class)} can be used to locate all
     * INCLUDEs in the entire query and then we can build up whatever indices we
     * need in optimize() and use them elsewhere as required.
     *
     * @param queryRoot
     *            The subquery (or named subquery) to be searched.
     *
     * @return The INCLUDEs in the order in which they were discovered.
     */
    static private List<NamedSubqueryInclude> findSubqueryIncludes(final SubqueryBase queryRoot){

        // INCLUDEs which appear in the WHERE clause of this subquery.
        final Striterator includeItr = new Striterator(
                BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause()));

        includeItr.addTypeFilter(NamedSubqueryInclude.class);

        final List<NamedSubqueryInclude> found = new LinkedList<NamedSubqueryInclude>();

        while (includeItr.hasNext()) {
            final NamedSubqueryInclude include = (NamedSubqueryInclude) includeItr.next();
            found.add(include);
        }

        // Recurse into each sub-select of that WHERE clause.
        final Striterator subSelectItr = new Striterator(
                BOpUtility.postOrderIterator((BOp) queryRoot.getWhereClause()));

        subSelectItr.addTypeFilter(SubqueryRoot.class);

        while (subSelectItr.hasNext()) {
            final SubqueryRoot nested = (SubqueryRoot) subSelectItr.next();
            found.addAll(findSubqueryIncludes(nested));
        }

        return found;

    }

    /**
     * Verify that a named subquery or a named solution set exists for each
     * INCLUDE.
     * <p>
     * An INCLUDE whose name does not match any declared named subquery is
     * looked up through {@link AST2BOpContext#getSolutionSetStats(String)}. A
     * {@link RuntimeException} from that lookup is reported as a missing
     * solution set (with the original exception as the cause).
     *
     * @param context
     *            Used to look up named solution sets.
     * @param namedSubqueries
     *            The named subqueries declared by the query.
     * @param allIncludes
     *            The INCLUDEs to be checked.
     *
     * @throws RuntimeException
     *             if an INCLUDE has a <code>null</code> or blank name.
     * @throws RuntimeException
     *             if neither a named subquery nor a named solution set could
     *             be found for an INCLUDE.
     */
    static private void assertNamedSubqueryForEachInclude(
            final AST2BOpContext context,
            final NamedSubqueriesNode namedSubqueries,
            final NamedSubqueryInclude[] allIncludes) {

        for (final NamedSubqueryInclude include : allIncludes) {

            final String name = include.getName();

            if (name == null || name.trim().isEmpty()) {
                throw new RuntimeException(
                        "Missing or illegal name for include.");
            }

            // Look for a named subquery which produces this solution set.
            boolean producedByQuery = false;

            final Iterator<NamedSubqueryRoot> candidates = namedSubqueries
                    .iterator();

            while (!producedByQuery && candidates.hasNext()) {
                producedByQuery = candidates.next().getName().equals(name);
            }

            if (producedByQuery) {
                continue;
            }

            // Otherwise the name must identify a named solution set.
            try {
                context.getSolutionSetStats(name);
            } catch (RuntimeException e) {
                throw new RuntimeException(
                        "No subquery produces the solution set: " + name, e);
            }

        }

    }

    /**
     * Verify that each named subquery has a unique, non-blank name and that
     * its solution set is consumed by at least one of the given INCLUDEs.
     *
     * @param namedSubqueries
     *            The named subqueries declared by the query.
     * @param allIncludes
     *            The INCLUDEs found in the query.
     *
     * @throws RuntimeException
     *             if two named subqueries declare the same name.
     * @throws RuntimeException
     *             if a named subquery has a <code>null</code> or blank name.
     * @throws RuntimeException
     *             if no INCLUDE references a named subquery.
     */
    static private void assertEachNamedSubqueryIsUsed(
            final NamedSubqueriesNode namedSubqueries,
            final NamedSubqueryInclude[] allIncludes) {

        // Names seen so far (used to detect duplicate declarations).
        final Set<String> declaredNames = new LinkedHashSet<String>();

        for (final NamedSubqueryRoot subquery : namedSubqueries) {

            final String name = subquery.getName();

            final boolean firstDeclaration = declaredNames.add(name);

            if (!firstDeclaration) {
                throw new RuntimeException("NamedSet declared more than once: "
                        + name);
            }

            if (name == null || name.trim().isEmpty()) {
                throw new RuntimeException(
                        "Missing or illegal name for named subquery.");
            }

            // Search for any INCLUDE which consumes this solution set.
            boolean consumed = false;

            for (int i = 0; i < allIncludes.length && !consumed; i++) {
                consumed = name.equals(allIncludes[i].getName());
            }

            if (!consumed) {
                throw new RuntimeException(
                        "Named subquery results are not used by this query: "
                                + name);
            }

        }

    }

    /**
     * Decide the join variables for each INCLUDE and for the named subquery
     * which produces its solution set.
     * <p>
     * For each INCLUDE: when join variables were already assigned they are
     * kept, otherwise they are computed by
     * {@link StaticAnalysis#getJoinVars}. Either way the variables are sorted
     * and written back onto the INCLUDE. Then, per named subquery:
     * <ul>
     * <li>if all of its INCLUDEs agree on one set of join variables, that set
     * is assigned to the named subquery;</li>
     * <li>otherwise the intersection of those sets is assigned to the named
     * subquery and to each of its INCLUDEs.</li>
     * </ul>
     * Note: If the join variables were not pre-assigned (by a query hint) and
     * no join variables are identified by a static analysis then a full N x M
     * cross product of the solutions must be tested and filtered for those
     * solutions which join. This is a lot of effort when compared with a hash
     * join. Having the right join variables is very important for performance.
     *
     * @param queryRoot
     *            The top-level query (input to the static analysis).
     * @param context
     *            The evaluation context (input to the static analysis).
     * @param namedSubqueries
     *            The named subqueries to be annotated.
     * @param allIncludes
     *            All INCLUDEs found in the query.
     *
     * @throws java.util.NoSuchElementException
     *             if a named subquery is not referenced by any of the given
     *             INCLUDEs.
     *
     * @see https://sourceforge.net/apps/trac/bigdata/ticket/405
     */
    static private void assignJoinVars(//
            final QueryRoot queryRoot,//
            final AST2BOpContext context,//
            final NamedSubqueriesNode namedSubqueries,//
            final NamedSubqueryInclude[] allIncludes) {

        final StaticAnalysis analysis = new StaticAnalysis(queryRoot, context);

        for (final NamedSubqueryRoot subquery : namedSubqueries) {

            final String solutionSetName = subquery.getName();

            // Step 1: the INCLUDEs which consume this solution set.
            final List<NamedSubqueryInclude> consumers = new LinkedList<NamedSubqueryInclude>();

            for (final NamedSubqueryInclude candidate : allIncludes) {
                if (solutionSetName.equals(candidate.getName())) {
                    consumers.add(candidate);
                }
            }

            /*
             * Step 2: the distinct join variable combinations used by those
             * INCLUDEs. Each combination is sorted first so that all of them
             * share a common order.
             */
            final Set<JoinVars> distinctCombinations = new LinkedHashSet<JoinVars>();

            for (final NamedSubqueryInclude consumer : consumers) {

                @SuppressWarnings("rawtypes")
                final IVariable[] vars;

                if (consumer.getJoinVars() != null) {

                    // The join variables were specified up front.
                    vars = ASTUtil.convert(consumer.getJoinVars());

                } else {

                    /**
                     * No join variables were given, so derive them from a
                     * static analysis of the query.
                     *
                     * Note: Since the named subqueries run with only the
                     * exogenous bindings as input, anything which is
                     * exogenously bound plus anything which is known bound can
                     * serve as a join variable.
                     *
                     * TODO There is a StaticAnalysis bug - it fails to consider
                     * the exogenous bindings when computing the definitely
                     * bound variables.
                     *
                     * @see <a
                     *      href="https://sourceforge.net/apps/trac/bigdata/ticket/412">
                     *      getDefinatelyBound() ignores exogenous variables
                     *      </a>
                     *
                     *      TODO Optimize case where there are no exogenous
                     *      bindings such that the sole source solution for the
                     *      named subquery is an empty solution set.
                     *
                     * @see <a
                     *      href="http://sourceforge.net/apps/trac/bigdata/ticket/535">
                     *      Optimize JOIN VARS for Sub-Selects </a>
                     */
                    final Set<IVariable<?>> inferred = new LinkedHashSet<IVariable<?>>();

                    analysis.getJoinVars(subquery, consumer, inferred);

                    vars = inferred.toArray(new IVariable[inferred.size()]);

                }

                // Both cases: sort, then store the sorted variables on the
                // INCLUDE.
                Arrays.sort(vars);

                consumer.setJoinVars(ASTUtil.convert(vars));

                distinctCombinations.add(new JoinVars(vars));

            }

            /*
             * Step 3: annotate the named subquery (and, where the INCLUDEs
             * disagree, the INCLUDEs as well).
             */
            if (distinctCombinations.size() <= 1) {

                // Just one combination of join variables, so use it as is.
                final JoinVars only = distinctCombinations.iterator().next();

                subquery.setJoinVars(ASTUtil.convert(only.toArray()));

                continue;

            }

            /*
             * The INCLUDEs require more than one combination of join
             * variables. Use the variables which are common to all of them:
             * start from the union and then intersect with each combination.
             */
            final Set<IVariable<?>> common = new LinkedHashSet<IVariable<?>>();

            for (final JoinVars combination : distinctCombinations) {
                common.addAll(combination.vars());
            }

            for (final JoinVars combination : distinctCombinations) {
                common.retainAll(combination.vars());
            }

            final VarNode[] commonJoinVars = ASTUtil.convert(common
                    .toArray(new IVariable[] {}));

            // The same join variables go on the producer ...
            subquery.setJoinVars(commonJoinVars);

            // ... and on each consumer.
            for (final NamedSubqueryInclude consumer : consumers) {
                consumer.setJoinVars(commonJoinVars);
            }

        }

    }

    /**
     * Order the named subqueries based on nested includes and install the
     * ordered {@link NamedSubqueriesNode} on the {@link QueryRoot}.
     * <p>
     * As a side effect, each named subquery is annotated (via
     * <code>setDependsOn</code>) with the names of all solution sets which it
     * INCLUDEs. Only those names which identify another named subquery of this
     * query constrain the ordering; any other name is taken to identify a
     * named solution set.
     * <p>
     * The named subqueries without such dependencies come first (in their
     * declaration order). The remaining ones are appended in repeated passes,
     * each pass adding those whose dependencies have all been placed already.
     *
     * @param queryRoot
     *            The query whose named subqueries node is replaced.
     * @param namedSubqueries
     *            The named subqueries as declared (this node is not modified).
     *
     * @throws RuntimeException
     *             if the named subqueries cannot be ordered because they
     *             INCLUDE one another in a cycle (this also covers a named
     *             subquery which INCLUDEs itself).
     *
     * TODO This should reuse the same arrays/collections that are generated for
     * the other logic in this class. No need to repeatedly traverse the query
     * looking for INCLUDEs.
     *
     * TODO This could use some generic topological sort algorithm.
     */
    static private void orderNamedSubqueries(final QueryRoot queryRoot,
            final NamedSubqueriesNode namedSubqueries) {

        // Map from solution set name to named subquery root.
        final Map<String, NamedSubqueryRoot> nameToSubquery = new LinkedHashMap<String, NamedSubqueryRoot>();

        for (NamedSubqueryRoot aNamedSubquery : namedSubqueries) {

            nameToSubquery.put(aNamedSubquery.getName(), aNamedSubquery);

        }

        /*
         * Map from named subquery root to the list of named solutions on which
         * each named subquery depends. Those named solutions must be computed
         * before any named subquery root which will consume them.
         */
        final Map<NamedSubqueryRoot, List<String>> subqueryToIncludes = new LinkedHashMap<NamedSubqueryRoot, List<String>>();

        for (NamedSubqueryRoot aNamedSubquery : namedSubqueries) {

            // Everything which is INCLUDEd by this named subquery.
            final List<String> includes = new LinkedList<String>();

            // The subset of those names produced by other named subqueries.
            final List<String> includesNamedSubqueries = new LinkedList<String>();

            for (NamedSubqueryInclude include : findSubqueryIncludes(aNamedSubquery)) {

                final String name = include.getName();

                includes.add(name);

                if (nameToSubquery.containsKey(name)) {

                    includesNamedSubqueries.add(name);

                } // else name gives a named solution set.

            }

            // Set the DEPENDS_ON annotation: named subqueries and solution sets
            aNamedSubquery.setDependsOn(includes.toArray(new String[0]));

            // Register only after the annotation has been set so the key is
            // not mutated while it is in the map.
            subqueryToIncludes.put(aNamedSubquery, includesNamedSubqueries);

        }

        /*
         * Create a new NamedSubqueriesNode which corresponds to a valid
         * evaluation order for the named subqueries.
         */

        // Names of the named subqueries which have already been placed.
        final Set<String> processed = new HashSet<String>();

        final NamedSubqueriesNode newNode = new NamedSubqueriesNode();

        // First, the named subqueries which do not depend on any other.
        Iterator<Map.Entry<NamedSubqueryRoot, List<String>>> iter = subqueryToIncludes
                .entrySet().iterator();

        while (iter.hasNext()) {

            final Map.Entry<NamedSubqueryRoot, List<String>> entry = iter
                    .next();

            if (entry.getValue().isEmpty()) {

                final NamedSubqueryRoot namedSubquery = entry.getKey();

                newNode.add(namedSubquery);
                processed.add(namedSubquery.getName());
                iter.remove();

            }

        }

        // Then, repeatedly place whatever has become eligible.
        while (!subqueryToIncludes.isEmpty()) {

            boolean progress = false;

            iter = subqueryToIncludes.entrySet().iterator();

            while (iter.hasNext()) {

                final Map.Entry<NamedSubqueryRoot, List<String>> entry = iter
                        .next();

                if (processed.containsAll(entry.getValue())) {

                    newNode.add(entry.getKey());
                    processed.add(entry.getKey().getName());
                    iter.remove();
                    progress = true;

                }

            }

            if (!progress) {

                /*
                 * A full pass placed nothing, so no further pass ever will:
                 * each remaining named subquery is waiting on another
                 * remaining one. Fail rather than spin forever.
                 */
                final List<String> unresolved = new LinkedList<String>();

                for (NamedSubqueryRoot remaining : subqueryToIncludes.keySet()) {

                    unresolved.add(remaining.getName());

                }

                throw new RuntimeException(
                        "Named subqueries have cyclic INCLUDE dependencies: "
                                + unresolved);

            }

        }

        // Update the QueryRoot with the named subquery evaluation order.
        queryRoot.setNamedSubqueries(newNode);

    }

    /**
     * Wrapper class which gives a combination of join variables value
     * semantics so that distinct combinations can be collected in a
     * {@link Set}. Two instances are equal when they hold the same set of
     * variables, regardless of the order or multiplicity of the variables in
     * the arrays from which they were created.
     * <p>
     * The insertion order of the variables is retained for {@link #vars()} and
     * {@link #toArray()}.
     */
    private static class JoinVars {

        /** The distinct variables in the order in which they were given. */
        private final Set<IVariable<?>> vars;

        /** Cached hash code (consistent with {@link #equals(Object)}). */
        private final int hashCode;

        /**
         * Read-only view of the join variables.
         */
        public Set<IVariable<?>> vars() {

            return Collections.unmodifiableSet(vars);

        }

        /**
         * The join variables as a new array (duplicates removed).
         */
        public IVariable<?>[] toArray() {

            return vars.toArray(new IVariable[vars.size()]);

        }

        /**
         * @param vars
         *            The join variables. Duplicates are dropped.
         */
        public JoinVars(final IVariable<?>[] vars) {

            this.vars = new LinkedHashSet<IVariable<?>>(Arrays.asList(vars));

            /*
             * The hash code must be derived from the same state that equals()
             * compares. Hashing the array instead would be sensitive to order
             * and to duplicates, so instances which are equal as sets could
             * end up with different hash codes.
             */
            this.hashCode = this.vars.hashCode();

        }

        @Override
        public int hashCode() {
            return hashCode;
        }

        @Override
        public boolean equals(final Object o) {
            if (this == o)
                return true;
            if (!(o instanceof JoinVars))
                return false;
            final JoinVars t = (JoinVars) o;
            return vars.equals(t.vars);
        }

    }

}