// JoinSetUtil.java example
//
// Explorer
// blazegraph-master
// - database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Oct 21, 2011
 */

package com.bigdata.rdf.sparql.ast;

import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.IVariable;
import com.bigdata.rdf.sparql.ast.optimizers.ASTHashJoinOptimizer;

/**
 * Utility class for join analysis.
 * 
 * TODO Surely we can do some bit math which would be slimmer and faster than
 * managing the IVariable sets?
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class JoinSetUtil {

    /*
     * NOTE(review): the logger is obtained for ASTHashJoinOptimizer rather
     * than for JoinSetUtil, so the output of this class is reported under
     * (and enabled by) that optimizer's logging category. Confirm that this
     * is intentional and not a copy/paste artifact.
     */
    private static final Logger log = Logger
            .getLogger(ASTHashJoinOptimizer.class);

    /**
     * The group which was analyzed (as given to the constructor).
     */
    public final GraphPatternGroup<IGroupMemberNode> group;

    /**
     * The variables known to be bound on entry to the group. This is the set
     * given to the constructor, or a new empty set when the caller passed
     * <code>null</code>.
     */
    public final Set<IVariable<?>> knownBound;

    /**
     * The set of variables which are bound by the time the last required join
     * is done. This includes the {@link #knownBound} variables plus any
     * variables for any required join.
     */
    public final Set<IVariable<?>> eventuallyBoundVars;
    
    /**
     * An array of the vertices for the required joins in the group (the
     * {@link IJoinNode}s in the group which are not optional). The indices
     * into the array are the order in which the required joins were
     * encountered in the group.
     */
    public final IJoinNode[] requiredJoins;

    /** The #of required joins in the group (the {@link #requiredJoins} length). */
    public final int requiredJoinCount;

    /**
     * The FILTERS that can be run on entry to the group (and which should be
     * lifted into the parent). These are the filters which are fully bound by
     * the {@link #knownBound} variables.
     */
    public final Set<FilterNode> preFilters;

    /**
     * The FILTERS that will be attached to the required joins for this group.
     * These are the filters which are not {@link #preFilters} but which are
     * fully bound by the {@link #eventuallyBoundVars}.
     */
    public final Set<FilterNode> joinFilters;

    /**
     * The FILTERS that can not be run until the end of the join group (they
     * are not fully bound by the {@link #eventuallyBoundVars}).
     */
    public final Set<FilterNode> postFilters;

    /**
     * The set of variables which are bound by each of the vertices for the
     * required joins. Each set also includes the {@link #knownBound}
     * variables. The indices into the array are the order in which the
     * required joins were encountered in the group.
     */
    public final Set<IVariable<?>>[] vars;

    // final Set<IVariable<?>>[] varsWithFilters;

    /**
     * Upper-triangular matrix for the required joins. A cell having a
     * positive value gives the #of directly shared variables for that pair of
     * vertices. This will be ZERO (0) if there is no direct join. Only the
     * cells above the main diagonal (row index less than column index) are
     * populated.
     */
    public final int[][] canJoin;

    /**
     * The #of direct joins found for each vertex, i.e., the #of other
     * vertices for which the corresponding {@link #canJoin} cell is positive.
     */
    public final int[] directJoinCount;

    /**
     * The distinct sets of vertices which are composed solely of (the
     * transitive closure of) joins on directly shared variables. For example
     * 
     * <pre>
     * p(x,y) X p(y,z) x p(z,t)
     * </pre>
     * 
     * would form a single direct join set. Even though there are some
     * vertices which do not share variables directly with each other, they
     * can be joined directly once other joins have been processed. For
     * example <code>p(z,t)</code> can be joined once we have
     * <code>p(y,z)</code>.
     * <p>
     * The number of such direct join sets varies with the query. If all
     * vertices fit into one direct join set, then the query can be pipelined
     * efficiently. If there is more than one such set then we need to look
     * further to see if we can identify joins on indirectly shared variables
     * (shared through a FILTER) which can be used to piece these join sets
     * together. If not, there is an unconstrained cross product in the query.
     */
    public final Set<VertexJoinSet> directJoinSets;

    /**
     * A collection of vertices and the join variables they bind.
     * <p>
     * Note: {@link #hashCode()} and {@link #equals(Object)} are computed from
     * the (mutable) {@link #vertices} and {@link #joinvars} sets. An instance
     * should therefore be fully populated before it is added to a hash-based
     * collection and not modified while it is a member of one.
     */
    public static class VertexJoinSet {

        /**
         * The set of vertices in this join set. The indices are the order in
         * which the vertices were encountered in the group.
         */
        final public Set<Integer> vertices;

        /**
         * The set of variables bound by at least one join in the join set.
         */
        final public Set<IVariable<?>> joinvars;

        /**
         * Create an empty join set backed by insertion-ordered sets.
         */
        public VertexJoinSet() {
            vertices = new LinkedHashSet<Integer>();
            joinvars = new LinkedHashSet<IVariable<?>>();
        }
        
        /**
         * Create a join set which wraps the caller's sets (no copy is made).
         * 
         * @param vertices
         *            The vertices in the join set.
         * @param joinvars
         *            The variables bound by the joins in the join set.
         * 
         * @throws IllegalArgumentException
         *             if either argument is <code>null</code>.
         */
        public VertexJoinSet(final Set<Integer> vertices,
                final Set<IVariable<?>> joinvars) {
            // Fail fast rather than with a late NPE in hashCode()/equals().
            if (vertices == null)
                throw new IllegalArgumentException("vertices is null");
            if (joinvars == null)
                throw new IllegalArgumentException("joinvars is null");
            this.vertices = vertices;
            this.joinvars = joinvars;
        }

        @Override
        public String toString() {
            return getClass().getSimpleName() + "{vertices=" + vertices
                    + ",joinvars=" + joinvars + "}";
        }
        
        /**
         * The hash code of the {@link #vertices}. This is consistent with
         * {@link #equals(Object)} since equal join sets have equal vertices.
         */
        @Override
        public int hashCode() {
            return vertices.hashCode();
        }
        
        /**
         * Two join sets are equal iff they have the same vertices and the same
         * join variables.
         */
        @Override
        public boolean equals(final Object o) {
            if (this == o)
                return true;
            if (!(o instanceof VertexJoinSet))
                return false;
            final VertexJoinSet t = (VertexJoinSet) o;
            return vertices.equals(t.vertices) && joinvars.equals(t.joinvars);
        }

    }
    
    /**
     * Analyze the required joins and the FILTERs of a group. On return, all of
     * the public fields of this object have been populated.
     * 
     * @param sa
     *            Used to find the variables spanned by each required join and
     *            to decide whether a FILTER is fully bound by a set of
     *            variables.
     * @param knownBound
     *            Any variables known to be bound on entry to the group. When
     *            <code>null</code>, a new empty set is used instead.
     * @param group
     *            The group.
     */
    public JoinSetUtil(final StaticAnalysis sa,
            Set<IVariable<?>> knownBound,
            final GraphPatternGroup<IGroupMemberNode> group) {

        this.group = group;

        if (knownBound != null) {
            this.knownBound = knownBound;
        } else {
            this.knownBound = new LinkedHashSet<IVariable<?>>();
        }
        // Never null from here on.
        final Set<IVariable<?>> boundOnEntry = this.knownBound;

        /*
         * Step 1: Collect the required (non-optional) joins, in the order in
         * which they appear in the group.
         */
        final List<IJoinNode> required = new LinkedList<IJoinNode>();
        for (IGroupMemberNode member : group) {
            if (!(member instanceof IJoinNode))
                continue;
            final IJoinNode join = (IJoinNode) member;
            if (join.isOptional())
                continue;
            required.add(join);
        }
        requiredJoinCount = required.size();
        requiredJoins = required.toArray(new IJoinNode[requiredJoinCount]);

        /*
         * Step 2: For each required join, the variables it spans plus the
         * variables bound on entry to the group. The union over all required
         * joins gives the variables bound once the required joins are done.
         */
        eventuallyBoundVars = new LinkedHashSet<IVariable<?>>();
        vars = new Set[requiredJoinCount];
        for (int v = 0; v < requiredJoinCount; v++) {
            final Set<IVariable<?>> bound = sa.getSpannedVariables(
                    (BOp) requiredJoins[v],
                    new LinkedHashSet<IVariable<?>>());
            bound.addAll(boundOnEntry);
            vars[v] = bound;
            eventuallyBoundVars.addAll(bound);
        }

        /*
         * Step 3: Classify each FILTER in the group. A filter which is fully
         * bound on entry is a pre-filter (it really should be lifted out). A
         * filter which is fully bound once the required joins have run is a
         * join-filter. Anything else is a post-filter: it might still succeed
         * if it runs after an optional join which binds a variable that it
         * needs and that no required join in this group binds.
         */
        preFilters = new LinkedHashSet<FilterNode>();
        joinFilters = new LinkedHashSet<FilterNode>();
        postFilters = new LinkedHashSet<FilterNode>();
        for (IGroupMemberNode member : group) {
            if (!(member instanceof FilterNode))
                continue;
            final FilterNode filter = (FilterNode) member;
            if (sa.isFullyBound(filter, boundOnEntry)) {
                preFilters.add(filter);
            } else if (sa.isFullyBound(filter, eventuallyBoundVars)) {
                joinFilters.add(filter);
            } else {
                postFilters.add(filter);
            }
        }

        /*
         * Step 4: Populate the cells above the main diagonal of the canJoin
         * matrix with the #of variables directly shared by each pair of
         * vertices (ZERO if there is no direct join) and count, for each
         * vertex, the #of vertices with which it has a direct join.
         */
        canJoin = new int[requiredJoinCount][requiredJoinCount];
        directJoinCount = new int[requiredJoinCount];
        for (int row = 0; row < requiredJoinCount; row++) {
            for (int col = row + 1; col < requiredJoinCount; col++) {
                // Intersection of the variables for the two vertices.
                final Set<IVariable<?>> shared = new HashSet<IVariable<?>>(
                        vars[row]);
                shared.retainAll(vars[col]);
                final int nshared = shared.size();
                canJoin[row][col] = nshared;
                if (nshared > 0) {
                    directJoinCount[row]++;
                    directJoinCount[col]++;
                }
            }
        }

        if (log.isDebugEnabled()) {
            // TODO Could be moved into a toString() for this class.
            log.debug("\ncanJoin:\n" + toString(canJoin));
            log.debug("\ndirectJoinCount:\n" + Arrays.toString(directJoinCount));
        }

        /*
         * Step 5: The distinct sets of vertices which are composed solely of
         * (the transitive closure of) direct joins.
         */
        directJoinSets = calcDirectJoinSets();

    }

    /**
     * Partition the vertices into the subsets which can join directly.
     * <p>
     * The vertices are scanned in index order. The first vertex which has not
     * yet been placed into a join set seeds a new join set holding that vertex
     * and its variables. The join set is then expanded with every unused
     * vertex which is reachable from the seed through a chain of direct joins
     * (see {@link #expandJoinSet(int, Set, Set, Set)}), adding the variables
     * of each vertex as it is absorbed.
     * <p>
     * The scan repeats until every vertex belongs to a join set. The result is
     * the collection of join sets which do not overlap based on directly
     * shared variables. It would be ideal to have exactly one such join set.
     * Where there is more than one there is clearly a question concerning
     * which join set should be handled first.
     * 
     * @return The join sets, in the order in which they were discovered.
     */
    private Set<VertexJoinSet> calcDirectJoinSets() {

        // The vertices which already belong to some join set.
        final Set<Integer> used = new LinkedHashSet<Integer>();

        final Set<VertexJoinSet> result = new LinkedHashSet<VertexJoinSet>();

        for (int seed = 0; seed < requiredJoinCount; seed++) {
            // add() reports false when the vertex was already consumed.
            if (!used.add(seed))
                continue;
            // Start a new join set from this vertex.
            final VertexJoinSet component = new VertexJoinSet();
            component.vertices.add(seed);
            component.joinvars.addAll(vars[seed]);
            // Pull in everything transitively joinable with the seed.
            expandJoinSet(seed, component.vertices, used, component.joinvars);
            result.add(component);
            if (log.isInfoEnabled())
                log.info("joinSet: " + component.vertices + " on "
                        + component.joinvars);
        }

        // Every vertex must have landed in exactly one join set.
        assert used.size() == requiredJoinCount : "used=" + used
                + ", but requiredJoinCount=" + requiredJoinCount;

        return result;

    }
    
    /**
     * Depth-first expansion of a join set: add every vertex which has not yet
     * been used and which directly joins with vertex <i>i</i>, then recurse on
     * each vertex as it is added.
     * 
     * @param i
     *            The vertex whose direct joins are being explored.
     * @param joinSet
     *            The vertices in the join set (extended as a side effect).
     * @param used
     *            The vertices already assigned to some join set (extended as a
     *            side effect).
     * @param joinvars
     *            The variables bound by the join set (extended as a side
     *            effect).
     */
    private void expandJoinSet(final int i, final Set<Integer> joinSet,
            final Set<Integer> used, final Set<IVariable<?>> joinvars) {
        for (int other = 0; other < requiredJoinCount; other++) {
            if (other == i || used.contains(other))
                continue;
            // Only the cells above the diagonal are populated: [min][max].
            final int shared = i < other ? canJoin[i][other]
                    : canJoin[other][i];
            if (shared <= 0)
                continue;
            joinSet.add(other);
            used.add(other);
            joinvars.addAll(vars[other]);
            // Visit anything which can join with the vertex just added.
            expandJoinSet(other, joinSet, used, joinvars);
        }
    }

    /**
     * Render the cells above the main diagonal of a square matrix, one row per
     * line. Cells on or below the diagonal are shown as <code>-</code>. Every
     * cell is followed by a single space and every row by a newline.
     * 
     * @param data
     *            The matrix. Each row is indexed up to the #of rows.
     * 
     * @return The rendered matrix.
     */
    static private String toString(final int[][] data) {

        final int size = data.length;

        final StringBuilder out = new StringBuilder();

        for (int row = 0; row < size; row++) {

            for (int col = 0; col < size; col++) {

                if (col > row) {
                    out.append(data[row][col]);
                } else {
                    // On or below the diagonal: never populated.
                    out.append("-");
                }

                out.append(" ");

            }

            out.append("\n");

        }

        return out.toString();

    }

}