PipelinedHashIndexAndSolutionSetJoinOp.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Oct 20, 2015
 */

package com.bigdata.bop.join;

import java.util.Map;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.NV;
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.controller.INamedSolutionSetRef;
import com.bigdata.bop.controller.SubqueryAnnotations;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.rdf.sparql.ast.eval.AST2BOpUtility;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.UnsyncLocalOutputBuffer;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * Operator for pipelined hash index construction and subsequent join. Note that
 * this operator needs not to be combined with a solution set hash join, but
 * instead gets the subquery/subgroup passed as a parameter and thus can be
 * considered as an "all-in-one" build-hash-index-and-join operation.
 * 
 * The operator is designed for single-threaded use (as it is the case for the
 * non-pipelined hash index & join operators). It's processing scheme is
 * illustrated by an example in the following. Assume we have a query such as
 * 
 * <code>
   SELECT * WHERE {
     ?s <http://p1> ?o1
     OPTIONAL {
       ?s <http://p2> ?o2 .
       ?s <http://p3> ?o3 .
     }
   } LIMIT 10
   </code>
   
   to be evaluated over the data set
   
   <code>
     <http://s1> <http://p1> <http://o11> .
     <http://s1> <http://p2> <http://o12> .
     <http://s1> <http://p3> <http://o13> .

     <http://s2> <http://p1> <http://o21> .
     <http://s2> <http://p2> <http://o22> .

     <http://s1> <http://p1> <http://o11b> .

   </code>
 * 
 * , where the OPTIONAL is considered as a complex group that is translated
 * using a hash join pattern. The approach taken by this operator is that the
 * OPTIONAL pattern is considered as subquery that is passed in to this operator
 * via Annotations.SUBQUERY. The operator logically proceeds as follows:
 * 
 * 1. Incoming are the bindings from outside, i.e. in our example the bindings
 *    for triple pattern "?s <http://p1> ?o1". Given the input data at hand,
 *    this means we have the following binding set
 *    
 *    { 
 *      { ?s -> <http://s1> , ?o1 -> <http://o11> },
 *      { ?s -> <http://s2> , ?o1 -> <http://o21> } 
 *      { ?s -> <http://s1> , ?o1 -> <http://o11b> }      
 *    }
 *    
 *    coming in. For the sake of this example, assume the solutions are dropping
 *    in one after the after (chunk size=1) s.t. we have three iterations,
 *    but note that the implementation implements a generalized, vectored
 *    approach processing the input chunk by chunk..
 *    
 *  2. The join variable set is { ?s }. We compute the distinct projection over
 *     ?s on the incoming bindings:
 *     
 *     - In the 1st iteration, the distinct projection is { ?s -> <http://s1> },
 *     - In the 2nd iteration, the distinct projection is { ?s -> <http://s2> },
 *     - In the 3rd iteration, the distinct projection is { ?s -> <http://s1> } again.
 *     
 *  3. For each of these distinct projections, we decide whether the subquery
 *     has been evaluated with the distinct projection as input before. 
 *     If the subquery has already been evaluated, we take the fast path and 
 *     proceed with step 5. If it has not yet been evaluated before, proceed 
 *     to step 4 (namely its evaluation and buffering of the solution on the
 *     hash index).
 *     
 *     For our example this means: in the first iteration we proceed to step
 *     (4) with { ?s -> <http://s1> } as input; in the second iteration, we
 *     proceed to step (4) with { ?s -> <http://s2> } as input; in the third
 *     iteration, we can directly proceed to step (5): the distinct projection 
 *     { ?s -> <http://s1> } has been encountered in iteration 1 already.
 *     
 *  4. Evaluate the subquery for the incoming binding. The result is stored in
 *     a hash index (this hash index is used, in future, to decide whether
 *     the result has been already computed before for the distinct projection
 *     in step 3; note that, in addition to the hash index, we also record those
 *     distinct projections for which the subquery has been evaluated without 
 *     producing a result, to avoid unnecessary re-computation).
 *     
 *     In the first iteration, we compute the subquery result 
 *     { ?s -> <http://s1> , ?o2 -> <http://o12> , ?o3 -> <http://o13> } for
 *     the input binding { ?s -> <http://s1> }; in the second iteration, we
 *     compute { ?s -> <http://s2> } for the input binding { ?s -> <http://s2> }
 *     (i.e., OPTIONAL subquery that does not match, leaving the input
 *     unmodified); there is no third iteration step.
 *     
 *  5. Join the original bindings (from which the distinct projection was 
 *     obtained) against the hash index and output the results. The operator
 *     supports all kinds of joins (Normal, Exists, NotExists, etc.).
 *     
 *     We thus obtain the following matches with the hash index:
 *     - Iteration 1: { ?s -> <http://s1>,  ?o1 -> <http://o11>} JOIN 
 *                    { ?s -> <http://s1> , ?o2 -> <http://o12> , ?o3 -> <http://o13> }
 *     - Iteration 2: { ?s -> <http://s2>, ?o1 -> <http://o12> } JOIN { ?s -> <http://s2> }
 *     - Iteration 3: { ?s -> <http://s1> , ?o1 -> <http://o11b> } JOIN 
 *                    { ?s -> <http://s1> , ?o2 -> <http://o12> , ?o3 -> <http://o13> }
 *                    
 *     This gives us the expected final result:
 *     
 *     {
 *       { ?s -> <http://s1>, ?o1 -> <http://o11>,  ?o2 -> <http://o12> , ?o3 -> <http://o13> },
 *       { ?s -> <http://s2>, ?o1 -> <http://o12> },
 *       { ?s -> <http://s2>, ?o1 -> <http://o11b>, ?o2 -> <http://o12> , ?o3 -> <http://o13> }
 *     }
 *     
 *  Note that this strategy is pipelined in the sense that all results from the
 *  left are emitted as soon as the subquery result for its distinct projection
 *  result has been calculated.
 *  
 *  # Further notes: 
 *  1.) Vectored processing: the implementation uses a vectored processing
 *  approach: instead of processing the mappings one by one, we collect
 *  unseen distinct projections and their associated incoming mappings in two
 *  buffers, namely the distinctProjectionBuffer and the incomingBindingssBuffer. 
 *  
 *  Buffer size is controlled via annotations: 
 *  incomingBindingssBufferThreshold and the Annotations.DISTINCT_PROJECTION_BUFFER_THRESHOLD. 
 *  If, after processing a chunk, one of these thresholds is exceeded for the
 *  respective buffer (or, alternatively, if we're in the lastPass), the
 *  mappings in the distinctProjectionBuffer are provided as input to
 *  the subquery "in batch" (cf. step 4.)) and subsequently the bindings from
 *  the incomingBindingssBuffer are joined (and released). At that point, the
 *  buffers become empty again.
 *  
 *  Effectively, the thresholds allow to reserve some internal buffer space. For
 *  now, we go with default values, but we may want to expose these buffers via
 *  query hints or the like at some point. 
 *  
 *  2.) Alternative source: the strategy sketched above illustrates how the op
 *  works for subqueries. An alternative way of using the operator is by
 *  setting Annotations.BINDING_SETS_SOURCE *instead of* the subquery. This
 *  is used when computing a hash join with a VALUES clause; in that case, the
 *  binding sets provided by the VALUES clause are considered as "inner query";
 *  as a notable difference, this set of values is static and does not need to
 *  be re-evaluated each time, it is submitted once to the hash index in the
 *  beginning and joined with every incoming binding.
 *  
 *  3.) There exist two "implementations" of the operator, namely the
 *  {@link JVMPipelinedHashJoinUtility} and the {@link HTreePipelinedHashJoinUtility}.
 *  
 *  # Other remarks:
 *  There are some more technicalities like support for ASK_VAR (which is used
 *  by the FILTER (NOT) EXISTS translation scheme, which work in principle in
 *  the same way as they do for the standard hash join.
 *  
 *  Usage: the pipelined hash join operator is preferably used for queries
 *  containing LIMIT but *no* ORDER BY. It can also be globally enabled by
 *  system parameter {@link QueryHints#PIPELINED_HASH_JOIN} and via query
 *  hints. See {@link AST2BOpUtility#usePipelinedHashJoin} for the method
 *  implementing its selection strategy.
 * 
 * @see JVMPipelinedHashJoinUtility for implementation
 * 
 * @author <a href="mailto:ms@metaphacts.com">Michael Schmidt</a>
 */
public class PipelinedHashIndexAndSolutionSetJoinOp extends HashIndexOp {

   private static final long serialVersionUID = 3473675701742394157L;

   
   public interface Annotations extends HashIndexOp.Annotations, SubqueryAnnotations {
   
       /**
        * The variables that is projected into the inner subgroup. Typically,
        * this is identical to the join variables. There are, however, 
        * exceptions where we need to project in a superset. For instance, for
        * the query 
        * 
        * select * where {
        * ?a :knows ?b .
        *   OPTIONAL {
        *     ?b :knows ?c .
        *     ?c :knows ?d .
        *     filter(?a != :paul) # Note: filter applies to *outer* variable
        *    }
        * }
        * 
        * we have joinVars={?b} and projectInVars={?a, ?b}, because variables
        * from outside are visible in the inner filter.
        */
       String PROJECT_IN_VARS = 
           PipelinedHashIndexAndSolutionSetJoinOp.class.getName() + ".projectInVars";

       /**
        * The threshold defining when to release the distinctProjectionBuffer.
        * Note that releasing this buffer means releasing the
        * distinctProjectionBuffer at the same time.
        */
       String DISTINCT_PROJECTION_BUFFER_THRESHOLD = 
           PipelinedHashIndexAndSolutionSetJoinOp.class.getName() 
           + ".distinctProjectionBufferThreshold";
       
       // set default to have a default's chunk size default
       int DEFAULT_DISTINCT_PROJECTION_BUFFER_THRESHOLD = 50;
       

       /**
        * The threshold defining when to release the incomingBindingsBuffer.
        * Note that releasing this buffer means releasing the
        * distinctProjectionBuffer at the same time.
        */
       String INCOMING_BINDINGS_BUFFER_THRESHOLD = 
           PipelinedHashIndexAndSolutionSetJoinOp.class.getName() 
           + ".incomingBindingsBuffer";
       
       // having buffered 1000 incoming bindings, we release both buffers;
       // this might happen if we observe 1000 incoming mappings with less
       // than DISTINCT_PROJECTION_BUFFER_THRESHOLD distinct projections
       int DEFAULT_INCOMING_BINDINGS_BUFFER_THRESHOLD = 1000;

    }

    /**
      * Deep copy constructor.
      */
    public PipelinedHashIndexAndSolutionSetJoinOp(final PipelinedHashIndexAndSolutionSetJoinOp op) {
       
        super(op);

        // max parallel must be one
        if (getMaxParallel() != 1)
            throw new IllegalArgumentException(Annotations.MAX_PARALLEL + "=" + getMaxParallel());

    }
    
    /**
     * Shallow copy constructor.
     * 
     * @param args
     * @param annotations
     */
    public PipelinedHashIndexAndSolutionSetJoinOp(final BOp[] args, final Map<String, Object> annotations) {

        super(args, annotations);
        
        // max parallel must be one
        if (getMaxParallel() != 1)
            throw new IllegalArgumentException(Annotations.MAX_PARALLEL + "=" + getMaxParallel());

    }
    
    public PipelinedHashIndexAndSolutionSetJoinOp(final BOp[] args, final NV... annotations) {

        this(args, NV.asMap(annotations));
        
    }
    
    @Override
    protected ChunkTaskBase createChunkTask(final BOpContext<IBindingSet> context) {

        /**
         * The operator offers two ways to generate the hash index of the input
         * stream, either via subquery or via binding set that is passed in.
         * Exactly one of both *must* be provided.
         */
        final PipelineOp subquery = 
            (PipelineOp)getProperty(Annotations.SUBQUERY);
        
        final IBindingSet[] bsFromBindingsSetSource =
            (IBindingSet[]) getProperty(Annotations.BINDING_SETS_SOURCE);
        
        if (subquery==null && bsFromBindingsSetSource==null) {
            throw new IllegalArgumentException(
                "Neither subquery nor binding set source provided.");
        } else if (subquery!=null && bsFromBindingsSetSource!=null) {
            throw new IllegalArgumentException(
                "Both subquery and binding set source provided.");           
        }
        
        final IVariable<?> askVar = 
           (IVariable<?>) getProperty(HashJoinAnnotations.ASK_VAR);
        
        final IVariable<?>[] projectInVars = 
           (IVariable<?>[]) getProperty(Annotations.PROJECT_IN_VARS);
        
        final int distinctProjectionBufferThreshold = 
            getProperty(
               Annotations.DISTINCT_PROJECTION_BUFFER_THRESHOLD, 
               Annotations.DEFAULT_DISTINCT_PROJECTION_BUFFER_THRESHOLD);

        final int incomingBindingsBufferThreshold = 
              getProperty(
                 Annotations.INCOMING_BINDINGS_BUFFER_THRESHOLD, 
                 Annotations.DEFAULT_INCOMING_BINDINGS_BUFFER_THRESHOLD);
        
        return new ChunkTask(this, context, subquery, 
           bsFromBindingsSetSource, projectInVars, askVar,
           distinctProjectionBufferThreshold, incomingBindingsBufferThreshold);
        
    }
    
    /**
     * A chunk task. See outer class for explanation of parameters.
     */
    private static class ChunkTask extends com.bigdata.bop.join.HashIndexOp.ChunkTask {

        final PipelineOp subquery;

        final IBindingSet[] bsFromBindingsSetSource;
        
        final IConstraint[] joinConstraints;
        
        final IVariable<?> askVar;
        
        final IVariable<?>[] projectInVars;
        
        final int distinctProjectionBufferThreshold;
        
        final int incomingBindingsBufferThreshold;
        
        public ChunkTask(final PipelinedHashIndexAndSolutionSetJoinOp op,
                final BOpContext<IBindingSet> context, 
                final PipelineOp subquery, 
                final IBindingSet[] bsFromBindingsSetSource,
                final IVariable<?>[] projectInVars,
                final IVariable<?> askVar,
                final int distinctProjectionBufferThreshold, 
                final int incomingBindingsBufferThreshold) {

            super(op, context);
            
            joinConstraints = BOpUtility.concat(
                  (IConstraint[]) op.getProperty(Annotations.CONSTRAINTS),
                  state.getConstraints());
            
            // exactly one of the two will be non-null
            this.subquery = subquery;
            this.bsFromBindingsSetSource = bsFromBindingsSetSource;
            this.projectInVars = projectInVars;
            this.askVar = askVar;
            this.distinctProjectionBufferThreshold = distinctProjectionBufferThreshold;
            this.incomingBindingsBufferThreshold = incomingBindingsBufferThreshold;

        }
        
        /**
         * Evaluate.
         */
        @Override
        public Void call() throws Exception {

            try {

                // Buffer all source solutions.
                acceptAndOutputSolutions();

                if (context.isLastInvocation()) {

                    // Done. Release the allocation context.
                    state.release();

                }

                // Done.
                return null;

            } finally {
                
                context.getSource().close();

                context.getSink().close();

            }
            
        }
        
        /**
         * Output the buffered solutions.
         */
        private void acceptAndOutputSolutions() {

            // default sink
            final IBlockingBuffer<IBindingSet[]> sink = context.getSink();

            final UnsyncLocalOutputBuffer<IBindingSet> unsyncBuffer = 
               new UnsyncLocalOutputBuffer<IBindingSet>(
                 op.getChunkCapacity(), sink);

            final ICloseableIterator<IBindingSet[]> src;

            if (sourceIsPipeline) {
            
                src = context.getSource();
                
            } else if (op.getProperty(Annotations.NAMED_SET_SOURCE_REF) != null) {
                
                /*
                 * Metadata to identify the optional *source* solution set. When
                 * <code>null</code>, the hash index is built from the solutions flowing
                 * through the pipeline. When non-<code>null</code>, the hash index is
                 * built from the solutions in the identifier solution set.
                 */
                final INamedSolutionSetRef namedSetSourceRef = (INamedSolutionSetRef) op
                        .getRequiredProperty(Annotations.NAMED_SET_SOURCE_REF);

                src = context.getAlternateSource(namedSetSourceRef);
                
            } else if (bsFromBindingsSetSource != null) {

                /**
                 * We handle the BINDINGS_SETS_SOURCE case as follows: the
                 * binding sets on the source are treated as input. Given that
                 * in this case no inner query is set, we consider the
                 * BINDINGS_SETS_SOURCE as the result of the query instead.
                 * It is extracted here and passed in as a parameter.
                 */
                src = context.getSource();
                
            } else {

                throw new UnsupportedOperationException(
                        "Source was not specified");
                
            }
            
            
            ((PipelinedHashJoinUtility)state).acceptAndOutputSolutions(
               unsyncBuffer, src, stats, joinConstraints, subquery,
               bsFromBindingsSetSource, projectInVars, askVar,
               context.isLastInvocation(), distinctProjectionBufferThreshold,
               incomingBindingsBufferThreshold, context);
            
            
            unsyncBuffer.flush();

            sink.flush();

        }

    } // ControllerTask

}