IHashJoinUtility.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
/*
 * Created on Nov 8, 2011
 */

package com.bigdata.bop.join;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IConstraint;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.engine.BOpStats;
import com.bigdata.htree.HTree;
import com.bigdata.relation.accesspath.IBuffer;
import com.ibm.icu.util.BytesTrie.Iterator;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * Interface for hash index build and hash join operations.
 * 
 * <h2>Use cases</h2>
 * 
 * For a JOIN, there are two core steps, plus one additional step if the join is
 * optional. The hash join logically has a <em>Left Hand Side</em> (LHS) and a
 * <em>Right Hand Side</em> (RHS). The RHS is used to build up a hash index
 * which is then probed for each LHS solution. The LHS is generally an access
 * path scan, which is done once. A hash join therefore provides an alternative
 * to a nested index join in which we visit the access path once, probing the
 * hash index for solutions which join.
 * <dl>
 * <dt>Accept solutions</dt>
 * <dd>This step builds the hash index, also known as the RHS (Right Hand Side).
 * </dd>
 * <dt>Hash join</dt>
 * <dd>The hash join considers each left solution in turn and outputs solutions
 * which join. If optionals are required, this step also builds a hash index
 * (the <i>joinSet</i>) over the right solutions which did join.</dd>
 * <dt>Output optionals</dt>
 * <dd>The RHS hash index is scanned and the <i>joinSet</i> is probed to
 * identify right solutions which did not join with any left solution. Those
 * solutions are output as "optionals".</dd>
 * </dl>
 * <p>
 * This interface also supports DISTINCT SOLUTIONS filters. For this use case,
 * the caller uses the
 * {@link #filterSolutions(ICloseableIterator, BOpStats, IBuffer)} method.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public interface IHashJoinUtility {
    
    /**
     * Return the type safe enumeration indicating what kind of operation is to
     * be performed.
     * 
     * @return The {@link JoinTypeEnum} for this operation.
     */
    JoinTypeEnum getJoinType();
    
    /**
     * The variable bound based on whether or not a solution survives an
     * "EXISTS" graph pattern (optional).
     * 
     * @see HashJoinAnnotations#ASK_VAR
     */
    IVariable<?> getAskVar();
    
    /**
     * The join variables.
     * 
     * @see HashJoinAnnotations#JOIN_VARS
     */
    IVariable<?>[] getJoinVars();

    /**
     * The variables to be retained (optional, all variables are retained if
     * not specified).
     * 
     * @see JoinAnnotations#SELECT
     */
    IVariable<?>[] getSelectVars();

    /**
     * Return <code>true</code> if the projection outputs the distinct join
     * variables. In that case, the variables delivered by
     * {@link #getSelectVars()} will be ignored and might even be
     * uninitialized.
     * 
     * @return <code>true</code> iff the distinct join variables are output.
     * 
     * @see HashJoinAnnotations#OUTPUT_DISTINCT_JVs
     */
    boolean isOutputDistinctJoinVars();
    
    /**
     * The join constraints (optional).
     * 
     * @see JoinAnnotations#CONSTRAINTS
     */
    IConstraint[] getConstraints();
    
    /**
     * Return <code>true</code> iff there are no solutions in the hash index.
     */
    boolean isEmpty();

    /**
     * Return the #of solutions in the hash index.
     */
    long getRightSolutionCount();

    /**
     * Discard the hash index.
     */
    void release();

    /**
     * Buffer solutions on a hash index.
     * <p>
     * When <code>optional:=true</code>, solutions which do not have a binding
     * for one or more of the join variables will be inserted into the hash
     * index anyway using <code>hashCode:=1</code>. This allows the solutions to
     * be discovered when we scan the hash index and the set of solutions which
     * did join to identify the optional solutions.
     * 
     * @param itr
     *            The source from which the solutions will be drained.
     * @param stats
     *            The statistics to be updated as the solutions are buffered on
     *            the hash index.
     * 
     * @return The #of solutions that were buffered.
     */
    long acceptSolutions(ICloseableIterator<IBindingSet[]> itr,
            BOpStats stats);

    /**
     * Filter solutions, writing only the DISTINCT solutions onto the sink.
     * 
     * @param itr
     *            The source solutions.
     * @param stats
     *            The stats to be updated.
     * @param sink
     *            The sink.
     *            
     * @return The #of source solutions which pass the filter.
     */
    long filterSolutions(ICloseableIterator<IBindingSet[]> itr,
            BOpStats stats, IBuffer<IBindingSet> sink);

    /**
     * Do a hash join between a stream of source solutions (left) and a hash
     * index (right). For each left solution, the hash index (right) is probed
     * for possible matches (solutions whose as-bound values for the join
     * variables produce the same hash code). Possible matches are tested for
     * consistency and the constraints (if any) are applied. Solutions which
     * join are written on the caller's buffer.
     * <p>
     * Note: Some {@link JoinTypeEnum}s have side-effects on the join state. For
     * these joins, once this method has been invoked for the final time, you
     * must then invoke either {@link #outputOptionals(IBuffer)} (Optional or
     * NotExists) or {@link #outputJoinSet(IBuffer)} (Exists).
     * 
     * @param leftItr
     *            A stream of chunks of solutions to be joined against the hash
     *            index (left).
     * @param stats
     *            The statistics to be updated as solutions are drained from the
     *            <i>leftItr</i> (optional). When <code>left</code> is the
     *            pipeline, {@link BOpStats#chunksIn} and
     *            {@link BOpStats#unitsIn} should be updated by passing in the
     *            {@link BOpStats} object. When <code>left</code> is a hash
     *            index (i.e., for a hash join against an access path), you
     *            should pass <code>null</code> since the chunksIn and unitsIn
     *            are updated as the {@link HashIndexOp} builds the hash index
     *            rather than when it executes the join against the access
     *            path.
     * @param outputBuffer
     *            Where to write the solutions which join.
     */
    void hashJoin(//
            ICloseableIterator<IBindingSet[]> leftItr,//
            BOpStats stats,//
            IBuffer<IBindingSet> outputBuffer//
    );

    /**
     * Variant hash join method which allows the caller to impose different
     * constraints or additional constraints. This is used to impose join
     * constraints when a solution set is joined back into a query based on the
     * join filters in the join group in which the solution set is included.
     * <p>
     * Note: Some {@link JoinTypeEnum}s have side-effects on the join state. For
     * these joins, once this method has been invoked for the final time, you
     * must then invoke either {@link #outputOptionals(IBuffer)} (Optional or
     * NotExists) or {@link #outputJoinSet(IBuffer)} (Exists).
     * 
     * @param leftItr
     *            A stream of chunks of solutions to be joined against the hash
     *            index (left).
     * @param stats
     *            The statistics to be updated as solutions are drained from the
     *            <i>leftItr</i>.
     * @param outputBuffer
     *            Where to write the solutions which join.
     * @param constraints
     *            Constraints attached to this join (optional). Any constraints
     *            specified here are combined with those specified in the
     *            constructor.
     */
    void hashJoin2(//
            ICloseableIterator<IBindingSet[]> leftItr,//
            BOpStats stats,//
            IBuffer<IBindingSet> outputBuffer,//
            IConstraint[] constraints//
    );

    /**
     * Perform an N-way merge join. For an OPTIONAL join, <i>this</i> instance
     * is understood to be the index having the "required" solutions.
     * <p>
     * The merge join takes a set of solution sets in the same order and having
     * the same join variables. It examines the next solution in order for each
     * solution set and compares them. For each solution set which reported a
     * solution having the same join variables as that earliest solution, it
     * outputs the cross product and advances the iterator on that solution set.
     * <p>
     * The iterators draining the source solution sets need to be synchronized
     * such that we consider only solutions having the same hash code in each
     * cycle of the MERGE JOIN. The synchronization step is different depending
     * on whether or not the MERGE JOIN is OPTIONAL.
     * <p>
     * If the MERGE JOIN is REQUIRED, then we want to synchronize the source
     * solution iterators on the next lowest key (aka hash code) which they all
     * have in common.
     * <p>
     * If the MERGE JOIN is OPTIONAL, then we want to synchronize the source
     * solution iterators on the next lowest key (aka hash code) which appears
     * for any source iterator. Solutions will not be drawn from iterators not
     * having that key in that pass.
     * <p>
     * Note that each hash code may be an alias for solutions having different
     * values for their join variables. Such solutions will not join. However,
     * only solutions having the same values for the hash code can join. Thus,
     * by proceeding with synchronized iterators and operating only on solutions
     * having the same hash code in each round, we will consider all solutions
     * which COULD join with one another in each round.
     * <p>
     * Note: If the solutions are not in a stable and mutually consistent order
     * by hash code in the hash indices then the solutions in each hash index
     * MUST be SORTED before proceeding. (The {@link HTree} maintains solutions
     * in such an order but the JVM collections do not.)
     * 
     * @param others
     *            The other solution sets to be joined. All instances must be of
     *            the same concrete type as <i>this</i>.
     * @param outputBuffer
     *            Where to write the solutions.
     * @param constraints
     *            The join constraints.
     * @param optional
     *            <code>true</code> iff the join is optional.
     */
    void mergeJoin(//
            IHashJoinUtility[] others,//
            IBuffer<IBindingSet> outputBuffer,//
            IConstraint[] constraints,//
            boolean optional//
    );
    
    /**
     * Checkpoint the generated hash index such that it becomes safe for
     * concurrent readers.
     */
    void saveSolutionSet();

    /**
     * Identify and output the optional solutions. This is used with OPTIONAL
     * and NOT EXISTS.
     * <p>
     * Optionals are identified using a <i>joinSet</i> containing each right
     * solution which joined with at least one left solution. The total set of
     * right solutions is then scanned once. For each right solution, we probe
     * the <i>joinSet</i>. If the right solution did not join, then it is output
     * now as an optional join.
     * 
     * @param outputBuffer
     *            Where to write the optional solutions.
     */
    void outputOptionals(IBuffer<IBindingSet> outputBuffer);

    /**
     * Output the solutions buffered in the hash index. This is used when an
     * operator is building a hash index for use by a downstream operator.
     * 
     * @param out
     *            Where to write the solutions.
     */
    void outputSolutions(IBuffer<IBindingSet> out);

    /**
     * Return an {@link ICloseableIterator} that visits all solutions in the
     * index (index scan). The visited solutions MAY contain variables that
     * would not be projected out of the hash join.
     * <p>
     * Note: This is very nearly the same as {@link #outputSolutions(IBuffer)}
     * except that the latter only outputs the projected variables and it writes
     * onto an {@link IBuffer} rather than returning an
     * {@link ICloseableIterator}.
     * 
     * @return The {@link ICloseableIterator} visiting the solutions in the
     *         index.
     */
    ICloseableIterator<IBindingSet> indexScan();
    
    /**
     * Output the solutions which joined. This is used with EXISTS.
     * 
     * @param out
     *            Where to write the solutions.
     */
    void outputJoinSet(IBuffer<IBindingSet> out);
    
}