/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Nov 14, 2011
 */

package com.bigdata.bop.join;

import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.FutureTask;

import org.apache.log4j.Logger;

import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpEvaluationContext;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IQueryAttributes;
import com.bigdata.bop.IShardwisePipelineOp;
import com.bigdata.bop.IVariable;
import com.bigdata.bop.NV;
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.controller.INamedSolutionSetRef;
import com.bigdata.bop.controller.NamedSetAnnotations;
import com.bigdata.relation.IRelation;
import com.bigdata.relation.accesspath.AbstractUnsynchronizedArrayBuffer;
import com.bigdata.relation.accesspath.IAccessPath;
import com.bigdata.relation.accesspath.IBindingSetAccessPath;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.UnsyncLocalOutputBuffer;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * Abstract base class for both JVM and native memory hash joins against an
 * {@link IAccessPath}. The source solutions from the pipeline are buffered on a
 * hash index. Depending on the implementation, the hash index may have a
 * threshold that will trigger an evaluation pass of the hash join. If not, then
 * the hash join will run exactly once. When the hash join runs, the access path
 * is scanned and the hash index (of intermediate solutions from the pipeline)
 * is probed for each solution read from the {@link IAccessPath}. Solutions
 * which join are output.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
abstract public class HashJoinOp<E> extends PipelineOp implements
        IShardwisePipelineOp<E> {

    static private final transient Logger log = Logger
            .getLogger(HashJoinOp.class);

    private static final long serialVersionUID = 1L;

    public interface Annotations extends AccessPathJoinAnnotations,
            NamedSetAnnotations, HashJoinAnnotations {

    }

    /**
     * Deep copy constructor.
     * 
     * @param op
     */
    public HashJoinOp(final HashJoinOp<E> op) {

        super(op);

    }

    public HashJoinOp(final BOp[] args, final NV... annotations) {

        this(args, NV.asMap(annotations));

    }

    /**
     * Shallow copy constructor.
     * 
     * @param args
     * @param annotations
     */
    public HashJoinOp(final BOp[] args, final Map<String, Object> annotations) {

        super(args, annotations);

        /*
         * Validate common requirements for all concrete implementations of
         * this operator.
         */

        switch (getEvaluationContext()) {
        case CONTROLLER:
        case SHARDED:
        case HASHED:
            break;
        default:
            throw new UnsupportedOperationException(
                    Annotations.EVALUATION_CONTEXT + "="
                            + getEvaluationContext());
        }

        // Predicate for the access path must be specified.
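        // Note: getPredicate() resolves Annotations.PREDICATE via
        // getRequiredProperty(), so a missing predicate fails fast here in the
        // constructor rather than at operator evaluation time.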
        getPredicate();

        getRequiredProperty(Annotations.NAMED_SET_REF);

        // Join variables must be specified.
        final IVariable<?>[] joinVars = (IVariable[]) getRequiredProperty(Annotations.JOIN_VARS);

//        if (joinVars.length == 0)
//            throw new IllegalArgumentException(Annotations.JOIN_VARS);

        for (IVariable<?> var : joinVars) {

            if (var == null)
                throw new IllegalArgumentException(Annotations.JOIN_VARS);

        }

    }

    /**
     * {@inheritDoc}
     * 
     * @see Annotations#PREDICATE
     */
    @Override
    @SuppressWarnings("unchecked")
    public IPredicate<E> getPredicate() {

        return (IPredicate<E>) getRequiredProperty(Annotations.PREDICATE);

    }

    /**
     * Return <code>true</code> iff the predicate associated with the join is
     * optional.
     * 
     * @see IPredicate.Annotations#OPTIONAL
     */
    protected boolean isOptional() {

        return getPredicate().isOptional();

    }

    @Override
    public BaseJoinStats newStats() {

        return new BaseJoinStats();

    }

    /**
     * Return the instance of the {@link IHashJoinUtility} to be used by this
     * operator. This method is invoked once, the first time this operator is
     * evaluated. The returned {@link IHashJoinUtility} reference is attached
     * to the {@link IQueryAttributes} and accessed there on subsequent
     * evaluation passes for this operator.
     * 
     * @param context
     *            The operator evaluation context.
     * @param namedSetRef
     *            Metadata to identify the named solution set.
     * @param joinType
     *            The type of join.
     */
    abstract protected IHashJoinUtility newState(//
            final BOpContext<IBindingSet> context,//
            final INamedSolutionSetRef namedSetRef, //
            final JoinTypeEnum joinType//
    );

    /**
     * Return <code>true</code> if {@link ChunkTask#doHashJoin()} should be
     * executed in a given {@link ChunkTask} invocation of this operator.
     * 
     * @param context
     *            The operator evaluation context.
     * @param state
     *            The {@link IHashJoinUtility} instance.
     */
    abstract protected boolean runHashJoin(final BOpContext<?> context,
            final IHashJoinUtility state);

    @Override
    public FutureTask<Void> eval(final BOpContext<IBindingSet> context) {

        return new FutureTask<Void>(new ChunkTask<E>(context, this));

    }

    /**
     * Task executing on the node.
     */
    private static class ChunkTask<E> implements Callable<Void> {

        private final BOpContext<IBindingSet> context;

        private final HashJoinOp<E> op;

        private final IRelation<E> relation;

        private final IPredicate<E> pred;

        private final BaseJoinStats stats;

        private final IHashJoinUtility state;

        private final IBlockingBuffer<IBindingSet[]> sink;

        private final IBlockingBuffer<IBindingSet[]> sink2;

        public ChunkTask(final BOpContext<IBindingSet> context,
                final HashJoinOp<E> op) {

            this.context = context;

            this.stats = (BaseJoinStats) context.getStats();

            this.pred = op.getPredicate();

            this.relation = context.getRelation(pred);

            this.sink = context.getSink();

            this.sink2 = context.getSink2();

            this.op = op;

            {

                /*
                 * First, see if the map already exists.
                 * 
                 * Note: Since the operator is not thread-safe, we do not need
                 * to use a putIfAbsent pattern here.
                 * 
                 * Note: Publishing the [state] as a query attribute provides
                 * visibility into the hash join against the access path even
                 * for implementations (such as the JVMHashJoinOp) where the
                 * entire operation will occur within a single evaluation pass.
                 */

                final INamedSolutionSetRef namedSetRef = (INamedSolutionSetRef) op
                        .getRequiredProperty(Annotations.NAMED_SET_REF);

                /*
                 * Look up the attributes for the query on which we will hang
                 * the solution set. See BLZG-1493 (if queryId is null, use the
                 * query attributes for this running query).
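                 * 
                 * Note: On the first evaluation pass the attribute is not yet
                 * bound, so newState() is invoked below and the resulting
                 * IHashJoinUtility is cached on the query attributes; later
                 * passes of this operator reuse the same instance.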
                 */

                final IQueryAttributes attrs = context
                        .getQueryAttributes(namedSetRef.getQueryId());

                IHashJoinUtility state = (IHashJoinUtility) attrs
                        .get(namedSetRef);

                if (state == null) {

                    state = op.newState(context, namedSetRef,
                            op.isOptional() ? JoinTypeEnum.Optional
                                    : JoinTypeEnum.Normal);

                    attrs.put(namedSetRef, state);

                }

                this.state = state;

            }

        }

        @Override
        public Void call() throws Exception {

            boolean didRun = false;

            try {

                acceptSolutions();

                if (op.runHashJoin(context, state)) {

                    didRun = true;

                    doHashJoin();

                }

                // Done.
                return null;

            } finally {

                if (didRun) {

                    /*
                     * The state needs to be released each time this operator
                     * runs in order to discard the intermediate solutions
                     * buffered on the hash index that were just joined against
                     * the access path. If we do not discard the state after
                     * processing the intermediate solutions, then they will
                     * continue to accumulate and we will over-report joins
                     * (duplicate solutions will be output for things already
                     * in the hash index the next time we evaluate the hash
                     * join against the access path).
                     */

                    state.release();

                }

                sink.close();

                if (sink2 != null)
                    sink2.close();

            }

        }

        /**
         * Buffer the intermediate solutions from the pipeline on the hash
         * index.
         */
        private void acceptSolutions() {

            state.acceptSolutions(context.getSource(), stats);

        }

        /**
         * Return the access path to be scanned. Solutions read from this
         * access path will be used to probe the hash index to identify
         * solutions that can join.
         */
        private IBindingSetAccessPath<?> getAccessPath() {

            return (IBindingSetAccessPath<?>) context.getAccessPath(relation,
                    pred);

        }

        /**
         * Do a hash join of the buffered solutions with the access path.
         */
        private void doHashJoin() {

            if (state.isEmpty())
                return;

            final IBindingSetAccessPath<?> accessPath = getAccessPath();

            if (log.isInfoEnabled())
                log.info("accessPath=" + accessPath);

            stats.accessPathCount.increment();

            stats.accessPathRangeCount.add(accessPath
                    .rangeCount(false/* exact */));

            final UnsyncLocalOutputBuffer<IBindingSet> unsyncBuffer = new UnsyncLocalOutputBuffer<IBindingSet>(
                    op.getChunkCapacity(), sink);

            final long cutoffLimit = pred.getProperty(
                    IPredicate.Annotations.CUTOFF_LIMIT,
                    IPredicate.Annotations.DEFAULT_CUTOFF_LIMIT);

            // Obtain the iterator for the current join dimension.
            final ICloseableIterator<IBindingSet[]> itr = accessPath
                    .solutions(context, cutoffLimit, stats);

            /*
             * Note: The [stats] are NOT passed in here since the chunksIn and
             * unitsIn were updated when the pipeline solutions were accepted
             * into the hash index. If we passed in stats here, they would be
             * double counted when we executed the hash join against the
             * access path.
             */
            state.hashJoin(//
                    itr,// left
                    null,// stats
                    unsyncBuffer// out
                    );

            switch (state.getJoinType()) {
            case Normal:
                /*
                 * Nothing to do.
                 */
                break;
            case Optional:
            case NotExists: {

                /*
                 * Output the optional solutions.
                 */

                // where to write the optional solutions.
                final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncBuffer2 = sink2 == null ? unsyncBuffer
                        : new UnsyncLocalOutputBuffer<IBindingSet>(
                                op.getChunkCapacity(), sink2);

                state.outputOptionals(unsyncBuffer2);

                unsyncBuffer2.flush();

                if (sink2 != null)
                    sink2.flush();

                break;

            }
            case Exists: {

                /*
                 * Output the join set.
                 */

                state.outputJoinSet(unsyncBuffer);

                break;

            }
            default:
                throw new AssertionError();
            }

            unsyncBuffer.flush();

            sink.flush();

        }

    } // class ChunkTask

}
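/*
 * Illustrative sketch (not part of the class above): roughly how a concrete
 * subclass such as the JVMHashJoinOp mentioned in the comments might be
 * configured with the annotations that HashJoinOp validates in its
 * constructor. The bopId, variable name, [upstreamOp], [predicate], and
 * [namedSetRef] values are hypothetical placeholders, and the subclass is
 * assumed to expose the same (BOp[], NV...) constructor as HashJoinOp.
 *
 *     final IVariable<?> x = Var.var("x");
 *
 *     final PipelineOp join = new JVMHashJoinOp<ISPO>(
 *             new BOp[] { upstreamOp },
 *             new NV(BOp.Annotations.BOP_ID, 3),
 *             new NV(Annotations.PREDICATE, predicate),
 *             new NV(Annotations.JOIN_VARS, new IVariable[] { x }),
 *             new NV(Annotations.NAMED_SET_REF, namedSetRef),
 *             new NV(Annotations.EVALUATION_CONTEXT,
 *                     BOpEvaluationContext.CONTROLLER));
 */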