DistinctSPOIterator.java example

Explorer
blazegraph-master
- database-master
package com.bigdata.rdf.spo;

import java.util.LinkedHashSet;
import java.util.NoSuchElementException;
import java.util.Set;

import com.bigdata.BigdataStatics;
import com.bigdata.btree.BTree;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.rdf.internal.IV;
import com.bigdata.rdf.store.IRawTripleStore;
import com.bigdata.util.Bytes;

import cutthecrap.utils.striterators.ICloseableIterator;

/**
 * Iterator which filters out duplicate (s,p,o) tuples from a source iterator.
 * Distinct tuples are tracked in a hash set until
 * {@link #MAX_HASH_SET_CAPACITY} distinct tuples have been seen, after which
 * newly discovered tuples are tracked in a {@link BTree}.
 * <p>
 * The context position and statement type of each visited {@link ISPO} are
 * discarded: every element returned by this iterator is a new {@link SPO}
 * built from the (s,p,o) positions only.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id: DistinctSPOIterator.java 3472 2010-08-31 16:21:47Z thompsonbry
 *          $
 * 
 * @see SPORelation#distinctSPOIterator(ICloseableIterator)
 * 
 * @deprecated By a simple distinct filter and a filter which strips off the
 *             context position from an SPOC. Stack them together and it does
 *             the same thing. (The fall back to the B+Tree might still be
 *             interesting if we do not have a persistent hash map to fall back
 *             on instead and we want streaming results. Otherwise use an
 *             external merge sort.)
 */
@Deprecated
public class DistinctSPOIterator implements ICloseableIterator<ISPO> {

    /**
     * The backing relation, which is only used to obtain the {@link BTree}
     * instance when the hash set overflows. Cleared by {@link #close()}.
     */
    private SPORelation spoRelation;

    /**
     * The source iterator. Cleared by {@link #close()}.
     */
    private ICloseableIterator<ISPO> src;

    /**
     * Hash set is allocated when the first {@link ISPO} is visited and is used
     * until the {@link #MAX_HASH_SET_CAPACITY} is reached, at which point the
     * {@link #btreeSet} is allocated.
     */
    private Set<ISPO> hashSet;

    /**
     * B+Tree is used once the {@link #MAX_HASH_SET_CAPACITY} is reached. The
     * B+Tree is slower than the {@link #hashSet}, but can spill onto the disk
     * and is appropriate for very large distinct sets.
     */
    private BTree btreeSet;

    /**
     * Buffer reused for each (s,p,o) key. The buffer is allocated when the
     * {@link #btreeSet} is allocated.
     */
    private KeyBuilder keyBuilder;

    /**
     * The next element to be visited or <code>null</code> if we need to scan
     * ahead.
     */
    private ISPO next = null;

    /**
     * <code>true</code> iff the iterator has been proven to be exhausted.
     */
    private boolean exhausted = false;

    /**
     * <code>true</code> iff the iterator has been {@link #close()}ed.
     */
    private boolean closed = false;

    /**
     * The #of distinct {@link ISPO}s read from the {@link #src} iterator so
     * far. This is a <code>long</code> so that the counter can not wrap around
     * on a very large scan.
     */
    private long ndistinct = 0L;

    /**
     * The #of {@link ISPO}s read from the {@link #src} iterator. This is a
     * <code>long</code> so that the counter can not wrap around on a very
     * large scan.
     */
    private long nscanned = 0L;

    /**
     * After this many entries we create the {@link #btreeSet} which can spill
     * out onto the disk.
     * 
     * @todo configuration parameter (via the constructor). Low memory JVMs
     *       might want to use a smaller threshold, but the hash set is much
     *       faster (10x or better). Large memory JVMs might want to use an even
     *       larger threshold. [I've set this to MAX_VALUE since the BTree is a
     *       huge performance hit.]
     */
    static final int MAX_HASH_SET_CAPACITY = Integer.MAX_VALUE;//100000;

    /**
     * 
     * @param spoRelation
     *            The backing relation, used to obtain the {@link BTree} if the
     *            hash set overflows.
     * @param src
     *            The source iterator.
     * 
     * @throws IllegalArgumentException
     *             if either argument is <code>null</code>.
     */
    public DistinctSPOIterator(final SPORelation spoRelation,
            final ICloseableIterator<ISPO> src) {

        if (spoRelation == null)
            throw new IllegalArgumentException("spoRelation is null");

        if (src == null)
            throw new IllegalArgumentException("src is null");

        this.spoRelation = spoRelation;

        this.src = src;

    }

    /**
     * Closes the source iterator and the {@link BTree} (if one was allocated)
     * and releases all references held by this iterator. Only the first call
     * has any effect.
     * <p>
     * The B+Tree is closed and the references are cleared even if closing the
     * source iterator throws. Since {@link #closed} is set before anything
     * else is done, a second call could never finish an interrupted cleanup,
     * so the cleanup must be completed on the first call.
     */
    public void close() {

        if (closed)
            return;

        closed = true;

        try {

            /*
             * Close the source iterator.
             */

            src.close();

        } finally {

            try {

                /*
                 * Close the btree. This will discard all of its buffers.
                 * 
                 * Note: if this throws as well, then this exception replaces
                 * the one thrown when closing the source iterator.
                 */

                if (btreeSet != null) {

                    btreeSet.close();

                }

            } finally {

                // Always release the references.

                spoRelation = null;

                src = null;

                hashSet = null;

                btreeSet = null;

                keyBuilder = null;

            }

        }

    }

    /**
     * Returns immediately if there is an element waiting. Otherwise, scans
     * ahead until it finds an element which has not already been visited. It
     * then adds the element to the set of elements already seen and saves a
     * reference to that element to be returned by {@link #next()}.
     * 
     * @return <code>true</code> iff another distinct element is available.
     *         Always <code>false</code> once the iterator has been exhausted
     *         or {@link #close()}d.
     */
    public boolean hasNext() {

        if (exhausted || closed)
            return false;

        if (next != null)
            return true;

        if (hashSet == null) {

            /*
             * Allocate hash set.
             * 
             * Note: using a linked hash set for faster iterator if we have to
             * convert to a B+Tree.
             * 
             * Note: the initial capacity is the default since most access paths
             * have low cardinality.
             * 
             * @todo if the caller knows the range count (upper bound) then we
             * could plan the hash set capacity more accurately.
             * 
             * @todo defer hashSet creation until 2nd distinct SPO shows up to
             * reduce the object allocation.
             */

            hashSet = new LinkedHashSet<ISPO>();

        } else if (btreeSet == null && ndistinct >= MAX_HASH_SET_CAPACITY) {

            /*
             * Open B+Tree. We will not put anything new into the hashSet,
             * but we will continue to test first against the hashSet and
             * then against the B+Tree. New distinct ISPOs are inserted into
             * the B+Tree.
             */

            if (BigdataStatics.debug)
                System.err.println("Distinct SPO iterator overflow");

            // allocate buffer.
            keyBuilder = new KeyBuilder(3 * Bytes.SIZEOF_LONG);

            // allocate B+Tree w/ bloom filter.
            btreeSet = spoRelation.getSPOOnlyBTree(true/* bloomFilter */);

        }

        // scan for the next distinct ISPO from the src iterator.
        return _hasNext();

    }

    /**
     * Scan for the next distinct {@link ISPO} from the src iterator and set it
     * on {@link #next}. If the source iterator runs dry without producing a
     * new distinct element then the iterator is marked as {@link #exhausted}.
     * 
     * @return <code>true</code> if another distinct {@link ISPO} was found.
     */
    private boolean _hasNext() {

        while (next == null && src.hasNext()) {

            /*
             * Read another ISPO from the iterator and strip off the context
             * position.
             * 
             * Note: distinct is enforced on (s,p,o). By stripping off the
             * context and statement type information first, we ensure that
             * (s,p,o) duplicates will be recognized as such.
             * 
             * Note: this approach requires us to discard the statement type
             * metadata.
             */

            // read next from the source iterator.
            ISPO tmp = src.next();

            nscanned++;

            // strip off the context (and statement type).
            tmp = new SPO(tmp.s(), tmp.p(), tmp.o(), (IV) null/* c */);

            if (btreeSet == null) {

                // Insert into the hash set.

                if (!hashSet.add(tmp)) {

                    // duplicate, keep scanning.
                    continue;

                }

            } else {

                // First, test the hash set.
                if (hashSet.contains(tmp)) {

                    // duplicate, keep scanning.
                    continue;

                }

                // Next, test the B+Tree.
                final byte[] key = SPOKeyOrder.SPO.encodeKey(keyBuilder, tmp);

                if (btreeSet.contains(key)) {

                    // duplicate, keep scanning.
                    continue;

                }

                // Finally, insert into the B+Tree.
                btreeSet.insert(key, null);

            }

            // found a new distinct spo.
            next = tmp;

            ndistinct++;

        } // while(...)

        if (next == null) {

            exhausted = true;

            return false;

        }

        return true;

    }

    /**
     * Returns the next distinct (s,p,o) tuple.
     * 
     * @return The next distinct element, with the context position stripped
     *         off.
     * 
     * @throws NoSuchElementException
     *             if {@link #hasNext()} reports that no element is available
     *             (exhausted or closed).
     */
    public ISPO next() {

        if (!hasNext())
            throw new NoSuchElementException();

        assert next != null;

        final ISPO tmp = next;

        // force a scan ahead on the next request.
        next = null;

        return tmp;

    }

    /**
     * Removal is not supported.
     * 
     * @throws UnsupportedOperationException
     *             always.
     */
    public void remove() {

        throw new UnsupportedOperationException();

    }

}