PrefixFilter.java example

Explorer
blazegraph-master
- database-master
package com.bigdata.btree.filter;

import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.log4j.Logger;

import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleCursor;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.StrengthEnum;
import com.bigdata.btree.keys.SuccessorUtil;
import com.bigdata.util.BytesUtil;

import cutthecrap.utils.striterators.FilterBase;

/**
 * <p>
 * Filter visits all {@link ITuple}s whose keys begin with any of the specified
 * prefix(s). The filter accepts a key or an array of keys that define the key
 * prefix(s) whose completions will be visited. It efficiently forms the
 * successor of each key prefix, performs a key-range scan of the key prefix,
 * and (if more than one key prefix is given), seeks to the start of the next
 * key-range scan.
 * </p>
 * <h4>WARNING</h4>
 * <p>
 * <strong>The prefix keys MUST be formed with {@link StrengthEnum#Primary}.
 * This is necessary in order to match all keys in the index since it causes the
 * secondary characteristics to NOT be included in the prefix key even if they
 * are present in the keys in the index.</strong> Using other
 * {@link StrengthEnum}s will result in secondary characteristics being encoded
 * by additional bytes appended to the key. This will result in scan matching
 * ONLY the given prefix key(s) and matching nothing if those prefix keys are
 * not actually present in the index.
 * </p>
 * <p>
 * For example, the Unicode text "Bryan" is encoded as the <em>unsigned</em>
 * byte[]
 * </p>
 * 
 * <pre>
 * [43, 75, 89, 41, 67]
 * </pre>
 * 
 * <p>
 * at PRIMARY strength but as the <em>unsigned</em> byte[]
 * </p>
 * 
 * <pre>
 * [43, 75, 89, 41, 67, 1, 9, 1, 143, 8]
 * </pre>
 * 
 * <p>
 * at IDENTICAL strength. The additional bytes for the IDENTICAL strength
 * reflect the Locale specific Unicode sort key encoding of secondary
 * characteristics such as case. The successor of the IDENTICAL strength byte[]
 * is
 * </p>
 * 
 * <pre>
 * [43, 75, 89, 41, 67, 1, 9, 1, 143, 9]
 * </pre>
 * 
 * <p>
 * (one was added to the last byte) and would ONLY span the single tuple whose
 * key was "Bryan". However the successor of the PRIMARY strength byte[] would
 * be
 * </p>
 * 
 * <pre>
 * [43, 75, 89, 41, 68]
 * </pre>
 * 
 * <p>
 * which spans all keys of interest, i.e., every key that begins with the
 * PRIMARY strength encoding of "Bryan".
 * </p>
 * <p>
 * You can form an appropriate {@link IKeyBuilder} for the prefix keys using
 * </p>
 * 
 * <pre>
 * Properties properties = new Properties();
 * 
 * properties.setProperty(KeyBuilder.Options.STRENGTH,
 *         StrengthEnum.Primary.toString());
 * 
 * prefixKeyBuilder = KeyBuilder.newUnicodeInstance(properties);
 * </pre>
 * 
 * <p>
 * Note: It is NOT trivial to define a filter that may be used to accept only
 * keys that extend the prefix on a caller-defined boundary (e.g.,
 * corresponding to the encoding of a whitespace or word break). There are two
 * issues: (1) the keys are encoded so the filter needs to recognize the
 * byte(s) in the Unicode sort key that correspond to, e.g., the word boundary.
 * (2) the keys may have been encoded with secondary characteristics, in which
 * case the boundary will not begin immediately after the prefix.
 * </p>
 * 
 * @todo Only pass the relevant elements of keyPrefix to any given index
 *       partition. It is possible that an element spans the end of an index
 *       partition, in which case the scan must resume with the next partition.
 *       There is no real way to know this without testing the next
 *       partition....
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * 
 * @see <a href="http://trac.blazegraph.com/ticket/974" >
 *      Name2Addr.indexNameScan(prefix) uses scan + filter </a>
 */
public class PrefixFilter<E> extends FilterBase implements ITupleFilter<E> {

    protected static final Logger log = Logger.getLogger(PrefixFilter.class);

    private static final long serialVersionUID = 1828228416774862469L;

    /**
     * The array of key prefixes to be scanned. The array supplied to the
     * constructor is retained as-is (it is not copied), so the caller must
     * not modify it afterwards.
     */
    private final byte[][] keyPrefix;

    /**
     * Completion scan with a single prefix. The iterator will visit all tuples
     * having the given key prefix.
     * 
     * @param keyPrefix
     *            An unsigned byte[] containing a key prefix.
     * 
     * @throws IllegalArgumentException
     *             if <i>keyPrefix</i> is <code>null</code>.
     */
    public PrefixFilter(final byte[] keyPrefix) {

        this(new byte[][] { keyPrefix });

    }

    /**
     * Completion scan with an array of key prefixes. The iterator will visit
     * all tuples having the first key prefix, then all tuples having the next
     * key prefix, etc. until all key prefixes have been evaluated.
     * 
     * @param keyPrefix
     *            An array of unsigned byte prefixes (the elements of the array
     *            MUST be presented in sorted order and <code>null</code>s
     *            are not permitted). The sort order is NOT verified here.
     * 
     * @throws IllegalArgumentException
     *             if the array is <code>null</code>, is empty, or contains a
     *             <code>null</code> element.
     */
    public PrefixFilter(final byte[][] keyPrefix) {

        if (keyPrefix == null)
            throw new IllegalArgumentException();

        if (keyPrefix.length == 0)
            throw new IllegalArgumentException();

        for (int i = 0; i < keyPrefix.length; i++) {

            if (keyPrefix[i] == null)
                throw new IllegalArgumentException();

        }

        this.keyPrefix = keyPrefix;

    }

    /**
     * Wraps the source iterator with the prefix scan.
     * 
     * @param src
     *            The source iterator. It is cast to an {@link ITupleCursor}
     *            since the scan needs to seek; any other kind of iterator
     *            results in a {@link ClassCastException}.
     * @param context
     *            Passed through to the wrapping iterator.
     * 
     * @return An iterator visiting only the tuples whose keys are completions
     *         of one of the key prefixes.
     */
    @SuppressWarnings("unchecked")
    @Override
    public ITupleIterator<E> filterOnce(Iterator src, Object context) {

        return new PrefixFilterator<E>((ITupleCursor<E>) src, context, this);

    }

    /**
     * Iterator performing one sub-scan per key prefix. Each sub-scan seeks the
     * source cursor to the prefix and then reads tuples until a key is found
     * that is GTE the successor of that prefix.
     */
    private static class PrefixFilterator<E> implements ITupleIterator<E> {

        /**
         * The source iterator. The lower bound for the source iterator should
         * be the first key prefix. The upper bound should be the fixed length
         * successor of the last key prefix (formed by adding one bit, not by
         * appending a <code>nul</code> byte).
         */
        private final ITupleCursor<E> src;

        /** The context given to the filter (retained, not read here). */
        private final Object context;

        private final PrefixFilter<E> filter;

        /**
         * The index of the key prefix that is currently being scanned.
         */
        private int index = 0;

        /**
         * The exclusive upper bound. This is updated each time we begin to scan
         * another key prefix.
         */
        private byte[] toKey;

        /**
         * The tuple that will be returned by the next call to {@link #next()}
         * or <code>null</code> if no tuple has been pre-fetched.
         */
        private ITuple<E> current = null;

        /**
         * <code>true</code> iff the last operation that moved the source
         * cursor was the one that produced the tuple most recently returned by
         * {@link #next()}. Once {@link #hasNext()} reads ahead, the source is
         * positioned on a different tuple and {@link #remove()} must not be
         * delegated to it.
         */
        private boolean removable = false;

        /**
         * Completion scan. Begins the sub-scan for the first key prefix.
         * 
         * @param src
         *            The source iterator.
         * @param context
         *            The context given to the filter.
         * @param filter
         *            The filter to be applied.
         * 
         * @throws IllegalArgumentException
         *             if <i>src</i> or <i>filter</i> is <code>null</code>.
         */
        public PrefixFilterator(final ITupleCursor<E> src,
                final Object context, final PrefixFilter<E> filter) {

            if (src == null)
                throw new IllegalArgumentException();

            if (filter == null)
                throw new IllegalArgumentException();

            this.src = src;

            this.context = context;

            this.filter = filter;

            nextPrefix();

        }

        /**
         * Return <code>true</code> iff there is another tuple that is a
         * completion of the current key prefix or of one of the remaining key
         * prefixes. When no tuple is already pre-fetched this reads ahead on
         * the source cursor.
         */
        @Override
        public boolean hasNext() {

            if (current != null)
                return true;

            /*
             * Find the next tuple having the same prefix.
             */
            while (src.hasNext()) {

                final ITuple<E> tuple = src.next();

                /*
                 * The source has moved beyond the tuple last returned by
                 * next(), so src.remove() would no longer target that tuple.
                 */
                removable = false;

                final byte[] key = tuple.getKey();

                if (BytesUtil.compareBytes(key, toKey) >= 0) {

                    if (log.isInfoEnabled())
                        log.info("Scanned beyond prefix: toKey="
                                + BytesUtil.toString(toKey) + ", tuple="
                                + tuple);

                    if (index + 1 < filter.keyPrefix.length) {

                        // next prefix.
                        index++;

                        nextPrefix();

                        if (current != null) {

                            // found an exact prefix match.
                            return true;

                        }

                        /*
                         * No exact match for the new prefix. Resume reading
                         * from wherever the seek left the cursor; the tuple
                         * read next is tested against the new toKey.
                         */
                        continue;

                    }

                    if (log.isInfoEnabled())
                        log.info("No more prefixes.");

                    return false;

                }

                current = tuple;

                // found another tuple that is a completion of the current prefix.
                return true;

            }

            // no more tuples (at least in this index partition).

            if (log.isInfoEnabled())
                log.info("No more tuples.");

            return false;

        }

        /**
         * Start a sub-scan of the key prefix at the current {@link #index}.
         * Sets {@link #toKey} to the successor of the prefix and
         * {@link #current} to whatever the seek returns.
         */
        private void nextPrefix() {

            final byte[] prefix = filter.keyPrefix[index];

            /*
             * Make a note of the exclusive upper bound for that prefix. The
             * successor is computed on a copy so that the caller's prefix
             * array cannot be altered.
             */
            toKey = SuccessorUtil.successor(prefix.clone());

            /*
             * Seek to the inclusive lower bound for that key prefix.
             * 
             * Note: if we seek to a key that has a visitable tuple then that
             * will be the next tuple to be returned.
             */
            current = src.seek(prefix);

            if (log.isInfoEnabled()) {

                log.info("index=" + index + ", prefix="
                        + BytesUtil.toString(prefix) + ", current=" + current);

            }

        }

        /**
         * Return the next tuple that is a completion of one of the key
         * prefixes.
         * 
         * @throws NoSuchElementException
         *             if the scan is exhausted.
         */
        @Override
        public ITuple<E> next() {

            if (!hasNext()) {

                throw new NoSuchElementException();

            }

            assert current != null;

            final ITuple<E> t = current;

            current = null;

            // the source is still positioned on the tuple being returned.
            removable = true;

            return t;

        }

        /**
         * Delegates to the source cursor in order to remove the tuple last
         * returned by {@link #next()}.
         * <p>
         * Because {@link #hasNext()} reads ahead on the source, the request is
         * only delegated when the source has not been advanced since that
         * {@link #next()}. Otherwise the source would be asked to remove a
         * tuple that was never returned to the caller (possibly one lying
         * beyond the key prefix). Repeated calls without an intervening
         * {@link #next()} are left to the source cursor to accept or reject.
         * 
         * @throws IllegalStateException
         *             if {@link #next()} has not been called, or if
         *             {@link #hasNext()} advanced the source after the last
         *             call to {@link #next()}.
         */
        @Override
        public void remove() {

            if (!removable)
                throw new IllegalStateException(
                        "remove() must directly follow next()");

            src.remove();

        }

    }

}