package com.bigdata.btree.filter; import java.util.Iterator; import java.util.NoSuchElementException; import org.apache.log4j.Logger; import com.bigdata.btree.ITuple; import com.bigdata.btree.ITupleCursor; import com.bigdata.btree.ITupleIterator; import com.bigdata.btree.keys.IKeyBuilder; import com.bigdata.btree.keys.StrengthEnum; import com.bigdata.btree.keys.SuccessorUtil; import com.bigdata.util.BytesUtil; import cutthecrap.utils.striterators.FilterBase; /** * <p> * Filter visits all {@link ITuple}s whose keys begin with any of the specified * prefix(s). The filer accepts a key or an array of keys that define the key * prefix(s) whose completions will be visited. It efficiently forms the * successor of each key prefix, performs a key-range scan of the key prefix, * and (if more than one key prefix is given), seeks to the start of the next * key-range scan. * </p> * <h4>WARNING</h4> * <p> * <strong>The prefix keys MUST be formed with {@link StrengthEnum#Primary}. * This is necessary in order to match all keys in the index since it causes the * secondary characteristics to NOT be included in the prefix key even if they * are present in the keys in the index.</strong> Using other * {@link StrengthEnum}s will result in secondary characteristics being encoded * by additional bytes appended to the key. This will result in scan matching * ONLY the given prefix key(s) and matching nothing if those prefix keys are * not actually present in the index. * </p> * <p> * For example, the Unicode text "Bryan" is encoded as the <em>unsigned</em> * byte[] * </p> * * <pre> * [43, 75, 89, 41, 67] * </pre> * * <p> * at PRIMARY strength but as the <em>unsigned</em> byte[] * </p> * * <pre> * [43, 75, 89, 41, 67, 1, 9, 1, 143, 8] * </pre> * * <p> * at IDENTICAL strength. The additional bytes for the IDENTICAL strength * reflect the Locale specific Unicode sort key encoding of secondary * characteristics such as case. The successor of the IDENTICAL strength byte[] * is * </p> * * <pre> * [43, 75, 89, 41, 67, 1, 9, 1, 143, 9] * </pre> * * <p> * (one was added to the last byte) which spans all keys of interest. However * the successor of the PRIMARY strength byte[] would * </p> * * <pre> * [43, 75, 89, 41, 68] * </pre> * * <p> * and would ONLY span the single tuple whose key was "Bryan". * </p> * <p> * You can form an appropriate {@link IKeyBuilder} for the prefix keys using * </p> * * <pre> * Properties properties = new Properties(); * * properties.setProperty(KeyBuilder.Options.STRENGTH, * StrengthEnum.Primary.toString()); * * prefixKeyBuilder = KeyBuilder.newUnicodeInstance(properties); * </pre> * * <p> * Note: It is NOT trivial to define filter that may be used to accept only keys * that extend the prefix on a caller-defined boundary (e.g., corresponding to * the encoding of a whitespace or word break). There are two issues: (1) the * keys are encoded so the filter needs to recognize the byte(s) in the Unicode * sort key that correspond to, e.g., the work boundary. (2) the keys may have * been encoded with secondary characteristics, in which case the boundary will * not begin immediately after the prefix. * </p> * * @todo Only pass the relevant elements of keyPrefix to any given index * partition. It is possible that an element spans the end of an index * partition, in which case the scan must resume with the next partition. * There is no real way to know this without testing the next * partition.... * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * * @see <a href="http://trac.blazegraph.com/ticket/974" > * Name2Addr.indexNameScan(prefix) uses scan + filter </a> */ public class PrefixFilter<E> extends FilterBase implements ITupleFilter<E> { protected transient static final Logger log = Logger .getLogger(PrefixFilter.class); private static final long serialVersionUID = 1828228416774862469L; /** * The array of key prefixes to be scanned. */ private final byte[][] keyPrefix; /** * Completion scan with a single prefix. The iterator will visit all tuples * having the given key prefix. * * @param keyPrefix * An unsigned byte[] containing a key prefix. */ public PrefixFilter(byte[] keyPrefix) { this(new byte[][] { keyPrefix }); } /** * Completion scan with an array of key prefixes. The iterator will visit * all tuples having the first key prefix, then all tuples having the next * key prefix, etc. until all key prefixes have been evaluated. * * @param keyPrefix * An array of unsigned byte prefixes (the elements of the array * MUST be presented in sorted order and <code>null</code>s * are not permitted). */ public PrefixFilter(byte[][] keyPrefix) { if (keyPrefix == null) throw new IllegalArgumentException(); if (keyPrefix.length == 0) throw new IllegalArgumentException(); for (int i = 0; i < keyPrefix.length; i++) { if (keyPrefix[i] == null) throw new IllegalArgumentException(); } this.keyPrefix = keyPrefix; } @SuppressWarnings("unchecked") @Override public ITupleIterator<E> filterOnce(Iterator src, Object context) { return new PrefixFilterator<E>((ITupleCursor<E>) src, context, this); } private static class PrefixFilterator<E> implements ITupleIterator<E> { /** * The source iterator. The lower bound for the source iterator should * be the first key prefix. The upper bound should be the fixed length * successor of the last key prefix (formed by adding one bit, not by * appending a <code>nul</code> byte). */ private final ITupleCursor<E> src; private final Object context; private final PrefixFilter<E> filter; /** * The index of the key prefix that is currently being scanned. The * entire scan is complete when index == keyPrefix.length. */ private int index = 0; /** * The exclusive upper bound. This is updated each time we begin to scan * another key prefix. */ protected byte[] toKey; /** The current tuple. */ private ITuple<E> current = null; /** * Completion scan. * * @param src * The source iterator. * @param filter * The filter to be applied. */ public PrefixFilterator(final ITupleCursor<E> src, final Object context, final PrefixFilter<E> filter) { if (src == null) throw new IllegalArgumentException(); if (filter == null) throw new IllegalArgumentException(); this.src = src; this.context = context; this.filter = filter; this.index = 0; nextPrefix(); } public boolean hasNext() { if (current != null) return true; /* * Find the next tuple having the same prefix. */ while (src.hasNext()) { final ITuple<E> tuple = src.next(); final byte[] key = tuple.getKey(); if (BytesUtil.compareBytes(key, toKey) >= 0) { if (log.isInfoEnabled()) log.info("Scanned beyond prefix: toKey=" + BytesUtil.toString(toKey) + ", tuple=" + tuple); if (index + 1 < filter.keyPrefix.length) { // next prefix. index++; nextPrefix(); if (current != null) { // found an exact prefix match. return true; } continue; } if(log.isInfoEnabled()) log.info("No more prefixes."); return false; } current = tuple; // found another tuple that is a completion of the current prefix. return true; } // no more tuples (at least in this index partition). if(log.isInfoEnabled()) log.info("No more tuples."); return false; } /** * Start a sub-scan of the key prefix at the current {@link #index}. */ protected void nextPrefix() { final byte[] prefix = filter.keyPrefix[index]; // make a note of the exclusive upper bound for that prefix. toKey = SuccessorUtil.successor(prefix.clone()); /* * Seek to the inclusive lower bound for that key prefix. * * Note: if we seek to a key that has a visitable tuple then that * will be the next tuple to be returned. */ current = src.seek(prefix); // current = src.tuple(); if (log.isInfoEnabled()) { log.info("index=" + index + ", prefix=" + BytesUtil.toString(prefix) + ", current=" + current); } } public ITuple<E> next() { if (!hasNext()) { throw new NoSuchElementException(); } assert current != null; final ITuple<E> t = current; current = null; return t; } public void remove() { src.remove(); } } }