BytesUtil.java example

Explorer
blazegraph-master
- database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package com.bigdata.util;

import it.unimi.dsi.fastutil.bytes.custom.CustomByteArrayFrontCodedList;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;

import java.nio.ByteBuffer;
import java.util.Comparator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

/**
 * Class supporting operations on variable length byte[] keys.
 * <p>
 * Comparison operations that accept a starting offset are used when the byte[]s
 * are known to share a leading prefix that may be skipped during comparison.
 * <p>
 * Comparison operations that accept a starting offset and length are used when
 * immutable keys are stored in a single byte[] and an index into starting
 * positions in that byte[] is maintained.
 * <p>
 * JNI methods are provided for unsigned byte[] comparison. However, note that
 * the JNI methods do not appear to be as fast as the pure Java methods -
 * presumably because of the overhead of going from Java to C. In order to
 * execute using the JNI methods you MUST define the optional boolean system
 * property, e.g.,
 * 
 * <pre>
 * java -Dcom.bigdata.btree.BytesUtil.jni=true ...
 * </pre>
 * 
 * See BytesUtil.c in this package for instructions on compiling the JNI
 * methods.
 * </p>
 * See {@link #main(String[])} which provides a test for the JNI integration and
 * some pointers on how to get this running on your platform.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class BytesUtil {

	/**
	 * Logger used to report the outcome of JNI linking and, at DEBUG, keys
	 * rejected by {@link #rangeCheck(byte[], byte[], byte[])}.
	 */
	private static final transient Logger log = Logger.getLogger(BytesUtil.class);

	/**
	 * An empty <code>byte[]</code>.
	 */
	public static final byte[] EMPTY = new byte[0];

	/**
	 * An empty <code>byte[][]</code>.
	 */
	public static final byte[][] EMPTY2 = new byte[0][];

	/**
	 * Flag set iff JNI linking succeeds. When this flag is false we run with
	 * the pure Java implementations of these methods. When the flag is true,
	 * the JNI versions are used (for sufficiently long keys, see
	 * {@link #minlen}). The flag is written by {@link #loadJNILibrary()}.
	 */
	static boolean linked = false;

	/**
	 * JNI routines are not invoked unless both byte[]s being compared have
	 * more than this many potential bytes to compare (the actual # may be
	 * much less of course since comparisons may fail fast).
	 */
	static public final int minlen = 100;

	/**
	 * Native counterpart of {@link #compareBytes(byte[], byte[])}. It is only
	 * invoked once the JNI library is {@link #linked}.
	 * <p>
	 * NOTE(review): the native implementation (BytesUtil.c) is not visible
	 * here; it is presumed to honor the same ordering contract as the pure
	 * Java code path - confirm against the C source.
	 */
	static private native int _compareBytes(int alen, byte[] a, int blen, byte[] b);

	/**
	 * Native counterpart of
	 * {@link #compareBytesWithLenAndOffset(int, int, byte[], int, int, byte[])}.
	 * It is only invoked once the JNI library is {@link #linked}.
	 * <p>
	 * NOTE(review): presumed to honor the same ordering contract as the pure
	 * Java code path - confirm against BytesUtil.c.
	 */
	static private native int _compareBytesWithOffsetAndLen(int aoff, int alen, byte[] a, int boff, int blen, byte[] b);

	static {

		/*
		 * JNI is strictly opt-in: the library is only loaded when the system
		 * property is defined and parses as "true". By default we do not even
		 * try to link.
		 */
		final String requested = System.getProperty("com.bigdata.btree.BytesUtil.jni");

		final boolean useJni = requested != null && Boolean.parseBoolean(requested);

		if (useJni) {

			loadJNILibrary();

		}

	}

	/**
	 * Attempt to load the JNI library.
	 * <p>
	 * Note: this is done automatically if the optional boolean system property
	 * <code>com.bigdata.btree.BytesUtil.jni=true</code> is specified, e.g.,
	 * using
	 * 
	 * <pre>
	 *    java -Dcom.bigdata.btree.BytesUtil.jni=true ...
	 * </pre>
	 * 
	 * @return True iff the JNI library was successfully linked.
	 */
	public static boolean loadJNILibrary() {

		// Already linked by an earlier call: nothing more to do.
		if (linked)
			return true;

		try {

			System.loadLibrary("BytesUtil");

			if (log.isInfoEnabled())
				log.info("BytesUtil JNI linked");

			linked = true;

		} catch (UnsatisfiedLinkError ex) {

			// Not fatal: callers fall back onto the pure Java code paths.
			log.warn("BytesUtil JNI NOT linked: " + ex);

			linked = false;

		}

		return linked;

	}

	/**
	 * True iff the two arrays compare as equal. This is somewhat optimized in
	 * that it tests the array lengths first, assumes that it is being used on
	 * sorted data and therefore compares the last bytes first, and does not
	 * convert the bytes to unsigned integers before testing for equality.
	 * <p>
	 * The method is <code>null</code>-safe: two <code>null</code> references
	 * are equal and a <code>null</code> reference is never equal to a
	 * non-<code>null</code> array.
	 * 
	 * @param a
	 *            A byte[] (may be <code>null</code>).
	 * @param b
	 *            Another byte[] (may be <code>null</code>).
	 * 
	 * @return <code>true</code> if the two arrays have the same reference
	 *         (including <code>null</code>) or if they have the same length
	 *         and the same data; <code>false</code> otherwise, including
	 *         when exactly one of the arguments is <code>null</code>.
	 */
	final public static boolean bytesEqual(final byte[] a, final byte[] b) {

		if (a == b)
			return true;

		// Exactly one reference is null (both null was handled above).
		if (a == null || b == null)
			return false;

		if (a.length != b.length)
			return false;

		/*
		 * Scan from the end: keys drawn from sorted data tend to share a
		 * leading prefix, so a difference is more likely found in the tail.
		 */
		for (int i = a.length - 1; i >= 0; i--) {

			if (a[i] != b[i])
				return false;

		}

		return true;

	}

	/**
	 * Byte-wise comparison of byte[]s (the arrays are treated as arrays of
	 * unsigned bytes).
	 * 
	 * @param a
	 *            A byte[].
	 * 
	 * @param b
	 *            A byte[].
	 * 
	 * @return a negative integer, zero, or a positive integer if the first
	 *         argument is less than, equal to, or greater than the second.
	 * 
	 * @todo Return the index of the byte at which the difference with the sign
	 *       adjusted to indicate the relative order of the data rather than the
	 *       difference of the bytes at that index. The index would be negative
	 *       or positive depending on which way the comparison went. See
	 *       {@link CustomByteArrayFrontCodedList} for an implementation
	 *       guideline.
	 *       <p>
	 *       Change all implementations in this class and also BytesUtil.c,
	 *       which needs to be recompiled for Windows. Also makes sure that it
	 *       gets compiled and linked for Un*x. That should be tested from the
	 *       ant installer and the result reported. Do the same for ICU4JNI.
	 */
	final public static int compareBytes(final byte[] a, final byte[] b) {

		// Same reference, which also covers the case where both are null.
		if (a == b)
			return 0;

		// A null array orders before any non-null array.
		if (a == null)
			return -1;

		if (b == null)
			return 1;

		final int alen = a.length;

		final int blen = b.length;

		// The JNI code path is only taken when linked and both keys are long.
		final boolean useJni = linked && alen > minlen && blen > minlen;

		if (useJni)
			return _compareBytes(alen, a, blen, b);

		// Only the overlapping region can be compared byte by byte.
		final int overlap = Math.min(alen, blen);

		for (int i = 0; i < overlap; i++) {

			// widen each byte to its unsigned value in [0:255].
			final int ua = a[i] & 0xff;

			final int ub = b[i] & 0xff;

			if (ua != ub)
				return ua - ub;

		}

		// One array is a prefix of the other: the shorter one orders first.
		return alen - blen;

	}

	// /**
	// * Byte-wise comparison of a {@link ByteBuffer} and a byte[]. The data are
	// * treated as arrays of unsigned bytes. The {@link ByteBuffer} position,
	// * limit and mark are unchanged by this procedure.
	// *
	// * @param a
	// * A {@link ByteBuffer}.
	// * @param aoff
	// * The offset of the starting byte in the buffer.
	// * @param blen
	// * The number of bytes to be compared.
	// * @param b
	// * A byte[].
	// *
	// * @return a negative integer, zero, or a positive integer if the first
	// * argument is less than, equal to, or greater than the second.
	// */
	// final public static int compareBytes(final ByteBuffer a, final int aoff,
	// final int alen, final byte[] b) {
	// final int blen = b.length;
	// for (int i = 0; i < alen && i < blen; i++) {
	// // promotes to signed integers in [0:255] for comparison.
	// final int ret = (a.get(aoff + i) & 0xff) - (b[i] & 0xff);
	// if (ret != 0)
	// return ret;
	// }
	// return alen - blen;
	// }

	// /**
	// * Byte-wise comparison of byte[]s (the arrays are treated as arrays of
	// * unsigned bytes).
	// *
	// * @param aoff
	// * The offset into <i>a</i> at which the comparison will
	// * begin.
	// * @param a
	// * A byte[].
	// * @param boff
	// * The offset into <i>b</i> at which the comparison will
	// * begin.
	// * @param b
	// * A byte[].
	// *
	// * @return a negative integer, zero, or a positive integer as the first
	// * argument is less than, equal to, or greater than the second.
	// */
	// final public static int compareBytes(int aoff, final byte[] a, int boff,
	// final byte[] b) {
	// final int alen = a.length;
	// final int blen = b.length;
	// for (int i = aoff, j = boff; i < alen && j < blen; i++, j++) {
	// // promotes to signed integers in [0:255] for comaprison.
	// int ret = (a[i] & 0xff) - (b[j] & 0xff);
	// // int ret = a[i] - b[j];
	// if (ret != 0)
	// return ret;
	// }
	// return (alen - aoff) - (blen - boff);
	// }

	/**
	 * Byte-wise comparison of byte[]s (the arrays are treated as arrays of
	 * unsigned bytes).
	 * 
	 * @param aoff
	 *            The offset into <i>a</i> at which the comparison will begin.
	 * @param alen
	 *            The #of bytes in <i>a</i> to consider starting at <i>aoff</i>.
	 * @param a
	 *            A byte[].
	 * @param boff
	 *            The offset into <i>b</i> at which the comparison will begin.
	 * @param blen
	 *            The #of bytes in <i>b</i> to consider starting at <i>boff</i>.
	 * @param b
	 *            A byte[].
	 * 
	 * @return a negative integer, zero, or a positive integer as the first
	 *         argument is less than, equal to, or greater than the second.
	 */
	final public static int compareBytesWithLenAndOffset(//
			int aoff, int alen, final byte[] a,//
			int boff, int blen, final byte[] b//
	) {

		// The JNI code path is only taken when linked and both slices are long.
		final boolean useJni = linked && alen > minlen && blen > minlen;

		if (useJni)
			return _compareBytesWithOffsetAndLen(aoff, alen, a, boff, blen, b);

		// exclusive upper bounds of the two slices.
		final int aend = aoff + alen;

		final int bend = boff + blen;

		int i = aoff;

		int j = boff;

		while (i < aend && j < bend) {

			// compare as unsigned values in [0:255].
			final int delta = (a[i++] & 0xff) - (b[j++] & 0xff);

			if (delta != 0)
				return delta;

		}

		// One slice is a prefix of the other: the shorter one orders first.
		return alen - blen;

	}

	/**
	 * Return the #of leading bytes in common. This is used to compute the
	 * prefix for a node or leaf, which is formed by the leading bytes in common
	 * between the first and last key for a node or leaf.
	 * 
	 * @param a
	 *            A variable length unsigned byte array.
	 * @param b
	 *            A variable length unsigned byte array.
	 * 
	 * @return The #of leading bytes in common (aka the index of the first byte
	 *         in which the two arrays differ, although that index could lie
	 *         beyond the end of one of the arrays).
	 */
	public final static int getPrefixLength(final byte[] a, final byte[] b) {

		// The shared prefix can not be longer than the shorter array.
		final int limit = Math.min(a.length, b.length);

		int n = 0;

		// advance while the arrays still agree.
		while (n < limit && a[n] == b[n]) {

			n++;

		}

		return n;

	}

	/**
	 * Return a new byte[] containing the leading bytes in common between two
	 * byte[]s. This is often used to compute the minimum length separator key.
	 * 
	 * @param a
	 *            A variable length unsigned byte array[].
	 * @param b
	 *            A variable length unsigned byte array[].
	 * 
	 * @return A new byte[] containing the leading bytes in common between the
	 *         two arrays.
	 */
	public final static byte[] getPrefix(final byte[] a, final byte[] b) {

		final int sharedLen = getPrefixLength(a, b);

		final byte[] shared = new byte[sharedLen];

		// Both arrays agree on these bytes, so either one may be the source.
		System.arraycopy(a, 0, shared, 0, sharedLen);

		return shared;

	}

	/**
	 * Computes the successor of a variable length byte array by appending a
	 * unsigned zero(0) byte to the end of the array.
	 * 
	 * @param key
	 *            A variable length unsigned byte array.
	 * 
	 * @return A new unsigned byte[] that is the successor of the key.
	 */
	public final static byte[] successor(final byte[] key) {

		final int n = key.length;

		/*
		 * A freshly allocated array is zero filled, so the extra trailing slot
		 * already holds the appended unsigned zero byte.
		 */
		final byte[] next = new byte[n + 1];

		System.arraycopy(key, 0, next, 0, n);

		return next;

	}

	/**
	 * <p>
	 * The keys in the nodes of a btree are known as <i>separator keys</i>. The
	 * role of the separator keys is to direct search towards the leaf in which
	 * a key exists or would exist by always searching the first child having a
	 * separator key that is greater than or equal to the search key.
	 * </p>
	 * <p>
	 * Separator keys separate leaves and must be chosen with that purpose in
	 * mind. The simplest way to choose the separator key is to just take the
	 * first key of the leaf - this is always correct. However, shorter
	 * separator keys may be chosen by defining the separator key as the
	 * shortest key that is less than or equal to the first key of a leaf and
	 * greater than the last key of the left sibling of that leaf (that is, the
	 * key for the entry that immediately precedes the first entry on the leaf).
	 * </p>
	 * <p>
	 * There are several advantages to always choosing the shortest separator
	 * key. The original rationale (in "Prefix <i>B</i>-Trees" by Bayer and
	 * Unterauer) was to increase the branching factors for fixed size pages.
	 * Since we use variable size serialized record, that is not an issue.
	 * However, using the shortest separator keys in this implementation
	 * provides both smaller serialized records for nodes and faster search
	 * since fewer bytes must be tested.
	 * </p>
	 * <p>
	 * Note that this trick can not be used at higher levels in the btree -
	 * separator keys are always formed based on the keys in the leaves and then
	 * propagated through the tree.
	 * </p>
	 * <p>
	 * The rules are simple enough:
	 * <ol>
	 * <li>The separator contains all bytes in the shared prefix (if any) plus
	 * the 1st byte at which the given key differs from the prior key.</li>
	 * <li>If the separator key would equal the given key by value then return
	 * the reference to the given key.</li>
	 * </ol>
	 * </p>
	 * 
	 * @param givenKey
	 *            A key.
	 * 
	 * @param priorKey
	 *            Another key that <em>precedes</em> the <i>givenKey</i>.
	 * 
	 * @return The shortest key that is less than or equal to <i>givenKey</i>
	 *         and greater than <i>priorKey</i>. This will be a reference to the
	 *         <i>givenKey</i> iff that is also the shortest separator.
	 * 
	 * @see http://portal.acm.org/citation.cfm?doid=320521.320530
	 * 
	 * @throws IllegalArgumentException
	 *             if either key is <code>null</code>.
	 * @throws IllegalArgumentException
	 *             if both keys are the same reference.
	 */
	// * @throws IllegalArgumentException
	// * if the keys are equal.
	// * @throws IllegalArgumentException
	// * if the keys are out of order.
	final public static byte[] getSeparatorKey(final byte[] givenKey, final byte[] priorKey) {

		// Both keys are required and they must be distinct references.
		if (givenKey == null || priorKey == null || givenKey == priorKey)
			throw new IllegalArgumentException();

		/*
		 * The separator is the shared prefix plus the first byte of the given
		 * key which follows that prefix.
		 */
		final int separatorLen = getPrefixLength(givenKey, priorKey) + 1;

		if (separatorLen == givenKey.length) {

			/*
			 * The given key is already the shortest separator, so its
			 * reference is returned rather than a copy. For example:
			 * 
			 * given: 0 1 2 prior: 0 1
			 * 
			 * given: 0 1 2 prior: 0 1 1
			 * 
			 * given: 0 1 2 prior: 0 1 1 2
			 */

			return givenKey;

		}

		/*
		 * Note: the keys are not checked for ordering. If the given key is
		 * equal to, or a prefix of, the prior key then there is no such
		 * following byte and the copy below fails with an
		 * IndexOutOfBoundsException.
		 */
		final byte[] separator = new byte[separatorLen];

		System.arraycopy(givenKey, 0, separator, 0, separatorLen);

		return separator;

	}

	/**
	 * Formats a key as a series of comma delimited unsigned bytes.
	 * 
	 * @param key
	 *            The key.
	 * 
	 * @return The string representation of the array as unsigned bytes.
	 */
	final public static String toString(final byte[] key) {

		// null-safe: otherwise format the whole array via the ranged variant.
		return key == null ? NULL : toString(key, 0, key.length);

	}

	/**
	 * Formats a slice of a key as a series of comma delimited unsigned bytes,
	 * e.g., <code>[1,2,255]</code>.
	 * 
	 * @param key
	 *            The key (may be <code>null</code>).
	 * @param off
	 *            The index of the first byte that will be visited.
	 * @param len
	 *            The #of bytes to visit.
	 * 
	 * @return The string representation of the slice as unsigned bytes -or-
	 *         the string <code>null</code> if the <i>key</i> is
	 *         <code>null</code>.
	 */
	final public static String toString(final byte[] key, final int off, final int len) {

		if (key == null)
			return NULL;

		// up to 3 digits plus a delimiter per byte, plus the two brackets.
		final StringBuilder sb = new StringBuilder(len * 4 + 2);

		sb.append("[");

		final int limit = off + len;

		for (int i = off; i < limit; i++) {

			/*
			 * The delimiter precedes every byte except the first one visited.
			 * The test is made against [off] (not ZERO) so that a slice which
			 * starts inside of the array is not rendered with a leading comma.
			 */
			if (i > off)
				sb.append(",");

			// as an unsigned integer.
			sb.append(Integer.toString(key[i] & 0xff));

		}

		sb.append("]");

		return sb.toString();

	}

	/**
	 * The text emitted for a <code>null</code> key.
	 */
	private static final String NULL = "null";

	/**
	 * Formats the data into a {@link String}.
	 * 
	 * @param data
	 *            An array of unsigned byte arrays.
	 */
	static public String toString(final byte[][] data) {

		final int n = data.length;

		final StringBuilder sb = new StringBuilder();

		// header reporting the #of keys.
		sb.append("data(n=").append(n).append(")={");

		for (int i = 0; i < n; i++) {

			// each key is written on its own line and labeled with its index.
			sb.append("\n").append("data[").append(i).append("]=");

			sb.append(BytesUtil.toString(data[i]));

			// every entry except the last is followed by a delimiter.
			final boolean last = (i + 1 == n);

			if (!last)
				sb.append(",");

		}

		sb.append("}");

		return sb.toString();

	}

	/**
	 * Binary search on an array whose members are variable length unsigned
	 * byte[]s.
	 * 
	 * @param keys
	 *            The buffer.
	 * @param base
	 *            The offset of the base of the array within the buffer.
	 * @param nmem
	 *            The #of members in the array. When [nmem == 0], the array is
	 *            empty.
	 * @param key
	 *            The key for the search.
	 * 
	 * @return index of the search key, if it is contained in <i>keys</i>;
	 *         otherwise, <code>(-(insertion point) - 1)</code>. The insertion
	 *         point is defined as the point at which the key would be inserted
	 *         into the array of keys. Note that this guarantees that the return
	 *         value will be >= 0 if and only if the key is found.
	 */
	static final public int binarySearch(final byte[][] keys, final int base, final int nmem, final byte[] key) {

		// inclusive bounds of the search window, relative to [base].
		int low = 0;

		int high = nmem - 1;

		while (low <= high) {

			/*
			 * Note: the unsigned shift yields the correct midpoint even when
			 * (low + high) overflows a signed int32, which a signed shift or a
			 * division by two does not.
			 */
			final int mid = (low + high) >>> 1;

			final int offset = base + mid;

			final byte[] midVal = keys[offset];

			// compare actual vs probe
			final int tmp = BytesUtil.compareBytes(midVal, key);

			if (tmp < 0) {

				// Actual LT probe, restrict lower bound and try again.
				low = mid + 1;

			} else if (tmp > 0) {

				// Actual GT probe, restrict upper bound and try again.
				high = mid - 1;

			} else {

				// Actual EQ probe. Found : return offset.
				return offset;

			}

		}

		/*
		 * Not found: [low] is now the relative insertion point. It is reported
		 * as an absolute index encoded as (-(insertion point) - 1) so that the
		 * result is negative iff the key was not found.
		 */
		final int offset = (base + low);

		return -(offset + 1);

	}

	/**
	 * A {@link Comparator} which imposes the unsigned byte[] ordering defined
	 * by {@link BytesUtil#compareBytes(byte[], byte[])}. The comparator has no
	 * state, so the shared {@link #INSTANCE} may be used everywhere.
	 * 
	 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
	 *         Thompson</a>
	 * @version $Id$
	 */
	public static class UnsignedByteArrayComparator implements Comparator<byte[]> {

		/**
		 * A shared instance of the comparator.
		 */
		public static transient final Comparator<byte[]> INSTANCE = new UnsignedByteArrayComparator();

		/**
		 * Delegates to {@link BytesUtil#compareBytes(byte[], byte[])}.
		 */
		@Override
		public int compare(final byte[] left, final byte[] right) {

			return BytesUtil.compareBytes(left, right);

		}

	}

	/**
	 * Return <code>true</code> if the <i>key</i> lies inside of the optional
	 * half-open range constraint.
	 * 
	 * @param key
	 *            The key.
	 * @param fromKey
	 *            The inclusive lower bound -or- <code>null</code> if there is
	 *            no lower bound.
	 * @param toKey
	 *            The exclusive upper bound -or- <code>null</code> if there is
	 *            no upper bound.
	 * 
	 * @return <code>true</code> unless the <i>key</i> is LT [fromKey] or GTE
	 *         [toKey].
	 */
	final static public boolean rangeCheck(final byte[] key, final byte[] fromKey, final byte[] toKey) {

		// The optional inclusive lower bound.
		final boolean belowRange = fromKey != null && BytesUtil.compareBytes(key, fromKey) < 0;

		if (belowRange) {

			if (log.isDebugEnabled())
				log.debug("key=" + BytesUtil.toString(key) + " LT fromKey" + BytesUtil.toString(fromKey));

			return false;

		}

		// The optional exclusive upper bound.
		final boolean aboveRange = toKey != null && BytesUtil.compareBytes(key, toKey) >= 0;

		if (aboveRange) {

			if (log.isDebugEnabled())
				log.debug("key=" + BytesUtil.toString(key) + " GTE toKey" + BytesUtil.toString(toKey));

			return false;

		}

		// Inside of the range, or there was no range constraint at all.
		return true;

	}

	/**
	 * This method forces the load of the JNI library and tries to execute the
	 * JNI methods.
	 * <p>
	 * In order to use the JNI library under Windows, you must specify the JNI
	 * library location using the PATH environment variable, e.g.,
	 * 
	 * <pre>
	 *   cd bigdata
	 *   set PATH=%PATH%;lib
	 *   java -cp bin com.bigdata.util.BytesUtil
	 * </pre>
	 * 
	 * <p>
	 * In order to use the JNI library under un*x, you must specify the JNI
	 * library location
	 * 
	 * <pre>
	 *    java -Djava.library.path=lib com.bigdata.util.BytesUtil
	 * </pre>
	 * 
	 * @param args
	 * 
	 * @exception UnsatisfiedLinkError
	 *                if the JNI methods can not be resolved.
	 * @exception AssertionError
	 *                if the JNI methods do not produce the expected answers.
	 */
	public static void main(final String[] args) {

		// Force load of the JNI library.
		loadJNILibrary();

		/*
		 * Each native routine is handed two equal keys and must report ZERO.
		 * The native calls are made even when linking failed, in which case
		 * the resulting UnsatisfiedLinkError is propagated to the caller.
		 */

		final int whole = BytesUtil._compareBytes(//
				3, new byte[] { 1, 2, 3 },//
				3, new byte[] { 1, 2, 3 });

		if (whole != 0)
			throw new AssertionError();

		final int slice = BytesUtil._compareBytesWithOffsetAndLen(//
				0, 3, new byte[] { 1, 2, 3 },//
				0, 3, new byte[] { 1, 2, 3 });

		if (slice != 0)
			throw new AssertionError();

		System.out.println("JNI library routines Ok.");

	}

	/**
	 * Return the #of bytes required to bit code the specified #of bits.
	 * 
	 * @param nbits
	 *            The #of bit flags.
	 * 
	 * @return The #of bytes required. This will be zero iff <i>nbits</i> is
	 *         zero.
	 */
	final public static int bitFlagByteLength(final int nbits) {

		// #of bytes which are completely filled by the bit flags.
		final int wholeBytes = nbits / 8;

		if (nbits % 8 == 0)
			return wholeBytes;

		// the left over bits need one more (partly used) byte.
		return wholeBytes + 1;

	}

	/**
	 * Return the index of the byte in which the bit with the given index is
	 * encoded.
	 * 
	 * @param bitIndex
	 *            The bit index.
	 * 
	 * @return The byte index.
	 */
	final public static int byteIndexForBit(final long bitIndex) {

		// Eight bits per byte. The division is done in int64 arithmetic and
		// only the quotient is narrowed to an int32 array index.
		final long byteIndex = bitIndex / 8;

		return (int) byteIndex;

	}

	/**
	 * Return the offset of the bit within the byte in which that bit is coded.
	 * Bits are numbered from the most significant bit of each byte, so the
	 * offset is <code>7 - (bitIndex % 8)</code>: bit index ZERO (0) maps onto
	 * offset 7 (the high bit) and bit index 7 maps onto offset 0 (the low
	 * bit).
	 * <p>
	 * Note, the computation of the bit offset is intentionally aligned with
	 * {@link OutputBitStream} and {@link InputBitStream}.
	 * 
	 * @param bitIndex
	 *            The bit index into the byte[] (non-negative).
	 * 
	 * @return The offset of the bit in the appropriate byte, in [0:7] for any
	 *         non-negative <i>bitIndex</i>.
	 */
	final public static int withinByteIndexForBit(final long bitIndex) {

		/*
		 * The remainder is taken in int64 arithmetic BEFORE narrowing to an
		 * int32. Narrowing first turns any bit index whose low 32 bits have
		 * the sign bit set (for example, 2^31 + 1) into a negative int, whose
		 * remainder is negative and would give an offset greater than 7.
		 */
		return 7 - (int) (bitIndex % 8);

	}

	/**
	 * Get the value of a bit.
	 * <p>
	 * Note, the computation of the bit offset is intentionally aligned with
	 * {@link OutputBitStream} and {@link InputBitStream}.
	 * 
	 * @param bitIndex
	 *            The index of the bit.
	 * 
	 * @return The value of the bit.
	 */
	final public static boolean getBit(final byte[] buf, final long bitIndex) {

		// mask which selects the addressed bit within its byte.
		final int bitMask = 1 << withinByteIndexForBit(bitIndex);

		// the byte in which that bit is coded.
		final int byteOffset = byteIndexForBit(bitIndex);

		return (buf[byteOffset] & bitMask) != 0;

	}

	/**
	 * Set the value of a bit - this is NOT thread-safe (contention for the byte
	 * in the backing buffer can cause lost updates).
	 * <p>
	 * Note, the computation of the bit offset is intentionally aligned with
	 * {@link OutputBitStream} and {@link InputBitStream}.
	 * 
	 * @param bitIndex
	 *            The index of the bit.
	 * 
	 * @return The old value of the bit.
	 */
	final public static boolean setBit(final byte[] buf, final long bitIndex, final boolean value) {

		// mask which selects the addressed bit within its byte.
		final int bitMask = 1 << withinByteIndexForBit(bitIndex);

		// the byte in which that bit is coded.
		final int byteOffset = byteIndexForBit(bitIndex);

		// the byte as it was before the update.
		final byte before = buf[byteOffset];

		// turn the bit on or off, leaving the other 7 bits untouched.
		final int after = value ? (before | bitMask) : (before & ~bitMask);

		buf[byteOffset] = (byte) after;

		// report the state of the bit before it was modified.
		return (before & bitMask) != 0;

	}

	/**
	 * An array of 32-bit mask values. The index in the array is the #of bits of
	 * the hash code to be considered. The value at that index in the array is
	 * the mask to be applied to mask off to zero the high bits of the hash code
	 * which are to be ignored.
	 */
	static private final int[] masks32 = new int[33];
	static {

		// One entry for each possible #of revealed bits in [0:32].
		for (int nbits = 0; nbits < masks32.length; nbits++) {

			masks32[nbits] = getMSBMask(nbits);

		}

	}

	/**
	 * Return a bit mask which reveals only the MSB (Most Significant Bits) N
	 * bits of an int32 value.
	 * 
	 * @param nbits
	 *            The #of bits to be revealed.
	 * 
	 * @return The mask.
	 * 
	 * @throws IllegalArgumentException
	 *             if <i>nbits</i> is LT ZERO (0).
	 * @throws IllegalArgumentException
	 *             if <i>nbits</i> is GT 32.
	 */
	public static/* private */int getMSBMask(final int nbits) {

		if (nbits < 0 || nbits > 32)
			throw new IllegalArgumentException();

		/*
		 * Zero bits must be handled explicitly. An int32 shift only uses the
		 * low 5 bits of the shift distance, so a shift by 32 is a shift by
		 * ZERO and would reveal every bit rather than none.
		 */
		if (nbits == 0)
			return 0;

		// All ONEs, with the low (32 - nbits) bits cleared by the shift.
		return -1 << (32 - nbits);

	}

	/**
	 * Mask off all but the MSB <i>nbits</i> of the hash value and shift them
	 * down such that the masked bits appear at bits (nbits:0] of the returned
	 * value. This is used to index into a dictionary page based on the revealed
	 * bits.
	 * 
	 * @param h
	 *            The hash value.
	 * @param nbits
	 *            The #of bits already accounted for by the path from the root.
	 * 
	 * @return The hash value considering only the MSB <i>nbits</i> and shifted
	 *         down into an <i>nbits</i> integer.
	 */
	public static int maskOffMSB(final int h, final int nbits) {

		if (nbits < 0 || nbits > 32)
			throw new IllegalArgumentException();

		// Keep only the [nbits] most significant bits of the hash value.
		final int revealed = h & masks32[nbits];

		// Unsigned shift so that those bits land at the low end of the word
		// without any sign extension.
		return revealed >>> (32 - nbits);

	}

	/**
	 * Mask off all but the LSB <i>nbits</i> of the hash value.
	 * 
	 * @param h
	 *            The hash value.
	 * @param nbits
	 *            The #of LSB bits to be retained.
	 * 
	 * @return The LSB <i>nbits</i>.
	 * 
	 *         TODO unit test.
	 */
	public static int maskOffLSB(final int h, final int nbits) {

		if (nbits < 0 || nbits > 32)
			throw new IllegalArgumentException();

		// mask selecting the (32 - nbits) high bits which are to be discarded.
		final int highBits = masks32[32 - nbits];

		// the complement of that mask retains just the [nbits] low bits.
		return h & ~highBits;

	}

	/**
	 * Return the n-bit integer corresponding to the inclusive bit range of the
	 * byte[]. Bit ZERO (0) is the Most Significant Bit (MSB). Bit positions
	 * increase from zero up to <code>a.length * 8 - 1</code>. The return value
	 * is an int32 and the bit range must not be greater than 32 bits.
	 * <p>
	 * For example, given the following data and the bit range (0,2)
	 * 
	 * <pre>
	 * bit index: 01234567 
	 * ---------+---------- 
	 * bit value: 10110000
	 * </pre>
	 * 
	 * TWO (2) bits starting at bit offset ZERO (0) would be extracted and
	 * returned as a 2-bit integer. For those data, the return value would be an
	 * int32 value whose binary representation was <code>10</code> (with leading
	 * zeros suppressed).
	 * <p>
	 * Note: This method is designed for use with the unsigned byte[] keys in a
	 * bigdata hash index. All keys in bigdata are internally represented as
	 * unsigned byte[]s, which is why this method accepts a byte[] rather than
	 * a long[] for the bits. Also, while the length of an unsigned byte[] key
	 * can vary, they are never huge and an int32 value is sufficient to index
	 * into the bits in the byte[]. Finally, the return value is an int because
	 * it will be used in hash table designs to index into a hash table based on
	 * those bits in a hash code key which are masked as relevant to that hash
	 * table. 32bits is far more than we will need to index into a hash table.
	 * For an 8k page, we might expect a fan out of at most 1024 which is only
	 * 10 bits.
	 * 
	 * @param a
	 *            A byte[].
	 * @param off
	 *            The index of the first bit to be included.
	 * @param len
	 *            The number of bits to be returned in [0:32]. However, a bit
	 *            length of zero will always return zero.
	 * 
	 * @return The integer extracted from the specified bit range.
	 */
	public static int getBits(final byte[] a, final int off, final int len) {
		// Reject a missing array, a negative offset or length, and any bit
		// length which does not fit into the int32 return value.
		if (a == null || off < 0 || len < 0 || len > 32)
			throw new IllegalArgumentException();
		
		// total #of bits available in the byte[].
		final int maxbits = a.length * 8;
		
		// the requested bit range must lie entirely within the byte[].
		if (off + len > maxbits)
			throw new IllegalArgumentException("off: " + off + ", len: " + len + ", a.length: " + a.length);

		if (len == 0) // zero length is always a zero.
			return 0;

		/*
		 * Build int32 value having the desired bits.
		 */

		// byte in which the bit range begins.
		final int fromByteOffset = off / 8; // byteIndexForBit(off);
		// Note: despite its name, this is the index of the last BIT in the
		// range (inclusive), not a byte index.
		final int lastByte = (off + len - 1);
		// byte in which the bit range ends (inclusive).
		final int toByteOffset = lastByte / 8; // byteIndexForBit(off +
														// len - 1);

		/*
		 * The data are assembled into the int64 value by copying each byte in
		 * turn having data for the slice. This will copy at most 5 bytes. For
		 * example, when a 32-bit window starts in the middle of a byte. Once
		 * the bytes are assembled into the int64 buffer, they are shifted down
		 * to put the last bit extracted at bit index ZERO (0) of the int32
		 * word. Finally, the unused high bits are cleared to zero using a mask.
		 */

		// #of source bytes which overlap the requested bit range.
		final int nbytes = toByteOffset - fromByteOffset + 1;
		long v = 0L; // buffer for up to 5 source bytes.
		// i is the index of the source byte; j is its 1-based position in the
		// run of source bytes.
		for (int i = fromByteOffset, j = 1; i <= toByteOffset; i++, j++) {
			final long x = 0xFF & a[i]; // next byte (as an unsigned value).
			// #of bits to shift left: the first source byte ends up highest
			// and the last source byte lands in the low 8 bits of the buffer.
			final int shift = ((nbytes - j) << 3);
			v |= (x << shift); // shift into position and merge into buf.

			// inlining the above expressions produces SLOWER code
			// v |= ((0xFF & a[i]) << ((nbytes-j)<<3));
		} // next byte in the byte[].

		// NOTE an explicit select on byte count to avoid the for loop
		// produces SLOWER code

		// #of bits which follow the range in its last byte and which must be
		// shifted out.
		final int rshift = 7 - (lastByte % 8); // final right shift to word align.
		final int w = (int) (v >>> rshift); // int32 result.

		// int mask = masks32[32 - len]; // lookup mask with [len] LSB ZEROs.
		// mask = ~mask; // flip bits to get [len] LSB ONEs.
		// w &= mask; // mask off the lower [len] bits (handles sign extension
		// and
		// starting offset within byte).
		// return w;
		// inlining the above is a little FASTER
		// Clear everything above the low [len] bits, i.e., the bits which
		// precede the range in its first byte.
		return w & ~masks32[32 - len];
	}

	static final int[] bitmasks = new int[] { 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF };

	/*
	 * Given a byte and a start and length copy the bits to a long and mask off
	 * those not required. The third argument defines the destination start bit
	 * in the long.
	 */
	private static long getLongVal(final byte b, final int s, final int l, final int t) {
		final int lastBit = s + l;
		final int shift = 64 - t - l;

		final long ret;
		if (lastBit < 8) {
			ret = b >>> (8 - lastBit);
		} else {
			ret = b;
		}

		return (ret & bitmasks[l]) << shift;
	}

	/**
	 * Extract a run of bits from a single byte and position that run within a
	 * 32-bit word. Bit ZERO (0) is the most significant bit, both for the byte
	 * and for the word. This is the int32 counterpart of the int64 helper.
	 * 
	 * @param b
	 *            The source byte.
	 * @param s
	 *            The index of the first bit of the run within the byte.
	 * @param l
	 *            The number of bits in the run. The run is assumed to lie
	 *            within the byte (<code>s + l &lt;= 8</code>); this is not
	 *            checked here.
	 * @param t
	 *            The index of the bit within the 32-bit word at which the run
	 *            will begin.
	 * 
	 * @return An int having the run at the requested position and zeros in
	 *         every other bit.
	 */
	private static int getIntVal(final byte b, final int s, final int l, final int t) {
		final int lastBit = s + l;
		// Left shift which places the run at bit [t] of an int32 word. Note:
		// Java only uses the low 5 bits of an int shift distance, so this is
		// the same distance as the (64 - t - l) used for the int64 variant,
		// but it is written for the actual 32-bit word size.
		final int shift = 32 - t - l;

		final int ret;
		if (lastBit < 8) {
			// discard the bits of the byte which follow the run.
			ret = b >>> (8 - lastBit);
		} else {
			// the run ends on the last bit of the byte.
			ret = b;
		}

		// mask to [l] bits (also clears any sign extension), then position.
		return (ret & bitmasks[l]) << shift;
	}

	/**
	 * Return the <i>len</i>-bit integer found at bit offset <i>off</i> of the
	 * byte[], assembled one byte at a time. Bit ZERO (0) is the most
	 * significant bit of <code>a[0]</code>. The extracted bits are right
	 * aligned in the returned int64 value.
	 * 
	 * @param a
	 *            The byte[].
	 * @param off
	 *            The index of the first bit to be included.
	 * @param len
	 *            The number of bits to be returned in [0:64]. A bit length of
	 *            zero always returns zero.
	 * 
	 * @return The value extracted from the specified bit range.
	 * 
	 * @throws IllegalArgumentException
	 *             if the byte[] is <code>null</code>, if <i>off</i> or
	 *             <i>len</i> is negative, if <i>len</i> is greater than 64, or
	 *             if the bit range extends beyond the end of the byte[].
	 */
	public static long altGetBits64(final byte[] a, final int off, final int len) {

		if (a == null || off < 0 || len < 0 || len > 64)
			throw new IllegalArgumentException();
		if (len == 0) // zero length is always a zero.
			return 0;
		// Note: 64-bit arithmetic since both (off + len) and (a.length * 8)
		// can wrap around as int32 values.
		if ((long) off + len > (long) a.length * 8)
			throw new IllegalArgumentException();

		/*
		 * Build int64 value having the desired bits.
		 */

		// byte in which the bit range begins.
		int bi = off/8;
		// start bit offset
		int bo = off % 8;
		// bits remaining in current byte
		int br = 8 - bo;
		// long target start bit
		int lt = 64 - len;
		// bits which remain to be copied.
		int lr = len;

		long ret = 0;
		// Only the first byte can start at a non-zero bit offset. Each later
		// byte is consumed from its first bit.
		for (int t = 0; t < len; t += br, bo = 0, br = 8, bi++) {
			if (br > lr) {
				// the range ends inside of this byte.
				br = lr;
			}

			ret |= getLongVal(a[bi], bo, br, lt);

			lr -= br;
			lt += br;
		}

		return ret;
	}
	
	/**
	 * Return the <i>len</i>-bit integer found at bit offset <i>off</i> of the
	 * byte[], assembled one byte at a time using 32-bit math. Bit ZERO (0) is
	 * the most significant bit of <code>a[0]</code>. The extracted bits are
	 * right aligned in the returned int32 value.
	 * <p>
	 * Open question: Is there any advantage in 32-bit math over 64-bit?
	 * 
	 * @param a
	 *            The byte[].
	 * @param off
	 *            The index of the first bit to be included.
	 * @param len
	 *            The number of bits to be returned in [0:32]. A bit length of
	 *            zero always returns zero.
	 * 
	 * @return The value extracted from the specified bit range.
	 * 
	 * @throws IllegalArgumentException
	 *             if the byte[] is <code>null</code>, if <i>off</i> or
	 *             <i>len</i> is negative, if <i>len</i> is greater than 32, or
	 *             if the bit range extends beyond the end of the byte[].
	 */
	public static int altGetBits32(final byte[] a, final int off, final int len) {

		// Note: an int32 result can not hold more than 32 bits. Longer ranges
		// would have their leading bits wrapped onto the trailing bits.
		if (a == null || off < 0 || len < 0 || len > 32)
			throw new IllegalArgumentException();
		if (len == 0) // zero length is always a zero.
			return 0;
		// Note: 64-bit arithmetic since both (off + len) and (a.length * 8)
		// can wrap around as int32 values.
		if ((long) off + len > (long) a.length * 8)
			throw new IllegalArgumentException();

		/*
		 * Build int32 value having the desired bits.
		 */

		// byte in which the bit range begins.
		int bi = off/8;
		// start bit offset
		int bo = off % 8;
		// bits remaining in current byte
		int br = 8 - bo;
		// int target start bit
		int lt = 32 - len;
		// bits which remain to be copied.
		int lr = len;

		int ret = 0;
		// Only the first byte can start at a non-zero bit offset. Each later
		// byte is consumed from its first bit.
		for (int t = 0; t < len; t += br, bo = 0, br = 8, bi++) {
			if (br > lr) {
				// the range ends inside of this byte.
				br = lr;
			}

			ret |= getIntVal(a[bi], bo, br, lt);

			lr -= br;
			lt += br;
		}

		return ret;
	}

	/**
	 * Return up to 64 bits from the byte[] as an int64 value by combining one
	 * or two calls to the int32 <code>getBits(byte[], int, int)</code> method.
	 * Ranges of up to 32 bits are read with a single call and zero extended.
	 * Longer ranges are read as the leading <code>len - 32</code> bits followed
	 * by the trailing 32 bits. Argument checking is left to the int32 method.
	 * 
	 * @param a
	 *            The byte[].
	 * @param off
	 *            The index of the first bit to be included.
	 * @param len
	 *            The number of bits to be returned.
	 * 
	 * @return The value extracted from the specified bit range.
	 */
	public static long getBits64(final byte[] a, final int off, final int len) {
		if (len <= 32) {
			// fits in one int32 read; clear the sign extension.
			return getBits(a, off, len) & 0xFFFFFFFFL;
		}

		// #of bits which land in the upper 32-bit half of the result.
		final int upperLen = len - 32;

		final long upper = ((long) getBits(a, off, upperLen)) << 32;

		final long lower = getBits(a, off + upperLen, 32) & 0xFFFFFFFFL;

		return upper | lower;
	}
	
	/**
	 * Dispatch to either altGetBits32 or getBits based on the bit length. Some
	 * benchmarks seem to indicate that altGetBits32 is faster than getBits for
	 * smaller bit counts, so it is used when <code>len &lt;= 16</code>. OTOH
	 * the cost of the redirection may outweigh any benefit.
	 * 
	 * @param a
	 *            The byte[].
	 * @param off
	 *            The index of the first bit to be included.
	 * @param len
	 *            The number of bits to be returned.
	 * 
	 * @return The value extracted from the specified bit range.
	 */
	public static int optGetBits(final byte[] a, final int off, final int len) {
		return (len <= 16) ? altGetBits32(a, off, len) : getBits(a, off, len);
	}


	/**
	 * Return the n-bit integer corresponding to a bit range of an int32 value.
	 * Bit ZERO (0) is the Most Significant Bit (MSB). Bit positions increase
	 * from zero up to <code>31</code>. The return value is an int32 and the bit
	 * range must not be greater than 32 bits.
	 * <p>
	 * For example, given a value whose leading bits are
	 * 
	 * <pre>
	 * bit index: 01234567 
	 * ---------+---------- 
	 * bit value: 10110000
	 * </pre>
	 * 
	 * a request for TWO (2) bits starting at bit offset ZERO (0) would return
	 * an int32 value whose binary representation was <code>10</code> (with
	 * leading zeros suppressed).
	 * <p>
	 * Note: This method is designed for use in a bigdata hash index having
	 * native int32 keys rather than unsigned byte[] keys.
	 * 
	 * @param a
	 *            An integer.
	 * @param off
	 *            The index of the first bit to be included.
	 * @param len
	 *            The number of bits to be returned in [0:32]. However, a bit
	 *            length of zero will always return zero.
	 * 
	 * @return The integer extracted from the specified bit range.
	 * 
	 * @throws IllegalArgumentException
	 *             if <i>off</i> is negative, if <i>len</i> is not in [0:32], or
	 *             if the bit range extends beyond bit 31.
	 */
	public static int getBits(final int a, final int off, final int len) {
		if (off < 0)
			throw new IllegalArgumentException();
		if (len < 0 || len > 32)
			throw new IllegalArgumentException();
		if (len == 0) // zero length is always a zero.
			return 0;
		// Note: written to avoid computing (off + len), which wraps around for
		// a very large offset and would let an illegal range slip through.
		if (off > 32 - len)
			throw new IllegalArgumentException();

		final int last = off + len - 1; // index of the last bit (inclusive).
		final int rshift = 31 - last; // right shift to word align.
		final int w = a >>> rshift; // int32 result.
		// masks32[32 - len] has [len] LSB ZEROs; flipping it gives [len] LSB
		// ONEs, which clears everything above the requested range.
		return w & ~masks32[32 - len];
	}

	/**
	 * Render an unsigned byte[] as a string of binary digits. Each byte
	 * contributes eight characters, most significant bit first, and the bytes
	 * are rendered in index order.
	 * 
	 * @param b
	 *            The unsigned byte[].
	 * 
	 * @return A string of <code>0</code> and <code>1</code> characters whose
	 *         length is eight times the length of the byte[].
	 * 
	 * @throws IllegalArgumentException
	 *             if the argument is <code>null</code>.
	 */
	public static String toBitString(final byte[] b) {
		// Note: fromKey/toKey may be null; caller must check 1st
		if (b == null)
			throw new IllegalArgumentException();

		final char[] out = new char[b.length << 3]; // one char per bit.
		int pos = 0; // next position to be written in [out].

		for (final byte octet : b) {
			// walk a single-bit mask from the msb down to the lsb.
			for (int mask = 0x80; mask != 0; mask >>>= 1) {
				out[pos++] = bits[(octet & mask) == 0 ? 0 : 1];
			}
		}

		return new String(out);
	}

	/**
	 * The binary digits as characters, indexed by the bit value:
	 * <code>bits[0]</code> is <code>'0'</code> and <code>bits[1]</code> is
	 * <code>'1'</code>.
	 */
	private final static char[] bits = { '0', '1' };

	/**
	 * Decode a string of the form <code>[0-9]+(k|kb|m|mb|g|gb)?</code>,
	 * returning the number of bytes. When a suffix indicates kilobytes,
	 * megabytes, or gigabytes then the returned value is scaled accordingly.
	 * The suffix is NOT case sensitive.
	 * 
	 * @param s
	 *            The string value.
	 * 
	 * @return The byte count.
	 * 
	 * @throws IllegalArgumentException
	 *             if there is a problem with the argument (<code>null</code>,
	 *             ill-formed, a value which does not fit in a long once it has
	 *             been scaled, etc).
	 */
	static public long getByteCount(final String s) {

		if (s == null)
			throw new IllegalArgumentException();

		final Matcher m = PATTERN_BYTE_COUNT.matcher(s);

		if (!m.matches())
			throw new IllegalArgumentException(s);

		// the numeric component. A digit string too large for a long raises
		// NumberFormatException, which is an IllegalArgumentException.
		final long c = Long.parseLong(m.group(1));

		// the units (null if not given).
		final String g2 = m.group(2);

		// the multiplier implied by the units.
		final long unit;
		if (g2 == null) {
			unit = 1L;
		} else if (g2.equalsIgnoreCase("k") || g2.equalsIgnoreCase("kb")) {
			unit = Bytes.kilobyte;
		} else if (g2.equalsIgnoreCase("m") || g2.equalsIgnoreCase("mb")) {
			unit = Bytes.megabyte;
		} else if (g2.equalsIgnoreCase("g") || g2.equalsIgnoreCase("gb")) {
			unit = Bytes.gigabyte;
		} else {
			// the pattern does not admit any other suffix.
			throw new AssertionError();
		}

		// Reject a scaled value which would silently wrap around. The numeric
		// component is never negative since the pattern only admits digits.
		if (c > Long.MAX_VALUE / unit)
			throw new IllegalArgumentException(s);

		return c * unit;
	}

	/**
	 * Pattern for a byte count. Group 1 is one or more decimal digits. Group 2
	 * is an optional units suffix (<code>k</code>, <code>kb</code>,
	 * <code>m</code>, <code>mb</code>, <code>g</code>, or <code>gb</code>). The
	 * pattern is matched without regard to case.
	 */
	static final private Pattern PATTERN_BYTE_COUNT = Pattern.compile("([0-9]+)(k|kb|m|mb|g|gb)?",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Return a byte[] having the data in the {@link ByteBuffer} from the
	 * {@link ByteBuffer#position()} to the {@link ByteBuffer#limit()}. The
	 * position, limit, and mark are not affected by this operation.
	 * <p>
	 * This is the three argument form of this method invoked without forcing a
	 * copy and without a caller supplied array. The backing array itself is
	 * returned when the {@link ByteBuffer} has a backing array, the array
	 * offset is ZERO (0), the position is ZERO (0), and the length of the
	 * backing array is equal to the {@link ByteBuffer#limit()}. Otherwise, a
	 * new byte[] is allocated and the data are copied into that byte[], which
	 * is then returned.
	 * 
	 * @param b
	 *            The {@link ByteBuffer}.
	 * 
	 * @return The byte[].
	 */
	static public byte[] toArray(final ByteBuffer b) {

		// share the backing array when that is possible.
		final boolean forceCopy = false;

		// no caller supplied destination array.
		final byte[] dst = null;

		return toArray(b, forceCopy, dst);

	}

    /**
     * Return a byte[] having the data in the {@link ByteBuffer} from the
     * {@link ByteBuffer#position()} to the {@link ByteBuffer#limit()}. The
     * position, limit, and mark are not affected by this operation.
     * <p>
     * When <code>forceCopy := false</code> the backing
     * {@link ByteBuffer#array} is returned directly if the {@link ByteBuffer}
     * has a backing array, the array offset is ZERO (0), the position is ZERO
     * (0), and the length of the backing array is equal to the
     * {@link ByteBuffer#limit()}. In that case <i>dst</i> is ignored.
     * <p>
     * Otherwise the data are copied, either into <i>dst</i> (when it is large
     * enough) or into a newly allocated byte[].
     * 
     * @param b
     *            The {@link ByteBuffer}.
     * @param forceCopy
     *            When <code>false</code>, the backing array will be returned if
     *            possible.
     * @param dst
     *            A byte[] provided by the caller (optional). When non-
     *            <code>null</code> and having a length GTE
     *            {@link ByteBuffer#remaining()}, this array will be preferred
     *            to a newly allocated array.
     * 
     * @return The byte[] having the data. When <i>dst</i> is non-
     *         <code>null</code> this MAY be the caller's array. When it is the
     *         caller's array, it MAY be larger than the #of bytes actually
     *         read.
     */
    static public byte[] toArray(final ByteBuffer b, final boolean forceCopy,
            final byte[] dst) {

        // true iff the buffer starts at the first byte of a backing array.
        final boolean mayShare = !forceCopy && b.hasArray()
                && b.arrayOffset() == 0 && b.position() == 0;

        if (mayShare) {

            final byte[] backing = b.array();

            // the buffer must also end at the last byte of the backing array.
            if (backing.length == b.limit()) {

                return backing;

            }

        }

        /*
         * Copy the data out through a read-only view so the position, mark and
         * limit of the caller's buffer are left alone.
         */
        final ByteBuffer view = b.asReadOnlyBuffer();

        final int nbytes = view.remaining();

        // use the caller's array iff it was given and is large enough.
        final byte[] target;
        if (dst == null || dst.length < nbytes) {
            target = new byte[nbytes];
        } else {
            target = dst;
        }

        // Transfer only the available bytes.
        view.get(target, 0, nbytes);

        return target;

    }

    /**
     * The lowercase hexadecimal digits, indexed by the value of a 4-bit nibble
     * in [0:15].
     */
    static private final char[] HEX_CHAR_TABLE = {
        '0', '1','2','3',
        '4','5','6','7',
        '8','9','a','b',
        'c','d','e','f'
       };    

    /**
     * Utility to convert an int array to a hex string. Each int is rendered as
     * eight hex digits, most significant byte first.
     * 
     * @param ibuf
     *            The data.
     * 
     * @return The hex string -or- <code>"NULL"</code> if the argument is
     *         <code>null</code> (consistent with the byte[] variants).
     */
    static public String toHexString(final int[] ibuf) {

        if (ibuf == null)
            return "NULL";

        // Note: a newly allocated ByteBuffer is big-endian, so putInt() writes
        // the most significant byte of each int first.
        final ByteBuffer packed = ByteBuffer.allocate(ibuf.length * 4);

        for (int v : ibuf) {
            packed.putInt(v);
        }

        final byte[] buf = packed.array();

        return toHexString(buf, buf.length);

    }
    
    /**
     * Utility to convert an entire byte array to a hex string.
     * 
     * @param buf
     *            The data (may be <code>null</code>).
     * 
     * @return The hex string -or- <code>"NULL"</code> if the argument is
     *         <code>null</code>.
     */
    static public String toHexString(final byte[] buf) {

        return buf == null ? "NULL" : toHexString(buf, buf.length);

    }
    
    /**
     * Utility to display at most the first <i>n</i> bytes of a byte array as
     * a hex string. Each byte is rendered as two lowercase hex digits.
     * 
     * @param buf
     *            The data (may be <code>null</code>).
     * @param n
     *            The #of bytes to convert. When this exceeds the length of
     *            the array, the whole array is converted.
     * 
     * @return The hex string -or- <code>"NULL"</code> if the array is
     *         <code>null</code>.
     */
    static public String toHexString(final byte[] buf, int n) {

        if (buf == null)
            return "NULL";

        // never read beyond the end of the array.
        final int count = Math.min(n, buf.length);

        final StringBuilder hex = new StringBuilder();

        for (int i = 0; i < count; i++) {

            final int octet = buf[i] & 0xFF; // as an unsigned value.

            hex.append(HEX_CHAR_TABLE[octet >> 4]); // high nibble.

            hex.append(HEX_CHAR_TABLE[octet & 0x0F]); // low nibble.

        }

        return hex.toString();

    }

    /**
     * Formats hex data into rows of 64 hex characters. Each row is prefixed by
     * the offset of its first character within <i>hexData</i> (formatted as
     * <code>%8d: </code>) and terminated by a newline. A final row having
     * fewer than 64 characters is also written, so no data are dropped.
     * Nothing is written when <i>hexData</i> is empty.
     * 
     * @param sb
     *            Where to format the data.
     * @param hexData
     *            The data.
     */
    static public void printHexString(final StringBuilder sb,
            final String hexData) {

        final int len = hexData.length();

        for (int curs = 0; curs < len; curs += 64) {

            // exclusive end of this row; short for the final partial row.
            final int end = Math.min(curs + 64, len);

            sb.append(String.format("%8d: ", curs));

            sb.append(hexData, curs, end).append('\n');

        }

    }

    /**
     * Return the data in the buffer between its position and its limit. The
     * backing array itself is returned when the buffer has a backing array,
     * the array offset is ZERO (0), the position is ZERO (0), and the limit is
     * equal to the capacity. Otherwise, a new byte[] is allocated, the data
     * are copied into the array, and the new array is returned. The position,
     * mark and limit of the buffer are not modified.
     * 
     * @param buf
     *            The buffer.
     * 
     * @return The data.
     */
    public static byte[] getBytes(ByteBuffer buf) {

        final boolean spansBackingArray = buf.hasArray()
                && buf.arrayOffset() == 0 && buf.position() == 0
                && buf.limit() == buf.capacity();

        if (spansBackingArray) {

            // Return the backing array.
            return buf.array();

        }

        /*
         * Copy the data out through a read-only view so the position, mark and
         * limit of the caller's buffer are left alone.
         */
        final ByteBuffer view = buf.asReadOnlyBuffer();

        final byte[] copy = new byte[view.remaining()];

        view.get(copy);

        return copy;

    }
    

    /**
     * Converts a byte array into a binary string, one character per bit, as
     * reported by {@link BytesUtil#getBit} for each bit index in turn. Useful
     * for debugging.
     * 
     * @param zOrderByteArray
     *            The byte array.
     * 
     * @return A string of <code>0</code> and <code>1</code> characters having
     *         eight characters for each byte in the array.
     */
    public static String byteArrToBinaryStr(byte[] zOrderByteArray) {

       final int nbits = zOrderByteArray.length * 8;

       final StringBuilder sb = new StringBuilder();

       for (int bitIndex = 0; bitIndex < nbits; bitIndex++) {

          final boolean isSet = BytesUtil.getBit(zOrderByteArray, bitIndex);

          sb.append(isSet ? '1' : '0');

       }

       return sb.toString();
    }

}