/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.bigdata.util; import it.unimi.dsi.fastutil.bytes.custom.CustomByteArrayFrontCodedList; import it.unimi.dsi.io.InputBitStream; import it.unimi.dsi.io.OutputBitStream; import java.nio.ByteBuffer; import java.util.Comparator; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; /** * Class supporting operations on variable length byte[] keys. * <p> * Comparison operations that accept a starting offset are used when the byte[]s * are known to share a leading prefix that may be skipped during comparison. * <p> * Comparison operations that accept a starting offset and length are used when * immutable keys are stored in a single byte[] and an index into starting * positions in that byte[] is maintained. * <p> * JNI methods are provided for unsigned byte[] comparison. However, note that * the JNI methods do not appear to be as fast as the pure Java methods - * presumably because of the overhead of going from Java to C. In order to * execute using the JNI methods you MUST define the optional boolean system * property, e.g., * * <pre> * java -Dcom.bigdata.btree.BytesUtil.jni=true ... * </pre> * * See BytesUtil.c in this package for instructions on compiling the JNI * methods. * </p> * See {@link #main(String[])} which provides a test for the JNI integration and * some pointers on how to get this running on your platform. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public class BytesUtil { private static final transient Logger log = Logger.getLogger(BytesUtil.class); /** * An empty <code>byte[]</code>. */ public static final byte[] EMPTY = new byte[0]; /** * An empty <code>byte[][]</code>. */ public static final byte[][] EMPTY2 = new byte[0][]; /** * Flag set iff JNI linking succeeds. When this flag is false we run with * the pure Java implementations of these methods. When the flag is true, * the JNI versions are used. */ static boolean linked = false; /** * JNI routines are not invoked unless we will compare byte[]s with at least * this many potential bytes to compare (the actual# may be much less of * course since comparisons may fail fast). */ static public final int minlen = 100; static private native int _compareBytes(int alen, byte[] a, int blen, byte[] b); static private native int _compareBytesWithOffsetAndLen(int aoff, int alen, byte[] a, int boff, int blen, byte[] b); static { final boolean jni; String val = System.getProperty("com.bigdata.btree.BytesUtil.jni"); if (val != null) { jni = Boolean.parseBoolean(val); } else { jni = false; // Note: We will not even try to use JNI by default! } if (jni) { /* * Attempt to load the JNI library. */ loadJNILibrary(); } } /** * Attempt to load the JNI library. * <p> * Note: this is done automatically if the optional boolean system property * <code>com.bigdata.btree.BytesUtil.jni=true</code> is specified, e.g., * using * * <pre> * java -Dcom.bigdata.btree.BytesUtil.jni=true ... * </pre> * * @return True iff the JNI library was successfully linked. */ public static boolean loadJNILibrary() { if (!linked) { try { System.loadLibrary("BytesUtil"); if (log.isInfoEnabled()) log.info("BytesUtil JNI linked"); linked = true; } catch (UnsatisfiedLinkError ex) { log.warn("BytesUtil JNI NOT linked: " + ex); linked = false; } } return linked; } /** * True iff the two arrays compare as equal. This is somewhat optimized in * that it tests the array lengths first, assumes that it is being used on * sorted data and therefore compares the last bytes first, and does not * convert the bytes to unsigned integers before testing for equality. * * @param a * A byte[]. * @param b * Another byte[]. * * @return If the two arrays have the same reference (including * <code>null</code>) or if they have the same data. */ final public static boolean bytesEqual(final byte[] a, final byte[] b) { if (a == b) return true; final int alen = a.length; final int blen = b.length; if (alen != blen) return false; int i = alen - 1; while (i >= 0) { if (a[i] != b[i]) return false; i--; } // for (int i = 0; i < alen; i++) { // // if( a[i] != b[i] ) return false; // // } return true; } /** * Byte-wise comparison of byte[]s (the arrays are treated as arrays of * unsigned bytes). * * @param a * A byte[]. * * @param b * A byte[]. * * @return a negative integer, zero, or a positive integer if the first * argument is less than, equal to, or greater than the second. * * @todo Return the index of the byte at which the difference with the sign * adjusted to indicate the relative order of the data rather than the * difference of the bytes at that index. The index would be negative * or positive depending on which way the comparison went. See * {@link CustomByteArrayFrontCodedList} for an implementation * guideline. * <p> * Change all implementations in this class and also BytesUtil.c, * which needs to be recompiled for Windows. Also makes sure that it * gets compiled and linked for Un*x. That should be tested from the * ant installer and the result reported. Do the same for ICU4JNI. */ final public static int compareBytes(final byte[] a, final byte[] b) { if (a == b) // includes a and b both null return 0; // Handle null values if (a == null) { return -1; } if (b == null) { return 1; } // neither are null final int alen = a.length; final int blen = b.length; if (linked && alen > minlen && blen > minlen) { /* * JNI implementation. * * @todo test for trade off when max(len) is short. unroll loop for * small N. */ return _compareBytes(alen, a, blen, b); } for (int i = 0; i < alen && i < blen; i++) { // promotes to signed integers in [0:255] for comparison. final int ret = (a[i] & 0xff) - (b[i] & 0xff); // int ret = a[i] - b[i]; if (ret != 0) return ret; } return alen - blen; } // /** // * Byte-wise comparison of a {@link ByteBuffer} and a byte[]. The data are // * treated as arrays of unsigned bytes. The {@link ByteBuffer} position, // * limit and mark are unchanged by this procedure. // * // * @param a // * A {@link ByteBuffer}. // * @param aoff // * The offset of the starting byte in the buffer. // * @param blen // * The number of bytes to be compared. // * @param b // * A byte[]. // * // * @return a negative integer, zero, or a positive integer if the first // * argument is less than, equal to, or greater than the second. // */ // final public static int compareBytes(final ByteBuffer a, final int aoff, // final int alen, final byte[] b) { // final int blen = b.length; // for (int i = 0; i < alen && i < blen; i++) { // // promotes to signed integers in [0:255] for comparison. // final int ret = (a.get(aoff + i) & 0xff) - (b[i] & 0xff); // if (ret != 0) // return ret; // } // return alen - blen; // } // /** // * Byte-wise comparison of byte[]s (the arrays are treated as arrays of // * unsigned bytes). // * // * @param aoff // * The offset into <i>a</i> at which the comparison will // * begin. // * @param a // * A byte[]. // * @param boff // * The offset into <i>b</i> at which the comparison will // * begin. // * @param b // * A byte[]. // * // * @return a negative integer, zero, or a positive integer as the first // * argument is less than, equal to, or greater than the second. // */ // final public static int compareBytes(int aoff, final byte[] a, int boff, // final byte[] b) { // final int alen = a.length; // final int blen = b.length; // for (int i = aoff, j = boff; i < alen && j < blen; i++, j++) { // // promotes to signed integers in [0:255] for comaprison. // int ret = (a[i] & 0xff) - (b[j] & 0xff); // // int ret = a[i] - b[j]; // if (ret != 0) // return ret; // } // return (alen - aoff) - (blen - boff); // } /** * Byte-wise comparison of byte[]s (the arrays are treated as arrays of * unsigned bytes). * * @param aoff * The offset into <i>a</i> at which the comparison will begin. * @param alen * The #of bytes in <i>a</i> to consider starting at <i>aoff</i>. * @param a * A byte[]. * @param boff * The offset into <i>b</i> at which the comparison will begin. * @param blen * The #of bytes in <i>b</i> to consider starting at <i>boff</i>. * @param b * A byte[]. * * @return a negative integer, zero, or a positive integer as the first * argument is less than, equal to, or greater than the second. */ final public static int compareBytesWithLenAndOffset(// int aoff, int alen, final byte[] a,// int boff, int blen, final byte[] b// ) { if (linked && alen > minlen && blen > minlen) { // JNI implementation. return _compareBytesWithOffsetAndLen(aoff, alen, a, boff, blen, b); } // last index to consider in a[]. final int alimit = aoff + alen; // last index to consider in b[]. final int blimit = boff + blen; for (int i = aoff, j = boff; i < alimit && j < blimit; i++, j++) { // promotes to signed integers in [0:255] for comaprison. int ret = (a[i] & 0xff) - (b[j] & 0xff); if (ret != 0) return ret; } return alen - blen; } /** * Return the #of leading bytes in common. This is used to compute the * prefix for a node or leaf, which is formed by the leading bytes in common * between the first and last key for a node or leaf. * * @param a * A variable length unsigned byte array. * @param b * A variable length unsigned byte array. * * @return The #of leading bytes in common (aka the index of the first byte * in which the two arrays differ, although that index could lie * beyond the end of one of the arrays). */ public final static int getPrefixLength(final byte[] a, final byte[] b) { final int alen = a.length; final int blen = b.length; int i; for (i = 0; i < alen && i < blen; i++) { if (a[i] != b[i]) break; } return i; } /** * Return a new byte[] containing the leading bytes in common between two * byte[]s. This is often used to compute the minimum length separator key. * * @param a * A variable length unsigned byte array[]. * @param b * A variable length unsigned byte array[]. * * @return A new byte[] containing the leading bytes in common between the * two arrays. */ public final static byte[] getPrefix(final byte[] a, final byte[] b) { final int len = getPrefixLength(a, b); final byte[] prefix = new byte[len]; System.arraycopy(a, 0, prefix, 0, len); return prefix; } /** * Computes the successor of a variable length byte array by appending a * unsigned zero(0) byte to the end of the array. * * @param key * A variable length unsigned byte array. * * @return A new unsigned byte[] that is the successor of the key. */ public final static byte[] successor(final byte[] key) { final int keylen = key.length; final byte[] tmp = new byte[keylen + 1]; System.arraycopy(key, 0, tmp, 0, keylen); return tmp; } /** * <p> * The keys in the nodes of a btree are known as <i>separator keys</i>. The * role of the separator keys is to direct search towards the leaf in which * a key exists or would exist by always searching the first child having a * separator key that is greater than or equal to the search key. * </p> * <p> * Separator keys separate leaves and must be choosen with that purpose in * mind. The simplest way to choose the separator key is to just take the * first key of the leaf - this is always correct. However, shorter * separator keys may be choosen by defining the separator key as the * shortest key that is less than or equal to the first key of a leaf and * greater than the last key of the left sibling of that leaf (that is, the * key for the entry that immediately proceeds the first entry on the leaf). * </p> * <p> * There are several advantages to always choosing the shortest separator * key. The original rationale (in "Prefix <i>B</i>-Trees" by Bayer and * Unterauer) was to increase the branching factors for fixed size pages. * Since we use variable size serialized record, that is not an issue. * However, using the shortest separator keys in this implementation * provides both smaller serialized records for nodes and faster search * since fewer bytes must be tested. * </p> * <p> * Note that this trick can not be used at higher levels in the btree - * separator keys are always formed based on the keys in the leaves and then * propagated through the tree. * </p> * <p> * The rules are simple enough: * <ol> * <li>The separator contains all bytes in the shared prefix (if any) plus * the 1st byte at which the given key differs from the prior key.</li> * <li>If the separator key would equal the given key by value then return * the reference to the given key.</li> * </ol> * </p> * * @param givenKey * A key. * * @param priorKey * Another key that <em>proceeds</em> the <i>givenKey</i>. * * @return The shortest key that is less than or equal to <i>givenKey</i> * and greater than <i>priorKey</i>. This will be a reference to the * <i>givenKey</i> iff that is also the shortest separator. * * @see http://portal.acm.org/citation.cfm?doid=320521.320530 * * @throws IllegalArgumentException * if either key is <code>null</code>. * @throws IllegalArgumentException * if both keys are the same reference. */ // * @throws IllegalArgumentException // * if the keys are equal. // * @throws IllegalArgumentException // * if the keys are out of order. final public static byte[] getSeparatorKey(final byte[] givenKey, final byte[] priorKey) { if (givenKey == null) throw new IllegalArgumentException(); if (priorKey == null) throw new IllegalArgumentException(); if (givenKey == priorKey) throw new IllegalArgumentException(); final int prefixLen = getPrefixLength(givenKey, priorKey); if (prefixLen == givenKey.length - 1) { /* * The given key is the shortest separator. Examples would include: * * given: 0 1 2 prior: 0 1 * * or * * given: 0 1 2 prior: 0 1 1 * * or * * given: 0 1 2 prior: 0 1 1 2 */ return givenKey; } /* * The separator includes all bytes in the shared prefix plus the next * byte from the given key. */ // allocate to right size. final byte[] tmp = new byte[prefixLen + 1]; // copy shared prefix plus the following byte. System.arraycopy(givenKey, 0, tmp, 0, prefixLen + 1); return tmp; } /** * Formats a key as a series of comma delimited unsigned bytes. * * @param key * The key. * * @return The string representation of the array as unsigned bytes. */ final public static String toString(final byte[] key) { if (key == null) return NULL; return toString(key, 0, key.length); } /** * Formats a key as a series of comma delimited unsigned bytes. * * @param key * The key. * @param off * The index of the first byte that will be visited. * @param len * The #of bytes to visit. * * @return The string representation of the array as unsigned bytes. */ final public static String toString(final byte[] key, final int off, final int len) { if (key == null) return NULL; final StringBuilder sb = new StringBuilder(len * 4 + 2); sb.append("["); for (int i = off; i < off + len; i++) { if (i > 0) sb.append(","); // as an unsigned integer. // sb.append(Integer.toHexString(key[i] & 0xff)); sb.append(Integer.toString(key[i] & 0xff)); } sb.append("]"); return sb.toString(); } private static transient String NULL = "null"; /** * Formats the data into a {@link String}. * * @param data * An array of unsigned byte arrays. */ static public String toString(final byte[][] data) { final StringBuilder sb = new StringBuilder(); final int n = data.length; sb.append("data(n=" + n + ")={"); for (int i = 0; i < n; i++) { final byte[] a = data[i]; sb.append("\n"); sb.append("data[" + i + "]="); sb.append(BytesUtil.toString(a)); if (i + 1 < n) sb.append(","); } sb.append("}"); return sb.toString(); } /** * Binary search on an array whose members are variable length unsigned * byte[]s. * * @param keys * The buffer. * @param base * The offset of the base of the array within the buffer. * @param nmem * The #of members in the array. When [nmem == 0], the array is * empty. * @param key * The key for the search. * * @return index of the search key, if it is contained in <i>keys</i>; * otherwise, <code>(-(insertion point) - 1)</code>. The insertion * point is defined as the point at which the key would be inserted * into the array of keys. Note that this guarantees that the return * value will be >= 0 if and only if the key is found. */ static final public int binarySearch(final byte[][] keys, final int base, final int nmem, final byte[] key) { int low = 0; int high = nmem - 1; while (low <= high) { final int mid = (low + high) >> 1; final int offset = base + mid; final byte[] midVal = keys[offset]; // compare actual vs probe final int tmp = BytesUtil.compareBytes(midVal, key); if (tmp < 0) { // Actual LT probe, restrict lower bound and try again. low = mid + 1; } else if (tmp > 0) { // Actual GT probe, restrict upper bound and try again. high = mid - 1; } else { // Actual EQ probe. Found : return offset. return offset; } } // Not found: return insertion point. final int offset = (base + low); return -(offset + 1); } /** * Compares two unsigned byte[]s. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> * @version $Id$ */ public static class UnsignedByteArrayComparator implements Comparator<byte[]> { public static transient final Comparator<byte[]> INSTANCE = new UnsignedByteArrayComparator(); public int compare(final byte[] o1, final byte[] o2) { return BytesUtil.compareBytes(o1, o2); } } /** * Return <code>true</code> if the <i>key</i> lies inside of the optional * half-open range constraint. * * @param key * The key. * @param fromKey * The inclusive lower bound -or- <code>null</code> if there is * no lower bound. * @param toKey * The exclusive upper bound -or- <code>null</code> if there is * no upper bound. * * @return <code>true</code> unless the <i>key</i> is LT [fromKey] or GTE * [toKey]. */ final static public boolean rangeCheck(final byte[] key, final byte[] fromKey, final byte[] toKey) { if (fromKey == null && toKey == null) { // no range constraint. return true; } if (fromKey != null) { if (BytesUtil.compareBytes(key, fromKey) < 0) { if (log.isDebugEnabled()) { log.debug("key=" + BytesUtil.toString(key) + " LT fromKey" + BytesUtil.toString(fromKey)); } // key is LT then the optional inclusive lower bound. return false; } } if (toKey != null) { if (BytesUtil.compareBytes(key, toKey) >= 0) { if (log.isDebugEnabled()) { log.debug("key=" + BytesUtil.toString(key) + " GTE toKey" + BytesUtil.toString(toKey)); } // key is GTE the optional exclusive upper bound return false; } } return true; } /** * This method forces the load of the JNI library and tries to execute the * JNI methods. * <p> * In order to use the JNI library under Windows, you must specify the JNI * library location using the PATH environment variable, e.g., * * <pre> * cd bigdata * set PATH=%PATH%;lib * java -cp bin com.bigdata.btree.BytesUtil * </pre> * * <p> * In order to use the JNI library under un*x, you must specify the JNI * library location * * <pre> * java -Djava.library.path=lib com.bigdata.btree.BytesUtil * </pre> * * @param args * * @exception UnsatisfiedLinkError * if the JNI methods can not be resolved. * @exception AssertionError * if the JNI methods do not produce the expected answers. */ public static void main(final String[] args) { // Force load of the JNI library. loadJNILibrary(); if (0 != BytesUtil._compareBytes(3, new byte[] { 1, 2, 3 }, 3, new byte[] { 1, 2, 3 })) { throw new AssertionError(); } if (0 != BytesUtil._compareBytesWithOffsetAndLen(0, 3, new byte[] { 1, 2, 3 }, 0, 3, new byte[] { 1, 2, 3 })) { throw new AssertionError(); } System.out.println("JNI library routines Ok."); } /** * Return the #of bytes required to bit code the specified #of bits. * * @param nbits * The #of bit flags. * * @return The #of bytes required. This will be zero iff <i>nbits</i> is * zero. */ final public static int bitFlagByteLength(final int nbits) { return nbits / 8 + (nbits % 8 == 0 ? 0 : 1); // return nbits>>>3; // if (nbits == 0) // return 0; // // return ((int) ((nbits / 8) + 1)); } /** * Return the index of the byte in which the bit with the given index is * encoded. * * @param bitIndex * The bit index. * * @return The byte index. */ final public static int byteIndexForBit(final long bitIndex) { return ((int) (bitIndex / 8)); } /** * Return the offset within the byte in which the bit is coded of the bit * (this is just the remainder <code>bitIndex % 8</code>). * <p> * Note, the computation of the bit offset is intentionally aligned with * {@link OutputBitStream} and {@link InputBitStream}. * * @param bitIndex * The bit index into the byte[]. * * @return The offset of the bit in the appropriate byte. */ final public static int withinByteIndexForBit(final long bitIndex) { return 7 - ((int) bitIndex) % 8; } /** * Get the value of a bit. * <p> * Note, the computation of the bit offset is intentionally aligned with * {@link OutputBitStream} and {@link InputBitStream}. * * @param bitIndex * The index of the bit. * * @return The value of the bit. */ final public static boolean getBit(final byte[] buf, final long bitIndex) { final int mask = (1 << withinByteIndexForBit(bitIndex)); final int off = byteIndexForBit(bitIndex); final byte b = buf[off]; return (b & mask) != 0; } /** * Set the value of a bit - this is NOT thread-safe (contention for the byte * in the backing buffer can cause lost updates). * <p> * Note, the computation of the bit offset is intentionally aligned with * {@link OutputBitStream} and {@link InputBitStream}. * * @param bitIndex * The index of the bit. * * @return The old value of the bit. */ final public static boolean setBit(final byte[] buf, final long bitIndex, final boolean value) { final int mask = (1 << withinByteIndexForBit(bitIndex)); final int off = byteIndexForBit(bitIndex); // current byte at that index. byte b = buf[off]; final boolean oldValue = (b & mask) != 0; if (value) b |= mask; else b &= ~mask; buf[off] = b; return oldValue; } /** * An array of 32-bit mask values. The index in the array is the #of bits of * the hash code to be considered. The value at that index in the array is * the mask to be applied to mask off to zero the high bits of the hash code * which are to be ignored. */ static private final int[] masks32; static { // Populate the array of masking values. masks32 = new int[33]; for (int i = 0; i < 33; i++) masks32[i] = getMSBMask(i); } /** * Return a bit mask which reveals only the MSB (Most Significant Bits) N * bits of an int32 value. * * @param nbits * The #of bits to be revealed. * * @return The mask. * * @throws IllegalArgumentException * if <i>nbits</i> is LT ZERO (0). * @throws IllegalArgumentException * if <i>nbits</i> is GT 32. */ public static/* private */int getMSBMask(final int nbits) { if (nbits < 0 || nbits > 32) throw new IllegalArgumentException(); final int limit = (32 - nbits); int mask = 0; for (int i = 31; i >= limit; i--) { final int bit = (1 << i); mask |= bit; } return mask; } /** * Mask off all but the MSB <i>nbits</i> of the hash value and shift them * down such that the masked bits appear at bits (nbits:0] of the returned * value. This is used to index into a dictionary page based on the revealed * bits. * * @param h * The hash value. * @param nbits * The #of bits already accounted for by the path from the root. * * @return The hash value considering only the MSB <i>nbits</i> and shifted * down into an <i>nbits</i> integer. */ public static int maskOffMSB(final int h, final int nbits) { if (nbits < 0 || nbits > 32) throw new IllegalArgumentException(); final int v = h & masks32[nbits]; final int x = v >>> (32 - nbits); return x; } /** * Mask off all but the LSB <i>nbits</i> of the hash value. * * @param h * The hash value. * @param nbits * The #of LSB bits to be retained. * * @return The LSB <i>nbits</i>. * * TODO unit test. */ public static int maskOffLSB(final int h, final int nbits) { if (nbits < 0 || nbits > 32) throw new IllegalArgumentException(); final int v = h & ~masks32[32-nbits]; return v; } /** * Return the n-bit integer corresponding to the inclusive bit range of the * byte[]. Bit ZERO (0) is the Most Significant Bit (MSB). Bit positions * increase from zero up to <code>a.length * 8 - 1</code>. The return value * is an int32 and the bit range must not be greater than 32 bits. * <p> * For example, given the following data and the bit range (0,2) * * <pre> * bit index: 01234567 * ---------+---------- * bit value: 10110000 * </pre> * * TWO (2) bits starting at bit offset ZERO (0) would be extracted and * returned as a 2-bit integer. For those data, the return value would be an * int32 value whose binary representation was <code>10</code> (with leading * zeros suppressed). * <p> * Note: This method is design for use with the unsigned byte[] keys in a * bigdata hash index. All keys in bigdata are internally represented as * unsigned byte[]s, which is why this method accepts a byte[] rather than * an long[] for the bits. Also, while the length of an unsigned byte[] key * can vary, they are never huge and an int32 value is sufficient to index * into the bits in the byte[]. Finally, the return value is an int because * it will be used in hash table designs to index into a hash table based on * those bits in a hash code key which are masked as relevant to that hash * table. 32bits is far more than we will need to index into a hash table. * For an 8k page, we might expect a fan out of at most 1024 which is only * 10 bits. * * @param a * A byte[]. * @param off * The index of the first bit to be included. * @param len * The number of bits to be returned in [0:32]. However, a bit * length of zero will always return zero. * * @return The integer extracted from the specified bit range. */ public static int getBits(final byte[] a, final int off, final int len) { if (a == null || off < 0 || len < 0 || len > 32) throw new IllegalArgumentException(); final int maxbits = a.length * 8; if (off + len > maxbits) throw new IllegalArgumentException("off: " + off + ", len: " + len + ", a.length: " + a.length); if (len == 0) // zero length is always a zero. return 0; /* * Build int32 value having the desired bits. */ // byte in which the bit range begins. final int fromByteOffset = off / 8; // byteIndexForBit(off); final int lastByte = (off + len - 1); // byte in which the bit range ends (inclusive). final int toByteOffset = lastByte / 8; // byteIndexForBit(off + // len - 1); /* * The data are assembled into the int64 value by copying each byte in * turn having data for the slice. This will copy at most 5 bytes. For * example, when a 32-bit window starts in the middle of a byte. Once * the bytes are assembled into the int64 buffer, they are shifted down * to put the last bit extracted at bit index ZERO (0) of the int32 * word. Finally, the unused high bits are cleared to zero using a mask. */ final int nbytes = toByteOffset - fromByteOffset + 1; long v = 0L; // buffer for up to 5 source bytes. for (int i = fromByteOffset, j = 1; i <= toByteOffset; i++, j++) { final long x = 0xFF & a[i]; // next byte. final int shift = ((nbytes - j) << 3); // v |= (x << shift); // mask off high bits and shift into buf. // inlining the above expressions produces SLOWER code // v |= ((0xFF & a[i]) << ((nbytes-j)<<3)); } // next byte in the byte[]. // NOTE an explicit select on byte count to avoid the for loop // produces SLOWER code final int rshift = 7 - (lastByte % 8); // final right shift to word align. final int w = (int) (v >>> rshift); // int32 result. // int mask = masks32[32 - len]; // lookup mask with [len] LSB ZEROs. // mask = ~mask; // flip bits to get [len] LSB ONEs. // w &= mask; // mask off the lower [len] bits (handles sign extension // and // starting offset within byte). // return w; // inlining the above is a little FASTER return w & ~masks32[32 - len]; } static final int[] bitmasks = new int[] { 0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF }; /* * Given a byte and a start and length copy the bits to a long and mask off * those not required. The third argument defines the destination start bit * in the long. */ private static long getLongVal(final byte b, final int s, final int l, final int t) { final int lastBit = s + l; final int shift = 64 - t - l; final long ret; if (lastBit < 8) { ret = b >>> (8 - lastBit); } else { ret = b; } return (ret & bitmasks[l]) << shift; } private static int getIntVal(final byte b, final int s, final int l, final int t) { final int lastBit = s + l; final int shift = 64 - t - l; final int ret; if (lastBit < 8) { ret = b >>> (8 - lastBit); } else { ret = b; } return (ret & bitmasks[l]) << shift; } public static long altGetBits64(final byte[] a, final int off, final int len) { if (a == null || off < 0 || len < 0 || len > 64) throw new IllegalArgumentException(); if (len == 0) // zero length is always a zero. return 0; if (off + len > a.length * 8) throw new IllegalArgumentException(); /* * Build int64 value having the desired bits. */ // byte in which the bit range begins. int bi = off/8; // start bit offset int bo = off % 8; // bits remaining in current byte int br = 8 - bo; // long target start bit int lt = 64 - len; int lr = len; long ret = 0; for (int t = 0; t < len; t += br, bo = 0, br = 8, bi++) { if (br > lr) { br = lr; } ret |= getLongVal(a[bi], bo, br, lt); lr -= br; lt += br; } return ret; } /* * Is there any advantage in 32-bit math over 64-bit? */ public static int altGetBits32(final byte[] a, final int off, final int len) { if (a == null || off < 0 || len < 0 || len > 64) throw new IllegalArgumentException(); if (len == 0) // zero length is always a zero. return 0; if (off + len > a.length * 8) throw new IllegalArgumentException(); /* * Build int32 value having the desired bits. */ // byte in which the bit range begins. int bi = off/8; // start bit offset int bo = off % 8; // bits remaining in current byte int br = 8 - bo; // long target start bit int lt = 32 - len; int lr = len; int ret = 0; for (int t = 0; t < len; t += br, bo = 0, br = 8, bi++) { if (br > lr) { br = lr; } ret |= getIntVal(a[bi], bo, br, lt); lr -= br; lt += br; } return ret; } public static long getBits64(final byte[] a, final int off, final int len) { long ret = 0; if (len <= 32) { ret = 0xFFFFFFFFL & getBits(a, off, len); } else { final int hilen = len - 32; ret = getBits(a, off, hilen); ret <<= 32; ret |= (0xFFFFFFFFL & getBits(a, off + hilen, 32)); } return ret; } /** * Some benchmarks seem to indicate that altGetBits32 is faster than getBits * for smaller byte counts. OTOH the cost of the redirection may outweigh * any benefit. */ public static int optGetBits(final byte[] a, final int off, final int len) { if (len <= 16) { return altGetBits32(a, off, len); } else { return getBits(a, off, len); } } /** * Return the n-bit integer corresponding to the inclusive bit range of the * byte[]. Bit ZERO (0) is the Most Significant Bit (MSB). Bit positions * increase from zero up to <code>31</code>. The return value is an int32 * and the bit range must not be greater than 32 bits. * <p> * For example, given the following data and the bit range (0,2) * * <pre> * bit index: 01234567 * ---------+---------- * bit value: 10110000 * </pre> * * TWO (2) bits starting at bit offset ZERO (0) would be extracted and * returned as a 2-bit integer. For those data, the return value would be an * int32 value whose binary representation was <code>10</code> (with leading * zeros suppressed). * <p> * Note: This method is design for use in a bigdata hash index having native * int32 keys rather than unsigned byte[] keys. * * @param a * An integer. * @param off * The index of the first bit to be included. * @param len * The number of bits to be returned in [0:32]. However, a bit * length of zero will always return zero. * * @return The integer extracted from the specified bit range. */ public static int getBits(final int a, final int off, final int len) { if (off < 0) throw new IllegalArgumentException(); if (len < 0 || len > 32) throw new IllegalArgumentException(); if (len == 0) // zero length is always a zero. return 0; if (off + len > 32) throw new IllegalArgumentException(); final int last = off + len - 1; // index of the last bit (inclusive). final int rshift = 31 - last; // right shift to word align. int w = (int) (a >>> rshift); // int32 result. int mask = masks32[32 - len]; // lookup mask with [len] LSB ZEROs. mask = ~mask; // flip bits to get [len] LSB ONEs. w &= mask; // mask off the lower [len] bits (handles sign extension and // starting offset within byte). return w; } /** * Return the binary representation of the unsigned byte[]. * * @param b * The unsigned byte[]. * * @return The representation of the bits in that unsigned byte[]. * * @throws IllegalArgumentException * if the argument is <code>null</code>. */ public static String toBitString(final byte[] b) { if (b == null)// Note: fromKey/toKey may be null; caller must check 1st throw new IllegalArgumentException(); final char[] chars = new char[b.length << 3]; // one char per bit. int bitIndex = 0; // start at the msb. for (int i = 0; i < b.length; i++) { final byte x = b[i]; // next byte. for (int withinByteIndex = 7; withinByteIndex >= 0; withinByteIndex--) { final int mask = 1 << withinByteIndex; final boolean bit = (x & mask) != 0; chars[bitIndex++] = bits[bit ? 1 : 0]; } // next bit in the current byte. } // next byte in the byte[]. // System.err.println("b[]=" + BytesUtil.toString(b) + ", chars=" // + Arrays.toString(chars)); return new String(chars); } /** binary digits. */ private final static char[] bits = { '0', '1' }; /** * Decode a string of the form <code>[0-9]+(k|kb|m|mb|g|gb)?</code>, * returning the number of bytes. When a suffix indicates kilobytes, * megabytes, or gigabytes then the returned value is scaled accordingly. * The suffix is NOT case sensitive. * * @param s * The string value. * * @return The byte count. * * @throws IllegalArgumentException * if there is a problem with the argument (<code>null</code>, * ill-formed, etc). */ static public long getByteCount(final String s) { if (s == null) throw new IllegalArgumentException(); final Matcher m = PATTERN_BYTE_COUNT.matcher(s); if (!m.matches()) throw new IllegalArgumentException(s); // the numeric component. final String g1 = m.group(1); final long c = Long.valueOf(g1); // the units (null if not given). final String g2 = m.group(2); final long count; if (g2 == null) { count = c; } else if (g2.equalsIgnoreCase("k") || g2.equalsIgnoreCase("kb")) { count = c * Bytes.kilobyte; } else if (g2.equalsIgnoreCase("m") || g2.equalsIgnoreCase("mb")) { count = c * Bytes.megabyte; } else if (g2.equalsIgnoreCase("g") || g2.equalsIgnoreCase("gb")) { count = c * Bytes.gigabyte; } else { throw new AssertionError(); } return count; } static final private Pattern PATTERN_BYTE_COUNT = Pattern.compile("([0-9]+)(k|kb|m|mb|g|gb)?", Pattern.CASE_INSENSITIVE); /** * Return a byte[] having the data in the {@link ByteBuffer} from the * {@link ByteBuffer#position()} to the {@link ByteBuffer#limit()}. The * position, limit, and mark are not affected by this operation. When the * {@link ByteBuffer} has a backing array, the array offset is ZERO (0), and * the {@link ByteBuffer#limit()} is equal to the * {@link ByteBuffer#capacity()} then the backing array is returned. * Otherwise, a new byte[] is allocated and the data are copied into that * byte[], which is then returned. * * @param b * The {@link ByteBuffer}. * * @return The byte[]. */ static public byte[] toArray(final ByteBuffer b) { return toArray(b, false/* forceCopy */, null/* dst */); } /** * Return a byte[] having the data in the {@link ByteBuffer} from the * {@link ByteBuffer#position()} to the {@link ByteBuffer#limit()}. The * position, limit, and mark are not affected by this operation. * <p> * Under certain circumstances it is possible and may be desirable to return * the backing {@link ByteBuffer#array}. This behavior is enabled by * <code>forceCopy := false</code>. * <p> * It is possible to return the backing byte[] when the {@link ByteBuffer} * has a backing array, the array offset is ZERO (0), and the * {@link ByteBuffer#limit()} is equal to the {@link ByteBuffer#capacity()} * then the backing array is returned. Otherwise, a new byte[] must be * allocated, and the data are copied into that byte[], which may then be * returned. * * @param b * The {@link ByteBuffer}. * @param forceCopy * When <code>false</code>, the backing array will be returned if * possible. * @param dst * A byte[] provided by the caller (optional). When non- * <code>null</code> and having a length GTE * {@link ByteBuffer#remaining()}, this array will be preferred * to a newly allocated array. * * @return The byte[] having the data. When <i>dst</i> is non- * <code>null</code> this MAY be the caller's array. When it is the * caller's array, it MAY be larger than the #of bytes actually * read. */ static public byte[] toArray(final ByteBuffer b, final boolean forceCopy, final byte[] dst) { if (!forceCopy && b.hasArray() && b.arrayOffset() == 0 && b.position() == 0) { // && b.limit() == b.capacity() final byte[] a = b.array(); if (a.length == b.limit()) { return a; } } /* * Copy the data into a byte[] using a read-only view on the buffer so * that we do not mess with its position, mark, or limit. */ final ByteBuffer tmp = b.asReadOnlyBuffer(); final int len = tmp.remaining(); final byte[] a = dst != null && dst.length >= len ? dst : new byte[len]; // Transfer only the available bytes. tmp.get(a, 0, len); return a; } static private final char[] HEX_CHAR_TABLE = { '0', '1','2','3', '4','5','6','7', '8','9','a','b', 'c','d','e','f' }; /** * Utility to convert an int array to a hex string * * @param buf * The data. * * @return The hex string. */ static public String toHexString(final int[] ibuf) { final byte[] buf = new byte[ibuf.length*4]; for (int i = 0; i < ibuf.length; i++) { final int v = ibuf[i]; final int sb = i * 4; buf[sb] = (byte) (v >>> 24 & 0xFF); buf[sb+1] = (byte) (v >>> 16 & 0xFF); buf[sb+2] = (byte) (v >>> 8 & 0xFF); buf[sb+3] = (byte) (v & 0xFF); } return toHexString(buf, buf.length); } /** * Utility to convert a byte array to a hex string. * * @param buf * The data. * * @return The hex string. */ static public String toHexString(final byte[] buf) { if (buf == null) return "NULL"; return toHexString(buf, buf.length); } /** * Utility to display byte array of maximum i bytes as hexString. * * @param buf * The data. * @param n * The #of bytes to convert. * * @return The hex string. */ static public String toHexString(final byte[] buf, int n) { if (buf == null) return "NULL"; n = n < buf.length ? n : buf.length; final StringBuffer out = new StringBuffer(); for (int i = 0; i < n; i++) { final int v = buf[i] & 0xFF; out.append(HEX_CHAR_TABLE[v >>> 4]); out.append(HEX_CHAR_TABLE[v & 0xF]); } return out.toString(); } /** * Formats hex dta into 64 byte rows. * * @param sb * Where to format the data. * @param hexData * The data. */ static public void printHexString(final StringBuilder sb, final String hexData) { int rem = hexData.length(); int curs = 0; while (rem >= 64) { sb.append(String.format("%8d: ", curs)); sb.append(hexData.substring(curs, curs + 64) + "\n"); curs += 64; rem -= 64; } } /** * Return the data in the buffer. When possible, the backing array is * returned. Otherwise, a new byte[] is allocated, the data are copied into * the array, and the new array is returned. */ public static byte[] getBytes(ByteBuffer buf) { if (buf.hasArray() && buf.arrayOffset() == 0 && buf.position() == 0 && buf.limit() == buf.capacity()) { /* * Return the backing array. */ return buf.array(); } /* * Copy the expected data into a byte[] using a read-only view on the * buffer so that we do not mess with its position, mark, or limit. */ final byte[] a; { buf = buf.asReadOnlyBuffer(); final int len = buf.remaining(); a = new byte[len]; buf.get(a); } return a; } /** * Converts a byte array into a binary string. Useful for debugging. * * @param zOrderByteArray * @return */ public static String byteArrToBinaryStr(byte[] zOrderByteArray) { final StringBuffer buf = new StringBuffer(); for (int i=0; i<zOrderByteArray.length*8; i++) { buf.append(BytesUtil.getBit(zOrderByteArray, i) ? "1" : "0"); } return buf.toString(); } }