/* * fastutil: Fast & compact type-specific collections for Java * * Copyright (C) 2002, 2003, 2004, 2005, 2006 Sebastiano Vigna * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ package it.unimi.dsi.fastutil.bytes.custom; import it.unimi.dsi.fastutil.bytes.ByteArrays; import it.unimi.dsi.fastutil.ints.IntArrays; import it.unimi.dsi.fastutil.objects.AbstractObjectList; import it.unimi.dsi.fastutil.objects.AbstractObjectListIterator; import it.unimi.dsi.fastutil.objects.ObjectListIterator; import java.io.DataOutput; import java.io.IOException; import java.io.OutputStream; import java.io.Serializable; import java.nio.ByteBuffer; import java.util.Collection; import java.util.Iterator; import java.util.NoSuchElementException; /** * Compact storage of lists of arrays using front coding. * * <P> * This class stores immutably a list of arrays in a single large array using * front coding (of course, the compression will be reasonable only if the list * is sorted lexicographically—see below). It implements an immutable * type-specific list that returns the <var>i</var>-th array when calling * {@link #get(int) get(<var>i</var>)}. The returned array may be freely * modified. 
* * <P> * Front coding is based on the idea that if the <var>i</var>-th and the * (<var>i</var>+1)-th array have a common prefix, we might store the length of * the common prefix, and then the rest of the second array. * * <P> * This approach, of course, requires that once in a while an array is stored * entirely. The <def>ratio</def> of a front-coded list defines how often this * happens (once every {@link #ratio()} arrays). A higher ratio means more * compression, but means also a longer access time, as more arrays have to be * probed to build the result. Note that we must build an array every time * {@link #get(int)} is called, but this class provides also methods that * extract one of the stored arrays in a given array, reducing garbage * collection. See the documentation of the family of <code>get()</code> * methods. * * <P> * By setting the ratio to 1 we actually disable front coding: however, we still * have a data structure storing large list of arrays with a reduced overhead * (just one integer per array, plus the space required for lengths). * * <P> * Note that the typical usage of front-coded lists is under the form of * serialized objects; usually, the data that has to be compacted is processed * offline, and the resulting structure is stored permanently. Since the pointer * array is not stored, the serialized format is very small. * * <H2>Implementation Details</H2> * * <P> * All arrays are stored in a large array. A separate array of pointers indexes * arrays whose position is a multiple of the ratio: thus, a higher ratio means * also less pointers. * * <P> * More in detail, an array whose position is a multiple of the ratio is stored * as the array length, followed by the elements of the array. The array length * is coded by a simple variable-length list of <var>k</var>-1 bit blocks, where * <var>k</var> is the number of bits of the underlying primitive type. 
All * other arrays are stored as follows: let <code>common</code> be the length of the * maximum common prefix between the array and its predecessor. Then we store * the array length decremented by <code>common</code>, followed by * <code>common</code>, followed by the array elements whose index is greater * than or equal to <code>common</code>. For instance, if we store * <samp>foo</samp>, <samp>foobar</samp>, <samp>football</samp> and * <samp>fool</samp> in a front-coded character-array list with ratio 3, the * character array will contain * * <pre> * <b>3</b> f o o <b>3</b> <b>3</b> b a r <b>5</b> <b>3</b> t b a l l <b>4</b> f o o l * </pre> * * <H2>Limitations</H2> * * <P> * All arrays are stored in a large array: thus, the compressed list must not * exceed {@link java.lang.Integer#MAX_VALUE} elements. Moreover, iterators are * less efficient when they move back, as * {@link java.util.ListIterator#previous() previous()} cannot build * incrementally the previous array (whereas * {@link java.util.ListIterator#next() next()} can). * * <h3>Modifications</h3> * * This class was derived from * <code>it.unimi.dsi.fastutil.bytes.ArrayFrontCodedList</code>, which is part * of fastutil. The following changes were made: * <ul> * <li>The name of the class has been changed to prevent classpath problems.</li> * <li>The class has a new {@link #serialVersionUID} and the serialization logic * has been modified to allow serialization against {@link DataOutput} by * defining {@link #getBackingBuffer()} and new constructors that operate on a * byte[] slice.</li> * <li>The test code from main() has been isolated in a junit test suite.</li> * <li>The backing <code>byte[] array</code> has been replaced by an interface * suitable for wrapping either a <code>byte[]</code> or a {@link ByteBuffer}. 
* This was done in order to permit access to the front-coded representation * without "de-serializing" the data and a suitable constructor was added for * the {@link ByteBuffer} case.</li> * <li>Make the {@link Collection} and {@link Iterator} ctors strongly typed.</li> * </ul> */ public class CustomByteArrayFrontCodedList extends AbstractObjectList<byte[]> implements Serializable, Cloneable { /* * New interfaces and their implementations. */ /** * Abstraction allowing different implementations of the backing buffer. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> * @version $Id$ */ public interface BackingBuffer extends Cloneable, Serializable { /** * Return the byte value at the specified index. * * @param i * The index. * @return The byte. */ public byte get(int i); /** * Reads a coded length. * * @param pos * The starting position. * * @return The length coded at <code>pos</code>. */ public int readInt(int pos); /** * Copy data from the backing buffer into the caller's array. * * @param pos * The starting position in the backing buffer. * @param dest * The caller's array. * @param destPos * The starting position in the caller's array. * @param len * The #of bytes to copy. */ public void arraycopy(int pos, byte[] dest, int destPos, int len); /** * The size of the backing buffer in bytes. */ public int size(); /** * Return a copy of the data in the backing buffer. */ public byte[] toArray(); /** * Write the data on the output stream. * * @param out * The output stream. * * @return The #of bytes written. */ public int writeOn(OutputStream out) throws IOException; /** * Write <i>len</i> bytes starting at <i>off</i> onto the caller's * stream. * * @param out * The output stream. * @param off * The index of the first byte to be written. * @param len * The #of bytes to be written. * * @return The #of bytes written. */ public int writeOn(OutputStream out, int off, int len) throws IOException; /** * Clone the backing buffer. 
*/ public BackingBuffer clone(); } /** * Implementation for a <code>byte[]</code>. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ private class BackingByteArray implements BackingBuffer { private static final long serialVersionUID = 1L; private final byte[] a; private final int off; private final int len; public BackingByteArray(final byte[] a) { this(a,0,a.length); } public BackingByteArray(final byte[] a, final int off, final int len) { this.a = a; this.off = off; this.len = len; } public int size() { return len; } public byte get(final int i) { return a[off + i]; } public void arraycopy(final int pos, final byte[] dest, final int destPos, final int len) { if (pos < 0) // check starting pos. throw new IllegalArgumentException(); if (pos + len > this.len) // check run length. throw new IllegalArgumentException(); System.arraycopy(a/* src */, off + pos, dest, destPos, len); } public int writeOn(final OutputStream dos) throws IOException { dos.write(a, off, len); return len; } public int writeOn(final OutputStream dos, final int aoff, final int alen) throws IOException { if (aoff < 0) // check starting pos. throw new IllegalArgumentException(); if (aoff + alen > this.len) // check run length. 
throw new IllegalArgumentException(); dos.write(a, off + aoff, alen); return len; } public int readInt(int pos) { pos += off; if (a[pos] >= 0) return a[pos]; if (a[pos + 1] >= 0) return (-a[pos] - 1) << 7 | a[pos + 1]; if (a[pos + 2] >= 0) return (-a[pos] - 1) << 14 | (-a[pos + 1] - 1) << 7 | a[pos + 2]; if (a[pos + 3] >= 0) return (-a[pos] - 1) << 21 | (-a[pos + 1] - 1) << 14 | (-a[pos + 2] - 1) << 7 | a[pos + 3]; return (-a[pos] - 1) << 28 | (-a[pos + 1] - 1) << 21 | (-a[pos + 2] - 1) << 14 | (-a[pos + 3] - 1) << 7 | a[pos + 4]; } public byte[] toArray() { final byte[] b = new byte[len]; System.arraycopy(a, off, b, 0, len); return b; } public BackingByteArray clone() { return new BackingByteArray(toArray()); } } /** * Implementation with a backing {@link ByteBuffer}. * <p> * Note: Methods which interact with a ByteBuffer MUST NOT change its * position or limit. If they do then ALL methods which touch the buffer * need to be synchronized so NONE of them can have a concurrent read during * which the position/limit has been transiently modified. The culprits here * are the bulk byte transfer methods ByteBuffer#get() and ByteBuffer#put(). * This is really a huge limitation on the use of a ByteBuffer for * concurrent access to a read-only data structure. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> * @version $Id$ * * @deprecated The {@link ByteBuffer} is too slow. */ private static class BackingByteBuffer implements BackingBuffer { private static final long serialVersionUID = 1L; private final ByteBuffer b; /** * * @param b * The data. All bytes in view are used (from zero through * the capacity of the array). The limit and position of the * buffer are ignored. */ public BackingByteBuffer(final ByteBuffer b) { this.b = b; } public int size() { return b.capacity(); } public byte get(final int i) { synchronized(b) { return b.get(i); } } // @todo tweak by extracting values that are reused into tmp vars. 
public int readInt(final int pos) { synchronized(b) { if (get(pos) >= 0) return get(pos); if (get(pos + 1) >= 0) return (-get(pos) - 1) << 7 | get(pos + 1); if (get(pos + 2) >= 0) return (-get(pos) - 1) << 14 | (-get(pos + 1) - 1) << 7 | get(pos + 2); if (get(pos + 3) >= 0) return (-get(pos) - 1) << 21 | (-get(pos + 1) - 1) << 14 | (-get(pos + 2) - 1) << 7 | get(pos + 3); return (-get(pos) - 1) << 28 | (-get(pos + 1) - 1) << 21 | (-get(pos + 2) - 1) << 14 | (-get(pos + 3) - 1) << 7 | get(pos + 4); } } public byte[] toArray() { /* * Note: synchronized to prevent concurrent modification to the * pos/limit. The pos/limit are restored as a postcondition using * clear(). */ synchronized (b) { final byte[] a = new byte[b.capacity()]; b.clear(); b.get(a); b.clear(); return a; } } public void arraycopy(final int pos, final byte[] dest, final int destPos, final int len) { /* * Note: synchronized to prevent concurrent modification to the * pos/limit. The pos/limit are restored as a postcondition using * clear(). */ synchronized (b) { b.limit(pos + len); b.position(pos); b.get(dest, destPos, len); b.clear(); } } public int writeOn(final OutputStream dos) throws IOException { final byte[] a = toArray(); dos.write(a, 0/* off */, a.length/* len */); return a.length; } public int writeOn(final OutputStream dos, final int off, final int len) throws IOException { final byte[] a = new byte[len]; arraycopy(off, a, 0/*destPos*/, len); dos.write(a, 0/* off */, a.length/* len */); return a.length; } public BackingByteBuffer clone() { return new BackingByteBuffer(ByteBuffer.wrap(toArray())); } } /** * */ private static final long serialVersionUID = -2532468860579334765L; // The value for the original impl. // public static final long serialVersionUID = -7046029254386353130L; /** The number of arrays in the list. */ protected int n; /** The ratio of this front-coded list. */ protected int ratio; // /** The array containing the compressed arrays. 
*/ // protected byte[] array; /** A view on the compressed arrays. */ private BackingBuffer bb; /** <code>true</code>iff duplicate keys are allowed. */ private boolean hasDups; /** The pointers to entire arrays in the list. */ transient protected int[] p; private void assertRatio(final int ratio) { if (ratio < 1) throw new IllegalArgumentException("Illegal ratio (" + ratio + ")"); } /** * Creates a new front-coded list containing the arrays returned by the * given iterator. * * @param arrays * an iterator returning arrays. * @param ratio * the desired ratio. */ public CustomByteArrayFrontCodedList(final Iterator<byte[]> arrays, final int ratio) { this(arrays, ratio, false/* hasDups */); } public CustomByteArrayFrontCodedList(final Iterator<byte[]> arrays, final int ratio, final boolean hasDups) { assertRatio(ratio); // if (ratio < 1) // throw new IllegalArgumentException("Illegal ratio (" + ratio + ")"); byte[] array = ByteArrays.EMPTY_ARRAY; int[] p = IntArrays.EMPTY_ARRAY; byte[][] a = new byte[2][]; int curSize = 0, b = 0, common, length, minLength; while (arrays.hasNext()) { a[b] = (byte[]) arrays.next(); length = a[b].length; if (n % ratio == 0) { p = IntArrays.grow(p, n / ratio + 1); p[n / ratio] = curSize; array = ByteArrays.grow(array, curSize + count(length) + length, curSize); curSize += writeInt(array, length, curSize); System.arraycopy(a[b], 0, array, curSize, length); curSize += length; } else { minLength = a[1 - b].length; if (length < minLength) minLength = length; for (common = 0; common < minLength; common++) if (a[0][common] != a[1][common]) break; length -= common; array = ByteArrays.grow(array, curSize + count(length) + count(common) + length, curSize); curSize += writeInt(array, length, curSize); curSize += writeInt(array, common, curSize); System.arraycopy(a[b], common, array, curSize, length); curSize += length; } b = 1 - b; n++; } this.ratio = ratio; // this.array = ByteArrays.trim(array, curSize); this.bb = new BackingByteArray( 
ByteArrays.trim(array, curSize) ); // this.bb = new BackingByteBuffer( ByteBuffer.wrap(ByteArrays.trim(array, curSize) )); this.p = IntArrays.trim(p, (n + ratio - 1) / ratio); this.hasDups = hasDups; } /** * Creates a new front-coded list containing the arrays in the given * collection. * * @param c * a collection containing arrays. * @param ratio * the desired ratio. */ public CustomByteArrayFrontCodedList(final Collection<byte[]> c, final int ratio) { this(c.iterator(), ratio); } /** * Creates a new front-coded list containing the arrays in the given * collection. * * @param c * a collection containing arrays. * @param ratio * the desired ratio. * @param hasDups * <code>true</code> iff the list allows duplicate keys. */ public CustomByteArrayFrontCodedList(final Collection<byte[]> c, final int ratio, final boolean hasDups) { this(c.iterator(), ratio, hasDups); } // /** // * Reads a coded length. // * // * @param a // * the data array. // * @param pos // * the starting position. // * @return the length coded at <code>pos</code>. // */ // private static int readInt(final byte a[], int pos) { // if (a[pos] >= 0) // return a[pos]; // if (a[pos + 1] >= 0) // return (-a[pos] - 1) << 7 | a[pos + 1]; // if (a[pos + 2] >= 0) // return (-a[pos] - 1) << 14 | (-a[pos + 1] - 1) << 7 | a[pos + 2]; // if (a[pos + 3] >= 0) // return (-a[pos] - 1) << 21 | (-a[pos + 1] - 1) << 14 // | (-a[pos + 2] - 1) << 7 | a[pos + 3]; // return (-a[pos] - 1) << 28 | (-a[pos + 1] - 1) << 21 // | (-a[pos + 2] - 1) << 14 | (-a[pos + 3] - 1) << 7 | a[pos + 4]; // } /** * Computes the number of elements coding a given length. * * @param length * the length to be coded. * @return the number of elements coding <code>length</code>. */ // @SuppressWarnings("unused") private static int count(final int length) { if (length < (1 << 7)) return 1; if (length < (1 << 14)) return 2; if (length < (1 << 21)) return 3; if (length < (1 << 28)) return 4; return 5; } /** * Writes a length. 
* * @param a * the data array. * @param length * the length to be written. * @param pos * the starting position. * @return the number of elements coding <code>length</code>. */ private static int writeInt(final byte a[], int length, int pos) { final int count = count(length); a[pos + count - 1] = (byte) (length & 0x7F); if (count != 1) { int i = count - 1; while (i-- != 0) { length >>>= 7; a[pos + i] = (byte) (-(length & 0x7F) - 1); } } return count; } /** * Returns the ratio of this list. * * @return the ratio of this list. */ public int ratio() { return ratio; } /** * Computes the length of the array at the given index. * * <P> * This private version of {@link #arrayLength(int)} does not check its * argument. * * @param index * an index. * @return the length of the <code>index</code>-th array. */ private int length(final int index) { // final byte[] array = this.array; final BackingBuffer bb = this.bb; final int delta = index % ratio; // The index into the p array, and the // delta inside the block. int pos = p[index / ratio]; // The position into the array of the first // entire word before the index-th. // int length = readInt(array, pos); int length = bb.readInt(pos); if (delta == 0) return length; // First of all, we recover the array length and the maximum amount of // copied elements. int common; pos += count(length) + length; // length = readInt(array, pos); // common = readInt(array, pos + count(length)); length = bb.readInt(pos); common = bb.readInt(pos + count(length)); for (int i = 0; i < delta - 1; i++) { pos += count(length) + count(common) + length; // length = readInt(array, pos); // common = readInt(array, pos + count(length)); length = bb.readInt(pos); common = bb.readInt(pos + count(length)); } return length + common; } /** * Computes the length of the array at the given index. * * @param index * an index. * @return the length of the <code>index</code>-th array. 
*/
    public int arrayLength(final int index) {
        // Validate the index before delegating to the unchecked computation.
        ensureRestrictedIndex(index);
        return length(index);
    }

    /**
     * Extracts the array at the given index.
     * <p>
     * At most <code>length</code> elements are copied into <code>a</code>, but
     * the value returned is always the full length of the stored array, so the
     * caller can detect truncation by comparing the two.
     *
     * @param index
     *            an index.
     * @param a
     *            the array that will store the result (we assume that it can
     *            hold the result).
     * @param offset
     *            an offset into <code>a</code> where elements will be store.
     * @param length
     *            a maximum number of elements to store in <code>a</code>.
     * @return the length of the extracted array.
     */
    private int extract(final int index, final byte a[], final int offset, final int length) {
        final BackingBuffer bb = this.bb;
        final int delta = index % ratio; // The delta inside the block.
        final int startPos = p[index / ratio]; // The position into the array of
                                                // the first entire word before
                                                // the index-th.
        // currLen is the #of leading elements of the result already known to
        // be correct in a[offset...].
        int pos, prevArrayPos, currLen = 0, actualCommon;
        int arrayLength = bb.readInt(pos = startPos);
        if (delta == 0) {
            // The requested array is stored whole: copy it (truncated to
            // length) from just after its length field.
            pos = p[index / ratio] + count(arrayLength);
            bb.arraycopy(pos, a, offset, Math.min(length, arrayLength));
            return arrayLength;
        }
        int common = 0;
        for (int i = 0; i < delta; i++) {
            // Start of the data of the entry we are leaving. The first entry
            // of a block (i == 0) has no common-length field to skip.
            prevArrayPos = pos + count(arrayLength) + (i != 0 ? count(common) : 0);
            // Advance to the next entry and read its two header fields.
            pos = prevArrayPos + arrayLength;
            arrayLength = bb.readInt(pos);
            common = bb.readInt(pos + count(arrayLength));
            actualCommon = Math.min(common, length);
            if (actualCommon <= currLen)
                // The shared prefix is no longer than what we already hold:
                // only that much of the result remains valid.
                currLen = actualCommon;
            else {
                // The shared prefix reaches into the data of the entry we are
                // leaving: fetch the missing part from there.
                bb.arraycopy(prevArrayPos, a, currLen + offset, actualCommon - currLen);
                currLen = actualCommon;
            }
        }
        if (currLen < length)
            // Append the remainder stored with the requested entry itself.
            bb.arraycopy(pos + count(arrayLength) + count(common), a, currLen + offset, Math
                    .min(arrayLength, length - currLen));
        return arrayLength + common;
    }

    /**
     * Returns a newly allocated copy of the array at the given index.
     * Delegates to {@link #getArray(int)}.
     */
    public byte[] get(final int index) {
        return getArray(index);
    }

    /**
     * @see #get(int)
     */
    public byte[] getArray(final int index) {
        ensureRestrictedIndex(index);
        // Size the result exactly, then decode the entry into it.
        final int length = length(index);
        final byte a[] = new byte[length];
        extract(index, a, 0, length);
        return a;
    }

    /**
     * Write the specified byte[] onto a stream.
     *
     * @param os
     *            The stream.
     * @param index
     *            The index of the byte[].
     *
     * @return The #of bytes written on the stream.
     *
     * @throws IOException
     *
     * @todo Optimize this to avoid the byte[] allocation.
     *
     * @todo An alternative optimization would be to specify a variant of
     *       {@link #get(int)} which accepts a com.bigdata.io.ByteArrayBuffer
     *       that is automatically extended to have sufficient capacity.
     */
    public int writeOn(final OutputStream os, final int index) throws IOException {
        // Materializes the entry as a temporary array, then writes it out.
        final byte[] a = get(index);
        os.write(a);
        return a.length;
    }

    /**
     * Stores in the given array elements from an array stored in this
     * front-coded list.
     *
     * @param index
     *            an index.
     * @param a
     *            the array that will store the result.
     * @param offset
     *            an offset into <code>a</code> where elements will be store.
* @param length * a maximum number of elements to store in <code>a</code>. * @return if <code>a</code> can hold the extracted elements, the number of * extracted elements; otherwise, the number of remaining elements * with the sign changed. */ public int get(final int index, final byte[] a, final int offset, final int length) { ensureRestrictedIndex(index); ByteArrays.ensureOffsetLength(a, offset, length); final int arrayLength = extract(index, a, offset, length); if (length >= arrayLength) return arrayLength; return length - arrayLength; } /** * Stores in the given array an array stored in this front-coded list. * * @param index * an index. * @param a * the array that will store the content of the result (we assume * that it can hold the result). * @return if <code>a</code> can hold the extracted elements, the number of * extracted elements; otherwise, the number of remaining elements * with the sign changed. */ public int get(final int index, final byte[] a) { return get(index, a, 0, a.length); } public int size() { return n; } public ObjectListIterator<byte[]> listIterator(final int start) { ensureIndex(start); return new AbstractObjectListIterator<byte[]>() { byte a[] = ByteArrays.EMPTY_ARRAY; int i = 0, pos = 0; boolean inSync; // Whether the current value in a is the string just // before the next to be produced. { if (start != 0) { if (start == n) i = start; // If we start at the end, we do nothing. 
else { pos = p[start / ratio]; int j = start % ratio; i = start - j; while (j-- != 0) next(); } } } public boolean hasNext() { return i < n; } public boolean hasPrevious() { return i > 0; } public int previousIndex() { return i - 1; } public int nextIndex() { return i; } public byte[] next() { int length, common; if (!hasNext()) throw new NoSuchElementException(); final BackingBuffer bb = CustomByteArrayFrontCodedList.this.bb; if (i % ratio == 0) { pos = p[i / ratio]; // length = readInt(array, pos); length = bb.readInt(pos); a = ByteArrays.ensureCapacity(a, length, 0); // System.arraycopy(array, pos + count(length), a, 0, length); bb.arraycopy(pos + count(length), a, 0, length); pos += length + count(length); inSync = true; } else { if (inSync) { // length = readInt(array, pos); // common = readInt(array, pos + count(length)); length = bb.readInt(pos); common = bb.readInt(pos + count(length)); a = ByteArrays.ensureCapacity(a, length + common, common); // System.arraycopy(array, pos + count(length) // + count(common), a, common, length); bb.arraycopy(pos + count(length) + count(common), a, common, length); pos += count(length) + count(common) + length; length += common; } else { a = ByteArrays.ensureCapacity(a, length = length(i), 0); extract(i, a, 0, length); } } i++; return ByteArrays.copy(a, 0, length); } public byte[] previous() { if (!hasPrevious()) throw new NoSuchElementException(); inSync = false; return getArray(--i); } }; } /** * Returns a copy of this list. * * @return a copy of this list. */ public Object clone() { CustomByteArrayFrontCodedList c; try { c = (CustomByteArrayFrontCodedList) super.clone(); } catch (CloneNotSupportedException cantHappen) { throw new InternalError(); } // c.array = array.clone(); c.bb = bb.clone(); c.p = p.clone(); return c; } /** * Modified to dump internal record metadata and to show the byte[]s as * unsigned values. 
*/ public String toString() { final StringBuffer s = new StringBuffer(); s.append("{ratio=" + ratio + ", size=" + n + ", p[]=" + java.util.Arrays.toString(p)); s.append("[\n"); for (int i = 0; i < n;) { int pos = p[i/ratio]; for (int j = 0; j < ratio && i < n; j++, i++) { final int delta = i % ratio; final int pos0 = pos; // pos @ rlen. final int rlen = bb.readInt(pos); pos += count(rlen); final int clen; if (delta == 0) { clen = 0; } else { clen = bb.readInt(pos); pos += count(clen); } final byte[] a = get(i); s.append("index=" + i + ", delta=" + delta + ", p[" + (i / ratio) + "]=" + p[i / ratio] + ", pos@rlen=" + pos0 + ", rlen=" + rlen + ", clen=" + clen + ", pos@remainder=" + pos + " :: " + toString(a) + "\n"); pos += rlen; } // for(int i=0; i<n; i++) { // if (i != 0) // s.append(", "); // s.append(ByteArrayList.wrap(getArray(i)).toString()); } s.append("]}"); return s.toString(); } private void readObject(java.io.ObjectInputStream s) throws java.io.IOException, ClassNotFoundException { s.defaultReadObject(); rebuildPointerArray(); } /* * New ctors and new methods. */ /** * Reconsitute an instance from just the coded byte[], the #of elements in * the array, and the ratio. * * @param n * The #of elements in the array. * @param ratio * The ratio of this front-coded list. * @param array * The array containing the compressed arrays. */ public CustomByteArrayFrontCodedList(final int n, final int ratio, final byte[] array) { this(n, ratio, array, 0, array.length, false/* hasDups */); } /** * Reconsitute an instance from a slice byte[] containing the coded data, * the #of elements in the array, and the ratio. * * @param n * The #of elements in the array. * @param ratio * The ratio of this front-coded list. * @param array * The array containing the compressed arrays. * @param hasDups * <code>true</code> iff the list allows duplicate keys. 
*/ public CustomByteArrayFrontCodedList(final int n, final int ratio, final byte[] array, final int off, final int len, final boolean hasDups) { assertRatio(ratio); this.n = n; this.ratio = ratio; // this.array = array; this.bb = new BackingByteArray(array, off, len); this.hasDups = hasDups; rebuildPointerArray(); } /** * Reconsitute an instance from just a {@link ByteBuffer} view onto the * coded byte[], the #of elements in the array, and the ratio. * * @param n * The #of elements in the array. * @param ratio * The ratio of this front-coded list. * @param b * The view onto the compressed arrays. */ public CustomByteArrayFrontCodedList(final int n, final int ratio, final ByteBuffer b) { assertRatio(ratio); this.n = n; this.ratio = ratio; this.bb = new BackingByteBuffer(b); rebuildPointerArray(); } /** * Return the backing buffer. */ public BackingBuffer getBackingBuffer() { // return array; return bb; } /** * Rebuild pointer array from the packed byte {@link #array}, the #of * elements in that array {@link #n}, and the {@link #ratio()}. */ private void rebuildPointerArray() { final int[] p = new int[(n + ratio - 1) / ratio]; // final byte a[] = array; final BackingBuffer bb = this.bb; int i = 0, pos = 0, length, common; for (i = 0; i < n; i++) { // length = readInt(a, pos); length = bb.readInt(pos); if (i % ratio == 0) { p[i / ratio] = pos; pos += count(length) + length; } else { // common = readInt(a, pos + count(length)); common = bb.readInt(pos + count(length)); pos += count(length) + count(common) + length; } } this.p = p; } /** * Search for the index of the value having the same data. The results are * meaningless if the list is not ordered. A binary search is performed * against each of the entries that is coded as a full length value. If * there is a match, the index of that entry is returned directly. Otherwise * a linear scan is performed starting with the insertion point as * identified by the binary search. 
The combination of a binary search * followed by a linear scan implies that search is fastest when ratio is * small. However, the compression is highest when the ratio is equal to the * #of entries in the list. Therefore, search performance is traded off * against compression. * <p> * Note: The full length entry is coded every [i/ratio] entries. However, * the subsequent entries code their common length with respect to the * previous front-coded entry NOT to the full length entry. This means that * the length of the common prefix can increase or decrease as we scan a * bucket and the #of already matched bytes can increase or decrease as * well. Consider the following example, when coded with a ratio of 8. * * <pre> * [121, 59, 18, 79, 99, 112, 24, 116], // #0 rlen=8, clen=0 (new bucket) * [121, 59, 18, 79, 99, 112, 43, 68], // #1 rlen=2, clen=6 * [121, 59, 18, 79, 99, 112, 46, 78], // #2 rlen=2, clen=6 * [121, 59, 18, 79, 99, 112, 54, 48], // #3 rlen=2, clen=6 * [121, 59, 18, 79, 99, 112, 54, 108], // #4 rlen=1, clen=7 (***) * [121, 59, 18, 79, 99, 112, 55, 81], // #5 rlen=2, clen=6 * [121, 59, 18, 79, 99, 112, 62, 85], // #6 rlen=2, clen=6 * [121, 59, 18, 79, 99, 112, 63, 110], // #7 rlen=8, clen=0 (new bucket) * [121, 59, 18, 79, 99, 112, 71, 124], // #8 ... * [121, 59, 18, 79, 99, 112, 73, 49] // #9 ... * </pre> * * The common length grows for entry #4 because the [54] in the next to last * byte in the array already appears in the same position in the previous * front-coded entry. However, the common length decreases again for the * next entry (#5). * <p> * The following rules guide the linear search of the bucket identified by * the binary search. 
* <ol>
     * <li>If <code>clen LT mlen</code>, then halt. No match is possible once
     * the common prefix with the previous entry is shorter than the prefix
     * already matched against the probe.</li>
     * <li>If <code>clen GT mlen</code>, then skip to the next entry in the
     * bucket as no match is possible (the common prefix was demonstrated to be
     * longer than the matched prefix on a previous entry).</li>
     * <li>Compare the remaining bytes in the search probe with the remainder
     * for the current entry.</li>
     * <li>If the search probe is EQ to the bucket entry, then halt. The probe
     * key was found.</li>
     * <li>If the search probe is LT the bucket entry, then halt. The probe key
     * was not found.</li>
     * <li>Otherwise, <code>mlen += prefixLength</code>, where prefixLength is
     * the length of the prefix matched by the comparison step.</li>
     * </ol>
     *
     * @param a
     *            The search probe, which is interpreted as an
     *            <em>unsigned</em> <code>byte[]</code>.
     *
     * @return index of the search key, if it is found; otherwise,
     *         <code>(-(insertion point) - 1)</code>. The insertion point is
     *         defined as the point at which the key would be inserted. Note
     *         that this guarantees that the return value will be >= 0 if and
     *         only if the key is found.
     */
    public int search(final byte[] a) {

        /*
         * Entries whose index is a multiple of the ratio are stored at full
         * length, so they can be compared in place without copying the data
         * out of its compressed format. Therefore we first perform a binary
         * search over those entries only. The result is either the index (into
         * the pointer array) of an exactly matching full length entry or the
         * encoded insertion point among the full length entries.
         */
        final int pret = binarySearch(a);

        if (pret == 0 || (pret > 0 && !hasDups)) {

            /*
             * An exact match on a full length entry in the backing buffer.
             *
             * Note: If duplicates are allowed and the probe key is an exact
             * match for a full length key other than the first one (pret > 0),
             * then we fall through and search the previous encoding run for a
             * match within that run.
             */
            return pret * ratio;

        }

        if (pret == -1) {

            /*
             * The key would be inserted before the first entry in the
             * front-coded array. (This is a fast path for the case where the
             * probe key would be inserted at the head of the list and before
             * any full length key. Other insertion points are handled below.)
             */
            return -1;

        }

        /*
         * Next we do a linear scan of up to [ratio-1] entries, returning the
         * first entry whose common bytes and remainder would reconstruct the
         * probe key. If we find an entry which would be ordered GT the probe
         * key, then we return the insertion point instead.
         */

        /*
         * Convert the insertion point into the index of the set of up to ratio
         * front-coded byte[]s to be searched. Note that we always search the
         * bucket before the insertion point since we are looking for a key
         * which might exist in that bucket.
         */
        final int poffset;

        if (pret < 0) {

            /*
             * (-pret - 1) is the insertion point among the full length keys.
             * The encoding run immediately before that insertion point is the
             * one whose full length key is LT the probe key, so that is the
             * run we need to scan to see if the probe key exists. This code
             * path applies equally to search whether or not duplicate keys are
             * allowed.
             */
            poffset = (-pret - 1) - 1;

        } else {

            /*
             * This case only arises if duplicates are allowed. When duplicate
             * keys are not allowed and pret>=0, we already returned at the head
             * of this method. For this case we have a hit on the first full
             * length key for an encoding run. Now we need to scan the previous
             * encoding run to see if the probe key exists in that encoding run.
             * The outcome of this dups code path in which we search the
             * previous run is handled by conditional logic at the bottom of
             * this method.
             */
            assert hasDups; // code path iff dups allowed.
            assert pret > 0; // code path iff exact match on full length key.
            poffset = pret - 1; // pret is > 0. subtract 1 for the previous run.

        }

        // The index into the list of the full length entry which heads the run.
        final int offset = poffset * ratio;

        /*
         * This is the starting position in the backing buffer of the full
         * length entry which heads the encoding run to be scanned.
         */
        int pos = p[poffset];

        /*
         * The #of bytes in the full length entry which match the probe key.
         * Note: This is NOT a fixed value. When we scan the front-coded entries
         * in the same bucket, the common length with respect to the previous
         * entry can actually increase -or- decrease, in which case the matched
         * length may change as well.
         */
        int mlen;
        {

            // The #of bytes in the full length byte[] which heads the run.
            final int blen = bb.readInt(pos);

            // Skip the #of bytes required to code that length.
            pos += count(blen);

            // Count matching bytes (pos advances once per matched byte).
            int i;
            for (i = 0; i < a.length && i < blen; i++, pos++) {
                if (a[i] != bb.get(pos)) {
                    break;
                }
            }

            // #of matching bytes.
            mlen = i;

            // Skip over the unmatched remainder of the full length coded entry.
            pos += (blen - mlen);

        }

        /*
         * Scan up to ratio-1 entries or the last entry, whichever comes first.
         * (n is presumably the #of entries in the list - confirm against the
         * field declaration.)
         */
        final int limit = Math.min(n - (offset + 1), ratio - 1);

        // The #of front-coded entries in the run which have been passed over.
        int delta;

        for (delta = 0; delta < limit; delta++) {

            // length of the remainder for this entry.
            final int rlen = bb.readInt(pos);

            // skip past rlen field.
            pos += count(rlen);

            /*
             * Length of the common prefix. Per the notes on this method, this
             * is coded with respect to the previous front-coded entry and NOT
             * with respect to the full length entry.
             */
            final int clen = bb.readInt(pos);

            if (clen < mlen) {
                /*
                 * No match is possible once the common length is LT the matched
                 * length.
                 */
                break;
            }

            // skip past clen field.
            pos += count(clen);

            if (clen > mlen) {
                /*
                 * No match is possible while the common prefix length with the
                 * prior entry is GT the matched length with the probe key.
                 * Skip the remainder of this entry and try the next one.
                 */
                pos += rlen;
                continue;
            }

            /*
             * Compare the remaining bytes in the search probe to the remainder
             * of the current entry.
             */
            assert mlen == clen : "mlen=" + mlen + ", clen=" + clen + ", rlen=" + rlen + ", delta=" + delta + ", pret=" + pret + ", poffset=" + poffset + ", searchKey=" + toString(a) + ", this=" + this;

            final int ret = compareBytes(a, mlen, a.length - mlen, bb, pos, rlen);

            if (ret == 0) {
                // Found by linear scan of front-coded entries.
                return offset + delta + 1;
            }

            if (ret < 0) {
                // The current entry is GT the probe key. Halt (not found).
                break;
            }

            /*
             * Update the matched length by the length of the matched prefix
             * from the last comparison test (compareBytes codes that length
             * into the magnitude of its return value).
             */
            final int prefixLength = Math.abs(ret) - 1;

            mlen += prefixLength;

            // skip past the remainder and keep looking.
            pos += rlen;

        }

        /*
         * We did not find the probe key when scanning the encoding run.
         */
        if (pret >= 0) {

            /*
             * We had an exact match on the full length key when we did the
             * binary search. This code path is only taken when duplicate keys
             * are allowed, since we otherwise return immediately at the head of
             * this method. If we had found an exact match for the probe key in
             * the previous run, then we would have returned its index during
             * the scan of that encoding run above. Since we did not, we want
             * to return the index on the encoding run boundary.
             */
            assert hasDups; // only when duplicate keys are allowed.
            assert delta == limit; // We scanned the entire previous run.

            /*
             * An exact match on a full length entry in the backing buffer.
             * Duplicate keys are allowed and we have proven that the probe does
             * not exist in the previous encoding run.
             */
            return pret * ratio;

        } else {

            /*
             * We did not have an exact match on a full length key when we did
             * the binary search. We then scanned the encoding run for the
             * insertion point. We did not find an exact match within the
             * encoding run, but we worked out the correct insertion point into
             * that encoding run.
             *
             * Note: This can happen whether or not duplicate keys are allowed.
             *
             * Return the insert point into the list.
             */
            return (-(offset + delta + 1) - 1);

        }

    }

    /**
     * Binary search against the entries in the backing buffer that are coded as
     * their full length values.
     *
     * @param key
     *            The key for the search.
     *
     * @return index of the search key into the pointer array, if it is an exact
     *         match with any of the full length values whose offsets are stored
     *         in the pointer array; otherwise,
     *         <code>(-(insertion point) - 1)</code>.
The insertion point is * defined as the point at which the key would be inserted into the * list. Note that this guarantees that the return value will be >= * 0 if and only if the key is matched by any of the full length * coded entries. */ private int binarySearch(final byte[] key) { // final int base = 0; // final BackingBuffer bb = this.bb; /* * We will test each entry having an index that is an even multiple of * the ratio. The offset into the backing buffer of each such entry is * given by p[]. The data at p[i] is the length of the fully coded value * followed by the value itself. */ // final int nmem = p.length; int low = 0; int high = p.length/*nmem*/ - 1; while (low <= high) { final int mid = (low + high) >> 1; final int offset = /*base +*/ mid; /* * Compare the probe with the full length byte[] at index [mid]. */ final int tmp = comparePos(mid, key); if (tmp > 0) { // Actual GT probe, restrict lower bound and try again. low = mid + 1; } else if (tmp < 0) { // Actual LT probe, restrict upper bound and try again. high = mid - 1; } else { // duplicate check to see if previous is also a match if (hasDups && mid > 0 && comparePos(mid - 1, key) == 0) { // in which case set it as the highest high = mid - 1; } else { // Found: return offset. return offset; } } } // Not found: return insertion point. final int offset = (/*base +*/ low); return -(offset + 1); } /** * Compares the caller's key to a full length key at a specific offset * in the {@link BackingBuffer}. * * @param index * The index into the full length keys. * @param key * The probe key. * * @return A value which indicates whether the key at that offset into the * backing buffer is LT, GT, or EQ to the caller's key. */ private int comparePos(final int index, final byte[] key) { // The index into the backing buffer of index [index]. int pos = p[index]; // The #of bytes in the full length byte[] at index [index]. final int blen = bb.readInt(pos); // Skip the #of bytes required to code that length. 
pos += count(blen); // Compare key vs actual (in buffer). return compareBytes(key, 0, key.length, bb, pos, blen); } /** * Compare up to <i>len</i> bytes in <i>a</i> interpreted as unsigned bytes * against the bytes in the {@link BackingBuffer} starting at offset * <i>off</i> in the {@link BackingBuffer}. * * @param a * The caller's probe. * @param bb * The {@link BackingBuffer}. * @param boff * The offset of the first byte to be compared in the * {@link BackingBuffer}. * @param blen * The length of the byte[] at that offset in the * {@link BackingBuffer}. * * @return The return is a negative integer, zero, or a positive integer if * the first argument is less than, equal to, or greater than the * byte[] in the {@link BackingBuffer} at the specified offset. The * return value also codes the length of the shared prefix, which * may be computed as <code>Math.abs(ret)-1</code>. */ private int compareBytes(final byte[] a, final int aoff, final int alen, final BackingBuffer bb, final int boff, final int blen) { int mlen = 0; // Compare bytes(probe,entry) (negative iff probe < entry) for (int i = aoff, j = boff; i < aoff + alen && j < boff + blen; i++, j++, mlen++) { // promotes to signed integers in [0:255] for comparison. final int ret = (a[i] & 0xff) - (bb.get(j) & 0xff); if (ret != 0) { return ret < 0 ? -(mlen + 1) : (mlen + 1); } } return alen == blen ? 0 : (alen - blen) < 0 ? -(mlen + 1) : (mlen + 1); } /* * Note: These toString() methods were inlined to remove a dependency on * BytesUtil in the bigdata package (BBT, 9/24/2009). */ /** * Formats a key as a series of comma delimited unsigned bytes. * * @param key * The key. * * @return The string representation of the array as unsigned bytes. */ final public static String toString(final byte[] key) { if (key == null) return NULL; return toString(key, 0, key.length); } /** * Formats a key as a series of comma delimited unsigned bytes. * * @param key * The key. 
* @param off
     *            The index of the first byte that will be visited.
     * @param len
     *            The #of bytes to visit.
     *
     * @return The string representation of the visited bytes as unsigned
     *         values, e.g. <code>[1, 255, 16]</code>, or the string
     *         <code>null</code> if <i>key</i> is <code>null</code>.
     */
    public static final String toString(final byte[] key, final int off, final int len) {

        if (key == null) {
            return NULL;
        }

        // Up to 3 digits plus a separator per byte, plus the two brackets.
        final StringBuilder sb = new StringBuilder(len * 4 + 2);

        sb.append("[");

        for (int i = off; i < off + len; i++) {

            // A separator precedes every byte except the first one visited.
            // This must be tested against [off], not 0, or a non-zero offset
            // yields a spurious leading separator.
            if (i > off) {
                sb.append(", ");
            }

            // as an unsigned integer.
            sb.append(Integer.toString(key[i] & 0xff));

        }

        sb.append("]");

        return sb.toString();

    }

    /** The text returned by the toString() methods for a null key. */
    private static final String NULL = "null";

}