/*
* fastutil: Fast & compact type-specific collections for Java
*
* Copyright (C) 2002, 2003, 2004, 2005, 2006 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
package it.unimi.dsi.fastutil.bytes.custom;
import it.unimi.dsi.fastutil.bytes.ByteArrays;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.objects.AbstractObjectList;
import it.unimi.dsi.fastutil.objects.AbstractObjectListIterator;
import it.unimi.dsi.fastutil.objects.ObjectListIterator;
import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Compact storage of lists of arrays using front coding.
*
* <P>
* This class stores immutably a list of arrays in a single large array using
* front coding (of course, the compression will be reasonable only if the list
* is sorted lexicographically—see below). It implements an immutable
* type-specific list that returns the <var>i</var>-th array when calling
* {@link #get(int) get(<var>i</var>)}. The returned array may be freely
* modified.
*
* <P>
* Front coding is based on the idea that if the <var>i</var>-th and the
* (<var>i</var>+1)-th array have a common prefix, we might store the length of
* the common prefix, and then the rest of the second array.
*
* <P>
* This approach, of course, requires that once in a while an array is stored
* entirely. The <def>ratio</def> of a front-coded list defines how often this
* happens (once every {@link #ratio()} arrays). A higher ratio means more
* compression, but means also a longer access time, as more arrays have to be
* probed to build the result. Note that we must build an array every time
* {@link #get(int)} is called, but this class provides also methods that
* extract one of the stored arrays in a given array, reducing garbage
* collection. See the documentation of the family of <code>get()</code>
* methods.
*
* <P>
* By setting the ratio to 1 we actually disable front coding: however, we still
* have a data structure storing large list of arrays with a reduced overhead
* (just one integer per array, plus the space required for lengths).
*
* <P>
* Note that the typical usage of front-coded lists is under the form of
* serialized objects; usually, the data that has to be compacted is processed
* offline, and the resulting structure is stored permanently. Since the pointer
* array is not stored, the serialized format is very small.
*
* <H2>Implementation Details</H2>
*
* <P>
* All arrays are stored in a large array. A separate array of pointers indexes
* arrays whose position is a multiple of the ratio: thus, a higher ratio means
* also less pointers.
*
* <P>
* More in detail, an array whose position is a multiple of the ratio is stored
* as the array length, followed by the elements of the array. The array length
* is coded by a simple variable-length list of <var>k</var>-1 bit blocks, where
* <var>k</var> is the number of bits of the underlying primitive type. All
* other arrays are stored as follows: let <code>common</code> the length of the
* maximum common prefix between the array and its predecessor. Then we store
* the array length decremented by <code>common</code>, followed by
* <code>common</code>, followed by the array elements whose index is greater
* than or equal to <code>common</code>. For instance, if we store
* <samp>foo</samp>, <samp>foobar</samp>, <samp>football</samp> and
* <samp>fool</samp> in a front-coded character-array list with ratio 3, the
* character array will contain
*
* <pre>
* <b>3</b> f o o <b>3</b> <b>3</b> b a r <b>5</b> <b>3</b> t b a l l <b>4</b> f o o l
* </pre>
*
* <H2>Limitations</H2>
*
* <P>
* All arrays are stored in a large array: thus, the compressed list must not
* exceed {@link java.lang.Integer#MAX_VALUE} elements. Moreover, iterators are
* less efficient when they move back, as
* {@link java.util.ListIterator#previous() previous()} cannot build
* incrementally the previous array (whereas
* {@link java.util.ListIterator#next() next()} can).
*
* <h3>Modifications</h3>
*
* This class was derived from
* <code>it.unimi.dsi.fastutil.bytes.ArrayFrontCodedList</code>, which is part
* of fastutil. The following changes were made:
* <ul>
* <li>The name of the class has been changed to prevent classpath problems.</li>
* <li>The class has a new {@link #serialVersionUID} and the serialization logic
* has been modified to allow serialization against {@link DataOutput} by
* defining {@link #getBackingBuffer()} and new constructors that operate on a
* byte[] slice.</li>
* <li>The test code from main() has been isolated in a junit test suite.</li>
* <li>The backing <code>byte[] array</code> has been replaced by an interface
* suitable for wrapping either a <code>byte[]</code> or a {@link ByteBuffer}.
* This was done in order to permit access to the front-coded representation
* without "de-serializing" the data and a suitable constructor was added for
* the {@link ByteBuffer} case.</li>
* <li>Make the {@link Collection} and {@link Iterator} ctors strongly typed.</li>
* </ul>
*/
public class CustomByteArrayFrontCodedList extends AbstractObjectList<byte[]>
implements Serializable, Cloneable {
/*
* New interfaces and their implementations.
*/
/**
 * Read-only view of the bytes that hold the front-coded data. The list only
 * talks to its storage through this interface, so the same decoding logic can
 * run over different kinds of backing storage.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
 *         Thompson</a>
 * @version $Id$
 */
public interface BackingBuffer extends Cloneable, Serializable {

    /**
     * Fetches a single byte.
     *
     * @param i
     *            The index of the byte, relative to the start of the buffer.
     * @return The byte stored at that index.
     */
    byte get(int i);

    /**
     * Decodes the variable-length coded length that starts at the given
     * position.
     *
     * @param pos
     *            The position of the first byte of the coded length.
     *
     * @return The length coded at <code>pos</code>.
     */
    int readInt(int pos);

    /**
     * Bulk copy out of the backing buffer into an array supplied by the
     * caller.
     *
     * @param pos
     *            The position in the backing buffer of the first byte to copy.
     * @param dest
     *            The array receiving the bytes.
     * @param destPos
     *            The index in <code>dest</code> of the first byte written.
     * @param len
     *            The #of bytes to copy.
     */
    void arraycopy(int pos, byte[] dest, int destPos, int len);

    /**
     * The #of bytes in the backing buffer.
     */
    int size();

    /**
     * Returns a freshly allocated copy of the bytes in the backing buffer.
     */
    byte[] toArray();

    /**
     * Writes the entire content of the buffer on the caller's stream.
     *
     * @param out
     *            The output stream.
     *
     * @return The #of bytes written.
     */
    int writeOn(OutputStream out) throws IOException;

    /**
     * Writes a slice of the buffer (<i>len</i> bytes starting at <i>off</i>)
     * on the caller's stream.
     *
     * @param out
     *            The output stream.
     * @param off
     *            The index of the first byte to be written.
     * @param len
     *            The #of bytes to be written.
     *
     * @return The #of bytes written.
     */
    int writeOn(OutputStream out, int off, int len) throws IOException;

    /**
     * Returns a copy of this backing buffer.
     */
    BackingBuffer clone();
}
/**
* Implementation for a <code>byte[]</code>.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
private class BackingByteArray implements BackingBuffer {
private static final long serialVersionUID = 1L;
private final byte[] a;
private final int off;
private final int len;
public BackingByteArray(final byte[] a) {
this(a,0,a.length);
}
public BackingByteArray(final byte[] a, final int off, final int len) {
this.a = a;
this.off = off;
this.len = len;
}
public int size() {
return len;
}
public byte get(final int i) {
return a[off + i];
}
public void arraycopy(final int pos, final byte[] dest,
final int destPos, final int len) {
if (pos < 0) // check starting pos.
throw new IllegalArgumentException();
if (pos + len > this.len) // check run length.
throw new IllegalArgumentException();
System.arraycopy(a/* src */, off + pos, dest, destPos, len);
}
public int writeOn(final OutputStream dos) throws IOException {
dos.write(a, off, len);
return len;
}
public int writeOn(final OutputStream dos, final int aoff,
final int alen) throws IOException {
if (aoff < 0) // check starting pos.
throw new IllegalArgumentException();
if (aoff + alen > this.len) // check run length.
throw new IllegalArgumentException();
dos.write(a, off + aoff, alen);
return len;
}
public int readInt(int pos) {
pos += off;
if (a[pos] >= 0)
return a[pos];
if (a[pos + 1] >= 0)
return (-a[pos] - 1) << 7 | a[pos + 1];
if (a[pos + 2] >= 0)
return (-a[pos] - 1) << 14 | (-a[pos + 1] - 1) << 7 | a[pos + 2];
if (a[pos + 3] >= 0)
return (-a[pos] - 1) << 21 | (-a[pos + 1] - 1) << 14
| (-a[pos + 2] - 1) << 7 | a[pos + 3];
return (-a[pos] - 1) << 28 | (-a[pos + 1] - 1) << 21
| (-a[pos + 2] - 1) << 14 | (-a[pos + 3] - 1) << 7 | a[pos + 4];
}
public byte[] toArray() {
final byte[] b = new byte[len];
System.arraycopy(a, off, b, 0, len);
return b;
}
public BackingByteArray clone() {
return new BackingByteArray(toArray());
}
}
/**
 * Implementation with a backing {@link ByteBuffer}.
 * <p>
 * Note: Methods which interact with a ByteBuffer MUST NOT change its
 * position or limit. If they do then ALL methods which touch the buffer
 * need to be synchronized so NONE of them can have a concurrent read during
 * which the position/limit has been transiently modified. The culprits here
 * are the bulk byte transfer methods ByteBuffer#get() and ByteBuffer#put().
 * This is really a huge limitation on the use of a ByteBuffer for
 * concurrent access to a read-only data structure.
 * <p>
 * The bulk methods of this class do modify the position and limit
 * transiently; they hold the buffer's monitor while doing so and always
 * restore the buffer with {@link ByteBuffer#clear()}, even when the transfer
 * fails.
 * <p>
 * NOTE(review): the class is declared {@link java.io.Serializable} through
 * {@link BackingBuffer}, but its only field is a non-transient
 * {@link ByteBuffer}; confirm whether serializing a list backed by this
 * class is actually expected to work.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
 *         Thompson</a>
 * @version $Id$
 *
 * @deprecated The {@link ByteBuffer} is too slow.
 */
private static class BackingByteBuffer implements BackingBuffer {

    private static final long serialVersionUID = 1L;

    /** The wrapped buffer (not copied). */
    private final ByteBuffer b;

    /**
     *
     * @param b
     *            The data. All bytes in view are used (from zero through
     *            the capacity of the array). The limit and position of the
     *            buffer are ignored.
     */
    public BackingByteBuffer(final ByteBuffer b) {
        this.b = b;
    }

    public int size() {
        return b.capacity();
    }

    public byte get(final int i) {
        synchronized (b) {
            return b.get(i);
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * A length is coded in up to five bytes. Every byte but the last one is
     * negative and carries seven bits as <code>-(bits) - 1</code>; the last
     * byte is non-negative and carries the low seven bits. Each byte is
     * fetched from the buffer at most once.
     */
    public int readInt(final int pos) {
        synchronized (b) {
            final byte b0 = b.get(pos);
            if (b0 >= 0)
                return b0;
            final byte b1 = b.get(pos + 1);
            if (b1 >= 0)
                return (-b0 - 1) << 7 | b1;
            final byte b2 = b.get(pos + 2);
            if (b2 >= 0)
                return (-b0 - 1) << 14 | (-b1 - 1) << 7 | b2;
            final byte b3 = b.get(pos + 3);
            if (b3 >= 0)
                return (-b0 - 1) << 21 | (-b1 - 1) << 14
                        | (-b2 - 1) << 7 | b3;
            final byte b4 = b.get(pos + 4);
            return (-b0 - 1) << 28 | (-b1 - 1) << 21
                    | (-b2 - 1) << 14 | (-b3 - 1) << 7 | b4;
        }
    }

    public byte[] toArray() {
        /*
         * Note: synchronized to prevent concurrent modification to the
         * pos/limit. The pos/limit are restored as a postcondition using
         * clear(), also when the bulk get fails.
         */
        synchronized (b) {
            final byte[] a = new byte[b.capacity()];
            try {
                b.clear();
                b.get(a);
            } finally {
                b.clear();
            }
            return a;
        }
    }

    public void arraycopy(final int pos, final byte[] dest,
            final int destPos, final int len) {
        /*
         * Note: synchronized to prevent concurrent modification to the
         * pos/limit. The pos/limit are restored as a postcondition using
         * clear(). The restore is in a finally clause so that a failed
         * transfer (bad offsets, short destination, ...) cannot leave the
         * shared buffer with a narrowed limit or a moved position.
         */
        synchronized (b) {
            try {
                b.limit(pos + len);
                b.position(pos);
                b.get(dest, destPos, len);
            } finally {
                b.clear();
            }
        }
    }

    public int writeOn(final OutputStream dos) throws IOException {
        final byte[] a = toArray();
        dos.write(a, 0/* off */, a.length/* len */);
        return a.length;
    }

    public int writeOn(final OutputStream dos, final int off, final int len)
            throws IOException {
        // Copy the slice out first; the stream only ever sees a private array.
        final byte[] a = new byte[len];
        arraycopy(off, a, 0/* destPos */, len);
        dos.write(a, 0/* off */, a.length/* len */);
        return a.length;
    }

    /**
     * Returns a new instance wrapping a heap copy of all the bytes in the
     * buffer.
     */
    public BackingByteBuffer clone() {
        return new BackingByteBuffer(ByteBuffer.wrap(toArray()));
    }
}
/**
 * Serialization version of this class. It differs on purpose from the value
 * used by the original fastutil implementation (kept below for reference).
 */
private static final long serialVersionUID = -2532468860579334765L;
// The value for the original impl.
// public static final long serialVersionUID = -7046029254386353130L;
/** The number of arrays in the list. */
protected int n;
/** The ratio of this front-coded list. */
protected int ratio;
// /** The array containing the compressed arrays. */
// protected byte[] array;
/** A view on the compressed arrays (replaces the former <code>byte[] array</code>). */
private BackingBuffer bb;
/** <code>true</code> iff duplicate keys are allowed. */
private boolean hasDups;
/**
 * The pointers to entire arrays in the list: one position into {@link #bb}
 * for each array whose index is a multiple of {@link #ratio}. Transient; it
 * is rebuilt from the coded data when an instance is deserialized.
 */
transient protected int[] p;
/**
 * Rejects ratios smaller than one.
 *
 * @param ratio
 *            the ratio to validate.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
private void assertRatio(final int ratio) {
    final boolean legal = ratio >= 1;
    if (!legal) {
        throw new IllegalArgumentException("Illegal ratio (" + ratio + ")");
    }
}
/**
 * Creates a new front-coded list containing the arrays returned by the
 * given iterator. Delegates to the three-argument constructor with
 * <code>hasDups</code> set to <code>false</code>.
 *
 * @param arrays
 *            an iterator returning arrays.
 * @param ratio
 *            the desired ratio.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
public CustomByteArrayFrontCodedList(final Iterator<byte[]> arrays,
        final int ratio) {
    this(arrays, ratio, false/* hasDups */);
}
/**
 * Creates a new front-coded list containing the arrays returned by the
 * given iterator.
 * <p>
 * Every array whose position is a multiple of <code>ratio</code> is stored
 * in full (coded length, then the bytes) and its start position is recorded
 * in the pointer array. Every other array is stored as the coded length of
 * its remainder, the coded length of the prefix it shares with its
 * predecessor, and then the remainder bytes.
 *
 * @param arrays
 *            an iterator returning arrays.
 * @param ratio
 *            the desired ratio.
 * @param hasDups
 *            <code>true</code> iff the list allows duplicate keys.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
public CustomByteArrayFrontCodedList(final Iterator<byte[]> arrays,
        final int ratio, final boolean hasDups) {
    assertRatio(ratio);
    byte[] array = ByteArrays.EMPTY_ARRAY;
    int[] p = IntArrays.EMPTY_ARRAY;
    // a[b] is the array being coded, a[1 - b] is its predecessor.
    final byte[][] a = new byte[2][];
    int curSize = 0, b = 0, common, length, minLength;
    while (arrays.hasNext()) {
        a[b] = arrays.next();
        length = a[b].length;
        if (n % ratio == 0) {
            // Start of a block: record the pointer and store the array in full.
            p = IntArrays.grow(p, n / ratio + 1);
            p[n / ratio] = curSize;
            array = ByteArrays.grow(array,
                    curSize + count(length) + length, curSize);
            curSize += writeInt(array, length, curSize);
            System.arraycopy(a[b], 0, array, curSize, length);
            curSize += length;
        } else {
            // Inside a block: find the longest prefix shared with the predecessor.
            minLength = a[1 - b].length;
            if (length < minLength)
                minLength = length;
            for (common = 0; common < minLength; common++)
                if (a[0][common] != a[1][common])
                    break;
            // Only the part after the shared prefix is stored.
            length -= common;
            array = ByteArrays.grow(array, curSize + count(length)
                    + count(common) + length, curSize);
            curSize += writeInt(array, length, curSize);
            curSize += writeInt(array, common, curSize);
            System.arraycopy(a[b], common, array, curSize, length);
            curSize += length;
        }
        b = 1 - b;
        n++;
    }
    this.ratio = ratio;
    this.bb = new BackingByteArray(ByteArrays.trim(array, curSize));
    // One pointer per block, i.e. ceil(n / ratio). Computed in long: the int
    // expression (n + ratio - 1) overflows for large ratios and would trim
    // away pointers that are in use.
    this.p = IntArrays.trim(p, (int) (((long) n + ratio - 1) / ratio));
    this.hasDups = hasDups;
}
/**
 * Creates a new front-coded list containing the arrays in the given
 * collection, in the order returned by the collection's iterator.
 *
 * @param c
 *            a collection containing arrays.
 * @param ratio
 *            the desired ratio.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
public CustomByteArrayFrontCodedList(final Collection<byte[]> c,
        final int ratio) {
    this(c.iterator(), ratio);
}
/**
 * Creates a new front-coded list containing the arrays in the given
 * collection, in the order returned by the collection's iterator.
 *
 * @param c
 *            a collection containing arrays.
 * @param ratio
 *            the desired ratio.
 * @param hasDups
 *            <code>true</code> iff the list allows duplicate keys.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
public CustomByteArrayFrontCodedList(final Collection<byte[]> c,
        final int ratio, final boolean hasDups) {
    this(c.iterator(), ratio, hasDups);
}
// /**
// * Reads a coded length.
// *
// * @param a
// * the data array.
// * @param pos
// * the starting position.
// * @return the length coded at <code>pos</code>.
// */
// private static int readInt(final byte a[], int pos) {
// if (a[pos] >= 0)
// return a[pos];
// if (a[pos + 1] >= 0)
// return (-a[pos] - 1) << 7 | a[pos + 1];
// if (a[pos + 2] >= 0)
// return (-a[pos] - 1) << 14 | (-a[pos + 1] - 1) << 7 | a[pos + 2];
// if (a[pos + 3] >= 0)
// return (-a[pos] - 1) << 21 | (-a[pos + 1] - 1) << 14
// | (-a[pos + 2] - 1) << 7 | a[pos + 3];
// return (-a[pos] - 1) << 28 | (-a[pos + 1] - 1) << 21
// | (-a[pos + 2] - 1) << 14 | (-a[pos + 3] - 1) << 7 | a[pos + 4];
// }
/**
 * Computes how many elements are needed to code a given length. A length is
 * coded in blocks of seven bits, using at most five blocks.
 *
 * @param length
 *            the length to be coded.
 * @return the number of elements coding <code>length</code> (between 1 and
 *         5).
 */
private static int count(final int length) {
    int blocks = 1;
    // Each additional block extends the representable range by seven bits.
    for (int limit = 1 << 7; blocks < 5 && length >= limit; limit <<= 7) {
        blocks++;
    }
    return blocks;
}
/**
 * Codes a length into the data array. The low seven bits go, as a
 * non-negative value, into the last element used; each preceding element
 * receives the next seven bits stored as <code>-(bits) - 1</code>, i.e. as
 * a negative value.
 *
 * @param a
 *            the data array.
 * @param length
 *            the length to be written.
 * @param pos
 *            the starting position.
 * @return the number of elements coding <code>length</code>.
 */
private static int writeInt(final byte a[], int length, int pos) {
    final int blocks = count(length);
    int bits = length;
    // Terminal block: plain low seven bits.
    a[pos + blocks - 1] = (byte) (bits & 0x7F);
    // Leading blocks, filled from the back towards pos.
    for (int slot = blocks - 2; slot >= 0; slot--) {
        bits >>>= 7;
        a[pos + slot] = (byte) (-(bits & 0x7F) - 1);
    }
    return blocks;
}
/**
 * Returns the ratio of this list, i.e. how often an array is stored in
 * full rather than front-coded against its predecessor.
 *
 * @return the ratio of this list.
 */
public int ratio() {
    return this.ratio;
}
/**
 * Computes the length of the array at the given index by walking the coded
 * entries of its block, without decoding any array content.
 *
 * <P>
 * This private version of {@link #arrayLength(int)} does not check its
 * argument.
 *
 * @param index
 *            an index.
 * @return the length of the <code>index</code>-th array.
 */
private int length(final int index) {
    final BackingBuffer buffer = this.bb;
    // Number of front-coded entries to walk after the block's full entry.
    final int hops = index % ratio;
    // Position of the full entry that opens the block.
    int cursor = p[index / ratio];
    final int fullLength = buffer.readInt(cursor);
    if (hops == 0) {
        return fullLength;
    }
    // Skip the full entry: coded length plus its bytes.
    cursor += count(fullLength) + fullLength;
    int remainder = 0;
    int shared = 0;
    for (int hop = 0; hop < hops; hop++) {
        if (hop > 0) {
            // Skip the previous front-coded entry: two coded lengths plus
            // its remainder bytes.
            cursor += count(remainder) + count(shared) + remainder;
        }
        remainder = buffer.readInt(cursor);
        shared = buffer.readInt(cursor + count(remainder));
    }
    // Full length = shared prefix + stored remainder.
    return remainder + shared;
}
/**
 * Returns the length of the array stored at the given index, after
 * validating the index with <code>ensureRestrictedIndex</code>.
 *
 * @param index
 *            an index.
 * @return the length of the <code>index</code>-th array.
 */
public int arrayLength(final int index) {
    ensureRestrictedIndex(index);
    final int storedLength = length(index);
    return storedLength;
}
/**
 * Extracts the array at the given index.
 *
 * <P>
 * The array is rebuilt incrementally: starting from the full entry that
 * opens the block, each following entry tells how many leading bytes it
 * shares with its predecessor, so the bytes of the result are gathered from
 * the entries of the block as it is walked. The index is not checked.
 *
 * @param index
 *            an index.
 * @param a
 *            the array that will store the result (we assume that it can
 *            hold the result).
 * @param offset
 *            an offset into <code>a</code> where elements will be store.
 * @param length
 *            a maximum number of elements to store in <code>a</code>.
 * @return the full length of the stored array, which may be larger than
 *         the number of elements actually copied into <code>a</code>.
 */
private int extract(final int index, final byte a[], final int offset,
        final int length) {
    final BackingBuffer bb = this.bb;
    final int delta = index % ratio; // The delta inside the block.
    final int startPos = p[index / ratio]; // The position into the array of
    // the first entire word before
    // the index-th.
    // currLen is the number of leading bytes of the result already placed
    // in a; pos is the position of the entry currently being examined.
    int pos, prevArrayPos, currLen = 0, actualCommon;
    int arrayLength = bb.readInt(pos = startPos);
    if (delta == 0) {
        // The requested array is stored in full: copy it (truncated to
        // length) straight from behind its coded length.
        pos = p[index / ratio] + count(arrayLength);
        bb.arraycopy(pos, a, offset, Math.min(length,
                arrayLength));
        return arrayLength;
    }
    int common = 0;
    for (int i = 0; i < delta; i++) {
        // Position of the stored bytes of the previous entry. The first
        // entry of a block (i == 0) has no coded common length to skip.
        prevArrayPos = pos + count(arrayLength)
                + (i != 0 ? count(common) : 0);
        // The next entry starts right after those bytes.
        pos = prevArrayPos + arrayLength;
        arrayLength = bb.readInt(pos);
        common = bb.readInt(pos + count(arrayLength));
        actualCommon = Math.min(common, length);
        if (actualCommon <= currLen)
            // The shared prefix is no longer than what we already hold:
            // only that many leading bytes remain valid.
            currLen = actualCommon;
        else {
            // The shared prefix extends beyond what we hold: fetch the
            // missing bytes from the previous entry's stored bytes.
            bb.arraycopy(prevArrayPos, a, currLen + offset,
                    actualCommon - currLen);
            currLen = actualCommon;
        }
    }
    if (currLen < length)
        // Append the remainder stored with the requested entry, truncated
        // to the space left.
        bb.arraycopy(pos + count(arrayLength) + count(common),
                a, currLen + offset, Math
                        .min(arrayLength, length - currLen));
    return arrayLength + common;
}
/**
 * Returns a newly allocated copy of the array at the given index.
 *
 * @param index
 *            an index.
 * @return the <code>index</code>-th array.
 * @see #getArray(int)
 */
public byte[] get(final int index) {
    final byte[] decoded = getArray(index);
    return decoded;
}
/**
 * Decodes the array at the given index into a newly allocated array of
 * exactly the right size.
 *
 * @param index
 *            an index.
 * @return the <code>index</code>-th array.
 * @see #get(int)
 */
public byte[] getArray(final int index) {
    ensureRestrictedIndex(index);
    final byte[] decoded = new byte[length(index)];
    extract(index, decoded, 0, decoded.length);
    return decoded;
}
/**
 * Decodes the byte[] at the given index and writes it on a stream.
 *
 * @param os
 *            The stream.
 * @param index
 *            The index of the byte[].
 *
 * @return The #of bytes written on the stream.
 *
 * @throws IOException
 *             if the stream rejects the write.
 *
 * @todo Optimize this to avoid the byte[] allocation.
 *
 * @todo An alternative optimization would be to specify a variant of
 *       {@link #get(int)} which accepts a com.bigdata.io.ByteArrayBuffer
 *       that is automatically extended to have sufficient capacity.
 */
public int writeOn(final OutputStream os, final int index)
        throws IOException {
    final byte[] decoded = get(index);
    final int nbytes = decoded.length;
    os.write(decoded);
    return nbytes;
}
/**
 * Copies elements of an array stored in this front-coded list into a
 * region of the caller's array.
 *
 * @param index
 *            an index.
 * @param a
 *            the array that will store the result.
 * @param offset
 *            an offset into <code>a</code> where elements will be store.
 * @param length
 *            a maximum number of elements to store in <code>a</code>.
 * @return if <code>a</code> can hold the extracted elements, the number of
 *         extracted elements; otherwise, the number of remaining elements
 *         with the sign changed.
 */
public int get(final int index, final byte[] a, final int offset,
        final int length) {
    ensureRestrictedIndex(index);
    ByteArrays.ensureOffsetLength(a, offset, length);
    final int stored = extract(index, a, offset, length);
    // Everything fit: report the size. Otherwise report, negated, how many
    // elements did not fit.
    return stored <= length ? stored : length - stored;
}
/**
 * Copies an array stored in this front-coded list into the caller's array,
 * starting at index zero and using the whole array as available space.
 *
 * @param index
 *            an index.
 * @param a
 *            the array that will store the content of the result (we assume
 *            that it can hold the result).
 * @return if <code>a</code> can hold the extracted elements, the number of
 *         extracted elements; otherwise, the number of remaining elements
 *         with the sign changed.
 */
public int get(final int index, final byte[] a) {
    final int capacity = a.length;
    return get(index, a, 0, capacity);
}
/**
 * Returns the number of arrays in this list.
 *
 * @return the number of arrays in this list.
 */
public int size() {
    return this.n;
}
/**
 * Returns a list iterator over the arrays of this list, positioned so that
 * the first call to <code>next()</code> returns the array at index
 * <code>start</code>.
 *
 * <P>
 * Moving forward is incremental: the iterator keeps the last decoded array
 * and only reads the remainder of the next entry. Moving backward decodes
 * the requested array from scratch; after a call to <code>previous()</code>
 * the iterator also decodes from scratch on <code>next()</code> until it
 * reaches an array that is stored in full.
 *
 * @param start
 *            the index of the first array returned by <code>next()</code>.
 * @return a list iterator starting at <code>start</code>.
 */
public ObjectListIterator<byte[]> listIterator(final int start) {
    ensureIndex(start);
    return new AbstractObjectListIterator<byte[]>() {
        // Working buffer holding (at least) the most recently decoded array.
        byte a[] = ByteArrays.EMPTY_ARRAY;
        // i is the index of the next array to return; pos is the position
        // in the backing buffer of the next entry to decode (only
        // meaningful while inSync is true).
        int i = 0, pos = 0;
        boolean inSync; // Whether the current value in a is the string just
        // before the next to be produced.
        {
            if (start != 0) {
                if (start == n)
                    i = start; // If we start at the end, we do nothing.
                else {
                    // Go to the start of the block containing start, then
                    // step forward so that a and pos are in sync.
                    pos = p[start / ratio];
                    int j = start % ratio;
                    i = start - j;
                    while (j-- != 0)
                        next();
                }
            }
        }
        public boolean hasNext() {
            return i < n;
        }
        public boolean hasPrevious() {
            return i > 0;
        }
        public int previousIndex() {
            return i - 1;
        }
        public int nextIndex() {
            return i;
        }
        public byte[] next() {
            int length, common;
            if (!hasNext())
                throw new NoSuchElementException();
            final BackingBuffer bb = CustomByteArrayFrontCodedList.this.bb;
            if (i % ratio == 0) {
                // Start of a block: the array is stored in full.
                pos = p[i / ratio];
                length = bb.readInt(pos);
                a = ByteArrays.ensureCapacity(a, length, 0);
                bb.arraycopy(pos + count(length), a, 0, length);
                pos += length + count(length);
                inSync = true;
            } else {
                if (inSync) {
                    // a holds the predecessor: keep its first common bytes
                    // and append the stored remainder.
                    length = bb.readInt(pos);
                    common = bb.readInt(pos + count(length));
                    a = ByteArrays.ensureCapacity(a, length + common,
                            common);
                    bb.arraycopy(pos + count(length)
                            + count(common), a, common, length);
                    pos += count(length) + count(common) + length;
                    length += common;
                } else {
                    // a is stale (previous() was called): decode the array
                    // from the start of its block. pos is not updated here.
                    a = ByteArrays.ensureCapacity(a, length = length(i), 0);
                    extract(i, a, 0, length);
                }
            }
            i++;
            // Hand out a copy so that callers cannot alter the working buffer.
            return ByteArrays.copy(a, 0, length);
        }
        public byte[] previous() {
            if (!hasPrevious())
                throw new NoSuchElementException();
            // The working buffer no longer precedes the next array.
            inSync = false;
            return getArray(--i);
        }
    };
}
/**
 * Returns a copy of this list. The copy gets its own backing buffer and its
 * own pointer array, both cloned from this instance.
 *
 * @return a copy of this list.
 */
public Object clone() {
    CustomByteArrayFrontCodedList copy;
    try {
        copy = (CustomByteArrayFrontCodedList) super.clone();
    } catch (CloneNotSupportedException cantHappen) {
        throw new InternalError();
    }
    copy.bb = this.bb.clone();
    copy.p = this.p.clone();
    return copy;
}
/**
 * Dumps the list together with its internal record metadata: the ratio, the
 * size and the pointer array, followed by one line per entry giving its
 * index, its position in the block, the coded remainder and common lengths
 * and the decoded content as rendered by <code>toString(byte[])</code>.
 */
public String toString() {
    final StringBuilder out = new StringBuilder();
    out.append("{ratio=").append(ratio)
            .append(", size=").append(n)
            .append(", p[]=").append(java.util.Arrays.toString(p));
    out.append("[\n");
    int i = 0;
    while (i < n) {
        // Position of the full entry opening this block.
        int pos = p[i / ratio];
        for (int j = 0; j < ratio && i < n; j++, i++) {
            final int delta = i % ratio;
            final int rlenPos = pos; // pos @ rlen.
            final int rlen = bb.readInt(pos);
            pos += count(rlen);
            // Only front-coded entries carry a common length.
            int clen = 0;
            if (delta != 0) {
                clen = bb.readInt(pos);
                pos += count(clen);
            }
            final byte[] entry = get(i);
            out.append("index=").append(i)
                    .append(", delta=").append(delta)
                    .append(", p[").append(i / ratio).append("]=").append(p[i / ratio])
                    .append(", pos@rlen=").append(rlenPos)
                    .append(", rlen=").append(rlen)
                    .append(", clen=").append(clen)
                    .append(", pos@remainder=").append(pos)
                    .append(" :: " + toString(entry) + "\n");
            // Skip the remainder bytes to reach the next entry.
            pos += rlen;
        }
    }
    out.append("]}");
    return out.toString();
}
/**
 * Deserialization hook: reads the non-transient fields and then rebuilds
 * the transient pointer array {@link #p} from the coded data.
 *
 * @param s
 *            the stream the instance is read from.
 * @throws java.io.IOException
 *             if reading the default fields fails.
 * @throws ClassNotFoundException
 *             if the class of a serialized field cannot be found.
 */
private void readObject(java.io.ObjectInputStream s)
        throws java.io.IOException, ClassNotFoundException {
    s.defaultReadObject();
    // p is transient, so it has to be recomputed from bb, n and ratio.
    rebuildPointerArray();
}
/*
* New ctors and new methods.
*/
/**
 * Reconstitute an instance from just the coded byte[], the #of elements in
 * the array, and the ratio. The whole array is used and duplicate keys are
 * flagged as not allowed.
 *
 * @param n
 *            The #of elements in the array.
 * @param ratio
 *            The ratio of this front-coded list.
 * @param array
 *            The array containing the compressed arrays.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
public CustomByteArrayFrontCodedList(final int n, final int ratio,
        final byte[] array) {
    this(n, ratio, array, 0, array.length, false/* hasDups */);
}
/**
 * Reconstitute an instance from a slice of a byte[] containing the coded
 * data, the #of elements in the array, and the ratio. The array is wrapped,
 * not copied, and the pointer array is rebuilt by scanning the coded data.
 *
 * @param n
 *            The #of elements in the array.
 * @param ratio
 *            The ratio of this front-coded list.
 * @param array
 *            The array containing the compressed arrays.
 * @param off
 *            The index in <code>array</code> of the first byte of the coded
 *            data.
 * @param len
 *            The #of bytes of coded data.
 * @param hasDups
 *            <code>true</code> iff the list allows duplicate keys.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
public CustomByteArrayFrontCodedList(final int n, final int ratio,
        final byte[] array, final int off, final int len,
        final boolean hasDups) {
    assertRatio(ratio);
    this.n = n;
    this.ratio = ratio;
    // this.array = array;
    this.bb = new BackingByteArray(array, off, len);
    this.hasDups = hasDups;
    // Needs n, ratio and bb to be set.
    rebuildPointerArray();
}
/**
 * Reconstitute an instance from just a {@link ByteBuffer} view onto the
 * coded byte[], the #of elements in the array, and the ratio. The buffer is
 * wrapped, not copied, and the pointer array is rebuilt by scanning the
 * coded data. The duplicate-keys flag is not set by this constructor and
 * therefore keeps its default value (<code>false</code>).
 *
 * @param n
 *            The #of elements in the array.
 * @param ratio
 *            The ratio of this front-coded list.
 * @param b
 *            The view onto the compressed arrays.
 * @throws IllegalArgumentException
 *             if <code>ratio</code> is less than 1.
 */
public CustomByteArrayFrontCodedList(final int n, final int ratio,
        final ByteBuffer b) {
    assertRatio(ratio);
    this.n = n;
    this.ratio = ratio;
    this.bb = new BackingByteBuffer(b);
    // Needs n, ratio and bb to be set.
    rebuildPointerArray();
}
/**
 * Returns the view onto the coded data. The instance held by this list is
 * returned directly; no copy is made.
 *
 * @return the backing buffer.
 */
public BackingBuffer getBackingBuffer() {
    return this.bb;
}
/**
 * Rebuild the pointer array from the coded data in the backing buffer
 * {@link #bb}, the #of elements coded there {@link #n}, and the
 * {@link #ratio()}.
 * <p>
 * The coded entries are scanned front to back; the start position of every
 * entry whose index is a multiple of the ratio (i.e. every entry stored in
 * full) is recorded. On return {@link #p} holds one pointer per block.
 */
private void rebuildPointerArray() {
    // One pointer per block, i.e. ceil(n / ratio). Computed in long: the int
    // expression (n + ratio - 1) overflows for large ratios, which would
    // allocate an array too small for the pointers stored below.
    final int[] p = new int[(int) (((long) n + ratio - 1) / ratio)];
    final BackingBuffer bb = this.bb;
    int pos = 0;
    for (int i = 0; i < n; i++) {
        final int length = bb.readInt(pos);
        if (i % ratio == 0) {
            // Full entry: coded length, then the bytes.
            p[i / ratio] = pos;
            pos += count(length) + length;
        } else {
            // Front-coded entry: coded remainder length, coded common
            // length, then the remainder bytes.
            final int common = bb.readInt(pos + count(length));
            pos += count(length) + count(common) + length;
        }
    }
    this.p = p;
}
/**
* Search for the index of the value having the same data. The results are
* meaningless if the list is not ordered. A binary search is performed
* against each of the entries that is coded as a full length value. If
* there is a match, the index of that entry is returned directly. Otherwise
* a linear scan is performed starting with the insertion point as
* identified by the binary search. The combination of a binary search
* followed by a linear scan implies that search is fastest when ratio is
* small. However, the compression is highest when the ratio is equal to the
* #of entries in the list. Therefore, search performance is traded off
* against compression.
* <p>
* Note: The full length entry is coded every [i/ratio] entries. However,
* the subsequent entries code their common length with respect to the
* previous front-coded entry NOT to the full length entry. This means that
* the length of the common prefix can increase or decrease as we scan a
* bucket and the #of already matched bytes can increase or decrease as
* well. Consider the following example, when coded with a ratio of 8.
*
* <pre>
* [121, 59, 18, 79, 99, 112, 24, 116], // #0 rlen=8, clen=0 (new bucket)
* [121, 59, 18, 79, 99, 112, 43, 68], // #1 rlen=2, clen=6
* [121, 59, 18, 79, 99, 112, 46, 78], // #2 rlen=2, clen=6
* [121, 59, 18, 79, 99, 112, 54, 48], // #3 rlen=2, clen=6
* [121, 59, 18, 79, 99, 112, 54, 108], // #4 rlen=1, clen=7 (***)
* [121, 59, 18, 79, 99, 112, 55, 81], // #5 rlen=2, clen=6
* [121, 59, 18, 79, 99, 112, 62, 85], // #6 rlen=2, clen=6
* [121, 59, 18, 79, 99, 112, 63, 110], // #7 rlen=8, clen=0 (new bucket)
* [121, 59, 18, 79, 99, 112, 71, 124], // #8 ...
* [121, 59, 18, 79, 99, 112, 73, 49] // #9 ...
* </pre>
*
* The common length grows for entry #4 because the [54] in the next to last
* byte in the array already appears in the same position in the previous
* front-coded entry. However, the common length decreases again for the
* next entry (#5).
* <p>
* The following rules guide the linear search of the bucket identified by
* the binary search.
* <ol>
* <li>If <code>clen GT mlen</code>, then skip to the next entry in the
* bucket as no match is possible (the common prefix was demonstrated to be
* longer than the matched prefix on a previous entry).</li>
* <li>Compare the remaining bytes in the search probe with the remainder
* for the current entry.</li>
* <li>If the search probe is EQ to the bucket entry, then halt. The probe
* key was found.</li>
* <li>If the search probe is LT the bucket entry, then halt. The probe key
* was not found.</li>
* <li>Otherwise, <code>mlen += prefixLength</code>, where prefixLength is
* the length of the matched prefix from step 2.
* </ol>
*
* @param a
* The search probe, which is interpreted as an
* <em>unsigned byte[]</em>
*
* @return index of the search key, if it is found; otherwise,
* <code>(-(insertion point) - 1)</code>. The insertion point is
* defined as the point at which the key would be inserted. Note
* that this guarantees that the return value will be >= 0 if and
* only if the key is found.
*/
public int search(final byte[] a) {
/*
* Outline: (1) binary search over the full length entries (one per
* encoding run) using binarySearch(); (2) linear scan of the front-coded
* entries of the single encoding run selected by that search; (3) return
* the list index of a match, or else the encoded insertion point.
*/
/*
* We can efficiently test the entry at each index which is an even
* multiple of the ratio. For those indices, we do not have to copy the
* data out of its compressed format. Therefore we first perform a binary
* search over just those entries. A non-negative result is the index into
* the pointer array of an exact match; a negative result encodes the
* insertion point into the pointer array as -(insertion point) - 1.
*/
final int pret = binarySearch(a);
if (pret == 0 || (pret > 0 && !hasDups)) {
/*
* An exact match on a full length entry in the backing buffer. The
* pointer array index is converted to a list index by multiplying by
* the ratio.
*
* Note: If duplicates are allowed and the probe key is an exact match
* for a full length key other than the first one (pret > 0), then we
* do NOT return here. Instead we fall through and search the previous
* encoding run, since an equal key might also exist within that run.
*/
return pret * ratio;
}
if (pret == -1) {
/*
* The key would be inserted before the first entry in the
* front-coded array. (This is a fast path for the case where the
* probe key would be inserted at the head of the list and before
* any full length key. Other insertion points are handled below.)
*/
return -1;
}
/*
* Next we do a linear scan of up to [ratio-1] entries, returning the
* first entry whose common bytes and remainder would reconstruct the
* probe key. If we find an entry which would be ordered GT the probe
* key, then we return the insertion point instead.
*/
/*
* Convert the insertion point into the index of the set of up to ratio
* front-coded byte[]s to be searched. Note that we always search the
* bucket before the insertion point since we are looking for a key
* which might exist in that bucket.
*/
final int poffset;
if (pret < 0) {
/*
* (-pret - 1) decodes the insertion point into the pointer array.
* Subtracting one more selects the encoding run immediately before
* that insertion point, which is the only run that could contain
* the probe key. (pret == -1 was handled above, so the result is
* never negative.) This code path applies equally to search whether
* or not duplicate keys are allowed.
*/
poffset = (-pret - 1) - 1;
} else {
/*
* This case only arises if duplicates are allowed. When duplicate
* keys are not allowed and pret>=0, we already returned at the head
* of this method. For this case we have a hit on the first full
* length key for an encoding run. Now we need to scan the previous
* encoding run to see if the probe key exists in that encoding run.
* The outcome of this dups code path in which we search the
* previous run is handled by conditional logic at the bottom of
* this method.
*/
assert hasDups; // code path iff dups allowed.
assert pret > 0; // code path iff exact match on full length key.
poffset = pret - 1; // pret is > 0. subtract 1 for the previous run.
}
// The list index of the full length entry which heads the selected run.
final int offset = poffset * ratio;
/*
* This is the starting position in the backing buffer of the full
* length entry which heads the encoding run to be scanned.
*/
int pos = p[poffset];
/*
* The #of leading bytes of the probe key which are known to match the
* entry under consideration.
* Note: This is NOT a fixed value. When we scan the front-coded entries
* in the same bucket, the common length with respect to the previous
* entry can actually increase -or- decrease, in which case the matched
* length may change as well.
*/
int mlen;
{
// The #of bytes in the full length byte[] which heads the run.
final int blen = bb.readInt(pos);
// Skip the #of bytes required to code that length.
pos += count(blen);
// Count matching bytes (stops at the first mismatch or when either
// the probe or the entry is exhausted).
int i;
for (i = 0; i < a.length && i < blen; i++, pos++) {
if (a[i] != bb.get(pos)) {
break;
}
}
// #of matching bytes.
mlen = i;
// pos was advanced by mlen bytes in the loop above, so skip over the
// remainder of the full length coded entry.
pos += (blen - mlen);
}
/*
* Scan up to ratio-1 entries or the last entry, whichever comes first.
* On exit, delta is the #of front-coded entries which were passed over
* without halting.
*/
final int limit = Math.min(n - (offset + 1), ratio - 1);
int delta;
for (delta = 0; delta < limit; delta++) {
// length of the remainder for this entry (coded first).
final int rlen = bb.readInt(pos);
// skip past rlen field.
pos += count(rlen);
// length of the common prefix for this entry (coded after rlen).
final int clen = bb.readInt(pos);
if (clen < mlen) {
/*
* No match is possible once the common length is LT the matched
* length. Halt; delta identifies this entry as the insertion
* point.
*/
break;
}
// skip past clen field; pos is now at the remainder bytes.
pos += count(clen);
if (clen > mlen) {
/*
* No match is possible while the common prefix length with the
* prior entry is GT the matched length with the probe key. Skip
* the remainder bytes and move on to the next entry.
*/
pos += rlen;
continue;
}
/*
* Compare the remaining bytes in the search probe to the remainder
* of the current entry.
*/
assert mlen == clen : "mlen=" + mlen + ", clen=" + clen + ", rlen="
+ rlen + ", delta=" + delta + ", pret=" + pret
+ ", poffset=" + poffset + ", searchKey="
+ toString(a) + ", this=" + this;
final int ret = compareBytes(a, mlen, a.length - mlen, bb, pos,
rlen);
if (ret == 0) {
// Found by linear scan of front-coded entries. (+1 since the scan
// starts with the entry after the full length entry.)
return offset + delta + 1;
}
if (ret < 0) {
// The current entry is GT the probe key. Halt (not found).
break;
}
/*
* ret is GT zero here (the probe is GT the current entry). Update
* the matched length by the length of the matched prefix from the
* last comparison test, which compareBytes() codes as |ret| - 1.
*/
final int prefixLength = Math.abs(ret) - 1;
mlen += prefixLength;
// skip past the remainder and keep looking.
pos += rlen;
}
/*
* We did not find the probe key when scanning the encoding run.
*/
if (pret >= 0) {
/*
* We had an exact match on the full length key when we did the
* binary search. This code path is only taken when duplicate keys
* are allowed, since we otherwise return immediately at the head of
* this method. If we had found an exact match for the probe key in
* the previous run, then we would have returned its index during
* the scan of that encoding run above. Since we did not, we want to
* return the index of the matched full length entry, which lies on
* the encoding run boundary.
*/
assert hasDups; // only when duplicate keys are allowed.
assert delta == limit; // We scanned the entire previous run.
/*
* An exact match on a full length entry in the backing buffer.
* Duplicate keys are allowed and we have proven that the probe does
* not exist in the previous encoding run.
*/
return pret * ratio;
} else {
/*
* We did not have an exact match on a full length key when we did
* the binary search. We then scanned the encoding run for the
* insertion point. We did not find an exact match within the
* encoding run, but we worked out the correct insertion point into
* that encoding run: the list index (offset + delta + 1).
*
* Note: This can happen whether or not duplicate keys are allowed.
*
* Return the insert point into the list, coded as
* -(insertion point) - 1.
*/
return (-(offset + delta + 1) - 1);
}
}
/**
 * Binary search against the entries in the backing buffer that are coded as
 * their full length values.
 * <p>
 * Only the entries whose offsets are stored in the pointer array
 * <code>p[]</code> are tested. The data at <code>p[i]</code> is the length of
 * the fully coded value followed by the value itself, so those entries can be
 * compared without decoding any front-coded entry.
 * <p>
 * When duplicates are allowed (<code>hasDups</code>) and the entry before a
 * matching entry also matches, the search keeps narrowing to the left, so the
 * index returned is one whose predecessor does not match the key.
 *
 * @param key
 *            The key for the search.
 *
 * @return index of the search key into the pointer array, if it is an exact
 *         match with any of the full length values whose offsets are stored
 *         in the pointer array; otherwise,
 *         <code>(-(insertion point) - 1)</code>. The insertion point is
 *         defined as the point at which the key would be inserted into the
 *         pointer array. Note that this guarantees that the return value
 *         will be &gt;= 0 if and only if the key is matched by any of the
 *         full length coded entries.
 */
private int binarySearch(final byte[] key) {
    int low = 0;
    int high = p.length - 1;
    while (low <= high) {
        /*
         * Unsigned shift: low and high are both non-negative, so this is
         * the correct midpoint even when (low + high) overflows an int,
         * where a signed shift would produce a negative index.
         */
        final int mid = (low + high) >>> 1;
        // Compare the probe with the full length byte[] at index [mid].
        final int tmp = comparePos(mid, key);
        if (tmp > 0) {
            // Probe GT actual, restrict lower bound and try again.
            low = mid + 1;
        } else if (tmp < 0) {
            // Probe LT actual, restrict upper bound and try again.
            high = mid - 1;
        } else if (hasDups && mid > 0 && comparePos(mid - 1, key) == 0) {
            // Duplicates: the previous full length entry also matches, so
            // keep searching to the left of [mid].
            high = mid - 1;
        } else {
            // Found: return the index into the pointer array.
            return mid;
        }
    }
    // Not found: [low] is the insertion point into the pointer array.
    return -(low + 1);
}
/**
 * Compares the caller's key with one of the full length keys stored in the
 * {@link BackingBuffer}.
 * <p>
 * The entry is located through the pointer array: <code>p[index]</code> is
 * the position of the coded length of the entry, and the bytes of the entry
 * follow that coded length.
 *
 * @param index
 *            The index into the full length keys (the pointer array).
 * @param key
 *            The probe key.
 *
 * @return The result of {@link #compareBytes} with the probe key as the first
 *         argument: negative, zero, or positive as the probe key is LT, EQ,
 *         or GT the full length key at that index.
 */
private int comparePos(final int index, final byte[] key) {
    // Where the coded length of the selected full length entry begins.
    final int lengthAt = p[index];
    // The #of bytes in that full length entry.
    final int entryLength = bb.readInt(lengthAt);
    // The entry's bytes start right after the coded length.
    final int dataAt = lengthAt + count(entryLength);
    // Probe first, stored entry second.
    return compareBytes(key, 0, key.length, bb, dataAt, entryLength);
}
/**
 * Compares a slice of the probe <i>a</i> against a run of bytes in the
 * {@link BackingBuffer}, treating every byte as an unsigned value.
 * <p>
 * Bytes are compared pairwise until the first difference or until either
 * side is exhausted. If no difference is found, the shorter side orders
 * first and equal lengths compare as equal.
 *
 * @param a
 *            The caller's probe.
 * @param aoff
 *            The offset of the first byte of the probe to be compared.
 * @param alen
 *            The #of bytes of the probe to be compared.
 * @param bb
 *            The {@link BackingBuffer}.
 * @param boff
 *            The offset of the first byte to be compared in the
 *            {@link BackingBuffer}.
 * @param blen
 *            The length of the byte[] at that offset in the
 *            {@link BackingBuffer}.
 *
 * @return A negative integer, zero, or a positive integer if the probe slice
 *         is less than, equal to, or greater than the byte[] in the
 *         {@link BackingBuffer} at the specified offset. A non-zero return
 *         value also codes the length of the shared prefix, which may be
 *         computed as <code>Math.abs(ret)-1</code>.
 */
private int compareBytes(final byte[] a, final int aoff, final int alen,
        final BackingBuffer bb, final int boff, final int blen) {
    final int probeEnd = aoff + alen;
    final int entryEnd = boff + blen;
    // #of leading bytes found to be identical so far.
    int matched = 0;
    int ai = aoff;
    int bi = boff;
    while (ai < probeEnd && bi < entryEnd) {
        // Mask to [0:255] so that the bytes order as unsigned values.
        final int probeByte = a[ai] & 0xff;
        final int entryByte = bb.get(bi) & 0xff;
        if (probeByte != entryByte) {
            // First difference decides the order; magnitude codes the prefix.
            final int coded = matched + 1;
            return probeByte < entryByte ? -coded : coded;
        }
        ai++;
        bi++;
        matched++;
    }
    if (alen == blen) {
        // Same length and no differing byte.
        return 0;
    }
    // One side is a proper prefix of the other: the shorter one orders first.
    final int coded = matched + 1;
    return (alen - blen) < 0 ? -coded : coded;
}
/*
* Note: These toString() methods were inlined to remove a dependency on
* BytesUtil in the bigdata package (BBT, 9/24/2009).
*/
/**
 * Formats an entire key as a series of comma delimited unsigned bytes by
 * delegating to {@link #toString(byte[], int, int)} for the full length of
 * the array.
 *
 * @param key
 *            The key (may be <code>null</code>).
 *
 * @return The string representation of the array as unsigned bytes, or the
 *         shared <code>NULL</code> string when the key is <code>null</code>.
 */
public static final String toString(final byte[] key) {
    return key == null ? NULL : toString(key, 0, key.length);
}
/**
 * Formats a slice of a key as a series of comma delimited unsigned bytes,
 * e.g. <code>[121, 59, 18]</code>.
 *
 * @param key
 *            The key (may be <code>null</code>).
 * @param off
 *            The index of the first byte that will be visited.
 * @param len
 *            The #of bytes to visit.
 *
 * @return The string representation of the visited bytes as unsigned
 *         values, or <code>"null"</code> when the key is <code>null</code>.
 */
public static final String toString(final byte[] key, final int off,
        final int len) {
    if (key == null) {
        return NULL;
    }
    final StringBuilder sb = new StringBuilder(len * 4 + 2);
    sb.append('[');
    for (int i = off; i < off + len; i++) {
        /*
         * Separate every byte but the first one visited. The test is
         * relative to [off], not to zero: otherwise a slice which does not
         * start at index 0 would be rendered with a leading ", ".
         */
        if (i > off) {
            sb.append(", ");
        }
        // as an unsigned integer.
        sb.append(key[i] & 0xff);
    }
    sb.append(']');
    return sb.toString();
}
/** The text used to represent a <code>null</code> key. */
private static final String NULL = "null";
}