/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Apr 30, 2007
*/
package com.bigdata.btree.keys;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.text.Collator;
import java.util.Locale;
import java.util.Properties;
import java.util.UUID;
import org.apache.log4j.Logger;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleSerializer;
import com.bigdata.io.LongPacker;
import com.bigdata.util.BytesUtil;
/**
* A class that may be used to form multi-component keys but which does not
* support Unicode. An instance of this class is quite light-weight and SHOULD
* be used when Unicode support is not required.
* <p>
* Note: Avoid any dependencies within this class on the ICU libraries so that
* the code may run without those libraries when they are not required.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* @see SuccessorUtil, Compute the successor of a value before encoding it as a
* component of a key.
*
* @see BytesUtil#successor(byte[]), Compute the successor of an encoded key.
*
* @todo introduce a mark and restore feature for generating multiple keys that
* share some leading prefix. in general, this is as easy as resetting the
* len field to the mark. keys with multiple components could benefit from
* allowing multiple marks (the sparse row store is the main use case).
*/
public class KeyBuilder implements IKeyBuilder, LongPacker.IByteBuffer {
private static final transient Logger log = Logger.getLogger(KeyBuilder.class);
/**
* The default capacity of the key buffer.
*/
final public static transient int DEFAULT_INITIAL_CAPACITY = 1024;
/**
* A non-negative integer specifying the #of bytes of data in the buffer
* that contain valid data starting from position zero(0).
*/
private int len;
/**
* The key buffer. This is re-allocated whenever the capacity of the buffer
* is too small and reused otherwise.
*/
private byte[] buf;
/**
* The object used to generate sort keys from Unicode strings (optional).
* <p>
* Note: When <code>null</code> the IKeyBuilder does NOT support Unicode
* and the optional Unicode methods will all throw an
* {@link UnsupportedOperationException}.
*/
private final UnicodeSortKeyGenerator sortKeyGenerator;
/**
* Creates a key builder with an initial buffer capacity of
* <code>1024</code> bytes.
*/
public KeyBuilder() {
this(DEFAULT_INITIAL_CAPACITY);
}
/**
* Creates a key builder with the specified initial buffer capacity.
*
* @param initialCapacity
* The initial capacity of the internal byte[] used to construct
* keys. When zero (0) the {@link #DEFAULT_INITIAL_CAPACITY} will
* be used.
*/
public KeyBuilder(final int initialCapacity) {
this(0, createBuffer(initialCapacity));
}
/**
* Create a buffer of the specified initial capacity.
*
* @param initialCapacity
* The initial size of the buffer.
*
* @return The byte[] buffer.
*
* @exception IllegalArgumentException
* if the initial capacity is negative.
*/
protected static byte[] createBuffer(final int initialCapacity) {
if (initialCapacity < 0) {
throw new IllegalArgumentException("initialCapacity must be non-negative");
}
final int capacity = initialCapacity == 0 ? DEFAULT_INITIAL_CAPACITY
: initialCapacity;
return new byte[capacity];
}
/**
* Creates a key builder using an existing buffer with some data.
*
* @param len
* The #of bytes of data in the provided buffer.
* @param buf
* The buffer, with <i>len</i> pre-existing bytes of valid data.
* The buffer reference is used directly rather than making a
* copy of the data.
*/
/*public*/ KeyBuilder(final int len, final byte[] buf) {
this( null /* no unicode support*/, len, buf );
}
/**
* Creates a key builder using an existing buffer with some data (designated
* constructor).
*
* @param sortKeyGenerator
* The object used to generate sort keys from Unicode strings
* (when <code>null</code> Unicode collation support is
* disabled).
* @param len
* The #of bytes of data in the provided buffer.
* @param buf
* The buffer, with <i>len</i> pre-existing bytes of valid data.
* The buffer reference is used directly rather than making a
* copy of the data.
*/
protected KeyBuilder(final UnicodeSortKeyGenerator sortKeyGenerator,
final int len, final byte[] buf) {
if (len < 0)
throw new IllegalArgumentException("len");
if (buf == null)
throw new IllegalArgumentException("buf");
if (len > buf.length)
throw new IllegalArgumentException("len>buf.length");
this.len = len;
this.buf = buf;
this.sortKeyGenerator = sortKeyGenerator; // MAY be null.
}
final public int off() {
return 0;
}
final public int len() {
return len;
}
final public byte[] array() {
return buf;
}
final public int capacity() {
return buf.length;
}
/**
* Sets the position to any non-negative length less than the current
* capacity of the buffer.
*/
final public void position(final int pos) {
if (len < 0 || len > buf.length) {
throw new IndexOutOfBoundsException("pos=" + pos
+ ", but capacity=" + buf.length);
}
len = pos;
}
final public KeyBuilder append(final byte b) {
return appendUnsigned(b);
}
final public KeyBuilder append(final byte[] a) {
return append(a, 0, a.length);
}
final public KeyBuilder append(final byte[] a, final int off, final int len) {
ensureFree(len);
System.arraycopy(a, off, buf, this.len, len);
this.len += len;
// assert this.len <= buf.length;
return this;
}
final public void ensureFree(int len) {
ensureCapacity(this.len + len );
}
final public void ensureCapacity(int capacity) {
if (capacity < 0)
throw new IllegalArgumentException();
final int overflow = capacity - buf.length;
if (overflow > 0) {
/*
* extend to at least the target capacity.
*/
final byte[] tmp = new byte[capacity];
// copy only the defined bytes.
System.arraycopy(buf, 0, tmp, 0, this.len);
buf = tmp;
}
}
final public byte[] toByteArray() {
return getKey();
}
final public byte[] getKey() {
final byte[] tmp = new byte[this.len];
System.arraycopy(buf, 0, tmp, 0, this.len);
return tmp;
}
/*
* The problem with this method is that it encourages us to reuse a key
* buffer but the btree (at least when used as part of a local api) requires
* that we donate the key buffer to the btree.
*/
// /**
// * Copy the key from the internal buffer into the supplied buffer.
// *
// * @param b
// * A byte[].
// *
// * @exception IndexOutOfBoundsException
// * if the supplied buffer is not large enough.
// */
// final public void copyKey(byte[] b) {
//
// System.arraycopy(this.buf, 0, b, 0, this.len);
//
// }
final public KeyBuilder reset() {
len = 0;
return this;
}
/*
* Unicode operations.
*/
final public boolean isUnicodeSupported() {
if (sortKeyGenerator == null) return false;
return true;
}
/**
* The object responsible for generating sort keys from Unicode strings.
*
* The {@link UnicodeSortKeyGenerator} -or- <code>null</code> if Unicode
* is not supported by this {@link IKeyBuilder} instance.
*/
final public UnicodeSortKeyGenerator getSortKeyGenerator() {
return sortKeyGenerator;
}
final public KeyBuilder append(final String s) {
if (sortKeyGenerator == null) {
// Force ASCII semantics on the Unicode text.
appendASCII( s );
} else {
sortKeyGenerator.appendSortKey( this, s );
}
return this;
}
/*
* Note: Dropped from the API to minimize confusion.
*/
// final public IKeyBuilder append(char[] v) {
//
// return append(new String(v));
//
// }
/*
* Non-optional operations.
*/
public KeyBuilder appendASCII(final String s) {
int tmpLen = s.length();
ensureFree(tmpLen);
for(int j=0; j<tmpLen; j++) {
char ch = s.charAt(j);
// append((byte)(ch & 0xff));
// lexiographic ordering as unsigned byte.
int v = (byte)ch;
if (v < 0) {
v = v - 0x80;
} else {
v = v + 0x80;
}
buf[this.len++] = (byte)(v & 0xff);
}
return this;
}
/**
* Decodes an ASCII string from a key.
*
* @param key
* The key.
* @param off
* The offset of the start of the string.
* @param len
* The #of bytes to decode (one byte per character).
*
* @return The ASCII characters decoded from the key.
*
* @see #appendASCII(String)
*/
public static String decodeASCII(final byte[] key, final int off,
final int len) {
final byte[] b = new byte[len];
System.arraycopy(key, off, b, 0, len);
for (int i = 0; i < len; i++) {
b[i] = decodeByte(b[i]);
}
try {
return new String(b, "US-ASCII");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
}
/**
* The default pad character (a space).
* <p>
* Note: Any character may be chosen as the pad character as long as it has
* a one byte representation. In practice this means you can choose 0x20 (a
* space) or 0x00 (a nul). This limit arises in
* {@link #appendText(String, boolean, boolean)} which assumes that it can
* write a pad character (or its successor) in one byte. 0xff will NOT work
* since its successor is not defined within an bit string of length 8.
*
* @todo make this a configuration option? if so then verify that the choice
* (and its successor) fit in 8 bits.
*/
final public byte pad = 0x20;
/**
* Normalize the text by truncating to no more than {@link #maxlen}
* characters and then stripping off trailing {@link #pad} characters.
*/
/*public*/ String normalizeText(String text) {
if (text.length() > maxlen) {
/*
* Truncate the encoded text field to maxlen characters to prevent
* overflow. While the number of bytes generated by the resulting
* encoding is variable, the order semantics will respect only the
* 1st maxlen characters _regardless_ of how many bytes are required
* to encode those characters.
*/
text = text.substring(0, maxlen);
}
/*
* Strip trailing pad "characters" from the text. This helps to ensure a
* canonical representation. If we did not do this then text with
* trailing pad characters would be encoded differently from text
* without trailing pad characters - even though they are supposed to be
* the "same".
*/
{
int npadded = 0;
for (int i = text.length() - 1; i >= 0; i--) {
if (text.charAt(i) == pad) {
npadded++;
continue;
}
break;
}
if (npadded > 0) {
int begin = 0;
int end = text.length() - npadded;
text = text.substring(begin,end);
}
}
return text;
}
public KeyBuilder appendText(String text, final boolean unicode,
final boolean successor) {
// current length of the encoded key.
final int pos = this.len;
/*
* Normalize the text by truncating to no more than [maxlen] characters
* and then stripping off trailing pad characters.
*/
text = normalizeText( text );
/*
* Encode the text as ASCII or Unicode as appropriate.
*/
if(unicode) {
append(text);
} else {
appendASCII(text);
}
// #of bytes in the encoded text field.
final int encoded_len = this.len - pos;
// #of characters (not bytes) in the text.
final int textlen;
if(successor) {
if (encoded_len == 0) {
/*
* Note: The successor of an empty string is not defined since
* it maps to an empty byte[] (an empty value space). However an
* empty string is semantically equivalent to all pad characters
* so we use the successor of a string containing a single pad
* character, which is equivalent to a string containing a
* single byte whose value is pad+1.
*/
appendSigned((byte)(pad+1));
textlen = 1;
} else {
/*
* Note: This generates the successor of the encoded text by
* treading the encoded byte[] as a fixed length bit string and
* finding the successor of that bit-string. The bytes in the
* buffer are modified as a side-effect. A runtime exception is
* thrown if there is no successor to the bit string (this is
* not a plausible scenario for either ASCII or Unicode text as
* the encoding would have to be all 0xff bytes for the
* successor to not be defined).
*/
SuccessorUtil.successor(buf, pos, encoded_len);
textlen = text.length();
}
} else {
textlen = text.length();
}
if (textlen < maxlen) {
/*
* append a single pad byte.
*
* Note: Changed this to append the pad character (a space) as if it
* was already an unsigned value (0x20) rather than its signed value
* (0x160). This causes "bro" to sort before "brown", which is
* desired. (bbt, 10/1/08)
*/
appendUnsigned(pad);
// append the run length for the trailing pad characters.
final int runLength = maxlen - textlen;
append((short) runLength);
}
return this;
}
final public KeyBuilder append(final double d) {
// performance tweak.
if (len + 8 > buf.length) ensureCapacity(len+8);
// ensureFree(8);
long v = Double.doubleToLongBits(d);
// convert to twos-complement long.
if (v < 0) {
v = 0x8000000000000000L - v;
}
// delegate to append(long)
return append( v );
}
static public double decodeDouble(final byte[] key,final int off) {
long v = decodeLong(key, off);
// convert to twos-complement long.
if (v < 0) {
v = 0x8000000000000000L - v;
}
return Double.longBitsToDouble(v);
}
final public KeyBuilder append(final float f) {
// performance tweak.
if (len + 4 > buf.length) ensureCapacity(len+4);
// ensureFree(4);
int v = Float.floatToIntBits(f);
// convert to twos complement int.
if (v < 0) {
v = 0x80000000 - v;
}
// delegate to append(int)
return append(v);
}
static public float decodeFloat(final byte[] key, final int off) {
int v = decodeInt(key, off);
// convert to twos complement int.
if (v < 0) {
v = 0x80000000 - v;
}
return Float.intBitsToFloat(v);
}
final public KeyBuilder append(final UUID uuid) {
if (len + 16 > buf.length) ensureCapacity(len+16);
append( uuid.getMostSignificantBits() );
append( uuid.getLeastSignificantBits() );
return this;
}
final public KeyBuilder append(long v) {
// performance tweak adds .3% on rdfs bulk load.
if (len + 8 > buf.length) ensureCapacity(len+8);
// ensureFree(8);
// ensureCapacity( len + 8 );
// lexiographic ordering as unsigned long integer.
if (v < 0) {
v = v - 0x8000000000000000L;
} else {
v = v + 0x8000000000000000L;
}
// big-endian.
buf[len++] = (byte)(v >>> 56);
buf[len++] = (byte)(v >>> 48);
buf[len++] = (byte)(v >>> 40);
buf[len++] = (byte)(v >>> 32);
buf[len++] = (byte)(v >>> 24);
buf[len++] = (byte)(v >>> 16);
buf[len++] = (byte)(v >>> 8);
buf[len++] = (byte)(v >>> 0);
return this;
}
/**
* Packs a non-negative long value into the minimum #of bytes in which the
* value can be represented and writes those bytes onto the buffer. The
* first byte determines whether or not the long value was packed and, if
* packed, how many bytes were required to represent the packed long value.
* When the high bit of the first byte is a one (1), then the long value
* could not be packed and the long value is found by clearing the high bit
* and interpreting the first byte plus the next seven (7) bytes as a long.
* Otherwise the next three (3) bits are interpreted as an unsigned integer
* giving the #of bytes (nbytes) required to represent the packed long
* value. To recover the long value the high nibble is cleared and the first
* byte together with the next nbytes are interpreted as an unsigned long
* value whose leading zero bytes were not written.
*
* <pre>
*
* [0|1|2|3|4|5|6|7]
* 1 - - - nbytes = 8, clear high bit and interpret this plus the next 7 bytes as a long.
* 0 1 1 1 nbytes = 7, clear high nibble and interpret this plus the next 6 bytes as a long.
* 0 1 1 0 nbytes = 6, clear high nibble and interpret this plus the next 5 bytes as a long.
* 0 1 0 1 nbytes = 5, clear high nibble and interpret this plus the next 4 bytes as a long.
* 0 1 0 0 nbytes = 4, clear high nibble and interpret this plus the next 3 bytes as a long.
* 0 0 1 1 nbytes = 3, clear high nibble and interpret this plus the next 3 bytes as a long.
* 0 0 1 0 nbytes = 2, clear high nibble and interpret this plus the next byte as a long.
* 0 0 0 1 nbytes = 1, clear high nibble. value is the low nibble.
*
* </pre>
*
* Note: These are decodable (no loss) but negative longs are not allowed.
* <p>
* Note: The order is NOT fully preserved. Any long which is encoded into
* less than 8 bytes has its order preserved. However, a long which is
* encoded into 8 bytes will wind up ordered before any longs which pack
* into fewer bytes.
*
* @param v
* The unsigned long value.
*
* @return The #of bytes onto which the unsigned long value was packed.
*/
final public KeyBuilder pack(final long v) {
LongPacker.packLong(v, pbuf, this);
return this;
}
/**
* Relative <i>put</i> method for writing a byte[] on the buffer.
*
* @param b
* The byte[].
* @param off
* The offset of the first byte in <i>b</i> to be written on
* the buffer.
* @param len
* The #of bytes in <i>b</i> to be written on the buffer.
*/
public void put(final byte[] b, final int off, final int len) {
ensureFree(len);
System.arraycopy(b/* src */, 0/* srcPos */, buf/* dest */,
this.len/* destPos */, len/* length */);
this.len += len;
}
/**
* Private buffer for packing long integers.
*/
final private byte[] pbuf = new byte[8];
// /**
// * Unpack a long value from the current buffer position.
// *
// * @param buf
// * The buffer containing the data to be decoded.
// * @param off
// * The offset of the first byte of the value to be decoded.
// * @param limit
// * The exclusive upper bound available for decoding
// *
// * @return The long value.
// */
// static final public long unpackLong(final byte[] buf, final int off,
// final int limit) {
//
// return LongPacker.unpackLong(buf, off, limit);
//
// }
/**
* Return the value that will impose the lexiographic ordering as an
* unsigned long integer.
*
* @param v
* The signed long integer.
*
* @return The value that will impose the lexiographic ordering as an
* unsigned long integer.
*
* @todo This is unused and untested.
*/
static final /*public*/ long encode(long v) {
if (v < 0) {
v = v - 0x8000000000000000L;
} else {
v = v + 0x8000000000000000L;
}
return v;
}
final public KeyBuilder append(int v) {
// performance tweak.
if (len + 4 > buf.length) ensureCapacity(len+4);
// ensureFree(4);
// lexiographic ordering as unsigned int.
if (v < 0) {
v = v - 0x80000000;
} else {
v = 0x80000000 + v;
}
// big-endian
buf[len++] = (byte)(v >>> 24);
buf[len++] = (byte)(v >>> 16);
buf[len++] = (byte)(v >>> 8);
buf[len++] = (byte)(v >>> 0);
return this;
}
final public KeyBuilder append(short v) {
// performance tweak.
if (len + 2 > buf.length) ensureCapacity(len+2);
// ensureFree(2);
// lexiographic ordering as unsigned short.
if (v < 0) {
v = (short)(v - (short)0x8000);
} else {
v = (short) ((short)0x8000 + v);
}
// big-endian
buf[len++] = (byte)(v >>> 8);
buf[len++] = (byte)(v >>> 0);
return this;
}
/*
* Note: this method has been dropped from the API to reduce the
* possibility of confusion. If you want Unicode semantics then use
* append(String). If you want ASCII semantics then use appendASCII().
* If you want signed integer semantics then use append(short).
*/
// final public IKeyBuilder append(char v) {
//
// /*
// * Note: converting to String first produces significantly larger keys
// * which, more important, violate the sort order expectations for
// * characters. For example, successor in the value space of 'z' is '{'.
// * However, the sort key generated from the String representation of the
// * character '{' is NOT ordered after the sort key generated from the
// * String representation of the character 'z'. Unicode wierdness.
// */
//
// return append((short) v);
//
// }
final public KeyBuilder appendUnsigned(final byte v) {
// performance tweak
if (len + 1 > buf.length) ensureCapacity(len+1);
// ensureFree(1);
buf[len++] = (byte)v;
return this;
}
final public KeyBuilder appendSigned(final byte v) {
// performance tweak
if (len + 1 > buf.length) ensureCapacity(len+1);
// ensureFree(1);
// lexiographic ordering as unsigned byte.
int i = v;
if (i < 0) {
i = i - 0x80;
} else {
i = i + 0x80;
}
buf[len++] = (byte)(i & 0xff);
return this;
}
// /**
// * Return the value that will impose the lexiographic ordering as an
// * unsigned byte.
// *
// * @param v
// * The signed byte.
// *
// * @return The value that will impose the lexiographic ordering as an
// * unsigned byte.
// */
// final static /*public*/ byte encode(byte v) {
//
// int i = v;
//
// if (i < 0) {
//
// i = i - 0x80;
//
// } else {
//
// i = i + 0x80;
//
// }
//
// byte tmp = (byte)(i & 0xff);
//
// return tmp;
//
// }
final public KeyBuilder appendNul() {
// return append(0);
// performance tweak.
if (len + 1 > buf.length) ensureCapacity(len+1);
// ensureFree(1);
buf[len++] = (byte) 0;
return this;
}
public KeyBuilder append(final BigInteger i) {
// Note: BigInteger.ZERO is represented as byte[]{0}.
final byte[] b = i.toByteArray();
final int runLength = i.signum() == -1 ? -b.length : b.length;
ensureFree(b.length + 2);
append((short) runLength);
append(b);
return this;
}
/**
* Return the #of bytes in the unsigned byte[] representation of the
* {@link BigInteger} value.
*
* @param value
* The {@link BigInteger} value.
*
* @return The byte length of its unsigned byte[] representation.
*/
static public int byteLength(final BigInteger value) {
return 2/* runLength */+ (value.bitLength() / 8 + 1)/* data */;
}
/**
* {@inheritDoc}
* <p>
* Note: Precision is NOT preserved by this encoding. Thus <code>0.0</code>
* and <code>0</code> are encoded by the same representation and both will
* decode to <code>0</code>.
* <h2>Implementation details</h2>
* The encoding to a BigDecimal requires the expression of scale and length
* {@link BigDecimal#scale()} indicates the precision of the number, where
* '3' is three decimal places and '-3' rounded to '000'
* {@link BigDecimal#precision()} is the number of unscaled digits therefore
* <code>precision - scale</code> is an expression of the exponent of the
* normalized number. This means that the exponent could be zero or negative
* so the sign of the number cannot be indicated by adding to the exponent.
* Instead an explicit sign byte,'0' or '1' is used. The actual
* {@link BigDecimal} serialization uses the {@link String} conversions
* supported by {@link BigDecimal}, less the '-' sign character if
* applicable. The length of this data is terminated by a trailing byte. The
* value of that byte depends on the sign of the original {@link BigDecimal}
* and is used to impose the correct sort order on negative
* {@link BigDecimal} values which differ only in the digits in the decimal
* portion.
*<p>
* The variable encoding of BigNumbers requires this String representation
* and negative representations are further encoded using
* {@link #flipDigits(String)} for the equivalent of 2s compliment negative
* representation.
* <p>
* There are two cases where scale and trailing zeros interact. The
* case of "0.000" is represented as precision of 1 and scale of 3,
* indicating the "0" is shifted down 3 decimal places. While "5.000"
* is represented as precision of 4 and scale of 3. The special case
* of zero is allowed because shifting zero to the right leaves a new
* zero on the left, so a zero value must be checked for explicitly, while
* if we want to compare "5", "5.00" and "5.0000" as equal we must
* remove and compensate for trailing zeros.
*
* @see #decodeBigDecimal(int, byte[])
*/
public KeyBuilder append(final BigDecimal d) {
final int sign = d.signum();
if (sign == 0) {
appendSigned((byte) 0);
return this;
}
BigDecimal nd = d.stripTrailingZeros();
String unscaledStr = nd.unscaledValue().toString();
final int precision = nd.precision();
final int scale = nd.scale();
int exponent = precision - scale;
if (sign == -1) {
exponent = -exponent;
}
appendSigned((byte) sign);
append(exponent);
// Note: coded as digits
if (sign == -1) {
unscaledStr = flipDigits(unscaledStr);
}
appendASCII(unscaledStr); // the unscaled BigInteger representation
// Note: uses unsigned 255 if negative and unsigned 0 if positive.
appendSigned(sign == -1 ? (byte) Byte.MAX_VALUE: (byte) 0);
return this;
}
/**
* Return the #of bytes in the unsigned byte[] representation of the
* {@link BigDecimal} value.
*
* @param value
* The {@link BigDecimal} value.
*
* @return The byte length of its unsigned byte[] representation.
*/
static public int byteLength(final BigDecimal value) {
final int byteLength;
if (value.signum() == 0) {
byteLength = 1;
} else {
final BigDecimal nbd = value.stripTrailingZeros();
final int dataLen = nbd.unscaledValue().toString().length();
byteLength =
+ 1 /* sign */
+ 4 /* exponent */
+ dataLen /* data */
+ 1 /* termination byte */
;
}
return byteLength;
}
public byte[] getSortKey(final Object val) {
reset();
append( val );
return getKey();
}
public KeyBuilder append(final Object val) {
if (val == null) {
throw new IllegalArgumentException();
}
if(val instanceof byte[]) {
append((byte[])val);
} else if (val instanceof Byte) {
appendSigned(((Byte) val).byteValue());
} else if (val instanceof Character) {
// append(((Character) val).charValue());
throw new UnsupportedOperationException(
"Character is not supported. Use Short or String depending on the semantics that you want.");
} else if (val instanceof Short) {
append(((Short) val).shortValue());
} else if (val instanceof Integer) {
append(((Integer) val).intValue());
} else if (val instanceof Long) {
append(((Long) val).longValue());
} else if (val instanceof BigInteger) {
append((BigInteger) val);
} else if (val instanceof BigDecimal) {
append((BigDecimal) val);
} else if (val instanceof Float) {
append(((Float) val).floatValue());
} else if (val instanceof Double) {
append(((Double) val).doubleValue());
} else if (val instanceof String) {
append((String) val);
} else if (val instanceof UUID) {
append(((UUID) val));
} else {
throw new UnsupportedOperationException("Can not encode key: "
+ val.getClass());
}
return this;
}
/**
* Converts a signed byte into an unsigned byte.
*
* @param v
* The signed byte.
*
* @return The corresponding unsigned value.
*/
static public byte encodeByte(final int v) {
int i = v;
if (i < 0) {
i = i - 0x80;
} else {
i = i + 0x80;
}
return (byte)(i & 0xff);
}
/**
* Converts an unsigned byte into a signed byte.
*
* @param v
* The unsigned byte.
*
* @return The corresponding signed value.
*/
static public byte decodeByte(final int v) {
int i = v;
if (i < 0) {
i = i + 0x80;
} else {
i = i - 0x80;
}
return (byte)(i & 0xff);
}
/**
* Encodes a double precision floating point value as an int64 value that
* has the same total ordering (you can compare two doubles encoded by this
* method and the long values will have the same ordering as the double
* values). The method works by converting the double to the IEEE 754
* floating-point "double format" bit layout using
* {@link Double#doubleToLongBits(double)} and then converting the resulting
* long into a two's complement number.
*
* See <a
* href="http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm">
* Comparing floating point numbers </a> by Bruce Dawson.
*
* @param d
* The double precision floating point value.
*
* @return The corresponding long integer value that maintains the same
* total ordering.
*/
public static long d2l(final double d) {
long aLong = Double.doubleToLongBits(d);
if (aLong < 0) {
aLong = 0x8000000000000000L - aLong;
}
return aLong;
}
/**
* Encodes a floating point value as an int32 value that has the same total
* ordering (you can compare two floats encoded by this method and the int
* values will have the same ordering as the float values). The method works
* by converting the float to the IEEE 754 floating-point "single format"
* bit layout using {@link Float#floatToIntBits(float)} and then converting
* the resulting int into a two's complement number.
*
* See <a
* href="http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm">
* Comparing floating point numbers </a> by Bruce Dawson.
*
* @param f
* The floating point value.
*
* @return The corresponding integer value that maintains the same total
* ordering.
*/
public static int f2i(final float f) {
int aInt = Float.floatToIntBits(f);
if (aInt < 0) {
aInt = 0x80000000 - aInt;
}
return aInt;
}
/**
* Decodes a signed long value as encoded by {@link #append(long)}.
*
* @param buf
* The buffer containing the encoded key.
* @param off
* The offset at which to decode the key.
*
* @return The signed long value.
*/
static public long decodeLong(final byte[] buf, int off) {
long v = 0L;
// big-endian.
v += (0xffL & buf[off++]) << 56;
v += (0xffL & buf[off++]) << 48;
v += (0xffL & buf[off++]) << 40;
v += (0xffL & buf[off++]) << 32;
v += (0xffL & buf[off++]) << 24;
v += (0xffL & buf[off++]) << 16;
v += (0xffL & buf[off++]) << 8;
v += (0xffL & buf[off++]) << 0;
if (v < 0) {
v = v + 0x8000000000000000L;
} else {
v = v - 0x8000000000000000L;
}
return v;
}
/**
* Decode a {@link UUID} as encoded by {@link #append(UUID)}.
*
* @param buf
* The buffer containing the encoded key.
* @param off
* The offset at which to decode the key.
*
* @return The decoded {@link UUID}.
*/
static public UUID decodeUUID(final byte[] buf, int off) {
final long msb = decodeLong(buf, off);
off += 8;
final long lsb = decodeLong(buf, off);
return new UUID(msb, lsb);
}
/**
* Decodes a signed int value as encoded by {@link #append(int)}.
*
* @param buf
* The buffer containing the encoded key.
* @param off
* The offset at which to decode the key.
*
* @return The signed int value.
*/
static public int decodeInt(final byte[] buf, int off) {
int v = 0;
// big-endian.
v += (0xffL & buf[off++]) << 24;
v += (0xffL & buf[off++]) << 16;
v += (0xffL & buf[off++]) << 8;
v += (0xffL & buf[off++]) << 0;
if (v < 0) {
v = v + 0x80000000;
} else {
v = v - 0x80000000;
}
return v;
}
/**
* Decodes a signed short value as encoded by {@link #append(short)}.
*
* @param buf
* The buffer containing the encoded key.
* @param off
* The offset at which to decode the key.
*
* @return The signed short value.
*/
static public short decodeShort(final byte[] buf, int off) {
int v = 0;
// big-endian.
v += (0xffL & buf[off++]) << 8;
v += (0xffL & buf[off++]) << 0;
if (v < 0) {
v = v + 0x8000;
} else {
v = v - 0x8000;
}
return (short) v;
}
/**
* Convert an unsigned byte[] into a {@link BigInteger}.
*
* @param key
* The bytes.
*
* @return The big integer value.
*/
static public BigInteger decodeBigInteger(final int offset, final byte[] key) {
return new BigInteger(decodeBigInteger2(offset, key));
}
/**
* Decodes a {@link BigInteger} key, returning a byte[] which may be used to
* construct a {@link BigInteger} having the decoded value. The number of
* bytes consumed by the key component is <code>2 + runLength</2>. The
* <code>2</code> is a fixed length field coding the signum of the value and
* its runLength. The length of the returned array is the runLength of the
* variable length portion of the value. This method may be used to scan
* through a key containing {@link BigInteger} components.
*
* @param offset
* The offset of the start of the {@link BigInteger} in the key.
* @param key
* The key.
* @return The byte[] to be passed to {@link BigInteger#BigInteger(byte[])}.
*/
static public byte[] decodeBigInteger2(final int offset, final byte[] key) {
final int tmp = KeyBuilder.decodeShort(key, offset);
/*
* Note: The signum is thrown away when we decode the runLength field.
* Signum is actually in the key twice: once in the runLength to put the
* BigInteger values into total order and once in the representation of
* the BigInteger as a byte[].
*/
final int runLength = tmp < 0 ? -tmp : tmp;
final byte[] b = new byte[runLength];
System.arraycopy(key/* src */, offset + 2/* srcpos */, b/* dst */,
0/* destPos */, runLength);
return b;
}
/**
* Decodes a {@link BigDecimal} key, returning a byte[] which may be used to
* construct a {@link BigDecimal} having the decoded value.
*
* The number of bytes consumed by the key component is
* <code>2 + runLength</2>. The
* <code>2</code> is a fixed length field coding the signum of the value and
* its runLength. The length of the returned array is the runLength of the
* variable length portion of the value.
*
* This method may be used to scan through a key containing
* {@link BigDecimal} components.
*
* @param offset
* The offset of the start of the {@link BigDecimal} in the key.
* @param key
* The key.
* @return The byte[] to be passed to {@link BigDecimal#BigInteger(byte[])}.
*
* @todo update javadoc
*
* FIXME We need a version which has all the metadata to support scanning
* through a key as well as one that does a simple decode.
*/
static public BigDecimal decodeBigDecimal(final int offset, final byte[] key) {
int curs = offset;
final byte sign = key[curs++];
if (sign == decodeZero) {
return new BigDecimal(0);
}
int exponent = decodeInt(key, curs);
final boolean neg = sign == negSign;
if (neg) {
exponent = -exponent;
}
curs += 4;
int len = 0;
for (int i = curs; key[i] != (neg ? eos2 : eos); i++) len++;
String unscaledStr = decodeASCII(key, curs, len);
if (neg) {
unscaledStr = flipDigits(unscaledStr);
}
final BigInteger unscaled = new BigInteger(unscaledStr);
final int precision = len;
final int scale = precision - exponent - (neg ? 1 : 0);
final BigDecimal ret = new BigDecimal(unscaled, scale);
return ret; // relative scale adjustment
}
private static final byte decodeZero = decodeByte(0);
private static final byte eos = decodeZero;
private static final byte eos2 = decodeByte(Byte.MAX_VALUE);
private static final byte negSign = decodeByte(-1);
private static final char[] flipMap = {'0', '1', '2', '3', '4',
'5', '6', '7', '8', '9'
};
/**
* Flip numbers such that <code>0/9,1/8,2/7,3/6,4/5</code> - this is the
* equivalent of a two-complement representation for the base 10 character
* digits.
*/
static private String flipDigits(final String str) {
final char[] chrs = str.toCharArray();
for (int i = 0; i < chrs.length; i++) {
final int flip = '9' - chrs[i];
if (flip >= 0 && flip < 10) {
chrs[i] = flipMap[flip];
}
}
return new String(chrs);
}
/**
* Create an instance for ASCII keys.
*
* @return The new instance.
*/
public static IKeyBuilder newInstance() {
return newInstance(DEFAULT_INITIAL_CAPACITY);
}
/**
* Create an instance for ASCII keys with the specified initial capacity.
*
* @param initialCapacity
* The initial capacity.
*
* @return The new instance.
*/
public static IKeyBuilder newInstance(final int initialCapacity) {
return newInstance(initialCapacity, CollatorEnum.ASCII, null/* locale */,
null/* strength */, null/* decomposition mode */);
}
/**
* Configuration options for {@link DefaultKeyBuilderFactory} and the
* {@link KeyBuilder} factory methods. <strong>The use of
* {@link DefaultKeyBuilderFactory} is highly recommended as it will cause
* the configuration to be serialized. In combination with the use of an
* {@link ITupleSerializer}, this means that Unicode keys for an index will
* be interpreted in the same manner on any machine where {@link ITuple}s
* for that index are (de-)materialized. </strong>
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public static interface Options {
/**
* Optional property specifies the library that will be used to generate
* sort keys from Unicode data. The ICU library is the default.
* You may explicitly specify
* the library choice using one of the {@link CollatorEnum} values. The
* {@link CollatorEnum#ASCII} value may be used to disable Unicode
* support entirely, treating the characters as if they were ASCII. If
* your data is not actually Unicode then this offers a substantial
* performance benefit.
*
* @see CollatorEnum
*/
public String COLLATOR = KeyBuilder.class.getName()+".collator";
/**
* Optional string -or- integer property whose value is the strength to
* be set on the collator. When specified, the value must be either one
* of the type-safe {@link StrengthEnum}s -or- one of those supported
* by the ICU or JDK library, as appropriate. The following values are
* shared by both libraries:
* <dl>
* <dt>0</dt>
* <dd>{@link Collator#PRIMARY}</dd>
* <dt>1</dt>
* <dd>{@link Collator#SECONDARY}</dd>
* <dt>2</dt>
* <dd>{@link Collator#TERTIARY}</dd>
* </dl>
* The ICU library also supports
* <dl>
* <dt>3</dt>
* <dd>Quaternary</dd>
* </dl>
* While both libraries define <strong>IDENTICAL</strong> they use
* different values for this strength, hence the use of the type safe
* enums is recommended.
*/
public String STRENGTH = KeyBuilder.class.getName()+".collator.strength";
/**
* Optional string property whose value is one of the type safe
* {@link DecompositionEnum}s. The default decomposition mode will be
* overridden on the collator one is explicitly specified using this
* property.
*
* @see DecompositionEnum
*/
public String DECOMPOSITION = KeyBuilder.class.getName()+".collator.decomposition";
/**
* The pre-defined System property {@value #USER_LANGUAGE} determines
* the <em>language</em> for the default {@link Locale}.
*
* @see Locale#setDefault(Locale)
*
* @see <a
* href="http://java.sun.com/developer/technicalArticles/J2SE/locale/">http://java.sun.com/developer/technicalArticles/J2SE/locale/</a>
*/
public String USER_LANGUAGE = "user.language";
/**
* The pre-defined System property {@value #USER_COUNTRY} determines the
* <em>country</em> for the default {@link Locale}.
*
* @see <a
* href="http://java.sun.com/developer/technicalArticles/J2SE/locale/">http://java.sun.com/developer/technicalArticles/J2SE/locale/</a>
*/
public String USER_COUNTRY = "user.country";
/**
* The pre-defined System property {@value #USER_VARIANT} determines the
* <em>variant</em> for the default {@link Locale}.
*
* @see <a
* href="http://java.sun.com/developer/technicalArticles/J2SE/locale/">http://java.sun.com/developer/technicalArticles/J2SE/locale/</a>
*/
public String USER_VARIANT = "user.variant";
}
/**
* Create a factory for {@link IKeyBuilder} instances configured using the
* system properties. The factory will support Unicode unless
* {@link CollatorEnum#ASCII} is explicitly specified for the
* {@link Options#COLLATOR} property.
*
* @param properties
* The properties to be used (optional). When <code>null</code>
* the {@link System#getProperties() System properties} are used.
*
* @see Options
*
* @throws UnsupportedOperationException
* <p>
* The ICU library was required but was not located. Make sure
* that the ICU JAR is on the classpath. See
* {@link Options#COLLATOR}.
* </p>
* <p>
* Note: If you are trying to use ICU4JNI then that has to be
* locatable as a native library. How you do this is different
* for Windows and Un*x.
* </p>
*/
public static IKeyBuilder newUnicodeInstance() {
return new DefaultKeyBuilderFactory(null/* properties */)
.getKeyBuilder();
}
/**
* Create a factory for {@link IKeyBuilder} instances configured according
* to the specified <i>properties</i>. Any properties NOT explicitly given
* will be defaulted from {@link System#getProperties()}. The pre-defined
* properties {@link Options#USER_LANGUAGE}, {@link Options#USER_COUNTRY},
* and {@link Options#USER_VARIANT} MAY be overriden. The factory will
* support Unicode unless {@link CollatorEnum#ASCII} is explicitly specified
* for the {@link Options#COLLATOR} property.
*
* @param properties
* The properties to be used (optional). When <code>null</code>
* the {@link System#getProperties() System properties} are used.
*
* @see Options
*
* @throws UnsupportedOperationException
* <p>
* The ICU library was required but was not located. Make sure
* that the ICU JAR is on the classpath. See
* {@link Options#COLLATOR}.
* </p>
* <p>
* Note: If you are trying to use ICU4JNI then that has to be
* locatable as a native library. How you do this is different
* for Windows and Un*x.
* </p>
*/
public static IKeyBuilder newUnicodeInstance(Properties properties) {
return new DefaultKeyBuilderFactory(properties).getKeyBuilder();
}
/**
* Create a new instance that optionally supports Unicode sort keys.
*
* @param capacity
* The initial capacity of the buffer. When zero (0) the
* {@link #DEFAULT_INITIAL_CAPACITY} will be used.
* @param collatorChoice
* Identifies the collator that will be used to generate sort
* keys from Unicode values.
* @param locale
* When <code>null</code> the
* {@link Locale#getDefault() default locale} will be used.
* @param strength
* Either an {@link Integer} or a {@link StrengthEnum} specifying
* the strength to be set on the collator object (optional). When
* <code>null</code> the default strength of the collator will
* not be overridden.
* @param mode
* The decomposition mode to be set on the collator object
* (optional). When <code>null</code> the default decomposition
* mode of the collator will not be overridden.
*
* @return The new instance.
*
* @throws UnsupportedOperationException
* <p>
* The ICU library was required but was not located. Make sure
* that the ICU JAR is on the classpath.
* </p>
* <p>
* Note: If you are trying to use ICUJNI then that has to be
* locatable as a native library. How you do this is different
* for Windows and Un*x.
* </p>
*/
public static IKeyBuilder newInstance(int capacity,
CollatorEnum collatorChoice, Locale locale, Object strength,
DecompositionEnum mode) {
if (collatorChoice == CollatorEnum.ASCII) {
/*
* No Unicode support.
*/
return new KeyBuilder(capacity);
}
/*
* Unicode support.
*/
if (locale == null) {
locale = Locale.getDefault();
if(log.isInfoEnabled())
log.info("Using default locale: " + locale.getDisplayName());
}
// true iff ICU
final boolean icu = collatorChoice == CollatorEnum.ICU;
if (icu && !DefaultKeyBuilderFactory.isICUAvailable()) {
/*
* The ICU library was required but was not located. Make sure that
* the ICU JAR is on the classpath.
*
* Note: If you are trying to use ICU4JNI then that has to be
* locatable as a native library. How you do this is different for
* Windows and Un*x.
*/
throw new UnsupportedOperationException(DefaultKeyBuilderFactory.ICU_NOT_AVAILABLE);
}
// create the initial buffer.
final byte[] buf = createBuffer(capacity);
// the buffer is initially empty.
final int len = 0;
switch (collatorChoice) {
case ICU:
return new KeyBuilder(new ICUSortKeyGenerator(locale, strength,
mode), len, buf);
case JDK:
return new KeyBuilder(new JDKSortKeyGenerator(locale, strength,
mode), len, buf);
default:
throw new UnsupportedOperationException("Collator not supported: "
+ collatorChoice);
}
}
@Override
public byte[] toZOrder(int numDimensions) {
// we're operating over Long
final int bytesTotal = Long.SIZE / Byte.SIZE * numDimensions;
final byte[] zOrderArr = new byte[bytesTotal]; // target buffer
// we compose the original components into the the z-order bit array
for (int dimIt = 0; dimIt < numDimensions; dimIt++) { // iterate dimensions
final int offset = dimIt * Long.SIZE;
for (int bufIt = 0; bufIt < Long.SIZE;) { // iterate over bits
// skip byte if no bit is set here (for performance only,
// this check is not required for correctness)
if (buf[(bufIt + offset) / Byte.SIZE] != 0) {
BytesUtil.setBit(
zOrderArr, // target array
bufIt * numDimensions + ((numDimensions - 1) - dimIt), // position
BytesUtil.getBit(buf, bufIt + offset) // value
);
bufIt++;
} else {
bufIt += Byte.SIZE; // skip full byte
}
}
}
return zOrderArr;
}
@Override
public long[] fromZOrder(int numDimensions) {
// we're operating over Long
final int bytesTotal = Long.SIZE / Byte.SIZE * numDimensions;
final byte[] componentArr = new byte[bytesTotal]; // target buffer
// we compose the original components into the the z-order bit array
for (int bufIt=0; bufIt< Long.SIZE * numDimensions; ) {
// skip byte if no bit is set here (for performance only,
// this check is not required for correctness)
if (buf[bufIt / Byte.SIZE] != 0) {
// first Long.SIZE bits belong to dim=0, next Long.SIZE to dim=1, etc.
final int dimension = bufIt % numDimensions;
final int bitPos = bufIt/numDimensions + dimension*Long.SIZE;
BytesUtil.setBit(
componentArr, // target array
bitPos, // position
BytesUtil.getBit(buf, bufIt) // value
);
bufIt++;
} else {
bufIt += Byte.SIZE; // skip full byte
}
}
// having restored the components, decode them as long[]
final long[] ret = new long[numDimensions];
for (int i=0; i<numDimensions; i++) {
ret[(numDimensions-1)-i] = decodeLong(componentArr, i*(Long.SIZE/8));
}
return ret;
}
}