IKeyBuilder.java example

Explorer
blazegraph-master
- database-master
/*

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/
/*
 * Created on Apr 30, 2007
 */

package com.bigdata.btree.keys;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.Locale;
import java.util.Properties;
import java.util.UUID;

import com.bigdata.btree.keys.KeyBuilder.Options;
import com.bigdata.io.IManagedByteArray;
import com.bigdata.util.BytesUtil;

/**
 * <p>
 * Interface for building up variable <code>unsigned byte[]</code> keys from
 * one or more primitive data types values and/or Unicode strings. An instance
 * of this interface may be {@link #reset()} and reused to encode a series of
 * keys.
 * </p>
 * <p>
 * A sort key is an unsigned byte[] that preserves the total order of the
 * original data. Sort keys may potentially be formed from multiple fields but
 * field markers do not appear within the resulting sort key. While the original
 * values can be extracted from sort keys (this is true of all the fixed length
 * fields, such as int, long, float, or double) they can not be extracted from
 * Unicode variable length fields (the collation ordering for a Unicode string
 * depends on the {@link Locale}, the collation strength, and the decomposition
 * mode and is a non-reversible operation).
 * </p>
 * <h2>Unicode</h2>
 * <p>
 * Factory methods are defined by {@link KeyBuilder} for obtaining instances of
 * this interface that optionally support Unicode. Instances may be created for
 * a given {@link Locale}, collation strength, decomposition mode, etc.
 * </p>
 * <p>
 * The ICU library supports generation of compressed Unicode sort keys and is
 * used by default when available. The JDK {@link java.text} package also
 * supports the generation of Unicode sort keys, but it does NOT produce
 * compressed sort keys. The resulting sort keys are therefore (a) incompatible
 * with those produced by the ICU library and (b) much larger than those
 * produced by the ICU library.
 * </p>
 * <p>
 * Support for Unicode MAY be disabled using {@link Options#COLLATOR}, by using
 * {@link KeyBuilder#newInstance()} or another factory method that does not
 * enable Unicode support, or by using one of the {@link KeyBuilder}
 * constructors that does not support Unicode.
 * </p>
 * <h2>Multi-field keys with variable length fields</h2>
 * <p>
 * Multi-field keys in which variable length fields are embedded within the key
 * present a special problem. Any run of fixed length fields can be compared as
 * unsigned byte[]s. Likewise, any key with a fixed length prefix (including
 * zero) but a variable length field in its tail can also be compared directly
 * as unsigned byte[]s. However, the introduction of a variable length field
 * into any non-terminal position in a multi-field key must be handled specially
 * since simple concatenation of the field keys will NOT produce the correct
 * total ordering. (This is why SQL requires that text fields compare as if they
 * were padded out with ASCII blanks (0x20) to some maximum length for the
 * field.) A utility method exists specifically for this purpose - see
 * {@link #appendText(String, boolean, boolean)}.
 * </p>
 * 
 * @see KeyBuilder#newInstance()
 * @see KeyBuilder#newUnicodeInstance()
 * @see KeyBuilder#newUnicodeInstance(Properties)
 * @see SuccessorUtil
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public interface IKeyBuilder extends ISortKeyBuilder<Object>, IManagedByteArray {

    /**
     * The backing byte[] WILL be transparently replaced if the buffer capacity
     * is extended. {@inheritDoc}
     */
    byte[] array();

    /**
     * The offset of the slice into the backing byte[] is always zero.
     * {@inheritDoc}
     */
    int off();

    /**
     * The length of the slice is the number of bytes written onto the backing
     * byte[]. This is set to ZERO (0) by {@link #reset()}. {@inheritDoc}
     */
    int len();

    /**
     * Return the encoded key. Comparison of keys returned by this method MUST
     * treat the array as an array of <em>unsigned bytes</em>.
     * <p>
     * Note that keys are <em>donated</em> to the btree so it is important to
     * allocate new keys when running in the same process space. When using a
     * network api, the api provides the necessary decoupling.
     * 
     * @return A new array containing the key.
     * 
     * @see BytesUtil#compareBytes(byte[], byte[])
     */
    byte[] getKey();

    /**
     * An alias for {@link #getKey()}.
     * 
     * {@inheritDoc}
     */
    byte[] toByteArray();

    /**
     * Reset the key length to zero before building another key.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder reset();

    /*
     * Optional operations.
     */

    /**
     * Encodes a Unicode string using the configured {@link Options#COLLATOR}
     * and appends the resulting sort key to the buffer (without a trailing nul
     * byte).
     * <p>
     * Note: The {@link SuccessorUtil#successor(String)} of a string is formed
     * by appending a trailing <code>nul</code> character. However, since
     * <code>IDENTICAL</code> appears to be required to differentiate between
     * a string and its successor (with the trailing <code>nul</code>
     * character), you MUST form the sort key first and then its successor (by
     * appending a trailing <code>nul</code>). Failure to follow this pattern
     * will lead to the successor of the key comparing as EQUAL to the key. For
     * example,
     * 
     * <pre>
     * IKeyBuilder keyBuilder = ...;
     * 
     * String s = "foo";
     * 
     * byte[] fromKey = keyBuilder.reset().append(s).getKey();
     * 
     * // right.
     * byte[] toKey = keyBuilder.reset().append(s).appendNul().getKey();
     * 
     * // wrong!
     * byte[] badToKey = keyBuilder.reset().append(s + "\0").getKey();
     * </pre>
     * <p>
     * FIXME update the javadoc further to speak to handling of multi-field
     * keys.
     * 
     * @param s
     *            A string.
     * 
     * @return <i>this</i>
     * 
     * @throws UnsupportedOperationException
     *             if Unicode is not supported.
     * 
     * @see SuccessorUtil#successor(String)
     * @see SuccessorUtil#successor(byte[])
     * @see TestICUUnicodeKeyBuilder#test_keyBuilder_unicode_trailingNuls()
     * 
     * @todo provide a more flexible interface for handling Unicode, including
     *       the means to encode using a specified language family (such as
     *       could be identified with an <code>xml:lang</code> attribute).
     */
    IKeyBuilder append(String s);

    /**
     * Encodes a variable length text field into the buffer. The text is
     * truncated to {@link IKeyBuilder#maxlen} characters. The sort keys for
     * strings that differ after truncation solely in the #of trailing
     * {@link #pad} characters will be identical (trailing pad characters are
     * implicit out to {@link #maxlen} characters).
     * <p>
     * Note: Trailing pad characters are normalized to a representation as a
     * single pad character (1 byte) followed by the #of actual or implied
     * trailing pad characters represented as an unsigned short integer (2
     * bytes). This technique serves to keep multi-field keys with embedded
     * variable length text fields aligned such that the field following a
     * variable length text field does not bleed into the lexicographic
     * ordering of the variable length text field.
     * <p>
     * Note: While the ASCII encoding happens to use one byte for each character
     * that is NOT true of the Unicode encoding. The space requirements for the
     * Unicode encoding depend on the text, the Locale, the collator strength,
     * and the collator decomposition mode.
     * <p>
     * Note: The <i>successor</i> option is designed to encapsulate some
     * trickiness around forming the successor of a variable length text field
     * embedded in a multi-field key. In particular, simply appending a
     * <code>nul</code> byte will NOT work (it works fine when the text field
     * is the last field in the key or when it is the only component in the
     * key). This approach breaks encapsulation of the field boundaries such
     * that the resulting "successor" is actually ordered before the original
     * key. This happens because you introduce a 0x0 byte right on the boundary
     * of the next field, effectively causing the next field to have a smaller
     * value. Consider the following example (in hex) where "|" represents the
     * end of the "text" field:
     * 
     * <pre>
     *     ab cd | 12
     * </pre>
     * 
     * if you compute the successor by appending a nul byte to the text field
     * you get
     * 
     * <pre>
     *     ab cd | 00 12
     * </pre>
     * 
     * which is ordered before the original key!
     * 
     * @param text
     *            The text.
     * @param unicode
     *            When true the text is interpreted as Unicode according to the
     *            {@link Options#COLLATOR} option. Otherwise it is interpreted
     *            as ASCII.
     * @param successor
     *            When true, the successor of the text will be encoded.
     *            Otherwise the text will be encoded.
     * 
     * @return The {@link IKeyBuilder}.
     * 
     * @see <a
     *      href="http://www.unicode.org/reports/tr10/tr10-10.html#Interleaved_Levels">Unicode
     *      TR10: Interleaved Levels</a>
     */
    IKeyBuilder appendText(String text, boolean unicode, boolean successor);

    /*
     * Note: This operation is not implemented since it can cause confusion so
     * easily.  If you want Unicode encoding use append(String).  If you want
     * ASCII encoding, use appendASCII(String).
     */
//    /**
//     * Encodes a character as a Unicode sort key by first converting it to a
//     * unicode string of length N and then encoding it using
//     * {@link #append(String)} (optional operation).
//     * 
//     * @throws UnsupportedOperationException
//     *                if Unicode is not supported.
//     *                
//     * @return <i>this</i>
//     */
//    public IKeyBuilder append(char[] v);

    /*
     * Required operations.
     */

    /**
     * Return <code>true</code> iff Unicode is supported by this object
     * (returns <code>false</code> if only ASCII support is configured).
     * 
     * @return <code>true</code> iff Unicode is supported.
     */
    boolean isUnicodeSupported();

    /**
     * The maximum length of a variable length text field is
     * <code>65535</code> (<code>pow(2,16)-1</code>).
     * <p>
     * Note: This restriction only applies to multi-field keys where the text
     * field appears in a non-terminal position within the key - that is, as
     * encoded by {@link #appendText(String, boolean, boolean)}. When a text
     * field appears in such a non-terminal position trailing pad characters
     * are used to maintain lexicographic ordering over the multi-field key.
     */
    // Interface fields are implicitly public, static and final.
    int maxlen = 65535;

    /**
     * Encodes a unicode string by assuming that its contents are ASCII
     * characters. For each character, this method simply chops off the high
     * byte and converts the low byte to an unsigned byte.
     * <p>
     * Note: This method is potentially much faster than the Unicode aware
     * {@link #append(String)}. However, this method is NOT unicode aware and
     * non-ASCII characters will not be encoded correctly. This method MUST NOT
     * be mixed with keys whose corresponding component is encoded by the
     * unicode aware methods, e.g., {@link #append(String)}.
     * 
     * @param s
     *            A String containing US-ASCII characters.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder appendASCII(String s);

    /**
     * Appends a byte - the byte is treated as an <code>unsigned</code> value.
     * 
     * @param b
     *            The byte.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(byte b);

    /**
     * Appends an array of bytes - the bytes are treated as
     * <code>unsigned</code> values.
     * 
     * @param a
     *            The array of bytes.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(byte[] a);

    /**
     * Append <i>len</i> bytes starting at <i>off</i> in <i>a</i> to the key
     * buffer - the bytes are treated as <code>unsigned</code> values.
     * 
     * @param a
     *            The array containing the bytes to append.
     * @param off
     *            The offset.
     * @param len
     *            The #of bytes to append.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(byte[] a, int off, int len);

    /**
     * Appends a double precision floating point value by first converting it
     * into a signed long integer using {@link Double#doubleToLongBits(double)},
     * converting that value into a twos-complement number and then appending
     * the bytes in big-endian order into the key buffer.
     * <p>
     * Note: this converts -0d and +0d to the same key.
     * 
     * @param d
     *            The double-precision floating point value.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(double d);

    /**
     * Appends a single precision floating point value by first converting it
     * into a signed integer using {@link Float#floatToIntBits(float)},
     * converting that value into a twos-complement number and then appending
     * the bytes in big-endian order into the key buffer.
     * <p>
     * Note: this converts -0f and +0f to the same key.
     * 
     * @param f
     *            The single-precision floating point value.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(float f);

    /**
     * Appends the UUID to the key using the MSB and then the LSB (this
     * preserves the natural order imposed by {@link UUID#compareTo(UUID)}).
     * 
     * @param uuid
     *            The UUID.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(UUID uuid);

    /**
     * Appends a signed long integer to the key by first converting it to a
     * lexicographic ordering as an unsigned long integer and then appending it
     * into the buffer as 8 bytes using a big-endian order.
     * 
     * @param v
     *            The signed long integer.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(long v);

    /**
     * Appends a signed integer to the key by first converting it to a
     * lexicographic ordering as an unsigned integer and then appending it into
     * the buffer as 4 bytes using a big-endian order.
     * 
     * @param v
     *            The signed integer.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(int v);

    /**
     * Appends a signed short integer to the key by first converting it to a
     * twos-complement representation supporting unsigned byte[] comparison and
     * then appending it into the buffer as 2 bytes using a big-endian order.
     * 
     * @param v
     *            The signed short integer.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(short v);

    /*
     * Note: this method has been dropped from the API to reduce the
     * possibility of confusion.  If you want Unicode semantics then use
     * append(String).  If you want ASCII semantics then use appendASCII().
     * If you want signed integer semantics then use append(short).
     */
//    /**
//     * Encodes a character as a 16-bit unsigned integer.
//     * <p>
//     * Note: Characters are encoded as unsigned integers rather than as Unicode
//     * values since the semantics of Unicode collation sequences often violate
//     * the semantics of the character code points, even for ASCII. For example,
//     * the character 'z' has the successor '{', but Unicode collation would
//     * place order the string "{" BEFORE the string "z".
//     * 
//     * @param v
//     *            The character.
//     *            
//     * @return <i>this</i>
//     */
//    public IKeyBuilder append(char v);

    /**
     * Converts the signed byte to an unsigned byte and appends it to the key.
     * 
     * @param v
     *            The signed byte.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder appendSigned(byte v);

    /**
     * Append an unsigned zero byte to the key.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder appendNul();

    /**
     * Encode a {@link BigInteger} into an unsigned byte[] and append it into
     * the key buffer.
     * <p>
     * The encoding is a 2 byte run length whose leading bit is set iff the
     * {@link BigInteger} is negative followed by the <code>byte[]</code> as
     * returned by {@link BigInteger#toByteArray()}.
     * 
     * @param i
     *            The {@link BigInteger} value.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(BigInteger i);

    /**
     * Encode a {@link BigDecimal} into an unsigned byte[] and append it into
     * the key buffer.
     * 
     * @param d
     *            The {@link BigDecimal} value.
     * 
     * @return <i>this</i>
     */
    IKeyBuilder append(BigDecimal d);

    /**
     * Append the value to the buffer, encoding it as appropriate based on the
     * class of the object. This method handles all of the primitive data types
     * plus {@link UUID} and Unicode {@link String}s.
     * 
     * @param val
     *            The value.
     * 
     * @return <i>this</i>
     * 
     * @throws IllegalArgumentException
     *             if <i>val</i> is <code>null</code>.
     * @throws UnsupportedOperationException
     *             if <i>val</i> is an instance of an unsupported class.
     */
    IKeyBuilder append(Object val);

    /**
     * Converts the key into a z-order byte array, assuming
     * <i>numDimensions</i> components of type Long (i.e., 64bit each). For
     * instance (shown with 8-bit components for brevity), assume the current
     * key's buffer is <code>001001011010010001010100</code> and we call the
     * method with <code>numDimensions=3</code>. The method logically proceeds
     * as follows:
     * <ol>
     * <li>Split the key into n components, namely:
     * <code>00100101 10100100 01010100</code></li>
     * <li>Merge the components bit by bit:
     * <code>010 001 110 001 000 111 000 100</code></li>
     * <li>The result is this merged array</li>
     * </ol>
     * 
     * @param numDimensions
     *            The number of components in the current key.
     * 
     * @return The z-order byte array formed by interleaving the bits of the
     *         components.
     */
    byte[] toZOrder(int numDimensions);

    /**
     * Inverts {@link #toZOrder(int)} in the sense that it interprets the
     * buffer as a z-order string and returns an array of long values of size
     * <i>numDimensions</i>, reflecting the individual components of the
     * z-order string.
     * 
     * @param numDimensions
     *            The number of components encoded in the z-order string.
     * 
     * @return An array of <i>numDimensions</i> long values, one per component.
     */
    long[] fromZOrder(int numDimensions);

}