/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on May 3, 2010 */ package com.bigdata.rdf.internal; import java.math.BigDecimal; import java.math.BigInteger; import java.util.UUID; import org.openrdf.model.URI; import org.openrdf.model.Value; import com.bigdata.rdf.internal.impl.BlobIV; import com.bigdata.rdf.lexicon.LexiconRelation; import com.bigdata.rdf.model.BigdataValue; import com.bigdata.util.Bytes; /** * Data Type Enumeration (DTE) is a class which declares the known intrinsic * data types, provides for extensibility to new data types, and provides for * data types which either can not be inlined or are not being inlined. The * intrinsic data types are those having natural orders which can be encoded * into an unsigned byte[] key and decoded without loss. Whether or not a given * data type is actually inlined is a configuration option for the lexicon. * <p> * If a data type is not inlined, then the representation of the value must be * materialized. Non-inline values are {@link BlobIV}s and are materialized by * looking {@link IV} in the TERMS index. More recently, we are also permitting * indirection from the TERMS index. For example, very large objects may be * stored in the file system, in S3, etc. In those cases, you must still resolve * the {@link IV} against the TERMS index. Now the resulting object may be a * {@link BigdataValue}. However, the object materialized from the TERMS index * may also provide indirection into the file system, S3, etc. * <p> * The {@link VTE} has 4 distinctions (URI, Literal, BlankNode, and SID) and is * coded in the high 2 bits of a byte while the {@link DTE} has 16 possible * distinctions, one of which is reserved against future use and one of which is * reserved against extensibility in the set of intrinsic types. * <p> * Note: Unicode values CAN NOT be inlined because (a) Unicode sort keys are not * decodable; and (b) the collation rules for Unicode depend on the lexicon * configuration, which specifies parameters such as Locale, Strength, etc. * <p> * Blanks nodes (their IDs are UUIDs) and data types with larger values (UUIDs) * or varying length values (xsd:integer, xsd:decimanl) can be inlined. Whether * it makes sense to do so is a question which trades off redundancy in the * statement indices for faster materialization of the data type values and a * smaller lexicon. UUIDs for purposes other than blank nodes can also be * inlined, however they will have a different prefix to indicate that they are * Literals rather than blank nodes. * <p> * Note: While multidimensional data types (such as points or rectangles) could * be inlined (in the sense that their values could be converted to unsigned * byte[] keys and the keys could be decoded), they can not be placed into a * total order which has meaningful semantics by a single index. For example, * geo:point describes a 2-dimensional location and lacks any meaningful * locality when inlined into the index. The only reason to inline such data * types is to avoid indirection through the lexicon to materialize their * values. Efficiently resolving points in a region requires the use of a * spatial index. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * * @see http://www.w3.org/TR/xmlschema-2/ */ public enum DTE { /** * The "inline" value is a boolean (xsd:boolean). Only the distinct points * in the xsd:boolean value space are represented. xsd:boolean has multiple * lexical forms which map onto "true" and "false". Those distinctions are * not preserved. */ XSDBoolean((byte) 0, Bytes.SIZEOF_BYTE, Boolean.class, XSD.BOOLEAN, DTEFlags.NOFLAGS), // /** The "inline" value is a signed byte (xsd:byte). */ XSDByte((byte) 1, Bytes.SIZEOF_BYTE, Byte.class, XSD.BYTE, DTEFlags.NUMERIC), // /** The "inline" value is a signed short (xsd:short). */ XSDShort((byte) 2, Bytes.SIZEOF_SHORT, Short.class, XSD.SHORT, DTEFlags.NUMERIC), // /** The "inline" value is a signed 4 byte integer (xsd:int). */ XSDInt((byte) 3, Bytes.SIZEOF_INT, Integer.class, XSD.INT, DTEFlags.NUMERIC), // /** The "inline" value is a signed 8 byte integer (xsd:long). */ XSDLong((byte) 4, Bytes.SIZEOF_LONG, Long.class, XSD.LONG, DTEFlags.NUMERIC), // /* * unsigned byte, short, int, long. */ /** The "inline" value is an unsigned byte (xsd:unsignedByte). */ XSDUnsignedByte((byte) 5, Bytes.SIZEOF_BYTE, Byte.class, XSD.UNSIGNED_BYTE, DTEFlags.UNSIGNED_NUMERIC), // /** The "inline" value is a unsigned short (xsd:unsignedShort). */ XSDUnsignedShort((byte) 6, Bytes.SIZEOF_SHORT, Short.class, XSD.UNSIGNED_SHORT, DTEFlags.UNSIGNED_NUMERIC), // /** The "inline" value is an unsigned 4 byte integer (xsd:unsignedInt). */ XSDUnsignedInt((byte) 7, Bytes.SIZEOF_INT, Integer.class, XSD.UNSIGNED_INT, DTEFlags.UNSIGNED_NUMERIC), // /** The "inline" value is an unsigned 8 byte integer (xsd:unsignedLong). */ XSDUnsignedLong((byte) 8, Bytes.SIZEOF_LONG, Long.class, XSD.UNSIGNED_LONG, DTEFlags.UNSIGNED_NUMERIC), // /* * float, double. */ /** * The "inline" value is a single precision floating point number * (xsd:float). */ XSDFloat((byte) 9, Bytes.SIZEOF_FLOAT, Float.class, XSD.FLOAT, DTEFlags.NUMERIC), // /** * The "inline" value is a double precision floating point number * (xsd:double). */ XSDDouble((byte) 10, Bytes.SIZEOF_DOUBLE, Double.class, XSD.DOUBLE, DTEFlags.NUMERIC), // /* * xsd:integer, xsd:decimal. */ /** * The "inline" value is an xsd:integer, which is equivalent to * {@link BigInteger}. */ XSDInteger((byte) 11, 0/* variable length */, BigInteger.class, XSD.INTEGER, DTEFlags.NUMERIC), // /** * The "inline" value is an xsd:decimal. This is mostly equivalent to * {@link BigDecimal}, but unlike that Java class, xsd:decimal DOES NOT * preserve the precision of the value. (This fact is convenient for indices * since {@link BigDecimal} has, among other things, many distinct * representations of ZERO with different precision, etc. If we had to * represent the precision, we could not use xsd:decimal in an index!) */ XSDDecimal((byte) 12, 0/* variable length */, BigDecimal.class, XSD.DECIMAL, DTEFlags.NUMERIC), // /* * custom intrinsic data types. */ /** * The "inline" value is a {@link UUID}. * * @see http://lists.xml.org/archives/xml-dev/201003/msg00027.html */ UUID((byte) 13, Bytes.SIZEOF_UUID, UUID.class, XSD.UUID, DTEFlags.NOFLAGS), // /** * The "inline" value is a compressed Unicode string. This is decodable * compressed encoding rather than a Unicode sort key. It is suitable for * representing "small" Unicode values directly within the statement * indices. "Small" is configurable, but should not be overly large. The * tradeoff is the growth in the B+Tree leaf size for a statement index * versus the overhead required when indirecting through the * {@link LexiconRelation} to materialize the RDF {@link Value}. Further, * there is a practical upper bound on the size of a key in the B+Tree. * Therefore, inlining of Unicode values having between 32 and 64 characters * is suggested as a recommended practice. Beyond that, inlining can * contribute significantly to the growth in the B+Tree leaf size and have a * negative impact on join performance for the statement indices. * <p> * This {@link DTE} may be used in combination with {@link VTE} as follows: * <dl> * <dt>{@link VTE#BNODE}</dt> * <dd>Represent an inline blank node identifier.</dd> * <dt>{@link VTE#LITERAL}</dt> * <dd>Represent a plain literal.</dd> * <dt>{@link VTE#LITERAL} plus the extension bit</dt> * <dd>Represent a data type literal where the extension IV is the data type * of the literal.</dd> * <dt>{@link VTE#URI} plus the extension bit</dt> * <dd>Represent a URI where the extension IV is the namespace of the URI * and the inline Unicode component is the local name of the URI. This * depends on the openrdf definition of a {@link URI}'s namespace and local * name (basically, everything after the last '/' in the URI path or after * the '#' if there is a URI anchor).</dd> * </dl> */ XSDString((byte) 14, 0/* len */, String.class, XSD.STRING, DTEFlags.NOFLAGS), // /** * This provides an extension of the intrinsic data types. Its code * corresponds to 0xf, which is to say all four bits are on. When this code * is used, you must interpret the next byte using {@link DTEExtension}. * <p> * Note: This is NOT the same as the {@link AbstractIV#isExtension()} bit. * The latter <strong>always</strong> indicates that an {@link IV} follows * the <code>flags</code> byte and indicates the actual datatype URI. In * contrast, {@link DTE#Extension} gives you another byte which you can use * to handle additional "intrinsic" types. * * @see DTEExtension * @see BLZG-1507 (Implement support for DTE extension types for URIs) */ Extension((byte) 15, 0/* len */, Void.class, null/* datatype */, DTEFlags.NOFLAGS); /** * @param v * The code for the data type. * @param len * The length of the inline value -or- ZERO (0) if the value has * a variable length (xsd:integer, xsd:decimal). * @param cls * The class of the Java object used to represent instances of * the coded data type. * @param datatype * The well-known URI for the data type. * @param flags * Some bit flags. See {@link #NUMERIC}, * {@link #UNSIGNED_NUMERIC}, etc. */ private DTE(final byte v, final int len, final Class<?> cls, final URI datatypeURI, final int flags) { this.v = v; this.len = len; this.cls = cls; this.datatypeURI = datatypeURI; this.flags = flags; } static final public DTE valueOf(final byte b) { /* * Note: This switch MUST correspond to the declarations above (you can * not made the cases of the switch from [v] since it is not considered * a to be constant by the compiler). * * Note: This masks off everything but the lower 4 bits. */ switch (b & 0x0f) { case 0: return XSDBoolean; case 1: return XSDByte; case 2: return XSDShort; case 3: return XSDInt; case 4: return XSDLong; case 5: return XSDUnsignedByte; case 6: return XSDUnsignedShort; case 7: return XSDUnsignedInt; case 8: return XSDUnsignedLong; case 9: return XSDFloat; case 10: return XSDDouble; case 11: return XSDInteger; case 12: return XSDDecimal; case 13: return UUID; case 14: return XSDString; case 15: return Extension; default: throw new IllegalArgumentException(Byte.toString(b)); } } /** * Return the {@link DTE} for the datatype {@link URI}. * * @param datatype * The datatype {@link URI}. * * @return The {@link DTE} for that datatype -or- {@link #Extension} if the * datatype is <code>null</code> (there is no specific datatype for * an extension since extensions by their nature can handle any * datatype) -or- <code>null</code> if the datatype {@link URI} is * none of the datatypes for which native support is provided. */ static final public DTE valueOf(final URI datatype) { /* * Note: This switch MUST correspond to the declarations above (you can * not make the cases of the switch from [v] since it is not considered * a to be constant by the compiler). * * TODO Optimize using trie, weighted frequency lookup tree, hash map, * etc. Also, the match will always be on the local name once we proof * the namespace. */ if (datatype == null) { return null; } // Removed for BLZG-1507 (Implement support for DTE extension types for URIs) // if (datatype.equals(XSD.IPV4)) { // /* // * Note: This is a bit of a rough spot in the API. There is no // * datatype associated with [Extension] since it is a place holder // * for any an extension for any datatype. // * // * Right now I am hijacking Extension for IPv4. // */ // return Extension; // } if (datatype.equals(XSD.BOOLEAN)) return XSDBoolean; if (datatype.equals(XSD.BYTE)) return XSDByte; if (datatype.equals(XSD.SHORT)) return XSDShort; if (datatype.equals(XSD.INT)) return XSDInt; if (datatype.equals(XSD.LONG)) return XSDLong; if (datatype.equals(XSD.UNSIGNED_BYTE)) return XSDUnsignedByte; if (datatype.equals(XSD.UNSIGNED_SHORT)) return XSDUnsignedShort; if (datatype.equals(XSD.UNSIGNED_INT)) return XSDUnsignedInt; if (datatype.equals(XSD.UNSIGNED_LONG)) return XSDUnsignedLong; if (datatype.equals(XSD.FLOAT)) return XSDFloat; if (datatype.equals(XSD.DOUBLE)) return XSDDouble; if (datatype.equals(XSD.INTEGER)) return XSDInteger; if (datatype.equals(XSD.DECIMAL)) return XSDDecimal; if (datatype.equals(XSD.UUID)) return UUID; if (datatype.equals(XSD.STRING)) return XSDString; /* * Not a known DTE datatype. */ return null; } /** * The code for the data type. */ final byte v; /** * The length of the inline value -or- ZERO (0) if the value has a variable * length (xsd:integer, xsd:decimal). */ private final int len; /** * The class of the Java object used to represent instances of the coded * data type. */ private final Class<?> cls; /** * The well-known URI for the data type. */ private final URI datatypeURI; /** * Some bit flags. * * @see #NUMERIC * @see #UNSIGNED_NUMERIC */ private final int flags; /** * An <code>byte</code> value whose whose lower 6 bits code the * {@link DTE}. */ final public byte v() { return v; } /** * The length of the data type value when represented as a component in an * unsigned byte[] key -or- ZERO iff the key component has a variable length * for that data type. */ final public int len() { return len; } /** * The class of the Java object used to represent instances of the coded * data type. */ final public Class<?> getCls() { return cls; } /** * The corresponding datatype {@link URI}. */ final public URI getDatatypeURI() { return datatypeURI; } /** * <code>true</code> for any of the numeric data types (xsd:byte, * xsd:unsignedByte, xsd:short, xsd:unsignedShort, xsd:int, xsd:unsignedInt, * xsd:long, xsd:unsignedLong, xsd:float, xsd:double, xsd:integer, and * xsd:decimal). */ public boolean isNumeric() { return (flags & DTEFlags.NUMERIC) != 0; } /** * <code>true</code> for an signed numeric datatype ( xsd:byte, * xsd:short, xsd:int, xsd:long, xsd:float, xsd:double, xsd:integer, and * xsd:decimal). */ public boolean isSignedNumeric() { return isNumeric() && !isUnsignedNumeric(); } /** * <code>true</code> for an unsigned numeric datatype ( xsd:unsignedByte, * xsd:unsignedShort, xsd:unsignedInt, xsd:unsignedLong). */ public boolean isUnsignedNumeric() { /* * Note: The DTEFlags are not independent bits so when we test them * we have to test for equality rather than masking off the various * bits. */ return flags == DTEFlags.UNSIGNED_NUMERIC; } /** * This is <code>!isBigNumeric()</code> and is <code>true</code> for any of * the fixed length numeric data types (<code>xsd:byte, xsd:unsignedByte, * xsd:short, xsd:unsignedShort, xsd:int, xsd:unsignedInt, xsd:long, * xsd:unsignedLong, xsd:float, xsd:double</code>). */ public boolean isFixedNumeric() { return (flags & DTEFlags.NUMERIC) != 0 && len != 0; } /** * <code>true</code> for xsd:integer and xsd:decimal. */ public boolean isBigNumeric() { return (flags & DTEFlags.NUMERIC) != 0 && len == 0; } /** * <code>true</code> for xsd:float, xsd:double, and xsd:decimal. */ public boolean isFloatingPointNumeric() { return this == XSDFloat || this == XSDDouble || this == XSDDecimal; } }