/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.binarysortable;

import java.io.IOException;
import java.math.BigInteger;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimalV1;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.ByteStream;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalDayTimeWritable;
import org.apache.hadoop.hive.serde2.io.HiveIntervalYearMonthWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampTZWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector.StandardUnion;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveIntervalDayTimeObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveIntervalYearMonthObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampTZObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * BinarySortableSerDe can be used to write data in a way that the data can be
 * compared byte-by-byte with the same order.
 *
 * The data format:
 * NULL: a single byte (\0 or \1, see below)
 * NON-NULL Primitives: ALWAYS prepend a single byte (\0 or \1), and then:
 *   Boolean: FALSE = \1, TRUE = \2
 *   Byte: flip the sign-bit to make sure negative comes before positive
 *   Short: flip the sign-bit to make sure negative comes before positive
 *   Int: flip the sign-bit to make sure negative comes before positive
 *   Long: flip the sign-bit to make sure negative comes before positive
 *   Double: flip the sign-bit for positive double values, and all bits for
 *   negative double values
 *   String: NULL-terminated UTF-8 string, with NULL escaped to \1 \1, and \1
 *   escaped to \1 \2
 * NON-NULL Complex Types: ALWAYS prepend a single byte \1, and then:
 *   Struct: one field after another
 *   List: \1 followed by each element, and \0 to terminate
 *   Map: \1 followed by each key and then each value, and \0 to terminate
 *
 * This SerDe takes an additional parameter SERIALIZATION_SORT_ORDER, which is
 * a string containing only "+" and "-". The length of the string should equal
 * the number of fields in the top-level struct for serialization. "+" means
 * the field should be sorted in ascending order, and "-" in descending order.
 * Sub-fields within the same top-level field share that field's sort order.
 *
 * This SerDe takes an additional parameter SERIALIZATION_NULL_SORT_ORDER,
 * which is a string containing only "a" and "z". The length of the string
 * should equal the number of fields in the top-level struct for
 * serialization. "a" means that NULL should come first (thus, the single byte
 * is \0 for ascending order, \1 for descending order), while "z" means that
 * NULL should come last (thus, the single byte is \1 for ascending order, \0
 * for descending order).
 */
@SerDeSpec(schemaProps = {
    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
    serdeConstants.SERIALIZATION_SORT_ORDER,
    serdeConstants.SERIALIZATION_NULL_SORT_ORDER})
public class BinarySortableSerDe extends AbstractSerDe {

  public static final Logger LOG = LoggerFactory.getLogger(BinarySortableSerDe.class.getName());

  public static final byte ZERO = (byte) 0;
  public static final byte ONE = (byte) 1;

  List<String> columnNames;
  List<TypeInfo> columnTypes;

  TypeInfo rowTypeInfo;
  StructObjectInspector rowObjectInspector;

  boolean[] columnSortOrderIsDesc;
  byte[] columnNullMarker;
  byte[] columnNotNullMarker;

  public static Charset decimalCharSet = Charset.forName("US-ASCII");

  @Override
  public void initialize(Configuration conf, Properties tbl)
      throws SerDeException {
    // Get column names and sort order
    String columnNameProperty = tbl.getProperty(serdeConstants.LIST_COLUMNS);
    String columnTypeProperty = tbl.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    final String columnNameDelimiter = tbl.containsKey(serdeConstants.COLUMN_NAME_DELIMITER)
        ? tbl.getProperty(serdeConstants.COLUMN_NAME_DELIMITER)
        : String.valueOf(SerDeUtils.COMMA);
    if (columnNameProperty.length() == 0) {
      columnNames = new ArrayList<String>();
    } else {
      columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
    }
    if (columnTypeProperty.length() == 0) {
      columnTypes = new ArrayList<TypeInfo>();
    } else {
      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }
    assert (columnNames.size() == columnTypes.size());

    // Create row related objects
    rowTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNames, columnTypes);
    rowObjectInspector = (StructObjectInspector) TypeInfoUtils
        .getStandardWritableObjectInspectorFromTypeInfo(rowTypeInfo);
    row = new ArrayList<Object>(columnNames.size());
    for (int i = 0; i < columnNames.size(); i++) {
      row.add(null);
    }

    // Get the sort order
    String columnSortOrder = tbl.getProperty(serdeConstants.SERIALIZATION_SORT_ORDER);
    columnSortOrderIsDesc = new boolean[columnNames.size()];
    for (int i = 0; i < columnSortOrderIsDesc.length; i++) {
      columnSortOrderIsDesc[i] = (columnSortOrder != null
          && columnSortOrder.charAt(i) == '-');
    }

    // Null first/last
    String columnNullOrder = tbl.getProperty(serdeConstants.SERIALIZATION_NULL_SORT_ORDER);
    columnNullMarker = new byte[columnNames.size()];
    columnNotNullMarker = new byte[columnNames.size()];
    for (int i = 0; i < columnSortOrderIsDesc.length; i++) {
      if (columnSortOrderIsDesc[i]) {
        // Descending
        if (columnNullOrder != null && columnNullOrder.charAt(i) == 'a') {
          // Null first
          columnNullMarker[i] = ONE;
          columnNotNullMarker[i] = ZERO;
        } else {
          // Null last (default for descending order)
          columnNullMarker[i] = ZERO;
          columnNotNullMarker[i] = ONE;
        }
      } else {
        // Ascending
        if (columnNullOrder != null && columnNullOrder.charAt(i) == 'z') {
          // Null last
          columnNullMarker[i] = ONE;
          columnNotNullMarker[i] = ZERO;
        } else {
          // Null first (default for ascending order)
          columnNullMarker[i] = ZERO;
          columnNotNullMarker[i] = ONE;
        }
      }
    }
  }
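
  // Illustrative sketch, not part of the original class: the marker selection
  // above collapses to one rule per column. This hypothetical helper restates
  // that rule for a single column; the name is invented for illustration.
  private static byte nullMarkerFor(boolean isDesc, char nullOrder) {
    if (isDesc) {
      // Descending: NULL-first ('a') uses \1, NULL-last (default) uses \0.
      return nullOrder == 'a' ? ONE : ZERO;
    }
    // Ascending: NULL-last ('z') uses \1, NULL-first (default) uses \0.
    return nullOrder == 'z' ? ONE : ZERO;
  }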

  @Override
  public Class<? extends Writable> getSerializedClass() {
    return BytesWritable.class;
  }

  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return rowObjectInspector;
  }

  ArrayList<Object> row;
  InputByteBuffer inputByteBuffer = new InputByteBuffer();

  @Override
  public Object deserialize(Writable blob) throws SerDeException {
    BytesWritable data = (BytesWritable) blob;
    inputByteBuffer.reset(data.getBytes(), 0, data.getLength());

    try {
      for (int i = 0; i < columnNames.size(); i++) {
        row.set(i, deserialize(inputByteBuffer, columnTypes.get(i),
            columnSortOrderIsDesc[i], columnNullMarker[i],
            columnNotNullMarker[i], row.get(i)));
      }
    } catch (IOException e) {
      throw new SerDeException(e);
    }

    return row;
  }
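
  // Illustrative sketch, not part of the original class: deserialize() reuses
  // one internal ArrayList across calls, so a caller that retains rows must
  // copy them. This hypothetical helper shows the defensive copy.
  @SuppressWarnings("unchecked")
  private List<Object> deserializeToFreshList(Writable blob) throws SerDeException {
    return new ArrayList<Object>((List<Object>) deserialize(blob));
  }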

  static Object deserialize(InputByteBuffer buffer, TypeInfo type,
      boolean invert, byte nullMarker, byte notNullMarker, Object reuse)
      throws IOException {

    // Is this field a null?
    byte isNull = buffer.read(invert);
    if (isNull == nullMarker) {
      return null;
    }
    assert (isNull == notNullMarker);

    switch (type.getCategory()) {
    case PRIMITIVE: {
      PrimitiveTypeInfo ptype = (PrimitiveTypeInfo) type;
      switch (ptype.getPrimitiveCategory()) {
      case VOID: {
        return null;
      }
      case BOOLEAN: {
        BooleanWritable r = reuse == null ? new BooleanWritable()
            : (BooleanWritable) reuse;
        byte b = buffer.read(invert);
        assert (b == 1 || b == 2);
        r.set(b == 2);
        return r;
      }
      case BYTE: {
        ByteWritable r = reuse == null ? new ByteWritable()
            : (ByteWritable) reuse;
        r.set((byte) (buffer.read(invert) ^ 0x80));
        return r;
      }
      case SHORT: {
        ShortWritable r = reuse == null ? new ShortWritable()
            : (ShortWritable) reuse;
        int v = buffer.read(invert) ^ 0x80;
        v = (v << 8) + (buffer.read(invert) & 0xff);
        r.set((short) v);
        return r;
      }
      case INT: {
        IntWritable r = reuse == null ? new IntWritable()
            : (IntWritable) reuse;
        r.set(deserializeInt(buffer, invert));
        return r;
      }
      case LONG: {
        LongWritable r = reuse == null ? new LongWritable()
            : (LongWritable) reuse;
        r.set(deserializeLong(buffer, invert));
        return r;
      }
      case FLOAT: {
        FloatWritable r = reuse == null ? new FloatWritable()
            : (FloatWritable) reuse;
        int v = 0;
        for (int i = 0; i < 4; i++) {
          v = (v << 8) + (buffer.read(invert) & 0xff);
        }
        if ((v & (1 << 31)) == 0) {
          // negative number, flip all bits
          v = ~v;
        } else {
          // positive number, flip the first bit
          v = v ^ (1 << 31);
        }
        r.set(Float.intBitsToFloat(v));
        return r;
      }
      case DOUBLE: {
        DoubleWritable r = reuse == null ? new DoubleWritable()
            : (DoubleWritable) reuse;
        long v = 0;
        for (int i = 0; i < 8; i++) {
          v = (v << 8) + (buffer.read(invert) & 0xff);
        }
        if ((v & (1L << 63)) == 0) {
          // negative number, flip all bits
          v = ~v;
        } else {
          // positive number, flip the first bit
          v = v ^ (1L << 63);
        }
        r.set(Double.longBitsToDouble(v));
        return r;
      }
      case STRING: {
        Text r = reuse == null ? new Text() : (Text) reuse;
        return deserializeText(buffer, invert, r);
      }
      case CHAR: {
        HiveCharWritable r = reuse == null ? new HiveCharWritable()
            : (HiveCharWritable) reuse;
        // Use internal text member to read value
        deserializeText(buffer, invert, r.getTextValue());
        r.enforceMaxLength(getCharacterMaxLength(type));
        return r;
      }
      case VARCHAR: {
        HiveVarcharWritable r = reuse == null ? new HiveVarcharWritable()
            : (HiveVarcharWritable) reuse;
        // Use HiveVarchar's internal Text member to read the value.
        deserializeText(buffer, invert, r.getTextValue());
        // If we cache helper data for deserialization we could avoid having
        // to call getVarcharMaxLength() on every deserialize call.
        r.enforceMaxLength(getCharacterMaxLength(type));
        return r;
      }
      case BINARY: {
        BytesWritable bw = new BytesWritable();
        // Get the actual length first
        int start = buffer.tell();
        int length = 0;
        do {
          byte b = buffer.read(invert);
          if (b == 0) {
            // end of string
            break;
          }
          if (b == 1) {
            // the last char is an escape char; read the actual char
            buffer.read(invert);
          }
          length++;
        } while (true);

        if (length == buffer.tell() - start) {
          // No escaping happened, so we are already done.
          bw.set(buffer.getData(), start, length);
        } else {
          // Escaping happened, we need to copy byte-by-byte.
          // 1. Set the length first.
          bw.set(buffer.getData(), start, length);
          // 2. Reset the pointer.
          buffer.seek(start);
          // 3. Copy the data.
          byte[] rdata = bw.getBytes();
          for (int i = 0; i < length; i++) {
            byte b = buffer.read(invert);
            if (b == 1) {
              // The last char is an escape char, read the actual char.
              // The serialization format escapes \0 to \1, and \1 to \2,
              // to make sure the string is null-terminated.
              b = (byte) (buffer.read(invert) - 1);
            }
            rdata[i] = b;
          }
          // 4. Read the null terminator.
          byte b = buffer.read(invert);
          assert (b == 0);
        }
        return bw;
      }
      case DATE: {
        DateWritable d = reuse == null ? new DateWritable()
            : (DateWritable) reuse;
        d.set(deserializeInt(buffer, invert));
        return d;
      }
      case TIMESTAMP:
        TimestampWritable t = (reuse == null ? new TimestampWritable()
            : (TimestampWritable) reuse);
        byte[] bytes = new byte[TimestampWritable.BINARY_SORTABLE_LENGTH];
        for (int i = 0; i < bytes.length; i++) {
          bytes[i] = buffer.read(invert);
        }
        t.setBinarySortable(bytes, 0);
        return t;
      case TIMESTAMPTZ:
        TimestampTZWritable tstz = (reuse == null ? new TimestampTZWritable()
            : (TimestampTZWritable) reuse);
        byte[] data = new byte[TimestampTZWritable.BINARY_SORTABLE_LENGTH];
        for (int i = 0; i < data.length; i++) {
          data[i] = buffer.read(invert);
        }
        tstz.fromBinarySortable(data, 0);
        return tstz;
      case INTERVAL_YEAR_MONTH: {
        HiveIntervalYearMonthWritable i = reuse == null ? new HiveIntervalYearMonthWritable()
            : (HiveIntervalYearMonthWritable) reuse;
        i.set(deserializeInt(buffer, invert));
        return i;
      }
      case INTERVAL_DAY_TIME: {
        HiveIntervalDayTimeWritable i = reuse == null ? new HiveIntervalDayTimeWritable()
            : (HiveIntervalDayTimeWritable) reuse;
        long totalSecs = deserializeLong(buffer, invert);
        int nanos = deserializeInt(buffer, invert);
        i.set(totalSecs, nanos);
        return i;
      }
      case DECIMAL: {
        // See serialization of decimal for explanation (below)
        HiveDecimalWritable bdw = (reuse == null ? new HiveDecimalWritable()
            : (HiveDecimalWritable) reuse);

        int b = buffer.read(invert) - 1;
        assert (b == 1 || b == -1 || b == 0);
        boolean positive = b != -1;

        int factor = buffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          factor = (factor << 8) + (buffer.read(invert) & 0xff);
        }

        if (!positive) {
          factor = -factor;
        }

        int start = buffer.tell();
        int length = 0;

        do {
          b = buffer.read(positive ? invert : !invert);
          assert (b != 1);

          if (b == 0) {
            // end of digits
            break;
          }

          length++;
        } while (true);

        final byte[] decimalBuffer = new byte[length];

        buffer.seek(start);
        for (int i = 0; i < length; ++i) {
          decimalBuffer[i] = buffer.read(positive ? invert : !invert);
        }

        // read the null byte again
        buffer.read(positive ? invert : !invert);

        String digits = new String(decimalBuffer, 0, length, decimalCharSet);
        BigInteger bi = new BigInteger(digits);
        HiveDecimal bd = HiveDecimal.create(bi).scaleByPowerOfTen(factor - length);

        if (!positive) {
          bd = bd.negate();
        }

        bdw.set(bd);
        return bdw;
      }
      default: {
        throw new RuntimeException("Unrecognized type: "
            + ptype.getPrimitiveCategory());
      }
      }
    }

    case LIST: {
      ListTypeInfo ltype = (ListTypeInfo) type;
      TypeInfo etype = ltype.getListElementTypeInfo();

      // Create the list if needed
      ArrayList<Object> r = reuse == null ? new ArrayList<Object>()
          : (ArrayList<Object>) reuse;

      // Read the list
      int size = 0;
      while (true) {
        int more = buffer.read(invert);
        if (more == 0) {
          // \0 to terminate
          break;
        }
        // \1 followed by each element
        assert (more == 1);
        if (size == r.size()) {
          r.add(null);
        }
        r.set(size, deserialize(buffer, etype, invert, nullMarker,
            notNullMarker, r.get(size)));
        size++;
      }
      // Remove additional elements if the list is reused
      while (r.size() > size) {
        r.remove(r.size() - 1);
      }
      return r;
    }
    case MAP: {
      MapTypeInfo mtype = (MapTypeInfo) type;
      TypeInfo ktype = mtype.getMapKeyTypeInfo();
      TypeInfo vtype = mtype.getMapValueTypeInfo();

      // Create the map if needed
      Map<Object, Object> r;
      if (reuse == null) {
        r = new HashMap<Object, Object>();
      } else {
        r = (HashMap<Object, Object>) reuse;
        r.clear();
      }

      while (true) {
        int more = buffer.read(invert);
        if (more == 0) {
          // \0 to terminate
          break;
        }
        // \1 followed by each key and then each value
        assert (more == 1);
        Object k = deserialize(buffer, ktype, invert, nullMarker, notNullMarker, null);
        Object v = deserialize(buffer, vtype, invert, nullMarker, notNullMarker, null);
        r.put(k, v);
      }
      return r;
    }
    case STRUCT: {
      StructTypeInfo stype = (StructTypeInfo) type;
      List<TypeInfo> fieldTypes = stype.getAllStructFieldTypeInfos();
      int size = fieldTypes.size();
      // Create the struct if needed
      ArrayList<Object> r = reuse == null ? new ArrayList<Object>(size)
          : (ArrayList<Object>) reuse;
      assert (r.size() <= size);
      // Set the size of the struct
      while (r.size() < size) {
        r.add(null);
      }
      // Read one field after another
      for (int eid = 0; eid < size; eid++) {
        r.set(eid, deserialize(buffer, fieldTypes.get(eid), invert,
            nullMarker, notNullMarker, r.get(eid)));
      }
      return r;
    }
    case UNION: {
      UnionTypeInfo utype = (UnionTypeInfo) type;
      StandardUnion r = reuse == null ? new StandardUnion()
          : (StandardUnion) reuse;
      // Read the tag
      byte tag = buffer.read(invert);
      r.setTag(tag);
      r.setObject(deserialize(buffer, utype.getAllUnionObjectTypeInfos().get(tag),
          invert, nullMarker, notNullMarker, null));
      return r;
    }
    default: {
      throw new RuntimeException("Unrecognized type: " + type.getCategory());
    }
    }
  }
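
  // Illustrative sketch, not part of the original class: the FLOAT/DOUBLE bit
  // transforms above are order-preserving, so the serialized bytes compare in
  // numeric order. This hypothetical check demonstrates the float variant for
  // non-NaN inputs.
  private static boolean floatOrderIsPreserved(float a, float b) {
    int ia = Float.floatToIntBits(a);
    int ib = Float.floatToIntBits(b);
    // Same transform as serializeFloat: negative values flip all bits,
    // positive values flip only the sign bit.
    ia = (ia & (1 << 31)) != 0 ? ~ia : ia ^ (1 << 31);
    ib = (ib & (1 << 31)) != 0 ? ~ib : ib ^ (1 << 31);
    // Unsigned comparison of the transformed bits matches float ordering.
    return Integer.signum(Integer.compareUnsigned(ia, ib))
        == Integer.signum(Float.compare(a, b));
  }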

  private static int deserializeInt(InputByteBuffer buffer, boolean invert)
      throws IOException {
    int v = buffer.read(invert) ^ 0x80;
    for (int i = 0; i < 3; i++) {
      v = (v << 8) + (buffer.read(invert) & 0xff);
    }
    return v;
  }

  private static long deserializeLong(InputByteBuffer buffer, boolean invert)
      throws IOException {
    long v = buffer.read(invert) ^ 0x80;
    for (int i = 0; i < 7; i++) {
      v = (v << 8) + (buffer.read(invert) & 0xff);
    }
    return v;
  }

  static int getCharacterMaxLength(TypeInfo type) {
    return ((BaseCharTypeInfo) type).getLength();
  }

  public static Text deserializeText(InputByteBuffer buffer, boolean invert, Text r)
      throws IOException {
    // Get the actual length first
    int start = buffer.tell();
    int length = 0;
    do {
      byte b = buffer.read(invert);
      if (b == 0) {
        // end of string
        break;
      }
      if (b == 1) {
        // the last char is an escape char; read the actual char
        buffer.read(invert);
      }
      length++;
    } while (true);

    if (length == buffer.tell() - start) {
      // No escaping happened, so we are already done.
      r.set(buffer.getData(), start, length);
    } else {
      // Escaping happened, we need to copy byte-by-byte.
      // 1. Set the length first.
      r.set(buffer.getData(), start, length);
      // 2. Reset the pointer.
      buffer.seek(start);
      // 3. Copy the data.
      byte[] rdata = r.getBytes();
      for (int i = 0; i < length; i++) {
        byte b = buffer.read(invert);
        if (b == 1) {
          // The last char is an escape char, read the actual char.
          // The serialization format escapes \0 to \1, and \1 to \2,
          // to make sure the string is null-terminated.
          b = (byte) (buffer.read(invert) - 1);
        }
        rdata[i] = b;
      }
      // 4. Read the null terminator.
      byte b = buffer.read(invert);
      assert (b == 0);
    }
    return r;
  }
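
  // Illustrative sketch, not part of the original class: the escaping that
  // deserializeText undoes maps \0 -> \1 \1 and \1 -> \1 \2 on the write
  // side, so the \0 terminator can never occur inside the payload. For the
  // input bytes {'a', 0, 1} (uninverted), the wire image is
  // {'a', 1, 1, 1, 2, 0}. Hypothetical helper:
  private static byte[] escapingDemo() {
    ByteStream.Output out = new ByteStream.Output();
    serializeBytes(out, new byte[] {'a', 0, 1}, 3, false);
    return Arrays.copyOf(out.getData(), out.getLength());
  }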

  BytesWritable serializeBytesWritable = new BytesWritable();
  ByteStream.Output output = new ByteStream.Output();

  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector)
      throws SerDeException {
    output.reset();
    StructObjectInspector soi = (StructObjectInspector) objInspector;
    List<? extends StructField> fields = soi.getAllStructFieldRefs();

    for (int i = 0; i < columnNames.size(); i++) {
      serialize(output, soi.getStructFieldData(obj, fields.get(i)),
          fields.get(i).getFieldObjectInspector(), columnSortOrderIsDesc[i],
          columnNullMarker[i], columnNotNullMarker[i]);
    }

    serializeBytesWritable.set(output.getData(), 0, output.getLength());
    return serializeBytesWritable;
  }

  public static void writeByte(RandomAccessOutput buffer, byte b, boolean invert) {
    if (invert) {
      b = (byte) (0xff ^ b);
    }
    buffer.write(b);
  }
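
  // Illustrative sketch, not part of the original class: with invert=true,
  // writeByte XORs every byte with 0xff, which reverses unsigned byte order;
  // this is how "-" (descending) columns are encoded. Hypothetical helper:
  private static byte invertedByteDemo(byte b) {
    ByteStream.Output out = new ByteStream.Output();
    writeByte(out, b, true);
    return out.getData()[0]; // equals (byte) (0xff ^ b)
  }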

  static void serialize(ByteStream.Output buffer, Object o, ObjectInspector oi,
      boolean invert, byte nullMarker, byte notNullMarker) throws SerDeException {
    // Is this field a null?
    if (o == null) {
      writeByte(buffer, nullMarker, invert);
      return;
    }
    // This field is not a null.
    writeByte(buffer, notNullMarker, invert);

    switch (oi.getCategory()) {
    case PRIMITIVE: {
      PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
      switch (poi.getPrimitiveCategory()) {
      case VOID: {
        return;
      }
      case BOOLEAN: {
        boolean v = ((BooleanObjectInspector) poi).get(o);
        writeByte(buffer, (byte) (v ? 2 : 1), invert);
        return;
      }
      case BYTE: {
        ByteObjectInspector boi = (ByteObjectInspector) poi;
        byte v = boi.get(o);
        writeByte(buffer, (byte) (v ^ 0x80), invert);
        return;
      }
      case SHORT: {
        ShortObjectInspector spoi = (ShortObjectInspector) poi;
        short v = spoi.get(o);
        serializeShort(buffer, v, invert);
        return;
      }
      case INT: {
        IntObjectInspector ioi = (IntObjectInspector) poi;
        int v = ioi.get(o);
        serializeInt(buffer, v, invert);
        return;
      }
      case LONG: {
        LongObjectInspector loi = (LongObjectInspector) poi;
        long v = loi.get(o);
        serializeLong(buffer, v, invert);
        return;
      }
      case FLOAT: {
        FloatObjectInspector foi = (FloatObjectInspector) poi;
        serializeFloat(buffer, foi.get(o), invert);
        return;
      }
      case DOUBLE: {
        DoubleObjectInspector doi = (DoubleObjectInspector) poi;
        serializeDouble(buffer, doi.get(o), invert);
        return;
      }
      case STRING: {
        StringObjectInspector soi = (StringObjectInspector) poi;
        Text t = soi.getPrimitiveWritableObject(o);
        serializeBytes(buffer, t.getBytes(), t.getLength(), invert);
        return;
      }
      case CHAR: {
        HiveCharObjectInspector hcoi = (HiveCharObjectInspector) poi;
        HiveCharWritable hc = hcoi.getPrimitiveWritableObject(o);
        // Trailing space should be ignored for char comparisons.
        // So write stripped values for this SerDe.
        Text t = hc.getStrippedValue();
        serializeBytes(buffer, t.getBytes(), t.getLength(), invert);
        return;
      }
      case VARCHAR: {
        HiveVarcharObjectInspector hcoi = (HiveVarcharObjectInspector) poi;
        HiveVarcharWritable hc = hcoi.getPrimitiveWritableObject(o);
        // use varchar's text field directly
        Text t = hc.getTextValue();
        serializeBytes(buffer, t.getBytes(), t.getLength(), invert);
        return;
      }
      case BINARY: {
        BinaryObjectInspector baoi = (BinaryObjectInspector) poi;
        BytesWritable ba = baoi.getPrimitiveWritableObject(o);
        byte[] toSer = new byte[ba.getLength()];
        System.arraycopy(ba.getBytes(), 0, toSer, 0, ba.getLength());
        serializeBytes(buffer, toSer, ba.getLength(), invert);
        return;
      }
      case DATE: {
        DateObjectInspector doi = (DateObjectInspector) poi;
        int v = doi.getPrimitiveWritableObject(o).getDays();
        serializeInt(buffer, v, invert);
        return;
      }
      case TIMESTAMP: {
        TimestampObjectInspector toi = (TimestampObjectInspector) poi;
        TimestampWritable t = toi.getPrimitiveWritableObject(o);
        serializeTimestampWritable(buffer, t, invert);
        return;
      }
      case TIMESTAMPTZ: {
        TimestampTZObjectInspector toi = (TimestampTZObjectInspector) poi;
        TimestampTZWritable t = toi.getPrimitiveWritableObject(o);
        serializeTimestampTZWritable(buffer, t, invert);
        return;
      }
      case INTERVAL_YEAR_MONTH: {
        HiveIntervalYearMonthObjectInspector ioi = (HiveIntervalYearMonthObjectInspector) poi;
        HiveIntervalYearMonth intervalYearMonth = ioi.getPrimitiveJavaObject(o);
        serializeHiveIntervalYearMonth(buffer, intervalYearMonth, invert);
        return;
      }
      case INTERVAL_DAY_TIME: {
        HiveIntervalDayTimeObjectInspector ioi = (HiveIntervalDayTimeObjectInspector) poi;
        HiveIntervalDayTime intervalDayTime = ioi.getPrimitiveJavaObject(o);
        serializeHiveIntervalDayTime(buffer, intervalDayTime, invert);
        return;
      }
      case DECIMAL: {
        HiveDecimalObjectInspector boi = (HiveDecimalObjectInspector) poi;
        HiveDecimal dec = boi.getPrimitiveJavaObject(o);
        serializeHiveDecimal(buffer, dec, invert);
        return;
      }
      default: {
        throw new RuntimeException("Unrecognized type: "
            + poi.getPrimitiveCategory());
      }
      }
    }
    case LIST: {
      ListObjectInspector loi = (ListObjectInspector) oi;
      ObjectInspector eoi = loi.getListElementObjectInspector();

      // \1 followed by each element
      int size = loi.getListLength(o);
      for (int eid = 0; eid < size; eid++) {
        writeByte(buffer, (byte) 1, invert);
        serialize(buffer, loi.getListElement(o, eid), eoi, invert,
            nullMarker, notNullMarker);
      }
      // and \0 to terminate
      writeByte(buffer, (byte) 0, invert);
      return;
    }
    case MAP: {
      MapObjectInspector moi = (MapObjectInspector) oi;
      ObjectInspector koi = moi.getMapKeyObjectInspector();
      ObjectInspector voi = moi.getMapValueObjectInspector();

      // \1 followed by each key and then each value
      Map<?, ?> map = moi.getMap(o);
      for (Map.Entry<?, ?> entry : map.entrySet()) {
        writeByte(buffer, (byte) 1, invert);
        serialize(buffer, entry.getKey(), koi, invert, nullMarker, notNullMarker);
        serialize(buffer, entry.getValue(), voi, invert, nullMarker, notNullMarker);
      }
      // and \0 to terminate
      writeByte(buffer, (byte) 0, invert);
      return;
    }
    case STRUCT: {
      StructObjectInspector soi = (StructObjectInspector) oi;
      List<? extends StructField> fields = soi.getAllStructFieldRefs();

      for (int i = 0; i < fields.size(); i++) {
        serialize(buffer, soi.getStructFieldData(o, fields.get(i)),
            fields.get(i).getFieldObjectInspector(), invert,
            nullMarker, notNullMarker);
      }
      return;
    }
    case UNION: {
      UnionObjectInspector uoi = (UnionObjectInspector) oi;
      byte tag = uoi.getTag(o);
      writeByte(buffer, tag, invert);
      serialize(buffer, uoi.getField(o), uoi.getObjectInspectors().get(tag),
          invert, nullMarker, notNullMarker);
      return;
    }
    default: {
      throw new RuntimeException("Unrecognized type: " + oi.getCategory());
    }
    }
  }
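
  // Illustrative usage sketch, not part of the original class: configuring
  // the SerDe for a two-column schema with an ascending and a descending
  // column. The column names and order string are example values.
  private static BinarySortableSerDe exampleSerDe() throws SerDeException {
    Properties tbl = new Properties();
    tbl.setProperty(serdeConstants.LIST_COLUMNS, "id,name");
    tbl.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,string");
    tbl.setProperty(serdeConstants.SERIALIZATION_SORT_ORDER, "+-");
    BinarySortableSerDe serde = new BinarySortableSerDe();
    serde.initialize(new Configuration(), tbl);
    return serde;
  }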

  public static void serializeBytes(
      ByteStream.Output buffer, byte[] data, int length, boolean invert) {
    for (int i = 0; i < length; i++) {
      if (data[i] == 0 || data[i] == 1) {
        writeByte(buffer, (byte) 1, invert);
        writeByte(buffer, (byte) (data[i] + 1), invert);
      } else {
        writeByte(buffer, data[i], invert);
      }
    }
    writeByte(buffer, (byte) 0, invert);
  }

  public static void serializeBytes(
      ByteStream.Output buffer, byte[] data, int offset, int length, boolean invert) {
    for (int i = offset; i < offset + length; i++) {
      if (data[i] == 0 || data[i] == 1) {
        writeByte(buffer, (byte) 1, invert);
        writeByte(buffer, (byte) (data[i] + 1), invert);
      } else {
        writeByte(buffer, data[i], invert);
      }
    }
    writeByte(buffer, (byte) 0, invert);
  }

  public static void serializeShort(ByteStream.Output buffer, short v, boolean invert) {
    writeByte(buffer, (byte) ((v >> 8) ^ 0x80), invert);
    writeByte(buffer, (byte) v, invert);
  }

  public static void serializeInt(ByteStream.Output buffer, int v, boolean invert) {
    writeByte(buffer, (byte) ((v >> 24) ^ 0x80), invert);
    writeByte(buffer, (byte) (v >> 16), invert);
    writeByte(buffer, (byte) (v >> 8), invert);
    writeByte(buffer, (byte) v, invert);
  }

  public static void serializeLong(ByteStream.Output buffer, long v, boolean invert) {
    writeByte(buffer, (byte) ((v >> 56) ^ 0x80), invert);
    writeByte(buffer, (byte) (v >> 48), invert);
    writeByte(buffer, (byte) (v >> 40), invert);
    writeByte(buffer, (byte) (v >> 32), invert);
    writeByte(buffer, (byte) (v >> 24), invert);
    writeByte(buffer, (byte) (v >> 16), invert);
    writeByte(buffer, (byte) (v >> 8), invert);
    writeByte(buffer, (byte) v, invert);
  }

  public static void serializeFloat(ByteStream.Output buffer, float vf, boolean invert) {
    int v = Float.floatToIntBits(vf);
    if ((v & (1 << 31)) != 0) {
      // negative number, flip all bits
      v = ~v;
    } else {
      // positive number, flip the first bit
      v = v ^ (1 << 31);
    }
    writeByte(buffer, (byte) (v >> 24), invert);
    writeByte(buffer, (byte) (v >> 16), invert);
    writeByte(buffer, (byte) (v >> 8), invert);
    writeByte(buffer, (byte) v, invert);
  }

  public static void serializeDouble(ByteStream.Output buffer, double vd, boolean invert) {
    long v = Double.doubleToLongBits(vd);
    if ((v & (1L << 63)) != 0) {
      // negative number, flip all bits
      v = ~v;
    } else {
      // positive number, flip the first bit
      v = v ^ (1L << 63);
    }
    writeByte(buffer, (byte) (v >> 56), invert);
    writeByte(buffer, (byte) (v >> 48), invert);
    writeByte(buffer, (byte) (v >> 40), invert);
    writeByte(buffer, (byte) (v >> 32), invert);
    writeByte(buffer, (byte) (v >> 24), invert);
    writeByte(buffer, (byte) (v >> 16), invert);
    writeByte(buffer, (byte) (v >> 8), invert);
    writeByte(buffer, (byte) v, invert);
  }

  public static void serializeTimestampWritable(ByteStream.Output buffer,
      TimestampWritable t, boolean invert) {
    byte[] data = t.getBinarySortable();
    for (int i = 0; i < data.length; i++) {
      writeByte(buffer, data[i], invert);
    }
  }

  public static void serializeTimestampTZWritable(ByteStream.Output buffer,
      TimestampTZWritable t, boolean invert) {
    byte[] data = t.toBinarySortable();
    for (byte b : data) {
      writeByte(buffer, b, invert);
    }
  }
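
  // Illustrative sketch, not part of the original class: flipping the sign
  // bit makes the wire bytes compare in numeric order when treated as
  // unsigned bytes; e.g. (uninverted) -1 serializes with a 0x7f lead byte
  // and 0 with a 0x80 lead byte. This hypothetical round trip uses the
  // serializers above together with the private deserializeInt.
  private static int intRoundTripDemo(int v) throws IOException {
    ByteStream.Output out = new ByteStream.Output();
    serializeInt(out, v, false);
    InputByteBuffer in = new InputByteBuffer();
    in.reset(out.getData(), 0, out.getLength());
    return deserializeInt(in, false); // returns v
  }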

  public static void serializeHiveIntervalYearMonth(ByteStream.Output buffer,
      HiveIntervalYearMonth intervalYearMonth, boolean invert) {
    int totalMonths = intervalYearMonth.getTotalMonths();
    serializeInt(buffer, totalMonths, invert);
  }

  public static void serializeHiveIntervalDayTime(ByteStream.Output buffer,
      HiveIntervalDayTime intervalDayTime, boolean invert) {
    long totalSecs = intervalDayTime.getTotalSeconds();
    int nanos = intervalDayTime.getNanos();
    serializeLong(buffer, totalSecs, invert);
    serializeInt(buffer, nanos, invert);
  }

  public static void serializeOldHiveDecimal(ByteStream.Output buffer,
      HiveDecimalV1 oldDec, boolean invert) {
    // get the sign of the big decimal
    int sign = oldDec.compareTo(HiveDecimalV1.ZERO);

    // we'll encode the absolute value (sign is separate)
    oldDec = oldDec.abs();

    // get the scale factor to turn the big decimal into a decimal < 1
    // This relies on the BigDecimal precision value, which as of HIVE-10270
    // is now different from HiveDecimal.precision()
    int factor = oldDec.bigDecimalValue().precision() - oldDec.bigDecimalValue().scale();
    factor = sign == 1 ? factor : -factor;

    // convert the absolute big decimal to string
    oldDec.scaleByPowerOfTen(Math.abs(oldDec.scale()));
    String digits = oldDec.unscaledValue().toString();

    // finally write out the pieces (sign, scale, digits)
    writeByte(buffer, (byte) ( sign + 1), invert);
    writeByte(buffer, (byte) ((factor >> 24) ^ 0x80), invert);
    writeByte(buffer, (byte) ( factor >> 16), invert);
    writeByte(buffer, (byte) ( factor >> 8), invert);
    writeByte(buffer, (byte)   factor, invert);
    serializeBytes(buffer, digits.getBytes(decimalCharSet), digits.length(),
        sign == -1 ? !invert : invert);
  }

  // See comments for the next method.
  public static void serializeHiveDecimal(ByteStream.Output buffer, HiveDecimal dec,
      boolean invert) {
    byte[] scratchBuffer = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
    serializeHiveDecimal(buffer, dec, invert, scratchBuffer);
  }

  /**
   * Decimals are encoded in three pieces:
   *
   * Sign:   0, 1 or 2 for smaller than, equal to, or larger than zero,
   *         respectively (the stored byte is signum + 1).
   * Factor: a number indicating how many digits the decimal point must be
   *         moved left or right until the resulting number is smaller than
   *         one but has something other than 0 as its first decimal digit.
   * Digits: a string of all the digits in the decimal. If the number is
   *         negative, the binary string is inverted to get the correct
   *         ordering.
   *
   * Example: 0.00123
   * Sign is 2 (bigger than 0)
   * Factor is -2 (move the decimal point 2 positions to the right to get 0.123)
   * Digits are: 123
   *
   * @param buffer
   * @param dec
   * @param invert
   * @param scratchBuffer
   */
  public static void serializeHiveDecimal(ByteStream.Output buffer, HiveDecimal dec,
      boolean invert, byte[] scratchBuffer) {
    // Get the sign of the decimal.
    int signum = dec.signum();

    // Get the 10^N power to turn digits into the desired decimal with a
    // possible fractional part.
    // To be compatible with the OldHiveDecimal version, zero has factor 1.
    int factor;
    if (signum == 0) {
      factor = 1;
    } else {
      factor = dec.rawPrecision() - dec.scale();
    }

    // To make comparisons work properly, the "factor" gets the decimal's sign, too.
    factor = signum == 1 ? factor : -factor;

    // Convert just the decimal digits (no dot, sign, etc) into bytes.
    //
    // This is much faster than converting the BigInteger value from
    // unscaledValue(), which is no longer part of the HiveDecimal
    // representation, to a string and then to bytes.
    int index = dec.toDigitsOnlyBytes(scratchBuffer);

    /*
     * Finally write out the pieces (sign, power, digits)
     */
    writeByte(buffer, (byte) ( signum + 1), invert);
    writeByte(buffer, (byte) ((factor >> 24) ^ 0x80), invert);
    writeByte(buffer, (byte) ( factor >> 16), invert);
    writeByte(buffer, (byte) ( factor >> 8), invert);
    writeByte(buffer, (byte)   factor, invert);

    // toDigitsOnlyBytes stores the digits at the end of the scratch buffer.
    serializeBytes(
        buffer,
        scratchBuffer, index, scratchBuffer.length - index,
        signum == -1 ? !invert : invert);
  }
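
  // Illustrative worked example, not part of the original class: encoding
  // 0.00123 per the Javadoc above. signum = 1, rawPrecision = 3, scale = 5,
  // so factor = 3 - 5 = -2 and the digits are "123"; the wire image is the
  // sign byte 2, the 4-byte factor with its top bit flipped, the escaped
  // digits, and a \0 terminator. Hypothetical helper:
  private static byte[] decimalEncodingDemo() {
    ByteStream.Output out = new ByteStream.Output();
    byte[] scratch = new byte[HiveDecimal.SCRATCH_BUFFER_LEN_TO_BYTES];
    serializeHiveDecimal(out, HiveDecimal.create("0.00123"), false, scratch);
    return Arrays.copyOf(out.getData(), out.getLength());
  }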

  // A HiveDecimalWritable version.
  public static void serializeHiveDecimal(ByteStream.Output buffer,
      HiveDecimalWritable decWritable, boolean invert, byte[] scratchBuffer) {
    // Get the sign of the decimal.
    int signum = decWritable.signum();

    // Get the 10^N power to turn digits into the desired decimal with a
    // possible fractional part.
    // To be compatible with the OldHiveDecimal version, zero has factor 1.
    int factor;
    if (signum == 0) {
      factor = 1;
    } else {
      factor = decWritable.rawPrecision() - decWritable.scale();
    }

    // To make comparisons work properly, the "factor" gets the decimal's sign, too.
    factor = signum == 1 ? factor : -factor;

    // Convert just the decimal digits (no dot, sign, etc) into bytes.
    //
    // This is much faster than converting the BigInteger value from
    // unscaledValue(), which is no longer part of the HiveDecimal
    // representation, to a string and then to bytes.
    int index = decWritable.toDigitsOnlyBytes(scratchBuffer);

    /*
     * Finally write out the pieces (sign, power, digits)
     */
    writeByte(buffer, (byte) ( signum + 1), invert);
    writeByte(buffer, (byte) ((factor >> 24) ^ 0x80), invert);
    writeByte(buffer, (byte) ( factor >> 16), invert);
    writeByte(buffer, (byte) ( factor >> 8), invert);
    writeByte(buffer, (byte)   factor, invert);

    // toDigitsOnlyBytes stores the digits at the end of the scratch buffer.
    serializeBytes(
        buffer,
        scratchBuffer, index, scratchBuffer.length - index,
        signum == -1 ? !invert : invert);
  }

  @Override
  public SerDeStats getSerDeStats() {
    // no support for statistics
    return null;
  }

  public static void serializeStruct(Output byteStream, Object[] fieldData,
      List<ObjectInspector> fieldOis, boolean[] sortableSortOrders,
      byte[] nullMarkers, byte[] notNullMarkers) throws SerDeException {
    for (int i = 0; i < fieldData.length; i++) {
      serialize(byteStream, fieldData[i], fieldOis.get(i), sortableSortOrders[i],
          nullMarkers[i], notNullMarkers[i]);
    }
  }

  public boolean[] getSortOrders() {
    return columnSortOrderIsDesc;
  }

  public byte[] getNullMarkers() {
    return columnNullMarker;
  }

  public byte[] getNotNullMarkers() {
    return columnNotNullMarker;
  }
}