/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.binarysortable.fast;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
import org.apache.hadoop.hive.serde2.binarysortable.InputByteBuffer;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

/*
 * Directly deserialize with the caller reading field-by-field the binary sortable serialization
 * format.
 *
 * The caller is responsible for calling the read method for the right type of each field
 * (after calling readNextField).
 *
 * Reading some fields requires a results object to receive value information.  A separate
 * results object is created by the caller at initialization per different field, even for the
 * same type.
 *
 * Some field values are returned by reference, either to bytes in the deserialization buffer or
 * to other type-specific buffers.  Those references are only valid until the next time set is
 * called.
 */
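// Illustrative sketch (not part of the original source): one way a caller might drive the
// field-by-field protocol described above, assuming the first column is an INT.  The typeInfos,
// rowBytes, and rowLength names below are placeholders supplied by the caller.
//
//   BinarySortableDeserializeRead deserializeRead =
//       new BinarySortableDeserializeRead(typeInfos, false /* useExternalBuffer */);
//   deserializeRead.set(rowBytes, 0, rowLength);
//   if (deserializeRead.readNextField()) {
//     // Field #0 was not NULL; its value is now in the current* member for its type.
//     int firstColumn = deserializeRead.currentInt;
//   } else {
//     // Field #0 was NULL.
//   }
//   // Columns that are not needed can be passed over with skipNextField().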
public final class BinarySortableDeserializeRead extends DeserializeRead {
  public static final Logger LOG = LoggerFactory.getLogger(BinarySortableDeserializeRead.class.getName());

  // The sort order (ascending/descending) for each field. Set to true when descending (invert).
  private boolean[] columnSortOrderIsDesc;

  byte[] columnNullMarker;
  byte[] columnNotNullMarker;

  private int start;
  private int end;
  private int fieldStart;

  private int bytesStart;

  private int internalBufferLen;
  private byte[] internalBuffer;

  private byte[] tempTimestampBytes;

  private byte[] tempDecimalBuffer;

  private InputByteBuffer inputByteBuffer = new InputByteBuffer();

  private Field root;
  private Deque<Field> stack;

  private class Field {
    Field[] children;

    Category category;
    PrimitiveObjectInspector.PrimitiveCategory primitiveCategory;
    TypeInfo typeInfo;

    int index;
    int count;
    int start;
    int tag;
  }

  /*
   * Use this constructor when only ascending sort order is used.
   */
  public BinarySortableDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer) {
    this(typeInfos, useExternalBuffer, null, null, null);
  }

  public BinarySortableDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer,
      boolean[] columnSortOrderIsDesc, byte[] columnNullMarker, byte[] columnNotNullMarker) {
    super(typeInfos, useExternalBuffer);
    final int count = typeInfos.length;

    root = new Field();
    root.category = Category.STRUCT;
    root.children = createFields(typeInfos);
    root.count = count;
    stack = new ArrayDeque<>();

    if (columnSortOrderIsDesc != null) {
      this.columnSortOrderIsDesc = columnSortOrderIsDesc;
    } else {
      this.columnSortOrderIsDesc = new boolean[count];
      Arrays.fill(this.columnSortOrderIsDesc, false);
    }
    if (columnNullMarker != null) {
      this.columnNullMarker = columnNullMarker;
      this.columnNotNullMarker = columnNotNullMarker;
    } else {
      this.columnNullMarker = new byte[count];
      this.columnNotNullMarker = new byte[count];
      for (int i = 0; i < count; i++) {
        if (this.columnSortOrderIsDesc[i]) {
          // Descending
          // Null last (default for descending order)
          this.columnNullMarker[i] = BinarySortableSerDe.ZERO;
          this.columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        } else {
          // Ascending
          // Null first (default for ascending order)
          this.columnNullMarker[i] = BinarySortableSerDe.ZERO;
          this.columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        }
      }
    }

    inputByteBuffer = new InputByteBuffer();
    internalBufferLen = -1;
  }

  // Not public since we must have column information.
  private BinarySortableDeserializeRead() {
    super();
  }

  /*
   * Set the range of bytes to be deserialized.
   */
  @Override
  public void set(byte[] bytes, int offset, int length) {
    start = offset;
    end = offset + length;
    inputByteBuffer.reset(bytes, start, end);
    root.index = -1;
    stack.clear();
    stack.push(root);
    clearIndex(root);
  }

  private void clearIndex(Field field) {
    field.index = -1;
    if (field.children == null) {
      return;
    }
    for (Field child : field.children) {
      clearIndex(child);
    }
  }

  /*
   * Get detailed read position information to help diagnose exceptions.
   */
  public String getDetailedReadPositionString() {
    StringBuffer sb = new StringBuffer();

    sb.append("Reading inputByteBuffer of length ");
    sb.append(inputByteBuffer.getEnd());
    sb.append(" at start offset ");
    sb.append(start);
    sb.append(" for length ");
    sb.append(end - start);
    sb.append(" to read ");
    sb.append(root.count);
    sb.append(" fields with types ");
    sb.append(Arrays.toString(typeInfos));
    sb.append(". ");
    if (root.index == -1) {
      sb.append("Before first field?");
    } else {
      sb.append("Read field #");
      sb.append(root.index);
      sb.append(" at field start position ");
      sb.append(fieldStart);
      sb.append(" current read offset ");
      sb.append(inputByteBuffer.tell());
    }
    sb.append(" column sort order ");
    sb.append(Arrays.toString(columnSortOrderIsDesc));
    // UNDONE: Convert byte 0 or 1 to character.
    sb.append(" column null marker ");
    sb.append(Arrays.toString(columnNullMarker));
    sb.append(" column non null marker ");
    sb.append(Arrays.toString(columnNotNullMarker));

    return sb.toString();
  }

  /*
   * Reads the next field.
   *
   * Afterwards, reading is positioned to the next field.
   *
   * @return  Returns true when the field was not NULL and the data has been put in the
   *          appropriate current* member.
   *          Otherwise, returns false when the field is NULL.
   */
  @Override
  public boolean readNextField() throws IOException {
    return readComplexField();
  }
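  /*
   * Illustrative note (not part of the original source): with the default ascending sort order
   * and the ZERO/ONE markers installed by the constructor above (assuming BinarySortableSerDe.ZERO
   * is 0 and BinarySortableSerDe.ONE is 1), each top-level field is laid out as one marker byte
   * (0 = NULL, 1 = not NULL) followed by the field's bytes.  For example, a two-column row
   * (INT 5, STRING "ab") would be expected to look like:
   *
   *   0x01  0x80 0x00 0x00 0x05      marker, then the INT with its sign bit flipped
   *   0x01  0x61 0x62 0x00           marker, then the STRING bytes and a 0x00 terminator
   *
   * isNull() consumes the marker byte; readPrimitive() below decodes the value portion.
   */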
  private boolean readPrimitive(Field field) throws IOException {
    final int fieldIndex = root.index;
    field.start = inputByteBuffer.tell();

    /*
     * We have a field and are positioned to it.  Read it.
     */
    switch (field.primitiveCategory) {
    case BOOLEAN:
      currentBoolean = (inputByteBuffer.read(columnSortOrderIsDesc[fieldIndex]) == 2);
      return true;
    case BYTE:
      currentByte = (byte) (inputByteBuffer.read(columnSortOrderIsDesc[fieldIndex]) ^ 0x80);
      return true;
    case SHORT:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        currentShort = (short) v;
      }
      return true;
    case INT:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentInt = v;
      }
      return true;
    case LONG:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        long v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 7; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentLong = v;
      }
      return true;
    case DATE:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentDateWritable.set(v);
      }
      return true;
    case TIMESTAMP:
      {
        if (tempTimestampBytes == null) {
          tempTimestampBytes = new byte[TimestampWritable.BINARY_SORTABLE_LENGTH];
        }
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        for (int i = 0; i < tempTimestampBytes.length; i++) {
          tempTimestampBytes[i] = inputByteBuffer.read(invert);
        }
        currentTimestampWritable.setBinarySortable(tempTimestampBytes, 0);
      }
      return true;
    case FLOAT:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = 0;
        for (int i = 0; i < 4; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        if ((v & (1 << 31)) == 0) {
          // negative number, flip all bits
          v = ~v;
        } else {
          // positive number, flip the first bit
          v = v ^ (1 << 31);
        }
        currentFloat = Float.intBitsToFloat(v);
      }
      return true;
    case DOUBLE:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        long v = 0;
        for (int i = 0; i < 8; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        if ((v & (1L << 63)) == 0) {
          // negative number, flip all bits
          v = ~v;
        } else {
          // positive number, flip the first bit
          v = v ^ (1L << 63);
        }
        currentDouble = Double.longBitsToDouble(v);
      }
      return true;
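    // Illustrative note (not part of the original source): the integer cases above undo
    // BinarySortableSerDe's sign-bit flip.  For example, with ascending sort order the INT
    // value 5 would be stored as the bytes {0x80, 0x00, 0x00, 0x05}: the first byte read is
    // XOR'ed with 0x80 and the remaining three bytes are shifted in unchanged.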
    case BINARY:
    case STRING:
    case CHAR:
    case VARCHAR:
      {
        /*
         * This code is a modified version of BinarySortableSerDe.deserializeText that lets us
         * detect whether we can return a reference to the bytes directly.
         */

        // Get the actual length first.
        bytesStart = inputByteBuffer.tell();
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int length = 0;
        do {
          byte b = inputByteBuffer.read(invert);
          if (b == 0) {
            // end of string
            break;
          }
          if (b == 1) {
            // the last char is an escape char. read the actual char
            inputByteBuffer.read(invert);
          }
          length++;
        } while (true);

        if (length == 0 ||
            (!invert && length == inputByteBuffer.tell() - bytesStart - 1)) {
          // No inversion or escaping happened, so we can reference the bytes directly.
          currentExternalBufferNeeded = false;
          currentBytes = inputByteBuffer.getData();
          currentBytesStart = bytesStart;
          currentBytesLength = length;
        } else {
          // We are now positioned at the end of this field's bytes.
          if (useExternalBuffer) {
            // If we decide not to reposition and re-read the buffer to copy it with
            // copyToExternalBuffer, we will still be correctly positioned for the next field.
            currentExternalBufferNeeded = true;
            currentExternalBufferNeededLen = length;
          } else {
            // The copyToBuffer will reposition and re-read the input buffer.
            currentExternalBufferNeeded = false;
            if (internalBufferLen < length) {
              internalBufferLen = length;
              internalBuffer = new byte[internalBufferLen];
            }
            copyToBuffer(internalBuffer, 0, length);
            currentBytes = internalBuffer;
            currentBytesStart = 0;
            currentBytesLength = length;
          }
        }
      }
      return true;
    case INTERVAL_YEAR_MONTH:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentHiveIntervalYearMonthWritable.set(v);
      }
      return true;
    case INTERVAL_DAY_TIME:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        long totalSecs = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 7; i++) {
          totalSecs = (totalSecs << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        int nanos = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          nanos = (nanos << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentHiveIntervalDayTimeWritable.set(totalSecs, nanos);
      }
      return true;
    case DECIMAL:
      {
        // Since enforcing precision and scale can cause a HiveDecimal to become NULL,
        // we must read it, enforce it here, and either return NULL or buffer the result.
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int b = inputByteBuffer.read(invert) - 1;
        if (!(b == 1 || b == -1 || b == 0)) {
          throw new IOException("Unexpected byte value " + (int) b +
              " in binary sortable format data (invert " + invert + ")");
        }
        final boolean positive = b != -1;

        int factor = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          factor = (factor << 8) + (inputByteBuffer.read(invert) & 0xff);
        }

        if (!positive) {
          factor = -factor;
        }

        final int decimalStart = inputByteBuffer.tell();
        int length = 0;

        do {
          b = inputByteBuffer.read(positive ? invert : !invert);
          if (b == 1) {
            throw new IOException("Expected -1 and found byte value " + (int) b +
                " in binary sortable format data (invert " + invert + ")");
          }

          if (b == 0) {
            // end of digits
            break;
          }

          length++;
        } while (true);

        // CONSIDER: Allocate a larger initial size.
        if (tempDecimalBuffer == null || tempDecimalBuffer.length < length) {
          tempDecimalBuffer = new byte[length];
        }

        inputByteBuffer.seek(decimalStart);
        for (int i = 0; i < length; ++i) {
          tempDecimalBuffer[i] = inputByteBuffer.read(positive ? invert : !invert);
        }

        // read the null byte again
        inputByteBuffer.read(positive ? invert : !invert);

        // Set the value of the writable from the decimal digits that were written with no dot.
        final int scale = length - factor;
        currentHiveDecimalWritable.setFromDigitsOnlyBytesWithScale(
            !positive, tempDecimalBuffer, 0, length, scale);

        boolean decimalIsNull = !currentHiveDecimalWritable.isSet();
        if (!decimalIsNull) {
          // We have a decimal.  After we enforce precision and scale, will it become a NULL?
          final DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) field.typeInfo;
          final int enforcePrecision = decimalTypeInfo.getPrecision();
          final int enforceScale = decimalTypeInfo.getScale();

          decimalIsNull = !currentHiveDecimalWritable.mutateEnforcePrecisionScale(
              enforcePrecision, enforceScale);
        }
        if (decimalIsNull) {
          return false;
        }
      }
      return true;
    default:
      throw new RuntimeException("Unexpected primitive type category " + field.primitiveCategory);
    }
  }
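  /*
   * Illustrative note (not part of the original source): in the DECIMAL case above, the digits
   * are read without a decimal point and "factor" acts as the power-of-ten exponent, so
   * scale = length - factor.  For example, a digit run of "12345" (length 5) read with factor 3
   * would be interpreted as 123.45 (scale 2).
   */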
  /*
   * Reads through an undesired field.
   *
   * No data values are valid after this call.
   * Designed for skipping columns that are not included.
   */
  public void skipNextField() throws IOException {
    final Field current = stack.peek();
    current.index++;

    if (root.index >= root.count) {
      return;
    }
    if (inputByteBuffer.isEof()) {
      // Also, reading beyond our byte range produces NULL.
      return;
    }

    if (current.category == Category.UNION && current.index == 0) {
      current.tag = inputByteBuffer.read();
      currentInt = current.tag;
      return;
    }

    final Field child = getChild(current);

    if (isNull()) {
      return;
    }
    if (child.category == Category.PRIMITIVE) {
      readPrimitive(child);
    } else {
      stack.push(child);
      switch (child.category) {
      case LIST:
      case MAP:
        while (isNextComplexMultiValue()) {
          skipNextField();
        }
        break;
      case STRUCT:
        for (int i = 0; i < child.count; i++) {
          skipNextField();
        }
        finishComplexVariableFieldsType();
        break;
      case UNION:
        readComplexField();
        skipNextField();
        finishComplexVariableFieldsType();
        break;
      }
    }
  }

  @Override
  public void copyToExternalBuffer(byte[] externalBuffer, int externalBufferStart) throws IOException {
    copyToBuffer(externalBuffer, externalBufferStart, currentExternalBufferNeededLen);
  }

  private void copyToBuffer(byte[] buffer, int bufferStart, int bufferLength) throws IOException {
    final boolean invert = columnSortOrderIsDesc[root.index];
    inputByteBuffer.seek(bytesStart);
    // Copy the data.
    for (int i = 0; i < bufferLength; i++) {
      byte b = inputByteBuffer.read(invert);
      if (b == 1) {
        // The last char is an escape char, read the actual char.
        // The serialization format escapes \0 to \1 and \1 to \2,
        // to make sure the string is null-terminated.
        b = (byte) (inputByteBuffer.read(invert) - 1);
      }
      buffer[bufferStart + i] = b;
    }
    // Read the null terminator.
    byte b = inputByteBuffer.read(invert);
    if (b != 0) {
      throw new RuntimeException("Expected 0 terminating byte");
    }
  }
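  // Illustrative sketch (not part of the original source): how a caller that constructed this
  // object with useExternalBuffer == true might consume a string field whose bytes could not be
  // referenced directly.  The externalBuffer array below is a placeholder owned by the caller.
  //
  //   if (deserializeRead.readNextField()) {
  //     if (deserializeRead.currentExternalBufferNeeded) {
  //       // Escaping or inversion occurred; copy the unescaped bytes out of the input buffer.
  //       deserializeRead.copyToExternalBuffer(externalBuffer, 0);
  //       // The value is externalBuffer[0 .. currentExternalBufferNeededLen - 1].
  //     } else {
  //       // The value can be referenced directly:
  //       // currentBytes[currentBytesStart .. currentBytesStart + currentBytesLength - 1].
  //     }
  //   }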
  /*
   * This method may be called after all fields have been read to check for unread fields.
   *
   * Note that when optimizing reading to stop reading unneeded include columns, worrying
   * about whether all data is consumed is not appropriate (often we aren't reading it all by
   * design).
   *
   * Since LazySimpleDeserializeRead parses the line through the last desired column, it does
   * support this function.
   */
  public boolean isEndOfInputReached() {
    return inputByteBuffer.isEof();
  }

  private Field[] createFields(TypeInfo[] typeInfos) {
    final Field[] children = new Field[typeInfos.length];
    for (int i = 0; i < typeInfos.length; i++) {
      children[i] = createField(typeInfos[i]);
    }
    return children;
  }

  private Field createField(TypeInfo typeInfo) {
    final Field field = new Field();
    final Category category = typeInfo.getCategory();
    field.category = category;
    field.typeInfo = typeInfo;

    switch (category) {
    case PRIMITIVE:
      field.primitiveCategory = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory();
      break;
    case LIST:
      field.children = new Field[1];
      field.children[0] = createField(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      break;
    case MAP:
      field.children = new Field[2];
      field.children[0] = createField(((MapTypeInfo) typeInfo).getMapKeyTypeInfo());
      field.children[1] = createField(((MapTypeInfo) typeInfo).getMapValueTypeInfo());
      break;
    case STRUCT:
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
      field.count = fieldTypeInfos.size();
      field.children = createFields(fieldTypeInfos.toArray(new TypeInfo[fieldTypeInfos.size()]));
      break;
    case UNION:
      UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
      List<TypeInfo> objectTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos();
      field.count = 2;
      field.children = createFields(objectTypeInfos.toArray(new TypeInfo[objectTypeInfos.size()]));
      break;
    default:
      throw new RuntimeException();
    }
    return field;
  }

  private Field getChild(Field field) {
    switch (field.category) {
    case LIST:
      return field.children[0];
    case MAP:
      return field.children[field.index % 2];
    case STRUCT:
      return field.children[field.index];
    case UNION:
      return field.children[field.tag];
    default:
      throw new RuntimeException();
    }
  }

  private boolean isNull() throws IOException {
    return inputByteBuffer.read(columnSortOrderIsDesc[root.index]) == columnNullMarker[root.index];
  }

  @Override
  public boolean readComplexField() throws IOException {
    final Field current = stack.peek();
    current.index++;

    if (root.index >= root.count) {
      return false;
    }
    if (inputByteBuffer.isEof()) {
      // Also, reading beyond our byte range produces NULL.
      return false;
    }

    if (current.category == Category.UNION) {
      if (current.index == 0) {
        current.tag = inputByteBuffer.read(columnSortOrderIsDesc[root.index]);
        currentInt = current.tag;
        return true;
      }
    }

    final Field child = getChild(current);
    boolean isNull = isNull();

    if (isNull) {
      return false;
    }
    if (child.category == Category.PRIMITIVE) {
      isNull = !readPrimitive(child);
    } else {
      stack.push(child);
    }
    return !isNull;
  }

  @Override
  public boolean isNextComplexMultiValue() throws IOException {
    final byte isNullByte = inputByteBuffer.read(columnSortOrderIsDesc[root.index]);
    final boolean isEnded;

    switch (isNullByte) {
    case 0:
      isEnded = true;
      break;
    case 1:
      isEnded = false;
      break;
    default:
      throw new RuntimeException();
    }

    if (isEnded) {
      stack.pop();
      stack.peek();
    }
    return !isEnded;
  }

  @Override
  public void finishComplexVariableFieldsType() {
    stack.pop();
    if (stack.peek() == null) {
      throw new RuntimeException();
    }
    stack.peek();
  }
}
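// Illustrative sketch (not part of the original source): how a caller might traverse a single
// LIST<INT> column using the complex-field protocol above, mirroring the structure of
// skipNextField().  In that method, LIST and MAP traversal ends when isNextComplexMultiValue()
// returns false (which pops the stack), while finishComplexVariableFieldsType() is called for
// STRUCT and UNION.
//
//   deserializeRead.set(rowBytes, 0, rowLength);
//   if (deserializeRead.readComplexField()) {
//     // The list itself was not NULL.
//     while (deserializeRead.isNextComplexMultiValue()) {
//       if (deserializeRead.readNextField()) {
//         int element = deserializeRead.currentInt;   // a non-NULL element
//       }
//     }
//   }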