/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.binarysortable.fast;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.serde2.binarysortable.BinarySortableSerDe;
import org.apache.hadoop.hive.serde2.binarysortable.InputByteBuffer;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

/*
 * Directly deserialize with the caller reading field-by-field the binary sortable serialization
 * format.
 *
 * The caller is responsible for calling the read method for the right type of each field
 * (after calling readNextField).
 *
 * Reading some fields requires a results object to receive value information.  A separate
 * results object is created by the caller at initialization per different field, even for the
 * same type.
 *
 * Some field values are returned by reference, either to bytes in the deserialization buffer or
 * to other type-specific buffers.  Those references are only valid until the next time set is
 * called.
 */
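// Illustrative sketch (not part of the original source): one way a caller might drive the
// field-by-field protocol described above, assuming the first column is an INT.  The typeInfos,
// rowBytes, and rowLength names below are placeholders supplied by the caller.
//
//   BinarySortableDeserializeRead deserializeRead =
//       new BinarySortableDeserializeRead(typeInfos, false /* useExternalBuffer */);
//   deserializeRead.set(rowBytes, 0, rowLength);
//   if (deserializeRead.readNextField()) {
//     // Field #0 was not NULL; its value is now in the current* member for its type.
//     int firstColumn = deserializeRead.currentInt;
//   } else {
//     // Field #0 was NULL.
//   }
//   // Columns that are not needed can be passed over with skipNextField().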
public final class BinarySortableDeserializeRead extends DeserializeRead {
  public static final Logger LOG = LoggerFactory.getLogger(BinarySortableDeserializeRead.class.getName());

  // The sort order (ascending/descending) for each field. Set to true when descending (invert).
  private boolean[] columnSortOrderIsDesc;

  byte[] columnNullMarker;
  byte[] columnNotNullMarker;

  private int start;
  private int end;
  private int fieldStart;

  private int bytesStart;

  private int internalBufferLen;
  private byte[] internalBuffer;

  private byte[] tempTimestampBytes;

  private byte[] tempDecimalBuffer;

  private InputByteBuffer inputByteBuffer = new InputByteBuffer();

  private Field root;
  private Deque<Field> stack;

  private class Field {
    Field[] children;

    Category category;
    PrimitiveObjectInspector.PrimitiveCategory primitiveCategory;
    TypeInfo typeInfo;

    int index;
    int count;
    int start;
    int tag;
  }

  /*
   * Use this constructor when only ascending sort order is used.
   */
  public BinarySortableDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer) {
    this(typeInfos, useExternalBuffer, null, null, null);
  }

  public BinarySortableDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer,
      boolean[] columnSortOrderIsDesc, byte[] columnNullMarker, byte[] columnNotNullMarker) {
    super(typeInfos, useExternalBuffer);
    final int count = typeInfos.length;

    root = new Field();
    root.category = Category.STRUCT;
    root.children = createFields(typeInfos);
    root.count = count;
    stack = new ArrayDeque<>();

    if (columnSortOrderIsDesc != null) {
      this.columnSortOrderIsDesc = columnSortOrderIsDesc;
    } else {
      this.columnSortOrderIsDesc = new boolean[count];
      Arrays.fill(this.columnSortOrderIsDesc, false);
    }
    if (columnNullMarker != null) {
      this.columnNullMarker = columnNullMarker;
      this.columnNotNullMarker = columnNotNullMarker;
    } else {
      this.columnNullMarker = new byte[count];
      this.columnNotNullMarker = new byte[count];
      for (int i = 0; i < count; i++) {
        if (this.columnSortOrderIsDesc[i]) {
          // Descending
          // Null last (default for descending order)
          this.columnNullMarker[i] = BinarySortableSerDe.ZERO;
          this.columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        } else {
          // Ascending
          // Null first (default for ascending order)
          this.columnNullMarker[i] = BinarySortableSerDe.ZERO;
          this.columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        }
      }
    }

    inputByteBuffer = new InputByteBuffer();
    internalBufferLen = -1;
  }

  // Not public since we must have column information.
  private BinarySortableDeserializeRead() {
    super();
  }

  /*
   * Set the range of bytes to be deserialized.
   */
  @Override
  public void set(byte[] bytes, int offset, int length) {
    start = offset;
    end = offset + length;
    inputByteBuffer.reset(bytes, start, end);
    root.index = -1;
    stack.clear();
    stack.push(root);
    clearIndex(root);
  }

  private void clearIndex(Field field) {
    field.index = -1;
    if (field.children == null) {
      return;
    }
    for (Field child : field.children) {
      clearIndex(child);
    }
  }

  /*
   * Get detailed read position information to help diagnose exceptions.
   */
  public String getDetailedReadPositionString() {
    StringBuffer sb = new StringBuffer();

    sb.append("Reading inputByteBuffer of length ");
    sb.append(inputByteBuffer.getEnd());
    sb.append(" at start offset ");
    sb.append(start);
    sb.append(" for length ");
    sb.append(end - start);
    sb.append(" to read ");
    sb.append(root.count);
    sb.append(" fields with types ");
    sb.append(Arrays.toString(typeInfos));
    sb.append(". ");
    if (root.index == -1) {
      sb.append("Before first field?");
    } else {
      sb.append("Read field #");
      sb.append(root.index);
      sb.append(" at field start position ");
      sb.append(fieldStart);
      sb.append(" current read offset ");
      sb.append(inputByteBuffer.tell());
    }
    sb.append(" column sort order ");
    sb.append(Arrays.toString(columnSortOrderIsDesc));
    // UNDONE: Convert byte 0 or 1 to character.
    sb.append(" column null marker ");
    sb.append(Arrays.toString(columnNullMarker));
    sb.append(" column non null marker ");
    sb.append(Arrays.toString(columnNotNullMarker));

    return sb.toString();
  }

  /*
   * Reads the next field.
   *
   * Afterwards, reading is positioned to the next field.
   *
   * @return  Returns true when the field was not NULL and the data has been put in the
   *          appropriate current* member.
   *          Otherwise, returns false when the field is NULL.
   */
  @Override
  public boolean readNextField() throws IOException {
    return readComplexField();
  }
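  /*
   * Illustrative note (not part of the original source): with the default ascending sort order
   * and the ZERO/ONE markers installed by the constructor above (assuming BinarySortableSerDe.ZERO
   * is 0 and BinarySortableSerDe.ONE is 1), each top-level field is laid out as one marker byte
   * (0 = NULL, 1 = not NULL) followed by the field's bytes.  For example, a two-column row
   * (INT 5, STRING "ab") would be expected to look like:
   *
   *   0x01  0x80 0x00 0x00 0x05      marker, then the INT with its sign bit flipped
   *   0x01  0x61 0x62 0x00           marker, then the STRING bytes and a 0x00 terminator
   *
   * isNull() consumes the marker byte; readPrimitive() below decodes the value portion.
   */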
  private boolean readPrimitive(Field field) throws IOException {
    final int fieldIndex = root.index;
    field.start = inputByteBuffer.tell();

    /*
     * We have a field and are positioned to it.  Read it.
     */
    switch (field.primitiveCategory) {
    case BOOLEAN:
      currentBoolean = (inputByteBuffer.read(columnSortOrderIsDesc[fieldIndex]) == 2);
      return true;
    case BYTE:
      currentByte = (byte) (inputByteBuffer.read(columnSortOrderIsDesc[fieldIndex]) ^ 0x80);
      return true;
    case SHORT:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        currentShort = (short) v;
      }
      return true;
    case INT:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentInt = v;
      }
      return true;
    case LONG:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        long v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 7; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentLong = v;
      }
      return true;
    case DATE:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentDateWritable.set(v);
      }
      return true;
    case TIMESTAMP:
      {
        if (tempTimestampBytes == null) {
          tempTimestampBytes = new byte[TimestampWritable.BINARY_SORTABLE_LENGTH];
        }
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        for (int i = 0; i < tempTimestampBytes.length; i++) {
          tempTimestampBytes[i] = inputByteBuffer.read(invert);
        }
        currentTimestampWritable.setBinarySortable(tempTimestampBytes, 0);
      }
      return true;
    case FLOAT:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = 0;
        for (int i = 0; i < 4; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        if ((v & (1 << 31)) == 0) {
          // negative number, flip all bits
          v = ~v;
        } else {
          // positive number, flip the first bit
          v = v ^ (1 << 31);
        }
        currentFloat = Float.intBitsToFloat(v);
      }
      return true;
    case DOUBLE:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        long v = 0;
        for (int i = 0; i < 8; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        if ((v & (1L << 63)) == 0) {
          // negative number, flip all bits
          v = ~v;
        } else {
          // positive number, flip the first bit
          v = v ^ (1L << 63);
        }
        currentDouble = Double.longBitsToDouble(v);
      }
      return true;
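    // Illustrative note (not part of the original source): the integer cases above undo
    // BinarySortableSerDe's sign-bit flip.  For example, with ascending sort order the INT
    // value 5 would be stored as the bytes {0x80, 0x00, 0x00, 0x05}: the first byte read is
    // XOR'ed with 0x80 and the remaining three bytes are shifted in unchanged.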
    case BINARY:
    case STRING:
    case CHAR:
    case VARCHAR:
      {
        /*
         * This code is a modified version of BinarySortableSerDe.deserializeText that lets us
         * detect whether we can return a reference to the bytes directly.
         */

        // Get the actual length first.
        bytesStart = inputByteBuffer.tell();
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int length = 0;
        do {
          byte b = inputByteBuffer.read(invert);
          if (b == 0) {
            // end of string
            break;
          }
          if (b == 1) {
            // the last char is an escape char. read the actual char
            inputByteBuffer.read(invert);
          }
          length++;
        } while (true);

        if (length == 0 ||
            (!invert && length == inputByteBuffer.tell() - bytesStart - 1)) {
          // No inversion or escaping happened, so we can reference the bytes directly.
          currentExternalBufferNeeded = false;
          currentBytes = inputByteBuffer.getData();
          currentBytesStart = bytesStart;
          currentBytesLength = length;
        } else {
          // We are now positioned at the end of this field's bytes.
          if (useExternalBuffer) {
            // If we decide not to reposition and re-read the buffer to copy it with
            // copyToExternalBuffer, we will still be correctly positioned for the next field.
            currentExternalBufferNeeded = true;
            currentExternalBufferNeededLen = length;
          } else {
            // The copyToBuffer will reposition and re-read the input buffer.
            currentExternalBufferNeeded = false;
            if (internalBufferLen < length) {
              internalBufferLen = length;
              internalBuffer = new byte[internalBufferLen];
            }
            copyToBuffer(internalBuffer, 0, length);
            currentBytes = internalBuffer;
            currentBytesStart = 0;
            currentBytesLength = length;
          }
        }
      }
      return true;
    case INTERVAL_YEAR_MONTH:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int v = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          v = (v << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentHiveIntervalYearMonthWritable.set(v);
      }
      return true;
    case INTERVAL_DAY_TIME:
      {
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        long totalSecs = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 7; i++) {
          totalSecs = (totalSecs << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        int nanos = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          nanos = (nanos << 8) + (inputByteBuffer.read(invert) & 0xff);
        }
        currentHiveIntervalDayTimeWritable.set(totalSecs, nanos);
      }
      return true;
    case DECIMAL:
      {
        // Since enforcing precision and scale can cause a HiveDecimal to become NULL,
        // we must read it, enforce it here, and either return NULL or buffer the result.
        final boolean invert = columnSortOrderIsDesc[fieldIndex];
        int b = inputByteBuffer.read(invert) - 1;
        if (!(b == 1 || b == -1 || b == 0)) {
          throw new IOException("Unexpected byte value " + (int) b +
              " in binary sortable format data (invert " + invert + ")");
        }
        final boolean positive = b != -1;

        int factor = inputByteBuffer.read(invert) ^ 0x80;
        for (int i = 0; i < 3; i++) {
          factor = (factor << 8) + (inputByteBuffer.read(invert) & 0xff);
        }

        if (!positive) {
          factor = -factor;
        }

        final int decimalStart = inputByteBuffer.tell();
        int length = 0;

        do {
          b = inputByteBuffer.read(positive ? invert : !invert);
          if (b == 1) {
            throw new IOException("Expected -1 and found byte value " + (int) b +
                " in binary sortable format data (invert " + invert + ")");
          }

          if (b == 0) {
            // end of digits
            break;
          }

          length++;
        } while (true);

        // CONSIDER: Allocate a larger initial size.
        if (tempDecimalBuffer == null || tempDecimalBuffer.length < length) {
          tempDecimalBuffer = new byte[length];
        }

        inputByteBuffer.seek(decimalStart);
        for (int i = 0; i < length; ++i) {
          tempDecimalBuffer[i] = inputByteBuffer.read(positive ? invert : !invert);
        }

        // read the null byte again
        inputByteBuffer.read(positive ? invert : !invert);

        // Set the value of the writable from the decimal digits that were written with no dot.
        final int scale = length - factor;
        currentHiveDecimalWritable.setFromDigitsOnlyBytesWithScale(
            !positive, tempDecimalBuffer, 0, length, scale);

        boolean decimalIsNull = !currentHiveDecimalWritable.isSet();
        if (!decimalIsNull) {
          // We have a decimal.  After we enforce precision and scale, will it become a NULL?
          final DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) field.typeInfo;
          final int enforcePrecision = decimalTypeInfo.getPrecision();
          final int enforceScale = decimalTypeInfo.getScale();

          decimalIsNull = !currentHiveDecimalWritable.mutateEnforcePrecisionScale(
              enforcePrecision, enforceScale);
        }
        if (decimalIsNull) {
          return false;
        }
      }
      return true;
    default:
      throw new RuntimeException("Unexpected primitive type category " + field.primitiveCategory);
    }
  }
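  /*
   * Illustrative note (not part of the original source): in the DECIMAL case above, the digits
   * are read without a decimal point and "factor" acts as the power-of-ten exponent, so
   * scale = length - factor.  For example, a digit run of "12345" (length 5) read with factor 3
   * would be interpreted as 123.45 (scale 2).
   */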
  /*
   * Reads through an undesired field.
   *
   * No data values are valid after this call.
   * Designed for skipping columns that are not included.
   */
  public void skipNextField() throws IOException {
    final Field current = stack.peek();
    current.index++;

    if (root.index >= root.count) {
      return;
    }
    if (inputByteBuffer.isEof()) {
      // Also, reading beyond our byte range produces NULL.
      return;
    }

    if (current.category == Category.UNION && current.index == 0) {
      current.tag = inputByteBuffer.read();
      currentInt = current.tag;
      return;
    }

    final Field child = getChild(current);

    if (isNull()) {
      return;
    }
    if (child.category == Category.PRIMITIVE) {
      readPrimitive(child);
    } else {
      stack.push(child);
      switch (child.category) {
      case LIST:
      case MAP:
        while (isNextComplexMultiValue()) {
          skipNextField();
        }
        break;
      case STRUCT:
        for (int i = 0; i < child.count; i++) {
          skipNextField();
        }
        finishComplexVariableFieldsType();
        break;
      case UNION:
        readComplexField();
        skipNextField();
        finishComplexVariableFieldsType();
        break;
      }
    }
  }

  @Override
  public void copyToExternalBuffer(byte[] externalBuffer, int externalBufferStart) throws IOException {
    copyToBuffer(externalBuffer, externalBufferStart, currentExternalBufferNeededLen);
  }

  private void copyToBuffer(byte[] buffer, int bufferStart, int bufferLength) throws IOException {
    final boolean invert = columnSortOrderIsDesc[root.index];
    inputByteBuffer.seek(bytesStart);
    // Copy the data.
    for (int i = 0; i < bufferLength; i++) {
      byte b = inputByteBuffer.read(invert);
      if (b == 1) {
        // The last char is an escape char, read the actual char.
        // The serialization format escapes \0 to \1 and \1 to \2,
        // to make sure the string is null-terminated.
        b = (byte) (inputByteBuffer.read(invert) - 1);
      }
      buffer[bufferStart + i] = b;
    }
    // Read the null terminator.
    byte b = inputByteBuffer.read(invert);
    if (b != 0) {
      throw new RuntimeException("Expected 0 terminating byte");
    }
  }
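  // Illustrative sketch (not part of the original source): how a caller that constructed this
  // object with useExternalBuffer == true might consume a string field whose bytes could not be
  // referenced directly.  The externalBuffer array below is a placeholder owned by the caller.
  //
  //   if (deserializeRead.readNextField()) {
  //     if (deserializeRead.currentExternalBufferNeeded) {
  //       // Escaping or inversion occurred; copy the unescaped bytes out of the input buffer.
  //       deserializeRead.copyToExternalBuffer(externalBuffer, 0);
  //       // The value is externalBuffer[0 .. currentExternalBufferNeededLen - 1].
  //     } else {
  //       // The value can be referenced directly:
  //       // currentBytes[currentBytesStart .. currentBytesStart + currentBytesLength - 1].
  //     }
  //   }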
  /*
   * This method may be called after all fields have been read to check for unread fields.
   *
   * Note that when optimizing reading to stop reading unneeded include columns, worrying
   * about whether all data is consumed is not appropriate (often we aren't reading it all by
   * design).
   *
   * Since LazySimpleDeserializeRead parses the line through the last desired column, it does
   * support this function.
   */
  public boolean isEndOfInputReached() {
    return inputByteBuffer.isEof();
  }

  private Field[] createFields(TypeInfo[] typeInfos) {
    final Field[] children = new Field[typeInfos.length];
    for (int i = 0; i < typeInfos.length; i++) {
      children[i] = createField(typeInfos[i]);
    }
    return children;
  }

  private Field createField(TypeInfo typeInfo) {
    final Field field = new Field();
    final Category category = typeInfo.getCategory();
    field.category = category;
    field.typeInfo = typeInfo;

    switch (category) {
    case PRIMITIVE:
      field.primitiveCategory = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory();
      break;
    case LIST:
      field.children = new Field[1];
      field.children[0] = createField(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      break;
    case MAP:
      field.children = new Field[2];
      field.children[0] = createField(((MapTypeInfo) typeInfo).getMapKeyTypeInfo());
      field.children[1] = createField(((MapTypeInfo) typeInfo).getMapValueTypeInfo());
      break;
    case STRUCT:
      StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      List<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
      field.count = fieldTypeInfos.size();
      field.children = createFields(fieldTypeInfos.toArray(new TypeInfo[fieldTypeInfos.size()]));
      break;
    case UNION:
      UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
      List<TypeInfo> objectTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos();
      field.count = 2;
      field.children = createFields(objectTypeInfos.toArray(new TypeInfo[objectTypeInfos.size()]));
      break;
    default:
      throw new RuntimeException();
    }
    return field;
  }

  private Field getChild(Field field) {
    switch (field.category) {
    case LIST:
      return field.children[0];
    case MAP:
      return field.children[field.index % 2];
    case STRUCT:
      return field.children[field.index];
    case UNION:
      return field.children[field.tag];
    default:
      throw new RuntimeException();
    }
  }

  private boolean isNull() throws IOException {
    return inputByteBuffer.read(columnSortOrderIsDesc[root.index]) == columnNullMarker[root.index];
  }

  @Override
  public boolean readComplexField() throws IOException {
    final Field current = stack.peek();
    current.index++;

    if (root.index >= root.count) {
      return false;
    }
    if (inputByteBuffer.isEof()) {
      // Also, reading beyond our byte range produces NULL.
      return false;
    }

    if (current.category == Category.UNION) {
      if (current.index == 0) {
        current.tag = inputByteBuffer.read(columnSortOrderIsDesc[root.index]);
        currentInt = current.tag;
        return true;
      }
    }

    final Field child = getChild(current);
    boolean isNull = isNull();

    if (isNull) {
      return false;
    }
    if (child.category == Category.PRIMITIVE) {
      isNull = !readPrimitive(child);
    } else {
      stack.push(child);
    }
    return !isNull;
  }

  @Override
  public boolean isNextComplexMultiValue() throws IOException {
    final byte isNullByte = inputByteBuffer.read(columnSortOrderIsDesc[root.index]);
    final boolean isEnded;

    switch (isNullByte) {
    case 0:
      isEnded = true;
      break;
    case 1:
      isEnded = false;
      break;
    default:
      throw new RuntimeException();
    }

    if (isEnded) {
      stack.pop();
      stack.peek();
    }
    return !isEnded;
  }

  @Override
  public void finishComplexVariableFieldsType() {
    stack.pop();
    if (stack.peek() == null) {
      throw new RuntimeException();
    }
    stack.peek();
  }
}
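// Illustrative sketch (not part of the original source): how a caller might traverse a single
// LIST<INT> column using the complex-field protocol above, mirroring the structure of
// skipNextField().  In that method, LIST and MAP traversal ends when isNextComplexMultiValue()
// returns false (which pops the stack), while finishComplexVariableFieldsType() is called for
// STRUCT and UNION.
//
//   deserializeRead.set(rowBytes, 0, rowLength);
//   if (deserializeRead.readComplexField()) {
//     // The list itself was not NULL.
//     while (deserializeRead.isNextComplexMultiValue()) {
//       if (deserializeRead.readNextField()) {
//         int element = deserializeRead.currentInt;   // a non-NULL element
//       }
//     }
//   }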