LazyBinaryDeserializeRead.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.lazybinary.fast;

import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.List;

import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VInt;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils.VLong;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.WritableUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/*
 * Directly deserialize with the caller reading field-by-field the LazyBinary serialization format.
 *
 * The caller is responsible for calling the read method for the right type of each field
 * (after calling readNextField).
 *
 * Reading some fields require a results object to receive value information.  A separate
 * results object is created by the caller at initialization per different field even for the same
 * type.
 *
 * Some type values are by reference to either bytes in the deserialization buffer or to
 * other type specific buffers.  So, those references are only valid until the next time set is
 * called.
 */
public final class LazyBinaryDeserializeRead extends DeserializeRead {
  public static final Logger LOG = LoggerFactory.getLogger(LazyBinaryDeserializeRead.class.getName());

  private byte[] bytes;
  private int start;
  private int offset;
  private int end;

  private boolean skipLengthPrefix = false;

  // Object to receive results of reading a decoded variable length int or long.
  private VInt tempVInt;
  private VLong tempVLong;

  private Deque<Field> stack = new ArrayDeque<>();
  private Field root;

  private class Field {
    Field[] children;

    Category category;
    PrimitiveCategory primitiveCategory;
    TypeInfo typeInfo;

    int index;
    int count;
    int start;
    int end;
    int nullByteStart;
    byte nullByte;
    byte tag;
  }

  public LazyBinaryDeserializeRead(TypeInfo[] typeInfos, boolean useExternalBuffer) {
    super(typeInfos, useExternalBuffer);
    tempVInt = new VInt();
    tempVLong = new VLong();
    currentExternalBufferNeeded = false;

    root = new Field();
    root.category = Category.STRUCT;
    root.children = createFields(typeInfos);
    root.count = typeInfos.length;
  }

  private Field[] createFields(TypeInfo[] typeInfos) {
    final Field[] children = new Field[typeInfos.length];
    for (int i = 0; i < typeInfos.length; i++) {
      children[i] = createField(typeInfos[i]);
    }
    return children;
  }

  private Field createField(TypeInfo typeInfo) {
    final Field field = new Field();
    final Category category = typeInfo.getCategory();
    field.category = category;
    field.typeInfo = typeInfo;
    switch (category) {
    case PRIMITIVE:
      field.primitiveCategory = ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory();
      break;
    case LIST:
      field.children = new Field[1];
      field.children[0] = createField(((ListTypeInfo) typeInfo).getListElementTypeInfo());
      break;
    case MAP:
      field.children = new Field[2];
      field.children[0] = createField(((MapTypeInfo) typeInfo).getMapKeyTypeInfo());
      field.children[1] = createField(((MapTypeInfo) typeInfo).getMapValueTypeInfo());
      break;
    case STRUCT:
      final StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
      final List<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
      field.children = createFields(fieldTypeInfos.toArray(new TypeInfo[fieldTypeInfos.size()]));
      break;
    case UNION:
      final UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
      final List<TypeInfo> objectTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos();
      field.children = createFields(objectTypeInfos.toArray(new TypeInfo[objectTypeInfos.size()]));
      break;
    default:
      throw new RuntimeException();
    }
    return field;
  }

  /*
   * Set the range of bytes to be deserialized.
   */
  @Override
  public void set(byte[] bytes, int offset, int length) {
    this.bytes = bytes;
    this.offset = offset;
    start = offset;
    end = offset + length;

    stack.clear();
    stack.push(root);
    clearIndex(root);
  }

  private void clearIndex(Field field) {
    field.index = 0;
    if (field.children == null) {
      return;
    }
    for (Field child : field.children) {
      clearIndex(child);
    }
  }

  /*
   * Get detailed read position information to help diagnose exceptions.
   */
  public String getDetailedReadPositionString() {
    StringBuffer sb = new StringBuffer();

    sb.append("Reading byte[] of length ");
    sb.append(bytes.length);
    sb.append(" at start offset ");
    sb.append(start);
    sb.append(" for length ");
    sb.append(end - start);
    sb.append(" to read ");
    sb.append(root.children.length);
    sb.append(" fields with types ");
    sb.append(Arrays.toString(typeInfos));
    sb.append(".  Read field #");
    sb.append(root.index);
    sb.append(" at field start position ");
    sb.append(root.start);
    sb.append(" current read offset ");
    sb.append(offset);

    return sb.toString();
  }

  /*
   * Reads the the next field.
   *
   * Afterwards, reading is positioned to the next field.
   *
   * @return  Return true when the field was not null and data is put in the appropriate
   *          current* member.
   *          Otherwise, false when the field is null.
   *
   */
  @Override
  public boolean readNextField() throws IOException {
    return readComplexField();
  }

  private boolean readPrimitive(Field field) throws IOException {
    final PrimitiveCategory primitiveCategory = field.primitiveCategory;
    final TypeInfo typeInfo = field.typeInfo;
    switch (primitiveCategory) {
    case BOOLEAN:
      // No check needed for single byte read.
      currentBoolean = (bytes[offset++] != 0);
      break;
    case BYTE:
      // No check needed for single byte read.
      currentByte = bytes[offset++];
      break;
    case SHORT:
      // Last item -- ok to be at end.
      if (offset + 2 > end) {
        throw new EOFException();
      }
      currentShort = LazyBinaryUtils.byteArrayToShort(bytes, offset);
      offset += 2;
      break;
    case INT:
      // Parse the first byte of a vint/vlong to determine the number of bytes.
      if (offset + WritableUtils.decodeVIntSize(bytes[offset]) > end) {
        throw new EOFException();
      }
      LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
      offset += tempVInt.length;
      currentInt = tempVInt.value;
      break;
    case LONG:
      // Parse the first byte of a vint/vlong to determine the number of bytes.
      if (offset + WritableUtils.decodeVIntSize(bytes[offset]) > end) {
        throw new EOFException();
      }
      LazyBinaryUtils.readVLong(bytes, offset, tempVLong);
      offset += tempVLong.length;
      currentLong = tempVLong.value;
      break;
    case FLOAT:
      // Last item -- ok to be at end.
      if (offset + 4 > end) {
        throw new EOFException();
      }
      currentFloat = Float.intBitsToFloat(LazyBinaryUtils.byteArrayToInt(bytes, offset));
      offset += 4;
      break;
    case DOUBLE:
      // Last item -- ok to be at end.
      if (offset + 8 > end) {
        throw new EOFException();
      }
      currentDouble = Double.longBitsToDouble(LazyBinaryUtils.byteArrayToLong(bytes, offset));
      offset += 8;
      break;

    case BINARY:
    case STRING:
    case CHAR:
    case VARCHAR:
      {
        // using vint instead of 4 bytes
        // Parse the first byte of a vint/vlong to determine the number of bytes.
        if (offset + WritableUtils.decodeVIntSize(bytes[offset]) > end) {
          throw new EOFException();
        }
        LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
        offset += tempVInt.length;

        int saveStart = offset;
        int length = tempVInt.value;
        offset += length;
        // Last item -- ok to be at end.
        if (offset > end) {
          throw new EOFException();
        }

        currentBytes = bytes;
        currentBytesStart = saveStart;
        currentBytesLength = length;
      }
      break;
    case DATE:
      // Parse the first byte of a vint/vlong to determine the number of bytes.
      if (offset + WritableUtils.decodeVIntSize(bytes[offset]) > end) {
        throw new EOFException();
      }
      LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
      offset += tempVInt.length;

      currentDateWritable.set(tempVInt.value);
      break;
    case TIMESTAMP:
      {
        int length = TimestampWritable.getTotalLength(bytes, offset);
        int saveStart = offset;
        offset += length;
        // Last item -- ok to be at end.
        if (offset > end) {
          throw new EOFException();
        }

        currentTimestampWritable.set(bytes, saveStart);
      }
      break;
    case INTERVAL_YEAR_MONTH:
      // Parse the first byte of a vint/vlong to determine the number of bytes.
      if (offset + WritableUtils.decodeVIntSize(bytes[offset]) > end) {
        throw new EOFException();
      }
      LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
      offset += tempVInt.length;

      currentHiveIntervalYearMonthWritable.set(tempVInt.value);
      break;
    case INTERVAL_DAY_TIME:
      // The first bounds check requires at least one more byte beyond for 2nd int (hence >=).
      // Parse the first byte of a vint/vlong to determine the number of bytes.
      if (offset + WritableUtils.decodeVIntSize(bytes[offset]) >= end) {
        throw new EOFException();
      }
      LazyBinaryUtils.readVLong(bytes, offset, tempVLong);
      offset += tempVLong.length;

      // Parse the first byte of a vint/vlong to determine the number of bytes.
      if (offset + WritableUtils.decodeVIntSize(bytes[offset]) > end) {
        throw new EOFException();
      }
      LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
      offset += tempVInt.length;

      currentHiveIntervalDayTimeWritable.set(tempVLong.value, tempVInt.value);
      break;
    case DECIMAL:
      {
        // Since enforcing precision and scale can cause a HiveDecimal to become NULL,
        // we must read it, enforce it here, and either return NULL or buffer the result.

        // These calls are to see how much data there is. The setFromBytes call below will do the same
        // readVInt reads but actually unpack the decimal.

        // The first bounds check requires at least one more byte beyond for 2nd int (hence >=).
        // Parse the first byte of a vint/vlong to determine the number of bytes.
        if (offset + WritableUtils.decodeVIntSize(bytes[offset]) >= end) {
          throw new EOFException();
        }
        LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
        offset += tempVInt.length;
        int readScale = tempVInt.value;

        // Parse the first byte of a vint/vlong to determine the number of bytes.
        if (offset + WritableUtils.decodeVIntSize(bytes[offset]) > end) {
          throw new EOFException();
        }
        LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
        offset += tempVInt.length;
        int saveStart = offset;
        offset += tempVInt.value;
        // Last item -- ok to be at end.
        if (offset > end) {
          throw new EOFException();
        }
        int length = offset - saveStart;

        //   scale = 2, length = 6, value = -6065716379.11
        //   \002\006\255\114\197\131\083\105
        //           \255\114\197\131\083\105

        currentHiveDecimalWritable.setFromBigIntegerBytesAndScale(
            bytes, saveStart, length, readScale);
        boolean decimalIsNull = !currentHiveDecimalWritable.isSet();
        if (!decimalIsNull) {

          final DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;

          final int precision = decimalTypeInfo.getPrecision();
          final int scale = decimalTypeInfo.getScale();

          decimalIsNull = !currentHiveDecimalWritable.mutateEnforcePrecisionScale(precision, scale);
        }
        if (decimalIsNull) {
          return false;
        }
      }
      break;
    default:
      throw new Error("Unexpected primitive category " + primitiveCategory.name());
    }
    return true;
  }

  /*
   * Reads through an undesired field.
   *
   * No data values are valid after this call.
   * Designed for skipping columns that are not included.
   */
  public void skipNextField() throws IOException {
    final Field current = stack.peek();
    final boolean isNull = isNull(current);

    if (isNull) {
      current.index++;
      return;
    }

    if (readUnionTag(current)) {
      current.index++;
      return;
    }

    final Field child = getChild(current);

    if (child.category == Category.PRIMITIVE) {
      readPrimitive(child);
      current.index++;
    } else {
      parseHeader(child);
      stack.push(child);

      for (int i = 0; i < child.count; i++) {
        skipNextField();
      }
      finishComplexVariableFieldsType();
    }

    if (offset > end) {
      throw new EOFException();
    }
  }

  /*
   * Call this method may be called after all the all fields have been read to check
   * for unread fields.
   *
   * Note that when optimizing reading to stop reading unneeded include columns, worrying
   * about whether all data is consumed is not appropriate (often we aren't reading it all by
   * design).
   *
   * Since LazySimpleDeserializeRead parses the line through the last desired column it does
   * support this function.
   */
  public boolean isEndOfInputReached() {
    return (offset == end);
  }

  private boolean isNull(Field field) {
    final byte b = (byte) (1 << (field.index % 8));
    switch (field.category) {
    case PRIMITIVE:
      return false;
    case LIST:
    case MAP:
      final byte nullByte = bytes[field.nullByteStart + (field.index / 8)];
      return (nullByte & b) == 0;
    case STRUCT:
      if (field.index % 8 == 0) {
        field.nullByte = bytes[offset++];
      }
      return (field.nullByte & b) == 0;
    case UNION:
      return false;
    default:
      throw new RuntimeException();
    }
  }

  private void parseHeader(Field field) {
    // Init
    field.index = 0;
    field.start = offset;

    // Read length
    if (!skipLengthPrefix) {
      final int length = LazyBinaryUtils.byteArrayToInt(bytes, offset);
      offset += 4;
      field.end = offset + length;
    }

    switch (field.category) {
    case LIST:
    case MAP:
      // Read count
      LazyBinaryUtils.readVInt(bytes, offset, tempVInt);
      if (field.category == Category.LIST) {
        field.count = tempVInt.value;
      } else {
        field.count = tempVInt.value * 2;
      }
      offset += tempVInt.length;

      // Null byte start
      field.nullByteStart = offset;
      offset += ((field.count) + 7) / 8;
      break;
    case STRUCT:
      field.count = ((StructTypeInfo) field.typeInfo).getAllStructFieldTypeInfos().size();
      break;
    case UNION:
      field.count = 2;
      break;
    }
  }

  private Field getChild(Field field) {
    switch (field.category) {
    case LIST:
      return field.children[0];
    case MAP:
      return field.children[field.index % 2];
    case STRUCT:
      return field.children[field.index];
    case UNION:
      return field.children[field.tag];
    default:
      throw new RuntimeException();
    }
  }

  private boolean readUnionTag(Field field) {
    if (field.category == Category.UNION && field.index == 0) {
      field.tag = bytes[offset++];
      currentInt = field.tag;
      return true;
    } else {
      return false;
    }
  }

  // Push or next
  @Override
  public boolean readComplexField() throws IOException {
    final Field current = stack.peek();
    boolean isNull = isNull(current);

    if (isNull) {
      current.index++;
      return false;
    }

    if (readUnionTag(current)) {
      current.index++;
      return true;
    }

    final Field child = getChild(current);

    if (child.category == Category.PRIMITIVE) {
      isNull = !readPrimitive(child);
      current.index++;
    } else {
      parseHeader(child);
      stack.push(child);
    }

    if (offset > end) {
      throw new EOFException();
    }
    return !isNull;
  }

  // Pop (list, map)
  @Override
  public boolean isNextComplexMultiValue() {
    Field current = stack.peek();
    final boolean isNext = current.index < current.count;
    if (!isNext) {
      stack.pop();
      stack.peek().index++;
    }
    return isNext;
  }

  // Pop (struct, union)
  @Override
  public void finishComplexVariableFieldsType() {
    stack.pop();
    if (stack.peek() == null) {
      throw new RuntimeException();
    }
    stack.peek().index++;
  }
}