VectorizedPrimitiveColumnReader.java example

Explorer
hive-master
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet.vector;

import com.google.common.base.Strings;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.bytes.BytesUtils;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.column.values.ValuesReader;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.schema.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.sql.Timestamp;
import java.util.Calendar;
import java.util.TimeZone;

import static org.apache.parquet.column.ValuesType.DEFINITION_LEVEL;
import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL;
import static org.apache.parquet.column.ValuesType.VALUES;

/**
 * Column-level Parquet reader used to read a batch of records for a single column.
 * Parts of the code are adapted from Apache Spark and Apache Parquet.
 */
public class VectorizedPrimitiveColumnReader implements VectorizedColumnReader {

  private static final Logger LOG = LoggerFactory.getLogger(VectorizedPrimitiveColumnReader.class);

  private String conversionTimeZone;

  /**
   * Total number of values read.
   */
  private long valuesRead;

  /**
   * value that indicates the end of the current page. That is,
   * if valuesRead == endOfPageValueCount, we are at the end of the page.
   */
  private long endOfPageValueCount;

  /**
   * The dictionary, if this column has dictionary encoding.
   */
  private final Dictionary dictionary;

  /**
   * If true, the current page is dictionary encoded.
   */
  private boolean isCurrentPageDictionaryEncoded;

  /**
   * Maximum definition level for this column.
   */
  private final int maxDefLevel;

  private int definitionLevel;
  private int repetitionLevel;

  /**
   * Repetition/Definition/Value readers.
   */
  private IntIterator repetitionLevelColumn;
  private IntIterator definitionLevelColumn;
  private ValuesReader dataColumn;

  /**
   * Total values in the current page.
   */
  private int pageValueCount;

  private final PageReader pageReader;
  private final ColumnDescriptor descriptor;
  private final Type type;

  /**
   * Creates a reader for one primitive column and eagerly reads the column's dictionary
   * page, if the page reader supplies one.
   *
   * @param descriptor         Parquet descriptor of the column to read
   * @param pageReader         source of the dictionary page and of the data pages
   * @param conversionTimeZone time zone id used when decoding INT96 timestamps
   * @param type               Parquet type of the column
   * @throws IOException if the dictionary page cannot be decoded
   */
  public VectorizedPrimitiveColumnReader(
    ColumnDescriptor descriptor,
    PageReader pageReader,
    String conversionTimeZone,
    Type type) throws IOException {
    this.descriptor = descriptor;
    this.pageReader = pageReader;
    this.type = type;
    this.conversionTimeZone = conversionTimeZone;
    this.maxDefLevel = descriptor.getMaxDefinitionLevel();

    final DictionaryPage dictPage = pageReader.readDictionaryPage();
    if (dictPage == null) {
      // No dictionary page was supplied for this column.
      this.dictionary = null;
      this.isCurrentPageDictionaryEncoded = false;
      return;
    }

    try {
      this.dictionary = dictPage.getEncoding().initDictionary(descriptor, dictPage);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException e) {
      throw new IOException("could not decode the dictionary for " + descriptor, e);
    }
  }

  /**
   * Reads the next {@code total} values of this column into {@code column}, writing from
   * index 0 of the vector and loading a new page whenever the current one is exhausted.
   *
   * @param total      number of values to read
   * @param column     vector that receives the values
   * @param columnType Hive type of the column, used to pick the reader for non-dictionary pages
   * @throws IOException if a page cannot be read or the column type is not supported
   */
  public void readBatch(
    int total,
    ColumnVector column,
    TypeInfo columnType) throws IOException {
    int remaining = total;
    int offset = 0;
    while (remaining > 0) {
      // Values still unread in the current page; zero means a new page has to be loaded.
      int available = (int) (endOfPageValueCount - valuesRead);
      if (available == 0) {
        readPage();
        available = (int) (endOfPageValueCount - valuesRead);
      }

      final int count = Math.min(remaining, available);
      if (!isCurrentPageDictionaryEncoded) {
        // Plain page: values go straight into the target vector.
        readBatchHelper(count, column, columnType, offset);
      } else {
        // Dictionary page: read the ids first, then translate them through the dictionary.
        LongColumnVector ids = new LongColumnVector();
        readDictionaryIDs(count, ids, offset);
        decodeDictionaryIds(offset, count, column, ids);
      }
      remaining -= count;
      offset += count;
    }
  }

  /**
   * Reads {@code num} values from a non-dictionary page into {@code column}, choosing the
   * typed reader from the Hive primitive category of {@code columnType}.
   *
   * @param num        number of values to read
   * @param column     target vector; cast to the vector class matching the category
   * @param columnType Hive type of the column; must be a {@link PrimitiveTypeInfo}
   * @param rowId      index in {@code column} of the first value to write
   * @throws IOException if the category has no reader here (this includes TIMESTAMP and
   *                     INTERVAL_DAY_TIME)
   */
  private void readBatchHelper(
    int num,
    ColumnVector column,
    TypeInfo columnType,
    int rowId) throws IOException {
    final PrimitiveTypeInfo primitiveInfo = (PrimitiveTypeInfo) columnType;
    switch (primitiveInfo.getPrimitiveCategory()) {
    // Categories stored in a LongColumnVector.
    case BOOLEAN:
      readBooleans(num, (LongColumnVector) column, rowId);
      break;
    case BYTE:
    case SHORT:
    case INT:
      readIntegers(num, (LongColumnVector) column, rowId);
      break;
    case LONG:
    case DATE:
    case INTERVAL_YEAR_MONTH:
      readLongs(num, (LongColumnVector) column, rowId);
      break;
    // Categories stored in a DoubleColumnVector.
    case FLOAT:
      readFloats(num, (DoubleColumnVector) column, rowId);
      break;
    case DOUBLE:
      readDoubles(num, (DoubleColumnVector) column, rowId);
      break;
    // Categories stored in a BytesColumnVector.
    case STRING:
    case CHAR:
    case VARCHAR:
    case BINARY:
      readBinaries(num, (BytesColumnVector) column, rowId);
      break;
    case DECIMAL:
      readDecimal(num, (DecimalColumnVector) column, rowId);
      break;
    // No reader is implemented for these categories on this path.
    case TIMESTAMP:
    case INTERVAL_DAY_TIME:
    default:
      throw new IOException("Unsupported type: " + type);
    }
  }

  /**
   * Reads {@code total} dictionary ids from the current page into {@code c}, starting at
   * index {@code rowId}. A position whose definition level is below the column maximum is
   * flagged as null instead of consuming an id.
   */
  private void readDictionaryIDs(
    int total,
    LongColumnVector c,
    int rowId) throws IOException {
    for (int remaining = total, row = rowId; remaining > 0; remaining--, row++) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel < maxDefLevel) {
        // Value is not defined at this position: record a null.
        c.isNull[row] = true;
        c.isRepeating = false;
        c.noNulls = false;
        continue;
      }
      c.vector[row] = dataColumn.readValueDictionaryId();
      c.isNull[row] = false;
      // The vector stays "repeating" only while every id equals the one at index 0.
      if (c.isRepeating && c.vector[0] != c.vector[row]) {
        c.isRepeating = false;
      }
    }
  }

  /**
   * Reads {@code total} int values from the current page into {@code c}, starting at index
   * {@code rowId}. A position whose definition level is below the column maximum is flagged
   * as null instead of consuming a value.
   */
  private void readIntegers(
    int total,
    LongColumnVector c,
    int rowId) throws IOException {
    for (int remaining = total, row = rowId; remaining > 0; remaining--, row++) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel < maxDefLevel) {
        // Value is not defined at this position: record a null.
        c.isNull[row] = true;
        c.isRepeating = false;
        c.noNulls = false;
        continue;
      }
      c.vector[row] = dataColumn.readInteger();
      c.isNull[row] = false;
      // The vector stays "repeating" only while every value equals the one at index 0.
      if (c.isRepeating && c.vector[0] != c.vector[row]) {
        c.isRepeating = false;
      }
    }
  }

  /**
   * Reads {@code total} double values from the current page into {@code c}, starting at
   * index {@code rowId}. A position whose definition level is below the column maximum is
   * flagged as null instead of consuming a value.
   */
  private void readDoubles(
    int total,
    DoubleColumnVector c,
    int rowId) throws IOException {
    for (int remaining = total, row = rowId; remaining > 0; remaining--, row++) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel < maxDefLevel) {
        // Value is not defined at this position: record a null.
        c.isNull[row] = true;
        c.isRepeating = false;
        c.noNulls = false;
        continue;
      }
      c.vector[row] = dataColumn.readDouble();
      c.isNull[row] = false;
      // The vector stays "repeating" only while every value equals the one at index 0.
      if (c.isRepeating && c.vector[0] != c.vector[row]) {
        c.isRepeating = false;
      }
    }
  }

  /**
   * Reads {@code total} boolean values from the current page into {@code c} as 1 (true) or
   * 0 (false), starting at index {@code rowId}. A position whose definition level is below
   * the column maximum is flagged as null instead of consuming a value.
   */
  private void readBooleans(
    int total,
    LongColumnVector c,
    int rowId) throws IOException {
    for (int remaining = total, row = rowId; remaining > 0; remaining--, row++) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel < maxDefLevel) {
        // Value is not defined at this position: record a null.
        c.isNull[row] = true;
        c.isRepeating = false;
        c.noNulls = false;
        continue;
      }
      c.vector[row] = dataColumn.readBoolean() ? 1 : 0;
      c.isNull[row] = false;
      // The vector stays "repeating" only while every value equals the one at index 0.
      if (c.isRepeating && c.vector[0] != c.vector[row]) {
        c.isRepeating = false;
      }
    }
  }

  /**
   * Reads {@code total} long values from the current page into {@code c}, starting at index
   * {@code rowId}. A position whose definition level is below the column maximum is flagged
   * as null instead of consuming a value.
   */
  private void readLongs(
    int total,
    LongColumnVector c,
    int rowId) throws IOException {
    for (int remaining = total, row = rowId; remaining > 0; remaining--, row++) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel < maxDefLevel) {
        // Value is not defined at this position: record a null.
        c.isNull[row] = true;
        c.isRepeating = false;
        c.noNulls = false;
        continue;
      }
      c.vector[row] = dataColumn.readLong();
      c.isNull[row] = false;
      // The vector stays "repeating" only while every value equals the one at index 0.
      if (c.isRepeating && c.vector[0] != c.vector[row]) {
        c.isRepeating = false;
      }
    }
  }

  /**
   * Reads {@code total} float values from the current page into {@code c} (widened to
   * double), starting at index {@code rowId}. A position whose definition level is below the
   * column maximum is flagged as null instead of consuming a value.
   */
  private void readFloats(
    int total,
    DoubleColumnVector c,
    int rowId) throws IOException {
    for (int remaining = total, row = rowId; remaining > 0; remaining--, row++) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel < maxDefLevel) {
        // Value is not defined at this position: record a null.
        c.isNull[row] = true;
        c.isRepeating = false;
        c.noNulls = false;
        continue;
      }
      c.vector[row] = dataColumn.readFloat();
      c.isNull[row] = false;
      // The vector stays "repeating" only while every value equals the one at index 0.
      if (c.isRepeating && c.vector[0] != c.vector[row]) {
        c.isRepeating = false;
      }
    }
  }

  /**
   * Reads {@code total} decimal values from the current page into {@code c}, starting at
   * index {@code rowId}. The vector's precision and scale are taken from the Parquet type's
   * decimal metadata. A position whose definition level is below the column maximum is
   * flagged as null instead of consuming a value.
   *
   * @param total number of values to read
   * @param c     target decimal vector
   * @param rowId index in {@code c} of the first value to write
   * @throws IOException declared for consistency with the other typed readers
   */
  private void readDecimal(
    int total,
    DecimalColumnVector c,
    int rowId) throws IOException {
    int left = total;
    c.precision = (short) type.asPrimitiveType().getDecimalMetadata().getPrecision();
    c.scale = (short) type.asPrimitiveType().getDecimalMetadata().getScale();
    while (left > 0) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel >= maxDefLevel) {
        c.vector[rowId].set(dataColumn.readBytes().getBytesUnsafe(), c.scale);
        c.isNull[rowId] = false;
        // The vector elements are objects, so compare them by value; a reference comparison
        // would only ever succeed for rowId == 0, unlike the primitive readers.
        c.isRepeating = c.isRepeating && c.vector[0].equals(c.vector[rowId]);
      } else {
        // Value is not defined at this position: record a null.
        c.isNull[rowId] = true;
        c.isRepeating = false;
        c.noNulls = false;
      }
      rowId++;
      left--;
    }
  }

  /**
   * Reads {@code total} binary values from the current page into {@code c}, starting at
   * index {@code rowId}. A position whose definition level is below the column maximum is
   * flagged as null instead of consuming a value. The vector is never left marked as
   * repeating.
   */
  private void readBinaries(
    int total,
    BytesColumnVector c,
    int rowId) throws IOException {
    for (int remaining = total, row = rowId; remaining > 0; remaining--, row++) {
      readRepetitionAndDefinitionLevels();
      if (definitionLevel < maxDefLevel) {
        // Value is not defined at this position: record a null.
        c.isNull[row] = true;
        c.isRepeating = false;
        c.noNulls = false;
        continue;
      }
      c.setVal(row, dataColumn.readBytes().getBytesUnsafe());
      c.isNull[row] = false;
      // TODO figure out a better way to set repeat for Binary type
      c.isRepeating = false;
    }
  }

  /**
   * Translates the dictionary ids in {@code dictionaryIds} into values of {@code column} for
   * the {@code num} rows starting at {@code rowId}, using this column's dictionary. The null
   * flags of those rows are copied from {@code dictionaryIds}; the decoding applied depends
   * on the Parquet primitive type of the column descriptor.
   *
   * @throws UnsupportedOperationException if the Parquet type has no decoding branch here
   */
  private void decodeDictionaryIds(
    int rowId,
    int num,
    ColumnVector column,
    LongColumnVector dictionaryIds) {
    System.arraycopy(dictionaryIds.isNull, rowId, column.isNull, rowId, num);
    // Both flags can only be cleared here, never set.
    column.noNulls = column.noNulls && dictionaryIds.noNulls;
    if (!dictionaryIds.isRepeating) {
      column.isRepeating = false;
    }

    final int end = rowId + num;
    switch (descriptor.getType()) {
    case INT32:
      for (int row = rowId; row < end; row++) {
        final int id = (int) dictionaryIds.vector[row];
        ((LongColumnVector) column).vector[row] = dictionary.decodeToInt(id);
      }
      break;
    case INT64:
      for (int row = rowId; row < end; row++) {
        final int id = (int) dictionaryIds.vector[row];
        ((LongColumnVector) column).vector[row] = dictionary.decodeToLong(id);
      }
      break;
    case FLOAT:
      for (int row = rowId; row < end; row++) {
        final int id = (int) dictionaryIds.vector[row];
        ((DoubleColumnVector) column).vector[row] = dictionary.decodeToFloat(id);
      }
      break;
    case DOUBLE:
      for (int row = rowId; row < end; row++) {
        final int id = (int) dictionaryIds.vector[row];
        ((DoubleColumnVector) column).vector[row] = dictionary.decodeToDouble(id);
      }
      break;
    case INT96: {
      // Local time should be used if no timezone is specified
      final Calendar calendar = Strings.isNullOrEmpty(this.conversionTimeZone)
        ? Calendar.getInstance()
        : Calendar.getInstance(TimeZone.getTimeZone(this.conversionTimeZone));
      for (int row = rowId; row < end; row++) {
        final int id = (int) dictionaryIds.vector[row];
        // The 12 bytes are read little-endian: 8 bytes of nanos-of-day, then a 4-byte Julian day.
        final ByteBuffer raw = dictionary.decodeToBinary(id).toByteBuffer();
        raw.order(ByteOrder.LITTLE_ENDIAN);
        final long nanosOfDay = raw.getLong();
        final int julianDay = raw.getInt();
        final Timestamp ts =
          NanoTimeUtils.getTimestamp(new NanoTime(julianDay, nanosOfDay), calendar);
        ((TimestampColumnVector) column).set(row, ts);
      }
      break;
    }
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
      if (column instanceof BytesColumnVector) {
        final BytesColumnVector bytesVector = (BytesColumnVector) column;
        for (int row = rowId; row < end; row++) {
          final int id = (int) dictionaryIds.vector[row];
          bytesVector.setVal(row, dictionary.decodeToBinary(id).getBytesUnsafe());
        }
      } else {
        // Any other target vector is treated as a decimal vector.
        final DecimalColumnVector decimals = (DecimalColumnVector) column;
        decimals.precision = (short) type.asPrimitiveType().getDecimalMetadata().getPrecision();
        decimals.scale = (short) type.asPrimitiveType().getDecimalMetadata().getScale();
        for (int row = rowId; row < end; row++) {
          final int id = (int) dictionaryIds.vector[row];
          decimals.vector[row].set(dictionary.decodeToBinary(id).getBytesUnsafe(), decimals.scale);
        }
      }
      break;
    default:
      throw new UnsupportedOperationException("Unsupported type: " + descriptor.getType());
    }
  }

  /**
   * Advances to the next value position: pulls the next repetition level and definition
   * level from their iterators and counts the position as read.
   */
  private void readRepetitionAndDefinitionLevels() {
    this.repetitionLevel = this.repetitionLevelColumn.nextInt();
    this.definitionLevel = this.definitionLevelColumn.nextInt();
    this.valuesRead += 1;
  }

  /**
   * Fetches the next data page from the page reader and initialises the level and value
   * readers for it, dispatching on the page format (V1 or V2).
   *
   * @throws IOException if the page reader has no further page to return
   */
  private void readPage() throws IOException {
    DataPage page = pageReader.readPage();
    if (page == null) {
      // The page reader signals exhaustion with null; fail with context instead of an NPE.
      throw new IOException(
        "no more pages to read in col " + descriptor + " after " + valuesRead + " values");
    }
    // TODO: Why is this a visitor?
    page.accept(new DataPage.Visitor<Void>() {
      @Override
      public Void visit(DataPageV1 dataPageV1) {
        readPageV1(dataPageV1);
        return null;
      }

      @Override
      public Void visit(DataPageV2 dataPageV2) {
        readPageV2(dataPageV2);
        return null;
      }
    });
  }

  /**
   * Prepares the value reader for a freshly loaded page: records the page's value count,
   * picks a dictionary-based or plain reader according to {@code dataEncoding}, and points
   * it at the value section of the page.
   *
   * @param dataEncoding encoding of the page's values
   * @param bytes        raw page bytes
   * @param offset       position in {@code bytes} where the values start
   * @param valueCount   number of values in the page
   * @throws IOException if a dictionary encoding is used without a dictionary, or the value
   *                     reader cannot be initialised from the page
   * @throws UnsupportedOperationException for a non-dictionary encoding other than PLAIN
   */
  private void initDataReader(Encoding dataEncoding, byte[] bytes, int offset, int valueCount) throws IOException {
    this.pageValueCount = valueCount;
    this.endOfPageValueCount = valuesRead + pageValueCount;

    final boolean dictionaryEncoded = dataEncoding.usesDictionary();
    if (dictionaryEncoded) {
      this.dataColumn = null;
      if (dictionary == null) {
        throw new IOException(
          "could not read page in col " + descriptor +
            " as the dictionary was missing for encoding " + dataEncoding);
      }
      this.dataColumn = dataEncoding.getDictionaryBasedValuesReader(descriptor, VALUES, dictionary);
    } else {
      // PLAIN is the only non-dictionary encoding handled by this reader.
      if (dataEncoding != Encoding.PLAIN) {
        throw new UnsupportedOperationException("Unsupported encoding: " + dataEncoding);
      }
      this.dataColumn = dataEncoding.getValuesReader(descriptor, VALUES);
    }
    this.isCurrentPageDictionaryEncoded = dictionaryEncoded;

    try {
      dataColumn.initFromPage(pageValueCount, bytes, offset);
    } catch (IOException e) {
      throw new IOException("could not read page in col " + descriptor, e);
    }
  }

  /**
   * Initialises the repetition-level, definition-level and value readers for a V1 data page.
   * In a V1 page the three sections share one byte array, so each reader is initialised at
   * the offset where the previous one ended.
   *
   * @param page the V1 data page to read
   * @throws ParquetDecodingException if any of the readers cannot be initialised
   */
  private void readPageV1(DataPageV1 page) {
    // Must be set before the level readers are initialised below: they receive the value
    // count of THIS page, not the count left over from the previously read page.
    this.pageValueCount = page.getValueCount();
    ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);
    ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL);
    this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
    this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
    try {
      byte[] bytes = page.getBytes().toByteArray();
      LOG.debug("page size {} bytes and {} records", bytes.length, pageValueCount);
      LOG.debug("reading repetition levels at 0");
      rlReader.initFromPage(pageValueCount, bytes, 0);
      int next = rlReader.getNextOffset();
      LOG.debug("reading definition levels at {}", next);
      dlReader.initFromPage(pageValueCount, bytes, next);
      next = dlReader.getNextOffset();
      LOG.debug("reading data at {}", next);
      initDataReader(page.getValueEncoding(), bytes, next, pageValueCount);
    } catch (IOException e) {
      throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
    }
  }

  /**
   * Initialises the repetition-level, definition-level and value readers for a V2 data page.
   * The two level sections are read through the iterators built by {@code newRLEIterator};
   * the value reader is initialised at offset 0 of the page's data section.
   *
   * @param page the V2 data page to read
   * @throws ParquetDecodingException if the levels or the values cannot be read
   */
  private void readPageV2(DataPageV2 page) {
    this.pageValueCount = page.getValueCount();

    final int maxRepLevel = descriptor.getMaxRepetitionLevel();
    this.repetitionLevelColumn = newRLEIterator(maxRepLevel, page.getRepetitionLevels());
    final int maxDefinitionLevel = descriptor.getMaxDefinitionLevel();
    this.definitionLevelColumn = newRLEIterator(maxDefinitionLevel, page.getDefinitionLevels());

    try {
      LOG.debug("page data size " + page.getData().size() + " bytes and " + pageValueCount + " records");
      final byte[] valueBytes = page.getData().toByteArray();
      initDataReader(page.getDataEncoding(), valueBytes, 0, page.getValueCount());
    } catch (IOException e) {
      throw new ParquetDecodingException("could not read page " + page + " in col " + descriptor, e);
    }
  }

  /**
   * Builds an iterator over the levels stored in {@code bytes}.
   *
   * @param maxLevel maximum level of the column; when 0 the bytes are not read and the
   *                 returned iterator always yields 0
   * @param bytes    encoded levels
   * @return an iterator that decodes the levels with a run-length/bit-packing hybrid decoder
   * @throws ParquetDecodingException if the level bytes cannot be obtained
   */
  private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
    if (maxLevel == 0) {
      return new NullIntIterator();
    }
    try {
      final int bitWidth = BytesUtils.getWidthFromMaxInt(maxLevel);
      final ByteArrayInputStream levelStream = new ByteArrayInputStream(bytes.toByteArray());
      return new RLEIntIterator(new RunLengthBitPackingHybridDecoder(bitWidth, levelStream));
    } catch (IOException e) {
      throw new ParquetDecodingException("could not read levels in page for col " + descriptor, e);
    }
  }

  /**
   * Common base of the small iterator classes that hide how a sequence of ints
   * (repetition or definition levels) is decoded from its underlying encoding.
   * TODO: remove this layer of abstraction?
   */
  abstract static class IntIterator {
    /**
     * @return the next int of the sequence
     */
    abstract int nextInt();
  }

  /**
   * {@link IntIterator} that takes each int from a {@link ValuesReader}.
   */
  protected static final class ValuesReaderIntIterator extends IntIterator {
    ValuesReader delegate;

    /**
     * @param reader values reader that supplies the ints
     */
    public ValuesReaderIntIterator(ValuesReader reader) {
      delegate = reader;
    }

    @Override
    int nextInt() {
      final int next = delegate.readInteger();
      return next;
    }
  }

  /**
   * {@link IntIterator} that takes each int from a run-length/bit-packing hybrid decoder,
   * rethrowing any I/O failure as a {@link ParquetDecodingException}.
   */
  protected static final class RLEIntIterator extends IntIterator {
    RunLengthBitPackingHybridDecoder delegate;

    /**
     * @param decoder decoder that supplies the ints
     */
    public RLEIntIterator(RunLengthBitPackingHybridDecoder decoder) {
      delegate = decoder;
    }

    @Override
    int nextInt() {
      final int next;
      try {
        next = delegate.readInt();
      } catch (IOException e) {
        throw new ParquetDecodingException(e);
      }
      return next;
    }
  }

  /**
   * {@link IntIterator} that reads nothing and yields 0 on every call.
   */
  protected static final class NullIntIterator extends IntIterator {
    @Override
    int nextInt() {
      final int constantLevel = 0;
      return constantLevel;
    }
  }
}