/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet;

import com.google.common.base.Strings;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetInputSplit;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ParquetRecordReaderBase {
  public static final Logger LOG = LoggerFactory.getLogger(ParquetRecordReaderBase.class);

  protected Path file;
  protected ProjectionPusher projectionPusher;
  protected SerDeStats serDeStats;
  protected JobConf jobConf;

  protected int schemaSize;

  protected List<BlockMetaData> filtedBlocks;
  protected ParquetFileReader reader;

  /**
   * Gets a ParquetInputSplit corresponding to a split given by Hive.
   *
   * @param oldSplit The split given by Hive
   * @param conf The JobConf of the Hive job
   * @return a ParquetInputSplit corresponding to oldSplit, or null if the split contains no
   *         row groups or all of its row groups are dropped by the filter predicate
   * @throws IOException if the config cannot be enhanced or if the footer cannot be read
   *         from the file
   */
  @SuppressWarnings("deprecation")
  protected ParquetInputSplit getSplit(
      final org.apache.hadoop.mapred.InputSplit oldSplit,
      final JobConf conf
  ) throws IOException {
    if (oldSplit == null) {
      return null;
    }

    if (oldSplit instanceof FileSplit) {
      final Path finalPath = ((FileSplit) oldSplit).getPath();
      jobConf = projectionPusher.pushProjectionsAndFilters(conf, finalPath.getParent());

      // TODO enable MetadataFilter by using readFooter(Configuration configuration, Path file,
      // MetadataFilter filter) API
      final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(jobConf, finalPath);
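      // The footer just read carries everything the conversion below relies on: the file
      // schema, the key/value metadata, and per-row-group (block) metadata such as row
      // counts and data page offsets, which are used to compute stats and to pick the row
      // groups covered by this split.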
      final List<BlockMetaData> blocks = parquetMetadata.getBlocks();
      final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

      final ReadSupport.ReadContext readContext = new DataWritableReadSupport()
          .init(new InitContext(jobConf, null, fileMetaData.getSchema()));

      // Compute stats
      for (BlockMetaData bmd : blocks) {
        serDeStats.setRowCount(serDeStats.getRowCount() + bmd.getRowCount());
        serDeStats.setRawDataSize(serDeStats.getRawDataSize() + bmd.getTotalByteSize());
      }

      schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata()
          .get(DataWritableReadSupport.HIVE_TABLE_AS_PARQUET_SCHEMA)).getFieldCount();

      // A row group belongs to this split if its first data page offset falls within the
      // split's byte range.
      final List<BlockMetaData> splitGroup = new ArrayList<BlockMetaData>();
      final long splitStart = ((FileSplit) oldSplit).getStart();
      final long splitLength = ((FileSplit) oldSplit).getLength();
      for (final BlockMetaData block : blocks) {
        final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
        if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
          splitGroup.add(block);
        }
      }
      if (splitGroup.isEmpty()) {
        LOG.warn("Skipping split, could not find row group in: " + oldSplit);
        return null;
      }

      FilterCompat.Filter filter = setFilter(jobConf, fileMetaData.getSchema());
      if (filter != null) {
        filtedBlocks = RowGroupFilter.filterRowGroups(filter, splitGroup, fileMetaData.getSchema());
        if (filtedBlocks.isEmpty()) {
          LOG.debug("All row groups are dropped due to filter predicates");
          return null;
        }

        long droppedBlocks = splitGroup.size() - filtedBlocks.size();
        if (droppedBlocks > 0) {
          LOG.debug("Dropping " + droppedBlocks + " row groups that do not pass filter predicate");
        }
      } else {
        filtedBlocks = splitGroup;
      }

      return new ParquetInputSplit(finalPath,
          splitStart,
          splitLength,
          oldSplit.getLocations(),
          filtedBlocks,
          readContext.getRequestedSchema().toString(),
          fileMetaData.getSchema().toString(),
          fileMetaData.getKeyValueMetaData(),
          readContext.getReadSupportMetadata());
    } else {
      throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
  }

  /**
   * Sets the TimeZone conversion for Parquet timestamp columns.
   *
   * @param configuration Configuration object on which to get and set the TimeZone conversion
   * @param finalPath path to the parquet file
   */
  protected void setTimeZoneConversion(Configuration configuration, Path finalPath) {
    ParquetMetadata parquetMetadata;
    String timeZoneID;

    try {
      parquetMetadata = ParquetFileReader.readFooter(configuration, finalPath,
          ParquetMetadataConverter.NO_FILTER);
    } catch (IOException e) {
      // If the footer cannot be read, just skip the TimeZone setting; the same error will
      // surface again in whatever part of the code reads this file next.
      LOG.debug("Could not read parquet file footer at " + finalPath + ". Cannot determine "
          + "parquet file timezone", e);
      return;
    }

    boolean skipConversion = HiveConf.getBoolVar(configuration,
        HiveConf.ConfVars.HIVE_PARQUET_TIMESTAMP_SKIP_CONVERSION);
    FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    if (!Strings.nullToEmpty(fileMetaData.getCreatedBy()).startsWith("parquet-mr")
        && skipConversion) {
      // Impala writes timestamp values using GMT only. We should not try to convert Impala
      // files to other timezones.
      timeZoneID = ParquetTableUtils.PARQUET_INT96_NO_ADJUSTMENT_ZONE;
    } else {
      // PARQUET_INT96_WRITE_ZONE_PROPERTY is a table property used to detect which timezone
      // conversion to use when reading Parquet timestamps.
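      // For illustration only: a table could opt into a specific conversion zone through a
      // table property whose key is given by ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY,
      // e.g. TBLPROPERTIES ('<write-zone-key>' = 'America/Los_Angeles') (hypothetical value).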
      timeZoneID = configuration.get(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY);
      NanoTimeUtils.validateTimeZone(timeZoneID);
    }

    // 'timeZoneID' is valid at this point: it is either the no-adjustment zone or it was
    // validated above without an exception being thrown.
    configuration.set(ParquetTableUtils.PARQUET_INT96_WRITE_ZONE_PROPERTY, timeZoneID);
  }

  public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {
    SearchArgument sarg = ConvertAstToSearchArg.createFromConf(conf);
    if (sarg == null) {
      return null;
    }

    // Create the Parquet FilterPredicate without including columns that do not exist
    // in the schema (such as partition columns).
    FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
    if (p != null) {
      // The filter may contain sensitive information, so log only that a predicate was
      // generated, never the predicate itself.
      LOG.debug("PARQUET predicate push down generated.");
      ParquetInputFormat.setFilterPredicate(conf, p);
      return FilterCompat.get(p);
    } else {
      // The filter may contain sensitive information, so log only that no predicate was
      // generated, never the filter itself.
      LOG.debug("No PARQUET predicate push down is generated.");
      return null;
    }
  }

  public List<BlockMetaData> getFiltedBlocks() {
    return filtedBlocks;
  }

  public SerDeStats getStats() {
    return serDeStats;
  }
}
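
// A minimal sketch of how a subclass might drive this base class. The subclass name and
// wiring below are hypothetical and for illustration only; the concrete readers in this
// package set these fields as part of their own construction:
//
//   public class MyParquetRecordReader extends ParquetRecordReaderBase {
//     MyParquetRecordReader(org.apache.hadoop.mapred.InputSplit hiveSplit, JobConf conf)
//         throws IOException {
//       projectionPusher = new ProjectionPusher();
//       serDeStats = new SerDeStats();                       // getSplit() accumulates stats here
//       setTimeZoneConversion(conf, ((FileSplit) hiveSplit).getPath());
//       ParquetInputSplit split = getSplit(hiveSplit, conf); // null means the split is skipped
//     }
//   }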