MapredParquetInputFormat.java example

Explorer
hive-master
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet;

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.RecordReader;

import org.apache.parquet.hadoop.ParquetInputFormat;


/**
 *
 * A Parquet InputFormat for Hive (with the deprecated package mapred)
 *
 * NOTE: With HIVE-9235 we removed "implements VectorizedParquetInputFormat" since all data types
 *       are not currently supported.  Removing the interface turns off vectorization.
 */
public class MapredParquetInputFormat extends FileInputFormat<NullWritable, ArrayWritable>
  implements VectorizedInputFormatInterface {

  private static final Logger LOG = LoggerFactory.getLogger(MapredParquetInputFormat.class);

  private final ParquetInputFormat<ArrayWritable> realInput;

  private final transient VectorizedParquetInputFormat vectorizedSelf;

  public MapredParquetInputFormat() {
    this(new ParquetInputFormat<ArrayWritable>(DataWritableReadSupport.class));
  }

  protected MapredParquetInputFormat(final ParquetInputFormat<ArrayWritable> inputFormat) {
    this.realInput = inputFormat;
    vectorizedSelf = new VectorizedParquetInputFormat();
  }

  @SuppressWarnings({ "unchecked", "rawtypes" })
  @Override
  public org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> getRecordReader(
      final org.apache.hadoop.mapred.InputSplit split,
      final org.apache.hadoop.mapred.JobConf job,
      final org.apache.hadoop.mapred.Reporter reporter
      ) throws IOException {

    propagateParquetTimeZoneTablePorperty((FileSplit) split, job);

    try {
      if (Utilities.getUseVectorizedInputFileFormat(job)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Using vectorized record reader");
        }
        return (RecordReader) vectorizedSelf.getRecordReader(split, job, reporter);
      }
      else {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Using row-mode record reader");
        }
        return new ParquetRecordReaderWrapper(realInput, split, job, reporter);
      }
    } catch (final InterruptedException e) {
      throw new RuntimeException("Cannot create a RecordReaderWrapper", e);
    }
  }

  /**
   * Tries to find the table belonging to the file path of the split.
   * If the table can be determined, the parquet timezone property will be propagated
   * to the job configuration to be used during reading.
   * If the table cannot be determined, then do nothing.
   * @param split file split being read
   * @param job configuration to set the timezone property on
   */
  private void propagateParquetTimeZoneTablePorperty(FileSplit split, JobConf job) {
    PartitionDesc part = null;
    Path filePath = split.getPath();
    try {
      MapWork mapWork = Utilities.getMapWork(job);
      if(mapWork != null) {
        LOG.debug("Trying to find partition in MapWork for path " + filePath);
        Map<Path, PartitionDesc> pathToPartitionInfo = mapWork.getPathToPartitionInfo();

        part = HiveFileFormatUtils
            .getPartitionDescFromPathRecursively(pathToPartitionInfo, filePath, null);
        LOG.debug("Partition found " + part);
      }
    } catch (AssertionError ae) {
      LOG.warn("Cannot get partition description from " + filePath
          + " because " + ae.getMessage());
      part = null;
    } catch (Exception e) {
      LOG.warn("Cannot get partition description from " + filePath
          + " because " + e.getMessage());
      part = null;
    }

    if (part != null && part.getTableDesc() != null) {
      ParquetTableUtils.setParquetTimeZoneIfAbsent(job, part.getTableDesc().getProperties());
    }
  }
}