/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.parquet;

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetTableUtils;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A Parquet InputFormat for Hive (using the deprecated mapred package).
 *
 * NOTE: With HIVE-9235 we removed "implements VectorizedParquetInputFormat" since not all data
 * types are currently supported. Removing the interface turns off vectorization.
 */
public class MapredParquetInputFormat extends FileInputFormat<NullWritable, ArrayWritable>
    implements VectorizedInputFormatInterface {

  private static final Logger LOG = LoggerFactory.getLogger(MapredParquetInputFormat.class);

  private final ParquetInputFormat<ArrayWritable> realInput;

  private final transient VectorizedParquetInputFormat vectorizedSelf;

  public MapredParquetInputFormat() {
    this(new ParquetInputFormat<ArrayWritable>(DataWritableReadSupport.class));
  }

  protected MapredParquetInputFormat(final ParquetInputFormat<ArrayWritable> inputFormat) {
    this.realInput = inputFormat;
    vectorizedSelf = new VectorizedParquetInputFormat();
  }

  @SuppressWarnings({ "unchecked", "rawtypes" })
  @Override
  public org.apache.hadoop.mapred.RecordReader<NullWritable, ArrayWritable> getRecordReader(
      final org.apache.hadoop.mapred.InputSplit split,
      final org.apache.hadoop.mapred.JobConf job,
      final org.apache.hadoop.mapred.Reporter reporter) throws IOException {
    propagateParquetTimeZoneTableProperty((FileSplit) split, job);
    try {
      if (Utilities.getUseVectorizedInputFileFormat(job)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Using vectorized record reader");
        }
        return (RecordReader) vectorizedSelf.getRecordReader(split, job, reporter);
      } else {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Using row-mode record reader");
        }
        return new ParquetRecordReaderWrapper(realInput, split, job, reporter);
      }
    } catch (final InterruptedException e) {
      throw new RuntimeException("Cannot create a RecordReaderWrapper", e);
    }
  }
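  /*
   * A minimal usage sketch (illustrative only; Hive normally instantiates this
   * InputFormat itself through the table's storage descriptor, and the split
   * and reporter below are assumed to come from the MapReduce framework):
   *
   *   JobConf job = new JobConf();
   *   job.setInputFormat(MapredParquetInputFormat.class);
   *   FileInputFormat.addInputPath(job, new Path("/warehouse/my_table"));  // hypothetical path
   *   // Whether getRecordReader hands back the vectorized reader or the
   *   // row-mode ParquetRecordReaderWrapper is decided per job by
   *   // Utilities.getUseVectorizedInputFileFormat(job).
   */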
  /**
   * Tries to find the table belonging to the file path of the split.
   * If the table can be determined, the parquet timezone property will be propagated
   * to the job configuration to be used during reading.
   * If the table cannot be determined, nothing is done.
   * @param split file split being read
   * @param job configuration to set the timezone property on
   */
  private void propagateParquetTimeZoneTableProperty(FileSplit split, JobConf job) {
    PartitionDesc part = null;
    Path filePath = split.getPath();
    try {
      MapWork mapWork = Utilities.getMapWork(job);
      if (mapWork != null) {
        LOG.debug("Trying to find partition in MapWork for path {}", filePath);
        Map<Path, PartitionDesc> pathToPartitionInfo = mapWork.getPathToPartitionInfo();
        part = HiveFileFormatUtils
            .getPartitionDescFromPathRecursively(pathToPartitionInfo, filePath, null);
        LOG.debug("Partition found {}", part);
      }
    } catch (AssertionError | Exception e) {
      LOG.warn("Cannot get partition description from {} because {}", filePath, e.getMessage());
      part = null;
    }

    if (part != null && part.getTableDesc() != null) {
      ParquetTableUtils.setParquetTimeZoneIfAbsent(job, part.getTableDesc().getProperties());
    }
  }
}
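// A minimal sketch of what the propagation above amounts to. The property key
// is an assumption: Hive versions carrying HIVE-12767 use the INT96 write-zone
// key "parquet.mr.int96.write.zone" (check ParquetTableUtils in your release),
// and setParquetTimeZoneIfAbsent presumably copies the table-level value into
// the job configuration only when the job does not already define it:
//
//   String key = "parquet.mr.int96.write.zone";  // assumed property key
//   String zone = part.getTableDesc().getProperties().getProperty(key);
//   if (zone != null && job.get(key) == null) {
//     job.set(key, zone);
//   }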