/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * This template generates a distinct Hive record reader class for each data format
 * to avoid JIT profile pollution. The generated readers derive from HiveAbstractReader,
 * which implements the init and setup stages, but the repeated - and performance
 * critical - next() method is implemented separately in each generated class. The
 * internal SkipRecordsInspector class is separated out for the same reason.
 *
 * For the performance gain achieved by this change, see:
 * https://issues.apache.org/jira/browse/DRILL-4982
 */
<@pp.dropOutputFile />
<#list hiveFormat.map as entry>
<@pp.changeOutputFile name="/org/apache/drill/exec/store/hive/Hive${entry.hiveReader}Reader.java" />
<#include "/@includes/license.ftl" />

package org.apache.drill.exec.store.hive;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

import org.apache.drill.common.exceptions.DrillRuntimeException;
import org.apache.drill.common.exceptions.ExecutionSetupException;
import org.apache.drill.common.expression.SchemaPath;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.vector.AllocationHelper;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.mapred.RecordReader;

<#if entry.hasHeaderFooter == true>
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.serde.serdeConstants;
</#if>

public class Hive${entry.hiveReader}Reader extends HiveAbstractReader {

  Object key;
<#if entry.hasHeaderFooter == true>
  SkipRecordsInspector skipRecordsInspector;
<#else>
  Object value;
</#if>

  public Hive${entry.hiveReader}Reader(HiveTableWithColumnCache table, HivePartition partition,
                       InputSplit inputSplit, List<SchemaPath> projectedColumns, FragmentContext context,
                       final HiveConf hiveConf, UserGroupInformation proxyUgi) throws ExecutionSetupException {
    super(table, partition, inputSplit, projectedColumns, context, hiveConf, proxyUgi);
  }

  public void internalInit(Properties tableProperties, RecordReader<Object, Object> reader) {

    key = reader.createKey();
<#if entry.hasHeaderFooter == true>
    skipRecordsInspector = new SkipRecordsInspector(tableProperties, reader);
<#else>
    value = reader.createValue();
</#if>

  }
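<#--
  Illustrative note (the entry names are assumptions, not taken from the real codegen
  data file): an entry such as { hiveReader: "Text", hasHeaderFooter: true } yields a
  class HiveTextReader whose internalInit() above wires in a SkipRecordsInspector,
  while { hiveReader: "Default", hasHeaderFooter: false } yields HiveDefaultReader
  with the plain key/value pair and the simpler next() implementation below.
-->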
  private void readHiveRecordAndInsertIntoRecordBatch(Object deSerializedValue, int outputRecordIndex) {
    for (int i = 0; i < selectedStructFieldRefs.size(); i++) {
      Object hiveValue = finalOI.getStructFieldData(deSerializedValue, selectedStructFieldRefs.get(i));
      if (hiveValue != null) {
        selectedColumnFieldConverters.get(i).setSafeValue(selectedColumnObjInspectors.get(i), hiveValue,
            vectors.get(i), outputRecordIndex);
      }
    }
  }

<#if entry.hasHeaderFooter == true>
  @Override
  public int next() {
    for (ValueVector vv : vectors) {
      AllocationHelper.allocateNew(vv, TARGET_RECORD_COUNT);
    }
    if (empty) {
      setValueCountAndPopulatePartitionVectors(0);
      return 0;
    }

    try {
      skipRecordsInspector.reset();
      Object value;

      int recordCount = 0;

      while (recordCount < TARGET_RECORD_COUNT && reader.next(key, value = skipRecordsInspector.getNextValue())) {
        if (skipRecordsInspector.doSkipHeader(recordCount++)) {
          continue;
        }
        Object bufferedValue = skipRecordsInspector.bufferAdd(value);
        if (bufferedValue != null) {
          Object deSerializedValue = partitionSerDe.deserialize((Writable) bufferedValue);
          if (partTblObjectInspectorConverter != null) {
            deSerializedValue = partTblObjectInspectorConverter.convert(deSerializedValue);
          }
          readHiveRecordAndInsertIntoRecordBatch(deSerializedValue, skipRecordsInspector.getActualCount());
          skipRecordsInspector.incrementActualCount();
        }
        skipRecordsInspector.incrementTempCount();
      }

      setValueCountAndPopulatePartitionVectors(skipRecordsInspector.getActualCount());
      skipRecordsInspector.updateContinuance();
      return skipRecordsInspector.getActualCount();
    } catch (IOException | SerDeException e) {
      throw new DrillRuntimeException(e);
    }
  }
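  // Worked example of the footer-delay logic in SkipRecordsInspector below
  // (illustrative, assuming footerCount = 2): records r1..r5 arrive in order;
  // r1 and r2 are only buffered (bufferAdd returns null), r3 evicts and emits r1,
  // r4 emits r2, r5 emits r3. When the file ends, r4 and r5 - the footer lines -
  // are still in the buffer and never reach the record batch.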
  /**
   * SkipRecordsInspector encapsulates the logic to skip the header and footer of a file.
   * The logic applies only to the file formats predefined in the constructor.
   */
  protected class SkipRecordsInspector {

    private final Set<Object> fileFormats;
    private int headerCount;
    private int footerCount;
    private Queue<Object> footerBuffer;
    // indicates whether we continue reading the same file
    private boolean continuance;
    private int holderIndex;
    private List<Object> valueHolder;
    private int actualCount;
    // actualCount without headerCount, used to determine holderIndex
    private int tempCount;

    protected SkipRecordsInspector(Properties tableProperties, RecordReader reader) {
      this.fileFormats = new HashSet<Object>(Arrays.asList(org.apache.hadoop.mapred.TextInputFormat.class.getName()));
      this.headerCount = retrievePositiveIntProperty(tableProperties, serdeConstants.HEADER_COUNT, 0);
      this.footerCount = retrievePositiveIntProperty(tableProperties, serdeConstants.FOOTER_COUNT, 0);
      logger.debug("skipRecordsInspector: fileFormat {}, headerCount {}, footerCount {}",
          this.fileFormats, this.headerCount, this.footerCount);
      this.footerBuffer = Lists.newLinkedList();
      this.continuance = false;
      this.holderIndex = -1;
      this.valueHolder = initializeValueHolder(reader, footerCount);
      this.actualCount = 0;
      this.tempCount = 0;
    }

    protected boolean doSkipHeader(int recordCount) {
      return !continuance && recordCount < headerCount;
    }

    protected void reset() {
      tempCount = holderIndex + 1;
      actualCount = 0;
      if (!continuance) {
        footerBuffer.clear();
      }
    }

    protected Object bufferAdd(Object value) throws SerDeException {
      footerBuffer.add(value);
      if (footerBuffer.size() <= footerCount) {
        return null;
      }
      return footerBuffer.poll();
    }

    protected Object getNextValue() {
      holderIndex = tempCount % getHolderSize();
      return valueHolder.get(holderIndex);
    }

    private int getHolderSize() {
      return valueHolder.size();
    }

    protected void updateContinuance() {
      this.continuance = actualCount != 0;
    }

    protected int incrementTempCount() {
      return ++tempCount;
    }

    protected int getActualCount() {
      return actualCount;
    }

    protected int incrementActualCount() {
      return ++actualCount;
    }

    /**
     * Retrieves a positive numeric property from the Properties object by name.
     * Returns the default value if
     * 1. the file format is absent from the predefined file formats list
     * 2. the property doesn't exist in the table properties
     * 3. the property value is negative
     * otherwise casts the value to int.
     *
     * @param tableProperties property holder
     * @param propertyName    name of the property
     * @param defaultValue    default value
     * @return property numeric value
     * @throws NumberFormatException if the property value is non-numeric
     */
    protected int retrievePositiveIntProperty(Properties tableProperties, String propertyName, int defaultValue) {
      int propertyIntValue = defaultValue;
      if (!fileFormats.contains(tableProperties.get(hive_metastoreConstants.FILE_INPUT_FORMAT))) {
        return propertyIntValue;
      }
      Object propertyObject = tableProperties.get(propertyName);
      if (propertyObject != null) {
        try {
          propertyIntValue = Integer.valueOf((String) propertyObject);
        } catch (NumberFormatException e) {
          throw new NumberFormatException(String.format("Hive table property %s value '%s' is non-numeric",
              propertyName, propertyObject.toString()));
        }
      }
      return propertyIntValue < 0 ? defaultValue : propertyIntValue;
    }
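    // Note on value reuse (see initializeValueHolder below): getNextValue() walks
    // valueHolder as a ring of footerCount + 1 objects via tempCount % size, so once
    // the footer buffer is full, the slot handed back to the reader is the one whose
    // record was just evicted and emitted, never one of the footerCount objects still
    // queued in footerBuffer awaiting a footer decision.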
    /**
     * Creates a buffer of objects to be used as values, so these values can be re-used.
     * The number of objects depends on the number of lines to skip at the end of the file, plus one.
     *
     * @param reader          RecordReader used to create the value objects
     * @param skipFooterLines number of lines to skip at the end of the file
     * @return list of objects to be used as values
     */
    private List<Object> initializeValueHolder(RecordReader reader, int skipFooterLines) {
      List<Object> valueHolder = new ArrayList<>(skipFooterLines + 1);
      for (int i = 0; i <= skipFooterLines; i++) {
        valueHolder.add(reader.createValue());
      }
      return valueHolder;
    }
  }

<#else>
  @Override
  public int next() {
    for (ValueVector vv : vectors) {
      AllocationHelper.allocateNew(vv, TARGET_RECORD_COUNT);
    }
    if (empty) {
      setValueCountAndPopulatePartitionVectors(0);
      return 0;
    }

    try {
      int recordCount = 0;
      while (recordCount < TARGET_RECORD_COUNT && reader.next(key, value)) {
        Object deSerializedValue = partitionSerDe.deserialize((Writable) value);
        if (partTblObjectInspectorConverter != null) {
          deSerializedValue = partTblObjectInspectorConverter.convert(deSerializedValue);
        }
        readHiveRecordAndInsertIntoRecordBatch(deSerializedValue, recordCount);
        recordCount++;
      }
      setValueCountAndPopulatePartitionVectors(recordCount);
      return recordCount;
    } catch (IOException | SerDeException e) {
      throw new DrillRuntimeException(e);
    }
  }
</#if>

}
</#list>
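<#--
  A minimal sketch of the codegen data model this template expects, inferred from how
  the template dereferences hiveFormat.map and entry.hiveReader / entry.hasHeaderFooter
  (the entry values are illustrative assumptions, not a copy of the real data file):

    hiveFormat: {
      map: [
        { hiveReader: "Text",    hasHeaderFooter: true  },
        { hiveReader: "Default", hasHeaderFooter: false }
      ]
    }

  Each entry produces one generated Hive<hiveReader>Reader.java source file.
-->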