/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.llap.io.api.impl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.concurrent.ExecutorService;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.llap.io.decode.ColumnVectorProducer;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedInputFormatInterface;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.io.BatchToRowInputFormat;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.AvoidSplitCombination;
import org.apache.hadoop.hive.ql.io.LlapAwareSplit;
import org.apache.hadoop.hive.ql.io.SelfDescribingInputFormatInterface;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hive.common.util.HiveStringUtils;
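
/**
 * An InputFormat wrapper that serves reads through the LLAP IO elevator where possible, and
 * falls back to the wrapped source InputFormat for splits or queries that LLAP IO cannot
 * handle. For non-vectorized consumers, batches can be converted back to rows when the source
 * format implements {@link BatchToRowInputFormat}.
 *
 * <p>Instances are created by the LLAP IO layer rather than constructed directly; an
 * illustrative (not the only) way to obtain the wrapping format is:
 * <pre>{@code
 * InputFormat<NullWritable, VectorizedRowBatch> llapIf =
 *     LlapProxy.getIo().getInputFormat(sourceInputFormat, sourceSerDe);
 * }</pre>
 */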
public class LlapInputFormat implements InputFormat<NullWritable, VectorizedRowBatch>,
VectorizedInputFormatInterface, SelfDescribingInputFormatInterface,
AvoidSplitCombination {
private static final String NONVECTOR_SETTING_MESSAGE = "disable "
+ ConfVars.LLAP_IO_NONVECTOR_WRAPPER_ENABLED.varname + " to work around this error";
@SuppressWarnings("rawtypes")
private final InputFormat sourceInputFormat;
private final AvoidSplitCombination sourceASC;
@SuppressWarnings("deprecation")
private final Deserializer sourceSerDe;
final ColumnVectorProducer cvp;
final ExecutorService executor;
private final String hostName;
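
  /**
   * @param sourceInputFormat the wrapped InputFormat; used for split generation and as the
   *        fallback reader when LLAP IO does not apply.
   * @param sourceSerDe the SerDe of the underlying data; passed through to the LLAP record
   *        reader.
   * @param cvp produces the decoded column vectors consumed by the readers.
   * @param executor runs the IO tasks for the readers created by this format.
   */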
@SuppressWarnings("rawtypes")
LlapInputFormat(InputFormat sourceInputFormat, Deserializer sourceSerDe,
ColumnVectorProducer cvp, ExecutorService executor) {
this.executor = executor;
this.cvp = cvp;
this.sourceInputFormat = sourceInputFormat;
this.sourceASC = (sourceInputFormat instanceof AvoidSplitCombination)
? (AvoidSplitCombination)sourceInputFormat : null;
this.sourceSerDe = sourceSerDe;
this.hostName = HiveStringUtils.getHostname();
}
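
  /**
   * Creates a reader for the split: an LLAP IO reader (possibly wrapped for row-based
   * consumers) when the split and configuration allow it, or a reader from the wrapped
   * source InputFormat otherwise.
   */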
@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(
InputSplit split, JobConf job, Reporter reporter) throws IOException {
RecordReader<NullWritable, VectorizedRowBatch> noLlap = checkLlapSplit(split, job, reporter);
if (noLlap != null) return noLlap;
boolean isVectorized = Utilities.getUseVectorizedInputFileFormat(job);
FileSplit fileSplit = (FileSplit) split;
reporter.setStatus(fileSplit.toString());
try {
List<Integer> includedCols = ColumnProjectionUtils.isReadAllColumns(job)
? null : ColumnProjectionUtils.getReadColumnIDs(job);
LlapRecordReader rr = new LlapRecordReader(job, fileSplit, includedCols, hostName, cvp,
executor, sourceInputFormat, sourceSerDe, reporter);
      if (!rr.init()) {
        // The LLAP reader could not initialize for this split; fall back to the source format.
        return sourceInputFormat.getRecordReader(split, job, reporter);
      }
return wrapLlapReader(isVectorized, includedCols, rr, split, job, reporter);
} catch (Exception ex) {
throw new IOException(ex);
}
}
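
  /**
   * Adapts the LLAP record reader to the consumer: returns it as-is to vectorized consumers,
   * wraps it in a batch-to-row converter when the source format supports one, and otherwise
   * falls back to the source InputFormat's own reader.
   */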
public RecordReader<NullWritable, VectorizedRowBatch> wrapLlapReader(
boolean isVectorized, List<Integer> includedCols, LlapRecordReader rr,
InputSplit split, JobConf job, Reporter reporter) throws IOException {
    // The consumer expects vectorized row batches; hand back the LLAP reader directly.
if (isVectorized) {
return rr;
} else if (sourceInputFormat instanceof BatchToRowInputFormat) {
LlapIoImpl.LOG.info("Using batch-to-row converter for split: " + split);
return bogusCast(((BatchToRowInputFormat) sourceInputFormat).getWrapper(
rr, rr.getVectorizedRowBatchCtx(), includedCols));
} else {
LlapIoImpl.LOG.warn("Not using LLAP IO for an unsupported split: " + split);
return sourceInputFormat.getRecordReader(split, job, reporter);
}
}
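
  /**
   * Checks whether LLAP IO can be used for the split. Returns null when it can (the caller
   * should proceed with the LLAP path), or a fallback reader from the source InputFormat when
   * the split opts out via {@link LlapAwareSplit}.
   */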
public RecordReader<NullWritable, VectorizedRowBatch> checkLlapSplit(
InputSplit split, JobConf job, Reporter reporter) throws IOException {
boolean useLlapIo = true;
if (split instanceof LlapAwareSplit) {
useLlapIo = ((LlapAwareSplit) split).canUseLlapIo();
}
if (useLlapIo) return null;
LlapIoImpl.LOG.warn("Not using LLAP IO for an unsupported split: " + split);
return sourceInputFormat.getRecordReader(split, job, reporter);
}

  // Returning either a vectorized or non-vectorized reader from the same call requires breaking
  // generics; this is how vectorization currently works.
@SuppressWarnings("unchecked")
private static <A, B, C, D> RecordReader<A, B> bogusCast(RecordReader<C, D> rr) {
return (RecordReader<A, B>)rr;
}
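
  /** Split generation is delegated unchanged to the wrapped source InputFormat. */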
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
return sourceInputFormat.getSplits(job, numSplits);
}
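
  /** Delegates to the source InputFormat if it supports split-combination avoidance. */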
@Override
public boolean shouldSkipCombine(Path path, Configuration conf) throws IOException {
    return sourceASC != null && sourceASC.shouldSkipCombine(path, conf);
}
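
  /**
   * Builds a VectorizedRowBatchCtx for a plan that was not vectorized, so LLAP IO can still
   * produce vectorized batches; "fake" because it is reconstructed from the TableScan schema
   * and the first partition descriptor rather than taken from a vectorized plan.
   */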
static VectorizedRowBatchCtx createFakeVrbCtx(MapWork mapWork) throws HiveException {
// This is based on Vectorizer code, minus the validation.
// Add all non-virtual columns from the TableScan operator.
RowSchema rowSchema = findTsOp(mapWork).getSchema();
final List<String> colNames = new ArrayList<String>(rowSchema.getSignature().size());
final List<TypeInfo> colTypes = new ArrayList<TypeInfo>(rowSchema.getSignature().size());
for (ColumnInfo c : rowSchema.getSignature()) {
String columnName = c.getInternalName();
if (VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(columnName)) continue;
colNames.add(columnName);
colTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(c.getTypeName()));
}
// Determine the partition columns using the first partition descriptor.
// Note - like vectorizer, this assumes partition columns go after data columns.
int partitionColumnCount = 0;
Iterator<Path> paths = mapWork.getPathToAliases().keySet().iterator();
if (paths.hasNext()) {
PartitionDesc partDesc = mapWork.getPathToPartitionInfo().get(paths.next());
if (partDesc != null) {
LinkedHashMap<String, String> partSpec = partDesc.getPartSpec();
        if (partSpec != null && !partSpec.isEmpty()) {
partitionColumnCount = partSpec.size();
}
}
}
return new VectorizedRowBatchCtx(colNames.toArray(new String[colNames.size()]),
colTypes.toArray(new TypeInfo[colTypes.size()]), null, partitionColumnCount, new String[0]);
}
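
  /**
   * Finds the single TableScanOperator in the MapWork. Fails if there is not exactly one,
   * since the fake VRB context can only be derived from a single table scan.
   */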
static TableScanOperator findTsOp(MapWork mapWork) throws HiveException {
if (mapWork.getAliasToWork() == null) {
throw new HiveException("Unexpected - aliasToWork is missing; " + NONVECTOR_SETTING_MESSAGE);
}
Iterator<Operator<?>> ops = mapWork.getAliasToWork().values().iterator();
TableScanOperator tableScanOperator = null;
while (ops.hasNext()) {
Operator<?> op = ops.next();
if (op instanceof TableScanOperator) {
if (tableScanOperator != null) {
throw new HiveException("Unexpected - more than one TSOP; " + NONVECTOR_SETTING_MESSAGE);
}
tableScanOperator = (TableScanOperator)op;
}
}
    if (tableScanOperator == null) {
      throw new HiveException("Unexpected - no TableScanOperator; " + NONVECTOR_SETTING_MESSAGE);
    }
    return tableScanOperator;
}
}