VectorSMBMapJoinOperator.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter;
import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.SMBJoinDesc;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;

/**
 * VectorSMBJoinOperator.
 * Implements the vectorized SMB join operator. The implementation relies on the row-mode SMB join operator.
 * It accepts a vectorized batch input from the big table and iterates over the batch, calling the parent row-mode
 * implementation for each row in the batch.
 */
public class VectorSMBMapJoinOperator extends SMBMapJoinOperator implements VectorizationContextRegion {

  private static final Logger LOG = LoggerFactory.getLogger(
      VectorSMBMapJoinOperator.class.getName());

  private static final long serialVersionUID = 1L;

  private VectorExpression[] bigTableValueExpressions;

  private VectorExpression[] bigTableFilterExpressions;

  private VectorExpression[] keyExpressions;

  private VectorExpressionWriter[] keyOutputWriters;

  private VectorizationContext vOutContext;

  // The above members are initialized by the constructor and must not be
  // transient.
  //---------------------------------------------------------------------------

  private transient VectorizedRowBatch outputBatch;

  private transient VectorizedRowBatchCtx vrbCtx = null;

  private transient VectorHashKeyWrapperBatch keyWrapperBatch;

  private transient Map<ObjectInspector, VectorAssignRow> outputVectorAssignRowMap;

  private transient int batchIndex = -1;

  private transient VectorHashKeyWrapper[] keyValues;

  private transient SMBJoinKeyEvaluator keyEvaluator;

  private transient VectorExpressionWriter[] valueWriters;

  private interface SMBJoinKeyEvaluator {
    List<Object> evaluate(VectorHashKeyWrapper kw) throws HiveException;
}

  /** Kryo ctor. */
  @VisibleForTesting
  public VectorSMBMapJoinOperator() {
    super();
  }

  public VectorSMBMapJoinOperator(CompilationOpContext ctx) {
    super(ctx);
  }

  public VectorSMBMapJoinOperator(CompilationOpContext ctx,
      VectorizationContext vContext, OperatorDesc conf) throws HiveException {
    this(ctx);
    SMBJoinDesc desc = (SMBJoinDesc) conf;
    this.conf = desc;

    order = desc.getTagOrder();
    numAliases = desc.getExprs().size();
    posBigTable = (byte) desc.getPosBigTable();
    filterMaps = desc.getFilterMap();
    noOuterJoin = desc.isNoOuterJoin();

    // Must obtain vectorized equivalents for filter and value expressions

    Map<Byte, List<ExprNodeDesc>> filterExpressions = desc.getFilters();
    bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable),
        VectorExpressionDescriptor.Mode.FILTER);

    List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable);
    keyExpressions = vContext.getVectorExpressions(keyDesc);
    keyOutputWriters = VectorExpressionWriterFactory.getExpressionWriters(keyDesc);

    Map<Byte, List<ExprNodeDesc>> exprs = desc.getExprs();
    bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable));

    // We are making a new output vectorized row batch.
    vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames(),
        /* vContextEnvironment */ vContext);
  }

  @Override
  protected List<Object> smbJoinComputeKeys(Object row, byte alias) throws HiveException {
    if (alias == this.posBigTable) {

      // The keyEvaluate reuses storage.  That doesn't work with SMB MapJoin because it
      // holds references to keys as it is merging.
      List<Object> singletonListAndObjects = keyEvaluator.evaluate(keyValues[batchIndex]);
      ArrayList<Object> result = new ArrayList<Object>(singletonListAndObjects.size());
      for (int i = 0; i < singletonListAndObjects.size(); i++) {
        result.add(ObjectInspectorUtils.copyToStandardObject(singletonListAndObjects.get(i),
            joinKeysObjectInspectors[alias].get(i),
            ObjectInspectorCopyOption.WRITABLE));
      }
      return result;
    } else {
      return super.smbJoinComputeKeys(row, alias);
    }
  }

  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);

    vrbCtx = new VectorizedRowBatchCtx();
    vrbCtx.init((StructObjectInspector) this.outputObjInspector, vOutContext.getScratchColumnTypeNames());

    outputBatch = vrbCtx.createVectorizedRowBatch();

    keyWrapperBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions);

    outputVectorAssignRowMap = new HashMap<ObjectInspector, VectorAssignRow>();

    // This key evaluator translates from the vectorized VectorHashKeyWrapper format
    // into the row-mode MapJoinKey
    keyEvaluator = new SMBJoinKeyEvaluator() {
      private List<Object> key;

      public SMBJoinKeyEvaluator init() {
        key = new ArrayList<Object>();
        for(int i = 0; i < keyExpressions.length; ++i) {
          key.add(null);
        }
        return this;
      }

      @Override
      public List<Object> evaluate(VectorHashKeyWrapper kw) throws HiveException {
        for(int i = 0; i < keyExpressions.length; ++i) {
          key.set(i, keyWrapperBatch.getWritableKeyValue(kw, i, keyOutputWriters[i]));
        }
        return key;
      };
    }.init();

    Map<Byte, List<ExprNodeDesc>> valueExpressions = conf.getExprs();
    List<ExprNodeDesc> bigTableExpressions = valueExpressions.get(posBigTable);

    // We're hijacking the big table evaluators and replacing them with our own custom ones
    // which are going to return values from the input batch vector expressions
    List<ExprNodeEvaluator> vectorNodeEvaluators = new ArrayList<ExprNodeEvaluator>(bigTableExpressions.size());

    VectorExpressionWriterFactory.processVectorExpressions(
        bigTableExpressions,
        new VectorExpressionWriterFactory.ListOIDClosure() {

          @Override
          public void assign(VectorExpressionWriter[] writers, List<ObjectInspector> oids) {
            valueWriters = writers;
            joinValuesObjectInspectors[posBigTable] = oids;
          }
        });

    for(int i=0; i<bigTableExpressions.size(); ++i) {
      ExprNodeDesc desc = bigTableExpressions.get(i);
      VectorExpression vectorExpr = bigTableValueExpressions[i];

      // This is a vectorized aware evaluator
      ExprNodeEvaluator eval = new ExprNodeEvaluator<ExprNodeDesc>(desc, hconf) {
        int columnIndex;;
        int writerIndex;

        public ExprNodeEvaluator initVectorExpr(int columnIndex, int writerIndex) {
          this.columnIndex = columnIndex;
          this.writerIndex = writerIndex;
          return this;
        }

        @Override
        public ObjectInspector initialize(ObjectInspector rowInspector) throws HiveException {
          throw new HiveException("should never reach here");
        }

        @Override
        protected Object _evaluate(Object row, int version) throws HiveException {
          VectorizedRowBatch inBatch = (VectorizedRowBatch) row;
          int rowIndex = inBatch.selectedInUse ? inBatch.selected[batchIndex] : batchIndex;
          return valueWriters[writerIndex].writeValue(inBatch.cols[columnIndex], rowIndex);
        }
      }.initVectorExpr(vectorExpr.getOutputColumn(), i);
      vectorNodeEvaluators.add(eval);
    }
    // Now replace the old evaluators with our own
    joinValues[posBigTable] = vectorNodeEvaluators;
  }

  @Override
  public void process(Object row, int tag) throws HiveException {
    byte alias = (byte) tag;

    if (alias != this.posBigTable) {
      super.process(row, tag);
    } else {

      VectorizedRowBatch inBatch = (VectorizedRowBatch) row;

      if (null != bigTableFilterExpressions) {
        for(VectorExpression ve : bigTableFilterExpressions) {
          ve.evaluate(inBatch);
        }
      }

      if (null != bigTableValueExpressions) {
        for(VectorExpression ve : bigTableValueExpressions) {
          ve.evaluate(inBatch);
        }
      }

      for (VectorExpression ve : keyExpressions) {
        ve.evaluate(inBatch);
      }
      keyWrapperBatch.evaluateBatch(inBatch);
      keyValues = keyWrapperBatch.getVectorHashKeyWrappers();

      // This implementation of vectorized JOIN is delegating all the work
      // to the row-mode implementation by hijacking the big table node evaluators
      // and calling the row-mode join processOp for each row in the input batch.
      // Since the JOIN operator is not fully vectorized anyway at the moment
      // (due to the use of row-mode small-tables) this is a reasonable trade-off.
      //
      for(batchIndex=0; batchIndex < inBatch.size; ++batchIndex ) {
        super.process(row, tag);
      }

      // Set these two to invalid values so any attempt to use them
      // outside the inner loop results in NPE/OutOfBounds errors
      batchIndex = -1;
      keyValues = null;
    }
  }

  @Override
  public void closeOp(boolean aborted) throws HiveException {
    super.closeOp(aborted);
    if (!aborted && 0 < outputBatch.size) {
      flushOutput();
    }
  }

  @Override
  protected void internalForward(Object row, ObjectInspector outputOI) throws HiveException {
    Object[] values = (Object[]) row;
    VectorAssignRow va = outputVectorAssignRowMap.get(outputOI);
    if (va == null) {
      va = new VectorAssignRow();
      va.init((StructObjectInspector) outputOI, vOutContext.getProjectedColumns());
      outputVectorAssignRowMap.put(outputOI, va);
    }

    va.assignRow(outputBatch, outputBatch.size, values);

    ++outputBatch.size;
    if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) {
      flushOutput();
    }
  }

  private void flushOutput() throws HiveException {
    forward(outputBatch, null);
    outputBatch.reset();
  }

  @Override
  public VectorizationContext getOuputVectorizationContext() {
    return vOutContext;
  }
}