/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.exec.vector; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator; import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.SMBJoinDesc; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption; import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.annotations.VisibleForTesting; /** * VectorSMBJoinOperator. * Implements the vectorized SMB join operator. The implementation relies on the row-mode SMB join operator. * It accepts a vectorized batch input from the big table and iterates over the batch, calling the parent row-mode * implementation for each row in the batch. */ public class VectorSMBMapJoinOperator extends SMBMapJoinOperator implements VectorizationContextRegion { private static final Logger LOG = LoggerFactory.getLogger( VectorSMBMapJoinOperator.class.getName()); private static final long serialVersionUID = 1L; private VectorExpression[] bigTableValueExpressions; private VectorExpression[] bigTableFilterExpressions; private VectorExpression[] keyExpressions; private VectorExpressionWriter[] keyOutputWriters; private VectorizationContext vOutContext; // The above members are initialized by the constructor and must not be // transient. //--------------------------------------------------------------------------- private transient VectorizedRowBatch outputBatch; private transient VectorizedRowBatchCtx vrbCtx = null; private transient VectorHashKeyWrapperBatch keyWrapperBatch; private transient Map<ObjectInspector, VectorAssignRow> outputVectorAssignRowMap; private transient int batchIndex = -1; private transient VectorHashKeyWrapper[] keyValues; private transient SMBJoinKeyEvaluator keyEvaluator; private transient VectorExpressionWriter[] valueWriters; private interface SMBJoinKeyEvaluator { List<Object> evaluate(VectorHashKeyWrapper kw) throws HiveException; } /** Kryo ctor. */ @VisibleForTesting public VectorSMBMapJoinOperator() { super(); } public VectorSMBMapJoinOperator(CompilationOpContext ctx) { super(ctx); } public VectorSMBMapJoinOperator(CompilationOpContext ctx, VectorizationContext vContext, OperatorDesc conf) throws HiveException { this(ctx); SMBJoinDesc desc = (SMBJoinDesc) conf; this.conf = desc; order = desc.getTagOrder(); numAliases = desc.getExprs().size(); posBigTable = (byte) desc.getPosBigTable(); filterMaps = desc.getFilterMap(); noOuterJoin = desc.isNoOuterJoin(); // Must obtain vectorized equivalents for filter and value expressions Map<Byte, List<ExprNodeDesc>> filterExpressions = desc.getFilters(); bigTableFilterExpressions = vContext.getVectorExpressions(filterExpressions.get(posBigTable), VectorExpressionDescriptor.Mode.FILTER); List<ExprNodeDesc> keyDesc = desc.getKeys().get(posBigTable); keyExpressions = vContext.getVectorExpressions(keyDesc); keyOutputWriters = VectorExpressionWriterFactory.getExpressionWriters(keyDesc); Map<Byte, List<ExprNodeDesc>> exprs = desc.getExprs(); bigTableValueExpressions = vContext.getVectorExpressions(exprs.get(posBigTable)); // We are making a new output vectorized row batch. vOutContext = new VectorizationContext(getName(), desc.getOutputColumnNames(), /* vContextEnvironment */ vContext); } @Override protected List<Object> smbJoinComputeKeys(Object row, byte alias) throws HiveException { if (alias == this.posBigTable) { // The keyEvaluate reuses storage. That doesn't work with SMB MapJoin because it // holds references to keys as it is merging. List<Object> singletonListAndObjects = keyEvaluator.evaluate(keyValues[batchIndex]); ArrayList<Object> result = new ArrayList<Object>(singletonListAndObjects.size()); for (int i = 0; i < singletonListAndObjects.size(); i++) { result.add(ObjectInspectorUtils.copyToStandardObject(singletonListAndObjects.get(i), joinKeysObjectInspectors[alias].get(i), ObjectInspectorCopyOption.WRITABLE)); } return result; } else { return super.smbJoinComputeKeys(row, alias); } } @Override protected void initializeOp(Configuration hconf) throws HiveException { super.initializeOp(hconf); vrbCtx = new VectorizedRowBatchCtx(); vrbCtx.init((StructObjectInspector) this.outputObjInspector, vOutContext.getScratchColumnTypeNames()); outputBatch = vrbCtx.createVectorizedRowBatch(); keyWrapperBatch = VectorHashKeyWrapperBatch.compileKeyWrapperBatch(keyExpressions); outputVectorAssignRowMap = new HashMap<ObjectInspector, VectorAssignRow>(); // This key evaluator translates from the vectorized VectorHashKeyWrapper format // into the row-mode MapJoinKey keyEvaluator = new SMBJoinKeyEvaluator() { private List<Object> key; public SMBJoinKeyEvaluator init() { key = new ArrayList<Object>(); for(int i = 0; i < keyExpressions.length; ++i) { key.add(null); } return this; } @Override public List<Object> evaluate(VectorHashKeyWrapper kw) throws HiveException { for(int i = 0; i < keyExpressions.length; ++i) { key.set(i, keyWrapperBatch.getWritableKeyValue(kw, i, keyOutputWriters[i])); } return key; }; }.init(); Map<Byte, List<ExprNodeDesc>> valueExpressions = conf.getExprs(); List<ExprNodeDesc> bigTableExpressions = valueExpressions.get(posBigTable); // We're hijacking the big table evaluators and replacing them with our own custom ones // which are going to return values from the input batch vector expressions List<ExprNodeEvaluator> vectorNodeEvaluators = new ArrayList<ExprNodeEvaluator>(bigTableExpressions.size()); VectorExpressionWriterFactory.processVectorExpressions( bigTableExpressions, new VectorExpressionWriterFactory.ListOIDClosure() { @Override public void assign(VectorExpressionWriter[] writers, List<ObjectInspector> oids) { valueWriters = writers; joinValuesObjectInspectors[posBigTable] = oids; } }); for(int i=0; i<bigTableExpressions.size(); ++i) { ExprNodeDesc desc = bigTableExpressions.get(i); VectorExpression vectorExpr = bigTableValueExpressions[i]; // This is a vectorized aware evaluator ExprNodeEvaluator eval = new ExprNodeEvaluator<ExprNodeDesc>(desc, hconf) { int columnIndex;; int writerIndex; public ExprNodeEvaluator initVectorExpr(int columnIndex, int writerIndex) { this.columnIndex = columnIndex; this.writerIndex = writerIndex; return this; } @Override public ObjectInspector initialize(ObjectInspector rowInspector) throws HiveException { throw new HiveException("should never reach here"); } @Override protected Object _evaluate(Object row, int version) throws HiveException { VectorizedRowBatch inBatch = (VectorizedRowBatch) row; int rowIndex = inBatch.selectedInUse ? inBatch.selected[batchIndex] : batchIndex; return valueWriters[writerIndex].writeValue(inBatch.cols[columnIndex], rowIndex); } }.initVectorExpr(vectorExpr.getOutputColumn(), i); vectorNodeEvaluators.add(eval); } // Now replace the old evaluators with our own joinValues[posBigTable] = vectorNodeEvaluators; } @Override public void process(Object row, int tag) throws HiveException { byte alias = (byte) tag; if (alias != this.posBigTable) { super.process(row, tag); } else { VectorizedRowBatch inBatch = (VectorizedRowBatch) row; if (null != bigTableFilterExpressions) { for(VectorExpression ve : bigTableFilterExpressions) { ve.evaluate(inBatch); } } if (null != bigTableValueExpressions) { for(VectorExpression ve : bigTableValueExpressions) { ve.evaluate(inBatch); } } for (VectorExpression ve : keyExpressions) { ve.evaluate(inBatch); } keyWrapperBatch.evaluateBatch(inBatch); keyValues = keyWrapperBatch.getVectorHashKeyWrappers(); // This implementation of vectorized JOIN is delegating all the work // to the row-mode implementation by hijacking the big table node evaluators // and calling the row-mode join processOp for each row in the input batch. // Since the JOIN operator is not fully vectorized anyway at the moment // (due to the use of row-mode small-tables) this is a reasonable trade-off. // for(batchIndex=0; batchIndex < inBatch.size; ++batchIndex ) { super.process(row, tag); } // Set these two to invalid values so any attempt to use them // outside the inner loop results in NPE/OutOfBounds errors batchIndex = -1; keyValues = null; } } @Override public void closeOp(boolean aborted) throws HiveException { super.closeOp(aborted); if (!aborted && 0 < outputBatch.size) { flushOutput(); } } @Override protected void internalForward(Object row, ObjectInspector outputOI) throws HiveException { Object[] values = (Object[]) row; VectorAssignRow va = outputVectorAssignRowMap.get(outputOI); if (va == null) { va = new VectorAssignRow(); va.init((StructObjectInspector) outputOI, vOutContext.getProjectedColumns()); outputVectorAssignRowMap.put(outputOI, va); } va.assignRow(outputBatch, outputBatch.size, values); ++outputBatch.size; if (outputBatch.size == VectorizedRowBatch.DEFAULT_SIZE) { flushOutput(); } } private void flushOutput() throws HiveException { forward(outputBatch, null); outputBatch.reset(); } @Override public VectorizationContext getOuputVectorizationContext() { return vOutContext; } }