/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.exec.vector.mapjoin; import java.io.IOException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.CompilationOpContext; import org.apache.hadoop.hive.ql.exec.JoinUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.plan.OperatorDesc; /** * This class has methods for generating vectorized join results for left semi joins. * * The big difference between inner joins and left semi joins is existence testing. * * Inner joins use a hash map to lookup the 1 or more small table values. * * Left semi joins are a specialized join for outputting big table rows whose key exists * in the small table. * * No small table values are needed for left semi join since they would be empty. So, * we use a hash set as the hash table. Hash sets just report whether a key exists. This * is a big performance optimization. */ public abstract class VectorMapJoinLeftSemiGenerateResultOperator extends VectorMapJoinGenerateResultOperator { private static final long serialVersionUID = 1L; private static final Logger LOG = LoggerFactory.getLogger(VectorMapJoinLeftSemiGenerateResultOperator.class.getName()); //--------------------------------------------------------------------------- // Semi join specific members. // // An array of hash set results so we can do lookups on the whole batch before output result // generation. protected transient VectorMapJoinHashSetResult hashSetResults[]; // Pre-allocated member for storing the (physical) batch index of matching row (single- or // multi-small-table-valued) indexes during a process call. protected transient int[] allMatchs; // Pre-allocated member for storing the (physical) batch index of rows that need to be spilled. protected transient int[] spills; // Pre-allocated member for storing index into the hashSetResults for each spilled row. protected transient int[] spillHashMapResultIndices; /** Kryo ctor. */ protected VectorMapJoinLeftSemiGenerateResultOperator() { super(); } public VectorMapJoinLeftSemiGenerateResultOperator(CompilationOpContext ctx) { super(ctx); } public VectorMapJoinLeftSemiGenerateResultOperator(CompilationOpContext ctx, VectorizationContext vContext, OperatorDesc conf) throws HiveException { super(ctx, vContext, conf); } /* * Setup our left semi join specific members. */ protected void commonSetup(VectorizedRowBatch batch) throws HiveException { super.commonSetup(batch); // Semi join specific. VectorMapJoinHashSet baseHashSet = (VectorMapJoinHashSet) vectorMapJoinHashTable; hashSetResults = new VectorMapJoinHashSetResult[batch.DEFAULT_SIZE]; for (int i = 0; i < hashSetResults.length; i++) { hashSetResults[i] = baseHashSet.createHashSetResult(); } allMatchs = new int[batch.DEFAULT_SIZE]; spills = new int[batch.DEFAULT_SIZE]; spillHashMapResultIndices = new int[batch.DEFAULT_SIZE]; } //----------------------------------------------------------------------------------------------- /* * Left semi join (hash set). */ /** * Generate the left semi join output results for one vectorized row batch. * * @param batch * The big table batch with any matching and any non matching rows both as * selected in use. * @param allMatchCount * Number of matches in allMatchs. * @param spillCount * Number of spills in spills. * @param hashTableResults * The array of all hash table results for the batch. We need the * VectorMapJoinHashTableResult for the spill information. */ protected void finishLeftSemi(VectorizedRowBatch batch, int allMatchCount, int spillCount, VectorMapJoinHashTableResult[] hashTableResults) throws HiveException, IOException { // Get rid of spills before we start modifying the batch. if (spillCount > 0) { spillHashMapBatch(batch, hashTableResults, spills, spillHashMapResultIndices, spillCount); } /* * Optimize by running value expressions only over the matched rows. */ if (allMatchCount > 0 && bigTableValueExpressions != null) { performValueExpressions(batch, allMatchs, allMatchCount); } int numSel = generateHashSetResults(batch, allMatchs, allMatchCount); batch.size = numSel; batch.selectedInUse = true; } /** * Generate the matching left semi join output results of a vectorized row batch. * * @param batch * The big table batch. * @param allMatchs * A subset of the rows of the batch that are matches. * @param allMatchCount * Number of matches in allMatchs. */ private int generateHashSetResults(VectorizedRowBatch batch, int[] allMatchs, int allMatchCount) throws HiveException, IOException { int numSel = 0; // Generate result within big table batch itself. for (int i = 0; i < allMatchCount; i++) { int batchIndex = allMatchs[i]; // Use the big table row as output. batch.selected[numSel++] = batchIndex; } return numSel; } /** * Generate the left semi join output results for one vectorized row batch with a repeated key. * * @param batch * The big table batch whose repeated key matches. */ protected int generateHashSetResultRepeatedAll(VectorizedRowBatch batch) throws HiveException { if (batch.selectedInUse) { // The selected array is already filled in as we want it. } else { int[] selected = batch.selected; for (int i = 0; i < batch.size; i++) { selected[i] = i; } batch.selectedInUse = true; } return batch.size; } protected void finishLeftSemiRepeated(VectorizedRowBatch batch, JoinUtil.JoinResult joinResult, VectorMapJoinHashTableResult hashSetResult) throws HiveException, IOException { switch (joinResult) { case MATCH: if (bigTableValueExpressions != null) { // Run our value expressions over whole batch. for(VectorExpression ve: bigTableValueExpressions) { ve.evaluate(batch); } } // Generate special repeated case. int numSel = generateHashSetResultRepeatedAll(batch); batch.size = numSel; batch.selectedInUse = true; break; case SPILL: // Whole batch is spilled. spillBatchRepeated(batch, (VectorMapJoinHashTableResult) hashSetResult); batch.size = 0; break; case NOMATCH: // No match for entire batch. batch.size = 0; break; } } }