/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.physical.impl.join;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.drill.common.expression.FieldReference;
import org.apache.drill.common.logical.data.JoinCondition;
import org.apache.drill.common.logical.data.NamedExpression;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.DataMode;
import org.apache.drill.common.types.TypeProtos.MajorType;
import org.apache.drill.common.types.Types;
import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.compile.sig.GeneratorMapping;
import org.apache.drill.exec.compile.sig.MappingSet;
import org.apache.drill.exec.exception.ClassTransformationException;
import org.apache.drill.exec.exception.OutOfMemoryException;
import org.apache.drill.exec.exception.SchemaChangeException;
import org.apache.drill.exec.expr.ClassGenerator;
import org.apache.drill.exec.expr.CodeGenerator;
import org.apache.drill.exec.ops.FragmentContext;
import org.apache.drill.exec.ops.MetricDef;
import org.apache.drill.exec.physical.config.HashJoinPOP;
import org.apache.drill.exec.physical.impl.common.ChainedHashTable;
import org.apache.drill.exec.physical.impl.common.HashTable;
import org.apache.drill.exec.physical.impl.common.HashTableConfig;
import org.apache.drill.exec.physical.impl.common.HashTableStats;
import org.apache.drill.exec.physical.impl.common.IndexPointer;
import org.apache.drill.exec.physical.impl.common.Comparator;
import org.apache.drill.exec.physical.impl.sort.RecordBatchData;
import org.apache.drill.exec.record.AbstractRecordBatch;
import org.apache.drill.exec.record.BatchSchema;
import org.apache.drill.exec.record.BatchSchema.SelectionVectorMode;
import org.apache.drill.exec.record.ExpandableHyperContainer;
import org.apache.drill.exec.record.MaterializedField;
import org.apache.drill.exec.record.RecordBatch;
import org.apache.drill.exec.record.TypedFieldId;
import org.apache.drill.exec.record.VectorContainer;
import org.apache.drill.exec.record.VectorWrapper;
import org.apache.drill.exec.vector.ValueVector;
import org.apache.drill.exec.vector.complex.AbstractContainerVector;
import org.apache.calcite.rel.core.JoinRelType;
import com.sun.codemodel.JExpr;
import com.sun.codemodel.JExpression;
import com.sun.codemodel.JVar;
public class HashJoinBatch extends AbstractRecordBatch<HashJoinPOP> {
// Initial allocator reservation: 1 * 1024 * 1024 bytes (1 MiB).
// NOTE(review): not referenced in the visible part of this class; presumably read where the
// operator's allocator is created -- confirm against the caller.
public static final long ALLOCATOR_INITIAL_RESERVATION = 1 * 1024 * 1024;
// Maximum allocator reservation: 20 * 10^9 bytes (decimal units, not 20 GiB).
public static final long ALLOCATOR_MAX_RESERVATION = 20L * 1000 * 1000 * 1000;
// Probe side record batch
private final RecordBatch left;
// Build side record batch
private final RecordBatch right;
// Join type, INNER, LEFT, RIGHT or OUTER
private final JoinRelType joinType;
// Join conditions
private final List<JoinCondition> conditions;
// One comparator per join condition, in the same order as `conditions` (filled in by the constructor)
private final List<Comparator> comparators;
// Runtime generated class implementing HashJoinProbe interface
private HashJoinProbe hashJoinProbe = null;
/* Helper class
* Maintains linked list of build side records with the same key
* Keeps information about which build records have a corresponding
* matching key in the probe side (for outer, right joins)
*/
private HashJoinHelper hjHelper = null;
// Underlying hashtable used by the hash join
private HashTable hashTable = null;
/* Hyper container to store all build side record batches.
* Records are retrieved from this container when there is a matching record
* on the probe side
*/
private ExpandableHyperContainer hyperContainer;
// Number of records in the output container
private int outputRecords;
// Current batch index on the build side
private int buildBatchIndex = 0;
// Schema of the build side
private BatchSchema rightSchema = null;
// Generator mapping for the build side : scalar
private static final GeneratorMapping PROJECT_BUILD =
GeneratorMapping.create("doSetup"/* setup method */, "projectBuildRecord" /* eval method */, null /* reset */,
null /* cleanup */);
// Generator mapping for the build side : constant
// (both the setup and the eval slot are mapped to doSetup)
private static final GeneratorMapping PROJECT_BUILD_CONSTANT = GeneratorMapping.create("doSetup"/* setup method */,
"doSetup" /* eval method */,
null /* reset */, null /* cleanup */);
// Generator mapping for the probe side : scalar
private static final GeneratorMapping PROJECT_PROBE =
GeneratorMapping.create("doSetup" /* setup method */, "projectProbeRecord" /* eval method */, null /* reset */,
null /* cleanup */);
// Generator mapping for the probe side : constant
// (both the setup and the eval slot are mapped to doSetup)
private static final GeneratorMapping PROJECT_PROBE_CONSTANT = GeneratorMapping.create("doSetup" /* setup method */,
"doSetup" /* eval method */,
null /* reset */, null /* cleanup */);
// Mapping set for the build side: reads buildBatch[buildIndex], writes outgoing[outIndex]
private final MappingSet projectBuildMapping =
new MappingSet("buildIndex" /* read index */, "outIndex" /* write index */, "buildBatch" /* read container */,
"outgoing" /* write container */, PROJECT_BUILD_CONSTANT, PROJECT_BUILD);
// Mapping set for the probe side: reads probeBatch[probeIndex], writes outgoing[outIndex]
private final MappingSet projectProbeMapping = new MappingSet("probeIndex" /* read index */, "outIndex" /* write index */,
"probeBatch" /* read container */,
"outgoing" /* write container */,
PROJECT_PROBE_CONSTANT, PROJECT_PROBE);
// Starts out true. The original comment describes it as tracking whether an output batch has
// been returned. NOTE(review): never read or written in the visible part of this class --
// check for external users before relying on it.
boolean firstOutputBatch = true;
// Outcome of the most recent next() call on the probe (left) input
IterOutcome leftUpstream = IterOutcome.NONE;
// Outcome of the most recent next() call on the build (right) input
IterOutcome rightUpstream = IterOutcome.NONE;
// Scratch object that updateStats() fills from the hash table before publishing operator metrics
private final HashTableStats htStats = new HashTableStats();
/**
 * Operator metrics reported by the hash join. The values mirror the fields copied out of
 * {@link HashTableStats} in {@code updateStats}.
 *
 * NOTE(review): the original comment marks this as a duplicate of the hash aggregate's metric
 * set -- keep the two in sync if that is still the case.
 */
public enum Metric implements MetricDef {
  NUM_BUCKETS, NUM_ENTRIES, NUM_RESIZING, RESIZING_TIME;

  /** Returns the metric id, which is this constant's declaration position. */
  @Override
  public int metricId() {
    return this.ordinal();
  }
}
/**
 * Returns the number of records in the output container, i.e. the value most recently stored
 * in {@code outputRecords}.
 */
@Override
public int getRecordCount() {
  return this.outputRecords;
}
/**
 * Fetches the first batch from both inputs and prepares the output schema.
 *
 * If either input reports STOP or OUT_OF_MEMORY the corresponding batch state is recorded and
 * the method returns early. Otherwise it seeds the hyper container with a zero-record batch
 * carrying the build side's fields, sets up the hash table and the generated probe class, and
 * builds the outgoing container's schema.
 *
 * @throws SchemaChangeException if hash table or probe setup fails; IOException and
 *         ClassTransformationException are wrapped into this type
 */
@Override
protected void buildSchema() throws SchemaChangeException {
  // Probe side is pulled first, then the build side.
  leftUpstream = next(left);
  rightUpstream = next(right);

  final boolean eitherStopped =
      IterOutcome.STOP == leftUpstream || IterOutcome.STOP == rightUpstream;
  if (eitherStopped) {
    state = BatchState.STOP;
    return;
  }

  final boolean eitherOutOfMemory =
      IterOutcome.OUT_OF_MEMORY == leftUpstream || IterOutcome.OUT_OF_MEMORY == rightUpstream;
  if (eitherOutOfMemory) {
    state = BatchState.OUT_OF_MEMORY;
    return;
  }

  // Helper context for the build side bookkeeping.
  hjHelper = new HashJoinHelper(context, oContext.getAllocator());

  try {
    rightSchema = right.getSchema();

    // Zero-record container with the build side's fields; it becomes batch 0 of the hyper container.
    final VectorContainer emptyBuildBatch = new VectorContainer(oContext);
    for (final VectorWrapper<?> buildColumn : right) {
      emptyBuildBatch.addOrGet(buildColumn.getField());
    }
    emptyBuildBatch.buildSchema(SelectionVectorMode.NONE);
    emptyBuildBatch.setRecordCount(0);
    hyperContainer = new ExpandableHyperContainer(emptyBuildBatch);
    hjHelper.addNewBatch(0);
    buildBatchIndex++;

    setupHashTable();
    hashJoinProbe = setupHashJoinProbe();

    // Allocate the outgoing vectors, then publish the output schema and record count.
    allocateVectors();
    container.buildSchema(SelectionVectorMode.NONE);
    container.setRecordCount(outputRecords);
  } catch (IOException | ClassTransformationException e) {
    throw new SchemaChangeException(e);
  }
}
/**
 * Produces the next output batch of the join.
 *
 * On the first call (state == FIRST) the build phase is executed, the generated probe class
 * is initialised and the hash table stats are published. After that, each call probes and
 * projects into the output container, unless the hash table is empty and the join is INNER,
 * in which case the probe side is drained instead.
 *
 * @return OK when probeAndProject() produced records, or on the first call even if it produced
 *         none; NONE once there is nothing more to output (state becomes DONE); STOP if a
 *         ClassTransformationException, SchemaChangeException or IOException was caught, after
 *         failing the fragment context and killing both inputs
 */
@Override
public IterOutcome innerNext() {
try {
/* If we are here for the first time, execute the build phase of the
* hash join and setup the run time generated class for the probe side
*/
if (state == BatchState.FIRST) {
// Build the hash table, using the build side record batches.
executeBuildPhase();
// Hand the probe class everything it needs: the build-side hyper container, the current
// probe batch and its record count, the hash table, the helper and the join type.
hashJoinProbe.setupHashJoinProbe(context, hyperContainer, left, left.getRecordCount(), this, hashTable,
hjHelper, joinType);
// Update the hash table related stats for the operator
updateStats(this.hashTable);
}
// Probe only when output is possible: either the hash table has entries, or the join is
// not INNER (for non-inner joins the probe is run even against an empty table).
if (!hashTable.isEmpty() || joinType != JoinRelType.INNER) {
// Allocate the memory for the vectors in the output container
allocateVectors();
// Store the number of records projected
outputRecords = hashJoinProbe.probeAndProject();
/* We are here because of one the following
* 1. Completed processing of all the records and we are done
* 2. We've filled up the outgoing batch to the maximum and we need to return upstream
* Either case build the output container's schema and return
*/
// The first call always returns OK, even with zero records.
if (outputRecords > 0 || state == BatchState.FIRST) {
if (state == BatchState.FIRST) {
state = BatchState.NOT_FIRST;
}
for (final VectorWrapper<?> v : container) {
v.getValueVector().getMutator().setValueCount(outputRecords);
}
return IterOutcome.OK;
}
} else {
// Our build side is empty, we won't have any matches, clear the probe side
if (leftUpstream == IterOutcome.OK_NEW_SCHEMA || leftUpstream == IterOutcome.OK) {
for (final VectorWrapper<?> wrapper : left) {
wrapper.getValueVector().clear();
}
// Signal the probe side to stop, then keep pulling and clearing batches until it no
// longer returns OK / OK_NEW_SCHEMA.
left.kill(true);
leftUpstream = next(HashJoinHelper.LEFT_INPUT, left);
while (leftUpstream == IterOutcome.OK_NEW_SCHEMA || leftUpstream == IterOutcome.OK) {
for (final VectorWrapper<?> wrapper : left) {
wrapper.getValueVector().clear();
}
leftUpstream = next(HashJoinHelper.LEFT_INPUT, left);
}
}
}
// No more output records, clean up and return
state = BatchState.DONE;
return IterOutcome.NONE;
} catch (ClassTransformationException | SchemaChangeException | IOException e) {
// Report the failure on the fragment context and stop both inputs without propagating
// the kill upstream (sendUpstream == false).
context.fail(e);
killIncoming(false);
return IterOutcome.STOP;
}
}
/**
 * Builds the hash table configuration from the join conditions and creates the hash table.
 *
 * Every join condition contributes one build-side key expression (named {@code build_side_<i>})
 * and one probe-side key expression (named {@code probe_side_<i>}). When the last probe-side
 * outcome is neither OK nor OK_NEW_SCHEMA the probe key list is passed as {@code null}.
 *
 * @throws SchemaChangeException if the probe batch carries a selection vector
 * @throws IOException declared for the hash table creation call
 * @throws ClassTransformationException declared for the hash table creation call
 */
public void setupHashTable() throws IOException, SchemaChangeException, ClassTransformationException {
  // One named expression per join condition and per side.
  final int keyCount = conditions.size();
  final List<NamedExpression> buildKeys = new ArrayList<>(keyCount);
  final List<NamedExpression> probeKeys = new ArrayList<>(keyCount);
  int keyIndex = 0;
  for (final JoinCondition condition : conditions) {
    buildKeys.add(new NamedExpression(condition.getRight(), new FieldReference("build_side_" + keyIndex)));
    probeKeys.add(new NamedExpression(condition.getLeft(), new FieldReference("probe_side_" + keyIndex)));
    keyIndex++;
  }

  // Without a usable probe batch there is no probe schema to check and no probe keys to pass.
  final boolean probeBatchAvailable =
      leftUpstream == IterOutcome.OK_NEW_SCHEMA || leftUpstream == IterOutcome.OK;
  if (probeBatchAvailable) {
    final SelectionVectorMode probeMode = left.getSchema().getSelectionVectorMode();
    if (probeMode != SelectionVectorMode.NONE) {
      throw new SchemaChangeException("Hash join does not support probe batch with selection vectors. "
          + "Probe batch has selection mode = " + probeMode);
    }
  }

  final int minTableSize = (int) context.getOptions().getOption(ExecConstants.MIN_HASH_TABLE_SIZE);
  final HashTableConfig tableConfig = new HashTableConfig(minTableSize, HashTable.DEFAULT_LOAD_FACTOR,
      buildKeys, probeBatchAvailable ? probeKeys : null, comparators);

  // Create the chained hash table over the build (right) and probe (left) inputs.
  final ChainedHashTable tableFactory =
      new ChainedHashTable(tableConfig, context, oContext.getAllocator(), this.right, this.left, null);
  hashTable = tableFactory.createAndSetupHashTable(null);
}
/**
 * Consumes the build (right) input and loads it into the hash table.
 *
 * Each batch that arrives with OK or OK_NEW_SCHEMA is registered with the helper, every record
 * is inserted into the hash table, and the batch is then moved into the hyper container so
 * matching rows can be retrieved during the probe phase. The loop ends when the build side
 * returns NONE, NOT_YET, STOP or OUT_OF_MEMORY.
 *
 * @throws SchemaChangeException if the build batch carries a selection vector, or if the build
 *         side schema changes after it has been recorded
 * @throws ClassTransformationException propagated from hash table setup
 * @throws IOException propagated from hash table setup
 */
public void executeBuildPhase() throws SchemaChangeException, ClassTransformationException, IOException {
  // Skip the first batch if its count is zero, as it may be an empty schema batch.
  // NOTE(review): this fetch uses next(right) while the loop below uses
  // next(HashJoinHelper.RIGHT_INPUT, right) -- confirm the un-indexed overload is intended here.
  if (right.getRecordCount() == 0) {
    for (final VectorWrapper<?> w : right) {
      w.clear();
    }
    rightUpstream = next(right);
  }

  boolean moreData = true;
  while (moreData) {
    switch (rightUpstream) {
    case OUT_OF_MEMORY:
    case NONE:
    case NOT_YET:
    case STOP:
      // Nothing (more) to read from the build side: leave the loop without fetching again.
      moreData = false;
      continue;

    case OK_NEW_SCHEMA:
      if (rightSchema == null) {
        rightSchema = right.getSchema();
        if (rightSchema.getSelectionVectorMode() != BatchSchema.SelectionVectorMode.NONE) {
          // Report the BUILD side's selection mode -- the one that just failed the check.
          // (The previous code printed the probe side's mode here.)
          final String errorMsg = new StringBuilder()
              .append("Hash join does not support build batch with selection vectors. ")
              .append("Build batch has selection mode = ")
              .append(rightSchema.getSelectionVectorMode())
              .toString();
          throw new SchemaChangeException(errorMsg);
        }
        setupHashTable();
      } else {
        if (!rightSchema.equals(right.getSchema())) {
          throw SchemaChangeException.schemaChanged("Hash join does not support schema changes in build side.", rightSchema, right.getSchema());
        }
        hashTable.updateBatches();
      }
      // Fall through: a batch that arrives with a new schema also carries data to hash.

    case OK:
      final int currentRecordCount = right.getRecordCount();

      /* For every new build batch, we store some state in the helper context
       * Add new state to the helper context
       */
      hjHelper.addNewBatch(currentRecordCount);

      // Holder contains the global index where the key is hashed into using the hash table
      final IndexPointer htIndex = new IndexPointer();

      // For every record in the build batch, hash the key columns
      for (int i = 0; i < currentRecordCount; i++) {
        hashTable.put(i, htIndex, 1 /* retry count */);

        /* Use the global index returned by the hash table, to store
         * the current record index and batch index. This will be used
         * later when we probe and find a match.
         */
        hjHelper.setCurrentIndex(htIndex.value, buildBatchIndex, i);
      }

      /* Completed hashing all records in this batch. Transfer the batch
       * to the hyper vector container. Will be used when we want to retrieve
       * records that have matching keys on the probe side.
       */
      final RecordBatchData nextBatch = new RecordBatchData(right, oContext.getAllocator());
      boolean success = false;
      try {
        if (hyperContainer == null) {
          hyperContainer = new ExpandableHyperContainer(nextBatch.getContainer());
        } else {
          hyperContainer.addBatch(nextBatch.getContainer());
        }

        // completed processing a batch, increment batch index
        buildBatchIndex++;
        success = true;
      } finally {
        // If the hand-off to the hyper container failed, release the copied batch.
        if (!success) {
          nextBatch.clear();
        }
      }
      break;
    }

    // Get the next record batch
    rightUpstream = next(HashJoinHelper.RIGHT_INPUT, right);
  }
}
/**
 * Generates and instantiates the runtime class that projects build-side and probe-side
 * columns into the outgoing container.
 *
 * Build-side fields come first in the outgoing container, followed by probe-side fields.
 * A REQUIRED, non-map column is projected as OPTIONAL when the join type can leave that side
 * unmatched: LEFT/FULL for the build side, RIGHT/FULL for the probe side.
 *
 * @return the generated {@link HashJoinProbe} implementation
 * @throws ClassTransformationException declared for the code generation / compilation calls
 * @throws IOException declared for the code generation / compilation calls
 */
public HashJoinProbe setupHashJoinProbe() throws ClassTransformationException, IOException {
  final CodeGenerator<HashJoinProbe> codeGen =
      CodeGenerator.get(HashJoinProbe.TEMPLATE_DEFINITION, context.getFunctionRegistry(), context.getOptions());
  codeGen.plainJavaCapable(true);
  // Uncomment the next line to debug the generated code.
  // codeGen.saveCodeForDebugging(true);
  final ClassGenerator<HashJoinProbe> root = codeGen.getRoot();

  // ---- Build side projection ----
  root.setMappingSet(projectBuildMapping);
  final JExpression buildIndex = JExpr.direct("buildIndex");
  final JExpression outIndex = JExpr.direct("outIndex");
  root.rotateBlock();

  int buildFieldCount = 0;
  if (rightSchema != null) {
    // Left and full outer joins can emit rows with no build-side match.
    final boolean buildSideMayBeNull = joinType == JoinRelType.LEFT || joinType == JoinRelType.FULL;
    for (final MaterializedField buildField : rightSchema) {
      final MajorType buildType = buildField.getType();
      final MajorType projectedType = toJoinOutputType(buildType, buildSideMayBeNull);

      // Project the field together with its children so the children show up in the schema.
      container.addOrGet(buildField.withType(projectedType));

      final JVar source =
          root.declareVectorValueSetupAndMember("buildBatch", new TypedFieldId(buildType, true, buildFieldCount));
      final JVar target =
          root.declareVectorValueSetupAndMember("outgoing", new TypedFieldId(projectedType, false, buildFieldCount));

      // buildIndex appears to pack the batch number in the bits above the low 16 and the row
      // offset in the low 16 bits: the mask selects the row, the shift selects the batch vector.
      final JExpression rowInBatch = buildIndex.band(JExpr.lit((int) Character.MAX_VALUE));
      final JExpression batchVector = source.component(buildIndex.shrz(JExpr.lit(16)));
      root.getEvalBlock().add(target.invoke("copyFromSafe").arg(rowInBatch).arg(outIndex).arg(batchVector));

      root.rotateBlock();
      buildFieldCount++;
    }
  }

  // ---- Probe side projection ----
  root.setMappingSet(projectProbeMapping);
  // Probe columns are appended after the build columns in the outgoing container.
  int outgoingFieldId = buildFieldCount;
  final JExpression probeIndex = JExpr.direct("probeIndex");

  final boolean probeBatchAvailable =
      leftUpstream == IterOutcome.OK || leftUpstream == IterOutcome.OK_NEW_SCHEMA;
  if (probeBatchAvailable) {
    // Right and full outer joins can emit rows with no probe-side match.
    final boolean probeSideMayBeNull = joinType == JoinRelType.RIGHT || joinType == JoinRelType.FULL;
    int probeFieldId = 0;
    for (final VectorWrapper<?> probeColumn : left) {
      final MaterializedField probeField = probeColumn.getField();
      final MajorType probeType = probeField.getType();
      final MajorType projectedType = toJoinOutputType(probeType, probeSideMayBeNull);

      final ValueVector outgoingVector =
          container.addOrGet(MaterializedField.create(probeField.getPath(), projectedType));
      if (outgoingVector instanceof AbstractContainerVector) {
        probeColumn.getValueVector().makeTransferPair(outgoingVector);
        outgoingVector.clear();
      }

      final JVar source =
          root.declareVectorValueSetupAndMember("probeBatch", new TypedFieldId(probeType, false, probeFieldId));
      final JVar target =
          root.declareVectorValueSetupAndMember("outgoing", new TypedFieldId(projectedType, false, outgoingFieldId));
      root.getEvalBlock().add(target.invoke("copyFromSafe").arg(probeIndex).arg(outIndex).arg(source));

      root.rotateBlock();
      probeFieldId++;
      outgoingFieldId++;
    }
  }

  return context.getImplementationClass(codeGen);
}

/**
 * Picks the outgoing type for a joined column: a REQUIRED type is switched to OPTIONAL when
 * {@code sideMayBeNull} is set, except for MAP, which is excluded from the override
 * (see DRILL-2197, DRILL-2771). Any other type is returned unchanged.
 */
private static MajorType toJoinOutputType(final MajorType inputType, final boolean sideMayBeNull) {
  if (sideMayBeNull
      && inputType.getMode() == DataMode.REQUIRED
      && inputType.getMinorType() != TypeProtos.MinorType.MAP) {
    return Types.overrideMode(inputType, DataMode.OPTIONAL);
  }
  return inputType;
}
/** Calls {@code allocateNew()} on every value vector of the outgoing container. */
private void allocateVectors() {
  for (final VectorWrapper<?> outputColumn : container) {
    final ValueVector vector = outputColumn.getValueVector();
    vector.allocateNew();
  }
}
/**
 * Creates the hash join operator.
 *
 * @param popConfig physical operator config; supplies the join type and the join conditions
 * @param context fragment context passed to the base class
 * @param left probe side input
 * @param right build side input
 * @throws OutOfMemoryException declared for the base class constructor
 */
public HashJoinBatch(HashJoinPOP popConfig, FragmentContext context, RecordBatch left,
    RecordBatch right) throws OutOfMemoryException {
  super(popConfig, context, true);
  this.left = left;
  this.right = right;
  this.joinType = popConfig.getJoinType();
  this.conditions = popConfig.getConditions();

  // Resolve one comparator per join condition, in condition order.
  this.comparators = Lists.newArrayListWithExpectedSize(conditions.size());
  for (final JoinCondition condition : conditions) {
    comparators.add(JoinUtils.checkAndReturnSupportedJoinComparator(condition));
  }
}
/**
 * Copies the hash table's statistics into this operator's metrics.
 *
 * @param htable hash table to read from; when {@code null} nothing is recorded
 */
private void updateStats(HashTable htable) {
  if (htable != null) {
    // Refresh the reusable stats holder, then publish each value under its metric.
    htable.getStats(htStats);
    stats.setLongStat(Metric.NUM_BUCKETS, htStats.numBuckets);
    stats.setLongStat(Metric.NUM_ENTRIES, htStats.numEntries);
    stats.setLongStat(Metric.NUM_RESIZING, htStats.numResizing);
    stats.setLongStat(Metric.RESIZING_TIME, htStats.resizingTime);
  }
}
/**
 * Kills both inputs, probe (left) first and then build (right).
 *
 * @param sendUpstream forwarded unchanged to each input's {@code kill}
 */
@Override
public void killIncoming(boolean sendUpstream) {
  final RecordBatch[] inputs = {left, right};
  for (final RecordBatch input : inputs) {
    input.kill(sendUpstream);
  }
}
/**
 * Clears the operator's own state -- helper, hyper container, hash table, in that order --
 * and then delegates to the base class. Each of the three is skipped when it is still
 * {@code null}.
 */
@Override
public void close() {
  if (this.hjHelper != null) {
    this.hjHelper.clear();
  }

  // The hyper container may still be null if no data was ever received.
  if (this.hyperContainer != null) {
    this.hyperContainer.clear();
  }

  if (this.hashTable != null) {
    this.hashTable.clear();
  }

  super.close();
}
}