/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.ObjectPair;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.HashTableLoaderFactory;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.exec.persistence.BytesBytesMultiHashMap;
import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.HybridHashTableContainer.HashPartition;
import org.apache.hadoop.hive.ql.exec.persistence.KeyValueContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinBytesTableContainer.KeyValueHelper;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinKey;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectSerDeContext;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinRowContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer.ReusableGetAdaptor;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainerSerDe;
import org.apache.hadoop.hive.ql.exec.persistence.ObjectContainer;
import org.apache.hadoop.hive.ql.exec.persistence.UnwrapRowContainer;
import org.apache.hadoop.hive.ql.exec.spark.SparkUtilities;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.JoinCondDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hive.common.util.ReflectionUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.esotericsoftware.kryo.KryoException;
/**
* Map side Join operator implementation.
*/
public class MapJoinOperator extends AbstractMapJoinOperator<MapJoinDesc> implements Serializable {
private static final long serialVersionUID = 1L;
private static final Logger LOG = LoggerFactory.getLogger(MapJoinOperator.class.getName());
private static final String CLASS_NAME = MapJoinOperator.class.getName();
private transient final PerfLogger perfLogger = SessionState.getPerfLogger();
private transient String cacheKey;
private transient ObjectCache cache;
protected HashTableLoader loader;
protected transient MapJoinTableContainer[] mapJoinTables;
private transient MapJoinTableContainerSerDe[] mapJoinTableSerdes;
private transient boolean hashTblInitedOnce;
protected transient ReusableGetAdaptor[] hashMapRowGetters;
private UnwrapRowContainer[] unwrapContainer;
private transient Configuration hconf;
private transient boolean hybridMapJoinLeftover; // whether there's spilled data to be processed
protected transient MapJoinBytesTableContainer[] spilledMapJoinTables; // holds the restored spilled small tables
protected HybridHashTableContainer firstSmallTable; // the first small table; only this table holds the spilled big table rows
/** Kryo ctor. */
protected MapJoinOperator() {
super();
}
public MapJoinOperator(CompilationOpContext ctx) {
super(ctx);
}
public MapJoinOperator(AbstractMapJoinOperator<? extends MapJoinDesc> mjop) {
super(mjop);
}
/*
 * We need the base (Operator.java) implementation of start/endGroup.
 * The parent class adds functionality there that map join cannot use.
 * Note: the map join can be run in the reducer only on Tez.
 */
@Override
public void endGroup() throws HiveException {
defaultEndGroup();
}
@Override
public void startGroup() throws HiveException {
defaultStartGroup();
}
protected HashTableLoader getHashTableLoader(Configuration hconf) {
return HashTableLoaderFactory.getLoader(hconf);
}
public String getCacheKey() {
return cacheKey;
}
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
this.hconf = hconf;
unwrapContainer = new UnwrapRowContainer[conf.getTagLength()];
super.initializeOp(hconf);
int tagLen = conf.getTagLength();
// On Tez only: The hash map might already be cached in the container we run
// the task in. On MR: The cache is a no-op.
String queryId = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVEQUERYID);
cacheKey = "HASH_MAP_" + this.getOperatorId() + "_container";
cache = ObjectCacheFactory.getCache(hconf, queryId, false);
loader = getHashTableLoader(hconf);
hashMapRowGetters = null;
mapJoinTables = new MapJoinTableContainer[tagLen];
mapJoinTableSerdes = new MapJoinTableContainerSerDe[tagLen];
hashTblInitedOnce = false;
// Reset grace hashjoin context so that there is no state maintained when operator/work is
// retrieved from object cache
hybridMapJoinLeftover = false;
firstSmallTable = null;
generateMapMetaData();
final ExecMapperContext mapContext = getExecContext();
final MapredContext mrContext = MapredContext.get();
if (!conf.isBucketMapJoin() && !conf.isDynamicPartitionHashJoin()) {
/*
 * The issue with caching in the bucket map join case is that different tasks
 * process different buckets; if the container is reused to join a different bucket,
 * the join results can be incorrect. The cache is keyed on the operator id, and for
 * bucket map join the operator does not change while the data it needs does. A proper
 * fix requires changes in the Tez API to find the bucket id, plus the ability to
 * schedule tasks to reuse containers that have cached the specific bucket.
 */
if (isLogDebugEnabled) {
LOG.debug("Not a bucket map join or dynamic partition hash join, so the hash table can be cached");
}
Future<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>> future =
cache.retrieveAsync(
cacheKey,
new Callable<Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>>() {
@Override
public Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> call()
throws HiveException {
return loadHashTable(mapContext, mrContext);
}
});
asyncInitOperations.add(future);
} else if (!isInputFileChangeSensitive(mapContext)) {
loadHashTable(mapContext, mrContext);
hashTblInitedOnce = true;
}
}
@SuppressWarnings("unchecked")
@Override
protected void completeInitializationOp(Object[] os) throws HiveException {
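// os[0], when present, holds the result of the asynchronous loadHashTable call
// scheduled in initializeOp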
if (os.length != 0) {
Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair =
(Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>) os[0];
boolean spilled = false;
for (MapJoinTableContainer container : pair.getLeft()) {
if (container != null) {
spilled = spilled || container.hasSpill();
}
}
if (spilled) {
// we can't use the cached table because it has spilled.
loadHashTable(getExecContext(), MapredContext.get());
} else {
if (LOG.isDebugEnabled()) {
StringBuilder s = new StringBuilder("Using tables from cache: [");
for (MapJoinTableContainer c : pair.getLeft()) {
s.append((c == null) ? "null" : c.getClass().getSimpleName()).append(", ");
}
LOG.debug(s.append("]").toString());
}
// let's use the table from the cache.
mapJoinTables = pair.getLeft();
mapJoinTableSerdes = pair.getRight();
}
hashTblInitedOnce = true;
}
if (this.getExecContext() != null) {
// reset exec context so that initialization of the map operator happens
// properly
this.getExecContext().setLastInputPath(null);
this.getExecContext().setCurrentInputPath(null);
}
}
@Override
protected List<ObjectInspector> getValueObjectInspectors(
byte alias, List<ObjectInspector>[] aliasToObjectInspectors) {
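// valueIndex, when present, marks small-table value columns that are backed by
// big-table key columns, so their inspectors can be taken from the big table side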
int[] valueIndex = conf.getValueIndex(alias);
if (valueIndex == null) {
return super.getValueObjectInspectors(alias, aliasToObjectInspectors);
}
List<ObjectInspector> inspectors = aliasToObjectInspectors[alias];
int bigPos = conf.getPosBigTable();
Converter[] converters = new Converter[valueIndex.length];
List<ObjectInspector> valueOI = new ArrayList<ObjectInspector>();
for (int i = 0; i < valueIndex.length; i++) {
if (valueIndex[i] >= 0 && !joinKeysObjectInspectors[bigPos].isEmpty()) {
if (conf.getNoOuterJoin()) {
valueOI.add(joinKeysObjectInspectors[bigPos].get(valueIndex[i]));
} else {
// It is an outer join. We are going to add the inspector from the
// inner side, but the key value will come from the outer side, so
// we need to create a converter from inputOI to outputOI.
valueOI.add(inspectors.get(i));
converters[i] = ObjectInspectorConverters.getConverter(
joinKeysObjectInspectors[bigPos].get(valueIndex[i]), inspectors.get(i));
}
} else {
valueOI.add(inspectors.get(i));
}
}
unwrapContainer[alias] = new UnwrapRowContainer(alias, valueIndex, converters, hasFilter(alias));
return valueOI;
}
public void generateMapMetaData() throws HiveException {
// generate the meta data for key
// index for key is -1
try {
TableDesc keyTableDesc = conf.getKeyTblDesc();
AbstractSerDe keySerializer = (AbstractSerDe) ReflectionUtil.newInstance(
keyTableDesc.getDeserializerClass(), null);
SerDeUtils.initializeSerDe(keySerializer, null, keyTableDesc.getProperties(), null);
MapJoinObjectSerDeContext keyContext = new MapJoinObjectSerDeContext(keySerializer, false);
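// Create a value serde context for every small table position (all but the big table)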
for (int pos = 0; pos < order.length; pos++) {
if (pos == posBigTable) {
continue;
}
TableDesc valueTableDesc;
if (conf.getNoOuterJoin()) {
valueTableDesc = conf.getValueTblDescs().get(pos);
} else {
valueTableDesc = conf.getValueFilteredTblDescs().get(pos);
}
AbstractSerDe valueSerDe = (AbstractSerDe) ReflectionUtil.newInstance(
valueTableDesc.getDeserializerClass(), null);
SerDeUtils.initializeSerDe(valueSerDe, null, valueTableDesc.getProperties(), null);
MapJoinObjectSerDeContext valueContext =
new MapJoinObjectSerDeContext(valueSerDe, hasFilter(pos));
mapJoinTableSerdes[pos] = new MapJoinTableContainerSerDe(keyContext, valueContext);
}
} catch (SerDeException e) {
throw new HiveException(e);
}
}
protected Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> loadHashTable(
ExecMapperContext mapContext, MapredContext mrContext) throws HiveException {
if (canSkipReload(mapContext)) {
// no need to reload
return new ImmutablePair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>(
mapJoinTables, mapJoinTableSerdes);
}
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.LOAD_HASHTABLE);
loader.init(mapContext, mrContext, hconf, this);
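// Load all small tables into mapJoinTables; under hybrid grace hash join the
// loader may spill partitions to disk while loading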
try {
loader.load(mapJoinTables, mapJoinTableSerdes);
} catch (HiveException e) {
if (isLogInfoEnabled) {
LOG.info("Exception loading hash tables. Clearing partially loaded hash table containers.");
}
// there could be some spilled partitions which need to be cleaned up
clearAllTableContainers();
throw e;
}
hashTblInitedOnce = true;
Pair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]> pair =
new ImmutablePair<MapJoinTableContainer[], MapJoinTableContainerSerDe[]>(
mapJoinTables, mapJoinTableSerdes);
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.LOAD_HASHTABLE);
if (canSkipJoinProcessing(mapContext)) {
LOG.info("Skipping big table join processing for " + this.toString());
this.setDone(true);
}
return pair;
}
// Reload the hash table whenever the current input file changes (e.g. bucket map join)
@Override
public void cleanUpInputFileChangedOp() throws HiveException {
loadHashTable(getExecContext(), MapredContext.get());
}
protected JoinUtil.JoinResult setMapJoinKey(
ReusableGetAdaptor dest, Object row, byte alias) throws HiveException {
return dest.setFromRow(row, joinKeys[alias], joinKeysObjectInspectors[alias]);
}
protected MapJoinKey getRefKey(byte alias) {
// We assume that since we are joining on the same key, all tables would have either
// optimized or non-optimized key; hence, we can pass any key in any table as reference.
// We do it so that MJKB could determine whether it can use optimized keys.
for (byte pos = 0; pos < order.length; pos++) {
if (pos == alias) continue;
MapJoinKey refKey = mapJoinTables[pos].getAnyKey();
if (refKey != null) return refKey;
}
return null; // All join tables have 0 keys, doesn't matter what we generate.
}
@Override
public void process(Object row, int tag) throws HiveException {
try {
alias = (byte) tag;
if (hashMapRowGetters == null) {
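// First row processed: lazily create one getter per small table, using refKey to
// determine whether optimized keys can be used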
hashMapRowGetters = new ReusableGetAdaptor[mapJoinTables.length];
MapJoinKey refKey = getRefKey(alias);
for (byte pos = 0; pos < order.length; pos++) {
if (pos != alias) {
hashMapRowGetters[pos] = mapJoinTables[pos].createGetter(refKey);
}
}
}
// We are calling process() again for leftover rows, so this "row" comes from the
// spilled match file. Recreate hashMapRowGetters against the restored hash tables.
if (hybridMapJoinLeftover) {
MapJoinKey refKey = getRefKey(alias);
for (byte pos = 0; pos < order.length; pos++) {
if (pos != alias && spilledMapJoinTables[pos] != null) {
hashMapRowGetters[pos] = spilledMapJoinTables[pos].createGetter(refKey);
}
}
}
// compute keys and values as StandardObjects
ReusableGetAdaptor firstSetKey = null;
int fieldCount = joinKeys[alias].size();
boolean joinNeeded = false;
boolean bigTableRowSpilled = false;
for (byte pos = 0; pos < order.length; pos++) {
if (pos != alias) {
JoinUtil.JoinResult joinResult;
ReusableGetAdaptor adaptor;
if (firstSetKey == null) {
adaptor = firstSetKey = hashMapRowGetters[pos];
joinResult = setMapJoinKey(firstSetKey, row, alias);
} else {
// Keys for all tables are the same, so only the first has to deserialize them.
adaptor = hashMapRowGetters[pos];
joinResult = adaptor.setFromOther(firstSetKey);
}
MapJoinRowContainer rowContainer = adaptor.getCurrentRows();
if (joinResult != JoinUtil.JoinResult.MATCH) {
assert (rowContainer == null || !rowContainer.hasRows()) :
"Expecting an empty result set for no match";
}
if (rowContainer != null && unwrapContainer[pos] != null) {
Object[] currentKey = firstSetKey.getCurrentKey();
rowContainer = unwrapContainer[pos].setInternal(rowContainer, currentKey);
}
// there is no join value, or the join key has all-null elements
if (rowContainer == null || firstSetKey.hasAnyNulls(fieldCount, nullsafes)) {
if (!noOuterJoin) {
// For Hybrid Grace Hash Join, during the 1st round processing,
// we only keep the LEFT side if the row is not spilled
if (!conf.isHybridHashJoin() || hybridMapJoinLeftover ||
(joinResult != JoinUtil.JoinResult.SPILL && !bigTableRowSpilled)) {
joinNeeded = true;
storage[pos] = dummyObjVectors[pos];
} else {
joinNeeded = false;
}
} else {
storage[pos] = emptyList;
}
} else {
joinNeeded = true;
storage[pos] = rowContainer.copy();
aliasFilterTags[pos] = rowContainer.getAliasFilter();
}
// Spill the big table rows into appropriate partition:
// When the JoinResult is SPILL, it means the corresponding small table row may have been
// spilled to disk (at least the partition that holds this row is on disk). So we need to
// postpone the join processing for this pair by also spilling this big table row.
if (joinResult == JoinUtil.JoinResult.SPILL &&
!bigTableRowSpilled) { // For n-way join, only spill big table rows once
spillBigTableRow(mapJoinTables[pos], row);
bigTableRowSpilled = true;
}
}
}
if (joinNeeded) {
List<Object> value = getFilteredValue(alias, row);
// Add the value to the ArrayList
storage[alias].addRow(value);
// generate the output records
checkAndGenObject();
}
// done with the row
storage[tag].clearRows();
for (byte pos = 0; pos < order.length; pos++) {
if (pos != tag) {
storage[pos] = null;
}
}
} catch (Exception e) {
String msg = "Unexpected exception from "
+ this.getClass().getSimpleName() + " : " + e.getMessage();
LOG.error(msg, e);
throw new HiveException(msg, e);
}
}
/**
 * Postpone processing of the big table row temporarily by spilling it to a row container
 * @param hybridHtContainer Hybrid hash table container
 * @param row big table row
 * @throws HiveException
 */
protected void spillBigTableRow(MapJoinTableContainer hybridHtContainer, Object row) throws HiveException {
HybridHashTableContainer ht = (HybridHashTableContainer) hybridHtContainer;
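// Spilling only happens with hybrid grace hash join, so the container is known
// to be a HybridHashTableContainer here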
int partitionId = ht.getToSpillPartitionId();
HashPartition hp = ht.getHashPartitions()[partitionId];
ObjectContainer bigTable = hp.getMatchfileObjContainer();
bigTable.add(row);
}
@Override
public void closeOp(boolean abort) throws HiveException {
boolean spilled = false;
for (MapJoinTableContainer container : mapJoinTables) {
if (container != null) {
spilled = spilled || container.hasSpill();
container.dumpMetrics();
}
}
// For Hybrid Grace Hash Join, we need to see if there is any spilled data to be processed next
if (spilled) {
if (!abort) {
if (hashMapRowGetters == null) {
hashMapRowGetters = new ReusableGetAdaptor[mapJoinTables.length];
}
int numPartitions = 0;
// Find out number of partitions for each small table (should be same across tables)
for (byte pos = 0; pos < mapJoinTables.length; pos++) {
if (pos != conf.getPosBigTable()) {
firstSmallTable = (HybridHashTableContainer) mapJoinTables[pos];
numPartitions = firstSmallTable.getHashPartitions().length;
break;
}
}
assert numPartitions != 0 : "Number of partitions must be greater than 0!";
if (firstSmallTable.hasSpill()) {
spilledMapJoinTables = new MapJoinBytesTableContainer[mapJoinTables.length];
hybridMapJoinLeftover = true;
// Clear all in-memory partitions first
for (byte pos = 0; pos < mapJoinTables.length; pos++) {
MapJoinTableContainer tableContainer = mapJoinTables[pos];
if (tableContainer instanceof HybridHashTableContainer) {
HybridHashTableContainer hybridHtContainer = (HybridHashTableContainer) tableContainer;
hybridHtContainer.dumpStats();
HashPartition[] hashPartitions = hybridHtContainer.getHashPartitions();
// For each partition still in memory, release its hash map and decrement the row count
for (int i = 0; i < hashPartitions.length; i++) {
if (!hashPartitions[i].isHashMapOnDisk()) {
hybridHtContainer.setTotalInMemRowCount(
hybridHtContainer.getTotalInMemRowCount() -
hashPartitions[i].getHashMapFromMemory().getNumValues());
hashPartitions[i].getHashMapFromMemory().clear();
}
}
assert hybridHtContainer.getTotalInMemRowCount() == 0;
}
}
// Reprocess the spilled data
for (int i = 0; i < numPartitions; i++) {
HashPartition[] hashPartitions = firstSmallTable.getHashPartitions();
if (hashPartitions[i].isHashMapOnDisk()) {
try {
continueProcess(i); // Re-process spilled data
} catch (KryoException ke) {
LOG.error("Processing the spilled data failed due to Kryo error!");
LOG.error("Cleaning up all spilled data!");
cleanupGraceHashJoin();
throw new HiveException(ke);
} catch (Exception e) {
throw new HiveException(e);
}
for (byte pos = 0; pos < order.length; pos++) {
if (pos != conf.getPosBigTable()) {
spilledMapJoinTables[pos] = null;
}
}
}
}
}
}
if (isLogInfoEnabled) {
LOG.info("spilled: " + spilled + " abort: " + abort + ". Clearing spilled partitions.");
}
// spilled tables are loaded always (no sharing), so clear it
clearAllTableContainers();
cache.remove(cacheKey);
}
// In the MapReduce case we must always clear up, since MapReduce has no object registry.
if ((this.getExecContext() != null) && (this.getExecContext().getLocalWork() != null)
&& (this.getExecContext().getLocalWork().getInputFileChangeSensitive())
&& !(HiveConf.getVar(hconf, ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")
&& SparkUtilities.isDedicatedCluster(hconf))) {
if (isLogInfoEnabled) {
LOG.info("MR: Clearing all map join table containers.");
}
clearAllTableContainers();
}
this.loader = null;
super.closeOp(abort);
}
private void clearAllTableContainers() {
if (mapJoinTables != null) {
for (MapJoinTableContainer tableContainer : mapJoinTables) {
if (tableContainer != null) {
tableContainer.clear();
}
}
}
}
/**
 * Continue processing the join between the spilled hash table(s) and the spilled big table
 * @param partitionId the partition number (the same across all small tables) to process
 * @throws HiveException
 * @throws IOException
 * @throws SerDeException
 * @throws ClassNotFoundException
 */
private void continueProcess(int partitionId)
throws HiveException, IOException, SerDeException, ClassNotFoundException {
for (byte pos = 0; pos < mapJoinTables.length; pos++) {
if (pos != conf.getPosBigTable()) {
LOG.info("Going to reload hash partition " + partitionId);
reloadHashTable(pos, partitionId);
}
}
reProcessBigTable(partitionId);
}
/**
 * Reload the hash table from a hash partition. This can involve two steps:
 * 1) Deserialize the serialized on-disk hash table, and
 * 2) Merge every key/value pair from the sidefile (small table container) into that hash table
 * @param pos position of the small table
 * @param partitionId the partition of the small table to reload from
 * @throws IOException
 * @throws HiveException
 * @throws SerDeException
 * @throws ClassNotFoundException
 */
protected void reloadHashTable(byte pos, int partitionId)
throws IOException, HiveException, SerDeException, ClassNotFoundException {
HybridHashTableContainer container = (HybridHashTableContainer)mapJoinTables[pos];
HashPartition partition = container.getHashPartitions()[partitionId];
// Restore the small-table rows that were spilled to the sidefile; they are merged
// into the restored hash table below, which is where spilling may happen again
LOG.info("Going to restore sidefile...");
KeyValueContainer kvContainer = partition.getSidefileKVContainer();
int rowCount = kvContainer.size();
LOG.info("Hybrid Grace Hash Join: Number of rows restored from KeyValueContainer: " +
kvContainer.size());
// Deserialize the on-disk hash table; this part is guaranteed to fit within the memory limit.
// rowCount is used later as the initialCapacity of a BytesBytesMultiHashMap, which cannot
// be 0, so provide a reasonable positive number when the sidefile is empty.
if (rowCount <= 0) {
rowCount = 1024 * 1024;
}
LOG.info("Going to restore hashmap...");
BytesBytesMultiHashMap restoredHashMap = partition.getHashMapFromDisk(rowCount);
rowCount += restoredHashMap.getNumValues();
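// rowCount now covers both the sidefile rows and the rows already in the restored
// hash map; it is used below for the memory threshold estimate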
LOG.info("Hybrid Grace Hash Join: Deserializing spilled hash partition...");
LOG.info("Hybrid Grace Hash Join: Number of rows in hashmap: " + rowCount);
// The entire restored hash map is kept in memory. Warn if its estimated size exceeds
// half of the memory limit, since recursive spilling is not supported.
if (rowCount * container.getTableRowSize() >= container.getMemoryThreshold() / 2) {
LOG.warn("Hybrid Grace Hash Join: Hash table cannot be reloaded since it" +
" will be greater than memory limit. Recursive spilling is currently not supported");
}
KeyValueHelper writeHelper = container.getWriteHelper();
// Merge every key/value pair from the sidefile into the restored hash table
while (kvContainer.hasNext()) {
ObjectPair<HiveKey, BytesWritable> pair = kvContainer.next();
Writable key = pair.getFirst();
Writable val = pair.getSecond();
writeHelper.setKeyValue(key, val);
restoredHashMap.put(writeHelper, -1);
}
container.setTotalInMemRowCount(container.getTotalInMemRowCount()
+ restoredHashMap.getNumValues());
kvContainer.clear();
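// Wrap the restored hash map in a MapJoinBytesTableContainer and carry over the
// serde-related settings from the hybrid container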
spilledMapJoinTables[pos] = new MapJoinBytesTableContainer(restoredHashMap);
spilledMapJoinTables[pos].setInternalValueOi(container.getInternalValueOi());
spilledMapJoinTables[pos].setSortableSortOrders(container.getSortableSortOrders());
spilledMapJoinTables[pos].setNullMarkers(container.getNullMarkers());
spilledMapJoinTables[pos].setNotNullMarkers(container.getNotNullMarkers());
}
/**
* Iterate over the big table row container and feed process() with leftover rows
* @param partitionId the partition from which to take out spilled big table rows
* @throws HiveException
*/
protected void reProcessBigTable(int partitionId) throws HiveException {
// For a binary join, firstSmallTable is the only small table and holds the references
// to the spilled big table rows. For an n-way join, big table rows are spilled only once
// (while processing the first small table), so again only firstSmallTable references them.
HashPartition partition = firstSmallTable.getHashPartitions()[partitionId];
ObjectContainer bigTable = partition.getMatchfileObjContainer();
LOG.info("Hybrid Grace Hash Join: Going to process spilled big table rows in partition " +
partitionId + ". Number of rows: " + bigTable.size());
while (bigTable.hasNext()) {
Object row = bigTable.next();
process(row, conf.getPosBigTable());
}
bigTable.clear();
}
/**
 * Clean up all data participating in the join: in-memory and on-disk structures
 * for both the small table(s) and the big table
 */
private void cleanupGraceHashJoin() {
for (byte pos = 0; pos < mapJoinTables.length; pos++) {
if (pos != conf.getPosBigTable()) {
LOG.info("Cleaning up small table data at pos: " + pos);
HybridHashTableContainer container = (HybridHashTableContainer) mapJoinTables[pos];
container.clear();
}
}
}
/**
* Implements the getName function for the Node Interface.
*
* @return the name of the operator
*/
@Override
public String getName() {
return getOperatorName();
}
public static String getOperatorName() {
return "MAPJOIN";
}
@Override
public OperatorType getType() {
return OperatorType.MAPJOIN;
}
protected boolean isInputFileChangeSensitive(ExecMapperContext mapContext) {
return mapContext != null
&& mapContext.getLocalWork() != null
&& mapContext.getLocalWork().getInputFileChangeSensitive();
}
protected boolean canSkipReload(ExecMapperContext mapContext) {
return (this.hashTblInitedOnce && !isInputFileChangeSensitive(mapContext));
}
// If a loaded hash table is empty, under certain conditions we can skip processing the big table rows.
protected boolean canSkipJoinProcessing(ExecMapperContext mapContext) {
if (!canSkipReload(mapContext)) {
return false;
}
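// Skipping is only safe for inner joins: outer joins must still emit big table
// rows even when a small table is empty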
JoinCondDesc[] joinConds = getConf().getConds();
if (joinConds.length > 0) {
for (JoinCondDesc joinCond : joinConds) {
if (joinCond.getType() != JoinDesc.INNER_JOIN) {
return false;
}
}
} else {
return false;
}
boolean skipJoinProcessing = false;
for (int idx = 0; idx < mapJoinTables.length; ++idx) {
if (idx == getConf().getPosBigTable()) {
continue;
}
MapJoinTableContainer mapJoinTable = mapJoinTables[idx];
if (mapJoinTable.size() == 0) {
// If any table is empty, an inner join involving the tables should yield 0 rows.
LOG.info("Hash table number " + idx + " is empty");
skipJoinProcessing = true;
break;
}
}
return skipJoinProcessing;
}
}