/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.File;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.persistence.AbstractMapJoinKey;
import org.apache.hadoop.hive.ql.exec.persistence.HashMapWrapper;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinObjectValue;
import org.apache.hadoop.hive.ql.exec.persistence.MapJoinRowContainer;
import org.apache.hadoop.hive.ql.exec.persistence.RowContainer;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.HashTableSinkDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.util.ReflectionUtils;
/**
 * Terminal operator that builds in-memory hash tables for the small-table sides of a
 * map join and, on close, dumps each table to a local file so the map-side join
 * operator ({@link MapJoinOperator}) can later load it.
 *
 * <p>Per row: {@link #processOp(Object, int)} evaluates the join key/value expressions
 * for the row's alias and inserts them into the alias's {@link HashMapWrapper}. On
 * {@link #closeOp(boolean)} every populated hash table is flushed to a file under the
 * tmp URI of the local work.
 *
 * <p>Thread-safety: not documented here; the operator carries per-instance mutable
 * state and is presumably confined to a single task thread — confirm against the
 * execution framework before sharing.
 */
public class HashTableSinkOperator extends TerminalOperator<HashTableSinkDesc> implements
    Serializable {
  private static final long serialVersionUID = 1L;
  private static final Log LOG = LogFactory.getLog(HashTableSinkOperator.class.getName());

  // ---- State mirrored from the abstract map-join operator ----

  /**
   * The expressions for join inputs' join keys, keyed by table alias tag.
   */
  protected transient Map<Byte, List<ExprNodeEvaluator>> joinKeys;
  /**
   * The ObjectInspectors for the join inputs' join keys.
   */
  protected transient Map<Byte, List<ObjectInspector>> joinKeysObjectInspectors;
  /**
   * The standard (writable-copy) ObjectInspectors for the join inputs' join keys.
   */
  protected transient Map<Byte, List<ObjectInspector>> joinKeysStandardObjectInspectors;

  protected transient int posBigTableTag = -1; // tag of the one table that is not in memory
  protected transient int posBigTableAlias = -1; // alias of the one table that is not in memory
  transient int mapJoinRowsKey; // rows for a given key
  protected transient RowContainer<ArrayList<Object>> emptyList = null;
  transient int numMapRowsRead;
  protected transient int totalSz; // total size of the composite object
  transient boolean firstRow; // true until the first row triggers key-metadata registration

  /**
   * The filter expressions for the join, keyed by table alias tag.
   */
  protected transient Map<Byte, List<ExprNodeEvaluator>> joinFilters;
  protected transient int numAliases; // number of aliases
  /**
   * The expressions for join outputs (values), keyed by table alias tag.
   */
  protected transient Map<Byte, List<ExprNodeEvaluator>> joinValues;
  /**
   * The ObjectInspectors for the join inputs (values).
   */
  protected transient Map<Byte, List<ObjectInspector>> joinValuesObjectInspectors;
  /**
   * The ObjectInspectors for join filters.
   */
  protected transient Map<Byte, List<ObjectInspector>> joinFilterObjectInspectors;
  /**
   * The standard ObjectInspectors for the join inputs (values).
   */
  protected transient Map<Byte, List<ObjectInspector>> joinValuesStandardObjectInspectors;
  /**
   * ObjectInspectors used for the row containers: equal to the standard value OIs for
   * inner joins; for outer joins each alias additionally carries a trailing boolean
   * filter-flag inspector (see initializeOp).
   */
  protected transient Map<Byte, List<ObjectInspector>> rowContainerStandardObjectInspectors;

  protected transient Byte[] order; // order in which the results should be produced
  Configuration hconf;
  protected transient Byte alias;
  protected transient Map<Byte, TableDesc> spillTableDesc; // spill tables are
  /**
   * One in-memory hash table per small-table alias; populated in processOp and dumped
   * to disk in closeOp.
   */
  protected transient Map<Byte, HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>> mapJoinTables;
  protected transient boolean noOuterJoin;
  private long rowNumber = 0; // count of distinct keys inserted; drives the abort check
  protected transient LogHelper console;
  // NOTE(review): looks like this controls how often the memory-abort check runs
  // (every hashTableScale-th row) — confirm against HIVEHASHTABLESCALE semantics.
  private long hashTableScale;
  private boolean isAbort = false;

  /**
   * Holder for the (ObjectInspector, SerDe, TableDesc, Configuration) tuple that the
   * map-join runtime needs to serialize/deserialize one key or value stream.
   */
  public static class HashTableSinkObjectCtx {
    ObjectInspector standardOI;
    SerDe serde;
    TableDesc tblDesc;
    Configuration conf;

    /**
     * @param standardOI standard ObjectInspector for the serialized objects
     * @param serde serde used to write/read the objects
     * @param tblDesc table descriptor supplying serde properties
     * @param conf configuration to use at (de)serialization time
     */
    public HashTableSinkObjectCtx(ObjectInspector standardOI, SerDe serde, TableDesc tblDesc,
        Configuration conf) {
      this.standardOI = standardOI;
      this.serde = serde;
      this.tblDesc = tblDesc;
      this.conf = conf;
    }

    /**
     * @return the standardOI
     */
    public ObjectInspector getStandardOI() {
      return standardOI;
    }

    /**
     * @return the serde
     */
    public SerDe getSerDe() {
      return serde;
    }

    public TableDesc getTblDesc() {
      return tblDesc;
    }

    public Configuration getConf() {
      return conf;
    }
  }

  private static final transient String[] FATAL_ERR_MSG = {
      null, // counter value 0 means no error
      "Mapside join exceeds available memory. "
          + "Please try removing the mapjoin hint."};

  private final int metadataKeyTag = -1; // sentinel tag under which key metadata is registered
  transient int[] metadataValueTag; // per-alias tag once value metadata has been registered (-1 = not yet)

  public HashTableSinkOperator() {
  }

  /**
   * Builds a sink whose descriptor is derived from an existing map-join operator's
   * configuration.
   */
  public HashTableSinkOperator(MapJoinOperator mjop) {
    this.conf = new HashTableSinkDesc(mjop.getConf());
  }

  /**
   * Sets up key/value/filter evaluators and their ObjectInspectors for every small
   * table, and creates one empty {@link HashMapWrapper} per small-table alias.
   *
   * @param hconf job configuration supplying hash-table threshold/load-factor/scale
   * @throws HiveException if expression initialization fails
   */
  @Override
  protected void initializeOp(Configuration hconf) throws HiveException {
    boolean isSilent = HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVESESSIONSILENT);
    console = new LogHelper(LOG, isSilent);
    numMapRowsRead = 0;
    firstRow = true;

    // for small tables only; so get the big table position first
    posBigTableTag = conf.getPosBigTable();
    order = conf.getTagOrder();
    posBigTableAlias = order[posBigTableTag];

    // initialize some variables, which used to be initialized in CommonJoinOperator
    numAliases = conf.getExprs().size();
    this.hconf = hconf;
    totalSz = 0;
    noOuterJoin = conf.isNoOuterJoin();

    // process join keys
    joinKeys = new HashMap<Byte, List<ExprNodeEvaluator>>();
    JoinUtil.populateJoinKeyValue(joinKeys, conf.getKeys(), order, posBigTableAlias);
    joinKeysObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinKeys,
        inputObjInspectors, posBigTableAlias);
    joinKeysStandardObjectInspectors = JoinUtil.getStandardObjectInspectors(
        joinKeysObjectInspectors, posBigTableAlias);

    // process join values
    joinValues = new HashMap<Byte, List<ExprNodeEvaluator>>();
    JoinUtil.populateJoinKeyValue(joinValues, conf.getExprs(), order, posBigTableAlias);
    joinValuesObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinValues,
        inputObjInspectors, posBigTableAlias);
    joinValuesStandardObjectInspectors = JoinUtil.getStandardObjectInspectors(
        joinValuesObjectInspectors, posBigTableAlias);

    // process join filters
    joinFilters = new HashMap<Byte, List<ExprNodeEvaluator>>();
    JoinUtil.populateJoinKeyValue(joinFilters, conf.getFilters(), order, posBigTableAlias);
    joinFilterObjectInspectors = JoinUtil.getObjectInspectorsFromEvaluators(joinFilters,
        inputObjInspectors, posBigTableAlias);

    if (noOuterJoin) {
      rowContainerStandardObjectInspectors = joinValuesStandardObjectInspectors;
    } else {
      Map<Byte, List<ObjectInspector>> rowContainerObjectInspectors =
          new HashMap<Byte, List<ObjectInspector>>();
      for (Byte alias : order) {
        if (alias == posBigTableAlias) {
          continue;
        }
        ArrayList<ObjectInspector> rcOIs = new ArrayList<ObjectInspector>();
        rcOIs.addAll(joinValuesObjectInspectors.get(alias));
        // for each alias, add object inspector for boolean as the last element
        rcOIs.add(PrimitiveObjectInspectorFactory.writableBooleanObjectInspector);
        rowContainerObjectInspectors.put(alias, rcOIs);
      }
      rowContainerStandardObjectInspectors =
          getStandardObjectInspectors(rowContainerObjectInspectors);
    }

    metadataValueTag = new int[numAliases];
    for (int pos = 0; pos < numAliases; pos++) {
      metadataValueTag[pos] = -1;
    }

    mapJoinTables = new HashMap<Byte, HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>>();
    int hashTableThreshold = HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVEHASHTABLETHRESHOLD);
    float hashTableLoadFactor = HiveConf.getFloatVar(hconf,
        HiveConf.ConfVars.HIVEHASHTABLELOADFACTOR);
    float hashTableMaxMemoryUsage = this.getConf().getHashtableMemoryUsage();
    hashTableScale = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVEHASHTABLESCALE);
    if (hashTableScale <= 0) {
      hashTableScale = 1; // guard against a zero/negative config; also avoids % by zero later
    }

    // initialize the hash tables for other (small) tables
    for (Byte pos : order) {
      if (pos == posBigTableTag) {
        continue;
      }
      HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue> hashTable =
          new HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>(
              hashTableThreshold, hashTableLoadFactor, hashTableMaxMemoryUsage);
      mapJoinTables.put(pos, hashTable);
    }
  }

  /**
   * Converts every ObjectInspector in the map to its standard writable-copy form.
   *
   * @param aliasToObjectInspectors per-alias lists of source ObjectInspectors
   * @return a new map with the same keys and standardized inspector lists
   */
  protected static HashMap<Byte, List<ObjectInspector>> getStandardObjectInspectors(
      Map<Byte, List<ObjectInspector>> aliasToObjectInspectors) {
    HashMap<Byte, List<ObjectInspector>> result = new HashMap<Byte, List<ObjectInspector>>();
    for (Entry<Byte, List<ObjectInspector>> oiEntry : aliasToObjectInspectors.entrySet()) {
      Byte alias = oiEntry.getKey();
      List<ObjectInspector> oiList = oiEntry.getValue();
      ArrayList<ObjectInspector> fieldOIList = new ArrayList<ObjectInspector>(oiList.size());
      for (int i = 0; i < oiList.size(); i++) {
        fieldOIList.add(ObjectInspectorUtils.getStandardObjectInspector(oiList.get(i),
            ObjectInspectorCopyOption.WRITABLE));
      }
      result.put(alias, fieldOIList);
    }
    return result;
  }

  /**
   * Registers the serialization metadata for the join key under the sentinel tag, so
   * the map-join loader can deserialize keys from the dumped files.
   *
   * @throws SerDeException if the key serde cannot be instantiated/initialized
   */
  private void setKeyMetaData() throws SerDeException {
    TableDesc keyTableDesc = conf.getKeyTblDesc();
    SerDe keySerializer = (SerDe) ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(),
        null);
    keySerializer.initialize(null, keyTableDesc.getProperties());

    MapJoinMetaData.clear();
    MapJoinMetaData.put(Integer.valueOf(metadataKeyTag), new HashTableSinkObjectCtx(
        ObjectInspectorUtils.getStandardObjectInspector(keySerializer.getObjectInspector(),
            ObjectInspectorCopyOption.WRITABLE), keySerializer, keyTableDesc, hconf));
  }

  /*
   * This operator only processes small tables: read the key/value pairs and load them
   * into the per-alias hash table. Rows sharing a key are accumulated into one
   * MapJoinRowContainer. Periodically (every hashTableScale-th key) the table is asked
   * whether memory usage forces an abort.
   */
  @Override
  public void processOp(Object row, int tag) throws HiveException {
    // let the mapJoinOp process these small tables
    try {
      if (firstRow) {
        // generate the map metadata
        setKeyMetaData();
        firstRow = false;
      }
      alias = order[tag];
      // alias = (byte)tag;

      // compute keys and values as StandardObjects
      AbstractMapJoinKey keyMap = JoinUtil.computeMapJoinKeys(row, joinKeys.get(alias),
          joinKeysObjectInspectors.get(alias));
      Object[] value = JoinUtil.computeMapJoinValues(row, joinValues.get(alias),
          joinValuesObjectInspectors.get(alias), joinFilters.get(alias),
          joinFilterObjectInspectors.get(alias), noOuterJoin);

      HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue> hashTable =
          mapJoinTables.get((byte) tag);

      MapJoinObjectValue o = hashTable.get(keyMap);
      MapJoinRowContainer<Object[]> res = null;

      boolean needNewKey = true;
      if (o == null) {
        // first row for this key: start a new container
        res = new MapJoinRowContainer<Object[]>();
        res.add(value);

        if (metadataValueTag[tag] == -1) {
          metadataValueTag[tag] = order[tag];
          setValueMetaData(tag);
        }

        // Construct externalizable objects for key and value
        if (needNewKey) {
          MapJoinObjectValue valueObj = new MapJoinObjectValue(metadataValueTag[tag], res);
          rowNumber++;
          if (rowNumber > hashTableScale && rowNumber % hashTableScale == 0) {
            isAbort = hashTable.isAbort(rowNumber, console);
            if (isAbort) {
              throw new HiveException("RunOutOfMemoryUsage");
            }
          }
          hashTable.put(keyMap, valueObj);
        }
      } else {
        // key already present: append this row's values to the existing container
        res = o.getObj();
        res.add(value);
      }
    } catch (SerDeException e) {
      throw new HiveException(e);
    }
  }

  /**
   * Registers the serialization metadata for the value stream of one alias, building a
   * struct ObjectInspector with synthetic field names tmp_0..tmp_{n-1} over the row
   * container's standard inspectors.
   *
   * @param tag alias tag whose value metadata is being registered
   * @throws SerDeException if the value serde cannot be instantiated/initialized
   */
  private void setValueMetaData(int tag) throws SerDeException {
    TableDesc valueTableDesc = conf.getValueTblFilteredDescs().get(tag);
    SerDe valueSerDe = (SerDe) ReflectionUtils.newInstance(valueTableDesc.getDeserializerClass(),
        null);

    valueSerDe.initialize(null, valueTableDesc.getProperties());

    List<ObjectInspector> newFields = rowContainerStandardObjectInspectors.get((Byte) alias);
    int length = newFields.size();
    List<String> newNames = new ArrayList<String>(length);
    for (int i = 0; i < length; i++) {
      // FIX: dropped the redundant "new String(...)" copy around the concatenation
      newNames.add("tmp_" + i);
    }

    StandardStructObjectInspector standardOI = ObjectInspectorFactory
        .getStandardStructObjectInspector(newNames, newFields);

    MapJoinMetaData.put(Integer.valueOf(metadataValueTag[tag]), new HashTableSinkObjectCtx(
        standardOI, valueSerDe, valueTableDesc, hconf));
  }

  /**
   * Flushes every populated hash table to a local dump file under the tmp URI and
   * closes the tables.
   *
   * <p>FIX: failures are no longer logged-and-swallowed (which let the task "succeed"
   * with a missing/truncated hash-table file); they now propagate as HiveException
   * with the original cause attached.
   *
   * @param abort whether the operator is being closed due to an abort
   * @throws HiveException if dumping any hash table fails
   */
  @Override
  public void closeOp(boolean abort) throws HiveException {
    try {
      if (mapJoinTables != null) {
        // get tmp file URI
        String tmpURI = this.getExecContext().getLocalWork().getTmpFileURI();
        LOG.info("Get TMP URI: " + tmpURI);
        long fileLength;
        for (Map.Entry<Byte, HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue>> hashTables : mapJoinTables
            .entrySet()) {
          // get the key and value
          Byte tag = hashTables.getKey();
          HashMapWrapper<AbstractMapJoinKey, MapJoinObjectValue> hashTable = hashTables.getValue();

          // get current input file name
          String bigBucketFileName = this.getExecContext().getCurrentBigBucketFile();
          if (bigBucketFileName == null || bigBucketFileName.length() == 0) {
            bigBucketFileName = "-";
          }
          // get the tmp URI path; it will be a hdfs path if not local mode
          String tmpURIPath = Utilities.generatePath(tmpURI, conf.getDumpFilePrefix(), tag,
              bigBucketFileName);
          hashTable.isAbort(rowNumber, console);
          console.printInfo(Utilities.now() + "\tDump the hashtable into file: " + tmpURIPath);

          // get the hashtable file and path
          Path path = new Path(tmpURIPath);
          FileSystem fs = path.getFileSystem(hconf);
          File file = new File(path.toUri().getPath());
          // FIX: create() returns an output stream that was previously leaked; close it
          // immediately — the actual bytes are written to the local File below.
          fs.create(path).close();
          fileLength = hashTable.flushMemoryCacheToPersistent(file);
          console.printInfo(Utilities.now() + "\tUpload 1 File to: " + tmpURIPath + " File size: "
              + fileLength);

          hashTable.close();
        }
      }
      super.closeOp(abort);
    } catch (HiveException e) {
      // already the right type; do not wrap twice
      throw e;
    } catch (Exception e) {
      LOG.error("Generate Hashtable error", e);
      throw new HiveException("Error while generating/dumping the map-join hash table", e);
    }
  }

  /**
   * Implements the getName function for the Node Interface.
   *
   * @return the name of the operator
   */
  @Override
  public String getName() {
    return "HASHTABLESINK";
  }

  @Override
  public OperatorType getType() {
    return OperatorType.HASHTABLESINK;
  }
}