/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.persistence.RowContainer;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hive.common.util.ReflectionUtil;

/**
 * Utility methods shared by the join operators.
 */
public class JoinUtil {
/**
* Represents the join result between two tables.
*/
public static enum JoinResult {
MATCH, // A match is found
NOMATCH, // No match is found, and the current row will be dropped
SPILL // The current row has been spilled to disk, as the join is postponed
}

/**
* Initializes each alias's expression evaluators against its input object
* inspector and returns the resulting field object inspectors, indexed by
* alias. Entries for the big table (posBigTableAlias) are left null.
*/
public static List<ObjectInspector>[] getObjectInspectorsFromEvaluators(
List<ExprNodeEvaluator>[] exprEntries,
ObjectInspector[] inputObjInspector,
int posBigTableAlias, int tagLen) throws HiveException {
List<ObjectInspector>[] result = new List[tagLen];
int iterate = Math.min(exprEntries.length, inputObjInspector.length);
for (byte alias = 0; alias < iterate; alias++) {
ObjectInspector inputOI = inputObjInspector[alias];
// For vectorized reduce-side operators getting inputs from a reduce sink,
// the row object inspector will get a flattened version of the object inspector
// where the nested key/value structs are replaced with a single struct:
// Example: { key: { reducesinkkey0:int }, value: { _col0:int, _col1:int, .. } }
// Would get converted to the following for a vectorized input:
// { 'key.reducesinkkey0':int, 'value._col0':int, 'value._col1':int, .. }
// The ExprNodeEvaluator initialization below breaks with the flattened
// object inspectors, so convert it back to a form that contains the
// nested key/value structs.
inputOI = unflattenObjInspector(inputOI);
if (alias == (byte) posBigTableAlias ||
exprEntries[alias] == null || inputOI == null) {
// skip the driver and directly loadable tables
continue;
}
List<ExprNodeEvaluator> exprList = exprEntries[alias];
List<ObjectInspector> fieldOIList = new ArrayList<ObjectInspector>();
for (int i = 0; i < exprList.size(); i++) {
fieldOIList.add(exprList.get(i).initialize(inputOI));
}
result[alias] = fieldOIList;
}
return result;
}

/**
* Converts each alias's object inspectors into standard (writable) object
* inspectors, skipping the big table alias.
*/
public static List<ObjectInspector>[] getStandardObjectInspectors(
List<ObjectInspector>[] aliasToObjectInspectors,
int posBigTableAlias, int tagLen) {
List<ObjectInspector>[] result = new List[tagLen];
for (byte alias = 0; alias < aliasToObjectInspectors.length; alias++) {
if (alias == (byte) posBigTableAlias || aliasToObjectInspectors[alias] == null) {
// skip the big table alias and aliases with no object inspectors
continue;
}
List<ObjectInspector> oiList = aliasToObjectInspectors[alias];
ArrayList<ObjectInspector> fieldOIList = new ArrayList<ObjectInspector>(
oiList.size());
for (int i = 0; i < oiList.size(); i++) {
fieldOIList.add(ObjectInspectorUtils.getStandardObjectInspector(oiList
.get(i), ObjectInspectorCopyOption.WRITABLE));
}
result[alias] = fieldOIList;
}
return result;
}

/**
* Creates expression evaluators for each alias's join expressions and stores
* them in outMap, indexed by alias; the big table alias receives null
* placeholders. Returns the total number of evaluator slots populated.
*/
public static int populateJoinKeyValue(List<ExprNodeEvaluator>[] outMap,
Map<Byte, List<ExprNodeDesc>> inputMap, int posBigTableAlias, Configuration conf) throws HiveException {
return populateJoinKeyValue(outMap, inputMap, null, posBigTableAlias, conf);
}
public static int populateJoinKeyValue(List<ExprNodeEvaluator>[] outMap,
Map<Byte, List<ExprNodeDesc>> inputMap,
Byte[] order,
int posBigTableAlias, Configuration conf) throws HiveException {
int total = 0;
for (Entry<Byte, List<ExprNodeDesc>> e : inputMap.entrySet()) {
if (e.getValue() == null) {
continue;
}
Byte key = order == null ? e.getKey() : order[e.getKey()];
List<ExprNodeEvaluator> valueFields = new ArrayList<ExprNodeEvaluator>();
for (ExprNodeDesc expr : e.getValue()) {
if (key == (byte) posBigTableAlias) {
valueFields.add(null);
} else {
valueFields.add(ExprNodeEvaluatorFactory.get(expr, conf));
}
}
outMap[key] = valueFields;
total += valueFields.size();
}
return total;
}
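
// Illustrative example (hypothetical aliases): for a two-way join with the big
// table at position 0 and inputMap = { 0: [c0], 1: [c1, c2] }, the loop above
// populates outMap[0] = [null] (a placeholder for the big table) and
// outMap[1] = [eval(c1), eval(c2)], and returns a total of 3.
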
/**
* Return the key as a standard object. StandardObject can be inspected by a
* standard ObjectInspector.
*/
public static ArrayList<Object> computeKeys(Object row,
List<ExprNodeEvaluator> keyFields, List<ObjectInspector> keyFieldsOI)
throws HiveException {
// Compute the keys
ArrayList<Object> nr = new ArrayList<Object>(keyFields.size());
for (int i = 0; i < keyFields.size(); i++) {
nr.add(ObjectInspectorUtils.copyToStandardObject(keyFields.get(i)
.evaluate(row), keyFieldsOI.get(i),
ObjectInspectorCopyOption.WRITABLE));
}
return nr;
}
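
// Illustrative example: for key expressions over an int column and a string
// column, the result is a list such as [IntWritable(5), Text("x")] -- writable
// copies that can be hashed and compared independently of the source row.
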
/**
* Return the value as a standard object. StandardObject can be inspected by a
* standard ObjectInspector.
*/
public static Object[] computeMapJoinValues(Object row,
List<ExprNodeEvaluator> valueFields, List<ObjectInspector> valueFieldsOI,
List<ExprNodeEvaluator> filters, List<ObjectInspector> filtersOI,
int[] filterMap) throws HiveException {
// Compute the values
Object[] nr;
if (filterMap != null) {
nr = new Object[valueFields.size()+1];
// add whether the row is filtered or not.
nr[valueFields.size()] = new ShortWritable(isFiltered(row, filters, filtersOI, filterMap));
} else {
nr = new Object[valueFields.size()];
}
for (int i = 0; i < valueFields.size(); i++) {
nr[i] = ObjectInspectorUtils.copyToStandardObject(valueFields.get(i)
.evaluate(row), valueFieldsOI.get(i),
ObjectInspectorCopyOption.WRITABLE);
}
return nr;
}
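
// Illustrative example: with two value columns and a non-null filterMap, the
// returned array has three slots -- [v0, v1, ShortWritable(filterMask)] --
// where the trailing short records which aliases' filters the row failed.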

/**
* Return the value as a standard object. StandardObject can be inspected by a
* standard ObjectInspector.
* If the row may be tagged by a filter, capacity for one extra slot is
* reserved so the caller can append the filter tag without reallocation.
*/
public static List<Object> computeValues(Object row,
List<ExprNodeEvaluator> valueFields, List<ObjectInspector> valueFieldsOI, boolean hasFilter)
throws HiveException {
// Compute the values
int reserve = hasFilter ? valueFields.size() + 1 : valueFields.size();
List<Object> nr = new ArrayList<Object>(reserve);
for (int i = 0; i < valueFields.size(); i++) {
nr.add(ObjectInspectorUtils.copyToStandardObject(valueFields.get(i)
.evaluate(row), valueFieldsOI.get(i),
ObjectInspectorCopyOption.WRITABLE));
}
return nr;
}
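
// Bit masks used to record which aliases' filters a row failed; effectively
// MASKS[idx] == (short) (1 << idx). Note that a short has only 16 bits, so
// only indexes 0 through 15 yield distinct non-zero masks; higher entries
// wrap to zero.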
private static final short[] MASKS;
static {
int num = 32;
MASKS = new short[num];
MASKS[0] = 1;
for (int idx = 1; idx < num; idx++) {
MASKS[idx] = (short)(2 * MASKS[idx-1]);
}
}
/**
* Returns true if the row does not pass through filters.
*/
protected static boolean isFiltered(Object row, List<ExprNodeEvaluator> filters,
List<ObjectInspector> filtersOIs) throws HiveException {
for (int i = 0; i < filters.size(); i++) {
ExprNodeEvaluator evaluator = filters.get(i);
Object condition = evaluator.evaluate(row);
Boolean result = (Boolean) ((PrimitiveObjectInspector) filtersOIs.get(i)).
getPrimitiveJavaObject(condition);
if (result == null || !result) {
return true;
}
}
return false;
}

/**
* Applies the join filters to the row and returns a bit mask with one bit set
* for each tag (alias) whose filters the row failed; 0 means every filter
* passed.
*/
protected static short isFiltered(Object row, List<ExprNodeEvaluator> filters,
List<ObjectInspector> ois, int[] filterMap) throws HiveException {
// apply join filters on the row.
short ret = 0;
int j = 0;
for (int i = 0; i < filterMap.length; i += 2) {
int tag = filterMap[i];
int length = filterMap[i + 1];
boolean passed = true;
for (; length > 0; length--, j++) {
if (passed) {
Object condition = filters.get(j).evaluate(row);
Boolean result = (Boolean) ((PrimitiveObjectInspector)
ois.get(j)).getPrimitiveJavaObject(condition);
if (result == null || !result) {
passed = false;
}
}
}
if (!passed) {
ret |= MASKS[tag];
}
}
return ret;
}
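
// Illustrative example: filterMap = {0, 1, 1, 2} means one filter applies to
// tag 0 and the next two filters apply to tag 1. If the tag-0 filter passes
// but either tag-1 filter fails, the returned mask is MASKS[1], so
// isFiltered(mask, 1) is true while isFiltered(mask, 0) is false.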

/** Returns true if the filter bit for the given tag is set in the mask. */
protected static boolean isFiltered(short filter, int tag) {
return (filter & MASKS[tag]) != 0;
}

/** Returns true if any filter bit is set in the given mask. */
protected static boolean hasAnyFiltered(short tag) {
return tag != 0;
}

/**
* Returns the spill table descriptor for the given alias, creating the
* descriptors from the join configuration if none were supplied.
*/
public static TableDesc getSpillTableDesc(Byte alias, TableDesc[] spillTableDesc,
JoinDesc conf, boolean noFilter) {
if (spillTableDesc == null || spillTableDesc.length == 0) {
spillTableDesc = initSpillTables(conf, noFilter);
}
return spillTableDesc[alias];
}

/**
* Instantiates and initializes the SerDe for the given alias's spill table,
* returning null if there is no spill table descriptor or initialization fails.
*/
public static AbstractSerDe getSpillSerDe(byte alias, TableDesc[] spillTableDesc,
JoinDesc conf, boolean noFilter) {
TableDesc desc = getSpillTableDesc(alias, spillTableDesc, conf, noFilter);
if (desc == null) {
return null;
}
AbstractSerDe sd = (AbstractSerDe) ReflectionUtil.newInstance(desc.getDeserializerClass(),
null);
try {
SerDeUtils.initializeSerDe(sd, null, desc.getProperties(), null);
} catch (SerDeException e) {
e.printStackTrace();
return null;
}
return sd;
}

/**
* Builds, for each tag, a TableDesc describing how spilled join values are
* serialized: LazyBinarySerDe over SequenceFiles, with an optional trailing
* "filtered" smallint column holding the filter mask.
*/
public static TableDesc[] initSpillTables(JoinDesc conf, boolean noFilter) {
int tagLen = conf.getTagLength();
Map<Byte, List<ExprNodeDesc>> exprs = conf.getExprs();
TableDesc[] spillTableDesc = new TableDesc[tagLen];
for (int tag = 0; tag < exprs.size(); tag++) {
List<ExprNodeDesc> valueCols = exprs.get((byte) tag);
int columnSize = valueCols.size();
StringBuilder colNames = new StringBuilder();
StringBuilder colTypes = new StringBuilder();
if (columnSize <= 0) {
continue;
}
for (int k = 0; k < columnSize; k++) {
String newColName = tag + "_VALUE_" + k; // any name, it does not matter
colNames.append(newColName);
colNames.append(',');
colTypes.append(valueCols.get(k).getTypeString());
colTypes.append(',');
}
if (!noFilter) {
colNames.append("filtered");
colNames.append(',');
colTypes.append(TypeInfoFactory.shortTypeInfo.getTypeName());
colTypes.append(',');
}
// remove the last ','
colNames.setLength(colNames.length() - 1);
colTypes.setLength(colTypes.length() - 1);
TableDesc tblDesc = new TableDesc(
SequenceFileInputFormat.class, HiveSequenceFileOutputFormat.class,
Utilities.makeProperties(
serdeConstants.SERIALIZATION_FORMAT, "" + Utilities.ctrlaCode,
serdeConstants.LIST_COLUMNS, colNames.toString(),
serdeConstants.LIST_COLUMN_TYPES, colTypes.toString(),
serdeConstants.SERIALIZATION_LIB, LazyBinarySerDe.class.getName()));
spillTableDesc[tag] = tblDesc;
}
return spillTableDesc;
}
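
// Illustrative example: for tag 1 with two value columns of types int and
// string and noFilter == false, the generated spill table has columns
// "1_VALUE_0,1_VALUE_1,filtered" with types "int,string,smallint", serialized
// with LazyBinarySerDe into SequenceFiles.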

public static RowContainer<List<Object>> getRowContainer(Configuration hconf,
List<ObjectInspector> structFieldObjectInspectors,
Byte alias, int containerSize, TableDesc[] spillTableDesc,
JoinDesc conf, boolean noFilter, Reporter reporter) throws HiveException {
TableDesc tblDesc = JoinUtil.getSpillTableDesc(alias, spillTableDesc, conf, noFilter);
AbstractSerDe serde = JoinUtil.getSpillSerDe(alias, spillTableDesc, conf, noFilter);
if (serde == null) {
containerSize = -1;
}
RowContainer<List<Object>> rc = new RowContainer<List<Object>>(containerSize, hconf, reporter);
StructObjectInspector rcOI = null;
if (tblDesc != null) {
// arbitrary column names used internally for serializing to spill table
List<String> colNames = Utilities.getColumnNames(tblDesc.getProperties());
// object inspector for serializing input tuples
rcOI = ObjectInspectorFactory.getStandardStructObjectInspector(colNames,
structFieldObjectInspectors);
}
rc.setSerDe(serde, rcOI);
rc.setTableDesc(tblDesc);
return rc;
}
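
// Hedged usage sketch (variable names hypothetical): a join operator might
// create one container per small-table alias, e.g.
//   RowContainer<List<Object>> rc = JoinUtil.getRowContainer(hconf,
//       joinValuesStandardObjectInspectors[alias], alias, blockSize,
//       spillTableDesc, conf, noFilter, reporter);
// and then call rc.addRow(value) for each row to be buffered, with rows
// spilling to the spill table when the in-memory block fills up.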

private static final String KEY_FIELD_PREFIX = (Utilities.ReduceField.KEY + ".").toLowerCase();
private static final String VALUE_FIELD_PREFIX = (Utilities.ReduceField.VALUE + ".").toLowerCase();

/**
* Create a new struct object inspector for the list of struct fields, first
* removing the prefix from each field name.
* @param fields the struct fields to include in the new struct
* @param prefixToRemove the prefix (without the trailing '.') to strip from
*        each field name
* @return a standard struct object inspector over the renamed fields
*/
private static ObjectInspector createStructFromFields(List<StructField> fields, String prefixToRemove) {
int prefixLength = prefixToRemove.length() + 1; // also remove the '.' after the prefix
ArrayList<String> fieldNames = new ArrayList<String>();
ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
for (StructField field : fields) {
fieldNames.add(field.getFieldName().substring(prefixLength));
fieldOIs.add(field.getFieldObjectInspector());
}
return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
}

/**
* Checks the input object inspector to see if it is in the form of a flattened
* struct like the ones generated by a vectorized reduce sink input:
* { 'key.reducesinkkey0':int, 'value._col0':int, 'value._col1':int, .. }
* If so, creates an "unflattened" struct that contains nested key/value
* structs:
* { key: { reducesinkkey0:int }, value: { _col0:int, _col1:int, .. } }
*
* @param oi the object inspector to check
* @return the unflattened object inspector if unflattening is needed,
*         otherwise the original object inspector
*/
private static ObjectInspector unflattenObjInspector(ObjectInspector oi) {
if (oi instanceof StructObjectInspector) {
// Check if all fields start with "key." or "value."
// If so, then unflatten by adding an additional level of nested key and value structs
// Example: { "key.reducesinkkey0":int, "key.reducesinkkey1": int, "value._col6":int }
// Becomes
// { "key": { "reducesinkkey0":int, "reducesinkkey1":int }, "value": { "_col6":int } }
ArrayList<StructField> keyFields = new ArrayList<StructField>();
ArrayList<StructField> valueFields = new ArrayList<StructField>();
for (StructField field : ((StructObjectInspector) oi).getAllStructFieldRefs()) {
String fieldNameLower = field.getFieldName().toLowerCase();
if (fieldNameLower.startsWith(KEY_FIELD_PREFIX)) {
keyFields.add(field);
} else if (fieldNameLower.startsWith(VALUE_FIELD_PREFIX)) {
valueFields.add(field);
} else {
// Not a flattened struct, no need to unflatten
return oi;
}
}
// All field names start with "key." or "value."
// Create key/value structs and add the respective fields to each one
ArrayList<ObjectInspector> reduceFieldOIs = new ArrayList<ObjectInspector>();
reduceFieldOIs.add(createStructFromFields(keyFields, Utilities.ReduceField.KEY.toString()));
reduceFieldOIs.add(createStructFromFields(valueFields, Utilities.ReduceField.VALUE.toString()));
// Finally create the outer struct to contain the key, value structs
return ObjectInspectorFactory.getStandardStructObjectInspector(
Utilities.reduceFieldNameList,
reduceFieldOIs);
}
return oi;
}
}