/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.persistence.RowContainer;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.JoinDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils.ObjectInspectorCopyOption;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hive.common.util.ReflectionUtil;

public class JoinUtil {

  /**
   * Represents the join result between two tables.
   */
  public static enum JoinResult {
    MATCH,   // A match is found
    NOMATCH, // No match is found, and the current row will be dropped
    SPILL    // The current row has been spilled to disk, as the join is postponed
  }
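
  /*
   * Illustrative only, not part of the production code path: a hash-table
   * probe in a map join might dispatch on JoinResult roughly as below,
   * assuming hypothetical lookup(), forward() and spillRow() helpers.
   *
   *   JoinResult res = lookup(key);
   *   switch (res) {
   *     case MATCH:   forward(joinedRow); break;  // emit the joined row
   *     case NOMATCH: break;                      // inner join: drop the row
   *     case SPILL:   spillRow(row); break;       // retry once the small table is loaded
   *   }
   */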

  public static List<ObjectInspector>[] getObjectInspectorsFromEvaluators(
      List<ExprNodeEvaluator>[] exprEntries,
      ObjectInspector[] inputObjInspector,
      int posBigTableAlias, int tagLen) throws HiveException {
    List<ObjectInspector>[] result = new List[tagLen];

    int iterate = Math.min(exprEntries.length, inputObjInspector.length);
    for (byte alias = 0; alias < iterate; alias++) {
      ObjectInspector inputOI = inputObjInspector[alias];

      // For vectorized reduce-side operators getting inputs from a reduce sink,
      // the row object inspector will get a flattened version of the object inspector
      // where the nested key/value structs are replaced with a single struct.
      // Example: { key: { reducesinkkey0:int }, value: { _col0:int, _col1:int, .. } }
      // would get converted to the following for a vectorized input:
      // { 'key.reducesinkkey0':int, 'value._col0':int, 'value._col1':int, .. }
      // The ExprNodeEvaluator initialization below breaks with the flattened
      // object inspectors, so convert the input back to a form that contains the
      // nested key/value structs.
      inputOI = unflattenObjInspector(inputOI);

      if (alias == (byte) posBigTableAlias ||
          exprEntries[alias] == null || inputOI == null) {
        // skip the driver and directly loadable tables
        continue;
      }

      List<ExprNodeEvaluator> exprList = exprEntries[alias];
      List<ObjectInspector> fieldOIList = new ArrayList<ObjectInspector>();
      for (int i = 0; i < exprList.size(); i++) {
        fieldOIList.add(exprList.get(i).initialize(inputOI));
      }
      result[alias] = fieldOIList;
    }
    return result;
  }

  public static List<ObjectInspector>[] getStandardObjectInspectors(
      List<ObjectInspector>[] aliasToObjectInspectors,
      int posBigTableAlias, int tagLen) {
    List<ObjectInspector>[] result = new List[tagLen];
    for (byte alias = 0; alias < aliasToObjectInspectors.length; alias++) {
      if (alias == (byte) posBigTableAlias ||
          aliasToObjectInspectors[alias] == null) {
        // skip the big table
        continue;
      }
      List<ObjectInspector> oiList = aliasToObjectInspectors[alias];
      ArrayList<ObjectInspector> fieldOIList = new ArrayList<ObjectInspector>(
          oiList.size());
      for (int i = 0; i < oiList.size(); i++) {
        fieldOIList.add(ObjectInspectorUtils.getStandardObjectInspector(oiList
            .get(i), ObjectInspectorCopyOption.WRITABLE));
      }
      result[alias] = fieldOIList;
    }
    return result;
  }

  public static int populateJoinKeyValue(List<ExprNodeEvaluator>[] outMap,
      Map<Byte, List<ExprNodeDesc>> inputMap,
      int posBigTableAlias, Configuration conf) throws HiveException {
    return populateJoinKeyValue(outMap, inputMap, null, posBigTableAlias, conf);
  }

  public static int populateJoinKeyValue(List<ExprNodeEvaluator>[] outMap,
      Map<Byte, List<ExprNodeDesc>> inputMap,
      Byte[] order,
      int posBigTableAlias, Configuration conf) throws HiveException {
    int total = 0;
    for (Entry<Byte, List<ExprNodeDesc>> e : inputMap.entrySet()) {
      if (e.getValue() == null) {
        continue;
      }
      Byte key = order == null ? e.getKey() : order[e.getKey()];
      List<ExprNodeEvaluator> valueFields = new ArrayList<ExprNodeEvaluator>();
      for (ExprNodeDesc expr : e.getValue()) {
        if (key == (byte) posBigTableAlias) {
          valueFields.add(null);
        } else {
          valueFields.add(ExprNodeEvaluatorFactory.get(expr, conf));
        }
      }
      outMap[key] = valueFields;
      total += valueFields.size();
    }
    return total;
  }

  /**
   * Return the key as a standard object. StandardObject can be inspected by a
   * standard ObjectInspector.
   */
  public static ArrayList<Object> computeKeys(Object row,
      List<ExprNodeEvaluator> keyFields, List<ObjectInspector> keyFieldsOI)
      throws HiveException {
    // Compute the keys
    ArrayList<Object> nr = new ArrayList<Object>(keyFields.size());
    for (int i = 0; i < keyFields.size(); i++) {
      nr.add(ObjectInspectorUtils.copyToStandardObject(keyFields.get(i)
          .evaluate(row), keyFieldsOI.get(i),
          ObjectInspectorCopyOption.WRITABLE));
    }
    return nr;
  }
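
  /*
   * Sketch of how computeKeys() is typically used (assumes keyFields and
   * keyFieldsOI were produced by populateJoinKeyValue() and
   * getObjectInspectorsFromEvaluators() for the same alias; names are
   * illustrative):
   *
   *   ArrayList<Object> key = JoinUtil.computeKeys(row, keyFields, keyFieldsOI);
   *   // key now holds WRITABLE standard copies (e.g. IntWritable, Text), so it
   *   // can be hashed and compared independently of the source row's layout.
   */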

  /**
   * Return the value as a standard object. StandardObject can be inspected by a
   * standard ObjectInspector.
   */
  public static Object[] computeMapJoinValues(Object row,
      List<ExprNodeEvaluator> valueFields, List<ObjectInspector> valueFieldsOI,
      List<ExprNodeEvaluator> filters, List<ObjectInspector> filtersOI,
      int[] filterMap) throws HiveException {
    // Compute the values
    Object[] nr;
    if (filterMap != null) {
      nr = new Object[valueFields.size() + 1];
      // add a tag at the end, recording whether the row is filtered or not
      nr[valueFields.size()] = new ShortWritable(isFiltered(row, filters,
          filtersOI, filterMap));
    } else {
      nr = new Object[valueFields.size()];
    }
    for (int i = 0; i < valueFields.size(); i++) {
      nr[i] = ObjectInspectorUtils.copyToStandardObject(valueFields.get(i)
          .evaluate(row), valueFieldsOI.get(i),
          ObjectInspectorCopyOption.WRITABLE);
    }
    return nr;
  }

  /**
   * Return the value as a standard object. StandardObject can be inspected by a
   * standard ObjectInspector.
   * If the value is tagged by a filter, one more slot is reserved for the tag.
   */
  public static List<Object> computeValues(Object row,
      List<ExprNodeEvaluator> valueFields, List<ObjectInspector> valueFieldsOI,
      boolean hasFilter) throws HiveException {
    // Compute the values
    int reserve = hasFilter ? valueFields.size() + 1 : valueFields.size();
    List<Object> nr = new ArrayList<Object>(reserve);
    for (int i = 0; i < valueFields.size(); i++) {
      nr.add(ObjectInspectorUtils.copyToStandardObject(valueFields.get(i)
          .evaluate(row), valueFieldsOI.get(i),
          ObjectInspectorCopyOption.WRITABLE));
    }
    return nr;
  }

  private static final short[] MASKS;
  static {
    int num = 32;
    MASKS = new short[num];
    MASKS[0] = 1;
    for (int idx = 1; idx < num; idx++) {
      MASKS[idx] = (short) (2 * MASKS[idx - 1]);
    }
  }

  /**
   * Returns true if the row does not pass through filters.
   */
  protected static boolean isFiltered(Object row,
      List<ExprNodeEvaluator> filters, List<ObjectInspector> filtersOIs)
      throws HiveException {
    for (int i = 0; i < filters.size(); i++) {
      ExprNodeEvaluator evaluator = filters.get(i);
      Object condition = evaluator.evaluate(row);
      Boolean result = (Boolean) ((PrimitiveObjectInspector)
          filtersOIs.get(i)).getPrimitiveJavaObject(condition);
      if (result == null || !result) {
        return true;
      }
    }
    return false;
  }
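
  /*
   * Worked example of the filterMap encoding consumed by the bitmask variant
   * of isFiltered() below (the values are illustrative). filterMap is a flat
   * array of (alias tag, number of filter expressions) pairs:
   *
   *   filterMap = { 1, 2, 3, 1 }   // tag 1 has 2 filters, tag 3 has 1 filter
   *
   * The filters list holds the expressions in the same order: filters 0 and 1
   * belong to tag 1, filter 2 to tag 3. If the row fails any of tag 1's
   * filters, bit MASKS[1] (0x2) is set; if it fails tag 3's filter, bit
   * MASKS[3] (0x8) is set. A return value of 0xA therefore means the row is
   * filtered out of both aliases 1 and 3.
   */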

  /**
   * Applies the join filters on the row and returns a bitmask: bit {@code tag}
   * is set iff the row does not pass the filters of the table with that tag.
   */
  protected static short isFiltered(Object row, List<ExprNodeEvaluator> filters,
      List<ObjectInspector> ois, int[] filterMap) throws HiveException {
    short ret = 0;
    int j = 0;
    for (int i = 0; i < filterMap.length; i += 2) {
      int tag = filterMap[i];
      int length = filterMap[i + 1];

      boolean passed = true;
      for (; length > 0; length--, j++) {
        if (passed) {
          Object condition = filters.get(j).evaluate(row);
          Boolean result = (Boolean) ((PrimitiveObjectInspector)
              ois.get(j)).getPrimitiveJavaObject(condition);
          if (result == null || !result) {
            passed = false;
          }
        }
      }
      if (!passed) {
        ret |= MASKS[tag];
      }
    }
    return ret;
  }

  protected static boolean isFiltered(short filter, int tag) {
    return (filter & MASKS[tag]) != 0;
  }

  protected static boolean hasAnyFiltered(short tag) {
    return tag != 0;
  }

  public static TableDesc getSpillTableDesc(Byte alias,
      TableDesc[] spillTableDesc, JoinDesc conf, boolean noFilter) {
    if (spillTableDesc == null || spillTableDesc.length == 0) {
      spillTableDesc = initSpillTables(conf, noFilter);
    }
    return spillTableDesc[alias];
  }

  public static AbstractSerDe getSpillSerDe(byte alias,
      TableDesc[] spillTableDesc, JoinDesc conf, boolean noFilter) {
    TableDesc desc = getSpillTableDesc(alias, spillTableDesc, conf, noFilter);
    if (desc == null) {
      return null;
    }
    AbstractSerDe sd = (AbstractSerDe) ReflectionUtil.newInstance(
        desc.getDeserializerClass(), null);
    try {
      SerDeUtils.initializeSerDe(sd, null, desc.getProperties(), null);
    } catch (SerDeException e) {
      e.printStackTrace();
      return null;
    }
    return sd;
  }
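
  /*
   * For reference, initSpillTables() below builds one spill-table schema per
   * tag. For a tag 0 with two value columns of types int and string, and
   * noFilter == false, the generated properties would be (illustrative):
   *
   *   LIST_COLUMNS      -> "0_VALUE_0,0_VALUE_1,filtered"
   *   LIST_COLUMN_TYPES -> "int,string,smallint"
   *
   * The rows are serialized with LazyBinarySerDe into SequenceFiles; the
   * trailing "filtered" smallint column holds the isFiltered() bitmask.
   */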

  public static TableDesc[] initSpillTables(JoinDesc conf, boolean noFilter) {
    int tagLen = conf.getTagLength();
    Map<Byte, List<ExprNodeDesc>> exprs = conf.getExprs();
    TableDesc[] spillTableDesc = new TableDesc[tagLen];
    for (int tag = 0; tag < exprs.size(); tag++) {
      List<ExprNodeDesc> valueCols = exprs.get((byte) tag);
      int columnSize = valueCols.size();
      StringBuilder colNames = new StringBuilder();
      StringBuilder colTypes = new StringBuilder();
      if (columnSize <= 0) {
        continue;
      }
      for (int k = 0; k < columnSize; k++) {
        String newColName = tag + "_VALUE_" + k; // any name, it does not matter
        colNames.append(newColName);
        colNames.append(',');
        colTypes.append(valueCols.get(k).getTypeString());
        colTypes.append(',');
      }
      if (!noFilter) {
        colNames.append("filtered");
        colNames.append(',');
        colTypes.append(TypeInfoFactory.shortTypeInfo.getTypeName());
        colTypes.append(',');
      }
      // remove the trailing ','
      colNames.setLength(colNames.length() - 1);
      colTypes.setLength(colTypes.length() - 1);
      TableDesc tblDesc = new TableDesc(
          SequenceFileInputFormat.class, HiveSequenceFileOutputFormat.class,
          Utilities.makeProperties(
              org.apache.hadoop.hive.serde.serdeConstants.SERIALIZATION_FORMAT,
              "" + Utilities.ctrlaCode,
              org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMNS,
              colNames.toString(),
              org.apache.hadoop.hive.serde.serdeConstants.LIST_COLUMN_TYPES,
              colTypes.toString(),
              serdeConstants.SERIALIZATION_LIB, LazyBinarySerDe.class.getName()));
      spillTableDesc[tag] = tblDesc;
    }
    return spillTableDesc;
  }

  public static RowContainer<List<Object>> getRowContainer(Configuration hconf,
      List<ObjectInspector> structFieldObjectInspectors,
      Byte alias, int containerSize, TableDesc[] spillTableDesc,
      JoinDesc conf, boolean noFilter, Reporter reporter) throws HiveException {

    TableDesc tblDesc = JoinUtil.getSpillTableDesc(alias, spillTableDesc, conf,
        noFilter);
    AbstractSerDe serde = JoinUtil.getSpillSerDe(alias, spillTableDesc, conf,
        noFilter);

    if (serde == null) {
      containerSize = -1;
    }

    RowContainer<List<Object>> rc = new RowContainer<List<Object>>(
        containerSize, hconf, reporter);
    StructObjectInspector rcOI = null;
    if (tblDesc != null) {
      // arbitrary column names used internally for serializing to the spill table
      List<String> colNames = Utilities.getColumnNames(tblDesc.getProperties());
      // object inspector for serializing input tuples
      rcOI = ObjectInspectorFactory.getStandardStructObjectInspector(colNames,
          structFieldObjectInspectors);
    }

    rc.setSerDe(serde, rcOI);
    rc.setTableDesc(tblDesc);
    return rc;
  }
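
  /*
   * Minimal usage sketch for getRowContainer() (identifier names are
   * illustrative, not a fixed contract): a join operator typically allocates
   * one container per small-table alias and appends standard-object value rows.
   *
   *   RowContainer<List<Object>> rc = JoinUtil.getRowContainer(hconf,
   *       valueObjectInspectors[alias], alias, cacheSize, spillTableDesc,
   *       joinDesc, noFilter, reporter);
   *   rc.addRow(JoinUtil.computeValues(row, valueFields, valueFieldsOI, hasFilter));
   *   // rows beyond cacheSize transparently spill to the SequenceFile-backed table
   */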

  private static final String KEY_FIELD_PREFIX =
      (Utilities.ReduceField.KEY + ".").toLowerCase();
  private static final String VALUE_FIELD_PREFIX =
      (Utilities.ReduceField.VALUE + ".").toLowerCase();

  /**
   * Create a new struct object inspector for the list of struct fields, first
   * removing the prefix from each field name.
   * @param fields the fields of the flattened struct
   * @param prefixToRemove the prefix ("key" or "value") to strip, without the dot
   * @return the new struct object inspector
   */
  private static ObjectInspector createStructFromFields(List<StructField> fields,
      String prefixToRemove) {
    int prefixLength = prefixToRemove.length() + 1; // also remove the '.' after the prefix
    ArrayList<String> fieldNames = new ArrayList<String>();
    ArrayList<ObjectInspector> fieldOIs = new ArrayList<ObjectInspector>();
    for (StructField field : fields) {
      fieldNames.add(field.getFieldName().substring(prefixLength));
      fieldOIs.add(field.getFieldObjectInspector());
    }
    return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs);
  }

  /**
   * Checks the input object inspector to see if it is in the form of a flattened
   * struct like the ones generated by a vectorized reduce sink input:
   * { 'key.reducesinkkey0':int, 'value._col0':int, 'value._col1':int, .. }
   * If so, it creates an "unflattened" struct that contains nested key/value
   * structs:
   * { key: { reducesinkkey0:int }, value: { _col0:int, _col1:int, .. } }
   *
   * @param oi the object inspector to check
   * @return an unflattened object inspector if unflattening is needed,
   *         otherwise the original object inspector
   */
  private static ObjectInspector unflattenObjInspector(ObjectInspector oi) {
    if (oi instanceof StructObjectInspector) {
      // Check if all fields start with "key." or "value.".
      // If so, unflatten by adding an additional level of nested key and value structs.
      // Example: { "key.reducesinkkey0":int, "key.reducesinkkey1":int, "value._col6":int }
      // becomes
      // { "key": { "reducesinkkey0":int, "reducesinkkey1":int }, "value": { "_col6":int } }
      ArrayList<StructField> keyFields = new ArrayList<StructField>();
      ArrayList<StructField> valueFields = new ArrayList<StructField>();
      for (StructField field : ((StructObjectInspector) oi).getAllStructFieldRefs()) {
        String fieldNameLower = field.getFieldName().toLowerCase();
        if (fieldNameLower.startsWith(KEY_FIELD_PREFIX)) {
          keyFields.add(field);
        } else if (fieldNameLower.startsWith(VALUE_FIELD_PREFIX)) {
          valueFields.add(field);
        } else {
          // Not a flattened struct, no need to unflatten
          return oi;
        }
      }

      // All field names start with "key." or "value.".
      // Create key/value structs and add the respective fields to each one.
      ArrayList<ObjectInspector> reduceFieldOIs = new ArrayList<ObjectInspector>();
      reduceFieldOIs.add(createStructFromFields(keyFields,
          Utilities.ReduceField.KEY.toString()));
      reduceFieldOIs.add(createStructFromFields(valueFields,
          Utilities.ReduceField.VALUE.toString()));

      // Finally create the outer struct to contain the key and value structs
      return ObjectInspectorFactory.getStandardStructObjectInspector(
          Utilities.reduceFieldNameList, reduceFieldOIs);
    }

    return oi;
  }
}