// RightHashJoin.java example
//
// Explorer
// myria-master
package edu.washington.escience.myria.operator;

import java.util.Arrays;
import java.util.List;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;
import edu.washington.escience.myria.util.MyriaArrayUtils;

/**
 * This is an implementation of unbalanced hash join. This operator only builds a hash table for its right child, and
 * therefore only begins to output tuples after the right child has reached EOS.
 *
 * <p>
 * Processing happens in two phases (see {@link #fetchNextReady()}): first every tuple of the right child is copied into
 * {@link #rightHashTable} and indexed by the hash code of its compare columns; afterwards each tuple of the left child
 * is hashed on its own compare columns and joined against the right tuples stored under the same hash code.
 */
public final class RightHashJoin extends BinaryOperator {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /**
   * The names of the output columns, or {@code null} if the names are to be copied from the children.
   */
  private final ImmutableList<String> outputColumns;

  /**
   * The column indices for comparing of child 1 (the left child).
   */
  private final int[] leftCompareIndx;
  /**
   * The column indices for comparing of child 2 (the right child).
   */
  private final int[] rightCompareIndx;

  /** Which columns in the left child are to be output. */
  private final int[] leftAnswerColumns;
  /** Which columns in the right child are to be output. */
  private final int[] rightAnswerColumns;

  /**
   * A hash table for tuples from child 2. {Hashcode -> List of tuple indices with the same hash code}. The indices
   * refer to rows of {@link #rightHashTable}.
   */
  private transient IntObjectHashMap<IntArrayList> rightHashTableIndices;

  /**
   * The buffer holding the valid tuples from right.
   */
  private transient MutableTupleBuffer rightHashTable;

  /**
   * The buffer holding the results.
   */
  private transient TupleBatchBuffer ans;

  /**
   * Recording the EOI status of the children (index 0: left, index 1: right).
   *
   * NOTE(review): within this class the array is only read and reset to {@code false}; no statement here stores
   * {@code true} into it, so the EOI conditions below currently depend on the children's EOS state only. Confirm
   * whether the children's EOI was meant to be recorded in {@link #fetchNextReady()}.
   */
  private final boolean[] childrenEOI = new boolean[2];

  /**
   * Traverse through the list of tuples with the same hash code. For every candidate index it is handed, the procedure
   * compares the current input row against the stored tuple on the compare columns and, if they are equal, appends the
   * joined tuple to the answer buffer.
   */
  private final class JoinProcedure implements IntProcedure {

    /** serial version id. */
    private static final long serialVersionUID = 1L;

    /**
     * Hash table (tuple buffer) the input row is joined against.
     */
    private MutableTupleBuffer joinAgainstHashTable;

    /**
     * the columns of the input batch to compare.
     */
    private int[] inputCmpColumns;

    /**
     * the columns to compare against.
     */
    private int[] joinAgainstCmpColumns;
    /**
     * row index of the tuple.
     */
    private int row;

    /**
     * input TupleBatch.
     */
    private TupleBatch inputTB;

    /**
     * @param index the row of {@link #joinAgainstHashTable} that shares the hash code of the current input row.
     */
    @Override
    public void value(final int index) {
      /* Equal hash codes do not imply equal keys, so the compare columns are checked explicitly. */
      if (TupleUtils.tupleEquals(
          inputTB, inputCmpColumns, row, joinAgainstHashTable, joinAgainstCmpColumns, index)) {
        addToAns(inputTB, row, joinAgainstHashTable, index);
      }
    }
  }

  /**
   * Traverse through the list of tuples. Created in {@link #init(ImmutableMap)} and reused for every left row.
   */
  private transient JoinProcedure doJoin;

  /**
   * Construct a RightHashJoin operator. It returns all columns from both children when the corresponding columns in
   * compareIndx1 and compareIndx2 match. The output column names are copied from the children.
   *
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @throws IllegalArgumentException if compareIndx1 and compareIndx2 differ in length.
   */
  public RightHashJoin(
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    this(null, left, right, compareIndx1, compareIndx2);
  }

  /**
   * Construct a RightHashJoin operator. It returns the specified columns from both children when the corresponding
   * columns in compareIndx1 and compareIndx2 match. The output column names are copied from the children.
   *
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @param answerColumns1 the columns of the left child to be returned. Order matters.
   * @param answerColumns2 the columns of the right child to be returned. Order matters.
   * @throws IllegalArgumentException if compareIndx1 and compareIndx2 differ in length.
   */
  public RightHashJoin(
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2,
      final int[] answerColumns1,
      final int[] answerColumns2) {
    this(null, left, right, compareIndx1, compareIndx2, answerColumns1, answerColumns2);
  }

  /**
   * Construct a RightHashJoin operator. It returns the specified columns from both children when the corresponding
   * columns in compareIndx1 and compareIndx2 match.
   *
   * @param outputColumns the names of the columns in the output schema. If null, the corresponding columns will be
   *          copied from the children.
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @param answerColumns1 the columns of the left child to be returned. Order matters.
   * @param answerColumns2 the columns of the right child to be returned. Order matters.
   * @throws IllegalArgumentException if compareIndx1 and compareIndx2 differ in length, if there are duplicated column
   *         names in <tt>outputColumns</tt>, or if <tt>outputColumns</tt> does not have exactly one name per selected
   *         answer column.
   */
  public RightHashJoin(
      final List<String> outputColumns,
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2,
      final int[] answerColumns1,
      final int[] answerColumns2) {
    super(left, right);
    Preconditions.checkArgument(
        compareIndx1.length == compareIndx2.length,
        "compareIndx1 and compareIndx2 must have the same length, got %s and %s",
        compareIndx1.length,
        compareIndx2.length);
    if (outputColumns != null) {
      Preconditions.checkArgument(
          outputColumns.size() == answerColumns1.length + answerColumns2.length,
          "length mismatch between output column names and columns selected for output");
      Preconditions.checkArgument(
          ImmutableSet.copyOf(outputColumns).size() == outputColumns.size(),
          "duplicate column names in outputColumns");
      this.outputColumns = ImmutableList.copyOf(outputColumns);
    } else {
      this.outputColumns = null;
    }
    leftCompareIndx = MyriaArrayUtils.warnIfNotSet(compareIndx1);
    rightCompareIndx = MyriaArrayUtils.warnIfNotSet(compareIndx2);
    leftAnswerColumns = MyriaArrayUtils.warnIfNotSet(answerColumns1);
    rightAnswerColumns = MyriaArrayUtils.warnIfNotSet(answerColumns2);
  }

  /**
   * Construct a RightHashJoin operator. It returns all columns from both children when the corresponding columns in
   * compareIndx1 and compareIndx2 match. Both children's schemas are read here to determine how many columns each
   * child has.
   *
   * @param outputColumns the names of the columns in the output schema. If null, the corresponding columns will be
   *          copied from the children.
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @throws IllegalArgumentException if compareIndx1 and compareIndx2 differ in length, if there are duplicated column
   *         names in <tt>outputColumns</tt>, or if <tt>outputColumns</tt> does not have exactly one name per column of
   *         the two children.
   */
  public RightHashJoin(
      final List<String> outputColumns,
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    this(
        outputColumns,
        left,
        right,
        compareIndx1,
        compareIndx2,
        range(left.getSchema().numColumns()),
        range(right.getSchema().numColumns()));
  }

  /**
   * Helper function that generates an array of the numbers 0..max-1.
   *
   * @param max the size of the array.
   * @return an array of the numbers 0..max-1.
   */
  private static int[] range(final int max) {
    int[] ret = new int[max];
    for (int i = 0; i < max; ++i) {
      ret[i] = i;
    }
    return ret;
  }

  /**
   * Builds the output schema: the selected left columns followed by the selected right columns. Column names come from
   * {@link #outputColumns} when it is set, otherwise from the children.
   *
   * @return the schema of the tuples produced by this operator.
   * @throws IllegalStateException if a pair of compare columns does not have the same type in both children.
   */
  @Override
  protected Schema generateSchema() {
    final Schema leftSchema = getLeft().getSchema();
    final Schema rightSchema = getRight().getSchema();
    ImmutableList.Builder<Type> types = ImmutableList.builder();
    ImmutableList.Builder<String> names = ImmutableList.builder();

    /* Assert that the compare index types are the same. */
    for (int i = 0; i < rightCompareIndx.length; ++i) {
      int leftIndex = leftCompareIndx[i];
      int rightIndex = rightCompareIndx[i];
      Type leftType = leftSchema.getColumnType(leftIndex);
      Type rightType = rightSchema.getColumnType(rightIndex);
      Preconditions.checkState(
          leftType == rightType,
          "column types do not match for join at index %s: left column type %s [%s] != right column type %s [%s]",
          i,
          leftIndex,
          leftType,
          rightIndex,
          rightType);
    }

    for (int i : leftAnswerColumns) {
      types.add(leftSchema.getColumnType(i));
      names.add(leftSchema.getColumnName(i));
    }

    for (int i : rightAnswerColumns) {
      types.add(rightSchema.getColumnType(i));
      names.add(rightSchema.getColumnName(i));
    }

    if (outputColumns != null) {
      return new Schema(types.build(), outputColumns);
    } else {
      return new Schema(types, names);
    }
  }

  /**
   * Appends one joined tuple to the answer buffer: the selected left columns of the current row, followed by the
   * selected right columns of the matching hash table row.
   *
   * @param cntTB current TB
   * @param row current row
   * @param hashTable the buffer holding the tuples to join against
   * @param index the index of hashTable, which the cntTuple is to join with
   */
  protected void addToAns(
      final TupleBatch cntTB, final int row, final MutableTupleBuffer hashTable, final int index) {
    for (int leftAnswerColumn : leftAnswerColumns) {
      ans.append(cntTB, leftAnswerColumn, row);
    }
    for (int rightAnswerColumn : rightAnswerColumns) {
      ans.append(hashTable, rightAnswerColumn, index);
    }
  }

  /**
   * Drops every reference to the execution-time state created in {@link #init(ImmutableMap)}.
   */
  @Override
  protected void cleanup() throws DbException {
    rightHashTable = null;
    rightHashTableIndices = null;
    ans = null;
    /*
     * The join procedure keeps its own references to the right hash table and to the last left batch; drop it as well
     * so that they do not stay reachable through this operator.
     */
    doJoin = null;
  }

  /**
   * Sets EOS once both children are EOS and the answer buffer is empty; otherwise sets EOI once both children are at
   * EOI (or EOS) and the answer buffer is empty, and resets the recorded EOI status of the children.
   */
  @Override
  public void checkEOSAndEOI() {
    final Operator left = getLeft();
    final Operator right = getRight();

    if (left.eos() && right.eos() && ans.numTuples() == 0) {
      setEOS();
      return;
    }

    // EOS could be used as an EOI
    if ((childrenEOI[0] || left.eos()) && (childrenEOI[1] || right.eos()) && ans.numTuples() == 0) {
      setEOI(true);
      Arrays.fill(childrenEOI, false);
    }
  }

  /**
   * Note: If this operator is ready for EOS, this function will return true since EOS is a special EOI.
   *
   * @return whether this operator is ready to set itself EOI
   */
  private boolean isEOIReady() {
    return (childrenEOI[0] || getLeft().eos()) && (childrenEOI[1] || getRight().eos());
  }

  /**
   * Produces the next batch of join results, if one is available.
   *
   * <ol>
   * <li>If the answer buffer already yields a batch, return it.</li>
   * <li>Drain the right child into the hash table. If the right child has no batch ready and is not yet EOS, return
   * null without touching the left child.</li>
   * <li>Drain the left child, joining each batch against the hash table and returning as soon as the answer buffer
   * yields a batch.</li>
   * <li>If the operator is ready for EOI/EOS, flush whatever is left in the answer buffer.</li>
   * </ol>
   *
   * @return a batch of joined tuples, or null if none is available at this time.
   * @throws DbException if a child fails to produce its next batch.
   */
  @Override
  protected TupleBatch fetchNextReady() throws DbException {
    /*
     * blocking mode will have the same logic
     */

    /* If any full tuple batches are ready, output them. */
    TupleBatch nexttb = ans.popAnyUsingTimeout();
    if (nexttb != null) {
      return nexttb;
    }

    final Operator right = getRight();

    /* Drain the right child. */
    while (!right.eos()) {
      TupleBatch rightTB = right.nextReady();
      if (rightTB == null) {
        /* The right child may have realized it's EOS now. If so, we must move onto left child to avoid livelock. */
        if (right.eos()) {
          break;
        }
        return null;
      }
      processRightChildTB(rightTB);
    }

    /* The right child is done, let's drain the left child. */
    final Operator left = getLeft();
    while (!left.eos()) {
      TupleBatch leftTB = left.nextReady();
      /*
       * Left tuple has no data, but we may need to pop partially-full existing batches if left reached EOI/EOS. Break
       * and check for termination.
       */
      if (leftTB == null) {
        break;
      }

      /* Process the data and add new results to ans. */
      processLeftChildTB(leftTB);

      nexttb = ans.popAnyUsingTimeout();
      if (nexttb != null) {
        return nexttb;
      }
      /*
       * We didn't time out or there is no data in ans, and there are no full tuple batches. Either way, check for more
       * data.
       */
    }

    if (isEOIReady()) {
      nexttb = ans.popAny();
    }

    return nexttb;
  }

  /**
   * Allocates the execution-time state: the right-side tuple buffer and its hash index, the answer buffer, and the
   * reusable join procedure.
   *
   * @param execEnvVars execution environment variables (not used by this operator).
   * @throws DbException declared by the overridden method; not thrown by the code in this method.
   */
  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    final Operator right = getRight();

    rightHashTableIndices = new IntObjectHashMap<>();
    rightHashTable = new MutableTupleBuffer(right.getSchema());

    ans = new TupleBatchBuffer(getSchema());
    doJoin = new JoinProcedure();
  }

  /**
   * Process the tuples from left child: each row is hashed on the left compare columns and joined against every right
   * tuple stored under the same hash code.
   *
   * @param tb TupleBatch to be processed.
   */
  protected void processLeftChildTB(final TupleBatch tb) {
    doJoin.joinAgainstHashTable = rightHashTable;
    doJoin.inputCmpColumns = leftCompareIndx;
    doJoin.joinAgainstCmpColumns = rightCompareIndx;
    doJoin.inputTB = tb;

    for (int row = 0; row < tb.numTuples(); ++row) {
      final int cntHashCode = HashUtils.hashSubRow(tb, doJoin.inputCmpColumns, row);
      IntArrayList tuplesWithHashCode = rightHashTableIndices.get(cntHashCode);
      /* No right tuple has this hash code, so the row cannot have a join partner. */
      if (tuplesWithHashCode != null) {
        doJoin.row = row;
        tuplesWithHashCode.forEach(doJoin);
      }
    }
  }

  /**
   * Process the tuples from right child: each row is hashed on the right compare columns and inserted into the right
   * hash table.
   *
   * @param tb TupleBatch to be processed.
   */
  protected void processRightChildTB(final TupleBatch tb) {

    for (int row = 0; row < tb.numTuples(); ++row) {
      final int cntHashCode = HashUtils.hashSubRow(tb, rightCompareIndx, row);
      // every right tuple is inserted unconditionally; the left side is never hashed into a table
      addToHashTable(tb, row, rightHashTable, rightHashTableIndices, cntHashCode);
    }
  }

  /**
   * Copies one row of a batch into the hash table buffer and records its position under the given hash code.
   *
   * @param tb the source TupleBatch
   * @param row the row number to get added to hash table
   * @param hashTable the target hash table
   * @param hashTable1IndicesLocal the index mapping a hash code to the rows of hashTable with that hash code
   * @param hashCode the hashCode of the row of tb.
   */
  private void addToHashTable(
      final TupleBatch tb,
      final int row,
      final MutableTupleBuffer hashTable,
      final IntObjectHashMap<IntArrayList> hashTable1IndicesLocal,
      final int hashCode) {
    /* The row about to be appended will live at the current size of the buffer. */
    final int nextIndex = hashTable.numTuples();
    IntArrayList tupleIndicesList = hashTable1IndicesLocal.get(hashCode);
    if (tupleIndicesList == null) {
      tupleIndicesList = new IntArrayList(1);
      hashTable1IndicesLocal.put(hashCode, tupleIndicesList);
    }
    tupleIndicesList.add(nextIndex);
    List<? extends Column<?>> inputColumns = tb.getDataColumns();
    for (int column = 0; column < tb.numColumns(); column++) {
      hashTable.put(column, inputColumns.get(column), row);
    }
  }
}