SymmetricHashCountingJoin.java example

Explorer
myria-master
package edu.washington.escience.myria.operator;

import java.util.Arrays;
import java.util.List;
import java.util.Objects;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;

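/**
 * A symmetric hash equi-join that only counts its result: instead of emitting joined tuples it emits a single tuple
 * with one LONG column holding the number of matching (left, right) pairs. The same as in DupElim, this
 * implementation does not keep the references to the incoming TupleBatches in order to get better memory
 * performance; only the join-key columns are copied into its hash tables.
 */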
public final class SymmetricHashCountingJoin extends BinaryOperator {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /** The join-key column indices in the left child's schema. */
  private final int[] leftCompareIndx;
  /** The join-key column indices in the right child's schema. */
  private final int[] rightCompareIndx;
  /** Index over the left hash table. {Hashcode -> List of row indices in leftHashTable with the same hash code} */
  private transient IntObjectHashMap<IntArrayList> leftHashTableIndices;
  /** Index over the right hash table. {Hashcode -> List of row indices in rightHashTable with the same hash code} */
  private transient IntObjectHashMap<IntArrayList> rightHashTableIndices;
  /** The buffer holding the distinct join keys seen from left (join-key columns only). */
  private transient MutableTupleBuffer leftHashTable;
  /** The buffer holding the distinct join keys seen from right (join-key columns only). */
  private transient MutableTupleBuffer rightHashTable;
  /** How many times each key occurred from left, indexed by the key's row in leftHashTable. */
  private transient IntArrayList occuredTimesOnLeft;
  /** How many times each key occurred from right, indexed by the key's row in rightHashTable. */
  private transient IntArrayList occuredTimesOnRight;
  /** The number of join output tuples so far. */
  private long ans;
  /** The buffer for storing and returning answer. */
  private transient TupleBatchBuffer ansTBB;
  /** The name of the single column output from this operator. */
  private final String columnName;
  /**
   * The procedure applied to every candidate row index when an incoming tuple probes the other child's hash table.
   */
  private transient CountingJoinProcedure doCountingJoin;

  /**
   * Whether this operator has returned answer or not. Set once the count has been emitted by fetchNextReady.
   */
  private boolean hasReturnedAnswer = false;

  /**
   * Procedure applied to every row index that shares a hash code with the tuple currently being probed. When the
   * join keys really are equal, the number of times that key occurred on the other child is added to the running
   * count.
   */
  private final class CountingJoinProcedure implements IntProcedure {

    /** serial version id. */
    private static final long serialVersionUID = 1L;

    /**
     * The hash table being probed. It stores only the join-key columns of the other child, packed at column
     * positions 0..k-1.
     */
    private MutableTupleBuffer joinAgainstHashTable;

    /**
     * Number of occurrences of each key of the probed hash table, indexed by the key's row in that table.
     */
    private IntArrayList occuredTimesOnJoinAgainstChild;

    /**
     * Join columns in the input.
     */
    private int[] inputCmpColumns;

    /**
     * Join columns of the other child, expressed in that child's own schema. Not consulted by {@link #value(int)}:
     * these indices do not address columns of {@link #joinAgainstHashTable}, which holds the key columns only. The
     * field is kept because the enclosing operator still assigns it.
     */
    private int[] otherCmpColumns;

    /**
     * row index of the tuple.
     */
    private int row;

    /**
     * input TupleBatch.
     */
    private TupleBatch inputTB;

    /**
     * Compare the current input tuple's join key with the key stored at {@code index} of the probed hash table and,
     * on a match, add that key's occurrence count to the answer.
     *
     * @param index row index in the probed hash table.
     */
    @Override
    public void value(final int index) {
      /*
       * The hash table columns are the key columns in order, so the comparison must pair inputCmpColumns[i] with hash
       * table column i. This is the same overload the build side uses when it looks for an existing key.
       */
      if (TupleUtils.tupleEquals(inputTB, inputCmpColumns, row, joinAgainstHashTable, index)) {
        ans += occuredTimesOnJoinAgainstChild.get(index);
      }
    }
  }

  /**
   * Construct a {@link SymmetricHashCountingJoin} whose single output column is named "count".
   *
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   */
  public SymmetricHashCountingJoin(
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    this("count", left, right, compareIndx1, compareIndx2);
  }

  /**
   * Construct a {@link SymmetricHashCountingJoin} operator with the name of its output column specified.
   *
   * @param outputColumnName the name of the single column of the output table.
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @throws NullPointerException if <tt>outputColumnName</tt> is null.
   */
  public SymmetricHashCountingJoin(
      final String outputColumnName,
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    super(left, right);
    this.columnName = Objects.requireNonNull(outputColumnName);
    this.leftCompareIndx = compareIndx1;
    this.rightCompareIndx = compareIndx2;
  }

  /**
   * Consume the EOI of one child: clear the EOI flag on that child and remember it in childrenEOI[] (slot 0 for the
   * left child, slot 1 for the right child).
   *
   * @param fromLeft true if consuming eoi from left child, false if consuming eoi from right child
   * @throws IllegalArgumentException if the selected child is not currently at EOI.
   */
  private void consumeChildEOI(final boolean fromLeft) {
    final Operator child = fromLeft ? getLeft() : getRight();
    final int slot = fromLeft ? 0 : 1;

    Preconditions.checkArgument(child.eoi());
    child.setEOI(false);
    childrenEOI[slot] = true;
  }

  /**
   * Tell whether each child has either delivered an EOI that was recorded in childrenEOI[] or reached EOS. Note: an
   * operator that is ready for EOS also satisfies this, since EOS is a special EOI.
   *
   * @return whether this operator is ready to set itself EOI
   */
  private boolean isEOIReady() {
    final boolean leftFinished = childrenEOI[0] || getLeft().eos();
    return leftFinished && (childrenEOI[1] || getRight().eos());
  }

  /**
   * Release the state held by this operator: both hash tables with their hash-code indices, the occurrence counters,
   * the probing procedure, the answer buffer, and the running count.
   *
   * @throws DbException declared by the overridden method; this implementation does not throw it itself.
   */
  @Override
  protected void cleanup() throws DbException {
    leftHashTable = null;
    rightHashTable = null;
    occuredTimesOnLeft = null;
    occuredTimesOnRight = null;
    leftHashTableIndices = null;
    rightHashTableIndices = null;
    /*
     * The procedure keeps its own references to a hash table, an occurrence list and the last input batch (assigned in
     * processChildTB). Drop it too, otherwise those objects stay reachable after cleanup. init() creates a new one.
     */
    doCountingJoin = null;
    ansTBB = null;
    ans = 0;
  }

  /**
   * Set EOS once both children are EOS and the answer has been returned. Otherwise set EOI, and forget the recorded
   * child EOIs, once every child is at EOI or EOS and the answer has been returned.
   */
  @Override
  public void checkEOSAndEOI() {
    if (getLeft().eos() && getRight().eos() && hasReturnedAnswer) {
      setEOS();
    } else if (isEOIReady() && hasReturnedAnswer) {
      // at the time of eos, this operator will not return any data, so it can be safely set EOI to true
      setEOI(true);
      Arrays.fill(childrenEOI, false);
    }
  }

  /**
   * Recording the EOI status of the children: slot 0 is the left child, slot 1 is the right child. A slot is set when
   * that child's EOI is consumed and both slots are cleared when this operator sets its own EOI.
   */
  private final boolean[] childrenEOI = new boolean[2];

  /**
   * Pull batches from the left and right children in turn and feed each one to processChildTB, until either both
   * children are EOS, or both children yield no data in the same round, or a consumed child EOI makes this operator
   * ready for EOI.
   *
   * @return a batch containing the single count value, exactly once, when both children are EOS and the answer has
   *         not been returned yet; null in every other case.
   * @throws DbException declared by the overridden method; may be propagated from the children's nextReady().
   */
  @Override
  protected TupleBatch fetchNextReady() throws DbException {

    /**
     * There is no distinction between synchronous EOI and asynchronous EOI
     *
     */
    final Operator left = getLeft();
    final Operator right = getRight();

    /* Number of children that produced no batch in the current round; two means nothing is available right now. */
    int numOfChildNoData = 0;
    while (numOfChildNoData < 2 && (!left.eos() || !right.eos())) {

      /*
       * If one of the children is already EOS, we need to set numOfChildNoData to 1 since "numOfChildNoData++" for this
       * child will not be called.
       */
      if (left.eos() || right.eos()) {
        numOfChildNoData = 1;
      } else {
        numOfChildNoData = 0;
      }

      /* process tuple from left child */
      if (!left.eos()) {
        TupleBatch leftTB = left.nextReady();
        if (leftTB != null) { // process the data that is pulled from left child
          processChildTB(leftTB, true);
        } else {
          /* if left eoi, consume it, check whether it will cause EOI of this operator */
          if (left.eoi()) {
            consumeChildEOI(true);
            /*
             * If this operator is ready to emit EOI ( reminder that it might need to clear buffer), break to EOI handle
             * part
             */
            if (isEOIReady()) {
              break;
            }
          }
          numOfChildNoData++;
        }
      }

      /* process tuple from right child */
      if (!right.eos()) {
        TupleBatch rightTB = right.nextReady();
        if (rightTB != null) { // process the data that is pulled from right child
          processChildTB(rightTB, false);
        } else {
          /* if right eoi, consume it, check whether it will cause EOI of this operator */
          if (right.eoi()) {
            consumeChildEOI(false);
            /*
             * If this operator is ready to emit EOI ( reminder that it might need to clear buffer), break to EOI handle
             * part
             */
            if (isEOIReady()) {
              break;
            }
          }
          numOfChildNoData++;
        }
      }
    }

    /*
     * If the operator is ready to EOI, just set EOI since EOI will not return any data. If the operator is ready to
     * EOS, return answer first, then at the next round set EOS
     */
    if (isEOIReady()) {
      if (left.eos() && right.eos() && (!hasReturnedAnswer)) {
        /* Mark the answer as returned before emitting it so that the count is produced only once. */
        hasReturnedAnswer = true;
        ansTBB.putLong(0, ans);
        return ansTBB.popAny();
      }
    }
    return null;
  }

  /**
   * Create empty hash tables, hash-code indices and occurrence counters for both children, the answer buffer and the
   * probing procedure, and reset the running count and the end-of-stream bookkeeping.
   *
   * @param execEnvVars execution environment variables; not used by this implementation.
   * @throws DbException declared by the overridden method; this implementation does not throw it itself.
   */
  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    leftHashTableIndices = new IntObjectHashMap<>();
    rightHashTableIndices = new IntObjectHashMap<>();
    occuredTimesOnLeft = new IntArrayList();
    occuredTimesOnRight = new IntArrayList();
    /* The hash tables keep only the join-key columns of their child. */
    leftHashTable = new MutableTupleBuffer(getLeft().getSchema().getSubSchema(leftCompareIndx));
    rightHashTable = new MutableTupleBuffer(getRight().getSchema().getSubSchema(rightCompareIndx));
    ans = 0;
    /*
     * Reset the flags together with the count: if they survived from an earlier run, the new count would never be
     * emitted because fetchNextReady only returns the answer while hasReturnedAnswer is false.
     */
    hasReturnedAnswer = false;
    Arrays.fill(childrenEOI, false);
    ansTBB = new TupleBatchBuffer(getSchema());
    doCountingJoin = new CountingJoinProcedure();
  }

  /**
   * Probe the other child's hash table with every tuple of the batch, adding the number of matches to the running
   * count, and record each tuple's join key in its own child's hash table for as long as that table is maintained.
   *
   * @param tb the incoming TupleBatch for processing join.
   * @param fromLeft if the tb is from left.
   */
  protected void processChildTB(final TupleBatch tb, final boolean fromLeft) {

    final Operator left = getLeft();
    final Operator right = getRight();

    /*
     * Bind the probe side first, so that the batch is still probed against the table as it was on entry even if the
     * release step below drops the field.
     */
    final IntObjectHashMap<IntArrayList> hashTable2IndicesLocal;
    final IntArrayList ownOccuredTimes;
    if (fromLeft) {
      doCountingJoin.joinAgainstHashTable = rightHashTable;
      hashTable2IndicesLocal = rightHashTableIndices;
      doCountingJoin.inputCmpColumns = leftCompareIndx;
      doCountingJoin.otherCmpColumns = rightCompareIndx;
      doCountingJoin.occuredTimesOnJoinAgainstChild = occuredTimesOnRight;
      ownOccuredTimes = occuredTimesOnLeft;
    } else {
      doCountingJoin.joinAgainstHashTable = leftHashTable;
      hashTable2IndicesLocal = leftHashTableIndices;
      doCountingJoin.inputCmpColumns = rightCompareIndx;
      doCountingJoin.otherCmpColumns = leftCompareIndx;
      doCountingJoin.occuredTimesOnJoinAgainstChild = occuredTimesOnLeft;
      ownOccuredTimes = occuredTimesOnRight;
    }
    doCountingJoin.inputTB = tb;

    if (left.eos() && !right.eos()) {
      /*
       * delete right child's hash table if the left child is EOS, since there will be no incoming tuples from left and
       * so it will never be probed again.
       */
      rightHashTableIndices = null;
      rightHashTable = null;
    } else if (right.eos() && !left.eos()) {
      /*
       * delete left child's hash table if the right child is EOS, since there will be no incoming tuples from right and
       * so it will never be probed again.
       */
      leftHashTableIndices = null;
      leftHashTable = null;
    }

    /*
     * Bind the build side only after the release step: if this child's table has just been dropped, the batch must not
     * be inserted into it any more.
     */
    final MutableTupleBuffer hashTable1Local = fromLeft ? leftHashTable : rightHashTable;
    final IntObjectHashMap<IntArrayList> hashTable1IndicesLocal =
        fromLeft ? leftHashTableIndices : rightHashTableIndices;

    for (int row = 0; row < tb.numTuples(); ++row) {

      /*
       * update number of count of probing the other child's hash table.
       */
      final int cntHashCode = HashUtils.hashSubRow(tb, doCountingJoin.inputCmpColumns, row);
      IntArrayList tuplesWithHashCode = hashTable2IndicesLocal.get(cntHashCode);
      if (tuplesWithHashCode != null) {
        doCountingJoin.row = row;
        tuplesWithHashCode.forEach(doCountingJoin);
      }

      if (hashTable1Local != null) {
        // only build hash table on two sides if none of the children is EOS
        updateHashTableAndOccureTimes(
            tb,
            row,
            cntHashCode,
            hashTable1Local,
            hashTable1IndicesLocal,
            doCountingJoin.inputCmpColumns,
            ownOccuredTimes);
      }
    }
  }

  /**
   * Validate the join columns against the children's schemas and produce the output schema, which is a single LONG
   * column named after the configured output column name.
   *
   * @return the one-column output schema.
   * @throws IllegalStateException if the two children have a different number of join columns, or if a pair of join
   *         columns differs in type.
   */
  @Override
  protected Schema generateSchema() {
    final Schema leftSchema = getLeft().getSchema();
    final Schema rightSchema = getRight().getSchema();

    /*
     * The join columns are compared pairwise, so both sides need the same number of them. Without this check a
     * shorter left array fails with an index error below and a longer one is silently truncated.
     */
    Preconditions.checkState(
        leftCompareIndx.length == rightCompareIndx.length,
        "number of join columns does not match: left has %s, right has %s",
        leftCompareIndx.length,
        rightCompareIndx.length);

    /* Assert that the compare index types are the same. */
    for (int i = 0; i < rightCompareIndx.length; ++i) {
      int leftIndex = leftCompareIndx[i];
      int rightIndex = rightCompareIndx[i];
      Type leftType = leftSchema.getColumnType(leftIndex);
      Type rightType = rightSchema.getColumnType(rightIndex);
      Preconditions.checkState(
          leftType == rightType,
          "column types do not match for join at index %s: left column type %s [%s] != right column type %s [%s]",
          i,
          leftIndex,
          leftType,
          rightIndex,
          rightType);
    }

    return Schema.of(ImmutableList.of(Type.LONG_TYPE), ImmutableList.of(columnName));
  }

  /**
   * Record one tuple's join key in a child's hash table. If an equal key is already stored, only its occurrence count
   * is incremented. Otherwise the key columns are appended as a new row, the row is registered under the tuple's hash
   * code, and its occurrence count starts at 1.
   *
   * @param tb the source TupleBatch
   * @param row the row number of the to be processed tuple in the source TupleBatch
   * @param hashCode the hashCode of the to be processed tuple
   * @param hashTable the hash table to be updated
   * @param hashTableIndices the hash indices to be updated
   * @param compareColumns compareColumns of input tuple
   * @param occuredTimes occuredTimes array to be updated
   * @throws IllegalArgumentException if the hash table does not have exactly one column per compare column.
   */
  private void updateHashTableAndOccureTimes(
      final TupleBatch tb,
      final int row,
      final int hashCode,
      final MutableTupleBuffer hashTable,
      final IntObjectHashMap<IntArrayList> hashTableIndices,
      final int[] compareColumns,
      final IntArrayList occuredTimes) {

    /* the row a new key would occupy in the hash table */
    final int newRowIndex = hashTable.numTuples();

    /* fetch the bucket for this hash code, creating it when no tuple with this hash code has been seen yet */
    IntArrayList bucket = hashTableIndices.get(hashCode);
    if (bucket == null) {
      bucket = new IntArrayList(1);
      hashTableIndices.put(hashCode, bucket);
    }

    Preconditions.checkArgument(hashTable.numColumns() == compareColumns.length);
    final List<? extends Column<?>> sourceColumns = tb.getDataColumns();

    /* an equal key is already stored: bump its counter and stop */
    for (int pos = 0; pos < bucket.size(); ++pos) {
      final int storedRow = bucket.get(pos);
      if (TupleUtils.tupleEquals(tb, compareColumns, row, hashTable, storedRow)) {
        occuredTimes.set(storedRow, occuredTimes.get(storedRow) + 1);
        return;
      }
    }

    /* first occurrence of this key: append its key columns and start counting at one */
    bucket.add(newRowIndex);
    for (int keyColumn = 0; keyColumn < hashTable.numColumns(); ++keyColumn) {
      hashTable.put(keyColumn, sourceColumns.get(compareColumns[keyColumn]), row);
    }
    occuredTimes.add(1);
  }
}