package edu.washington.escience.myria.operator; import java.util.Arrays; import java.util.List; import java.util.Objects; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.gs.collections.api.block.procedure.primitive.IntProcedure; import com.gs.collections.impl.list.mutable.primitive.IntArrayList; import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap; import edu.washington.escience.myria.DbException; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.column.Column; import edu.washington.escience.myria.storage.MutableTupleBuffer; import edu.washington.escience.myria.storage.TupleBatch; import edu.washington.escience.myria.storage.TupleBatchBuffer; import edu.washington.escience.myria.storage.TupleUtils; import edu.washington.escience.myria.util.HashUtils; /** * This is an implementation of hash equal join. The same as in DupElim, this implementation does not keep the * references to the incoming TupleBatches in order to get better memory performance. */ public final class SymmetricHashCountingJoin extends BinaryOperator { /** Required for Java serialization. */ private static final long serialVersionUID = 1L; /** The column indices for comparing of left child. */ private final int[] leftCompareIndx; /** The column indices for comparing of right child. */ private final int[] rightCompareIndx; /** A hash table for tuples from left child. {Hashcode -> List of tuple indices with the same hash code} */ private transient IntObjectHashMap<IntArrayList> leftHashTableIndices; /** A hash table for tuples from right child. {Hashcode -> List of tuple indices with the same hash code} */ private transient IntObjectHashMap<IntArrayList> rightHashTableIndices; /** The buffer holding the valid tuples from left. */ private transient MutableTupleBuffer leftHashTable; /** The buffer holding the valid tuples from right. */ private transient MutableTupleBuffer rightHashTable; /** How many times each key occurred from left. */ private transient IntArrayList occuredTimesOnLeft; /** How many times each key occurred from right. */ private transient IntArrayList occuredTimesOnRight; /** The number of join output tuples so far. */ private long ans; /** The buffer for storing and returning answer. */ private transient TupleBatchBuffer ansTBB; /** The name of the single column output from this operator. */ private final String columnName; /** * Traverse through the list of tuples. */ private transient CountingJoinProcedure doCountingJoin; /** * Whether this operator has returned answer or not. */ private boolean hasReturnedAnswer = false; /** * Traverse through the list of tuples with the same hash code. */ private final class CountingJoinProcedure implements IntProcedure { /** serial version id. */ private static final long serialVersionUID = 1L; /** * Hash table. */ private MutableTupleBuffer joinAgainstHashTable; /** * times of occure of a key. */ private IntArrayList occuredTimesOnJoinAgainstChild; /** * Join columns in the input. */ private int[] inputCmpColumns; /** * Join columns in the other table. */ private int[] otherCmpColumns; /** * row index of the tuple. */ private int row; /** * input TupleBatch. */ private TupleBatch inputTB; @Override public void value(final int index) { if (TupleUtils.tupleEquals( inputTB, inputCmpColumns, row, joinAgainstHashTable, otherCmpColumns, index)) { ans += occuredTimesOnJoinAgainstChild.get(index); } } }; /** * Construct a {@link SymmetricHashCountingJoin}. * * @param left the left child. * @param right the right child. * @param compareIndx1 the columns of the left child to be compared with the right. Order matters. * @param compareIndx2 the columns of the right child to be compared with the left. Order matters. * @throw IllegalArgumentException if there are duplicated column names from the children. */ public SymmetricHashCountingJoin( final Operator left, final Operator right, final int[] compareIndx1, final int[] compareIndx2) { this("count", left, right, compareIndx1, compareIndx2); } /** * Construct a {@link SymmetricHashCountingJoin} operator with schema specified. * * @param outputColumnName the name of the column of the output table. * @param left the left child. * @param right the right child. * @param compareIndx1 the columns of the left child to be compared with the right. Order matters. * @param compareIndx2 the columns of the right child to be compared with the left. Order matters. * @throw IllegalArgumentException if there are duplicated column names in <tt>outputSchema</tt>, or if * <tt>outputSchema</tt> does not have the correct number of columns and column types. */ public SymmetricHashCountingJoin( final String outputColumnName, final Operator left, final Operator right, final int[] compareIndx1, final int[] compareIndx2) { super(left, right); leftCompareIndx = compareIndx1; rightCompareIndx = compareIndx2; columnName = Objects.requireNonNull(outputColumnName); } /** * consume EOI from Child 1. reset the child's EOI to false 2. record the EOI in childrenEOI[] * * @param fromLeft true if consuming eoi from left child, false if consuming eoi from right child */ private void consumeChildEOI(final boolean fromLeft) { final Operator left = getLeft(); final Operator right = getRight(); if (fromLeft) { Preconditions.checkArgument(left.eoi()); left.setEOI(false); childrenEOI[0] = true; } else { Preconditions.checkArgument(right.eoi()); right.setEOI(false); childrenEOI[1] = true; } } /** * Note: If this operator is ready for EOS, this function will return true since EOS is a special EOI. * * @return whether this operator is ready to set itself EOI */ private boolean isEOIReady() { if ((childrenEOI[0] || getLeft().eos()) && (childrenEOI[1] || getRight().eos())) { return true; } return false; } @Override protected void cleanup() throws DbException { leftHashTable = null; rightHashTable = null; occuredTimesOnLeft = null; occuredTimesOnRight = null; leftHashTableIndices = null; rightHashTableIndices = null; ansTBB = null; ans = 0; } @Override public void checkEOSAndEOI() { final Operator left = getLeft(); final Operator right = getRight(); if (left.eos() && right.eos() && hasReturnedAnswer) { setEOS(); return; } // at the time of eos, this operator will not return any data, so it can be safely set EOI to true if ((childrenEOI[0] || left.eos()) && (childrenEOI[1] || right.eos()) && hasReturnedAnswer) { setEOI(true); Arrays.fill(childrenEOI, false); } } /** * Recording the EOI status of the children. */ private final boolean[] childrenEOI = new boolean[2]; @Override protected TupleBatch fetchNextReady() throws DbException { /** * There is no distinction between synchronous EOI and asynchronous EOI * */ final Operator left = getLeft(); final Operator right = getRight(); int numOfChildNoData = 0; while (numOfChildNoData < 2 && (!left.eos() || !right.eos())) { /* * If one of the children is already EOS, we need to set numOfChildNoData to 1 since "numOfChildNoData++" for this * child will not be called. */ if (left.eos() || right.eos()) { numOfChildNoData = 1; } else { numOfChildNoData = 0; } /* process tuple from left child */ if (!left.eos()) { TupleBatch leftTB = left.nextReady(); if (leftTB != null) { // process the data that is pulled from left child processChildTB(leftTB, true); } else { /* if left eoi, consume it, check whether it will cause EOI of this operator */ if (left.eoi()) { consumeChildEOI(true); /* * If this operator is ready to emit EOI ( reminder that it might need to clear buffer), break to EOI handle * part */ if (isEOIReady()) { break; } } numOfChildNoData++; } } /* process tuple from right child */ if (!right.eos()) { TupleBatch rightTB = right.nextReady(); if (rightTB != null) { // process the data that is pulled from right child processChildTB(rightTB, false); } else { /* if right eoi, consume it, check whether it will cause EOI of this operator */ if (right.eoi()) { consumeChildEOI(false); /* * If this operator is ready to emit EOI ( reminder that it might need to clear buffer), break to EOI handle * part */ if (isEOIReady()) { break; } } numOfChildNoData++; } } } /* * If the operator is ready to EOI, just set EOI since EOI will not return any data. If the operator is ready to * EOS, return answer first, then at the next round set EOS */ if (isEOIReady()) { if (left.eos() && right.eos() && (!hasReturnedAnswer)) { hasReturnedAnswer = true; ansTBB.putLong(0, ans); return ansTBB.popAny(); } } return null; } @Override public void init(final ImmutableMap<String, Object> execEnvVars) throws DbException { leftHashTableIndices = new IntObjectHashMap<>(); rightHashTableIndices = new IntObjectHashMap<>(); occuredTimesOnLeft = new IntArrayList(); occuredTimesOnRight = new IntArrayList(); leftHashTable = new MutableTupleBuffer(getLeft().getSchema().getSubSchema(leftCompareIndx)); rightHashTable = new MutableTupleBuffer(getRight().getSchema().getSubSchema(rightCompareIndx)); ans = 0; ansTBB = new TupleBatchBuffer(getSchema()); doCountingJoin = new CountingJoinProcedure(); } /** * @param tb the incoming TupleBatch for processing join. * @param fromLeft if the tb is from left. */ protected void processChildTB(final TupleBatch tb, final boolean fromLeft) { final Operator left = getLeft(); final Operator right = getRight(); MutableTupleBuffer hashTable1Local = null; IntObjectHashMap<IntArrayList> hashTable1IndicesLocal = null; IntObjectHashMap<IntArrayList> hashTable2IndicesLocal = null; IntArrayList ownOccuredTimes = null; if (fromLeft) { hashTable1Local = leftHashTable; doCountingJoin.joinAgainstHashTable = rightHashTable; hashTable1IndicesLocal = leftHashTableIndices; hashTable2IndicesLocal = rightHashTableIndices; doCountingJoin.inputCmpColumns = leftCompareIndx; doCountingJoin.otherCmpColumns = rightCompareIndx; doCountingJoin.occuredTimesOnJoinAgainstChild = occuredTimesOnRight; ownOccuredTimes = occuredTimesOnLeft; } else { hashTable1Local = rightHashTable; doCountingJoin.joinAgainstHashTable = leftHashTable; hashTable1IndicesLocal = rightHashTableIndices; hashTable2IndicesLocal = leftHashTableIndices; doCountingJoin.inputCmpColumns = rightCompareIndx; doCountingJoin.otherCmpColumns = leftCompareIndx; doCountingJoin.occuredTimesOnJoinAgainstChild = occuredTimesOnLeft; ownOccuredTimes = occuredTimesOnRight; } doCountingJoin.inputTB = tb; if (left.eos() && !right.eos()) { /* * delete right child's hash table if the left child is EOS, since there will be no incoming tuples from right as * it will never be probed again. */ rightHashTableIndices = null; rightHashTable = null; } else if (right.eos() && !left.eos()) { /* * delete left child's hash table if the right child is EOS, since there will be no incoming tuples from left as * it will never be probed again. */ leftHashTableIndices = null; leftHashTable = null; } for (int row = 0; row < tb.numTuples(); ++row) { /* * update number of count of probing the other child's hash table. */ final int cntHashCode = HashUtils.hashSubRow(tb, doCountingJoin.inputCmpColumns, row); IntArrayList tuplesWithHashCode = hashTable2IndicesLocal.get(cntHashCode); if (tuplesWithHashCode != null) { doCountingJoin.row = row; tuplesWithHashCode.forEach(doCountingJoin); } if (hashTable1Local != null) { // only build hash table on two sides if none of the children is EOS updateHashTableAndOccureTimes( tb, row, cntHashCode, hashTable1Local, hashTable1IndicesLocal, doCountingJoin.inputCmpColumns, ownOccuredTimes); } } } @Override protected Schema generateSchema() { final Schema leftSchema = getLeft().getSchema(); final Schema rightSchema = getRight().getSchema(); /* Assert that the compare index types are the same. */ for (int i = 0; i < rightCompareIndx.length; ++i) { int leftIndex = leftCompareIndx[i]; int rightIndex = rightCompareIndx[i]; Type leftType = leftSchema.getColumnType(leftIndex); Type rightType = rightSchema.getColumnType(rightIndex); Preconditions.checkState( leftType == rightType, "column types do not match for join at index %s: left column type %s [%s] != right column type %s [%s]", i, leftIndex, leftType, rightIndex, rightType); } return Schema.of(ImmutableList.of(Type.LONG_TYPE), ImmutableList.of(columnName)); } /** * @param tb the source TupleBatch * @param row the row number of the to be processed tuple in the source TupleBatch * @param hashCode the hashCode of the to be processed tuple * @param hashTable the hash table to be updated * @param hashTableIndices the hash indices to be updated * @param compareColumns compareColumns of input tuple * @param occuredTimes occuredTimes array to be updated */ private void updateHashTableAndOccureTimes( final TupleBatch tb, final int row, final int hashCode, final MutableTupleBuffer hashTable, final IntObjectHashMap<IntArrayList> hashTableIndices, final int[] compareColumns, final IntArrayList occuredTimes) { /* get the index of the tuple's hash code corresponding to */ final int nextIndex = hashTable.numTuples(); IntArrayList tupleIndicesList = hashTableIndices.get(hashCode); /* create one is there is no such a index yet (there is no tuple with the same hash code has been processed ) */ if (tupleIndicesList == null) { tupleIndicesList = new IntArrayList(1); hashTableIndices.put(hashCode, tupleIndicesList); } Preconditions.checkArgument(hashTable.numColumns() == compareColumns.length); List<? extends Column<?>> inputColumns = tb.getDataColumns(); /* find whether this tuple's comparing key has occured before. If it is, only update occurred times */ boolean found = false; for (int i = 0; i < tupleIndicesList.size(); ++i) { int index = tupleIndicesList.get(i); if (TupleUtils.tupleEquals(tb, compareColumns, row, hashTable, index)) { occuredTimes.set(index, occuredTimes.get(index) + 1); found = true; break; } } if (!found) { tupleIndicesList.add(nextIndex); for (int column = 0; column < hashTable.numColumns(); ++column) { hashTable.put(column, inputColumns.get(compareColumns[column]), row); } occuredTimes.add(1); } } }