package edu.washington.escience.myria.operator;

import java.util.Arrays;
import java.util.List;
import java.util.Objects;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;

/**
 * Counting join which only builds a hash table of the right child.
 *
 * <p>
 * The right child is drained first; each distinct join key is stored once in {@link #hashTable} together with the
 * number of right tuples carrying that key. Left tuples are then probed against the table and the matching
 * occurrence counts are summed into a single running total. Once both children have reached EOS the total is emitted
 * as a single one-column {@code LONG} tuple.
 */
public class RightHashCountingJoin extends BinaryOperator {
  /** This is required for serialization. */
  private static final long serialVersionUID = 1L;

  /** The column indices for comparing of child 1 (left). */
  private final int[] leftCompareIndx;

  /** The column indices for comparing of child 2 (right). */
  private final int[] rightCompareIndx;

  /** A hash table for tuples from child 2. {Hashcode -> List of tuple indices with the same hash code} */
  private transient IntObjectHashMap<IntArrayList> hashTableIndices;

  /** The buffer holding the distinct join keys seen from the right child. */
  private transient MutableTupleBuffer hashTable;

  /** How many times each key occurred from right; indexed by the key's row in {@link #hashTable}. */
  private transient IntArrayList occurredTimes;

  /** The running number of joined tuple pairs counted so far. */
  private transient long ans;

  /** The buffer for storing and returning answer. */
  private transient TupleBatchBuffer ansTBB;

  /** The name of the single column output from this operator. */
  private final String columnName;

  /** Whether this operator has returned answer or not. */
  private boolean hasReturnedAnswer = false;

  /**
   * Recording the EOI status of the children. No code in this class currently sets an entry to {@code true}; the
   * array is only read and cleared.
   */
  private final boolean[] childrenEOI = new boolean[2];

  /** Reusable procedure that probes one left tuple against a bucket of hash table rows. */
  private transient CountingJoinProcedure doCountingJoin;

  /**
   * Traverse through the list of tuples with the same hash code, adding the occurrence count of every hash table row
   * whose key equals the key of the current input row to {@link RightHashCountingJoin#ans}.
   */
  private final class CountingJoinProcedure implements IntProcedure {
    /** serial version id. */
    private static final long serialVersionUID = 1L;

    /** Hash table holding the keys to join against. */
    private MutableTupleBuffer joinAgainstHashTable;

    /** Number of occurrences of each key in {@link #joinAgainstHashTable}. */
    private IntArrayList occuredTimesOnJoinAgainstChild;

    /** The key columns of {@link #inputTB}. */
    private int[] inputCmpColumns;

    /** Row index of the tuple currently being probed in {@link #inputTB}. */
    private int row;

    /** Input TupleBatch. */
    private TupleBatch inputTB;

    /**
     * Compare the current input row with one hash table row and, on a match, add that row's occurrence count.
     *
     * @param index row index in the hash table of a candidate with the same hash code.
     */
    @Override
    public void value(final int index) {
      /* Equal hash codes do not imply equal keys, so the key columns are compared explicitly. */
      if (TupleUtils.tupleEquals(inputTB, inputCmpColumns, row, joinAgainstHashTable, index)) {
        ans += occuredTimesOnJoinAgainstChild.get(index);
      }
    }
  }

  /**
   * Note: If this operator is ready for EOS, this function will return true since EOS is a special EOI.
   *
   * @return whether this operator is ready to set itself EOI
   */
  private boolean isEOIReady() {
    return (childrenEOI[0] || getLeft().eos()) && (childrenEOI[1] || getRight().eos());
  }

  /**
   * Construct a {@link RightHashCountingJoin} operator whose single output column is named {@code "count"}.
   *
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @throws NullPointerException if {@code compareIndx1} or {@code compareIndx2} is null.
   * @throws IllegalArgumentException if {@code compareIndx1} and {@code compareIndx2} differ in length.
   */
  public RightHashCountingJoin(
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    this("count", left, right, compareIndx1, compareIndx2);
  }

  /**
   * Construct a {@link RightHashCountingJoin} operator with output column name specified.
   *
   * @param outputColumnName the name of the column of the output table.
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @throws NullPointerException if {@code outputColumnName}, {@code compareIndx1} or {@code compareIndx2} is null.
   * @throws IllegalArgumentException if {@code compareIndx1} and {@code compareIndx2} differ in length.
   */
  public RightHashCountingJoin(
      final String outputColumnName,
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    super(left, right);
    Objects.requireNonNull(compareIndx1, "compareIndx1");
    Objects.requireNonNull(compareIndx2, "compareIndx2");
    /*
     * The hash table has one column per right key column and is compared position-by-position against the left key
     * columns, so both lists must have the same length.
     */
    Preconditions.checkArgument(
        compareIndx1.length == compareIndx2.length,
        "the number of compare columns must match: left has %s, right has %s",
        compareIndx1.length,
        compareIndx2.length);
    leftCompareIndx = compareIndx1;
    rightCompareIndx = compareIndx2;
    columnName = Objects.requireNonNull(outputColumnName);
  }

  /**
   * Release all per-execution state.
   *
   * @throws DbException declared by the overridden method; not thrown by this implementation.
   */
  @Override
  protected void cleanup() throws DbException {
    hashTable = null;
    hashTableIndices = null;
    occurredTimes = null;
    /* The procedure keeps references to the hash table, the counts and the last input batch; drop it as well. */
    doCountingJoin = null;
    ansTBB = null;
    ans = 0;
  }

  /**
   * Set EOS once both children are EOS and the answer has been returned; otherwise set EOI if both children are at
   * EOI (or EOS) and the answer has been returned.
   */
  @Override
  public void checkEOSAndEOI() {
    final Operator left = getLeft();
    final Operator right = getRight();

    if (left.eos() && right.eos() && hasReturnedAnswer) {
      setEOS();
      return;
    }

    // EOS could be used as an EOI
    if ((childrenEOI[0] || left.eos()) && (childrenEOI[1] || right.eos()) && hasReturnedAnswer) {
      setEOI(true);
      Arrays.fill(childrenEOI, false);
    }
  }

  /**
   * Drain whatever is currently available from the right child and then from the left child, and return the count
   * once both children are EOS.
   *
   * @return the single-tuple answer batch the first time both children are EOS, otherwise {@code null}.
   * @throws DbException if a child fails while producing its next batch.
   */
  @Override
  protected TupleBatch fetchNextReady() throws DbException {
    /*
     * There is no distinction between blocking and non-blocking.
     */
    final Operator right = getRight();

    /* Drain the right child. */
    while (!right.eos()) {
      final TupleBatch rightTB = right.nextReady();
      if (rightTB == null) {
        /* The right child may have realized it's EOS now. If so, we must move onto left child to avoid livelock. */
        if (right.eos()) {
          break;
        }
        /* The right child has nothing ready yet; the left child must not be probed before the build is complete. */
        return null;
      }
      processRightChildTB(rightTB);
    }

    /* The right child is done, let's drain the left child. */
    final Operator left = getLeft();
    while (!left.eos()) {
      final TupleBatch leftTB = left.nextReady();
      /*
       * Left tuple has no data, but we may need to pop partially-full existing batches if left reached EOI/EOS. Break
       * and check for termination.
       */
      if (leftTB == null) {
        break;
      }

      /* Process the data and add new results to ans. */
      processLeftChildTB(leftTB);
    }

    /*
     * If the operator is ready to EOI, just set EOI since EOI will not return any data. If the operator is ready to
     * EOS, return answer first, then at the next round set EOS
     */
    if (isEOIReady() && left.eos() && right.eos() && !hasReturnedAnswer) {
      ansTBB.putLong(0, ans);
      hasReturnedAnswer = true;
      return ansTBB.popAny();
    }

    /* If not eos, return null since there is no tuple can be processed right now */
    return null;
  }

  /**
   * Allocate the per-execution state: the hash table over the right child's key columns, the occurrence counts, the
   * probe procedure and the answer buffer.
   *
   * @param execEnvVars execution environment variables; not used by this implementation.
   * @throws DbException declared by the overridden method; not thrown directly by this implementation.
   */
  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    final Operator right = getRight();

    hashTableIndices = new IntObjectHashMap<>();
    hashTable = new MutableTupleBuffer(right.getSchema().getSubSchema(rightCompareIndx));
    occurredTimes = new IntArrayList();
    doCountingJoin = new CountingJoinProcedure();
    ans = 0;
    /* Reset together with ans so that a re-initialized operator emits its count again. */
    hasReturnedAnswer = false;
    ansTBB = new TupleBatchBuffer(getSchema());
  }

  /**
   * Process tuples from right child: build up hash tables.
   *
   * @param tb the incoming TupleBatch.
   */
  protected void processRightChildTB(final TupleBatch tb) {
    for (int row = 0; row < tb.numTuples(); ++row) {
      final int cntHashCode = HashUtils.hashSubRow(tb, rightCompareIndx, row);
      updateHashTableAndOccureTimes(
          tb, row, cntHashCode, hashTable, hashTableIndices, rightCompareIndx, occurredTimes);
    }
  }

  /**
   * Process tuples from the left child: do the actual count join.
   *
   * @param tb the incoming TupleBatch for processing join.
   */
  protected void processLeftChildTB(final TupleBatch tb) {
    doCountingJoin.inputCmpColumns = leftCompareIndx;
    doCountingJoin.inputTB = tb;
    doCountingJoin.occuredTimesOnJoinAgainstChild = occurredTimes;
    doCountingJoin.joinAgainstHashTable = hashTable;

    for (int row = 0; row < tb.numTuples(); ++row) {
      /* Look up the bucket of right keys sharing this left tuple's hash code and count the real matches. */
      final int cntHashCode = HashUtils.hashSubRow(tb, doCountingJoin.inputCmpColumns, row);
      final IntArrayList tuplesWithHashCode = hashTableIndices.get(cntHashCode);
      if (tuplesWithHashCode != null) {
        doCountingJoin.row = row;
        tuplesWithHashCode.forEach(doCountingJoin);
      }
    }
  }

  /**
   * Record one tuple in the hash table: if its key is already present, increment that key's occurrence count;
   * otherwise append the key to the hash table with an occurrence count of 1.
   *
   * @param tb the source TupleBatch
   * @param row the row number of the to be processed tuple in the source TupleBatch
   * @param hashCode the hashCode of the to be processed tuple
   * @param hashTable the hash table to be updated
   * @param hashTableIndices the hash indices to be updated
   * @param compareColumns compareColumns of input tuple
   * @param occuredTimes occuredTimes array to be updated
   * @throws IllegalArgumentException if the hash table does not have exactly one column per compare column.
   */
  private void updateHashTableAndOccureTimes(
      final TupleBatch tb,
      final int row,
      final int hashCode,
      final MutableTupleBuffer hashTable,
      final IntObjectHashMap<IntArrayList> hashTableIndices,
      final int[] compareColumns,
      final IntArrayList occuredTimes) {
    /* Validate before touching any state so that a failure leaves the index untouched. */
    Preconditions.checkArgument(hashTable.numColumns() == compareColumns.length);

    /* The row a new key would occupy if it has to be appended. */
    final int nextIndex = hashTable.numTuples();

    IntArrayList tupleIndicesList = hashTableIndices.get(hashCode);
    /* Create a bucket if no tuple with the same hash code has been processed yet. */
    if (tupleIndicesList == null) {
      tupleIndicesList = new IntArrayList(1);
      hashTableIndices.put(hashCode, tupleIndicesList);
    }

    /* Find whether this tuple's comparing key has occurred before. If it has, only update occurred times. */
    for (int i = 0; i < tupleIndicesList.size(); ++i) {
      final int index = tupleIndicesList.get(i);
      if (TupleUtils.tupleEquals(tb, compareColumns, row, hashTable, index)) {
        occuredTimes.set(index, occuredTimes.get(index) + 1);
        return;
      }
    }

    /* First occurrence of this key: copy its key columns into the hash table and start its count at 1. */
    final List<? extends Column<?>> inputColumns = tb.getDataColumns();
    tupleIndicesList.add(nextIndex);
    for (int column = 0; column < hashTable.numColumns(); ++column) {
      hashTable.put(column, inputColumns.get(compareColumns[column]), row);
    }
    occuredTimes.add(1);
  }

  /**
   * @return a schema with a single {@code LONG} column named after the configured output column name.
   */
  @Override
  protected Schema generateSchema() {
    return Schema.of(ImmutableList.of(Type.LONG_TYPE), ImmutableList.of(columnName));
  }
}