package edu.washington.escience.myria.operator;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;
/**
 * Counting join which only builds a hash table of the right child.
 *
 * <p>The right child is drained first: every distinct combination of its compare columns is stored once, together
 * with the number of times it occurred. The left child is then streamed against that table, and for every left tuple
 * the occurrence count of the matching right key is added to a running total. Once both children have reached EOS the
 * total is emitted as a single tuple with one {@code LONG} column.
 */
public class RightHashCountingJoin extends BinaryOperator {
  /**
   * This is required for serialization.
   */
  private static final long serialVersionUID = 1L;

  /**
   * The column indices for comparing of child 1 (left).
   */
  private final int[] leftCompareIndx;

  /**
   * The column indices for comparing of child 2 (right).
   */
  private final int[] rightCompareIndx;

  /** The name of the single column output from this operator. */
  private final String columnName;

  /**
   * Recording the EOI status of the children.
   */
  private final boolean[] childrenEOI = new boolean[2];

  /**
   * A hash table for tuples from child 2. {Hashcode -> List of tuple indices with the same hash code}
   */
  private transient IntObjectHashMap<IntArrayList> hashTableIndices;

  /**
   * The buffer holding the distinct compare-column values seen from right.
   */
  private transient MutableTupleBuffer hashTable;

  /**
   * How many times each key occurred from right; indexed by the row index of the key in {@link #hashTable}.
   */
  private transient IntArrayList occurredTimes;

  /**
   * The running count of joined tuples.
   */
  private transient long ans;

  /** The buffer for storing and returning answer. */
  private transient TupleBatchBuffer ansTBB;

  /**
   * Whether this operator has returned answer or not.
   */
  private boolean hasReturnedAnswer = false;

  /**
   * The reusable probe procedure applied to every hash-table row that shares a left tuple's hash code.
   */
  private transient CountingJoinProcedure doCountingJoin;

  /**
   * Traverse through the list of tuples with the same hash code. For each candidate row whose key columns equal those
   * of the current input tuple, the stored occurrence count is added to the enclosing operator's running total.
   */
  private final class CountingJoinProcedure implements IntProcedure {
    /** serial version id. */
    private static final long serialVersionUID = 1L;

    /**
     * Hash table being probed.
     */
    private MutableTupleBuffer joinAgainstHashTable;

    /**
     * Number of occurrences of each key stored in {@link #joinAgainstHashTable}.
     */
    private IntArrayList occuredTimesOnJoinAgainstChild;

    /**
     * Compare columns of the input TupleBatch.
     */
    private int[] inputCmpColumns;

    /**
     * row index of the tuple.
     */
    private int row;

    /**
     * input TupleBatch.
     */
    private TupleBatch inputTB;

    /**
     * @param index row index in the hash table of a candidate with the same hash code as the current input tuple.
     */
    @Override
    public void value(final int index) {
      if (TupleUtils.tupleEquals(inputTB, inputCmpColumns, row, joinAgainstHashTable, index)) {
        ans += occuredTimesOnJoinAgainstChild.get(index);
      }
    }
  }

  /**
   * Construct a {@link RightHashCountingJoin} operator whose single output column is named {@code "count"}.
   *
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @throws IllegalArgumentException if the two compare-column arrays differ in length.
   */
  public RightHashCountingJoin(
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    this("count", left, right, compareIndx1, compareIndx2);
  }

  /**
   * Construct a {@link RightHashCountingJoin} operator with output column name specified.
   *
   * @param outputColumnName the name of the column of the output table.
   * @param left the left child.
   * @param right the right child.
   * @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
   * @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
   * @throws NullPointerException if {@code outputColumnName} or either compare-column array is null.
   * @throws IllegalArgumentException if the two compare-column arrays differ in length.
   */
  public RightHashCountingJoin(
      final String outputColumnName,
      final Operator left,
      final Operator right,
      final int[] compareIndx1,
      final int[] compareIndx2) {
    super(left, right);
    leftCompareIndx = Objects.requireNonNull(compareIndx1, "compareIndx1");
    rightCompareIndx = Objects.requireNonNull(compareIndx2, "compareIndx2");
    /* The keys are compared position by position, so both sides must name the same number of columns. */
    Preconditions.checkArgument(
        compareIndx1.length == compareIndx2.length,
        "the children must be compared on the same number of columns, but got %s (left) and %s (right)",
        compareIndx1.length,
        compareIndx2.length);
    columnName = Objects.requireNonNull(outputColumnName);
  }

  /**
   * Note: If this operator is ready for EOS, this function will return true since EOS is a special EOI.
   *
   * @return whether this operator is ready to set itself EOI
   */
  private boolean isEOIReady() {
    return (childrenEOI[0] || getLeft().eos()) && (childrenEOI[1] || getRight().eos());
  }

  @Override
  protected void cleanup() throws DbException {
    hashTable = null;
    hashTableIndices = null;
    occurredTimes = null;
    /* The procedure keeps references to the hash table and the last probed batch; drop it so they can be freed. */
    doCountingJoin = null;
    ansTBB = null;
    ans = 0;
  }

  @Override
  public void checkEOSAndEOI() {
    final Operator left = getLeft();
    final Operator right = getRight();
    /* Neither EOS nor EOI may be signalled before the count has been handed out. */
    if (left.eos() && right.eos() && hasReturnedAnswer) {
      setEOS();
      return;
    }
    // EOS could be used as an EOI
    if ((childrenEOI[0] || left.eos()) && (childrenEOI[1] || right.eos()) && hasReturnedAnswer) {
      setEOI(true);
      Arrays.fill(childrenEOI, false);
    }
  }

  /**
   * Drains the right child into the hash table, then probes it with whatever the left child has ready.
   *
   * @return the single-tuple batch holding the count the first time both children are at EOS; {@code null} otherwise.
   * @throws DbException if a child fails while producing tuples.
   */
  @Override
  protected TupleBatch fetchNextReady() throws DbException {
    /*
     * There is no distinction between blocking and non-blocking.
     */
    final Operator right = getRight();
    /* Drain the right child. */
    while (!right.eos()) {
      final TupleBatch rightTB = right.nextReady();
      if (rightTB == null) {
        /* The right child may have realized it's EOS now. If so, we must move onto left child to avoid livelock. */
        if (right.eos()) {
          break;
        }
        /* The right child just has nothing ready yet; the left must not be probed before the table is complete. */
        return null;
      }
      processRightChildTB(rightTB);
    }

    /* The right child is done, let's drain the left child. */
    final Operator left = getLeft();
    while (!left.eos()) {
      final TupleBatch leftTB = left.nextReady();
      /* No data available from left right now; fall through and check for termination. */
      if (leftTB == null) {
        break;
      }
      /* Process the data and add new results to ans. */
      processLeftChildTB(leftTB);
    }

    /*
     * The answer is returned exactly once, when both children are at EOS. EOS itself is set in a later round by
     * checkEOSAndEOI, which waits for hasReturnedAnswer.
     */
    if (isEOIReady() && left.eos() && right.eos() && !hasReturnedAnswer) {
      ansTBB.putLong(0, ans);
      hasReturnedAnswer = true;
      return ansTBB.popAny();
    }
    /* If not eos, return null since there is no tuple can be processed right now */
    return null;
  }

  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    final Operator right = getRight();
    hashTableIndices = new IntObjectHashMap<>();
    /* Only the compare columns of the right child are kept. */
    hashTable = new MutableTupleBuffer(right.getSchema().getSubSchema(rightCompareIndx));
    occurredTimes = new IntArrayList();
    doCountingJoin = new CountingJoinProcedure();
    ans = 0;
    /* Reset together with ans so that a freshly initialised operator always emits its count. */
    hasReturnedAnswer = false;
    ansTBB = new TupleBatchBuffer(getSchema());
  }

  /**
   * Process tuples from right child: build up hash tables.
   *
   * @param tb the incoming TupleBatch.
   */
  protected void processRightChildTB(final TupleBatch tb) {
    for (int row = 0; row < tb.numTuples(); ++row) {
      final int cntHashCode = HashUtils.hashSubRow(tb, rightCompareIndx, row);
      updateHashTableAndOccureTimes(
          tb, row, cntHashCode, hashTable, hashTableIndices, rightCompareIndx, occurredTimes);
    }
  }

  /**
   * Process tuples from the left child: do the actual count join.
   *
   * @param tb the incoming TupleBatch for processing join.
   */
  protected void processLeftChildTB(final TupleBatch tb) {
    doCountingJoin.inputCmpColumns = leftCompareIndx;
    doCountingJoin.inputTB = tb;
    doCountingJoin.occuredTimesOnJoinAgainstChild = occurredTimes;
    doCountingJoin.joinAgainstHashTable = hashTable;
    for (int row = 0; row < tb.numTuples(); ++row) {
      /* Look up the right-side candidates that share this left tuple's hash code. */
      final int cntHashCode = HashUtils.hashSubRow(tb, doCountingJoin.inputCmpColumns, row);
      final IntArrayList tuplesWithHashCode = hashTableIndices.get(cntHashCode);
      if (tuplesWithHashCode != null) {
        doCountingJoin.row = row;
        tuplesWithHashCode.forEach(doCountingJoin);
      }
    }
  }

  /**
   * Record one tuple in the hash table: if its key is already stored, only its occurrence count is incremented;
   * otherwise the key columns are appended as a new row with an occurrence count of 1.
   *
   * @param tb the source TupleBatch
   * @param row the row number of the to be processed tuple in the source TupleBatch
   * @param hashCode the hashCode of the to be processed tuple
   * @param hashTable the hash table to be updated
   * @param hashTableIndices the hash indices to be updated
   * @param compareColumns compareColumns of input tuple
   * @param occuredTimes occuredTimes array to be updated
   * @throws IllegalArgumentException if the hash table does not have exactly one column per compare column.
   */
  private void updateHashTableAndOccureTimes(
      final TupleBatch tb,
      final int row,
      final int hashCode,
      final MutableTupleBuffer hashTable,
      final IntObjectHashMap<IntArrayList> hashTableIndices,
      final int[] compareColumns,
      final IntArrayList occuredTimes) {
    Preconditions.checkArgument(hashTable.numColumns() == compareColumns.length);

    /* Row index the key will get in the hash table if it turns out to be new. */
    final int nextIndex = hashTable.numTuples();
    IntArrayList tupleIndicesList = hashTableIndices.get(hashCode);
    /* Create a bucket if no tuple with the same hash code has been processed yet. */
    if (tupleIndicesList == null) {
      tupleIndicesList = new IntArrayList(1);
      hashTableIndices.put(hashCode, tupleIndicesList);
    }

    /* Find whether this tuple's comparing key has occurred before. If it has, only update occurred times. */
    for (int i = 0; i < tupleIndicesList.size(); ++i) {
      final int index = tupleIndicesList.get(i);
      if (TupleUtils.tupleEquals(tb, compareColumns, row, hashTable, index)) {
        occuredTimes.set(index, occuredTimes.get(index) + 1);
        return;
      }
    }

    /* A new key: store its compare columns and start its occurrence count at 1. */
    final List<? extends Column<?>> inputColumns = tb.getDataColumns();
    tupleIndicesList.add(nextIndex);
    for (int column = 0; column < hashTable.numColumns(); ++column) {
      hashTable.put(column, inputColumns.get(compareColumns[column]), row);
    }
    occuredTimes.add(1);
  }

  /**
   * @return a schema with a single {@code LONG} column named after the configured output column name.
   */
  @Override
  protected Schema generateSchema() {
    return Schema.of(ImmutableList.of(Type.LONG_TYPE), ImmutableList.of(columnName));
  }
}