package edu.washington.escience.myria.operator;
import java.util.Arrays;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;
import edu.washington.escience.myria.util.MyriaArrayUtils;
/**
* This is an implementation of unbalanced hash join. This operator only builds hash tables for its right child, thus
* will begin to output tuples after right child EOS.
*
*/
public final class RightHashJoin extends BinaryOperator {
/** Required for Java serialization. */
private static final long serialVersionUID = 1L;
/**
* The names of the output columns.
*/
private final ImmutableList<String> outputColumns;
/**
* The column indices for comparing of child 1.
*/
private final int[] leftCompareIndx;
/**
* The column indices for comparing of child 2.
*/
private final int[] rightCompareIndx;
/**
* A hash table for tuples from child 2. {Hashcode -> List of tuple indices with the same hash code}
*/
private transient IntObjectHashMap<IntArrayList> rightHashTableIndices;
/**
* The buffer holding the valid tuples from right.
*/
private transient MutableTupleBuffer rightHashTable;
/**
* The buffer holding the results.
*/
private transient TupleBatchBuffer ans;
/** Which columns in the left child are to be output. */
private final int[] leftAnswerColumns;
/** Which columns in the right child are to be output. */
private final int[] rightAnswerColumns;
/**
* Traverse through the list of tuples with the same hash code.
*/
private final class JoinProcedure implements IntProcedure {
/** serial version id. */
private static final long serialVersionUID = 1L;
/**
* Hash table.
*/
private MutableTupleBuffer joinAgainstHashTable;
/**
*
* */
private int[] inputCmpColumns;
/**
* the columns to compare against.
*/
private int[] joinAgainstCmpColumns;
/**
* row index of the tuple.
*/
private int row;
/**
* input TupleBatch.
*/
private TupleBatch inputTB;
@Override
public void value(final int index) {
if (TupleUtils.tupleEquals(
inputTB, inputCmpColumns, row, joinAgainstHashTable, joinAgainstCmpColumns, index)) {
addToAns(inputTB, row, joinAgainstHashTable, index);
}
}
};
/**
* Traverse through the list of tuples.
*/
private transient JoinProcedure doJoin;
/**
* Construct an EquiJoin operator. It returns all columns from both children when the corresponding columns in
* compareIndx1 and compareIndx2 match.
*
* @param left the left child.
* @param right the right child.
* @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
* @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
* @throw IllegalArgumentException if there are duplicated column names from the children.
*/
public RightHashJoin(
final Operator left,
final Operator right,
final int[] compareIndx1,
final int[] compareIndx2) {
this(null, left, right, compareIndx1, compareIndx2);
}
/**
* Construct an EquiJoin operator. It returns the specified columns from both children when the corresponding columns
* in compareIndx1 and compareIndx2 match.
*
* @param left the left child.
* @param right the right child.
* @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
* @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
* @param answerColumns1 the columns of the left child to be returned. Order matters.
* @param answerColumns2 the columns of the right child to be returned. Order matters.
* @throw IllegalArgumentException if there are duplicated column names in <tt>outputSchema</tt>, or if
* <tt>outputSchema</tt> does not have the correct number of columns and column types.
*/
public RightHashJoin(
final Operator left,
final Operator right,
final int[] compareIndx1,
final int[] compareIndx2,
final int[] answerColumns1,
final int[] answerColumns2) {
this(null, left, right, compareIndx1, compareIndx2, answerColumns1, answerColumns2);
}
/**
* Construct an EquiJoin operator. It returns the specified columns from both children when the corresponding columns
* in compareIndx1 and compareIndx2 match.
*
* @param outputColumns the names of the columns in the output schema. If null, the corresponding columns will be
* copied from the children.
* @param left the left child.
* @param right the right child.
* @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
* @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
* @param answerColumns1 the columns of the left child to be returned. Order matters.
* @param answerColumns2 the columns of the right child to be returned. Order matters.
* @throw IllegalArgumentException if there are duplicated column names in <tt>outputColumns</tt>, or if
* <tt>outputColumns</tt> does not have the correct number of columns and column types.
*/
public RightHashJoin(
final List<String> outputColumns,
final Operator left,
final Operator right,
final int[] compareIndx1,
final int[] compareIndx2,
final int[] answerColumns1,
final int[] answerColumns2) {
super(left, right);
Preconditions.checkArgument(compareIndx1.length == compareIndx2.length);
if (outputColumns != null) {
Preconditions.checkArgument(
outputColumns.size() == answerColumns1.length + answerColumns2.length,
"length mismatch between output column names and columns selected for output");
Preconditions.checkArgument(
ImmutableSet.copyOf(outputColumns).size() == outputColumns.size(),
"duplicate column names in outputColumns");
this.outputColumns = ImmutableList.copyOf(outputColumns);
} else {
this.outputColumns = null;
}
leftCompareIndx = MyriaArrayUtils.warnIfNotSet(compareIndx1);
rightCompareIndx = MyriaArrayUtils.warnIfNotSet(compareIndx2);
leftAnswerColumns = MyriaArrayUtils.warnIfNotSet(answerColumns1);
rightAnswerColumns = MyriaArrayUtils.warnIfNotSet(answerColumns2);
}
/**
* Construct an EquiJoin operator. It returns all columns from both children when the corresponding columns in
* compareIndx1 and compareIndx2 match.
*
* @param outputColumns the names of the columns in the output schema. If null, the corresponding columns will be
* copied from the children.
* @param left the left child.
* @param right the right child.
* @param compareIndx1 the columns of the left child to be compared with the right. Order matters.
* @param compareIndx2 the columns of the right child to be compared with the left. Order matters.
* @throw IllegalArgumentException if there are duplicated column names in <tt>outputSchema</tt>, or if
* <tt>outputSchema</tt> does not have the correct number of columns and column types.
*/
public RightHashJoin(
final List<String> outputColumns,
final Operator left,
final Operator right,
final int[] compareIndx1,
final int[] compareIndx2) {
this(
outputColumns,
left,
right,
compareIndx1,
compareIndx2,
range(left.getSchema().numColumns()),
range(right.getSchema().numColumns()));
}
/**
* Helper function that generates an array of the numbers 0..max-1.
*
* @param max the size of the array.
* @return an array of the numbers 0..max-1.
*/
private static int[] range(final int max) {
int[] ret = new int[max];
for (int i = 0; i < max; ++i) {
ret[i] = i;
}
return ret;
}
@Override
protected Schema generateSchema() {
final Schema leftSchema = getLeft().getSchema();
final Schema rightSchema = getRight().getSchema();
ImmutableList.Builder<Type> types = ImmutableList.builder();
ImmutableList.Builder<String> names = ImmutableList.builder();
/* Assert that the compare index types are the same. */
for (int i = 0; i < rightCompareIndx.length; ++i) {
int leftIndex = leftCompareIndx[i];
int rightIndex = rightCompareIndx[i];
Type leftType = leftSchema.getColumnType(leftIndex);
Type rightType = rightSchema.getColumnType(rightIndex);
Preconditions.checkState(
leftType == rightType,
"column types do not match for join at index %s: left column type %s [%s] != right column type %s [%s]",
i,
leftIndex,
leftType,
rightIndex,
rightType);
}
for (int i : leftAnswerColumns) {
types.add(leftSchema.getColumnType(i));
names.add(leftSchema.getColumnName(i));
}
for (int i : rightAnswerColumns) {
types.add(rightSchema.getColumnType(i));
names.add(rightSchema.getColumnName(i));
}
if (outputColumns != null) {
return new Schema(types.build(), outputColumns);
} else {
return new Schema(types, names);
}
}
/**
* @param cntTB current TB
* @param row current row
* @param hashTable the buffer holding the tuples to join against
* @param index the index of hashTable, which the cntTuple is to join with
*/
protected void addToAns(
final TupleBatch cntTB, final int row, final MutableTupleBuffer hashTable, final int index) {
for (int leftAnswerColumn : leftAnswerColumns) {
ans.append(cntTB, leftAnswerColumn, row);
}
for (int rightAnswerColumn : rightAnswerColumns) {
ans.append(hashTable, rightAnswerColumn, index);
}
}
@Override
protected void cleanup() throws DbException {
rightHashTable = null;
rightHashTableIndices = null;
ans = null;
}
@Override
public void checkEOSAndEOI() {
final Operator left = getLeft();
final Operator right = getRight();
if (left.eos() && right.eos() && ans.numTuples() == 0) {
setEOS();
return;
}
// EOS could be used as an EOI
if ((childrenEOI[0] || left.eos()) && (childrenEOI[1] || right.eos()) && ans.numTuples() == 0) {
setEOI(true);
Arrays.fill(childrenEOI, false);
}
}
/**
* Recording the EOI status of the children.
*/
private final boolean[] childrenEOI = new boolean[2];
/**
* Note: If this operator is ready for EOS, this function will return true since EOS is a special EOI.
*
* @return whether this operator is ready to set itself EOI
*/
private boolean isEOIReady() {
if ((childrenEOI[0] || getLeft().eos()) && (childrenEOI[1] || getRight().eos())) {
return true;
}
return false;
}
@Override
protected TupleBatch fetchNextReady() throws DbException {
/*
* blocking mode will have the same logic
*/
/* If any full tuple batches are ready, output them. */
TupleBatch nexttb = ans.popAnyUsingTimeout();
if (nexttb != null) {
return nexttb;
}
final Operator right = getRight();
/* Drain the right child. */
while (!right.eos()) {
TupleBatch rightTB = right.nextReady();
if (rightTB == null) {
/* The right child may have realized it's EOS now. If so, we must move onto left child to avoid livelock. */
if (right.eos()) {
break;
}
return null;
}
processRightChildTB(rightTB);
}
/* The right child is done, let's drain the left child. */
final Operator left = getLeft();
while (!left.eos()) {
TupleBatch leftTB = left.nextReady();
/*
* Left tuple has no data, but we may need to pop partially-full existing batches if left reached EOI/EOS. Break
* and check for termination.
*/
if (leftTB == null) {
break;
}
/* Process the data and add new results to ans. */
processLeftChildTB(leftTB);
nexttb = ans.popAnyUsingTimeout();
if (nexttb != null) {
return nexttb;
}
/*
* We didn't time out or there is no data in ans, and there are no full tuple batches. Either way, check for more
* data.
*/
}
if (isEOIReady()) {
nexttb = ans.popAny();
}
return nexttb;
}
@Override
public void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
final Operator right = getRight();
rightHashTableIndices = new IntObjectHashMap<>();
rightHashTable = new MutableTupleBuffer(right.getSchema());
ans = new TupleBatchBuffer(getSchema());
doJoin = new JoinProcedure();
}
/**
* Process the tuples from left child.
*
* @param tb TupleBatch to be processed.
*/
protected void processLeftChildTB(final TupleBatch tb) {
doJoin.joinAgainstHashTable = rightHashTable;
doJoin.inputCmpColumns = leftCompareIndx;
doJoin.joinAgainstCmpColumns = rightCompareIndx;
doJoin.inputTB = tb;
for (int row = 0; row < tb.numTuples(); ++row) {
final int cntHashCode = HashUtils.hashSubRow(tb, doJoin.inputCmpColumns, row);
IntArrayList tuplesWithHashCode = rightHashTableIndices.get(cntHashCode);
if (tuplesWithHashCode != null) {
doJoin.row = row;
tuplesWithHashCode.forEach(doJoin);
}
}
}
/**
* Process the tuples from right child.
*
* @param tb TupleBatch to be processed.
*/
protected void processRightChildTB(final TupleBatch tb) {
for (int row = 0; row < tb.numTuples(); ++row) {
final int cntHashCode = HashUtils.hashSubRow(tb, rightCompareIndx, row);
// only build hash table on two sides if none of the children is EOS
addToHashTable(tb, row, rightHashTable, rightHashTableIndices, cntHashCode);
}
}
/**
* @param tb the source TupleBatch
* @param row the row number to get added to hash table
* @param hashTable the target hash table
* @param hashTable1IndicesLocal hash table 1 indices local
* @param hashCode the hashCode of the tb.
*/
private void addToHashTable(
final TupleBatch tb,
final int row,
final MutableTupleBuffer hashTable,
final IntObjectHashMap<IntArrayList> hashTable1IndicesLocal,
final int hashCode) {
final int nextIndex = hashTable.numTuples();
IntArrayList tupleIndicesList = hashTable1IndicesLocal.get(hashCode);
if (tupleIndicesList == null) {
tupleIndicesList = new IntArrayList(1);
hashTable1IndicesLocal.put(hashCode, tupleIndicesList);
}
tupleIndicesList.add(nextIndex);
List<? extends Column<?>> inputColumns = tb.getDataColumns();
for (int column = 0; column < tb.numColumns(); column++) {
hashTable.put(column, inputColumns.get(column), row);
}
}
}