package edu.washington.escience.myria.operator;
import java.util.BitSet;
import java.util.List;
import com.google.common.collect.ImmutableMap;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;
/**
 * Set-difference operator (left EXCEPT right). Duplicates are not preserved.
 *
 * The strategy is similar to RightHashJoin: the right relation is loaded into a hash table first, and each tuple of the
 * left relation is then probed against that table. Left tuples are inserted into the same table while they are probed,
 * so a left tuple that shows up a second time is treated exactly like a tuple of the right relation and is dropped.
 *
 * @author whitaker
 */
public final class Difference extends BinaryOperator {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /**
   * Every distinct tuple encountered so far, from either child. A left tuple that is already stored here is removed
   * from the output.
   */
  private transient MutableTupleBuffer tuplesToRemove;

  /**
   * Maps a tuple hash code to the row indices, within the tuplesToRemove buffer, of the stored tuples that have that
   * hash code.
   */
  private transient IntObjectHashMap<IntArrayList> tupleIndices;

  /**
   * Build a set difference operator computing left EXCEPT right.
   *
   * @param left the operator being subtracted from.
   * @param right the operator to be subtracted.
   */
  public Difference(final Operator left, final Operator right) {
    super(left, right);
  }

  /**
   * Check that both children have compatible schemas and allocate the hash table and its backing tuple buffer.
   *
   * @param execEnvVars execution environment variables; not read by this operator.
   * @throws DbException if the schemas of the two children are not compatible.
   */
  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    final Schema leftSchema = getLeft().getSchema();
    final Schema rightSchema = getRight().getSchema();
    if (!leftSchema.compatible(rightSchema)) {
      throw new DbException("Incompatible input schemas");
    }
    tupleIndices = new IntObjectHashMap<>();
    tuplesToRemove = new MutableTupleBuffer(getSchema());
  }

  /**
   * The output schema is the schema of the left child.
   *
   * @return the left child's schema, or null when no left child is set.
   */
  @Override
  protected Schema generateSchema() {
    return getLeft() == null ? null : getLeft().getSchema();
  }

  /**
   * Consume the right child completely before touching the left child, then filter one left batch per call.
   *
   * @return the filtered version of the next left batch; null when the right child is not yet exhausted but has no
   *         batch ready, when the left child is at EOS, or when the left child has no batch ready.
   * @throws Exception if reading from a child fails.
   */
  @Override
  protected TupleBatch fetchNextReady() throws Exception {
    /* Phase 1: load everything the right child produces into the hash table. */
    final Operator subtrahend = getRight();
    while (!subtrahend.eos()) {
      final TupleBatch removalBatch = subtrahend.nextReady();
      if (removalBatch != null) {
        processRightChildTB(removalBatch);
        continue;
      }
      /*
       * A null batch can coincide with the right child reaching EOS. Only report "nothing ready" if it is still
       * running; otherwise fall through to the left child.
       */
      if (!subtrahend.eos()) {
        return null;
      }
      break;
    }

    /* Phase 2: hand back at most one filtered batch from the left child. */
    final Operator minuend = getLeft();
    if (minuend.eos()) {
      return null;
    }
    final TupleBatch candidates = minuend.nextReady();
    if (candidates == null) {
      return null;
    }
    return processLeftChildTB(candidates);
  }

  /**
   * Record every tuple of a right-child batch so that matching left tuples are excluded from the result.
   *
   * @param batch A tuple batch from the right child.
   */
  private void processRightChildTB(final TupleBatch batch) {
    final int tupleCount = batch.numTuples();
    int row = 0;
    while (row < tupleCount) {
      markAsSeen(batch, row);
      row++;
    }
  }

  /**
   * Filter a left-child batch: every row that was already recorded (from the right child or from an earlier left
   * tuple) is flagged, and the flagged rows are passed to {@code filterOut}.
   *
   * @param batch A tuple batch from the left child.
   *
   * @return the result of {@code batch.filterOut} for the flagged rows.
   */
  private TupleBatch processLeftChildTB(final TupleBatch batch) {
    final int tupleCount = batch.numTuples();
    final BitSet rejected = new BitSet(tupleCount);
    for (int row = 0; row < tupleCount; ++row) {
      final boolean firstSighting = markAsSeen(batch, row);
      if (firstSighting) {
        continue;
      }
      rejected.set(row);
    }
    return batch.filterOut(rejected);
  }

  /**
   * Record a tuple in the hash table unless an equal tuple is already stored.
   *
   * @param batch A tuple batch
   * @param rowNum The index of the tuple among the valid tuples in batch.
   *
   * @return true if the tuple was not stored before this call (and has now been added), false otherwise.
   */
  private boolean markAsSeen(final TupleBatch batch, final int rowNum) {
    final int hashCode = HashUtils.hashRow(batch, rowNum);
    IntArrayList bucket = tupleIndices.get(hashCode);
    if (bucket == null) {
      bucket = new IntArrayList();
      tupleIndices.put(hashCode, bucket);
    }

    if (alreadyRecorded(batch, rowNum, bucket)) {
      return false;
    }

    /* New tuple: the row index it will occupy is the buffer's current size, so read that before appending. */
    final int newRowIndex = tuplesToRemove.numTuples();
    final List<? extends Column<?>> sourceColumns = batch.getDataColumns();
    final int columnCount = batch.numColumns();
    for (int col = 0; col < columnCount; ++col) {
      tuplesToRemove.put(col, sourceColumns.get(col), rowNum);
    }
    bucket.add(newRowIndex);
    return true;
  }

  /**
   * Compare a tuple against each stored tuple referenced by a hash bucket.
   *
   * @param batch A tuple batch
   * @param rowNum The index of the tuple within batch.
   * @param bucket row indices into the tuplesToRemove buffer that share the tuple's hash code.
   *
   * @return true if {@code TupleUtils.tupleEquals} reports a match for any index in the bucket.
   */
  private boolean alreadyRecorded(final TupleBatch batch, final int rowNum, final IntArrayList bucket) {
    int position = 0;
    while (position < bucket.size()) {
      if (TupleUtils.tupleEquals(batch, rowNum, tuplesToRemove, bucket.get(position))) {
        return true;
      }
      position++;
    }
    return false;
  }

  /**
   * Drop the references to the hash table and its tuple buffer.
   *
   * @throws DbException declared by the overridden method; not thrown here.
   */
  @Override
  protected void cleanup() throws DbException {
    tupleIndices = null;
    tuplesToRemove = null;
  }
}