package edu.washington.escience.myria.operator; import java.util.ArrayList; import java.util.BitSet; import java.util.HashMap; import java.util.List; import com.google.common.collect.ImmutableMap; import edu.washington.escience.myria.DbException; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.Type; import edu.washington.escience.myria.storage.TupleBatch; import edu.washington.escience.myria.util.HashUtils; /** * A simple implementation of duplicate eliminate. It keeps the references to all the TupleBatches which contain unique * tuples. * */ public final class DupElimRefOnly extends UnaryOperator { /** * Pointer data structure for pointing to a tuple in a TupleBatch. * */ private class IndexedTuple { /** * The row index. * */ private int index; /** * The source data TB. * */ private final TupleBatch tb; /** * @param tb the source data TB. * */ public IndexedTuple(final TupleBatch tb) { this.tb = tb; } /** * @param tb the source data TB. * @param index the row index. * */ public IndexedTuple(final TupleBatch tb, final int index) { this.tb = tb; this.index = index; } /** * compare the equality of a column of two tuples. * * @return true if equal. * @param another another source data TB. * @param colIndx columnIndex to compare * */ public boolean columnEquals(final IndexedTuple another, final int colIndx) { final Type type = tb.getSchema().getColumnType(colIndx); final int rowIndx1 = index; final int rowIndx2 = another.index; switch (type) { case BOOLEAN_TYPE: return tb.getBoolean(colIndx, rowIndx1) == another.tb.getBoolean(colIndx, rowIndx2); case DOUBLE_TYPE: return tb.getDouble(colIndx, rowIndx1) == another.tb.getDouble(colIndx, rowIndx2); case FLOAT_TYPE: return tb.getFloat(colIndx, rowIndx1) == another.tb.getFloat(colIndx, rowIndx2); case INT_TYPE: return tb.getInt(colIndx, rowIndx1) == another.tb.getInt(colIndx, rowIndx2); case LONG_TYPE: return tb.getLong(colIndx, rowIndx1) == another.tb.getLong(colIndx, rowIndx2); case STRING_TYPE: return tb.getString(colIndx, rowIndx1).equals(another.tb.getString(colIndx, rowIndx2)); case DATETIME_TYPE: return tb.getDateTime(colIndx, rowIndx1) .equals(another.tb.getDateTime(colIndx, rowIndx2)); case BLOB_TYPE: return tb.getBlob(colIndx, rowIndx1).equals(another.tb.getBlob(colIndx, rowIndx2)); } return false; } @Override public boolean equals(final Object o) { if (this == o) { return true; } if (!(o instanceof IndexedTuple)) { return false; } final IndexedTuple another = (IndexedTuple) o; if (!(tb.getSchema().equals(another.tb.getSchema()))) { return false; } for (int i = 0; i < tb.getSchema().numColumns(); ++i) { if (!columnEquals(another, i)) { return false; } } return true; } @Override public int hashCode() { return HashUtils.hashRow(tb, index); } } /** Required for Java serialization. */ private static final long serialVersionUID = 1L; /** * Storing the unique tuples. * */ private transient HashMap<Integer, List<IndexedTuple>> uniqueTuples; /** * @param child the child * */ public DupElimRefOnly(final Operator child) { super(child); } @Override protected void cleanup() throws DbException {} /** * Do duplicate elimination for the tb. * * @param tb the TB. * @return a new TB with duplicates removed. * */ protected TupleBatch doDupElim(final TupleBatch tb) { final int numTuples = tb.numTuples(); if (numTuples <= 0) { return tb; } final BitSet toRemove = new BitSet(numTuples); final IndexedTuple currentTuple = new IndexedTuple(tb); for (int i = 0; i < numTuples; ++i) { currentTuple.index = i; final int cntHashCode = currentTuple.hashCode(); // might need to check invalid | change to use outputTuples later List<IndexedTuple> tupleList = uniqueTuples.get(cntHashCode); if (tupleList == null) { tupleList = new ArrayList<IndexedTuple>(); uniqueTuples.put(cntHashCode, tupleList); tupleList.add(new IndexedTuple(tb, i)); continue; } boolean unique = true; for (final IndexedTuple oldTuple : tupleList) { if (currentTuple.equals(oldTuple)) { unique = false; break; } } if (unique) { tupleList.add(new IndexedTuple(tb, i)); } else { toRemove.set(i); } } return tb.filterOut(toRemove); } @Override public TupleBatch fetchNextReady() throws DbException { TupleBatch tb = null; tb = getChild().nextReady(); while (tb != null) { tb = doDupElim(tb); if (tb.numTuples() > 0) { return tb; } tb = getChild().nextReady(); } return null; } @Override public Schema generateSchema() { Operator child = getChild(); if (child == null) { return null; } return child.getSchema(); } @Override protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException { uniqueTuples = new HashMap<Integer, List<IndexedTuple>>(); } }