package edu.washington.escience.myria.operator;

import java.util.BitSet;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.ImmutableMap;
import com.gs.collections.api.block.procedure.primitive.IntProcedure;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;

import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.column.Column;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;

/**
 * Duplicate elimination. It copies newly encountered unique tuples into its own buffer so that the source
 * TupleBatches are not referenced. This implementation reduces memory consumption.
 */
public final class StatefulDupElim extends StreamingState {

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /** The logger for this class. */
  static final Logger LOGGER = LoggerFactory.getLogger(StatefulDupElim.class);

  /** Indices to unique tuples in the buffer, keyed by the hash code of a tuple. */
  private transient IntObjectHashMap<IntArrayList> uniqueTupleIndices;

  /** The buffer for storing unique tuples. */
  private transient MutableTupleBuffer uniqueTuples = null;

  /** Reusable procedure that scans a hash bucket for a tuple equal to the current input tuple. */
  private transient CheckUniquenessProcedure checkUniqueness;

  @Override
  public void cleanup() {
    uniqueTuples = null;
    uniqueTupleIndices = null;
  }

  /**
   * Do duplicate elimination on tb.
   *
   * @param tb the TupleBatch for performing DupElim.
   * @return the duplicate-eliminated TupleBatch.
   */
  protected TupleBatch doDupElim(final TupleBatch tb) {
    final int numTuples = tb.numTuples();
    /* If tb is empty, return it directly. */
    if (numTuples <= 0) {
      return tb;
    }
    checkUniqueness.inputTB = tb;
    List<? extends Column<?>> columns = tb.getDataColumns();
    final BitSet toRemove = new BitSet(numTuples);
    for (int i = 0; i < numTuples; ++i) {
      /* The index this tuple will get if it turns out to be unique and is appended to the buffer. */
      final int nextIndex = uniqueTuples.numTuples();
      final int cntHashCode = HashUtils.hashRow(tb, i);
      IntArrayList tupleIndexList = uniqueTupleIndices.get(cntHashCode);
      checkUniqueness.row = i;
      checkUniqueness.unique = true;
      if (tupleIndexList == null) {
        /* No tuple with this hash code seen yet: start a new bucket. */
        tupleIndexList = new IntArrayList(1);
        uniqueTupleIndices.put(cntHashCode, tupleIndexList);
      } else {
        /* Compare against every buffered tuple with the same hash code. */
        tupleIndexList.forEach(checkUniqueness);
      }
      if (checkUniqueness.unique) {
        /* Copy the unique tuple into the buffer and record its index in the bucket. */
        for (int j = 0; j < tb.numColumns(); ++j) {
          uniqueTuples.put(j, columns.get(j), i);
        }
        tupleIndexList.add(nextIndex);
      } else {
        toRemove.set(i);
      }
    }
    return tb.filterOut(toRemove);
  }

  @Override
  public Schema getSchema() {
    return getOp().getInputSchema();
  }

  @Override
  public void init(final ImmutableMap<String, Object> execEnvVars) {
    uniqueTupleIndices = new IntObjectHashMap<>();
    uniqueTuples = new MutableTupleBuffer(getSchema());
    checkUniqueness = new CheckUniquenessProcedure();
  }

  @Override
  public TupleBatch update(final TupleBatch tb) {
    TupleBatch newtb = doDupElim(tb);
    if (newtb.numTuples() > 0 || tb.isEOI()) {
      return newtb;
    }
    return null;
  }

  @Override
  public List<TupleBatch> exportState() {
    return uniqueTuples.getAll();
  }

  @Override
  public int numTuples() {
    if (uniqueTuples == null) {
      return 0;
    }
    return uniqueTuples.numTuples();
  }

  /** Checks whether the current input tuple equals any buffered tuple with the same hash code. */
  private final class CheckUniquenessProcedure implements IntProcedure {
    /** Required for Java serialization. */
    private static final long serialVersionUID = 1L;
    /** Row index of the current tuple in the input TupleBatch. */
    private int row;
    /** The input TupleBatch. */
    private TupleBatch inputTB;
    /** Whether the current tuple is unique, i.e. no equal tuple has been buffered so far. */
    private boolean unique;

    @Override
    public void value(final int index) {
      if (TupleUtils.tupleEquals(inputTB, row, uniqueTuples, index)) {
        unique = false;
      }
    }
  }

  @Override
  public StreamingState duplicate() {
    return new StatefulDupElim();
  }
}