package edu.washington.escience.myria.operator;

import java.nio.ByteBuffer;
import java.util.BitSet;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.gs.collections.api.iterator.LongIterator;
import com.gs.collections.impl.list.mutable.primitive.LongArrayList;
import com.gs.collections.impl.map.mutable.primitive.LongLongHashMap;
import com.gs.collections.impl.map.mutable.primitive.LongObjectHashMap;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.util.HashUtils;

/**
 * Duplicate elimination operator.
 *
 * <p>Instead of storing unique tuples to deduplicate against, this implementation stores 128-bit
 * hash codes of tuples (to make collisions statistically negligible). Each hash code is split into
 * two 64-bit words: the first word is the key of {@link #initialToFinalhashCodes}, the second word
 * is its value. When several distinct second words share the same first word, they are moved to a
 * list in {@link #collidingFinalHashCodes} and the primary map entry is replaced by
 * {@link #COLLIDING_KEY}.
 */
public final class DupElim extends UnaryOperator {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /**
   * Stored in {@link #initialToFinalhashCodes} instead of a real second word to indicate that the
   * first word is mapped to multiple second words, which are kept in
   * {@link #collidingFinalHashCodes}. Because a real second word may happen to equal this value,
   * the marker is only trusted when a collision list actually exists for the key.
   */
  private static final long COLLIDING_KEY = -1;

  /**
   * Default passed to {@code getIfAbsent()} to signal absence. Because a real second word may
   * happen to equal this value, absence is confirmed with {@code containsKey()} whenever this
   * value comes back.
   */
  private static final long ABSENT_VALUE = -2;

  /** Number of bytes in a tuple hash code: two 64-bit words. */
  private static final int HASH_CODE_BYTES = 2 * Long.BYTES;

  /** Stores 128-bit hash codes as map of first 64-bit word to second 64-bit word. */
  private transient LongLongHashMap initialToFinalhashCodes;

  /** Map of first 64-bit words to colliding second 64-bit words. */
  private transient LongObjectHashMap<LongArrayList> collidingFinalHashCodes;

  /** Cached byte buffer to avoid allocating a new buffer for each hash code conversion. */
  private transient ByteBuffer byteCopyBuffer;

  /**
   * @param child the child
   */
  public DupElim(final Operator child) {
    super(child);
  }

  /** Releases the hash code maps and the conversion buffer created by {@code init}. */
  @Override
  protected void cleanup() throws DbException {
    initialToFinalhashCodes = null;
    collidingFinalHashCodes = null;
    byteCopyBuffer = null;
  }

  /**
   * Do duplicate elimination for the tb.
   *
   * <p>Every row is hashed; rows whose hash code has been recorded before (by an earlier row of
   * this batch or of a previous batch) are filtered out, all other hash codes are recorded.
   *
   * @param tb the TB.
   * @return a new TB with duplicates removed, or {@code tb} itself if it contains no tuples.
   * @throws IllegalArgumentException if a row hash code is not exactly 16 bytes long.
   */
  protected TupleBatch doDupElim(final TupleBatch tb) {
    final int numTuples = tb.numTuples();
    if (numTuples <= 0) {
      return tb;
    }
    final BitSet toRemove = new BitSet(numTuples);
    for (int i = 0; i < numTuples; ++i) {
      final byte[] hashCodeBytes = HashUtils.hashRowBytes(tb, i);
      // Guava's Preconditions templates only substitute %s placeholders.
      Preconditions.checkArgument(
          hashCodeBytes.length == HASH_CODE_BYTES,
          "Expected 16 bytes in hash code, found %s",
          hashCodeBytes.length);
      // Split the 16 bytes into two longs (ByteBuffer's default byte order is big-endian).
      byteCopyBuffer.clear();
      byteCopyBuffer.put(hashCodeBytes);
      byteCopyBuffer.flip();
      final long hashCode1 = byteCopyBuffer.getLong();
      final long hashCode2 = byteCopyBuffer.getLong();
      if (recordAndCheckDuplicate(hashCode1, hashCode2)) {
        toRemove.set(i);
      }
    }
    return tb.filterOut(toRemove);
  }

  /**
   * Checks whether the given 128-bit hash code was recorded before and records it if it was not.
   *
   * @param hashCode1 first 64-bit word of the hash code.
   * @param hashCode2 second 64-bit word of the hash code.
   * @return {@code true} if the hash code had already been recorded (the tuple is a duplicate),
   *     {@code false} if it was new and has now been recorded.
   */
  private boolean recordAndCheckDuplicate(final long hashCode1, final long hashCode2) {
    final long stored = initialToFinalhashCodes.getIfAbsent(hashCode1, ABSENT_VALUE);

    // ABSENT_VALUE may also be a genuine second word, so confirm absence before trusting it.
    if (stored == ABSENT_VALUE && !initialToFinalhashCodes.containsKey(hashCode1)) {
      initialToFinalhashCodes.put(hashCode1, hashCode2);
      return false;
    }

    if (stored == COLLIDING_KEY) {
      final LongArrayList collidingHashCodes = collidingFinalHashCodes.get(hashCode1);
      if (collidingHashCodes != null) {
        Preconditions.checkState(collidingHashCodes.size() > 1);
        final LongIterator iter = collidingHashCodes.longIterator();
        while (iter.hasNext()) {
          if (iter.next() == hashCode2) {
            // duplicate found
            return true;
          }
        }
        collidingHashCodes.add(hashCode2);
        return false;
      }
      // No collision list exists, so COLLIDING_KEY is a genuine second word stored for this
      // key; handle it below like any other single stored value.
    }

    if (stored == hashCode2) {
      // duplicate found
      return true;
    }

    // First collision on this first word: move both second words into a collision list.
    Preconditions.checkState(!collidingFinalHashCodes.containsKey(hashCode1));
    collidingFinalHashCodes.put(hashCode1, LongArrayList.newListWith(stored, hashCode2));
    initialToFinalhashCodes.put(hashCode1, COLLIDING_KEY);
    return false;
  }

  /**
   * Pulls batches from the child until one still contains tuples after duplicate elimination.
   *
   * @return the first non-empty deduplicated batch, or {@code null} once the child returns
   *     {@code null} from {@code nextReady()}.
   */
  @Override
  public TupleBatch fetchNextReady() throws DbException {
    for (TupleBatch tb = getChild().nextReady(); tb != null; tb = getChild().nextReady()) {
      final TupleBatch deduplicated = doDupElim(tb);
      if (deduplicated.numTuples() > 0) {
        return deduplicated;
      }
    }
    return null;
  }

  /**
   * @return the schema of the child, or {@code null} if there is no child.
   */
  @Override
  public Schema generateSchema() {
    final Operator child = getChild();
    if (child == null) {
      return null;
    }
    return child.getSchema();
  }

  /**
   * Creates the empty hash code maps and the conversion buffer used by {@link #doDupElim}.
   *
   * @param execEnvVars execution environment variables (not used by this operator).
   */
  @Override
  protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    initialToFinalhashCodes = new LongLongHashMap();
    collidingFinalHashCodes = new LongObjectHashMap<LongArrayList>();
    byteCopyBuffer = ByteBuffer.allocate(HASH_CODE_BYTES);
  }
}