package edu.washington.escience.myria.operator; import java.io.Serializable; import com.gs.collections.api.iterator.IntIterator; import com.gs.collections.impl.list.mutable.primitive.IntArrayList; import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap; import edu.washington.escience.myria.Schema; import edu.washington.escience.myria.storage.MutableTupleBuffer; import edu.washington.escience.myria.storage.ReadableTable; import edu.washington.escience.myria.storage.TupleBatch; import edu.washington.escience.myria.storage.TupleUtils; import edu.washington.escience.myria.util.HashUtils; /** * An abstraction of a hash table of tuples. */ public final class TupleHashTable implements Serializable { /** Required for Java serialization. */ private static final long serialVersionUID = 1L; /** Map from hash codes to indices. */ private transient IntObjectHashMap<IntArrayList> keyHashCodesToIndices; /** The table containing keys and values. */ private transient MutableTupleBuffer data; /** Key column indices. */ private final int[] keyColumns; /** * @param schema schema * @param keyColumns key column indices */ public TupleHashTable(final Schema schema, final int[] keyColumns) { this.keyColumns = keyColumns; data = new MutableTupleBuffer(schema); keyHashCodesToIndices = new IntObjectHashMap<IntArrayList>(); } /** * @return the number of tuples this hash table has. */ public int numTuples() { return data.numTuples(); } /** * Get the data table indices given key columns from a tuple in a tuple batch. * * @param tb the input tuple batch * @param key the key columns * @param row the row index of the tuple * @return the indices */ public IntArrayList getIndices(final ReadableTable tb, final int[] key, final int row) { IntArrayList ret = new IntArrayList(); IntArrayList indices = keyHashCodesToIndices.get(HashUtils.hashSubRow(tb, key, row)); if (indices != null) { IntIterator iter = indices.intIterator(); while (iter.hasNext()) { int i = iter.next(); if (TupleUtils.tupleEquals(tb, key, row, data, keyColumns, i)) { ret.add(i); } } } return ret; } /** * Replace tuples in the hash table with the input tuple if they have the same key. * * @param tb the input tuple batch * @param keyColumns the key columns * @param row the row index of the input tuple * @return if at least one tuple is replaced */ public boolean replace(final TupleBatch tb, final int[] keyColumns, final int row) { IntIterator iter = getIndices(tb, keyColumns, row).intIterator(); if (!iter.hasNext()) { return false; } while (iter.hasNext()) { int i = iter.next(); for (int j = 0; j < data.numColumns(); ++j) { data.replace(j, i, tb.getDataColumns().get(j), row); } } return true; } /** * @param tb tuple batch of the input tuple * @param keyColumns key column indices * @param row row index of the input tuple * @param keyOnly only add keyColumns */ public void addTuple( final ReadableTable tb, final int[] keyColumns, final int row, final boolean keyOnly) { int hashcode = HashUtils.hashSubRow(tb, keyColumns, row); IntArrayList indices = keyHashCodesToIndices.get(hashcode); if (indices == null) { indices = new IntArrayList(); keyHashCodesToIndices.put(hashcode, indices); } indices.add(numTuples()); if (keyOnly) { for (int i = 0; i < keyColumns.length; ++i) { data.put(i, tb.asColumn(keyColumns[i]), row); } } else { for (int i = 0; i < data.numColumns(); ++i) { data.put(i, tb.asColumn(i), row); } } } /** * @return the data */ public MutableTupleBuffer getData() { return data; } /** * Clean up the hash table. */ public void cleanup() { keyHashCodesToIndices = new IntObjectHashMap<IntArrayList>(); data = new MutableTupleBuffer(data.getSchema()); } }