package edu.washington.escience.myria.operator;
import java.io.Serializable;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.gs.collections.api.iterator.IntIterator;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.LongIntHashMap;
import com.gs.collections.impl.map.mutable.primitive.LongObjectHashMap;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.ReadableTable;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;
/**
 * An abstraction of a hash table of unique tuples.
 *
 * <p>Tuples are appended to a {@link MutableTupleBuffer}. A 64-bit hash of each tuple's key
 * columns is mapped to the row index of that tuple in the buffer. When two or more stored tuples
 * share a hash code, the primary map stores a marker value and the actual row indexes are kept in
 * a secondary map from hash code to a list of indexes.
 *
 * <p>NOTE(review): the two maps and the tuple buffer are {@code transient} and this class defines
 * no deserialization hook, so an instance obtained through Java deserialization has those fields
 * unset. Confirm that instances are only ever used in the JVM that constructed them.
 */
public final class UniqueTupleHashTable implements Serializable {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /**
   * We store this value instead of a valid index to indicate
   * that a given hash code is mapped to multiple indexes.
   */
  private static final int COLLIDING_KEY = -1;

  /**
   * We return this value from getIfAbsent() to indicate absence,
   * since 0 and -1 are already legitimate values.
   */
  private static final int ABSENT_VALUE = -2;

  /**
   * Value returned by {@link #getIndex} when no stored tuple matches the search key. It happens to
   * be numerically equal to {@link #COLLIDING_KEY}, but the two are unrelated: this one is part of
   * the public contract, the other is an internal marker stored in the primary map.
   */
  private static final int NO_MATCH = -1;

  /** Map from unique hash codes to indexes. */
  private transient LongIntHashMap keyHashCodesToIndexes;

  /** Map from colliding hash codes to indexes. */
  private transient LongObjectHashMap<IntArrayList> collidingKeyHashCodesToIndexes;

  /** The table containing keys and values. */
  private transient MutableTupleBuffer data;

  /** Key column indices (into {@link #data}) used when comparing a search key to stored rows. */
  private final int[] keyColumns;

  /** The logger for this class. */
  protected static final org.slf4j.Logger LOGGER =
      LoggerFactory.getLogger(UniqueTupleHashTable.class);

  /**
   * Creates an empty hash table.
   *
   * @param schema schema of the tuple buffer that backs this table
   * @param keyColumns key column indices of the stored tuples; the array is kept by reference
   */
  public UniqueTupleHashTable(final Schema schema, final int[] keyColumns) {
    this.keyColumns = keyColumns;
    data = new MutableTupleBuffer(schema);
    keyHashCodesToIndexes = new LongIntHashMap();
    collidingKeyHashCodesToIndexes = new LongObjectHashMap<IntArrayList>();
  }

  /**
   * @return the number of tuples this hash table has.
   */
  public int numTuples() {
    return data.numTuples();
  }

  /**
   * Get the data table index given key columns from a tuple in a tuple batch.
   *
   * @param tb the input tuple batch
   * @param key the key columns
   * @param row the row index of the tuple
   * @return the index of the matching tuple in the data table, or -1 if no match
   */
  public int getIndex(final ReadableTable tb, final int[] key, final int row) {
    final long hashcode = HashUtils.hashSubRowLong(tb, key, row);
    final int index = keyHashCodesToIndexes.getIfAbsent(hashcode, ABSENT_VALUE);
    if (index == ABSENT_VALUE) {
      return NO_MATCH;
    }
    if (index == COLLIDING_KEY) {
      // Several stored tuples share this hash code: compare against each candidate in turn.
      final IntIterator iter = collidingIndexesFor(hashcode).intIterator();
      while (iter.hasNext()) {
        final int idx = iter.next();
        if (TupleUtils.tupleEquals(tb, key, row, data, keyColumns, idx)) {
          return idx;
        }
      }
      return NO_MATCH; // our search key doesn't exist but collides with an existing key
    }
    if (TupleUtils.tupleEquals(tb, key, row, data, keyColumns, index)) {
      return index;
    }
    return NO_MATCH; // our search key doesn't exist but collides with an existing key
  }

  /**
   * Replace a matching tuple in the data table with the input tuple. Every column of the stored
   * row is overwritten with the value of the same-numbered data column of {@code tb}.
   *
   * @param tb the input tuple batch
   * @param keyColumns the key columns
   * @param row the row index of the input tuple
   * @return {@code true} if a matching tuple was found and replaced, {@code false} otherwise
   */
  public boolean replace(final TupleBatch tb, final int[] keyColumns, final int row) {
    final int index = getIndex(tb, keyColumns, row);
    if (index == NO_MATCH) {
      return false;
    }
    for (int j = 0; j < data.numColumns(); ++j) {
      data.replace(j, index, tb.getDataColumns().get(j), row);
    }
    return true;
  }

  /**
   * Append a tuple to the data table and register its key hash code.
   *
   * <p>This method does not check whether an equal key is already stored; a caller that needs
   * uniqueness has to consult {@link #getIndex} first.
   *
   * @param tb tuple batch of the input tuple
   * @param keyColumns key column indices
   * @param row row index of the input tuple
   * @param keyOnly only add keyColumns
   */
  public void addTuple(
      final ReadableTable tb, final int[] keyColumns, final int row, final boolean keyOnly) {
    final long hashcode = HashUtils.hashSubRowLong(tb, keyColumns, row);
    final int index = keyHashCodesToIndexes.getIfAbsent(hashcode, ABSENT_VALUE);
    // The new tuple has not been appended yet, so numTuples() is the index it is about to get.
    if (index == ABSENT_VALUE) {
      keyHashCodesToIndexes.put(hashcode, numTuples());
    } else if (index == COLLIDING_KEY) {
      collidingIndexesFor(hashcode).add(numTuples());
    } else {
      // First collision on this hash code: move the existing index into a collision list
      // together with the new one, and mark the hash code as colliding in the primary map.
      LOGGER.warn("Collision detected with {} elements in table!", numTuples());
      final IntArrayList collidingIndexes = IntArrayList.newListWith(index, numTuples());
      Preconditions.checkState(
          !collidingKeyHashCodesToIndexes.containsKey(hashcode),
          "hash code already has a collision list but was not marked as colliding");
      collidingKeyHashCodesToIndexes.put(hashcode, collidingIndexes);
      keyHashCodesToIndexes.put(hashcode, COLLIDING_KEY);
    }
    if (keyOnly) {
      // Key columns of the input are written to the leading columns of the data table.
      for (int i = 0; i < keyColumns.length; ++i) {
        data.put(i, tb.asColumn(keyColumns[i]), row);
      }
    } else {
      for (int i = 0; i < data.numColumns(); ++i) {
        data.put(i, tb.asColumn(i), row);
      }
    }
  }

  /**
   * @return the data
   */
  public MutableTupleBuffer getData() {
    return data;
  }

  /**
   * Clean up the hash table: both index maps and the tuple buffer are replaced by fresh, empty
   * instances (the buffer keeps the schema of the previous one).
   */
  public void cleanup() {
    keyHashCodesToIndexes = new LongIntHashMap();
    collidingKeyHashCodesToIndexes = new LongObjectHashMap<IntArrayList>();
    data = new MutableTupleBuffer(data.getSchema());
  }

  /**
   * Look up the collision list of a hash code that the primary map marks as colliding, and verify
   * the invariant that such a list exists and holds at least two indexes.
   *
   * @param hashcode a hash code mapped to {@link #COLLIDING_KEY} in the primary map
   * @return the list of data table indexes whose keys share {@code hashcode}
   * @throws NullPointerException if no collision list is registered for {@code hashcode}
   * @throws IllegalStateException if the collision list has fewer than two entries
   */
  private IntArrayList collidingIndexesFor(final long hashcode) {
    final IntArrayList collidingIndexes = collidingKeyHashCodesToIndexes.get(hashcode);
    // Constant messages only: no formatting arguments, so nothing is allocated when checks pass.
    Preconditions.checkNotNull(
        collidingIndexes, "hash code is marked as colliding but has no collision list");
    Preconditions.checkState(
        collidingIndexes.size() > 1, "collision list must contain at least two indexes");
    return collidingIndexes;
  }
}