UniqueTupleHashTable.java example

Explorer
myria-master
package edu.washington.escience.myria.operator;

import java.io.Serializable;

import org.slf4j.LoggerFactory;

import com.google.common.base.Preconditions;
import com.gs.collections.api.iterator.IntIterator;
import com.gs.collections.impl.list.mutable.primitive.IntArrayList;
import com.gs.collections.impl.map.mutable.primitive.LongIntHashMap;
import com.gs.collections.impl.map.mutable.primitive.LongObjectHashMap;

import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.storage.MutableTupleBuffer;
import edu.washington.escience.myria.storage.ReadableTable;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.HashUtils;

/**
 * A hash table of unique tuples, indexed by a 64-bit hash of each tuple's key columns.
 *
 * <p>Tuples are appended to an internal {@link MutableTupleBuffer}. A primitive map associates
 * each key hash code with the row index of the tuple in that buffer. When two or more stored
 * tuples share a hash code, the primary map holds a sentinel for that hash code and the row
 * indexes are kept in a secondary map instead.
 *
 * <p>The maps and the tuple buffer are {@code transient} and this class defines no custom
 * deserialization hook, so they are {@code null} on an instance obtained through Java
 * deserialization; only the key column indices are serialized.
 */
public final class UniqueTupleHashTable implements Serializable {
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /**
   * We store this value instead of a valid index to indicate
   * that a given hash code is mapped to multiple indexes.
   */
  private static final int COLLIDING_KEY = -1;
  /**
   * We return this value from getIfAbsent() to indicate absence,
   * since 0 (a valid row index) and -1 ({@link #COLLIDING_KEY}) are already legitimate values.
   */
  private static final int ABSENT_VALUE = -2;

  /** Map from unique hash codes to indexes. */
  private transient LongIntHashMap keyHashCodesToIndexes;
  /** Map from colliding hash codes to indexes. */
  private transient LongObjectHashMap<IntArrayList> collidingKeyHashCodesToIndexes;
  /** The table containing keys and values. */
  private transient MutableTupleBuffer data;
  /** Key column indices into {@link #data}; a private copy of the constructor argument. */
  private final int[] keyColumns;

  /** The logger for this class. */
  protected static final org.slf4j.Logger LOGGER =
      LoggerFactory.getLogger(UniqueTupleHashTable.class);

  /**
   * Creates an empty hash table.
   *
   * @param schema schema of the tuples stored in the data table
   * @param keyColumns indices of the key columns within the data table; the array is copied, so
   *        later changes made by the caller do not affect this table
   * @throws NullPointerException if {@code schema} or {@code keyColumns} is null
   */
  public UniqueTupleHashTable(final Schema schema, final int[] keyColumns) {
    Preconditions.checkNotNull(schema, "schema");
    Preconditions.checkNotNull(keyColumns, "keyColumns");
    // Defensive copy: lookups compare against these indices, so they must not change after
    // construction even if the caller reuses or mutates its array.
    this.keyColumns = keyColumns.clone();
    data = new MutableTupleBuffer(schema);
    keyHashCodesToIndexes = new LongIntHashMap();
    collidingKeyHashCodesToIndexes = new LongObjectHashMap<IntArrayList>();
  }

  /**
   * @return the number of tuples this hash table has.
   */
  public int numTuples() {
    return data.numTuples();
  }

  /**
   * Get the data table index given key columns from a tuple in a tuple batch.
   *
   * <p>The key of the input tuple is read from columns {@code key} of {@code tb}; it is compared
   * against the key columns of the data table that were given at construction time.
   *
   * @param tb the input tuple batch
   * @param key the key columns
   * @param row the row index of the tuple
   * @return the index of the matching tuple in the data table, or -1 if no match
   */
  public int getIndex(final ReadableTable tb, final int[] key, final int row) {
    final long hashcode = HashUtils.hashSubRowLong(tb, key, row);
    final int index = keyHashCodesToIndexes.getIfAbsent(hashcode, ABSENT_VALUE);
    if (index == ABSENT_VALUE) {
      return -1;
    }
    if (index == COLLIDING_KEY) {
      // Several stored tuples share this hash code: compare against each candidate in turn.
      final IntIterator iter = collidingIndexesFor(hashcode).intIterator();
      while (iter.hasNext()) {
        final int idx = iter.next();
        if (TupleUtils.tupleEquals(tb, key, row, data, keyColumns, idx)) {
          return idx;
        }
      }
      return -1; // our search key doesn't exist but collides with existing keys
    }
    // Exactly one stored tuple has this hash code; equal hashes do not imply equal keys.
    if (TupleUtils.tupleEquals(tb, key, row, data, keyColumns, index)) {
      return index;
    }
    return -1; // our search key doesn't exist but collides with an existing key
  }

  /**
   * Replace a matching tuple in the data table with the input tuple.
   *
   * <p>Every column of the matching data table row is overwritten with the column at the same
   * position in {@code tb}.
   *
   * @param tb the input tuple batch
   * @param keyColumns the key columns
   * @param row the row index of the input tuple
   * @return true if a matching tuple was found and replaced, false if there was no match
   */
  public boolean replace(final TupleBatch tb, final int[] keyColumns, final int row) {
    final int index = getIndex(tb, keyColumns, row);
    if (index == -1) {
      return false;
    }
    for (int j = 0; j < data.numColumns(); ++j) {
      data.replace(j, index, tb.getDataColumns().get(j), row);
    }
    return true;
  }

  /**
   * Append a tuple to the data table and register its key hash code.
   *
   * <p>This method does not check whether an equal key is already stored: if the hash code is
   * already present, the new tuple is recorded as a hash collision and appended as a new row.
   * Presumably callers check {@link #getIndex} first when uniqueness matters.
   *
   * @param tb tuple batch of the input tuple
   * @param keyColumns key column indices
   * @param row row index of the input tuple
   * @param keyOnly only add keyColumns; the i-th key column of {@code tb} is written to column
   *        {@code i} of the data table
   */
  public void addTuple(
      final ReadableTable tb, final int[] keyColumns, final int row, final boolean keyOnly) {
    final long hashcode = HashUtils.hashSubRowLong(tb, keyColumns, row);
    final int index = keyHashCodesToIndexes.getIfAbsent(hashcode, ABSENT_VALUE);
    // The new tuple is appended below, so its row index is the current tuple count.
    if (index == ABSENT_VALUE) {
      keyHashCodesToIndexes.put(hashcode, numTuples());
    } else if (index == COLLIDING_KEY) {
      collidingIndexesFor(hashcode).add(numTuples());
    } else {
      // First collision for this hash code: move the existing index into the colliding map
      // together with the new one, and mark the hash code as colliding in the primary map.
      LOGGER.warn("Collision detected with {} elements in table!", numTuples());
      final IntArrayList collidingIndexes = IntArrayList.newListWith(index, numTuples());
      Preconditions.checkState(!collidingKeyHashCodesToIndexes.containsKey(hashcode));
      collidingKeyHashCodesToIndexes.put(hashcode, collidingIndexes);
      keyHashCodesToIndexes.put(hashcode, COLLIDING_KEY);
    }
    if (keyOnly) {
      for (int i = 0; i < keyColumns.length; ++i) {
        data.put(i, tb.asColumn(keyColumns[i]), row);
      }
    } else {
      for (int i = 0; i < data.numColumns(); ++i) {
        data.put(i, tb.asColumn(i), row);
      }
    }
  }

  /**
   * @return the data
   */
  public MutableTupleBuffer getData() {
    return data;
  }

  /**
   * Clean up the hash table: both index maps and the data table are replaced with empty ones
   * (the data table keeps its schema).
   */
  public void cleanup() {
    keyHashCodesToIndexes = new LongIntHashMap();
    collidingKeyHashCodesToIndexes = new LongObjectHashMap<IntArrayList>();
    data = new MutableTupleBuffer(data.getSchema());
  }

  /**
   * Fetch the row indexes registered for a hash code that is marked as colliding.
   *
   * @param hashcode a hash code mapped to {@link #COLLIDING_KEY} in the primary map
   * @return the list of at least two row indexes sharing that hash code
   * @throws NullPointerException if no list is registered for the hash code
   * @throws IllegalStateException if the registered list has fewer than two entries
   */
  private IntArrayList collidingIndexesFor(final long hashcode) {
    final IntArrayList collidingIndexes = collidingKeyHashCodesToIndexes.get(hashcode);
    Preconditions.checkNotNull(collidingIndexes);
    Preconditions.checkState(collidingIndexes.size() > 1);
    return collidingIndexes;
  }
}