package edu.washington.escience.myria.util;

import java.util.Objects;

import com.google.common.base.Preconditions;
import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;

import edu.washington.escience.myria.storage.ReadableColumn;
import edu.washington.escience.myria.storage.ReadableTable;

/**
 * A utility class for hashing tuples and parts of tuples.
 *
 * <p>All hashes are computed with a seeded 128-bit Murmur3 function taken from a fixed pool of
 * {@link #NUM_OF_HASHFUNCTIONS} functions. Methods without a {@code seedIndex} parameter use the
 * function at index 0.
 */
public final class HashUtils {
  /** Utility classes have no constructors. */
  private HashUtils() {}

  /**
   * Seeds of the hash function pool, picked from http://planetmath.org/goodhashtableprimes.
   *
   * <p>NOTE(review): the first seed, 243, equals 3^5 and is therefore not prime. It is kept as is
   * because changing a seed changes every hash value produced with that seed index, and index 0 is
   * the default used by the methods that take no {@code seedIndex}.
   */
  private static final int[] SEEDS = {
    243, 402653189, 24593, 786433, 3145739, 12289, 49157, 6151, 98317, 1572869,
  };

  /** The hash functions, one per entry of {@link #SEEDS}, in the same order. */
  private static final HashFunction[] HASH_FUNCTIONS = {
    Hashing.murmur3_128(SEEDS[0]),
    Hashing.murmur3_128(SEEDS[1]),
    Hashing.murmur3_128(SEEDS[2]),
    Hashing.murmur3_128(SEEDS[3]),
    Hashing.murmur3_128(SEEDS[4]),
    Hashing.murmur3_128(SEEDS[5]),
    Hashing.murmur3_128(SEEDS[6]),
    Hashing.murmur3_128(SEEDS[7]),
    Hashing.murmur3_128(SEEDS[8]),
    Hashing.murmur3_128(SEEDS[9])
  };

  /**
   * Size of the hash function pool. Valid seed indexes are {@code 0} (inclusive) to this value
   * (exclusive).
   */
  public static final int NUM_OF_HASHFUNCTIONS = 10;

  /**
   * Look up a hash function of the pool, validating the index first.
   *
   * @param seedIndex the index of the chosen hash function
   * @return the hash function at the given index
   * @throws IndexOutOfBoundsException if {@code seedIndex} is negative or not smaller than the size
   *     of the pool
   */
  private static HashFunction hashFunctionAt(final int seedIndex) {
    // checkElementIndex (unlike checkPositionIndex) rejects seedIndex == size, which is not a
    // valid array element.
    Preconditions.checkElementIndex(seedIndex, HASH_FUNCTIONS.length, "seedIndex");
    return HASH_FUNCTIONS[seedIndex];
  }

  /**
   * Hash the values of the given columns of one row, feeding them to the hasher in the order in
   * which the columns are listed.
   *
   * @param table the table containing the values to be hashed
   * @param hashColumns the columns to be hashed. Order matters
   * @param row the row containing the values to be hashed
   * @param seedIndex the index of the chosen hash function
   * @return the hash code of the specified columns in the specified row
   * @throws NullPointerException if {@code table} or {@code hashColumns} is null
   * @throws IndexOutOfBoundsException if {@code seedIndex} is not a valid index into the pool
   */
  private static HashCode getHashCode(
      final ReadableTable table, final int[] hashColumns, final int row, final int seedIndex) {
    Objects.requireNonNull(table, "table");
    Objects.requireNonNull(hashColumns, "hashColumns");
    Hasher hasher = hashFunctionAt(seedIndex).newHasher();
    for (int column : hashColumns) {
      addValue(hasher, table, column, row);
    }
    return hasher.hash();
  }

  /**
   * Hash all the values of one row, in column order, with the hash function at index 0.
   *
   * @param table the table containing the values
   * @param row the row to be hashed
   * @return the hash code of all the values in the specified row
   * @throws NullPointerException if {@code table} is null
   */
  private static HashCode getHashCode(final ReadableTable table, final int row) {
    Objects.requireNonNull(table, "table");
    return getHashCode(table, MyriaUtils.range(table.numColumns()), row, 0);
  }

  /**
   * Hash a single value with the chosen hash function.
   *
   * @param table the table containing the value to be hashed
   * @param column the column containing the value to be hashed
   * @param row the row containing the value to be hashed
   * @param seedIndex the index of the chosen hash function
   * @return the hash code of the specified value
   * @throws NullPointerException if {@code table} is null
   * @throws IndexOutOfBoundsException if {@code seedIndex} is not a valid index into the pool
   */
  private static HashCode getValueHashCode(
      final ReadableTable table, final int column, final int row, final int seedIndex) {
    Objects.requireNonNull(table, "table");
    Hasher hasher = hashFunctionAt(seedIndex).newHasher();
    addValue(hasher, table, column, row);
    return hasher.hash();
  }

  /**
   * Compute the hash code, as an {@code int}, of all the values in the specified row, in column
   * order.
   *
   * @param table the table containing the values
   * @param row the row to be hashed
   * @return the hash code of all the values in the specified row, in column order
   * @throws NullPointerException if {@code table} is null
   */
  public static int hashRow(final ReadableTable table, final int row) {
    return getHashCode(table, row).asInt();
  }

  /**
   * Compute the hash code, as a {@code long}, of all the values in the specified row, in column
   * order.
   *
   * @param table the table containing the values
   * @param row the row to be hashed
   * @return the hash code of all the values in the specified row, in column order
   * @throws NullPointerException if {@code table} is null
   */
  public static long hashRowLong(final ReadableTable table, final int row) {
    return getHashCode(table, row).asLong();
  }

  /**
   * Compute the hash code, as a byte array, of all the values in the specified row, in column
   * order.
   *
   * @param table the table containing the values
   * @param row the row to be hashed
   * @return the hash code of all the values in the specified row, in column order
   * @throws NullPointerException if {@code table} is null
   */
  public static byte[] hashRowBytes(final ReadableTable table, final int row) {
    return getHashCode(table, row).asBytes();
  }

  /**
   * Compute the hash code of the value in the specified column and row of the given table, using
   * the hash function at index 0.
   *
   * @param table the table containing the values to be hashed
   * @param column the column containing the value to be hashed
   * @param row the row containing the value to be hashed
   * @return the hash code of the specified value
   * @throws NullPointerException if {@code table} is null
   */
  public static int hashValue(final ReadableTable table, final int column, final int row) {
    return hashValue(table, column, row, 0);
  }

  /**
   * Compute the hash code of the value in the specified column and row of the given table, using
   * the chosen hash function.
   *
   * @param table the table containing the values to be hashed
   * @param column the column containing the value to be hashed
   * @param row the row containing the value to be hashed
   * @param seedIndex the index of the chosen hash function, from {@code 0} (inclusive) to {@link
   *     #NUM_OF_HASHFUNCTIONS} (exclusive)
   * @return the hash code of the specified value
   * @throws NullPointerException if {@code table} is null
   * @throws IndexOutOfBoundsException if {@code seedIndex} is out of range
   */
  public static int hashValue(
      final ReadableTable table, final int column, final int row, final int seedIndex) {
    return getValueHashCode(table, column, row, seedIndex).asInt();
  }

  /**
   * Compute the hash code of the specified columns in the specified row of the given table, using
   * the hash function at index 0.
   *
   * @param table the table containing the values to be hashed
   * @param hashColumns the columns to be hashed. Order matters
   * @param row the row containing the values to be hashed
   * @return the hash code of the specified columns in the specified row of the given table
   * @throws NullPointerException if {@code table} or {@code hashColumns} is null
   */
  public static int hashSubRow(final ReadableTable table, final int[] hashColumns, final int row) {
    return hashSubRow(table, hashColumns, row, 0);
  }

  /**
   * Compute the hash code of the specified columns in the specified row of the given table, using
   * the chosen hash function.
   *
   * @param table the table containing the values to be hashed
   * @param hashColumns the columns to be hashed. Order matters
   * @param row the row containing the values to be hashed
   * @param seedIndex the index of the chosen hash function, from {@code 0} (inclusive) to {@link
   *     #NUM_OF_HASHFUNCTIONS} (exclusive)
   * @return the hash code of the specified columns in the specified row of the given table
   * @throws NullPointerException if {@code table} or {@code hashColumns} is null
   * @throws IndexOutOfBoundsException if {@code seedIndex} is out of range
   */
  public static int hashSubRow(
      final ReadableTable table, final int[] hashColumns, final int row, final int seedIndex) {
    return getHashCode(table, hashColumns, row, seedIndex).asInt();
  }

  /**
   * Compute the hash code, as a {@code long}, of the value in the specified column and row of the
   * given table, using the hash function at index 0.
   *
   * @param table the table containing the values to be hashed
   * @param column the column containing the value to be hashed
   * @param row the row containing the value to be hashed
   * @return the hash code of the specified value
   * @throws NullPointerException if {@code table} is null
   */
  public static long hashValueLong(final ReadableTable table, final int column, final int row) {
    return hashValueLong(table, column, row, 0);
  }

  /**
   * Compute the hash code, as a {@code long}, of the value in the specified column and row of the
   * given table, using the chosen hash function.
   *
   * @param table the table containing the values to be hashed
   * @param column the column containing the value to be hashed
   * @param row the row containing the value to be hashed
   * @param seedIndex the index of the chosen hash function, from {@code 0} (inclusive) to {@link
   *     #NUM_OF_HASHFUNCTIONS} (exclusive)
   * @return the hash code of the specified value
   * @throws NullPointerException if {@code table} is null
   * @throws IndexOutOfBoundsException if {@code seedIndex} is out of range
   */
  public static long hashValueLong(
      final ReadableTable table, final int column, final int row, final int seedIndex) {
    return getValueHashCode(table, column, row, seedIndex).asLong();
  }

  /**
   * Compute the hash code, as a {@code long}, of the specified columns in the specified row of the
   * given table, using the hash function at index 0.
   *
   * @param table the table containing the values to be hashed
   * @param hashColumns the columns to be hashed. Order matters
   * @param row the row containing the values to be hashed
   * @return the hash code of the specified columns in the specified row of the given table
   * @throws NullPointerException if {@code table} or {@code hashColumns} is null
   */
  public static long hashSubRowLong(
      final ReadableTable table, final int[] hashColumns, final int row) {
    return hashSubRowLong(table, hashColumns, row, 0);
  }

  /**
   * Compute the hash code, as a {@code long}, of the specified columns in the specified row of the
   * given table, using the chosen hash function.
   *
   * @param table the table containing the values to be hashed
   * @param hashColumns the columns to be hashed. Order matters
   * @param row the row containing the values to be hashed
   * @param seedIndex the index of the chosen hash function, from {@code 0} (inclusive) to {@link
   *     #NUM_OF_HASHFUNCTIONS} (exclusive)
   * @return the hash code of the specified columns in the specified row of the given table
   * @throws NullPointerException if {@code table} or {@code hashColumns} is null
   * @throws IndexOutOfBoundsException if {@code seedIndex} is out of range
   */
  public static long hashSubRowLong(
      final ReadableTable table, final int[] hashColumns, final int row, final int seedIndex) {
    return getHashCode(table, hashColumns, row, seedIndex).asLong();
  }

  /**
   * Add the value at the specified row and column to the specified hasher.
   *
   * @param hasher the hasher
   * @param table the table containing the value
   * @param column the column containing the value
   * @param row the row containing the value
   * @return the hasher
   */
  private static Hasher addValue(
      final Hasher hasher, final ReadableTable table, final int column, final int row) {
    return addValue(hasher, table.asColumn(column), row);
  }

  /**
   * Add the value at the specified row of the given column to the specified hasher, using the
   * hasher method that matches the column's type.
   *
   * @param hasher the hasher
   * @param column the column containing the value
   * @param row the row containing the value
   * @return the hasher
   * @throws UnsupportedOperationException if the column's type is not one of the types handled
   *     below
   */
  private static Hasher addValue(final Hasher hasher, final ReadableColumn column, final int row) {
    switch (column.getType()) {
      case BOOLEAN_TYPE:
        return hasher.putBoolean(column.getBoolean(row));
      case DATETIME_TYPE:
        return hasher.putObject(column.getDateTime(row), TypeFunnel.INSTANCE);
      case DOUBLE_TYPE:
        return hasher.putDouble(column.getDouble(row));
      case FLOAT_TYPE:
        return hasher.putFloat(column.getFloat(row));
      case INT_TYPE:
        return hasher.putInt(column.getInt(row));
      case LONG_TYPE:
        return hasher.putLong(column.getLong(row));
      case STRING_TYPE:
        return hasher.putObject(column.getString(row), TypeFunnel.INSTANCE);
      default:
        throw new UnsupportedOperationException(
            "Hashing a column of type " + column.getType() + " is unsupported");
    }
  }
}