/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector.expressions;
import java.util.Arrays;
import java.util.Random;
/**
* A high-performance set implementation used to support fast set membership testing,
* using Cuckoo hashing. This is used to support fast tests of the form
*
* {@code column IN ( <list-of-values> )}
*
* For details on the algorithm, see R. Pagh and F. F. Rodler, "Cuckoo Hashing,"
* Elsevier Science preprint, Dec. 2003. http://www.itu.dk/people/pagh/papers/cuckoo-jour.pdf.
*
*/
public class CuckooSetLong {
  // Slots allocated per expected value, summed over both tables. Because the
  // table size is then rounded UP to a prime, 40% is the maximum fill factor.
  private static final double PADDING_FACTOR = 1.0 / 0.40;

  // Upper bound on the number of rehashes. The counter is never reset, so the
  // bound applies to the total number of rehashes over the life of the set.
  private static final int MAX_REHASHES = 20;

  // some prime numbers spaced about at powers of 2 in magnitude
  public static int primes[] = {7, 13, 17, 23, 31, 53, 67, 89, 127, 269, 571, 1019, 2089,
      4507, 8263, 16361, 32327, 65437, 131111, 258887, 525961, 999983, 2158909, 4074073,
      8321801, 15485863, 32452867, 67867967, 122949829, 256203221, 553105253, 982451653,
      1645333507, 2147483647};

  private long t1[];            // first table, indexed by h1
  private long t2[];            // second table, indexed by h2
  private long prev1[] = null;  // tables as they were before a rehash started,
  private long prev2[] = null;  // kept until a rehash succeeds in re-placing every value
  private final int n;          // size of each of the two tables
  private final int salt[] = new int[6];  // random constants mixed into h1
  private final Random gen = new Random(676983475);
  private long blank = Long.MIN_VALUE;    // sentinel marking an empty slot
  private int rehashCount = 0;

  /**
   * Allocate a new set to hold expectedSize values. Re-allocation to expand
   * the set is not implemented, so the expected size must be at least the
   * size of the set to be inserted.
   * @param expectedSize At least the size of the set of values that will be inserted.
   */
  public CuckooSetLong(int expectedSize) {
    // Choose array size. We have two hash tables to hold entries, so the sum
    // of the two should have a bit more than twice as much space as the
    // minimum required.
    int size = (int) (expectedSize * PADDING_FACTOR / 2.0);

    // try to get prime number table size to have less dependence on good hash function
    for (int prime : primes) {
      if (size <= prime) {
        size = prime;
        break;
      }
    }
    n = size;

    t1 = new long[n];
    t2 = new long[n];
    Arrays.fill(t1, blank);
    Arrays.fill(t2, blank);
    updateHashSalt();
  }

  /**
   * Return true if and only if the value x is present in the set.
   */
  public boolean lookup(long x) {
    // Must check that x is not blank because otherwise you could get a false
    // positive if the blank value was a value you were legitimately testing.
    if (x == blank) {
      return false;
    }
    return t1[h1(x)] == x || t2[h2(x)] == x;
  }

  /**
   * Insert x into the set. Inserting a value that is already present is a no-op.
   *
   * @param x the value to insert
   * @throws RuntimeException if the rehash limit is exceeded while making room
   */
  public void insert(long x) {
    if (x == blank) {
      // x collides with the empty-slot sentinel; pick a different sentinel first.
      findNewBlank();
    }
    if (lookup(x)) {
      return;
    }

    // Try to place x with up to n evictions. A non-blank result is the value
    // that was left without a slot; rehash and then insert that value.
    long evicted = tryInsert(x);
    if (evicted != blank) {
      rehash();
      insert(evicted);
    }
  }

  /**
   * Insert all values in the input array into the set.
   */
  public void load(long[] a) {
    for (long x : a) {
      insert(x);
    }
  }

  /**
   * Need to change current blank value to something else because it is in
   * the input data set.
   */
  private void findNewBlank() {
    long newBlank = gen.nextLong();
    while (newBlank == blank || lookup(newBlank)) {
      newBlank = gen.nextLong();
    }

    // replace existing blanks with new blanks
    for (int i = 0; i != n; i++) {
      if (t1[i] == blank) {
        t1[i] = newBlank;
      }
      if (t2[i] == blank) {
        t2[i] = newBlank;
      }
    }
    blank = newBlank;
  }

  /**
   * Try to insert x, evicting ("poking out") up to n pairs of resident values.
   * The caller must ensure x is neither blank nor already present.
   *
   * @return blank if every value found a slot; otherwise the value that was
   *         left without a slot, in which case a cycle is assumed
   */
  private long tryInsert(long x) {
    for (int i = 0; i != n; i++) {
      // Put x in its t1 slot and take whatever was there.
      int slot1 = h1(x);
      long displaced = t1[slot1];
      t1[slot1] = x;
      if (displaced == blank) {
        return blank;
      }
      x = displaced;

      // Put the displaced value in its t2 slot and take whatever was there.
      int slot2 = h2(x);
      displaced = t2[slot2];
      t2[slot2] = x;
      if (displaced == blank) {
        return blank;
      }
      x = displaced;
    }
    return x;
  }

  /**
   * Variation of Robert Jenkins' hash function, salted so that it can be
   * changed on rehash.
   */
  private int h1(long y) {
    // Fold the high 32 bits into the low 32 bits.
    int x = (int) ((y >>> 32) ^ y);
    x = (x + salt[0]) + (x << 12);
    x = (x ^ salt[1]) ^ (x >> 19);
    x = (x + salt[2]) + (x << 5);
    x = (x + salt[3]) ^ (x << 9);
    x = (x + salt[4]) + (x << 3);
    x = (x ^ salt[5]) ^ (x >> 16);

    // Return value modulo n but always in the positive range (0..n-1).
    // And with the mask to zero the sign bit to make the input to mod positive
    // so the output will definitely be positive.
    return (x & 0x7FFFFFFF) % n;
  }

  /**
   * basic modular hash function
   */
  private int h2(long x) {
    // Return value modulo n but always in the positive range (0..n-1).
    // Since n is prime, this gives good spread for numbers that are multiples
    // of one billion, which is important since timestamps internally
    // are stored as a number of nanoseconds, and the fractional seconds
    // part is often 0.
    //
    // The remainder is normalized in long arithmetic: adding n to it as an
    // int would overflow once n exceeds 2^30.
    long remainder = x % n;
    if (remainder < 0) {
      remainder += n;
    }
    return (int) remainder;
  }

  /**
   * In case of rehash, hash function h1 is changed by updating the
   * entries in the salt array with new random values.
   */
  private void updateHashSalt() {
    for (int i = 0; i != salt.length; i++) {
      salt[i] = gen.nextInt(0x7FFFFFFF);
    }
  }

  /**
   * Rebuild both tables under a freshly salted h1, retrying with another salt
   * whenever some value cannot be placed.
   *
   * @throws RuntimeException if the rehash limit is exceeded
   */
  private void rehash() {
    rehashCount++;
    if (rehashCount > MAX_REHASHES) {
      throw new RuntimeException("Too many rehashes");
    }
    updateHashSalt();

    // Save original values. On a retry they are already saved, and the current
    // tables hold only a partial copy, so they must not overwrite the saved ones.
    if (prev1 == null) {
      prev1 = t1;
      prev2 = t2;
    }
    t1 = new long[n];
    t2 = new long[n];
    Arrays.fill(t1, blank);
    Arrays.fill(t2, blank);

    if (!reinsertAll(prev1) || !reinsertAll(prev2)) {
      rehash();
      return;
    }

    // We succeeded in adding all the values, so
    // clear the previous values recorded.
    prev1 = null;
    prev2 = null;
  }

  /**
   * Re-insert every non-blank value of a saved table into the current tables.
   *
   * @return false as soon as one value cannot be placed, true otherwise
   */
  private boolean reinsertAll(long[] values) {
    for (long v : values) {
      if (v != blank && tryInsert(v) != blank) {
        return false;
      }
    }
    return true;
  }
}