/* * Copyright 2016 Ben Manes. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.github.benmanes.caffeine.cache.simulator.membership.bloom; import static com.google.common.base.Preconditions.checkArgument; import java.util.Arrays; import javax.annotation.Nonnegative; import javax.annotation.concurrent.NotThreadSafe; import com.github.benmanes.caffeine.cache.simulator.membership.Membership; /** * A Bloom filter is a space and time efficient probabilistic data structure that is used to test * whether an element is a member of a set. False positives are possible, but false negatives are * not. Elements can be added to the set, but not removed. The more elements that are added the * higher the probability of false positives. While risking false positives, Bloom filters have a * space advantage over other data structures for representing sets by not storing the items. * * @author ben.manes@gmail.com (Ben Manes) */ @NotThreadSafe public final class BloomFilter implements Membership { static final long[] SEED = new long[] { // A mixture of seeds from FNV-1a, CityHash, and Murmur3 0xc3a5c85c97cb3127L, 0xb492b66fbe98f273L, 0x9ae16a3b2f90404fL, 0xcbf29ce484222325L}; static final int BITS_PER_LONG_SHIFT = 6; // 64-bits static final int BITS_PER_LONG_MASK = Long.SIZE - 1; final int randomSeed; int tableShift; long[] table; /** * Creates a membership sketch based on the expected number of insertions and the false positive * probability. * * @param expectedInsertions the number of expected insertions * @param fpp the false positive probability, where 0.0 > fpp < 1.0 * @param randomSeed the smear to protect against hash flooding, adjusted to an odd value */ public BloomFilter(@Nonnegative long expectedInsertions, @Nonnegative double fpp, int randomSeed) { this.randomSeed = ((randomSeed & 1) == 0) ? randomSeed + 1 : randomSeed; ensureCapacity(expectedInsertions, fpp); } /** * Initializes and increases the capacity of this <tt>BloomFilter</tt> instance, if necessary, * to ensure that it can accurately estimate the membership of elements given the expected * number of insertions. This operation forgets all previous memberships when resizing. * * @param expectedInsertions the number of expected insertions * @param fpp the false positive probability, where 0.0 > fpp < 1.0 */ void ensureCapacity(@Nonnegative long expectedInsertions, @Nonnegative double fpp) { checkArgument(expectedInsertions >= 0); checkArgument(fpp > 0 && fpp < 1); double optimalBitsFactor = -Math.log(fpp) / (Math.log(2) * Math.log(2)); int optimalNumberOfBits = (int) (expectedInsertions * optimalBitsFactor); int optimalSize = optimalNumberOfBits >>> BITS_PER_LONG_SHIFT; if ((table != null) && (table.length >= optimalSize)) { return; } else if (optimalSize == 0) { tableShift = Integer.SIZE - 1; table = new long[1]; } else { int powerOfTwoShift = Integer.SIZE - Integer.numberOfLeadingZeros(optimalSize - 1); tableShift = Integer.SIZE - powerOfTwoShift; table = new long[1 << powerOfTwoShift]; } } @Override public boolean mightContain(long e) { int item = spread(Long.hashCode(e)); for (int i = 0; i < 4; i++) { int hash = seeded(item, i); int index = hash >>> tableShift; if ((table[index] & bitmask(hash)) == 0L) { return false; } } return true; } @Override public void clear() { Arrays.fill(table, 0L); } @Override @SuppressWarnings("ShortCircuitBoolean") public boolean put(long e) { int item = spread(Long.hashCode(e)); return setAt(item, 0) | setAt(item, 1) | setAt(item, 2) | setAt(item, 3); } /** * Sets the membership flag for the computed bit location. * * @param item the element's hash * @param seedIndex the hash seed index * @return if the membership changed as a result of this operation */ boolean setAt(int item, int seedIndex) { int hash = seeded(item, seedIndex); int index = hash >>> tableShift; long previous = table[index]; table[index] |= bitmask(hash); return (table[index] != previous); } /** * Applies a supplemental hash function to a given hashCode, which defends against poor quality * hash functions. */ int spread(int x) { x = ((x >>> 16) ^ x) * 0x45d9f3b; x = ((x >>> 16) ^ x) * randomSeed; return (x >>> 16) ^ x; } /** * Applies the independent hash function for the given seed index. * * @param item the element's hash * @param i the hash seed index * @return the table index */ static int seeded(int item, int i) { long hash = SEED[i] * item; hash += hash >> 32; return (int) hash; } /** * Applies a hash function to determine the index of the bit. * * @param hash the seeded hash code * @return the mask to the bit */ static long bitmask(int hash) { return 1L << (hash & BITS_PER_LONG_MASK); } }